/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"
#include "blk.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over a 100ms slice and after that the slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

static struct blkcg_policy blkcg_policy_throtl;

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;

struct throtl_service_queue {
	struct rb_root		pending_tree;	/* RB tree of active tgs */
	struct rb_node		*first_pending;	/* first node in the tree */
	unsigned int		nr_pending;	/* # queued in the tree */
	unsigned long		first_pending_disptime;	/* disptime of the first tg */
};

#define THROTL_SERVICE_QUEUE_INITIALIZER				\
	(struct throtl_service_queue){ .pending_tree = RB_ROOT }

enum tg_state_flags {
	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
};

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

/* Per-cpu group stats */
struct tg_stats_cpu {
	/* total bytes transferred */
	struct blkg_rwstat		service_bytes;
	/* total IOs serviced, post merge */
	struct blkg_rwstat		serviced;
};

struct throtl_grp {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* active throtl group service_queue member */
	struct rb_node rb_node;

	/* throtl_data this group belongs to */
	struct throtl_data *td;

	/*
	 * Dispatch time in jiffies. This is the estimated time when group
	 * will unthrottle and is ready to dispatch more bio. It is used as
	 * key to sort active groups in service tree.
	 */
	unsigned long disptime;

	unsigned int flags;

	/* Two lists for READ and WRITE */
	struct bio_list bio_lists[2];

	/* Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bios dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	/* Per cpu stats pointer */
	struct tg_stats_cpu __percpu *stats_cpu;

	/* List of tgs waiting for per cpu stats memory to be allocated */
	struct list_head stats_alloc_node;
};

struct throtl_data
{
	/* service tree for active throtl groups */
	struct throtl_service_queue service_queue;

	struct request_queue *queue;

	/* Total Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/*
	 * number of total undestroyed groups
	 */
	unsigned int nr_undestroyed_grps;

	/* Work for dispatching throttled bios */
	struct delayed_work dispatch_work;
};

/* list and work item to allocate percpu group stats */
static DEFINE_SPINLOCK(tg_stats_alloc_lock);
static LIST_HEAD(tg_stats_alloc_list);

static void tg_stats_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);

static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}

static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}

static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
	return pd_to_blkg(&tg->pd);
}

static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
{
	return blkg_to_tg(td->queue->root_blkg);
}

#define throtl_log_tg(tg, fmt, args...)	do {				\
	char __pbuf[128];						\
									\
	blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((tg)->td->queue, "throtl %s " fmt, __pbuf, ##args); \
} while (0)

#define throtl_log(td, fmt, args...)	\
	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)

/*
 * Worker for allocating per cpu stat for tgs. This is scheduled on the
 * system_wq once there are some groups on the alloc_list waiting for
 * allocation.
 */
static void tg_stats_alloc_fn(struct work_struct *work)
{
	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
	struct delayed_work *dwork = to_delayed_work(work);
	bool empty = false;

alloc_stats:
	if (!stats_cpu) {
		stats_cpu = alloc_percpu(struct tg_stats_cpu);
		if (!stats_cpu) {
			/* allocation failed, try again after some time */
			schedule_delayed_work(dwork, msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&tg_stats_alloc_lock);

	if (!list_empty(&tg_stats_alloc_list)) {
		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
							 struct throtl_grp,
							 stats_alloc_node);
		swap(tg->stats_cpu, stats_cpu);
		list_del_init(&tg->stats_alloc_node);
	}

	empty = list_empty(&tg_stats_alloc_list);
	spin_unlock_irq(&tg_stats_alloc_lock);
	if (!empty)
		goto alloc_stats;
}

static void throtl_pd_init(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	unsigned long flags;

	RB_CLEAR_NODE(&tg->rb_node);
	tg->td = blkg->q->td;
	bio_list_init(&tg->bio_lists[0]);
	bio_list_init(&tg->bio_lists[1]);

	tg->bps[READ] = -1;
	tg->bps[WRITE] = -1;
	tg->iops[READ] = -1;
	tg->iops[WRITE] = -1;

	/*
	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
	 * but percpu allocator can't be called from IO path.  Queue tg on
	 * tg_stats_alloc_list and allocate from work item.
	 */
	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
	schedule_delayed_work(&tg_stats_alloc_work, 0);
	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
}

static void throtl_pd_exit(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	unsigned long flags;

	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
	list_del_init(&tg->stats_alloc_node);
	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);

	free_percpu(tg->stats_cpu);
}

static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	int cpu;

	if (tg->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
	}
}

static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
					   struct blkcg *blkcg)
{
	/*
	 * This is the common case when there are no blkcgs.  Avoid lookup
	 * in this case
	 */
	if (blkcg == &blkcg_root)
		return td_root_tg(td);

	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
}

static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
						  struct blkcg *blkcg)
{
	struct request_queue *q = td->queue;
	struct throtl_grp *tg = NULL;

	/*
	 * This is the common case when there are no blkcgs.  Avoid lookup
	 * in this case
	 */
	if (blkcg == &blkcg_root) {
		tg = td_root_tg(td);
	} else {
		struct blkcg_gq *blkg;

		blkg = blkg_lookup_create(blkcg, q);

		/* if %NULL and @q is alive, fall back to root_tg */
		if (!IS_ERR(blkg))
			tg = blkg_to_tg(blkg);
		else if (!blk_queue_dying(q))
			tg = td_root_tg(td);
	}

	return tg;
}

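/* Return the group with the earliest dispatch time on @sq, caching the rb_first() lookup */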
static struct throtl_grp *throtl_rb_first(struct throtl_service_queue *sq)
{
	/* Service tree is empty */
	if (!sq->nr_pending)
		return NULL;

	if (!sq->first_pending)
		sq->first_pending = rb_first(&sq->pending_tree);

	if (sq->first_pending)
		return rb_entry_tg(sq->first_pending);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n, struct throtl_service_queue *sq)
{
	if (sq->first_pending == n)
		sq->first_pending = NULL;
	rb_erase_init(n, &sq->pending_tree);
	--sq->nr_pending;
}

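/* Cache the dispatch time of the earliest pending group in @sq->first_pending_disptime */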
static void update_min_dispatch_time(struct throtl_service_queue *sq)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(sq);
	if (!tg)
		return;

	sq->first_pending_disptime = tg->disptime;
}

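/* Insert @tg into @sq's pending tree, keyed by its dispatch time */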
static void tg_service_queue_add(struct throtl_service_queue *sq,
				 struct throtl_grp *tg)
{
	struct rb_node **node = &sq->pending_tree.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		sq->first_pending = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &sq->pending_tree);
}

static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &td->service_queue;

	tg_service_queue_add(sq, tg);
	tg->flags |= THROTL_TG_PENDING;
	sq->nr_pending++;
}

static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!(tg->flags & THROTL_TG_PENDING))
		__throtl_enqueue_tg(td, tg);
}

static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	throtl_rb_erase(&tg->rb_node, &td->service_queue);
	tg->flags &= ~THROTL_TG_PENDING;
}

static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (tg->flags & THROTL_TG_PENDING)
		__throtl_dequeue_tg(td, tg);
}

/* Call with queue lock held */
static void throtl_schedule_delayed_work(struct throtl_data *td,
					 unsigned long delay)
{
	struct delayed_work *dwork = &td->dispatch_work;

	mod_delayed_work(kthrotld_workqueue, dwork, delay);
	throtl_log(td, "schedule work. delay=%lu jiffies=%lu", delay, jiffies);
}

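/* (Re)arm the dispatch work to run when the earliest pending group becomes eligible */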
static void throtl_schedule_next_dispatch(struct throtl_data *td)
{
	struct throtl_service_queue *sq = &td->service_queue;

	/* any pending children left? */
	if (!sq->nr_pending)
		return;

	update_min_dispatch_time(sq);

	if (time_before_eq(sq->first_pending_disptime, jiffies))
		throtl_schedule_delayed_work(td, 0);
	else
		throtl_schedule_delayed_work(td, sq->first_pending_disptime - jiffies);
}

static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log_tg(tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
					unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}

static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
				       unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
	throtl_log_tg(tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return 0;

	return 1;
}

/* Trim the used slices and adjust slice start accordingly */
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps is unlimited (-1), the time slice doesn't get renewed.
	 * Don't try to trim the slice if it is already used up; a new
	 * slice will start when appropriate.
	 */
	if (throtl_slice_used(tg, rw))
		return;

	/*
	 * A bio has been dispatched. Also adjust slice_end. It might happen
	 * that initially cgroup limit was very low resulting in high
	 * slice_end, but later limit was bumped up and bio was dispatched
	 * sooner, then we need to reduce slice_end. A high bogus slice_end
	 * is bad because it does not allow new slice to start.
	 */

	throtl_set_slice_end(tg, rw, jiffies + throtl_slice);

	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * throtl_slice;

	throtl_log_tg(tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
			" start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
			tg->slice_start[rw], tg->slice_end[rw], jiffies);
}

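/*
 * Check whether one more IO fits in the current slice under the iops limit;
 * if not, report the approximate time to wait through @wait.
 */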
static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
				  unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
	 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
	 * will allow dispatch after 1 second and after that slice should
	 * have been trimmed.
	 */

	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return 0;
}

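/*
 * Check whether @bio fits in the current slice under the bps limit;
 * if not, report the approximate time to wait through @wait.
 */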
static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
				 unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;

	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did. Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return 0;
}

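/* Return true if @tg has neither a bps nor an iops limit in direction @rw */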
static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
		return 1;
	return 0;
}

/*
 * Returns whether one can dispatch a bio or not. Also returns the approximate
 * number of jiffies to wait before this bio is within the IO rate and can be
 * dispatched.
 */
static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
			    unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
 	 * Currently whole state machine of group depends on first bio
	 * queued in the group bio list. So one should not be calling
	 * this function with a different bio if there are other bios
	 * queued.
	 */
	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/*
	 * If previous slice expired, start a new one otherwise renew/extend
	 * existing slice to make sure it is at least throtl_slice interval
	 * long since now.
	 */
	if (throtl_slice_used(tg, rw))
		throtl_start_new_slice(tg, rw);
	else {
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
			throtl_extend_slice(tg, rw, jiffies + throtl_slice);
	}

	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return 1;
	}

	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(tg, rw, jiffies + max_wait);

	return 0;
}

static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
					 int rw)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	struct tg_stats_cpu *stats_cpu;
	unsigned long flags;

	/* If per cpu stats are not allocated yet, don't do any accounting. */
	if (tg->stats_cpu == NULL)
		return;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(tg->stats_cpu);

	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);

	local_irq_restore(flags);
}

static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_size;
	tg->io_disp[rw]++;

	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
}

static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
			struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	bio_list_add(&tg->bio_lists[rw], bio);
	/* Take a bio reference on tg */
	blkg_get(tg_to_blkg(tg));
	tg->nr_queued[rw]++;
	td->nr_queued[rw]++;
	throtl_enqueue_tg(td, tg);
}

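/* Recompute @tg's dispatch time from the bios at the head of its queues and requeue it */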
static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
{
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
		tg_may_dispatch(tg, bio, &read_wait);

	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
		tg_may_dispatch(tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(td, tg);
	tg->disptime = disptime;
	throtl_enqueue_tg(td, tg);
}

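/* Pop the next bio queued in direction @rw, charge it to the group and add it to @bl */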
static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
				struct bio_list *bl)
{
	struct bio *bio;

	bio = bio_list_pop(&tg->bio_lists[rw]);
	tg->nr_queued[rw]--;
	/* Drop bio reference on blkg */
	blkg_put(tg_to_blkg(tg));

	BUG_ON(tg->td->nr_queued[rw] <= 0);
	tg->td->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);
	bio_list_add(bl, bio);
	bio->bi_rw |= REQ_THROTTLED;

	throtl_trim_slice(tg, rw);
}

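/* Dispatch up to one group quantum of bios from @tg, roughly 75% reads and 25% writes */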
static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
{
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = bio_list_peek(&tg->bio_lists[READ])) &&
	       tg_may_dispatch(tg, bio, NULL)) {

		tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) &&
	       tg_may_dispatch(tg, bio, NULL)) {

		tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

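/* Walk the service tree and dispatch from each group whose dispatch time has passed */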
static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
{
	unsigned int nr_disp = 0;
	struct throtl_grp *tg;
	struct throtl_service_queue *sq = &td->service_queue;

	while (1) {
		tg = throtl_rb_first(sq);

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		throtl_dequeue_tg(td, tg);

		nr_disp += throtl_dispatch_tg(tg, bl);

		if (tg->nr_queued[0] || tg->nr_queued[1])
			tg_update_disptime(td, tg);

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}

/* work function to dispatch throttled bios */
void blk_throtl_dispatch_work_fn(struct work_struct *work)
{
	struct throtl_data *td = container_of(to_delayed_work(work),
					      struct throtl_data, dispatch_work);
	struct request_queue *q = td->queue;
	unsigned int nr_disp = 0;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;

	spin_lock_irq(q->queue_lock);

	bio_list_init(&bio_list_on_stack);

	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
		   td->nr_queued[READ] + td->nr_queued[WRITE],
		   td->nr_queued[READ], td->nr_queued[WRITE]);

	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);

	if (nr_disp)
		throtl_log(td, "bios disp=%u", nr_disp);

	throtl_schedule_next_dispatch(td);

	spin_unlock_irq(q->queue_lock);

	/*
	 * If we dispatched some requests, unplug the queue to make sure
	 * they are dispatched immediately.
	 */
	if (nr_disp) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
}

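/* Sum one per-cpu rwstat (selected by @off) over all possible CPUs and print it for the group */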
static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
				struct blkg_policy_data *pd, int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	struct blkg_rwstat rwstat = { }, tmp;
	int i, cpu;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		tmp = blkg_rwstat_read((void *)sc + off);
		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			rwstat.cnt[i] += tmp.cnt[i];
	}

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
			       struct seq_file *sf)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);

	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
			  cft->private, true);
	return 0;
}

static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	u64 v = *(u64 *)((void *)tg + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	unsigned int v = *(unsigned int *)((void *)tg + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
			     struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
			  &blkcg_policy_throtl, cft->private, false);
	return 0;
}

static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
			      struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
			  &blkcg_policy_throtl, cft->private, false);
	return 0;
}

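/* Apply a limit written to one of the throttle cgroup files; cft->private holds the field offset */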
static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
		       bool is_u64)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	struct throtl_data *td;
	int ret;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
	if (ret)
		return ret;

	tg = blkg_to_tg(ctx.blkg);
	td = ctx.blkg->q->td;

	if (!ctx.v)
		ctx.v = -1;

	if (is_u64)
		*(u64 *)((void *)tg + cft->private) = ctx.v;
	else
		*(unsigned int *)((void *)tg + cft->private) = ctx.v;

	throtl_log_tg(tg, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
		      tg->bps[READ], tg->bps[WRITE],
		      tg->iops[READ], tg->iops[WRITE]);

	/*
	 * We're already holding queue_lock and know @tg is valid.  Let's
	 * apply the new config directly.
	 *
	 * Restart the slices for both READ and WRITES. It might happen
	 * that a group's limit are dropped suddenly and we don't want to
	 * account recently dispatched IO with new low rate.
	 */
	throtl_start_new_slice(tg, 0);
	throtl_start_new_slice(tg, 1);

	if (tg->flags & THROTL_TG_PENDING) {
		tg_update_disptime(td, tg);
		throtl_schedule_next_dispatch(td);
	}

	blkg_conf_finish(&ctx);
	return 0;
}

static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
			   const char *buf)
{
	return tg_set_conf(cgrp, cft, buf, true);
}

static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
			    const char *buf)
{
	return tg_set_conf(cgrp, cft, buf, false);
}

static struct cftype throtl_files[] = {
	{
		.name = "throttle.read_bps_device",
		.private = offsetof(struct throtl_grp, bps[READ]),
		.read_seq_string = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_bps_device",
		.private = offsetof(struct throtl_grp, bps[WRITE]),
		.read_seq_string = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
		.max_write_len = 256,
	},
	{
		.name = "throttle.read_iops_device",
		.private = offsetof(struct throtl_grp, iops[READ]),
		.read_seq_string = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_iops_device",
		.private = offsetof(struct throtl_grp, iops[WRITE]),
		.read_seq_string = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = offsetof(struct tg_stats_cpu, service_bytes),
		.read_seq_string = tg_print_cpu_rwstat,
	},
	{
		.name = "throttle.io_serviced",
		.private = offsetof(struct tg_stats_cpu, serviced),
		.read_seq_string = tg_print_cpu_rwstat,
	},
	{ }	/* terminate */
};

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_delayed_work_sync(&td->dispatch_work);
}

static struct blkcg_policy blkcg_policy_throtl = {
	.pd_size		= sizeof(struct throtl_grp),
	.cftypes		= throtl_files,

	.pd_init_fn		= throtl_pd_init,
	.pd_exit_fn		= throtl_pd_exit,
	.pd_reset_stats_fn	= throtl_pd_reset_stats,
};

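/*
 * Decide whether @bio is within its group's rate limits.  Returns true if the
 * bio was throttled and queued for later dispatch, false if the caller may
 * issue it now.
 */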
bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
	struct throtl_data *td = q->td;
	struct throtl_grp *tg;
	bool rw = bio_data_dir(bio), update_disptime = true;
	struct blkcg *blkcg;
	bool throttled = false;

	if (bio->bi_rw & REQ_THROTTLED) {
		bio->bi_rw &= ~REQ_THROTTLED;
		goto out;
	}

	/*
	 * A throtl_grp pointer retrieved under rcu can be used to access
	 * basic fields like stats and io rates. If a group has no rules,
	 * just update the dispatch stats in lockless manner and return.
	 */
	rcu_read_lock();
	blkcg = bio_blkcg(bio);
	tg = throtl_lookup_tg(td, blkcg);
	if (tg) {
		if (tg_no_rule_group(tg, rw)) {
			throtl_update_dispatch_stats(tg_to_blkg(tg),
						     bio->bi_size, bio->bi_rw);
			goto out_unlock_rcu;
		}
	}

	/*
	 * Either group has not been allocated yet or it is not an unlimited
	 * IO group
	 */
	spin_lock_irq(q->queue_lock);
	tg = throtl_lookup_create_tg(td, blkcg);
	if (unlikely(!tg))
		goto out_unlock;

	if (tg->nr_queued[rw]) {
		/*
		 * There is already another bio queued in same dir. No
		 * need to update dispatch time.
		 */
		update_disptime = false;
		goto queue_bio;

	}

	/* Bio is within the rate limit of the group */
	if (tg_may_dispatch(tg, bio, NULL)) {
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim slice even when bios are not being queued
		 * otherwise it might happen that a bio is not queued for
		 * a long time and slice keeps on extending and trim is not
		 * called for a long time. Now if limits are reduced suddenly
		 * we take into account all the IO dispatched so far at new
		 * low rate and newly queued IO gets a really long dispatch
		 * time.
		 *
		 * So keep on trimming slice even if bio is not queued.
		 */
		throtl_trim_slice(tg, rw);
		goto out_unlock;
	}

queue_bio:
	throtl_log_tg(tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
			" iodisp=%u iops=%u queued=%d/%d",
			rw == READ ? 'R' : 'W',
			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
			tg->io_disp[rw], tg->iops[rw],
			tg->nr_queued[READ], tg->nr_queued[WRITE]);

	bio_associate_current(bio);
	throtl_add_bio_tg(q->td, tg, bio);
	throttled = true;

	if (update_disptime) {
		tg_update_disptime(td, tg);
		throtl_schedule_next_dispatch(td);
	}

out_unlock:
	spin_unlock_irq(q->queue_lock);
out_unlock_rcu:
	rcu_read_unlock();
out:
	return throttled;
}

/**
 * blk_throtl_drain - drain throttled bios
 * @q: request_queue to drain throttled bios for
 *
 * Dispatch all currently throttled bios on @q through ->make_request_fn().
 */
void blk_throtl_drain(struct request_queue *q)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct throtl_data *td = q->td;
	struct throtl_service_queue *sq = &td->service_queue;
	struct throtl_grp *tg;
	struct bio_list bl;
	struct bio *bio;

	queue_lockdep_assert_held(q);

	bio_list_init(&bl);

	while ((tg = throtl_rb_first(sq))) {
		throtl_dequeue_tg(td, tg);

		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
			tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
		while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
			tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
	}
	spin_unlock_irq(q->queue_lock);

	while ((bio = bio_list_pop(&bl)))
		generic_make_request(bio);

	spin_lock_irq(q->queue_lock);
}

int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	int ret;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	td->service_queue = THROTL_SERVICE_QUEUE_INITIALIZER;
	INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);

	q->td = td;
	td->queue = q;

	/* activate policy */
	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
	if (ret)
		kfree(td);
	return ret;
}

void blk_throtl_exit(struct request_queue *q)
{
	BUG_ON(!q->td);
	throtl_shutdown_wq(q);
	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
	kfree(q->td);
}

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	return blkcg_policy_register(&blkcg_policy_throtl);
}

module_init(throtl_init);