/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* List of groups pending per cpu stats allocation */
static DEFINE_SPINLOCK(alloc_list_lock);
static LIST_HEAD(alloc_list);

static void blkio_stat_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
/* What policy owns the file, proportional or throttle */
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
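
/*
 * Example: the blkio.throttle.read_bps_device file is created with
 * .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
 *				BLKIO_THROTL_read_bps_device), so its
 * handlers can recover the owning policy with BLKIOFILE_POLICY() and the
 * per-policy attribute with BLKIOFILE_ATTR().
 */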

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);

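/*
 * The following helpers propagate weight/bps/iops configuration changes to
 * the policy (proportional weight or throttling) that owns the blkg; e.g.
 * blkio_update_group_weight(blkg, BLKIO_POLICY_PROP, 500) ends up in the
 * owning policy's blkio_update_group_weight_fn() callback.
 */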
static inline void blkio_update_group_weight(struct blkio_group *blkg,
					     int plid, unsigned int weight)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;
		if (blkiop->ops.blkio_update_group_weight_fn)
			blkiop->ops.blkio_update_group_weight_fn(blkg->q,
							blkg, weight);
	}
}

static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
					  u64 bps, int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;

		if (fileid == BLKIO_THROTL_read_bps_device
		    && blkiop->ops.blkio_update_group_read_bps_fn)
			blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
								blkg, bps);

		if (fileid == BLKIO_THROTL_write_bps_device
		    && blkiop->ops.blkio_update_group_write_bps_fn)
			blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
								blkg, bps);
	}
}

static inline void blkio_update_group_iops(struct blkio_group *blkg,
					   int plid, unsigned int iops,
					   int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;

		if (fileid == BLKIO_THROTL_read_iops_device
		    && blkiop->ops.blkio_update_group_read_iops_fn)
			blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
								blkg, iops);

		if (fileid == BLKIO_THROTL_write_iops_device
		    && blkiop->ops.blkio_update_group_write_iops_fn)
			blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
								blkg, iops);
	}
}

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with queue_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
				bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}
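
/*
 * Example of the bucketing above: accounting the service time of a sync
 * write (direction == true, sync == true) adds the delta to both the
 * BLKIO_STAT_WRITE and the BLKIO_STAT_SYNC buckets:
 *
 *	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
 *		       now - io_start_time, true, true);
 */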

/*
 * Decrements the appropriate stat variable if non-zero depending on the
 * request type. Panics on value being zero.
 * This should be called with the queue_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the queue_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_policy_type *pol,
					    struct blkio_group *curr_blkg)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];

	if (blkio_blkg_waiting(&pd->stats))
		return;
	if (blkg == curr_blkg)
		return;
	pd->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&pd->stats);
}

/* This should be called with the queue_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
					struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);
	BUG_ON(blkio_blkg_idling(stats));

	stats->start_idle_time = sched_clock();
	blkio_mark_blkg_idling(stats);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	if (blkio_blkg_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time)) {
			u64_stats_update_begin(&stats->syncp);
			stats->idle_time += now - stats->start_idle_time;
			u64_stats_update_end(&stats->syncp);
		}
		blkio_clear_blkg_idling(stats);
	}
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
					 struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_set_start_empty_time(struct blkio_group *blkg,
				  struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE])
		return;

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (blkio_blkg_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  struct blkio_policy_type *pol,
				  unsigned long dequeue)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];

	lockdep_assert_held(blkg->q->queue_lock);

	pd->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_policy_type *pol,
					struct blkio_group *curr_blkg) { }
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { }
#endif

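/*
 * The blkiocg_update_*_stats() helpers below are called by the policies as
 * a request moves through its life cycle: queued into a group, dispatched,
 * merged and completed.  All of them expect the queue_lock to be held,
 * except blkiocg_update_dispatch_stats() which works on per cpu counters
 * and may be called under the rcu read lock instead.
 */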
void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_policy_type *pol,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_QUEUED], 1, direction, sync);
	blkio_end_empty_time(stats);
	u64_stats_update_end(&stats->syncp);

	blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    bool direction, bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	blkio_check_and_dec_stat(stats->stat_arr[BLKIO_STAT_QUEUED], direction,
				 sync);
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   unsigned long time,
				   unsigned long unaccounted_time)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	stats->time += time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	stats->unaccounted_time += unaccounted_time;
#endif
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

/*
 * should be called under rcu read lock or queue lock to make sure blkg pointer
 * is valid.
 */
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   uint64_t bytes, bool direction, bool sync)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/* If per cpu stats are not allocated yet, don't do any accounting. */
	if (pd->stats_cpu == NULL)
		return;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(pd->stats_cpu);

	u64_stats_update_begin(&stats_cpu->syncp);
	stats_cpu->sectors += bytes >> 9;
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
			1, direction, sync);
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
			bytes, direction, sync);
	u64_stats_update_end(&stats_cpu->syncp);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

void blkiocg_update_completion_stats(struct blkio_group *blkg,
				     struct blkio_policy_type *pol,
				     uint64_t start_time,
				     uint64_t io_start_time, bool direction,
				     bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
	unsigned long long now = sched_clock();

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

/* Merged stats are updated under the queue_lock, not per cpu. */
void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    bool direction, bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_MERGED], 1, direction, sync);
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

/*
 * Worker for allocating per cpu stat for blk groups. This is scheduled on
 * the system_nrt_wq once there are some groups on the alloc_list waiting
 * for allocation.
 */
static void blkio_stat_alloc_fn(struct work_struct *work)
{
	static void *pcpu_stats[BLKIO_NR_POLICIES];
	struct delayed_work *dwork = to_delayed_work(work);
	struct blkio_group *blkg;
	int i;
	bool empty = false;

alloc_stats:
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		if (pcpu_stats[i] != NULL)
			continue;

		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);

		/* Allocation failed. Try again after some time. */
		if (pcpu_stats[i] == NULL) {
			queue_delayed_work(system_nrt_wq, dwork,
						msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&blkio_list_lock);
	spin_lock(&alloc_list_lock);

	/* cgroup got deleted or queue exited. */
	if (!list_empty(&alloc_list)) {
		blkg = list_first_entry(&alloc_list, struct blkio_group,
						alloc_node);
		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
			struct blkg_policy_data *pd = blkg->pd[i];

			if (blkio_policy[i] && pd && !pd->stats_cpu)
				swap(pd->stats_cpu, pcpu_stats[i]);
		}

		list_del_init(&blkg->alloc_node);
	}

	empty = list_empty(&alloc_list);

	spin_unlock(&alloc_list_lock);
	spin_unlock_irq(&blkio_list_lock);

	if (!empty)
		goto alloc_stats;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkg_policy_data *pd = blkg->pd[i];

		if (pd) {
			free_percpu(pd->stats_cpu);
			kfree(pd);
		}
	}

	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	INIT_LIST_HEAD(&blkg->alloc_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}

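/*
 * blkg_lookup_create - look up the blkg for a @blkcg + @q pair, allocating
 * and inserting one on first use.  The caller must hold the rcu read lock
 * and @q's queue_lock.  Unless @for_root is set, lookups on a bypassing
 * queue fail with -EBUSY (-EINVAL if the queue is dead); per cpu stats for
 * a freshly created group are allocated asynchronously via
 * blkio_stat_alloc_work.
 */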
struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       enum blkio_policy_id plid,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);

	spin_lock(&alloc_list_lock);
	list_add(&blkg->alloc_node, &alloc_list);
	/* Queue per cpu stat allocation from worker thread. */
	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
	spin_unlock(&alloc_list_lock);
out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	spin_lock(&alloc_list_lock);
	list_del_init(&blkg->alloc_node);
	spin_unlock(&alloc_list_lock);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down.  This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[plid]);
	blkg->pd[plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	WARN_ON_ONCE(!pd->stats_cpu);

	blkg->pd[plid] = pd;
	pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access to only
	 * values local to groups like group stats and group rate limits
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;

	if (pd->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		sc->sectors = 0;
		memset(sc->stat_arr_cpu, 0, sizeof(sc->stat_arr_cpu));
	}
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct blkio_group *blkg;
	struct hlist_node *n;
	int i;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkio_policy_type *pol;

		list_for_each_entry(pol, &blkio_list, list) {
			struct blkg_policy_data *pd = blkg->pd[pol->plid];
			struct blkio_group_stats *stats = &pd->stats;

			/* queued stats shouldn't be cleared */
			for (i = 0; i < ARRAY_SIZE(stats->stat_arr); i++)
				if (i != BLKIO_STAT_QUEUED)
					memset(stats->stat_arr[i], 0,
					       sizeof(stats->stat_arr[i]));
			stats->time = 0;
#ifdef CONFIG_DEBUG_BLK_CGROUP
			memset((void *)stats + BLKG_STATS_DEBUG_CLEAR_START, 0,
			       BLKG_STATS_DEBUG_CLEAR_SIZE);
#endif
			blkio_reset_stats_cpu(blkg, pol->plid);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

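/*
 * Stat files are emitted as a map of "<dev> <sub-key>" entries, where <dev>
 * is the name returned by blkg_dev_name() (typically the major:minor pair
 * of the backing device) and <sub-key> is Read/Write/Sync/Async/Total,
 * e.g. "8:16 Read 4096".  blkio_get_key_name() builds those key strings.
 */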
static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
			       char *str, int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%s", dname);
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}

static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid,
			enum stat_type_cpu type, enum stat_sub_type sub_type)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;
	struct blkio_group_stats_cpu *stats_cpu;
	u64 val = 0, tval;

	if (pd->stats_cpu == NULL)
		return val;

	for_each_possible_cpu(cpu) {
		unsigned int start;
		stats_cpu = per_cpu_ptr(pd->stats_cpu, cpu);

		do {
			start = u64_stats_fetch_begin(&stats_cpu->syncp);
			if (type == BLKIO_STAT_CPU_SECTORS)
				tval = stats_cpu->sectors;
			else
				tval = stats_cpu->stat_arr_cpu[type][sub_type];
		} while(u64_stats_fetch_retry(&stats_cpu->syncp, start));

		val += tval;
	}

	return val;
}

static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
				   struct cgroup_map_cb *cb, const char *dname,
				   enum stat_type_cpu type)
{
	uint64_t disk_total, val;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_CPU_SECTORS) {
		val = blkio_read_stat_cpu(blkg, plid, type, 0);
		blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
		cb->fill(cb, key_str, val);
		return val;
	}

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
				   false);
		val = blkio_read_stat_cpu(blkg, plid, type, sub_type);
		cb->fill(cb, key_str, val);
	}

	disk_total = blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_READ) +
		blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_WRITE);

	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
			   false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
			       struct cgroup_map_cb *cb, const char *dname,
			       enum stat_type type)
{
	struct blkio_group_stats *stats = &blkg->pd[plid]->stats;
	uint64_t v = 0, disk_total = 0;
	char key_str[MAX_KEY_LEN];
	unsigned int sync_start;
	int st;

	if (type >= BLKIO_STAT_ARR_NR) {
		do {
			sync_start = u64_stats_fetch_begin(&stats->syncp);
			switch (type) {
			case BLKIO_STAT_TIME:
				v = stats->time;
				break;
#ifdef CONFIG_DEBUG_BLK_CGROUP
			case BLKIO_STAT_UNACCOUNTED_TIME:
				v = stats->unaccounted_time;
				break;
			case BLKIO_STAT_AVG_QUEUE_SIZE: {
				uint64_t samples = stats->avg_queue_size_samples;

				if (samples) {
					v = stats->avg_queue_size_sum;
					do_div(v, samples);
				}
				break;
			}
			case BLKIO_STAT_IDLE_TIME:
				v = stats->idle_time;
				break;
			case BLKIO_STAT_EMPTY_TIME:
				v = stats->empty_time;
				break;
			case BLKIO_STAT_DEQUEUE:
				v = stats->dequeue;
				break;
			case BLKIO_STAT_GROUP_WAIT_TIME:
				v = stats->group_wait_time;
				break;
#endif
			default:
				WARN_ON_ONCE(1);
			}
		} while (u64_stats_fetch_retry(&stats->syncp, sync_start));

		blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
		cb->fill(cb, key_str, v);
		return v;
	}

	for (st = BLKIO_STAT_READ; st < BLKIO_STAT_TOTAL; st++) {
		do {
			sync_start = u64_stats_fetch_begin(&stats->syncp);
			v = stats->stat_arr[type][st];
		} while (u64_stats_fetch_retry(&stats->syncp, sync_start));

		blkio_get_key_name(st, dname, key_str, MAX_KEY_LEN, false);
		cb->fill(cb, key_str, v);
		if (st == BLKIO_STAT_READ || st == BLKIO_STAT_WRITE)
			disk_total += v;
	}

	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
			   false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

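/*
 * Parse and apply a per-device configuration string written to one of the
 * policy files.  The expected format is "<major>:<minor> <value>", e.g.
 * (assuming 8:16 is the target disk):
 *
 *	echo "8:16 500" > blkio.weight_device
 *	echo "8:16 1048576" > blkio.throttle.read_bps_device
 *
 * A value of 0 clears the per-device rule: the weight falls back to the
 * cgroup-wide blkio.weight and throttling limits are removed.
 */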
static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
				      int fileid, struct blkio_cgroup *blkcg)
{
	struct gendisk *disk = NULL;
	struct blkio_group *blkg = NULL;
	struct blkg_policy_data *pd;
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	unsigned long major, minor;
	int i = 0, ret = -EINVAL;
	int part;
	dev_t dev;
	u64 temp;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Prevent from inputting too many things */
		if (i == 3)
			break;
	}

	if (i != 2)
		goto out;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		goto out;

	minor_s = s[0];
	if (!minor_s)
		goto out;

	if (strict_strtoul(major_s, 10, &major))
		goto out;

	if (strict_strtoul(minor_s, 10, &minor))
		goto out;

	dev = MKDEV(major, minor);

	if (strict_strtoull(s[1], 10, &temp))
		goto out;

	disk = get_gendisk(dev, &part);
	if (!disk || part)
		goto out;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto out_unlock;
	}

	pd = blkg->pd[plid];

	switch (plid) {
	case BLKIO_POLICY_PROP:
		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
		     temp > BLKIO_WEIGHT_MAX)
			goto out_unlock;

		pd->conf.weight = temp;
		blkio_update_group_weight(blkg, plid, temp ?: blkcg->weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch(fileid) {
		case BLKIO_THROTL_read_bps_device:
			pd->conf.bps[READ] = temp;
			blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_write_bps_device:
			pd->conf.bps[WRITE] = temp;
			blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_read_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out_unlock;
			pd->conf.iops[READ] = temp;
			blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_write_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out_unlock;
			pd->conf.iops[WRITE] = temp;
			blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
			break;
		}
		break;
	default:
		BUG();
	}
	ret = 0;
out_unlock:
	rcu_read_unlock();
out:
	put_disk(disk);

	/*
	 * If queue was bypassing, we should retry.  Do so after a short
	 * msleep().  It isn't strictly necessary but queue can be
	 * bypassing for some time and it's always nice to avoid busy
	 * looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		return restart_syscall();
	}
	return ret;
}

static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
			      const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg);
	kfree(buf);
	return ret;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
				   struct seq_file *m)
{
	int plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);
	struct blkg_policy_data *pd = blkg->pd[plid];
	const char *dname = blkg_dev_name(blkg);
	int rw = WRITE;

	if (!dname)
		return;

	switch (plid) {
		case BLKIO_POLICY_PROP:
			if (pd->conf.weight)
				seq_printf(m, "%s\t%u\n",
					   dname, pd->conf.weight);
			break;
		case BLKIO_POLICY_THROTL:
			switch (fileid) {
			case BLKIO_THROTL_read_bps_device:
				rw = READ;
			case BLKIO_THROTL_write_bps_device:
				if (pd->conf.bps[rw])
					seq_printf(m, "%s\t%llu\n",
						   dname, pd->conf.bps[rw]);
				break;
			case BLKIO_THROTL_read_iops_device:
				rw = READ;
			case BLKIO_THROTL_write_iops_device:
				if (pd->conf.iops[rw])
					seq_printf(m, "%s\t%u\n",
						   dname, pd->conf.iops[rw]);
				break;
			}
			break;
		default:
			BUG();
	}
}

/* cgroup files which read their data from policy nodes end up here */
static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
			    struct seq_file *m)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		blkio_print_group_conf(cft, blkg, m);
	spin_unlock_irq(&blkcg->lock);
}

static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
				struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch(plid) {
	case BLKIO_POLICY_PROP:
		switch(name) {
		case BLKIO_PROP_weight_device:
			blkio_read_conf(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch(name){
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			blkio_read_conf(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
		struct cftype *cft, struct cgroup_map_cb *cb,
		enum stat_type type, bool show_total, bool pcpu)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	uint64_t cgroup_total = 0;

	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		const char *dname = blkg_dev_name(blkg);
		int plid = BLKIOFILE_POLICY(cft->private);

		if (!dname)
			continue;
		if (pcpu)
			cgroup_total += blkio_get_stat_cpu(blkg, plid,
							   cb, dname, type);
		else
			cgroup_total += blkio_get_stat(blkg, plid,
						       cb, dname, type);
	}
	if (show_total)
		cb->fill(cb, "Total", cgroup_total);

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

/* All map kind of cgroup file get serviced by this function */
static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
				struct cgroup_map_cb *cb)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch(plid) {
	case BLKIO_POLICY_PROP:
		switch(name) {
		case BLKIO_PROP_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_TIME, 0, 0);
		case BLKIO_PROP_sectors:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SECTORS, 0, 1);
		case BLKIO_PROP_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_PROP_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		case BLKIO_PROP_io_service_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SERVICE_TIME, 1, 0);
		case BLKIO_PROP_io_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_WAIT_TIME, 1, 0);
		case BLKIO_PROP_io_merged:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_MERGED, 1, 0);
		case BLKIO_PROP_io_queued:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_QUEUED, 1, 0);
#ifdef CONFIG_DEBUG_BLK_CGROUP
		case BLKIO_PROP_unaccounted_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
		case BLKIO_PROP_dequeue:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_DEQUEUE, 0, 0);
		case BLKIO_PROP_avg_queue_size:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
		case BLKIO_PROP_group_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
		case BLKIO_PROP_idle_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_IDLE_TIME, 0, 0);
		case BLKIO_PROP_empty_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_EMPTY_TIME, 0, 0);
#endif
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch(name){
		case BLKIO_THROTL_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_THROTL_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

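/*
 * Writing blkio.weight updates the cgroup-wide default weight and pushes
 * it to every group that has no per-device override (pd->conf.weight == 0).
 * Values outside [BLKIO_WEIGHT_MIN, BLKIO_WEIGHT_MAX] are rejected.
 */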
static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkg_policy_data *pd = blkg->pd[plid];

		if (!pd->conf.weight)
			blkio_update_group_weight(blkg, plid, blkcg->weight);
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch(plid) {
	case BLKIO_POLICY_PROP:
		switch(name) {
		case BLKIO_PROP_weight:
			return (u64)blkcg->weight;
		}
		break;
	default:
		BUG();
	}
	return 0;
}

static int
blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch(plid) {
	case BLKIO_POLICY_PROP:
		switch(name) {
		case BLKIO_PROP_weight:
			return blkio_weight_write(blkcg, plid, val);
		}
		break;
	default:
		BUG();
	}

	return 0;
}

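/*
 * Control and stat files exposed by the blkio controller.  They show up as
 * blkio.<name> (e.g. blkio.weight, blkio.weight_device) and, when
 * CONFIG_BLK_DEV_THROTTLING is enabled, blkio.throttle.*.  Map-style stat
 * files are serviced by blkiocg_file_read_map(); per-device configuration
 * files by blkiocg_file_read() and blkiocg_file_write().
 */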
struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight),
		.read_u64 = blkiocg_file_read_u64,
		.write_u64 = blkiocg_file_write_u64,
	},
	{
		.name = "time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "sectors",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_sectors),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_merged",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_merged),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_queued",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_queued),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_BLK_DEV_THROTTLING
	{
		.name = "throttle.read_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.write_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.read_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.write_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "throttle.io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
#endif /* CONFIG_BLK_DEV_THROTTLING */

#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_avg_queue_size),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "group_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_group_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "idle_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_idle_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "empty_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_empty_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "dequeue",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_dequeue),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "unaccounted_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_unaccounted_time),
		.read_map = blkiocg_file_read_map,
	},
#endif
	{ }	/* terminate */
};

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and responsible
 * for shooting down all blkgs associated with @cgroup.  blkgs should be
 * removed while holding both q and blkcg locks.  As blkcg lock is nested
 * inside q lock, this function performs reverse double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkio_group, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkiocg_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
{
	static atomic64_t id_seq = ATOMIC64_INIT(0);
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	int ret;

	might_sleep();

	ret = blk_throtl_init(q);
	if (ret)
		return ret;

	mutex_lock(&all_q_mutex);
	INIT_LIST_HEAD(&q->all_q_node);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blkg_destroy_all(q, true);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

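/*
 * blkcg_bypass_start/end() bracket policy (un)registration: every known
 * request_queue is put into bypass mode and its non-root blkgs are
 * destroyed, so the blkio_policy[] table can be changed without any live
 * group still using the policy that is going away.
 */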
static void blkcg_bypass_start(void)
	__acquires(&all_q_mutex)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);

	list_for_each_entry(q, &all_q_list, all_q_node) {
		blk_queue_bypass_start(q);
		blkg_destroy_all(q, false);
	}
}

static void blkcg_bypass_end(void)
	__releases(&all_q_mutex)
{
	struct request_queue *q;

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_queue_bypass_end(q);

	mutex_unlock(&all_q_mutex);
}

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.subsys_id = blkio_subsys_id,
	.base_cftypes = blkio_files,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

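/*
 * A policy such as blk-throttle makes itself known by registering a
 * blkio_policy_type, roughly along these lines (the names below are
 * illustrative, not the exact blk-throttle ones):
 *
 *	static struct blkio_policy_type my_policy = {
 *		.ops		= { .blkio_init_group_fn = my_init_group_fn },
 *		.plid		= BLKIO_POLICY_THROTL,
 *		.pdata_size	= sizeof(struct my_group_data),
 *	};
 *
 *	blkio_policy_register(&my_policy);
 *
 * Registration bypasses all queues, installs the policy in blkio_policy[]
 * and refreshes the root blkgs' policy data via update_root_blkg_pd();
 * blkio_policy_unregister() reverses it.
 */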
void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid]);
	blkio_policy[blkiop->plid] = blkiop;
	list_add_tail(&blkiop->list, &blkio_list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
	blkio_policy[blkiop->plid] = NULL;
	list_del_init(&blkiop->list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);