/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* List of groups pending per cpu stats allocation */
static DEFINE_SPINLOCK(alloc_list_lock);
static LIST_HEAD(alloc_list);

static void blkio_stat_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct cgroup_taskset *);
static int blkiocg_pre_destroy(struct cgroup_subsys *, struct cgroup *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
/* What policy owns the file, proportional or throttle */
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
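/*
 * For illustration: the "throttle.read_bps_device" cftype below stores
 * BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, BLKIO_THROTL_read_bps_device) in
 * cft->private; its handlers then recover the owning policy with
 * BLKIOFILE_POLICY() and the per-file attribute with BLKIOFILE_ATTR().
 */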

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
	.subsys_id = blkio_subsys_id,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);

static inline void blkio_update_group_weight(struct blkio_group *blkg,
					     int plid, unsigned int weight)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;
		if (blkiop->ops.blkio_update_group_weight_fn)
			blkiop->ops.blkio_update_group_weight_fn(blkg->q,
							blkg, weight);
	}
}

static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
					  u64 bps, int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;

		if (fileid == BLKIO_THROTL_read_bps_device
		    && blkiop->ops.blkio_update_group_read_bps_fn)
			blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
								blkg, bps);

		if (fileid == BLKIO_THROTL_write_bps_device
		    && blkiop->ops.blkio_update_group_write_bps_fn)
			blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
								blkg, bps);
	}
}

static inline void blkio_update_group_iops(struct blkio_group *blkg,
					   int plid, unsigned int iops,
					   int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;

		if (fileid == BLKIO_THROTL_read_iops_device
		    && blkiop->ops.blkio_update_group_read_iops_fn)
			blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
								blkg, iops);

		if (fileid == BLKIO_THROTL_write_iops_device
		    && blkiop->ops.blkio_update_group_write_iops_fn)
			blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
								blkg, iops);
	}
}

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with queue_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
				bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}

/*
 * Decrements the appropriate stat variable if non-zero depending on the
 * request type. Panics on value being zero.
 * This should be called with the queue_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the queue_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_policy_type *pol,
					    struct blkio_group *curr_blkg)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];

	if (blkio_blkg_waiting(&pd->stats))
		return;
	if (blkg == curr_blkg)
		return;
	pd->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&pd->stats);
}

/* This should be called with the queue_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
					struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);
	BUG_ON(blkio_blkg_idling(stats));

	stats->start_idle_time = sched_clock();
	blkio_mark_blkg_idling(stats);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	if (blkio_blkg_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time)) {
			u64_stats_update_begin(&stats->syncp);
			stats->idle_time += now - stats->start_idle_time;
			u64_stats_update_end(&stats->syncp);
		}
		blkio_clear_blkg_idling(stats);
	}
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
					 struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_set_start_empty_time(struct blkio_group *blkg,
				  struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE])
		return;

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (blkio_blkg_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  struct blkio_policy_type *pol,
				  unsigned long dequeue)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];

	lockdep_assert_held(blkg->q->queue_lock);

	pd->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_policy_type *pol,
					struct blkio_group *curr_blkg) { }
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { }
#endif

void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_policy_type *pol,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_QUEUED], 1, direction, sync);
	blkio_end_empty_time(stats);
	u64_stats_update_end(&stats->syncp);

	blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    bool direction, bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	blkio_check_and_dec_stat(stats->stat_arr[BLKIO_STAT_QUEUED], direction,
				 sync);
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   unsigned long time,
				   unsigned long unaccounted_time)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	stats->time += time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	stats->unaccounted_time += unaccounted_time;
#endif
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

/*
 * should be called under rcu read lock or queue lock to make sure blkg pointer
 * is valid.
 */
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   uint64_t bytes, bool direction, bool sync)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/* If per cpu stats are not allocated yet, don't do any accounting. */
	if (pd->stats_cpu == NULL)
		return;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(pd->stats_cpu);

	u64_stats_update_begin(&stats_cpu->syncp);
	stats_cpu->sectors += bytes >> 9;
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
			1, direction, sync);
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
			bytes, direction, sync);
	u64_stats_update_end(&stats_cpu->syncp);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

void blkiocg_update_completion_stats(struct blkio_group *blkg,
				     struct blkio_policy_type *pol,
				     uint64_t start_time,
				     uint64_t io_start_time, bool direction,
				     bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
	unsigned long long now = sched_clock();

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

/* Merged stats are per group, updated under the queue_lock. */
void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    bool direction, bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_MERGED], 1, direction, sync);
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

/*
 * Worker for allocating per cpu stat for blk groups. This is scheduled on
 * the system_nrt_wq once there are some groups on the alloc_list waiting
 * for allocation.
 */
static void blkio_stat_alloc_fn(struct work_struct *work)
{
	static void *pcpu_stats[BLKIO_NR_POLICIES];
	struct delayed_work *dwork = to_delayed_work(work);
	struct blkio_group *blkg;
	int i;
	bool empty = false;

alloc_stats:
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		if (pcpu_stats[i] != NULL)
			continue;

		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);

		/* Allocation failed. Try again after some time. */
		if (pcpu_stats[i] == NULL) {
			queue_delayed_work(system_nrt_wq, dwork,
						msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&blkio_list_lock);
	spin_lock(&alloc_list_lock);

	/* cgroup got deleted or queue exited. */
	if (!list_empty(&alloc_list)) {
		blkg = list_first_entry(&alloc_list, struct blkio_group,
						alloc_node);
		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
			struct blkg_policy_data *pd = blkg->pd[i];

			if (blkio_policy[i] && pd && !pd->stats_cpu)
				swap(pd->stats_cpu, pcpu_stats[i]);
		}

		list_del_init(&blkg->alloc_node);
	}

	empty = list_empty(&alloc_list);

	spin_unlock(&alloc_list_lock);
	spin_unlock_irq(&blkio_list_lock);

	if (!empty)
		goto alloc_stats;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkg_policy_data *pd = blkg->pd[i];

		if (pd) {
			free_percpu(pd->stats_cpu);
			kfree(pd);
		}
	}

	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	INIT_LIST_HEAD(&blkg->alloc_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}

struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       enum blkio_policy_id plid,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);

	spin_lock(&alloc_list_lock);
	list_add(&blkg->alloc_node, &alloc_list);
	/* Queue per cpu stat allocation from worker thread. */
	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
	spin_unlock(&alloc_list_lock);
out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	spin_lock(&alloc_list_lock);
	list_del_init(&blkg->alloc_node);
	spin_unlock(&alloc_list_lock);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down.  This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[plid]);
	blkg->pd[plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	WARN_ON_ONCE(!pd->stats_cpu);

	blkg->pd[plid] = pd;
	pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under rcu allows access only to
	 * values local to groups like group stats and group rate limits
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;

	if (pd->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		sc->sectors = 0;
		memset(sc->stat_arr_cpu, 0, sizeof(sc->stat_arr_cpu));
	}
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct blkio_group *blkg;
	struct hlist_node *n;
	int i;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkio_policy_type *pol;

		list_for_each_entry(pol, &blkio_list, list) {
			struct blkg_policy_data *pd = blkg->pd[pol->plid];
			struct blkio_group_stats *stats = &pd->stats;

			/* queued stats shouldn't be cleared */
			for (i = 0; i < ARRAY_SIZE(stats->stat_arr); i++)
				if (i != BLKIO_STAT_QUEUED)
					memset(stats->stat_arr[i], 0,
					       sizeof(stats->stat_arr[i]));
			stats->time = 0;
#ifdef CONFIG_DEBUG_BLK_CGROUP
			memset((void *)stats + BLKG_STATS_DEBUG_CLEAR_START, 0,
			       BLKG_STATS_DEBUG_CLEAR_SIZE);
#endif
			blkio_reset_stats_cpu(blkg, pol->plid);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
			       char *str, int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%s", dname);
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}

static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid,
			enum stat_type_cpu type, enum stat_sub_type sub_type)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;
	struct blkio_group_stats_cpu *stats_cpu;
	u64 val = 0, tval;

	if (pd->stats_cpu == NULL)
		return val;

	for_each_possible_cpu(cpu) {
		unsigned int start;
		stats_cpu = per_cpu_ptr(pd->stats_cpu, cpu);

		do {
			start = u64_stats_fetch_begin(&stats_cpu->syncp);
			if (type == BLKIO_STAT_CPU_SECTORS)
				tval = stats_cpu->sectors;
			else
				tval = stats_cpu->stat_arr_cpu[type][sub_type];
		} while(u64_stats_fetch_retry(&stats_cpu->syncp, start));

		val += tval;
885 886 887 888 889
	}

	return val;
}

static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
				   struct cgroup_map_cb *cb, const char *dname,
				   enum stat_type_cpu type)
{
	uint64_t disk_total, val;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_CPU_SECTORS) {
		val = blkio_read_stat_cpu(blkg, plid, type, 0);
		blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
		cb->fill(cb, key_str, val);
		return val;
	}

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
				   false);
		val = blkio_read_stat_cpu(blkg, plid, type, sub_type);
		cb->fill(cb, key_str, val);
	}

	disk_total = blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_READ) +
		blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_WRITE);

	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
			   false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
			       struct cgroup_map_cb *cb, const char *dname,
			       enum stat_type type)
{
	struct blkio_group_stats *stats = &blkg->pd[plid]->stats;
	uint64_t v = 0, disk_total = 0;
	char key_str[MAX_KEY_LEN];
	unsigned int sync_start;
	int st;

	if (type >= BLKIO_STAT_ARR_NR) {
		do {
			sync_start = u64_stats_fetch_begin(&stats->syncp);
			switch (type) {
			case BLKIO_STAT_TIME:
				v = stats->time;
				break;
#ifdef CONFIG_DEBUG_BLK_CGROUP
			case BLKIO_STAT_UNACCOUNTED_TIME:
				v = stats->unaccounted_time;
				break;
			case BLKIO_STAT_AVG_QUEUE_SIZE: {
				uint64_t samples = stats->avg_queue_size_samples;

				if (samples) {
					v = stats->avg_queue_size_sum;
					do_div(v, samples);
				}
				break;
			}
			case BLKIO_STAT_IDLE_TIME:
				v = stats->idle_time;
				break;
			case BLKIO_STAT_EMPTY_TIME:
				v = stats->empty_time;
				break;
			case BLKIO_STAT_DEQUEUE:
				v = stats->dequeue;
				break;
			case BLKIO_STAT_GROUP_WAIT_TIME:
				v = stats->group_wait_time;
				break;
#endif
			default:
				WARN_ON_ONCE(1);
			}
		} while (u64_stats_fetch_retry(&stats->syncp, sync_start));

		blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
		cb->fill(cb, key_str, v);
		return v;
	}

	for (st = BLKIO_STAT_READ; st < BLKIO_STAT_TOTAL; st++) {
		do {
			sync_start = u64_stats_fetch_begin(&stats->syncp);
			v = stats->stat_arr[type][st];
		} while (u64_stats_fetch_retry(&stats->syncp, sync_start));

		blkio_get_key_name(st, dname, key_str, MAX_KEY_LEN, false);
		cb->fill(cb, key_str, v);
		if (st == BLKIO_STAT_READ || st == BLKIO_STAT_WRITE)
			disk_total += v;
	}

	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
			   false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
				      int fileid, struct blkio_cgroup *blkcg)
{
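	/*
	 * Illustration only: each rule is written as "<major>:<minor> <value>",
	 * e.g. "8:16 1048576" into throttle.read_bps_device caps reads on
	 * device 8:16 at roughly 1MB/s, while "8:16 300" into weight_device
	 * sets a per-device proportional weight (device numbers are examples).
	 */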
	struct gendisk *disk = NULL;
	struct blkio_group *blkg = NULL;
	struct blkg_policy_data *pd;
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	unsigned long major, minor;
	int i = 0, ret = -EINVAL;
	int part;
	dev_t dev;
	u64 temp;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Prevent inputting too many things */
		if (i == 3)
			break;
	}

	if (i != 2)
		goto out;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		goto out;

	minor_s = s[0];
	if (!minor_s)
		goto out;

	if (strict_strtoul(major_s, 10, &major))
		goto out;

	if (strict_strtoul(minor_s, 10, &minor))
		goto out;

	dev = MKDEV(major, minor);

	if (strict_strtoull(s[1], 10, &temp))
		goto out;

	disk = get_gendisk(dev, &part);
	if (!disk || part)
		goto out;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto out_unlock;
	}

	pd = blkg->pd[plid];

	switch (plid) {
	case BLKIO_POLICY_PROP:
		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
		     temp > BLKIO_WEIGHT_MAX)
			goto out_unlock;

		pd->conf.weight = temp;
		blkio_update_group_weight(blkg, plid, temp ?: blkcg->weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch(fileid) {
		case BLKIO_THROTL_read_bps_device:
			pd->conf.bps[READ] = temp;
			blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_write_bps_device:
			pd->conf.bps[WRITE] = temp;
			blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_read_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out_unlock;
			pd->conf.iops[READ] = temp;
			blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_write_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out_unlock;
			pd->conf.iops[WRITE] = temp;
			blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
			break;
		}
		break;
	default:
		BUG();
	}
	ret = 0;
out_unlock:
	rcu_read_unlock();
out:
	put_disk(disk);

	/*
	 * If queue was bypassing, we should retry.  Do so after a short
	 * msleep().  It isn't strictly necessary but queue can be
	 * bypassing for some time and it's always nice to avoid busy
	 * looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		return restart_syscall();
	}
	return ret;
}

static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
 				       const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg);
	kfree(buf);
	return ret;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
				   struct seq_file *m)
{
	int plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);
	struct blkg_policy_data *pd = blkg->pd[plid];
	const char *dname = blkg_dev_name(blkg);
	int rw = WRITE;

	if (!dname)
		return;

	switch (plid) {
		case BLKIO_POLICY_PROP:
			if (pd->conf.weight)
				seq_printf(m, "%s\t%u\n",
					   dname, pd->conf.weight);
			break;
		case BLKIO_POLICY_THROTL:
			switch (fileid) {
			case BLKIO_THROTL_read_bps_device:
				rw = READ;
			case BLKIO_THROTL_write_bps_device:
				if (pd->conf.bps[rw])
					seq_printf(m, "%s\t%llu\n",
						   dname, pd->conf.bps[rw]);
				break;
			case BLKIO_THROTL_read_iops_device:
				rw = READ;
			case BLKIO_THROTL_write_iops_device:
				if (pd->conf.iops[rw])
					seq_printf(m, "%s\t%u\n",
						   dname, pd->conf.iops[rw]);
				break;
			}
			break;
		default:
			BUG();
	}
}

/* cgroup files which read their data from policy nodes end up here */
static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
			    struct seq_file *m)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		blkio_print_group_conf(cft, blkg, m);
	spin_unlock_irq(&blkcg->lock);
}

static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
				struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch(plid) {
	case BLKIO_POLICY_PROP:
		switch(name) {
		case BLKIO_PROP_weight_device:
			blkio_read_conf(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch(name){
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			blkio_read_conf(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
		struct cftype *cft, struct cgroup_map_cb *cb,
		enum stat_type type, bool show_total, bool pcpu)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	uint64_t cgroup_total = 0;

	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		const char *dname = blkg_dev_name(blkg);
		int plid = BLKIOFILE_POLICY(cft->private);

		if (!dname)
			continue;
		if (pcpu)
			cgroup_total += blkio_get_stat_cpu(blkg, plid,
							   cb, dname, type);
		else
			cgroup_total += blkio_get_stat(blkg, plid,
						       cb, dname, type);
	}
	if (show_total)
		cb->fill(cb, "Total", cgroup_total);

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

/* All map kind of cgroup file get serviced by this function */
static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
				struct cgroup_map_cb *cb)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch(plid) {
	case BLKIO_POLICY_PROP:
		switch(name) {
		case BLKIO_PROP_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1278
						BLKIO_STAT_TIME, 0, 0);
1279 1280
		case BLKIO_PROP_sectors:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1281
						BLKIO_STAT_CPU_SECTORS, 0, 1);
1282 1283
		case BLKIO_PROP_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1284
					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1285 1286
		case BLKIO_PROP_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1287
						BLKIO_STAT_CPU_SERVICED, 1, 1);
1288 1289
		case BLKIO_PROP_io_service_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1290
						BLKIO_STAT_SERVICE_TIME, 1, 0);
1291 1292
		case BLKIO_PROP_io_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1293
						BLKIO_STAT_WAIT_TIME, 1, 0);
1294 1295
		case BLKIO_PROP_io_merged:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1296
						BLKIO_STAT_MERGED, 1, 0);
1297 1298
		case BLKIO_PROP_io_queued:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1299
						BLKIO_STAT_QUEUED, 1, 0);
1300
#ifdef CONFIG_DEBUG_BLK_CGROUP
1301 1302
		case BLKIO_PROP_unaccounted_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1303
					BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1304 1305
		case BLKIO_PROP_dequeue:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1306
						BLKIO_STAT_DEQUEUE, 0, 0);
1307 1308
		case BLKIO_PROP_avg_queue_size:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1309
					BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1310 1311
		case BLKIO_PROP_group_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1312
					BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1313 1314
		case BLKIO_PROP_idle_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1315
						BLKIO_STAT_IDLE_TIME, 0, 0);
1316 1317
		case BLKIO_PROP_empty_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
1318
						BLKIO_STAT_EMPTY_TIME, 0, 0);
1319 1320 1321 1322 1323
#endif
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch(name){
		case BLKIO_THROTL_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_THROTL_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkg_policy_data *pd = blkg->pd[plid];

		if (!pd->conf.weight)
			blkio_update_group_weight(blkg, plid, blkcg->weight);
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch(plid) {
	case BLKIO_POLICY_PROP:
		switch(name) {
		case BLKIO_PROP_weight:
			return (u64)blkcg->weight;
		}
		break;
	default:
		BUG();
	}
	return 0;
}

static int
blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch(plid) {
	case BLKIO_POLICY_PROP:
		switch(name) {
		case BLKIO_PROP_weight:
			return blkio_weight_write(blkcg, plid, val);
		}
		break;
	default:
		BUG();
	}

	return 0;
}

struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight),
		.read_u64 = blkiocg_file_read_u64,
		.write_u64 = blkiocg_file_write_u64,
	},
	{
		.name = "time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "sectors",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_sectors),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_merged",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_merged),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_queued",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_queued),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_BLK_DEV_THROTTLING
	{
		.name = "throttle.read_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.write_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.read_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.write_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "throttle.io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
#endif /* CONFIG_BLK_DEV_THROTTLING */

#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_avg_queue_size),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "group_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_group_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "idle_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_idle_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "empty_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_empty_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "dequeue",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_dequeue),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "unaccounted_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_unaccounted_time),
		.read_map = blkiocg_file_read_map,
	},
#endif
};

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @subsys: cgroup subsys
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and responsible
 * for shooting down all blkgs associated with @cgroup.  blkgs should be
 * removed while holding both q and blkcg locks.  As blkcg lock is nested
 * inside q lock, this function performs reverse double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
			       struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkio_group, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	static atomic64_t id_seq = ATOMIC64_INIT(0);
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	int ret;

	might_sleep();

	ret = blk_throtl_init(q);
	if (ret)
		return ret;

	mutex_lock(&all_q_mutex);
	INIT_LIST_HEAD(&q->all_q_node);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blkg_destroy_all(q, true);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkcg_bypass_start(void)
	__acquires(&all_q_mutex)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);

	list_for_each_entry(q, &all_q_list, all_q_node) {
		blk_queue_bypass_start(q);
		blkg_destroy_all(q, false);
	}
}

static void blkcg_bypass_end(void)
	__releases(&all_q_mutex)
{
	struct request_queue *q;

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_queue_bypass_end(q);

	mutex_unlock(&all_q_mutex);
}

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid]);
	blkio_policy[blkiop->plid] = blkiop;
	list_add_tail(&blkiop->list, &blkio_list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
1773 1774
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
	blkio_policy[blkiop->plid] = NULL;
	list_del_init(&blkiop->list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);