/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* List of groups pending per cpu stats allocation */
static DEFINE_SPINLOCK(alloc_list_lock);
static LIST_HEAD(alloc_list);

static void blkio_stat_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
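/*
 * Typical use (illustrative sketch only, not code from this file): I/O
 * submission paths resolve the owning group under RCU, e.g.
 *
 *	rcu_read_lock();
 *	blkcg = bio_blkio_cgroup(bio);
 *	blkg = blkg_lookup(blkcg, q);
 *	...
 *	rcu_read_unlock();
 */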

/*
 * Worker for allocating per cpu stat for blk groups. This is scheduled on
 * the system_nrt_wq once there are some groups on the alloc_list waiting
 * for allocation.
 */
static void blkio_stat_alloc_fn(struct work_struct *work)
{
	static void *pcpu_stats[BLKIO_NR_POLICIES];
	struct delayed_work *dwork = to_delayed_work(work);
	struct blkio_group *blkg;
	int i;
	bool empty = false;

alloc_stats:
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		if (pcpu_stats[i] != NULL)
			continue;

		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);

		/* Allocation failed. Try again after some time. */
		if (pcpu_stats[i] == NULL) {
			queue_delayed_work(system_nrt_wq, dwork,
						msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&blkio_list_lock);
	spin_lock(&alloc_list_lock);

	/* cgroup got deleted or queue exited. */
	if (!list_empty(&alloc_list)) {
		blkg = list_first_entry(&alloc_list, struct blkio_group,
						alloc_node);
		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
			struct blkg_policy_data *pd = blkg->pd[i];

			if (blkio_policy[i] && pd && !pd->stats_cpu)
				swap(pd->stats_cpu, pcpu_stats[i]);
		}

		list_del_init(&blkg->alloc_node);
	}

	empty = list_empty(&alloc_list);

	spin_unlock(&alloc_list_lock);
	spin_unlock_irq(&blkio_list_lock);

	if (!empty)
		goto alloc_stats;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkg_policy_data *pd = blkg->pd[i];

		if (pd) {
			free_percpu(pd->stats_cpu);
			kfree(pd);
		}
	}

	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	INIT_LIST_HEAD(&blkg->alloc_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}

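/*
 * blkg_lookup_create - look up a blkg for @blkcg + @q, creating it if it
 * doesn't exist yet.  Must be called with @q's queue_lock held and under
 * rcu_read_lock().  Returns an ERR_PTR() if the queue is bypassing
 * (unless @for_root) or if allocation fails; an -EBUSY return means the
 * caller may retry once the queue leaves bypass mode.
 */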
struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);

	spin_lock(&alloc_list_lock);
	list_add(&blkg->alloc_node, &alloc_list);
	/* Queue per cpu stat allocation from worker thread. */
	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
	spin_unlock(&alloc_list_lock);

out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

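/*
 * blkg_destroy - unlink @blkg from its request_queue and blkcg and drop
 * the reference taken at creation time.  Expects both q->queue_lock and
 * blkcg->lock to be held; the group itself is freed later via RCU once
 * the last reference is put.
 */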
static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	spin_lock(&alloc_list_lock);
	list_del_init(&blkg->alloc_node);
	spin_unlock(&alloc_list_lock);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down.  This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[plid]);
	blkg->pd[plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	WARN_ON_ONCE(!pd->stats_cpu);

	blkg->pd[plid] = pd;
	pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in an RCU manner.  But having an RCU read lock
	 * does not mean that one can access all the fields of blkg and
	 * assume these are valid.  For example, don't try to follow
	 * throtl_data and request queue links.
	 *
	 * Having a reference to blkg under RCU allows access only to
	 * values local to groups, like group stats and group rate limits.
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;

	if (pd->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
		blkg_stat_reset(&sc->sectors);
	}
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkio_policy_type *pol;

		list_for_each_entry(pol, &blkio_list, list) {
			struct blkg_policy_data *pd = blkg->pd[pol->plid];
			struct blkio_group_stats *stats = &pd->stats;

			/* queued stats shouldn't be cleared */
			blkg_rwstat_reset(&stats->merged);
			blkg_rwstat_reset(&stats->service_time);
			blkg_rwstat_reset(&stats->wait_time);
			blkg_stat_reset(&stats->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
			blkg_stat_reset(&stats->unaccounted_time);
			blkg_stat_reset(&stats->avg_queue_size_sum);
			blkg_stat_reset(&stats->avg_queue_size_samples);
			blkg_stat_reset(&stats->dequeue);
			blkg_stat_reset(&stats->group_wait_time);
			blkg_stat_reset(&stats->idle_time);
			blkg_stat_reset(&stats->empty_time);
#endif
			blkio_reset_stats_cpu(blkg, pol->plid);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: whether to print the sum of the @prfill return values
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
		       u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int),
		       int pol, int data, bool show_total)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	u64 total = 0;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->pd[pol])
			total += prfill(sf, blkg->pd[pol], data);
	spin_unlock_irq(&blkcg->lock);

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
	};
	const char *dname = blkg_dev_name(pd->blkg);
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)rwstat->cnt[i]);

	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}

static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
			    int off)
{
	return __blkg_prfill_u64(sf, pd,
				 blkg_stat_read((void *)&pd->stats + off));
}

static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)&pd->stats + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/* print blkg_stat specified by BLKCG_STAT_PRIV() */
int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
		     struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), false);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_stat);

/* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
		       struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_rwstat);
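/*
 * Illustrative sketch only (the file name and stat field below are just
 * examples, not definitions from this file): a policy can expose one of
 * its blkio_group_stats fields as a per-device cgroup file by packing
 * the policy id and stat offset into cftype->private, e.g.
 *
 *	{
 *		.name = "example.io_service_time",
 *		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
 *				offsetof(struct blkio_group_stats,
 *					 service_time)),
 *		.read_seq_string = blkcg_print_rwstat,
 *	},
 */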

static u64 blkg_prfill_cpu_stat(struct seq_file *sf,
				struct blkg_policy_data *pd, int off)
{
	u64 v = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		v += blkg_stat_read((void *)sc + off);
	}

	return __blkg_prfill_u64(sf, pd, v);
}

static u64 blkg_prfill_cpu_rwstat(struct seq_file *sf,
				  struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat rwstat = { }, tmp;
	int i, cpu;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		tmp = blkg_rwstat_read((void *)sc + off);
		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			rwstat.cnt[i] += tmp.cnt[i];
	}

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/* print per-cpu blkg_stat specified by BLKCG_STAT_PRIV() */
int blkcg_print_cpu_stat(struct cgroup *cgrp, struct cftype *cft,
			 struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_stat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), false);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_cpu_stat);

/* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
			   struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_rwstat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_cpu_rwstat);

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value.  This function returns with RCU read locked and must be paired
 * with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
		   struct blkg_conf_ctx *ctx)
	__acquires(rcu)
{
	struct gendisk *disk;
	struct blkio_group *blkg;
	unsigned int major, minor;
	unsigned long long v;
	int part, ret;

	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
		return -EINVAL;

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk || part)
		return -EINVAL;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		rcu_read_unlock();
		put_disk(disk);
		/*
		 * If queue was bypassing, we should retry.  Do so after a
		 * short msleep().  It isn't strictly necessary but queue
		 * can be bypassing for some time and it's always nice to
		 * avoid busy looping.
		 */
		if (ret == -EBUSY) {
			msleep(10);
			ret = restart_syscall();
		}
		return ret;
	}

	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->v = v;
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(rcu)
{
	rcu_read_unlock();
	put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
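/*
 * Illustrative sketch only (not a function from this file): a policy's
 * cftype write handler typically brackets its update with the pair above:
 *
 *	struct blkg_conf_ctx ctx;
 *	int ret;
 *
 *	ret = blkg_conf_prep(blkcg, buf, &ctx);
 *	if (ret)
 *		return ret;
 *	... apply ctx.v to the policy data of ctx.blkg ...
 *	blkg_conf_finish(&ctx);
 */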

struct cftype blkio_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
	{ }	/* terminate */
};

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and is
 * responsible for shooting down all blkgs associated with @cgroup.
 * blkgs should be removed while holding both q and blkcg locks.  As
 * blkcg lock is nested inside q lock, this function performs reverse
 * double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkio_group, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkiocg_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
{
	static atomic64_t id_seq = ATOMIC64_INIT(0);
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	int ret;

	might_sleep();

	ret = blk_throtl_init(q);
	if (ret)
		return ret;

	mutex_lock(&all_q_mutex);
	INIT_LIST_HEAD(&q->all_q_node);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blkg_destroy_all(q, true);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means of supporting
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

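/*
 * blkcg_bypass_start/end - put every registered queue into bypass mode and
 * shoot down its non-root blkgs so that the policy (un)registration below
 * can update policy state without racing against blkg creation.  The two
 * calls bracket the update and hold all_q_mutex across it.
 */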
static void blkcg_bypass_start(void)
	__acquires(&all_q_mutex)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);

	list_for_each_entry(q, &all_q_list, all_q_node) {
		blk_queue_bypass_start(q);
		blkg_destroy_all(q, false);
	}
}

static void blkcg_bypass_end(void)
	__releases(&all_q_mutex)
{
	struct request_queue *q;

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_queue_bypass_end(q);

	mutex_unlock(&all_q_mutex);
}

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.subsys_id = blkio_subsys_id,
	.base_cftypes = blkio_files,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid]);
	blkio_policy[blkiop->plid] = blkiop;
	list_add_tail(&blkiop->list, &blkio_list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();

	if (blkiop->cftypes)
		WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
}
EXPORT_SYMBOL_GPL(blkio_policy_register);
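/*
 * Illustrative sketch only (the policy name, functions and pdata type are
 * hypothetical): a blkcg policy registers itself from its init code with
 * a statically defined blkio_policy_type, e.g.
 *
 *	static struct blkio_policy_type blkio_policy_example = {
 *		.ops		= {
 *			.blkio_init_group_fn	= example_init_blkio_group,
 *		},
 *		.plid		= BLKIO_POLICY_THROTL,
 *		.pdata_size	= sizeof(struct example_grp),
 *		.cftypes	= example_files,
 *	};
 *
 *	blkio_policy_register(&blkio_policy_example);
 */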

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	if (blkiop->cftypes)
		cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
	blkio_policy[blkiop->plid] = NULL;
	list_del_init(&blkiop->list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);