/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* List of groups pending per cpu stats allocation */
static DEFINE_SPINLOCK(alloc_list_lock);
static LIST_HEAD(alloc_list);

static void blkio_stat_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);

/*
 * Worker for allocating per cpu stat for blk groups. This is scheduled on
 * the system_nrt_wq once there are some groups on the alloc_list waiting
 * for allocation.
 */
static void blkio_stat_alloc_fn(struct work_struct *work)
{
	static void *pcpu_stats[BLKIO_NR_POLICIES];
	struct delayed_work *dwork = to_delayed_work(work);
	struct blkio_group *blkg;
	int i;
	bool empty = false;

alloc_stats:
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		if (pcpu_stats[i] != NULL)
			continue;

		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);

		/* Allocation failed. Try again after some time. */
		if (pcpu_stats[i] == NULL) {
			queue_delayed_work(system_nrt_wq, dwork,
						msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&blkio_list_lock);
	spin_lock(&alloc_list_lock);

	/* cgroup got deleted or queue exited. */
	if (!list_empty(&alloc_list)) {
		blkg = list_first_entry(&alloc_list, struct blkio_group,
						alloc_node);
		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
			struct blkg_policy_data *pd = blkg->pd[i];

			if (blkio_policy[i] && pd && !pd->stats_cpu)
				swap(pd->stats_cpu, pcpu_stats[i]);
		}

		list_del_init(&blkg->alloc_node);
	}

	empty = list_empty(&alloc_list);

	spin_unlock(&alloc_list_lock);
	spin_unlock_irq(&blkio_list_lock);

	if (!empty)
		goto alloc_stats;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkg_policy_data *pd = blkg->pd[i];

		if (pd) {
			free_percpu(pd->stats_cpu);
			kfree(pd);
		}
	}

	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	INIT_LIST_HEAD(&blkg->alloc_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}

struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);

	spin_lock(&alloc_list_lock);
	list_add(&blkg->alloc_node, &alloc_list);
	/* Queue per cpu stat allocation from worker thread. */
	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
	spin_unlock(&alloc_list_lock);
out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
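/*
 * Usage sketch: callers are expected to hold both the RCU read lock and
 * the queue lock and to cope with an ERR_PTR() return, roughly:
 *
 *	rcu_read_lock();
 *	spin_lock_irq(q->queue_lock);
 *	blkg = blkg_lookup_create(blkcg, q, false);
 *	spin_unlock_irq(q->queue_lock);
 *	rcu_read_unlock();
 *	if (IS_ERR(blkg))
 *		return PTR_ERR(blkg);
 *
 * blkg_conf_prep() below is a real caller following this pattern.
 */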

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	spin_lock(&alloc_list_lock);
	list_del_init(&blkg->alloc_node);
	spin_unlock(&alloc_list_lock);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down.  This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[plid]);
	blkg->pd[plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	WARN_ON_ONCE(!pd->stats_cpu);

	blkg->pd[plid] = pd;
	pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access to only
	 * values local to groups like group stats and group rate limits.
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);
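/*
 * Usage sketch: as noted in __blkg_release(), the RCU read lock alone is
 * enough for readers that only touch group-local data such as stats
 * (read_group_local_stats() below is a hypothetical reader):
 *
 *	rcu_read_lock();
 *	blkg = blkg_lookup(blkcg, q);
 *	if (blkg)
 *		read_group_local_stats(blkg);
 *	rcu_read_unlock();
 *
 * Following pointers out of the group (e.g. blkg->q or throtl_data) still
 * requires proper locking or a reference on the blkg.
 */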

static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;

	if (pd->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
	}
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkio_policy_type *pol;

		list_for_each_entry(pol, &blkio_list, list) {
			struct blkg_policy_data *pd = blkg->pd[pol->plid];
			struct blkio_group_stats *stats = &pd->stats;

			/* queued stats shouldn't be cleared */
			blkg_rwstat_reset(&stats->service_bytes);
			blkg_rwstat_reset(&stats->serviced);
			blkg_rwstat_reset(&stats->merged);
			blkg_rwstat_reset(&stats->service_time);
			blkg_rwstat_reset(&stats->wait_time);
			blkg_stat_reset(&stats->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
			blkg_stat_reset(&stats->unaccounted_time);
			blkg_stat_reset(&stats->avg_queue_size_sum);
			blkg_stat_reset(&stats->avg_queue_size_samples);
			blkg_stat_reset(&stats->dequeue);
			blkg_stat_reset(&stats->group_wait_time);
			blkg_stat_reset(&stats->idle_time);
			blkg_stat_reset(&stats->empty_time);
#endif
			blkio_reset_stats_cpu(blkg, pol->plid);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
		       u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int),
		       int pol, int data, bool show_total)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	u64 total = 0;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->pd[pol])
			total += prfill(sf, blkg->pd[pol], data);
	spin_unlock_irq(&blkcg->lock);

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
	};
	const char *dname = blkg_dev_name(pd->blkg);
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)rwstat->cnt[i]);

	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}

static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
			    int off)
{
	return __blkg_prfill_u64(sf, pd,
				 blkg_stat_read((void *)&pd->stats + off));
}

static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)&pd->stats + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/* print blkg_stat specified by BLKCG_STAT_PRIV() */
int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
		     struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), false);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_stat);

/* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
		       struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_rwstat);

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value.  This function returns with RCU read locked and must be paired
 * with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
		   struct blkg_conf_ctx *ctx)
	__acquires(rcu)
{
	struct gendisk *disk;
	struct blkio_group *blkg;
	unsigned int major, minor;
	unsigned long long v;
	int part, ret;

	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
		return -EINVAL;

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk || part)
		return -EINVAL;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		rcu_read_unlock();
		put_disk(disk);
		/*
		 * If queue was bypassing, we should retry.  Do so after a
		 * short msleep().  It isn't strictly necessary but queue
		 * can be bypassing for some time and it's always nice to
		 * avoid busy looping.
		 */
		if (ret == -EBUSY) {
			msleep(10);
			ret = restart_syscall();
		}
		return ret;
	}

	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->v = v;
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(rcu)
{
	rcu_read_unlock();
	put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
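/*
 * Usage sketch: a policy's cftype write handler would pair the two helpers
 * above roughly like this (my_policy_set_limit() and the surrounding
 * handler are hypothetical; only blkg_conf_prep()/blkg_conf_finish() and
 * the ctx fields are defined here):
 *
 *	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 *	struct blkg_conf_ctx ctx;
 *	int ret;
 *
 *	ret = blkg_conf_prep(blkcg, buf, &ctx);
 *	if (ret)
 *		return ret;
 *	my_policy_set_limit(ctx.blkg, ctx.v);
 *	blkg_conf_finish(&ctx);
 *	return 0;
 */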

struct cftype blkio_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
	{ }	/* terminate */
};

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and is
 * responsible for shooting down all blkgs associated with @cgroup.
 * blkgs should be removed while holding both q and blkcg locks.  As
 * blkcg lock is nested inside q lock, this function performs reverse
 * double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkio_group, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkiocg_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
{
	static atomic64_t id_seq = ATOMIC64_INIT(0);
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	int ret;

	might_sleep();

	ret = blk_throtl_init(q);
	if (ret)
		return ret;

	mutex_lock(&all_q_mutex);
	INIT_LIST_HEAD(&q->all_q_node);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blkg_destroy_all(q, true);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkcg_bypass_start(void)
	__acquires(&all_q_mutex)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);

	list_for_each_entry(q, &all_q_list, all_q_node) {
		blk_queue_bypass_start(q);
		blkg_destroy_all(q, false);
	}
}

static void blkcg_bypass_end(void)
	__releases(&all_q_mutex)
{
	struct request_queue *q;

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_queue_bypass_end(q);

	mutex_unlock(&all_q_mutex);
}

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.subsys_id = blkio_subsys_id,
	.base_cftypes = blkio_files,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid]);
	blkio_policy[blkiop->plid] = blkiop;
	list_add_tail(&blkiop->list, &blkio_list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();

	if (blkiop->cftypes)
		WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
}
EXPORT_SYMBOL_GPL(blkio_policy_register);
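/*
 * Registration sketch: a policy typically fills in a blkio_policy_type and
 * registers it once at module init.  The names my_pol_init_group(),
 * MY_POLICY_PLID, struct my_policy_data and my_policy_cftypes below are
 * placeholders; only the fields actually used in this file are shown:
 *
 *	static struct blkio_policy_type my_policy = {
 *		.ops = {
 *			.blkio_init_group_fn	= my_pol_init_group,
 *		},
 *		.plid		= MY_POLICY_PLID,
 *		.pdata_size	= sizeof(struct my_policy_data),
 *		.cftypes	= my_policy_cftypes,
 *	};
 *
 *	blkio_policy_register(&my_policy);
 *
 * and pairs it with blkio_policy_unregister(&my_policy) on module exit.
 */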

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	if (blkiop->cftypes)
		cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
	blkio_policy[blkiop->plid] = NULL;
	list_del_init(&blkiop->list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);