/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

struct blkio_cgroup blkio_root_cgroup = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);

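/*
 * Not part of the original file: a minimal usage sketch.  Callers that need
 * the blkcg owning a bio are expected to resolve it under RCU and then look
 * up the per-queue group with blkg_lookup().  The "bio" and "q" variables
 * below are assumed to be provided by the caller.
 *
 *	rcu_read_lock();
 *	blkcg = bio_blkio_cgroup(bio);
 *	blkg = blkg_lookup(blkcg, q);
 *	if (blkg)
 *		... use only group-local data (stats, limits) ...
 *	rcu_read_unlock();
 */
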
/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd = blkg->pd[i];

		if (!pd)
			continue;

		if (pol && pol->ops.blkio_exit_group_fn)
			pol->ops.blkio_exit_group_fn(blkg);

		kfree(pd);
	}

	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}

struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);
out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down.  This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[plid]);
	blkg->pd[plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	blkg->pd[plid] = pd;
	pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access to only
	 * values local to groups like group stats and group rate limits.
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkio_policy_type *pol;

		list_for_each_entry(pol, &blkio_list, list)
			if (pol->ops.blkio_reset_group_stats_fn)
				pol->ops.blkio_reset_group_stats_fn(blkg);
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
		       u64 (*prfill)(struct seq_file *, void *, int),
		       int pol, int data, bool show_total)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	u64 total = 0;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->pd[pol])
			total += prfill(sf, blkg->pd[pol]->pdata, data);
	spin_unlock_irq(&blkcg->lock);

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pdata.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, void *pdata, u64 v)
{
	const char *dname = blkg_dev_name(pdata_to_blkg(pdata));

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pdata.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
	};
	const char *dname = blkg_dev_name(pdata_to_blkg(pdata));
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)rwstat->cnt[i]);

	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}

static u64 blkg_prfill_stat(struct seq_file *sf, void *pdata, int off)
{
	return __blkg_prfill_u64(sf, pdata, blkg_stat_read(pdata + off));
}

static u64 blkg_prfill_rwstat(struct seq_file *sf, void *pdata, int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read(pdata + off);

	return __blkg_prfill_rwstat(sf, pdata, &rwstat);
}

/* print blkg_stat specified by BLKCG_STAT_PRIV() */
int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
		     struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), false);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_stat);

/* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
		       struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_rwstat);
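
/*
 * Not part of the original file: an illustrative sketch of how a policy
 * would typically wire the two printers above into its cftype table, by
 * packing its policy id and the offset of a blkg_stat/blkg_rwstat member
 * into cftype->private with BLKCG_STAT_PRIV().  The "foo" names and the
 * exact BLKCG_STAT_PRIV() argument order are assumptions here; see
 * blk-cgroup.h for the real macro.
 *
 *	{
 *		.name = "foo.time",
 *		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
 *				offsetof(struct foo_group, stats.time)),
 *		.read_seq_string = blkcg_print_stat,
 *	},
 */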

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value.  This function returns with RCU read locked and must be paired
 * with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
		   struct blkg_conf_ctx *ctx)
	__acquires(rcu)
{
	struct gendisk *disk;
	struct blkio_group *blkg;
	unsigned int major, minor;
	unsigned long long v;
	int part, ret;

	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
		return -EINVAL;

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk || part)
		return -EINVAL;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		rcu_read_unlock();
		put_disk(disk);
		/*
		 * If queue was bypassing, we should retry.  Do so after a
		 * short msleep().  It isn't strictly necessary but queue
		 * can be bypassing for some time and it's always nice to
		 * avoid busy looping.
		 */
		if (ret == -EBUSY) {
			msleep(10);
			ret = restart_syscall();
		}
		return ret;
	}

	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->v = v;
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(rcu)
{
	rcu_read_unlock();
	put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
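
/*
 * Not part of the original file: a minimal sketch of the intended pairing.
 * A per-device config write handler calls blkg_conf_prep(), applies ctx.v
 * to ctx.blkg, and always ends with blkg_conf_finish().  All "foo" names
 * are hypothetical.
 *
 *	static int foo_set_conf(struct cgroup *cgrp, struct cftype *cft,
 *				const char *buf)
 *	{
 *		struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 *		struct blkg_conf_ctx ctx;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		... update ctx.blkg's per-policy data with ctx.v ...
 *
 *		blkg_conf_finish(&ctx);
 *		return 0;
 *	}
 */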

struct cftype blkio_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
	{ }	/* terminate */
};

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and is responsible
 * for shooting down all blkgs associated with @cgroup.  blkgs should be
 * removed while holding both q and blkcg locks.  As blkcg lock is nested
 * inside q lock, this function performs reverse double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkio_group, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkiocg_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
{
	static atomic64_t id_seq = ATOMIC64_INIT(0);
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	int ret;

	might_sleep();

	ret = blk_throtl_init(q);
	if (ret)
		return ret;

	mutex_lock(&all_q_mutex);
	INIT_LIST_HEAD(&q->all_q_node);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blkg_destroy_all(q, true);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkcg_bypass_start(void)
	__acquires(&all_q_mutex)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);

	list_for_each_entry(q, &all_q_list, all_q_node) {
		blk_queue_bypass_start(q);
		blkg_destroy_all(q, false);
	}
}

static void blkcg_bypass_end(void)
	__releases(&all_q_mutex)
{
	struct request_queue *q;

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_queue_bypass_end(q);

	mutex_unlock(&all_q_mutex);
}

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.subsys_id = blkio_subsys_id,
	.base_cftypes = blkio_files,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid]);
	blkio_policy[blkiop->plid] = blkiop;
	list_add_tail(&blkiop->list, &blkio_list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();

	if (blkiop->cftypes)
		WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
}
EXPORT_SYMBOL_GPL(blkio_policy_register);
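
/*
 * Not part of the original file: an illustrative sketch of what a policy
 * registration looks like from the policy's side, typically in module init.
 * Every "foo" identifier and the choice of BLKIO_POLICY_PROP are assumptions
 * for illustration only; the field names match the usage in this file.
 *
 *	static struct blkio_policy_type blkio_policy_foo = {
 *		.ops = {
 *			.blkio_init_group_fn		= foo_init_blkio_group,
 *			.blkio_exit_group_fn		= foo_exit_blkio_group,
 *			.blkio_reset_group_stats_fn	= foo_reset_group_stats,
 *		},
 *		.plid		= BLKIO_POLICY_PROP,
 *		.pdata_size	= sizeof(struct foo_group),
 *		.cftypes	= foo_blkcg_files,
 *	};
 *
 *	blkio_policy_register(&blkio_policy_foo);
 */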

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	if (blkiop->cftypes)
		cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
	blkio_policy[blkiop->plid] = NULL;
	list_del_init(&blkiop->list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);