/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_MUTEX(blkcg_pol_mutex);
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

struct blkio_cgroup blkio_root_cgroup = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKCG_MAX_POLS];

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd = blkg->pd[i];

		if (!pd)
			continue;

		if (pol && pol->ops.blkio_exit_group_fn)
			pol->ops.blkio_exit_group_fn(blkg);

		kfree(pd);
	}

	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}

struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);
out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
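
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * callers of blkg_lookup_create() must hold both the RCU read lock and
 * the queue lock, as the asserts above require; blkg_conf_prep() below
 * follows the same pattern.
 *
 *	rcu_read_lock();
 *	spin_lock_irq(q->queue_lock);
 *	blkg = blkg_lookup_create(blkcg, q, false);
 *	spin_unlock_irq(q->queue_lock);
 *	if (!IS_ERR(blkg))
 *		; // use blkg while still under rcu_read_lock()
 *	rcu_read_unlock();
 */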

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something is wrong if we are trying to remove the same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down.  This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q,
			 const struct blkio_policy_type *pol)
{
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[pol->plid]);
	blkg->pd[pol->plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	blkg->pd[pol->plid] = pd;
	pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in an RCU manner.  But having an RCU lock does
	 * not mean that one can access all the fields of blkg and assume
	 * they are valid.  For example, don't try to follow throtl_data
	 * and request queue links.
	 *
	 * Having a reference to blkg under RCU only allows access to
	 * values local to groups, like group stats and group rate limits.
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct blkio_group *blkg;
	struct hlist_node *n;
	int i;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkio_policy_type *pol = blkio_policy[i];

			if (pol && pol->ops.blkio_reset_group_stats_fn)
				pol->ops.blkio_reset_group_stats_fn(blkg);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: whether to print out the sum of the prfill return values
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with a "Total" label at the end.
 *
 * This is to be used to construct print functions for the
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
		       u64 (*prfill)(struct seq_file *, void *, int),
		       const struct blkio_policy_type *pol, int data,
		       bool show_total)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	u64 total = 0;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->pd[pol->plid])
			total += prfill(sf, blkg->pd[pol->plid]->pdata, data);
	spin_unlock_irq(&blkcg->lock);

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
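
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a policy would typically wire blkcg_print_blkgs() together with one of
 * the prfill helpers defined below from its cftype->read_seq_string
 * callback.  The names foo_policy and foo_stat_off are hypothetical.
 *
 *	static int blkcg_print_foo_stat(struct cgroup *cgrp,
 *					struct cftype *cft,
 *					struct seq_file *sf)
 *	{
 *		struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 *
 *		blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &foo_policy,
 *				  foo_stat_off, true);
 *		return 0;
 *	}
 */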

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pdata.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, void *pdata, u64 v)
{
	const char *dname = blkg_dev_name(pdata_to_blkg(pdata));

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pdata.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
	};
	const char *dname = blkg_dev_name(pdata_to_blkg(pdata));
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)rwstat->cnt[i]);

	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}

/**
 * blkg_prfill_stat - prfill callback for blkg_stat
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @off: offset to the blkg_stat in @pdata
 *
 * prfill callback for printing a blkg_stat.
 */
u64 blkg_prfill_stat(struct seq_file *sf, void *pdata, int off)
{
	return __blkg_prfill_u64(sf, pdata, blkg_stat_read(pdata + off));
}
EXPORT_SYMBOL_GPL(blkg_prfill_stat);

/**
 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @off: offset to the blkg_rwstat in @pdata
 *
 * prfill callback for printing a blkg_rwstat.
 */
u64 blkg_prfill_rwstat(struct seq_file *sf, void *pdata, int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read(pdata + off);

	return __blkg_prfill_rwstat(sf, pdata, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value.  This function returns with RCU read locked and must be paired
 * with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
		   struct blkg_conf_ctx *ctx)
	__acquires(rcu)
{
	struct gendisk *disk;
	struct blkio_group *blkg;
	unsigned int major, minor;
	unsigned long long v;
	int part, ret;

	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
		return -EINVAL;

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk || part)
		return -EINVAL;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		rcu_read_unlock();
		put_disk(disk);
		/*
		 * If queue was bypassing, we should retry.  Do so after a
		 * short msleep().  It isn't strictly necessary but queue
		 * can be bypassing for some time and it's always nice to
		 * avoid busy looping.
		 */
		if (ret == -EBUSY) {
			msleep(10);
			ret = restart_syscall();
		}
		return ret;
	}

	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->v = v;
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(rcu)
{
	rcu_read_unlock();
	put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
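
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a per-device configuration write handler would typically bracket the
 * update with blkg_conf_prep()/blkg_conf_finish().  The names foo_pd()
 * and ->limit below are hypothetical.
 *
 *	static int blkcg_set_foo_limit(struct cgroup *cgrp, struct cftype *cft,
 *				       const char *buf)
 *	{
 *		struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 *		struct blkg_conf_ctx ctx;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		// apply the parsed value to the policy's per-blkg data
 *		foo_pd(ctx.blkg)->limit = ctx.v;
 *		blkg_conf_finish(&ctx);
 *		return 0;
 *	}
 */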

struct cftype blkio_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
	{ }	/* terminate */
};

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and is responsible
 * for shooting down all blkgs associated with @cgroup.  blkgs should be
 * removed while holding both q and blkcg locks.  As blkcg lock is nested
 * inside q lock, this function performs reverse double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkio_group, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkiocg_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
{
	static atomic64_t id_seq = ATOMIC64_INIT(0);
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	int ret;

	might_sleep();

	ret = blk_throtl_init(q);
	if (ret)
		return ret;

	mutex_lock(&all_q_mutex);
	INIT_LIST_HEAD(&q->all_q_node);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blkg_destroy_all(q, true);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkcg_bypass_start(void)
	__acquires(&all_q_mutex)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);

	list_for_each_entry(q, &all_q_list, all_q_node) {
		blk_queue_bypass_start(q);
		blkg_destroy_all(q, false);
	}
}

static void blkcg_bypass_end(void)
	__releases(&all_q_mutex)
{
	struct request_queue *q;

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_queue_bypass_end(q);

	mutex_unlock(&all_q_mutex);
}

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.subsys_id = blkio_subsys_id,
	.base_cftypes = blkio_files,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

/**
 * blkio_policy_register - register a blkcg policy
 * @blkiop: blkcg policy to register
 *
 * Register @blkiop with blkcg core.  Might sleep and @blkiop may be
 * modified on successful registration.  Returns 0 on success and -errno on
 * failure.
 */
int blkio_policy_register(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;
	int i, ret;

	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkio_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS)
		goto out_unlock;

	/* register and update blkgs */
	blkiop->plid = i;
	blkio_policy[i] = blkiop;

	blkcg_bypass_start();
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop);
	blkcg_bypass_end();

	/* everything is in place, add intf files for the new policy */
	if (blkiop->cftypes)
		WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
	ret = 0;
out_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(blkio_policy_register);
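
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a policy module would typically describe itself with a blkio_policy_type
 * and register it from its init path.  All foo_* names below are
 * hypothetical.
 *
 *	static struct blkio_policy_type blkio_policy_foo = {
 *		.ops = {
 *			.blkio_init_group_fn = foo_init_blkio_group,
 *			.blkio_exit_group_fn = foo_exit_blkio_group,
 *			.blkio_reset_group_stats_fn = foo_reset_group_stats,
 *		},
 *		.pdata_size = sizeof(struct foo_group),
 *		.cftypes = foo_blkcg_files,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return blkio_policy_register(&blkio_policy_foo);
 *	}
 */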

/**
 * blkio_policy_unregister - unregister a blkcg policy
 * @blkiop: blkcg policy to unregister
 *
 * Undo blkio_policy_register(@blkiop).  Might sleep.
 */
void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	mutex_lock(&blkcg_pol_mutex);

	if (WARN_ON(blkio_policy[blkiop->plid] != blkiop))
		goto out_unlock;

	/* kill the intf files first */
	if (blkiop->cftypes)
		cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);

	/* unregister and update blkgs */
	blkio_policy[blkiop->plid] = NULL;

	blkcg_bypass_start();
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop);
	blkcg_bypass_end();
out_unlock:
	mutex_unlock(&blkcg_pol_mutex);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);