/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* List of groups pending per cpu stats allocation */
static DEFINE_SPINLOCK(alloc_list_lock);
static LIST_HEAD(alloc_list);

static void blkio_stat_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);

/*
 * Worker for allocating per cpu stat for blk groups. This is scheduled on
 * the system_nrt_wq once there are some groups on the alloc_list waiting
 * for allocation.
 */
static void blkio_stat_alloc_fn(struct work_struct *work)
{
	static void *pcpu_stats[BLKIO_NR_POLICIES];
	struct delayed_work *dwork = to_delayed_work(work);
	struct blkio_group *blkg;
	int i;
	bool empty = false;

alloc_stats:
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		if (pcpu_stats[i] != NULL)
			continue;

		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);

		/* Allocation failed. Try again after some time. */
		if (pcpu_stats[i] == NULL) {
			queue_delayed_work(system_nrt_wq, dwork,
						msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&blkio_list_lock);
	spin_lock(&alloc_list_lock);

	/* cgroup got deleted or queue exited. */
	if (!list_empty(&alloc_list)) {
		blkg = list_first_entry(&alloc_list, struct blkio_group,
						alloc_node);
		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
			struct blkg_policy_data *pd = blkg->pd[i];

			if (blkio_policy[i] && pd && !pd->stats_cpu)
				swap(pd->stats_cpu, pcpu_stats[i]);
		}

		list_del_init(&blkg->alloc_node);
	}

	empty = list_empty(&alloc_list);

	spin_unlock(&alloc_list_lock);
	spin_unlock_irq(&blkio_list_lock);

	if (!empty)
		goto alloc_stats;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd = blkg->pd[i];

		if (!pd)
			continue;

		if (pol && pol->ops.blkio_exit_group_fn)
			pol->ops.blkio_exit_group_fn(blkg);

		free_percpu(pd->stats_cpu);
		kfree(pd);
	}

	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	INIT_LIST_HEAD(&blkg->alloc_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}

struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);

	spin_lock(&alloc_list_lock);
	list_add(&blkg->alloc_node, &alloc_list);
	/* Queue per cpu stat allocation from worker thread. */
	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
	spin_unlock(&alloc_list_lock);
out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
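
/*
 * Illustrative sketch only (not part of the original file): the calling
 * pattern a policy is expected to follow around blkg_lookup_create().
 * Both the RCU read lock and the queue lock must be held, and the return
 * value must be checked with IS_ERR() because a bypassing or dead queue
 * yields an ERR_PTR.  example_get_blkg() and example_use_blkg() are
 * hypothetical names used only for illustration.
 */
#if 0
static void example_get_blkg(struct blkio_cgroup *blkcg,
			     struct request_queue *q)
{
	struct blkio_group *blkg;

	rcu_read_lock();
	spin_lock_irq(q->queue_lock);

	blkg = blkg_lookup_create(blkcg, q, false);
	if (!IS_ERR(blkg))
		example_use_blkg(blkg);		/* hypothetical consumer */

	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();
}
#endif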

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something is wrong if we are trying to remove the same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	spin_lock(&alloc_list_lock);
	list_del_init(&blkg->alloc_node);
	spin_unlock(&alloc_list_lock);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, the group can be destroyed.
	 */
	blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down.  This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[plid]);
	blkg->pd[plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	WARN_ON_ONCE(!pd->stats_cpu);

	blkg->pd[plid] = pd;
	pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in an RCU manner.  But having an RCU lock does
	 * not mean that one can access all the fields of blkg and assume
	 * they are valid.  For example, don't try to follow throtl_data
	 * and request queue links.
	 *
	 * Having a reference to blkg under RCU only allows access to
	 * values local to groups, like group stats and group rate limits.
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;

	if (pd->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
	}
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkio_policy_type *pol;

		list_for_each_entry(pol, &blkio_list, list) {
			blkio_reset_stats_cpu(blkg, pol->plid);

			if (pol->ops.blkio_reset_group_stats_fn)
				pol->ops.blkio_reset_group_stats_fn(blkg);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
		       u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int),
		       int pol, int data, bool show_total)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	u64 total = 0;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->pd[pol])
			total += prfill(sf, blkg->pd[pol], data);
	spin_unlock_irq(&blkcg->lock);

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
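
/*
 * Illustrative sketch only (not part of the original file): a minimal
 * prfill callback and the read_seq_string handler that would drive it
 * through blkcg_print_blkgs().  struct example_pdata, its "sectors"
 * field and the example_* names are hypothetical, and BLKIO_POLICY_THROTL
 * merely stands in for the policy's own id; a real policy prints fields
 * of its own pdata.
 */
#if 0
struct example_pdata {
	struct blkg_stat	sectors;
};

static u64 example_prfill_sectors(struct seq_file *sf,
				  struct blkg_policy_data *pd, int off)
{
	struct example_pdata *epd = (void *)pd->pdata;

	return __blkg_prfill_u64(sf, pd, blkg_stat_read(&epd->sectors));
}

static int example_print_sectors(struct cgroup *cgrp, struct cftype *cft,
				 struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, example_prfill_sectors,
			  BLKIO_POLICY_THROTL, 0, true);
	return 0;
}
#endif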

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
	};
	const char *dname = blkg_dev_name(pd->blkg);
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)rwstat->cnt[i]);

	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}

static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
			    int off)
{
	return __blkg_prfill_u64(sf, pd,
				 blkg_stat_read((void *)pd->pdata + off));
}

static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->pdata + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/* print blkg_stat specified by BLKCG_STAT_PRIV() */
int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
		     struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), false);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_stat);

/* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
		       struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_rwstat);
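
/*
 * Illustrative sketch only (not part of the original file): how a policy
 * might wire the generic printers above into its cftype table.  This
 * assumes BLKCG_STAT_PRIV() packs a policy id and a pdata offset the way
 * BLKCG_STAT_POL()/BLKCG_STAT_OFF() unpack them above; struct
 * example_stats and the "example.*" file names are hypothetical.
 */
#if 0
struct example_stats {
	struct blkg_stat	time;
	struct blkg_rwstat	serviced;
};

static struct cftype example_files[] = {
	{
		.name = "example.time",
		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
				offsetof(struct example_stats, time)),
		.read_seq_string = blkcg_print_stat,
	},
	{
		.name = "example.io_serviced",
		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
				offsetof(struct example_stats, serviced)),
		.read_seq_string = blkcg_print_rwstat,
	},
	{ }	/* terminate */
};
#endif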

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value.  This function returns with RCU read locked and must be paired
 * with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
		   struct blkg_conf_ctx *ctx)
	__acquires(rcu)
{
	struct gendisk *disk;
	struct blkio_group *blkg;
	unsigned int major, minor;
	unsigned long long v;
	int part, ret;

	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
		return -EINVAL;

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk || part)
		return -EINVAL;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		rcu_read_unlock();
		put_disk(disk);
		/*
		 * If queue was bypassing, we should retry.  Do so after a
		 * short msleep().  It isn't strictly necessary but queue
		 * can be bypassing for some time and it's always nice to
		 * avoid busy looping.
		 */
		if (ret == -EBUSY) {
			msleep(10);
			ret = restart_syscall();
		}
		return ret;
	}

	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->v = v;
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(rcu)
{
	rcu_read_unlock();
	put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
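
/*
 * Illustrative sketch only (not part of the original file): the
 * prep/finish pairing a configuration write handler is expected to
 * follow.  example_write_conf() and example_apply_limit() are
 * hypothetical; a real policy would store ctx.v in its per-blkg pdata
 * at this point.
 */
#if 0
static int example_write_conf(struct cgroup *cgrp, struct cftype *cft,
			      const char *buf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
	struct blkg_conf_ctx ctx;
	int ret;

	ret = blkg_conf_prep(blkcg, buf, &ctx);
	if (ret)
		return ret;

	/* ctx.blkg and ctx.v are valid here, with the RCU read lock held */
	example_apply_limit(ctx.blkg, ctx.v);

	blkg_conf_finish(&ctx);
	return 0;
}
#endif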

struct cftype blkio_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
	{ }	/* terminate */
};

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and is
 * responsible for shooting down all blkgs associated with @cgroup.
 * blkgs should be removed while holding both q and blkcg locks.  As
 * blkcg lock is nested inside q lock, this function performs reverse
 * double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkio_group, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkiocg_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
{
	static atomic64_t id_seq = ATOMIC64_INIT(0);
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	int ret;

	might_sleep();

	ret = blk_throtl_init(q);
	if (ret)
		return ret;

	mutex_lock(&all_q_mutex);
	INIT_LIST_HEAD(&q->all_q_node);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blkg_destroy_all(q, true);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkcg_bypass_start(void)
	__acquires(&all_q_mutex)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);

	list_for_each_entry(q, &all_q_list, all_q_node) {
		blk_queue_bypass_start(q);
		blkg_destroy_all(q, false);
	}
}

static void blkcg_bypass_end(void)
	__releases(&all_q_mutex)
{
	struct request_queue *q;

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_queue_bypass_end(q);

	mutex_unlock(&all_q_mutex);
}

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.subsys_id = blkio_subsys_id,
	.base_cftypes = blkio_files,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid]);
	blkio_policy[blkiop->plid] = blkiop;
	list_add_tail(&blkiop->list, &blkio_list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();

	if (blkiop->cftypes)
		WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	if (blkiop->cftypes)
		cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
	blkio_policy[blkiop->plid] = NULL;
	list_del_init(&blkiop->list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);
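
/*
 * Illustrative sketch only (not part of the original file): the shape of
 * a policy registration.  blk-throttle and CFQ provide the real instances
 * of struct blkio_policy_type; every example_* name below is hypothetical,
 * and BLKIO_POLICY_THROTL merely stands in for the policy's own id.
 */
#if 0
struct example_pd {
	struct blkg_stat	time;
};

static void example_init_group(struct blkio_group *blkg)
{
	/* the per-blkg pdata is zero-initialized; set policy defaults here */
}

static struct blkio_policy_type blkio_policy_example = {
	.ops = {
		.blkio_init_group_fn	= example_init_group,
	},
	.plid		= BLKIO_POLICY_THROTL,
	.pdata_size	= sizeof(struct example_pd),
};

static int __init example_init(void)
{
	blkio_policy_register(&blkio_policy_example);
	return 0;
}
module_init(example_init);

static void __exit example_exit(void)
{
	blkio_policy_unregister(&blkio_policy_example);
}
module_exit(example_exit);
#endif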