/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* List of groups pending per cpu stats allocation */
static DEFINE_SPINLOCK(alloc_list_lock);
static LIST_HEAD(alloc_list);

static void blkio_stat_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);

/*
 * Worker for allocating per cpu stat for blk groups. This is scheduled on
 * the system_nrt_wq once there are some groups on the alloc_list waiting
 * for allocation.
 */
static void blkio_stat_alloc_fn(struct work_struct *work)
{
	static void *pcpu_stats[BLKIO_NR_POLICIES];
	struct delayed_work *dwork = to_delayed_work(work);
	struct blkio_group *blkg;
	int i;
	bool empty = false;

alloc_stats:
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		if (pcpu_stats[i] != NULL)
			continue;

		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);

		/* Allocation failed. Try again after some time. */
		if (pcpu_stats[i] == NULL) {
			queue_delayed_work(system_nrt_wq, dwork,
						msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&blkio_list_lock);
	spin_lock(&alloc_list_lock);

	/* the list may be empty if the cgroup was deleted or the queue exited */
	if (!list_empty(&alloc_list)) {
		blkg = list_first_entry(&alloc_list, struct blkio_group,
						alloc_node);
		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
			struct blkg_policy_data *pd = blkg->pd[i];

			if (blkio_policy[i] && pd && !pd->stats_cpu)
				swap(pd->stats_cpu, pcpu_stats[i]);
		}

		list_del_init(&blkg->alloc_node);
	}

	empty = list_empty(&alloc_list);

	spin_unlock(&alloc_list_lock);
	spin_unlock_irq(&blkio_list_lock);

	if (!empty)
		goto alloc_stats;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd = blkg->pd[i];

		if (!pd)
			continue;

		if (pol && pol->ops.blkio_exit_group_fn)
			pol->ops.blkio_exit_group_fn(blkg);

		free_percpu(pd->stats_cpu);
		kfree(pd);
	}

	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	INIT_LIST_HEAD(&blkg->alloc_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}

struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);

	spin_lock(&alloc_list_lock);
	list_add(&blkg->alloc_node, &alloc_list);
	/* Queue per cpu stat allocation from worker thread. */
	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
	spin_unlock(&alloc_list_lock);
out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
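
/*
 * Illustrative caller sketch (see blkg_conf_prep() below for the real
 * thing): lookup/create must be done under both the RCU read lock and
 * the queue lock, and the result may be an ERR_PTR() while the queue is
 * bypassing or dead:
 *
 *	rcu_read_lock();
 *	spin_lock_irq(q->queue_lock);
 *	blkg = blkg_lookup_create(blkcg, q, false);
 *	spin_unlock_irq(q->queue_lock);
 *	if (IS_ERR(blkg))
 *		... handle -EBUSY / -EINVAL / -ENOMEM ...
 *	rcu_read_unlock();
 */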

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something is wrong if we are trying to remove the same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	spin_lock(&alloc_list_lock);
	list_del_init(&blkg->alloc_node);
	spin_unlock(&alloc_list_lock);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down.  This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[plid]);
	blkg->pd[plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	WARN_ON_ONCE(!pd->stats_cpu);

	blkg->pd[plid] = pd;
	pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in an RCU manner.  But holding an RCU read lock
	 * does not mean that one can access all the fields of blkg and
	 * assume they are valid.  For example, don't try to follow
	 * throtl_data and request queue links.
	 *
	 * Having a reference to blkg under RCU only allows access to
	 * values local to groups, like group stats and group rate limits.
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;

	if (pd->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
	}
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkio_policy_type *pol;

		list_for_each_entry(pol, &blkio_list, list) {
			struct blkg_policy_data *pd = blkg->pd[pol->plid];
			struct blkio_group_stats *stats = &pd->stats;

			/* queued stats shouldn't be cleared */
			blkg_rwstat_reset(&stats->service_bytes);
			blkg_rwstat_reset(&stats->serviced);
			blkg_rwstat_reset(&stats->merged);
			blkg_rwstat_reset(&stats->service_time);
			blkg_rwstat_reset(&stats->wait_time);
			blkg_stat_reset(&stats->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
			blkg_stat_reset(&stats->unaccounted_time);
			blkg_stat_reset(&stats->avg_queue_size_sum);
			blkg_stat_reset(&stats->avg_queue_size_samples);
			blkg_stat_reset(&stats->dequeue);
			blkg_stat_reset(&stats->group_wait_time);
			blkg_stat_reset(&stats->idle_time);
			blkg_stat_reset(&stats->empty_time);
#endif
			blkio_reset_stats_cpu(blkg, pol->plid);

			if (pol->ops.blkio_reset_group_stats_fn)
				pol->ops.blkio_reset_group_stats_fn(blkg);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
		       u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int),
		       int pol, int data, bool show_total)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	u64 total = 0;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->pd[pol])
			total += prfill(sf, blkg->pd[pol], data);
	spin_unlock_irq(&blkcg->lock);

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
	};
	const char *dname = blkg_dev_name(pd->blkg);
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)rwstat->cnt[i]);

	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}

static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
			    int off)
{
	return __blkg_prfill_u64(sf, pd,
				 blkg_stat_read((void *)&pd->stats + off));
}

static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)&pd->stats + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/* print blkg_stat specified by BLKCG_STAT_PRIV() */
int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
		     struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), false);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_stat);
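
/*
 * Wiring sketch only: a policy would typically hook the helper above
 * into its cftype table, encoding the policy id and stat offset in
 * ->private.  BLKIO_POLICY_PROP and the blkio_group_stats layout are
 * assumed from blk-cgroup.h here:
 *
 *	{
 *		.name = "time",
 *		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
 *				offsetof(struct blkio_group_stats, time)),
 *		.read_seq_string = blkcg_print_stat,
 *	},
 */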

/* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
		       struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_rwstat);

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value.  This function returns with RCU read locked and must be paired
 * with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
		   struct blkg_conf_ctx *ctx)
	__acquires(rcu)
{
	struct gendisk *disk;
	struct blkio_group *blkg;
	unsigned int major, minor;
	unsigned long long v;
	int part, ret;

	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
		return -EINVAL;

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk || part)
		return -EINVAL;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		rcu_read_unlock();
		put_disk(disk);
		/*
		 * If queue was bypassing, we should retry.  Do so after a
		 * short msleep().  It isn't strictly necessary but queue
		 * can be bypassing for some time and it's always nice to
		 * avoid busy looping.
		 */
		if (ret == -EBUSY) {
			msleep(10);
			ret = restart_syscall();
		}
		return ret;
	}

	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->v = v;
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(rcu)
{
	rcu_read_unlock();
	put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
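
/*
 * Typical usage sketch (the helper name and the per-policy update in
 * the middle are hypothetical):
 *
 *	static int example_set_conf(struct blkio_cgroup *blkcg,
 *				    const char *buf)
 *	{
 *		struct blkg_conf_ctx ctx;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		(apply ctx.v to ctx.blkg's per-policy data here)
 *
 *		blkg_conf_finish(&ctx);
 *		return 0;
 *	}
 */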

struct cftype blkio_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
	{ }	/* terminate */
};

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and is responsible
 * for shooting down all blkgs associated with @cgroup.  blkgs should be
 * removed while holding both q and blkcg locks.  As blkcg lock is nested
 * inside q lock, this function performs reverse double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkio_group, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkiocg_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
{
	static atomic64_t id_seq = ATOMIC64_INIT(0);
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	int ret;

	might_sleep();

	ret = blk_throtl_init(q);
	if (ret)
		return ret;

	mutex_lock(&all_q_mutex);
	INIT_LIST_HEAD(&q->all_q_node);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blkg_destroy_all(q, true);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkcg_bypass_start(void)
	__acquires(&all_q_mutex)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);

	list_for_each_entry(q, &all_q_list, all_q_node) {
		blk_queue_bypass_start(q);
		blkg_destroy_all(q, false);
	}
}

static void blkcg_bypass_end(void)
	__releases(&all_q_mutex)
{
	struct request_queue *q;

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_queue_bypass_end(q);

	mutex_unlock(&all_q_mutex);
}

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.subsys_id = blkio_subsys_id,
	.base_cftypes = blkio_files,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid]);
	blkio_policy[blkiop->plid] = blkiop;
	list_add_tail(&blkiop->list, &blkio_list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();

	if (blkiop->cftypes)
		WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
}
EXPORT_SYMBOL_GPL(blkio_policy_register);
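
/*
 * Registration sketch (names are illustrative; the plid constant comes
 * from blk-cgroup.h and the callbacks are the policy's own):
 *
 *	static struct blkio_policy_type example_policy = {
 *		.ops = {
 *			.blkio_init_group_fn	= example_init_group,
 *			.blkio_exit_group_fn	= example_exit_group,
 *		},
 *		.plid		= BLKIO_POLICY_THROTL,
 *		.pdata_size	= sizeof(struct example_group_data),
 *		.cftypes	= example_cftypes,
 *	};
 *
 *	blkio_policy_register(&example_policy);
 */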

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	if (blkiop->cftypes)
		cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
	blkio_policy[blkiop->plid] = NULL;
	list_del_init(&blkiop->list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);