diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 90c33ebdd6bc8a66f02a10c18026052d0ad3aff4..8f64b459fbd42286b515422c1bade921821cbf00 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -534,6 +534,100 @@ static inline struct cgroup* task_cgroup(struct task_struct *task, return task_subsys_state(task, subsys_id)->cgroup; } +/** + * cgroup_for_each_child - iterate through children of a cgroup + * @pos: the cgroup * to use as the loop cursor + * @cgroup: cgroup whose children to walk + * + * Walk @cgroup's children. Must be called under rcu_read_lock(). A child + * cgroup which hasn't finished ->post_create() or already has finished + * ->pre_destroy() may show up during traversal and it's each subsystem's + * responsibility to verify that each @pos is alive. + * + * If a subsystem synchronizes against the parent in its ->post_create() + * and before starting iterating, a cgroup which finished ->post_create() + * is guaranteed to be visible in the future iterations. + */ +#define cgroup_for_each_child(pos, cgroup) \ + list_for_each_entry_rcu(pos, &(cgroup)->children, sibling) + +struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, + struct cgroup *cgroup); + +/** + * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants + * @pos: the cgroup * to use as the loop cursor + * @cgroup: cgroup whose descendants to walk + * + * Walk @cgroup's descendants. Must be called under rcu_read_lock(). A + * descendant cgroup which hasn't finished ->post_create() or already has + * finished ->pre_destroy() may show up during traversal and it's each + * subsystem's responsibility to verify that each @pos is alive. + * + * If a subsystem synchronizes against the parent in its ->post_create() + * and before starting iterating, and synchronizes against @pos on each + * iteration, any descendant cgroup which finished ->post_create() is + * guaranteed to be visible in the future iterations. + * + * In other words, the following guarantees that a descendant can't escape + * state updates of its ancestors. + * + * my_post_create(@cgrp) + * { + * Lock @cgrp->parent and @cgrp; + * Inherit state from @cgrp->parent; + * Unlock both. + * } + * + * my_update_state(@cgrp) + * { + * Lock @cgrp; + * Update @cgrp's state; + * Unlock @cgrp; + * + * cgroup_for_each_descendant_pre(@pos, @cgrp) { + * Lock @pos; + * Verify @pos is alive and inherit state from @pos->parent; + * Unlock @pos; + * } + * } + * + * As long as the inheriting step, including checking the parent state, is + * enclosed inside @pos locking, double-locking the parent isn't necessary + * while inheriting. The state update to the parent is guaranteed to be + * visible by walking order and, as long as inheriting operations to the + * same @pos are atomic to each other, multiple updates racing each other + * still result in the correct state. It's guaranateed that at least one + * inheritance happens for any cgroup after the latest update to its + * parent. + * + * If checking parent's state requires locking the parent, each inheriting + * iteration should lock and unlock both @pos->parent and @pos. + * + * Alternatively, a subsystem may choose to use a single global lock to + * synchronize ->post_create() and ->pre_destroy() against tree-walking + * operations. + */ +#define cgroup_for_each_descendant_pre(pos, cgroup) \ + for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos); \ + pos = cgroup_next_descendant_pre((pos), (cgroup))) + +struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, + struct cgroup *cgroup); + +/** + * cgroup_for_each_descendant_post - post-order walk of a cgroup's descendants + * @pos: the cgroup * to use as the loop cursor + * @cgroup: cgroup whose descendants to walk + * + * Similar to cgroup_for_each_descendant_pre() but performs post-order + * traversal instead. Note that the walk visibility guarantee described in + * pre-order walk doesn't apply the same to post-order walks. + */ +#define cgroup_for_each_descendant_post(pos, cgroup) \ + for (pos = cgroup_next_descendant_post(NULL, (cgroup)); (pos); \ + pos = cgroup_next_descendant_post((pos), (cgroup))) + /* A cgroup_iter should be treated as an opaque object */ struct cgroup_iter { struct list_head *cg_link; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2ed5968a04e774f230d26f5a5d8e3750f3001ab6..0f8fa6aa371b6eb5db6e225c68a62301f54717a0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2984,6 +2984,92 @@ static void cgroup_enable_task_cg_lists(void) write_unlock(&css_set_lock); } +/** + * cgroup_next_descendant_pre - find the next descendant for pre-order walk + * @pos: the current position (%NULL to initiate traversal) + * @cgroup: cgroup whose descendants to walk + * + * To be used by cgroup_for_each_descendant_pre(). Find the next + * descendant to visit for pre-order traversal of @cgroup's descendants. + */ +struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, + struct cgroup *cgroup) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* if first iteration, pretend we just visited @cgroup */ + if (!pos) { + if (list_empty(&cgroup->children)) + return NULL; + pos = cgroup; + } + + /* visit the first child if exists */ + next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); + if (next) + return next; + + /* no child, visit my or the closest ancestor's next sibling */ + do { + next = list_entry_rcu(pos->sibling.next, struct cgroup, + sibling); + if (&next->sibling != &pos->parent->children) + return next; + + pos = pos->parent; + } while (pos != cgroup); + + return NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); + +static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) +{ + struct cgroup *last; + + do { + last = pos; + pos = list_first_or_null_rcu(&pos->children, struct cgroup, + sibling); + } while (pos); + + return last; +} + +/** + * cgroup_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @cgroup: cgroup whose descendants to walk + * + * To be used by cgroup_for_each_descendant_post(). Find the next + * descendant to visit for post-order traversal of @cgroup's descendants. + */ +struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, + struct cgroup *cgroup) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* if first iteration, visit the leftmost descendant */ + if (!pos) { + next = cgroup_leftmost_descendant(cgroup); + return next != cgroup ? next : NULL; + } + + /* if there's an unvisited sibling, visit its leftmost descendant */ + next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + if (&next->sibling != &pos->parent->children) + return cgroup_leftmost_descendant(next); + + /* no sibling left, visit parent */ + next = pos->parent; + return next != cgroup ? next : NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); + void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) {