Commit f075e0f6 authored by Linus Torvalds

Merge branch 'for-3.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:
 "The bulk of changes are cleanups and preparations for the upcoming
  kernfs conversion.

   - The cgroup_event mechanism, which is and will be used only by memcg,
     is moved to memcg.

   - pidlist handling is updated so that it can be served by seq_file.

     Also, the list is not sorted if sane_behavior is in effect.  The
     cgroup documentation explicitly states that the file is not sorted,
     but it has been for quite some time.

   - All cgroup file handling now happens on top of seq_file.  This is
     to prepare for kernfs conversion.  In addition, all operations are
     restructured so that they map 1-1 to kernfs operations.

   - Other cleanups and low-pri fixes"

* 'for-3.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (40 commits)
  cgroup: trivial style updates
  cgroup: remove stray references to css_id
  doc: cgroups: Fix typo in doc/cgroups
  cgroup: fix fail path in cgroup_load_subsys()
  cgroup: fix missing unlock on error in cgroup_load_subsys()
  cgroup: remove for_each_root_subsys()
  cgroup: implement for_each_css()
  cgroup: factor out cgroup_subsys_state creation into create_css()
  cgroup: combine css handling loops in cgroup_create()
  cgroup: reorder operations in cgroup_create()
  cgroup: make for_each_subsys() useable under cgroup_root_mutex
  cgroup: css iterations and css_from_dir() are safe under cgroup_mutex
  cgroup: unify pidlist and other file handling
  cgroup: replace cftype->read_seq_string() with cftype->seq_show()
  cgroup: attach cgroup_open_file to all cgroup files
  cgroup: generalize cgroup_pidlist_open_file
  cgroup: unify read path so that seq_file is always used
  cgroup: unify cgroup_write_X64() and cgroup_write_string()
  cgroup: remove cftype->read(), ->read_map() and ->write()
  hugetlb_cgroup: convert away from cftype->read()
  ...
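The conversion pattern repeated throughout the diff below is worth spelling out once: the old cftype callbacks ->read(), ->read_map() and ->read_seq_string() passed the cgroup_subsys_state and cftype to each handler explicitly, while the new ->seq_show() receives only the seq_file and recovers those pointers through the seq_css() and seq_cft() helpers added in include/linux/cgroup.h. A minimal before/after sketch (illustrative only, not buildable on its own; example_read_counter() and the file name are hypothetical):

/* Old style: cgroup core passed css and cft to the handler. */
static int example_show_old(struct cgroup_subsys_state *css,
			    struct cftype *cft, struct seq_file *sf)
{
	seq_printf(sf, "%llu\n", example_read_counter(css, cft->private));
	return 0;
}

/* New style: only the seq_file is passed; css and cft are recovered
 * via seq_css()/seq_cft(), which follow seq_file->private to the new
 * cgroup_open_file. */
static int example_seq_show(struct seq_file *sf, void *v)
{
	seq_printf(sf, "%llu\n",
		   example_read_counter(seq_css(sf), seq_cft(sf)->private));
	return 0;
}

static struct cftype example_files[] = {
	{
		.name = "example.stat",
		.seq_show = example_seq_show,	/* was .read_seq_string */
	},
	{ }	/* terminate */
};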
......@@ -24,7 +24,6 @@ CONTENTS:
2.1 Basic Usage
2.2 Attaching processes
2.3 Mounting hierarchies by name
2.4 Notification API
3. Kernel API
3.1 Overview
3.2 Synchronization
......@@ -472,25 +471,6 @@ you give a subsystem a name.
The name of the subsystem appears as part of the hierarchy description
in /proc/mounts and /proc/<pid>/cgroups.
2.4 Notification API
--------------------
There is mechanism which allows to get notifications about changing
status of a cgroup.
To register a new notification handler you need to:
- create a file descriptor for event notification using eventfd(2);
- open a control file to be monitored (e.g. memory.usage_in_bytes);
- write "<event_fd> <control_fd> <args>" to cgroup.event_control.
Interpretation of args is defined by control file implementation;
eventfd will be woken up by control file implementation or when the
cgroup is removed.
To unregister a notification handler just close eventfd.
NOTE: Support of notifications should be implemented for the control
file. See documentation for the subsystem.
3. Kernel API
=============
......
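The Notification API section removed above does not disappear; it survives as a memcg-only interface via the cgroup.event_control file added to mem_cgroup_files later in this diff. A minimal userspace sketch of the registration protocol it describes, assuming a v1 memory cgroup mounted at /sys/fs/cgroup/memory/mygroup and the memory.pressure_level control file with a "low" threshold (the path and the argument are example values, not something this commit defines):

/* Register an eventfd for memcg pressure notifications and wait once. */
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);		/* step 1: eventfd(2) */
	int cfd = open("/sys/fs/cgroup/memory/mygroup/memory.pressure_level",
		       O_RDONLY);		/* step 2: control file */
	int ecfd = open("/sys/fs/cgroup/memory/mygroup/cgroup.event_control",
			O_WRONLY);
	char buf[64];
	uint64_t cnt;

	if (efd < 0 || cfd < 0 || ecfd < 0)
		return 1;

	/* step 3: write "<event_fd> <control_fd> <args>" */
	snprintf(buf, sizeof(buf), "%d %d low", efd, cfd);
	if (write(ecfd, buf, strlen(buf)) < 0)
		return 1;

	if (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))	/* blocks */
		printf("got %llu notification(s)\n",
		       (unsigned long long)cnt);
	return 0;
}

Closing the eventfd unregisters the handler, exactly as the removed text states.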
......@@ -577,7 +577,7 @@ Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
per-node page counts including "hierarchical_<counter>" which sums up all
hierarchical children's values in addition to the memcg's own value.
The ouput format of memory.numa_stat is:
The output format of memory.numa_stat is:
total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
......@@ -670,7 +670,7 @@ page tables.
8.1 Interface
This feature is disabled by default. It can be enabledi (and disabled again) by
This feature is disabled by default. It can be enabled (and disabled again) by
writing to memory.move_charge_at_immigrate of the destination cgroup.
If you want to enable it:
......
......@@ -97,8 +97,8 @@ to work with it.
(struct res_counter *rc, struct res_counter *top,
unsinged long val)
Almost same as res_cunter_uncharge() but propagation of uncharge
stops when rc == top. This is useful when kill a res_coutner in
Almost same as res_counter_uncharge() but propagation of uncharge
stops when rc == top. This is useful when kill a res_counter in
child cgroup.
2.1 Other accounting routines
......
......@@ -1303,13 +1303,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
cft->private, true);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
&blkcg_policy_throtl, seq_cft(sf)->private, true);
return 0;
}
......@@ -1335,19 +1332,17 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
return __blkg_prfill_u64(sf, pd, v);
}
static int tg_print_conf_u64(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int tg_print_conf_u64(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64,
&blkcg_policy_throtl, cft->private, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
&blkcg_policy_throtl, seq_cft(sf)->private, false);
return 0;
}
static int tg_print_conf_uint(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int tg_print_conf_uint(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint,
&blkcg_policy_throtl, cft->private, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
&blkcg_policy_throtl, seq_cft(sf)->private, false);
return 0;
}
......@@ -1428,40 +1423,40 @@ static struct cftype throtl_files[] = {
{
.name = "throttle.read_bps_device",
.private = offsetof(struct throtl_grp, bps[READ]),
.read_seq_string = tg_print_conf_u64,
.seq_show = tg_print_conf_u64,
.write_string = tg_set_conf_u64,
.max_write_len = 256,
},
{
.name = "throttle.write_bps_device",
.private = offsetof(struct throtl_grp, bps[WRITE]),
.read_seq_string = tg_print_conf_u64,
.seq_show = tg_print_conf_u64,
.write_string = tg_set_conf_u64,
.max_write_len = 256,
},
{
.name = "throttle.read_iops_device",
.private = offsetof(struct throtl_grp, iops[READ]),
.read_seq_string = tg_print_conf_uint,
.seq_show = tg_print_conf_uint,
.write_string = tg_set_conf_uint,
.max_write_len = 256,
},
{
.name = "throttle.write_iops_device",
.private = offsetof(struct throtl_grp, iops[WRITE]),
.read_seq_string = tg_print_conf_uint,
.seq_show = tg_print_conf_uint,
.write_string = tg_set_conf_uint,
.max_write_len = 256,
},
{
.name = "throttle.io_service_bytes",
.private = offsetof(struct tg_stats_cpu, service_bytes),
.read_seq_string = tg_print_cpu_rwstat,
.seq_show = tg_print_cpu_rwstat,
},
{
.name = "throttle.io_serviced",
.private = offsetof(struct tg_stats_cpu, serviced),
.read_seq_string = tg_print_cpu_rwstat,
.seq_show = tg_print_cpu_rwstat,
},
{ } /* terminate */
};
......
......@@ -1632,11 +1632,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf,
return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
}
static int cfqg_print_weight_device(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int cfqg_print_weight_device(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device,
&blkcg_policy_cfq, 0, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
cfqg_prfill_weight_device, &blkcg_policy_cfq,
0, false);
return 0;
}
......@@ -1650,26 +1650,23 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
}
static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css,
struct cftype *cft,
struct seq_file *sf)
static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device,
&blkcg_policy_cfq, 0, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
0, false);
return 0;
}
static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft,
struct seq_file *sf)
static int cfq_print_weight(struct seq_file *sf, void *v)
{
seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight);
seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
return 0;
}
static int cfq_print_leaf_weight(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
{
seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight);
seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
return 0;
}
......@@ -1762,23 +1759,17 @@ static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
return __cfq_set_weight(css, cft, val, true);
}
static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft,
struct seq_file *sf)
static int cfqg_print_stat(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
cft->private, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
&blkcg_policy_cfq, seq_cft(sf)->private, false);
return 0;
}
static int cfqg_print_rwstat(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int cfqg_print_rwstat(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
cft->private, true);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
&blkcg_policy_cfq, seq_cft(sf)->private, true);
return 0;
}
......@@ -1798,23 +1789,19 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
return __blkg_prfill_rwstat(sf, pd, &sum);
}
static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive,
&blkcg_policy_cfq, cft->private, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
seq_cft(sf)->private, false);
return 0;
}
static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive,
&blkcg_policy_cfq, cft->private, true);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
seq_cft(sf)->private, true);
return 0;
}
......@@ -1835,13 +1822,11 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
}
/* print avg_queue_size */
static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
&blkcg_policy_cfq, 0, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
0, false);
return 0;
}
#endif /* CONFIG_DEBUG_BLK_CGROUP */
......@@ -1851,14 +1836,14 @@ static struct cftype cfq_blkcg_files[] = {
{
.name = "weight_device",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cfqg_print_leaf_weight_device,
.seq_show = cfqg_print_leaf_weight_device,
.write_string = cfqg_set_leaf_weight_device,
.max_write_len = 256,
},
{
.name = "weight",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cfq_print_leaf_weight,
.seq_show = cfq_print_leaf_weight,
.write_u64 = cfq_set_leaf_weight,
},
......@@ -1866,26 +1851,26 @@ static struct cftype cfq_blkcg_files[] = {
{
.name = "weight_device",
.flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = cfqg_print_weight_device,
.seq_show = cfqg_print_weight_device,
.write_string = cfqg_set_weight_device,
.max_write_len = 256,
},
{
.name = "weight",
.flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = cfq_print_weight,
.seq_show = cfq_print_weight,
.write_u64 = cfq_set_weight,
},
{
.name = "leaf_weight_device",
.read_seq_string = cfqg_print_leaf_weight_device,
.seq_show = cfqg_print_leaf_weight_device,
.write_string = cfqg_set_leaf_weight_device,
.max_write_len = 256,
},
{
.name = "leaf_weight",
.read_seq_string = cfq_print_leaf_weight,
.seq_show = cfq_print_leaf_weight,
.write_u64 = cfq_set_leaf_weight,
},
......@@ -1893,114 +1878,114 @@ static struct cftype cfq_blkcg_files[] = {
{
.name = "time",
.private = offsetof(struct cfq_group, stats.time),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
{
.name = "sectors",
.private = offsetof(struct cfq_group, stats.sectors),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
{
.name = "io_service_bytes",
.private = offsetof(struct cfq_group, stats.service_bytes),
.read_seq_string = cfqg_print_rwstat,
.seq_show = cfqg_print_rwstat,
},
{
.name = "io_serviced",
.private = offsetof(struct cfq_group, stats.serviced),
.read_seq_string = cfqg_print_rwstat,
.seq_show = cfqg_print_rwstat,
},
{
.name = "io_service_time",
.private = offsetof(struct cfq_group, stats.service_time),
.read_seq_string = cfqg_print_rwstat,
.seq_show = cfqg_print_rwstat,
},
{
.name = "io_wait_time",
.private = offsetof(struct cfq_group, stats.wait_time),
.read_seq_string = cfqg_print_rwstat,
.seq_show = cfqg_print_rwstat,
},
{
.name = "io_merged",
.private = offsetof(struct cfq_group, stats.merged),
.read_seq_string = cfqg_print_rwstat,
.seq_show = cfqg_print_rwstat,
},
{
.name = "io_queued",
.private = offsetof(struct cfq_group, stats.queued),
.read_seq_string = cfqg_print_rwstat,
.seq_show = cfqg_print_rwstat,
},
/* the same statictics which cover the cfqg and its descendants */
{
.name = "time_recursive",
.private = offsetof(struct cfq_group, stats.time),
.read_seq_string = cfqg_print_stat_recursive,
.seq_show = cfqg_print_stat_recursive,
},
{
.name = "sectors_recursive",
.private = offsetof(struct cfq_group, stats.sectors),
.read_seq_string = cfqg_print_stat_recursive,
.seq_show = cfqg_print_stat_recursive,
},
{
.name = "io_service_bytes_recursive",
.private = offsetof(struct cfq_group, stats.service_bytes),
.read_seq_string = cfqg_print_rwstat_recursive,
.seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_serviced_recursive",
.private = offsetof(struct cfq_group, stats.serviced),
.read_seq_string = cfqg_print_rwstat_recursive,
.seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_service_time_recursive",
.private = offsetof(struct cfq_group, stats.service_time),
.read_seq_string = cfqg_print_rwstat_recursive,
.seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_wait_time_recursive",
.private = offsetof(struct cfq_group, stats.wait_time),
.read_seq_string = cfqg_print_rwstat_recursive,
.seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_merged_recursive",
.private = offsetof(struct cfq_group, stats.merged),
.read_seq_string = cfqg_print_rwstat_recursive,
.seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_queued_recursive",
.private = offsetof(struct cfq_group, stats.queued),
.read_seq_string = cfqg_print_rwstat_recursive,
.seq_show = cfqg_print_rwstat_recursive,
},
#ifdef CONFIG_DEBUG_BLK_CGROUP
{
.name = "avg_queue_size",
.read_seq_string = cfqg_print_avg_queue_size,
.seq_show = cfqg_print_avg_queue_size,
},
{
.name = "group_wait_time",
.private = offsetof(struct cfq_group, stats.group_wait_time),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
{
.name = "idle_time",
.private = offsetof(struct cfq_group, stats.idle_time),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
{
.name = "empty_time",
.private = offsetof(struct cfq_group, stats.empty_time),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
{
.name = "dequeue",
.private = offsetof(struct cfq_group, stats.dequeue),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
{
.name = "unaccounted_time",
.private = offsetof(struct cfq_group, stats.unaccounted_time),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
#endif /* CONFIG_DEBUG_BLK_CGROUP */
{ } /* terminate */
......
......@@ -163,7 +163,6 @@ static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
static void bcachecg_destroy(struct cgroup *cgroup)
{
struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
free_css_id(&bcache_subsys, &cg->css);
kfree(cg);
}
......
......@@ -21,6 +21,7 @@
#include <linux/xattr.h>
#include <linux/fs.h>
#include <linux/percpu-refcount.h>
#include <linux/seq_file.h>
#ifdef CONFIG_CGROUPS
......@@ -28,8 +29,6 @@ struct cgroupfs_root;
struct cgroup_subsys;
struct inode;
struct cgroup;
struct css_id;
struct eventfd_ctx;
extern int cgroup_init_early(void);
extern int cgroup_init(void);
......@@ -79,8 +78,6 @@ struct cgroup_subsys_state {
struct cgroup_subsys_state *parent;
unsigned long flags;
/* ID for this css, if possible */
struct css_id __rcu *id;
/* percpu_ref killing and RCU release */
struct rcu_head rcu_head;
......@@ -239,10 +236,6 @@ struct cgroup {
struct rcu_head rcu_head;
struct work_struct destroy_work;
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
/* directory xattrs */
struct simple_xattrs xattrs;
};
......@@ -280,6 +273,9 @@ enum {
* - "tasks" is removed. Everything should be at process
* granularity. Use "cgroup.procs" instead.
*
* - "cgroup.procs" is not sorted. pids will be unique unless they
* got recycled inbetween reads.
*
* - "release_agent" and "notify_on_release" are removed.
* Replacement notification mechanism will be implemented.
*
......@@ -320,9 +316,6 @@ struct cgroupfs_root {
/* Unique id for this hierarchy. */
int hierarchy_id;
/* A list running through the attached subsystems */
struct list_head subsys_list;
/* The root cgroup for this hierarchy */
struct cgroup top_cgroup;
......@@ -388,16 +381,6 @@ struct css_set {
struct rcu_head rcu_head;
};
/*
* cgroup_map_cb is an abstract callback API for reporting map-valued
* control files
*/
struct cgroup_map_cb {
int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
void *state;
};
/*
* struct cftype: handler definitions for cgroup control files
*
......@@ -445,10 +428,6 @@ struct cftype {
*/
struct cgroup_subsys *ss;
int (*open)(struct inode *inode, struct file *file);
ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft,
struct file *file,
char __user *buf, size_t nbytes, loff_t *ppos);
/*
* read_u64() is a shortcut for the common case of returning a
* single integer. Use it in place of read()
......@@ -458,24 +437,14 @@ struct cftype {
* read_s64() is a signed version of read_u64()
*/
s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
/*
* read_map() is used for defining a map of key/value
* pairs. It should call cb->fill(cb, key, value) for each
* entry. The key/value pairs (and their ordering) should not
* change between reboots.
*/
int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft,
struct cgroup_map_cb *cb);
/*
* read_seq_string() is used for outputting a simple sequence
* using seqfile.
*/
int (*read_seq_string)(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m);
ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft,
struct file *file,
const char __user *buf, size_t nbytes, loff_t *ppos);
/* generic seq_file read interface */
int (*seq_show)(struct seq_file *sf, void *v);
/* optional ops, implement all or none */
void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
void (*seq_stop)(struct seq_file *sf, void *v);
/*
* write_u64() is a shortcut for the common case of accepting
......@@ -504,27 +473,6 @@ struct cftype {
* kick type for multiplexing.
*/
int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
int (*release)(struct inode *inode, struct file *file);
/*
* register_event() callback will be used to add new userspace
* waiter for changes related to the cftype. Implement it if
* you want to provide this functionality. Use eventfd_signal()
* on eventfd to send notification to userspace.
*/
int (*register_event)(struct cgroup_subsys_state *css,
struct cftype *cft, struct eventfd_ctx *eventfd,
const char *args);
/*
* unregister_event() callback will be called when userspace
* closes the eventfd or on cgroup removing.
* This callback must be implemented, if you want provide
* notification functionality.
*/
void (*unregister_event)(struct cgroup_subsys_state *css,
struct cftype *cft,
struct eventfd_ctx *eventfd);
};
/*
......@@ -537,6 +485,26 @@ struct cftype_set {
struct cftype *cfts;
};
/*
* cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't
* access directly.
*/
struct cfent {
struct list_head node;
struct dentry *dentry;
struct cftype *type;
struct cgroup_subsys_state *css;
/* file xattrs */
struct simple_xattrs xattrs;
};
/* seq_file->private points to the following, only ->priv is public */
struct cgroup_open_file {
struct cfent *cfe;
void *priv;
};
/*
* See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
* function can be called as long as @cgrp is accessible.
......@@ -552,6 +520,18 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
return rcu_dereference(cgrp->name)->name;
}
static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
{
struct cgroup_open_file *of = seq->private;
return of->cfe->css;
}
static inline struct cftype *seq_cft(struct seq_file *seq)
{
struct cgroup_open_file *of = seq->private;
return of->cfe->type;
}
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
......@@ -631,12 +611,8 @@ struct cgroup_subsys {
#define MAX_CGROUP_TYPE_NAMELEN 32
const char *name;
/*
* Link to parent, and list entry in parent's children.
* Protected by cgroup_lock()
*/
/* link to parent, protected by cgroup_lock() */
struct cgroupfs_root *root;
struct list_head sibling;
/* list of cftype_sets */
struct list_head cftsets;
......
......@@ -7,6 +7,7 @@
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/cgroup.h>
#include <linux/eventfd.h>
struct vmpressure {
unsigned long scanned;
......@@ -33,13 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr);
extern void vmpressure_cleanup(struct vmpressure *vmpr);
extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
extern int vmpressure_register_event(struct cgroup_subsys_state *css,
struct cftype *cft,
extern int vmpressure_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd,
const char *args);
extern void vmpressure_unregister_event(struct cgroup_subsys_state *css,
struct cftype *cft,
extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd);
#else
static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
......
......@@ -854,7 +854,6 @@ config NUMA_BALANCING
menuconfig CGROUPS
boolean "Control Group support"
depends on EVENTFD
help
This option adds support for grouping sets of processes together, for
use with process control subsystems such as Cpusets, CFS, memory
......@@ -921,6 +920,7 @@ config MEMCG
bool "Memory Resource Controller for Control Groups"
depends on RESOURCE_COUNTERS
select MM_OWNER
select EVENTFD
help
Provides a memory resource controller that manages both anonymous
memory and page cache. (See Documentation/cgroups/memory.txt)
......@@ -1160,7 +1160,6 @@ config UIDGID_STRICT_TYPE_CHECKS
config SCHED_AUTOGROUP
bool "Automatic process group scheduling"
select EVENTFD
select CGROUPS
select CGROUP_SCHED
select FAIR_GROUP_SCHED
......
(This diff has been collapsed.)
......@@ -301,10 +301,9 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
spin_unlock_irq(&freezer->lock);
}
static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft,
struct seq_file *m)
static int freezer_read(struct seq_file *m, void *v)
{
struct cgroup_subsys_state *pos;
struct cgroup_subsys_state *css = seq_css(m), *pos;
rcu_read_lock();
......@@ -458,7 +457,7 @@ static struct cftype files[] = {
{
.name = "state",
.flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = freezer_read,
.seq_show = freezer_read,
.write_string = freezer_write,
},
{
......
......@@ -1731,66 +1731,41 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
* used, list of ranges of sequential numbers, is variable length,
* and since these maps can change value dynamically, one could read
* gibberish by doing partial reads while a list was changing.
* A single large read to a buffer that crosses a page boundary is
* ok, because the result being copied to user land is not recomputed
* across a page fault.
*/
static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
size_t count;
mutex_lock(&callback_mutex);
count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
mutex_unlock(&callback_mutex);
struct cpuset *cs = css_cs(seq_css(sf));
cpuset_filetype_t type = seq_cft(sf)->private;
ssize_t count;
char *buf, *s;
int ret = 0;
return count;
}
static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
size_t count;
count = seq_get_buf(sf, &buf);
s = buf;
mutex_lock(&callback_mutex);
count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
mutex_unlock(&callback_mutex);
return count;
}
static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct file *file,
char __user *buf, size_t nbytes,
loff_t *ppos)
{
struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
char *page;
ssize_t retval = 0;
char *s;
if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
return -ENOMEM;
s = page;
switch (type) {
case FILE_CPULIST:
s += cpuset_sprintf_cpulist(s, cs);
s += cpulist_scnprintf(s, count, cs->cpus_allowed);
break;
case FILE_MEMLIST:
s += cpuset_sprintf_memlist(s, cs);
s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
default:
retval = -EINVAL;
goto out;
ret = -EINVAL;
goto out_unlock;
}
*s++ = '\n';
retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
out:
free_page((unsigned long)page);
return retval;
if (s < buf + count - 1) {
*s++ = '\n';
seq_commit(sf, s - buf);
} else {
seq_commit(sf, -1);
}
out_unlock:
mutex_unlock(&callback_mutex);
return ret;
}
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
......@@ -1847,7 +1822,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
static struct cftype files[] = {
{
.name = "cpus",
.read = cpuset_common_file_read,
.seq_show = cpuset_common_seq_show,
.write_string = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
......@@ -1855,7 +1830,7 @@ static struct cftype files[] = {
{
.name = "mems",
.read = cpuset_common_file_read,
.seq_show = cpuset_common_seq_show,
.write_string = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
......
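One detail of the cpuset conversion above deserves a note: instead of formatting into a temporary page and copying it out with simple_read_from_buffer(), the new cpuset_common_seq_show() writes directly into the seq_file buffer with seq_get_buf() and commits the result with seq_commit(). Passing -1 to seq_commit() marks the buffer as overflowed, and the seq_file core then retries the show callback with a larger buffer. A stripped-down sketch of the idiom (kernel-side pseudocode; fill_chars() is a hypothetical stand-in for cpulist_scnprintf()/nodelist_scnprintf()):

static int example_seq_show(struct seq_file *sf, void *v)
{
	char *buf, *s;
	ssize_t count;

	count = seq_get_buf(sf, &buf);	/* room left in sf's buffer */
	s = buf + fill_chars(buf, count);

	if (s < buf + count - 1) {
		*s++ = '\n';
		seq_commit(sf, s - buf);	/* accept s - buf bytes */
	} else {
		seq_commit(sf, -1);		/* overflow: retried with
						 * a bigger buffer */
	}
	return 0;
}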
......@@ -7852,15 +7852,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
struct cgroup_map_cb *cb)
static int cpu_stats_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(css);
struct task_group *tg = css_tg(seq_css(sf));
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
cb->fill(cb, "nr_periods", cfs_b->nr_periods);
cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
cb->fill(cb, "throttled_time", cfs_b->throttled_time);
seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
return 0;
}
......@@ -7914,7 +7913,7 @@ static struct cftype cpu_files[] = {
},
{
.name = "stat",
.read_map = cpu_stats_show,
.seq_show = cpu_stats_show,
},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
......
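The cpu.stat change above also shows how map-valued files move off the removed ->read_map()/struct cgroup_map_cb interface: each cb->fill(cb, key, value) call simply becomes a seq_printf() of a "key value" line. A tiny sketch (the "nr_widgets" key and its value are made up for illustration):

/* Old map-valued handler: key/value pairs reported through a callback. */
static int example_map_show_old(struct cgroup_subsys_state *css,
				struct cftype *cft, struct cgroup_map_cb *cb)
{
	cb->fill(cb, "nr_widgets", 42);
	return 0;
}

/* New handler: the same pairs printed straight into the seq_file. */
static int example_map_show(struct seq_file *sf, void *v)
{
	seq_printf(sf, "nr_widgets %llu\n", 42ULL);
	return 0;
}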
......@@ -163,10 +163,9 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
return err;
}
static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
{
struct cpuacct *ca = css_ca(css);
struct cpuacct *ca = css_ca(seq_css(m));
u64 percpu;
int i;
......@@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_SYSTEM] = "system",
};
static int cpuacct_stats_show(struct cgroup_subsys_state *css,
struct cftype *cft, struct cgroup_map_cb *cb)
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(css);
struct cpuacct *ca = css_ca(seq_css(sf));
int cpu;
s64 val = 0;
......@@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
val += kcpustat->cpustat[CPUTIME_NICE];
}
val = cputime64_to_clock_t(val);
cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
val = 0;
for_each_online_cpu(cpu) {
......@@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
}
val = cputime64_to_clock_t(val);
cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
return 0;
}
......@@ -220,11 +218,11 @@ static struct cftype files[] = {
},
{
.name = "usage_percpu",
.read_seq_string = cpuacct_percpu_seq_read,
.seq_show = cpuacct_percpu_seq_show,
},
{
.name = "stat",
.read_map = cpuacct_stats_show,
.seq_show = cpuacct_stats_show,
},
{ } /* terminate */
};
......
......@@ -242,22 +242,16 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
return;
}
static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct file *file,
char __user *buf, size_t nbytes,
loff_t *ppos)
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
u64 val;
char str[64];
int idx, name, len;
int idx, name;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
idx = MEMFILE_IDX(cft->private);
name = MEMFILE_ATTR(cft->private);
val = res_counter_read_u64(&h_cg->hugepage[idx], name);
len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
return res_counter_read_u64(&h_cg->hugepage[idx], name);
}
static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
......@@ -337,28 +331,28 @@ static void __init __hugetlb_cgroup_file_init(int idx)
cft = &h->cgroup_files[0];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
cft->read = hugetlb_cgroup_read;
cft->read_u64 = hugetlb_cgroup_read_u64;
cft->write_string = hugetlb_cgroup_write;
/* Add the usage file */
cft = &h->cgroup_files[1];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
cft->read = hugetlb_cgroup_read;
cft->read_u64 = hugetlb_cgroup_read_u64;
/* Add the MAX usage file */
cft = &h->cgroup_files[2];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
cft->trigger = hugetlb_cgroup_reset;
cft->read = hugetlb_cgroup_read;
cft->read_u64 = hugetlb_cgroup_read_u64;
/* Add the failcntfile */
cft = &h->cgroup_files[3];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
cft->trigger = hugetlb_cgroup_reset;
cft->read = hugetlb_cgroup_read;
cft->read_u64 = hugetlb_cgroup_read_u64;
/* NULL terminate the last cft */
cft = &h->cgroup_files[4];
......
......@@ -45,6 +45,7 @@
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
......@@ -55,6 +56,7 @@
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
......@@ -227,6 +229,46 @@ struct mem_cgroup_eventfd_list {
struct eventfd_ctx *eventfd;
};
/*
* cgroup_event represents events which userspace want to receive.
*/
struct mem_cgroup_event {
/*
* memcg which the event belongs to.
*/
struct mem_cgroup *memcg;
/*
* eventfd to signal userspace about the event.
*/
struct eventfd_ctx *eventfd;
/*
* Each of these stored in a list by the cgroup.
*/
struct list_head list;
/*
* register_event() callback will be used to add new userspace
* waiter for changes related to this event. Use eventfd_signal()
* on eventfd to send notification to userspace.
*/
int (*register_event)(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args);
/*
* unregister_event() callback will be called when userspace closes
* the eventfd or on cgroup removing. This callback must be set,
* if you want provide notification functionality.
*/
void (*unregister_event)(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd);
/*
* All fields below needed to unregister event when
* userspace closes eventfd.
*/
poll_table pt;
wait_queue_head_t *wqh;
wait_queue_t wait;
struct work_struct remove;
};
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
......@@ -331,6 +373,10 @@ struct mem_cgroup {
atomic_t numainfo_updating;
#endif
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};
......@@ -490,11 +536,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
{
return &mem_cgroup_from_css(css)->vmpressure;
}
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return (memcg == root_mem_cgroup);
......@@ -2976,10 +3017,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
}
#ifdef CONFIG_SLABINFO
static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct memcg_cache_params *params;
if (!memcg_can_account_kmem(memcg))
......@@ -5112,14 +5152,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
return val << PAGE_SHIFT;
}
static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct file *file,
char __user *buf, size_t nbytes, loff_t *ppos)
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
char str[64];
u64 val;
int name, len;
int name;
enum res_type type;
type = MEMFILE_TYPE(cft->private);
......@@ -5145,8 +5183,7 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
BUG();
}
len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
return val;
}
static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
......@@ -5383,8 +5420,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#endif
#ifdef CONFIG_NUMA
static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
const char *name;
......@@ -5400,7 +5436,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
const struct numa_stat *stat;
int nid;
unsigned long nr;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
......@@ -5439,10 +5475,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
}
static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
struct seq_file *m)
static int memcg_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct mem_cgroup *mi;
unsigned int i;
......@@ -5651,13 +5686,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
mem_cgroup_oom_notify_cb(iter);
}
static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
enum res_type type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
int i, size, ret;
......@@ -5734,13 +5767,23 @@ static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
return ret;
}
static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
struct cftype *cft, struct eventfd_ctx *eventfd)
static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
}
static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
}
static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, enum res_type type)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
enum res_type type = MEMFILE_TYPE(cft->private);
u64 usage;
int i, j, size;
......@@ -5813,14 +5856,23 @@ static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
mutex_unlock(&memcg->thresholds_lock);
}
static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd)
{
return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
}
static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd)
{
return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
}
static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *event;
enum res_type type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
......@@ -5838,14 +5890,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
return 0;
}
static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
struct cftype *cft, struct eventfd_ctx *eventfd)
static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *ev, *tmp;
enum res_type type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
spin_lock(&memcg_oom_lock);
......@@ -5859,17 +5907,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
spin_unlock(&memcg_oom_lock);
}
static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct cgroup_map_cb *cb)
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
if (atomic_read(&memcg->under_oom))
cb->fill(cb, "under_oom", 1);
else
cb->fill(cb, "under_oom", 0);
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
return 0;
}
......@@ -5962,41 +6005,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
}
#endif
/*
* DO NOT USE IN NEW FILES.
*
* "cgroup.event_control" implementation.
*
* This is way over-engineered. It tries to support fully configurable
* events for each user. Such level of flexibility is completely
* unnecessary especially in the light of the planned unified hierarchy.
*
* Please deprecate this and replace with something simpler if at all
* possible.
*/
/*
* Unregister event and free resources.
*
* Gets called from workqueue.
*/
static void memcg_event_remove(struct work_struct *work)
{
struct mem_cgroup_event *event =
container_of(work, struct mem_cgroup_event, remove);
struct mem_cgroup *memcg = event->memcg;
remove_wait_queue(event->wqh, &event->wait);
event->unregister_event(memcg, event->eventfd);
/* Notify userspace the event is going away. */
eventfd_signal(event->eventfd, 1);
eventfd_ctx_put(event->eventfd);
kfree(event);
css_put(&memcg->css);
}
/*
* Gets called on POLLHUP on eventfd when user closes it.
*
* Called with wqh->lock held and interrupts disabled.
*/
static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
int sync, void *key)
{
struct mem_cgroup_event *event =
container_of(wait, struct mem_cgroup_event, wait);
struct mem_cgroup *memcg = event->memcg;
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
/*
* If the event has been detached at cgroup removal, we
* can simply return knowing the other side will cleanup
* for us.
*
* We can't race against event freeing since the other
* side will require wqh->lock via remove_wait_queue(),
* which we hold.
*/
spin_lock(&memcg->event_list_lock);
if (!list_empty(&event->list)) {
list_del_init(&event->list);
/*
* We are in atomic context, but cgroup_event_remove()
* may sleep, so we have to call it in workqueue.
*/
schedule_work(&event->remove);
}
spin_unlock(&memcg->event_list_lock);
}
return 0;
}
static void memcg_event_ptable_queue_proc(struct file *file,
wait_queue_head_t *wqh, poll_table *pt)
{
struct mem_cgroup_event *event =
container_of(pt, struct mem_cgroup_event, pt);
event->wqh = wqh;
add_wait_queue(wqh, &event->wait);
}
/*
* DO NOT USE IN NEW FILES.
*
* Parse input and register new cgroup event handler.
*
* Input must be in format '<event_fd> <control_fd> <args>'.
* Interpretation of args is defined by control file implementation.
*/
static int memcg_write_event_control(struct cgroup_subsys_state *css,
struct cftype *cft, const char *buffer)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_event *event;
struct cgroup_subsys_state *cfile_css;
unsigned int efd, cfd;
struct fd efile;
struct fd cfile;
const char *name;
char *endp;
int ret;
efd = simple_strtoul(buffer, &endp, 10);
if (*endp != ' ')
return -EINVAL;
buffer = endp + 1;
cfd = simple_strtoul(buffer, &endp, 10);
if ((*endp != ' ') && (*endp != '\0'))
return -EINVAL;
buffer = endp + 1;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
event->memcg = memcg;
INIT_LIST_HEAD(&event->list);
init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
init_waitqueue_func_entry(&event->wait, memcg_event_wake);
INIT_WORK(&event->remove, memcg_event_remove);
efile = fdget(efd);
if (!efile.file) {
ret = -EBADF;
goto out_kfree;
}
event->eventfd = eventfd_ctx_fileget(efile.file);
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
goto out_put_efile;
}
cfile = fdget(cfd);
if (!cfile.file) {
ret = -EBADF;
goto out_put_eventfd;
}
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
ret = inode_permission(file_inode(cfile.file), MAY_READ);
if (ret < 0)
goto out_put_cfile;
/*
* Determine the event callbacks and set them in @event. This used
* to be done via struct cftype but cgroup core no longer knows
* about these events. The following is crude but the whole thing
* is for compatibility anyway.
*
* DO NOT ADD NEW FILES.
*/
name = cfile.file->f_dentry->d_name.name;
if (!strcmp(name, "memory.usage_in_bytes")) {
event->register_event = mem_cgroup_usage_register_event;
event->unregister_event = mem_cgroup_usage_unregister_event;
} else if (!strcmp(name, "memory.oom_control")) {
event->register_event = mem_cgroup_oom_register_event;
event->unregister_event = mem_cgroup_oom_unregister_event;
} else if (!strcmp(name, "memory.pressure_level")) {
event->register_event = vmpressure_register_event;
event->unregister_event = vmpressure_unregister_event;
} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
event->register_event = memsw_cgroup_usage_register_event;
event->unregister_event = memsw_cgroup_usage_unregister_event;
} else {
ret = -EINVAL;
goto out_put_cfile;
}
/*
* Verify @cfile should belong to @css. Also, remaining events are
* automatically removed on cgroup destruction but the removal is
* asynchronous, so take an extra ref on @css.
*/
rcu_read_lock();
ret = -EINVAL;
cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
&mem_cgroup_subsys);
if (cfile_css == css && css_tryget(css))
ret = 0;
rcu_read_unlock();
if (ret)
goto out_put_cfile;
ret = event->register_event(memcg, event->eventfd, buffer);
if (ret)
goto out_put_css;
efile.file->f_op->poll(efile.file, &event->pt);
spin_lock(&memcg->event_list_lock);
list_add(&event->list, &memcg->event_list);
spin_unlock(&memcg->event_list_lock);
fdput(cfile);
fdput(efile);
return 0;
out_put_css:
css_put(css);
out_put_cfile:
fdput(cfile);
out_put_eventfd:
eventfd_ctx_put(event->eventfd);
out_put_efile:
fdput(efile);
out_kfree:
kfree(event);
return ret;
}
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
.read = mem_cgroup_read,
.register_event = mem_cgroup_usage_register_event,
.unregister_event = mem_cgroup_usage_unregister_event,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
.write_string = mem_cgroup_write,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
.write_string = mem_cgroup_write,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "failcnt",
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "stat",
.read_seq_string = memcg_stat_show,
.seq_show = memcg_stat_show,
},
{
.name = "force_empty",
......@@ -6008,6 +6271,12 @@ static struct cftype mem_cgroup_files[] = {
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
{
.name = "cgroup.event_control", /* XXX: for compat */
.write_string = memcg_write_event_control,
.flags = CFTYPE_NO_PREFIX,
.mode = S_IWUGO,
},
{
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
......@@ -6020,21 +6289,17 @@ static struct cftype mem_cgroup_files[] = {
},
{
.name = "oom_control",
.read_map = mem_cgroup_oom_control_read,
.seq_show = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
.register_event = mem_cgroup_oom_register_event,
.unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
{
.name = "pressure_level",
.register_event = vmpressure_register_event,
.unregister_event = vmpressure_unregister_event,
},
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
.read_seq_string = memcg_numa_stat_show,
.seq_show = memcg_numa_stat_show,
},
#endif
#ifdef CONFIG_MEMCG_KMEM
......@@ -6042,29 +6307,29 @@ static struct cftype mem_cgroup_files[] = {
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
.write_string = mem_cgroup_write,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.failcnt",
.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
#ifdef CONFIG_SLABINFO
{
.name = "kmem.slabinfo",
.read_seq_string = mem_cgroup_slabinfo_read,
.seq_show = mem_cgroup_slabinfo_read,
},
#endif
#endif
......@@ -6076,27 +6341,25 @@ static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
.read = mem_cgroup_read,
.register_event = mem_cgroup_usage_register_event,
.unregister_event = mem_cgroup_usage_unregister_event,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
.write_string = mem_cgroup_write,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.failcnt",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{ }, /* terminate */
};
......@@ -6268,6 +6531,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
return &memcg->css;
......@@ -6343,6 +6608,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_event *event, *tmp;
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
* directory to avoid race between userspace and kernelspace.
*/
spin_lock(&memcg->event_list_lock);
list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
list_del_init(&event->list);
schedule_work(&event->remove);
}
spin_unlock(&memcg->event_list_lock);
kmem_cgroup_css_offline(memcg);
......
......@@ -451,7 +451,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
* lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
* @ent: swap entry to be looked up.
*
* Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
* Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
*/
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
......
......@@ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
/**
* vmpressure_register_event() - Bind vmpressure notifications to an eventfd
* @css: css that is interested in vmpressure notifications
* @cft: cgroup control files handle
* @memcg: memcg that is interested in vmpressure notifications
* @eventfd: eventfd context to link notifications with
* @args: event arguments (used to set up a pressure level threshold)
*
......@@ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
* "critical").
*
* This function should not be used directly, just pass it to (struct
* cftype).register_event, and then cgroup core will handle everything by
* itself.
* To be used as memcg event method.
*/
int vmpressure_register_event(struct cgroup_subsys_state *css,
struct cftype *cft, struct eventfd_ctx *eventfd,
const char *args)
int vmpressure_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
struct vmpressure *vmpr = css_to_vmpressure(css);
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
int level;
......@@ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css,
/**
* vmpressure_unregister_event() - Unbind eventfd from vmpressure
* @css: css handle
* @cft: cgroup control files handle
* @memcg: memcg handle
* @eventfd: eventfd context that was used to link vmpressure with the @cg
*
* This function does internal manipulations to detach the @eventfd from
* the vmpressure notifications, and then frees internal resources
* associated with the @eventfd (but the @eventfd itself is not freed).
*
* This function should not be used directly, just pass it to (struct
* cftype).unregister_event, and then cgroup core will handle everything
* by itself.
* To be used as memcg event method.
*/
void vmpressure_unregister_event(struct cgroup_subsys_state *css,
struct cftype *cft,
void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd)
{
struct vmpressure *vmpr = css_to_vmpressure(css);
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
mutex_lock(&vmpr->events_lock);
......
......@@ -173,14 +173,14 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
return css->cgroup->id;
}
static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
struct cgroup_map_cb *cb)
static int read_priomap(struct seq_file *sf, void *v)
{
struct net_device *dev;
rcu_read_lock();
for_each_netdev_rcu(&init_net, dev)
cb->fill(cb, dev->name, netprio_prio(css, dev));
seq_printf(sf, "%s %u\n", dev->name,
netprio_prio(seq_css(sf), dev));
rcu_read_unlock();
return 0;
}
......@@ -238,7 +238,7 @@ static struct cftype ss_files[] = {
},
{
.name = "ifpriomap",
.read_map = read_priomap,
.seq_show = read_priomap,
.write_string = write_priomap,
},
{ } /* terminate */
......
......@@ -274,10 +274,9 @@ static void set_majmin(char *str, unsigned m)
sprintf(str, "%u", m);
}
static int devcgroup_seq_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
static int devcgroup_seq_show(struct seq_file *m, void *v)
{
struct dev_cgroup *devcgroup = css_to_devcgroup(css);
struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m));
struct dev_exception_item *ex;
char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
......@@ -679,7 +678,7 @@ static struct cftype dev_cgroup_files[] = {
},
{
.name = "list",
.read_seq_string = devcgroup_seq_read,
.seq_show = devcgroup_seq_show,
.private = DEVCG_LIST,
},
{ } /* terminate */
......