diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 6cab1f29da4c9b94b35c603fd9a91a252c0231aa..eac22d3b2f7b70f662740518122866b8ea460e97 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -184,6 +184,9 @@ limits on the root cgroup. Note2: When panic_on_oom is set to "2", the whole system will panic. +When oom event notifier is registered, event will be delivered. +(See oom_control section) + 2. Locking The memory controller uses the following hierarchy @@ -488,7 +491,22 @@ threshold in any direction. It's applicable for root and non-root cgroup. -10. TODO +10. OOM Control + +Memory controler implements oom notifier using cgroup notification +API (See cgroups.txt). It allows to register multiple oom notification +delivery and gets notification when oom happens. + +To register a notifier, application need: + - create an eventfd using eventfd(2) + - open memory.oom_control file + - write string like " " to cgroup.event_control + +Application will be notifier through eventfd when oom happens. +OOM notification doesn't work for root cgroup. + + +11. TODO 1. Add support for accounting huge pages (as a separate controller) 2. Make per-cgroup scanner reclaim not-shared pages first diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94ac208b1490f4c23e88f79e38ef60cacbba59d0..da2ed3913316c107fd175abc912023bd4616e521 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -149,6 +149,7 @@ struct mem_cgroup_threshold { u64 threshold; }; +/* For threshold */ struct mem_cgroup_threshold_ary { /* An array index points to threshold just below usage. */ atomic_t current_threshold; @@ -157,8 +158,14 @@ struct mem_cgroup_threshold_ary { /* Array of thresholds */ struct mem_cgroup_threshold entries[0]; }; +/* for OOM */ +struct mem_cgroup_eventfd_list { + struct list_head list; + struct eventfd_ctx *eventfd; +}; static void mem_cgroup_threshold(struct mem_cgroup *mem); +static void mem_cgroup_oom_notify(struct mem_cgroup *mem); /* * The memory controller data structure. The memory controller controls both @@ -220,6 +227,9 @@ struct mem_cgroup { /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_threshold_ary *memsw_thresholds; + /* For oom notifier event fd */ + struct list_head oom_notify; + /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? @@ -282,9 +292,12 @@ enum charge_type { /* for encoding cft->private value on file */ #define _MEM (0) #define _MEMSWAP (1) +#define _OOM_TYPE (2) #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) +/* Used for OOM nofiier */ +#define OOM_CONTROL (0) /* * Reclaim flags for mem_cgroup_hierarchical_reclaim @@ -1353,6 +1366,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) */ if (!locked) prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); + else + mem_cgroup_oom_notify(mem); mutex_unlock(&memcg_oom_mutex); if (locked) @@ -3398,8 +3413,22 @@ static int compare_thresholds(const void *a, const void *b) return _a->threshold - _b->threshold; } -static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, - struct eventfd_ctx *eventfd, const char *args) +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) +{ + struct mem_cgroup_eventfd_list *ev; + + list_for_each_entry(ev, &mem->oom_notify, list) + eventfd_signal(ev->eventfd, 1); + return 0; +} + +static void mem_cgroup_oom_notify(struct mem_cgroup *mem) +{ + mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); +} + +static int mem_cgroup_usage_register_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; @@ -3483,8 +3512,8 @@ static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, return ret; } -static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, - struct eventfd_ctx *eventfd) +static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; @@ -3568,13 +3597,61 @@ static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, return ret; } +static int mem_cgroup_oom_register_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_eventfd_list *event; + int type = MEMFILE_TYPE(cft->private); + + BUG_ON(type != _OOM_TYPE); + event = kmalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + mutex_lock(&memcg_oom_mutex); + + event->eventfd = eventfd; + list_add(&event->list, &memcg->oom_notify); + + /* already in OOM ? */ + if (atomic_read(&memcg->oom_lock)) + eventfd_signal(eventfd, 1); + mutex_unlock(&memcg_oom_mutex); + + return 0; +} + +static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_eventfd_list *ev, *tmp; + int type = MEMFILE_TYPE(cft->private); + + BUG_ON(type != _OOM_TYPE); + + mutex_lock(&memcg_oom_mutex); + + list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { + if (ev->eventfd == eventfd) { + list_del(&ev->list); + kfree(ev); + } + } + + mutex_unlock(&memcg_oom_mutex); + + return 0; +} + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read_u64 = mem_cgroup_read, - .register_event = mem_cgroup_register_event, - .unregister_event = mem_cgroup_unregister_event, + .register_event = mem_cgroup_usage_register_event, + .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "max_usage_in_bytes", @@ -3623,6 +3700,12 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_move_charge_read, .write_u64 = mem_cgroup_move_charge_write, }, + { + .name = "oom_control", + .register_event = mem_cgroup_oom_register_event, + .unregister_event = mem_cgroup_oom_unregister_event, + .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), + }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -3631,8 +3714,8 @@ static struct cftype memsw_cgroup_files[] = { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), .read_u64 = mem_cgroup_read, - .register_event = mem_cgroup_register_event, - .unregister_event = mem_cgroup_unregister_event, + .register_event = mem_cgroup_usage_register_event, + .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "memsw.max_usage_in_bytes", @@ -3878,6 +3961,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) } mem->last_scanned_child = 0; spin_lock_init(&mem->reclaim_param_lock); + INIT_LIST_HEAD(&mem->oom_notify); if (parent) mem->swappiness = get_swappiness(parent);