diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a355f61a2ed39ba68cb8b13dc40659072f2a490b..c5a51039df57ff55d8e487a6b817385e7be4fd9e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -249,6 +249,10 @@ struct mem_cgroup { struct wb_domain cgwb_domain; #endif +#ifdef CONFIG_INET + unsigned long socket_pressure; +#endif + /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; @@ -290,18 +294,34 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); -struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; } +#define mem_cgroup_from_counter(counter, member) \ + container_of(counter, struct mem_cgroup, member) + struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); +/** + * parent_mem_cgroup - find the accounting parent of a memcg + * @memcg: memcg whose parent to find + * + * Returns the parent memcg, or NULL if this is the root or the memory + * controller is in legacy no-hierarchy mode. + */ +static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) +{ + if (!memcg->memory.parent) + return NULL; + return mem_cgroup_from_counter(memcg->memory.parent, memory); +} + static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { @@ -689,10 +709,14 @@ extern struct static_key memcg_sockets_enabled_key; static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { #ifdef CONFIG_MEMCG_KMEM - return memcg->tcp_mem.memory_pressure; -#else - return false; + if (memcg->tcp_mem.memory_pressure) + return true; #endif + do { + if (time_before(jiffies, memcg->socket_pressure)) + return true; + } while ((memcg = parent_mem_cgroup(memcg))); + return false; } #else #define mem_cgroup_sockets_enabled 0 diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 3e4535876d37493e590d43a687aa980646dab0c4..3347cc3ec0ab6b8ab6f43a6977d4512b59febe22 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -12,6 +12,9 @@ struct vmpressure { unsigned long scanned; unsigned long reclaimed; + + unsigned long tree_scanned; + unsigned long tree_reclaimed; /* The lock is used to keep the scanned/reclaimed above in sync. */ struct spinlock sr_lock; @@ -26,7 +29,7 @@ struct vmpressure { struct mem_cgroup; #ifdef CONFIG_MEMCG -extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, +extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed); extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); @@ -40,7 +43,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg, extern void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); #else -static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, +static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) {} static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) {} diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 60ebc486c2aa6ff559c3ac297518acb21d1f2229..df7f144a5a4b7b9287c019dfcfa69d643e5c62c4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1113,9 +1113,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) return ret; } -#define mem_cgroup_from_counter(counter, member) \ - container_of(counter, struct mem_cgroup, member) - /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup * @memcg: the memory cgroup @@ -4183,17 +4180,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) kfree(memcg); } -/* - * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. - */ -struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) -{ - if (!memcg->memory.parent) - return NULL; - return mem_cgroup_from_counter(memcg->memory.parent, memory); -} -EXPORT_SYMBOL(parent_mem_cgroup); - static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -4233,6 +4219,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); +#endif +#ifdef CONFIG_INET + memcg->socket_pressure = jiffies; #endif return &memcg->css; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index c5afd573d7da79afc814225043319cc66120addf..506f03e4be47a13f6dc2240b7497d506ce3f8eee 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -137,14 +137,11 @@ struct vmpressure_event { }; static bool vmpressure_event(struct vmpressure *vmpr, - unsigned long scanned, unsigned long reclaimed) + enum vmpressure_levels level) { struct vmpressure_event *ev; - enum vmpressure_levels level; bool signalled = false; - level = vmpressure_calc_level(scanned, reclaimed); - mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { @@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work) struct vmpressure *vmpr = work_to_vmpressure(work); unsigned long scanned; unsigned long reclaimed; + enum vmpressure_levels level; spin_lock(&vmpr->sr_lock); /* @@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work) * here. No need for any locks here since we don't care if * vmpr->reclaimed is in sync. */ - scanned = vmpr->scanned; + scanned = vmpr->tree_scanned; if (!scanned) { spin_unlock(&vmpr->sr_lock); return; } - reclaimed = vmpr->reclaimed; - vmpr->scanned = 0; - vmpr->reclaimed = 0; + reclaimed = vmpr->tree_reclaimed; + vmpr->tree_scanned = 0; + vmpr->tree_reclaimed = 0; spin_unlock(&vmpr->sr_lock); + level = vmpressure_calc_level(scanned, reclaimed); + do { - if (vmpressure_event(vmpr, scanned, reclaimed)) + if (vmpressure_event(vmpr, level)) break; /* * If not handled, propagate the event upward into the @@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work) * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle + * @tree: legacy subtree mode * @scanned: number of pages scanned * @reclaimed: number of pages reclaimed * @@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work) * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw * pressure index is then further refined and averaged over time. * + * If @tree is set, vmpressure is in traditional userspace reporting + * mode: @memcg is considered the pressure root and userspace is + * notified of the entire subtree's reclaim efficiency. + * + * If @tree is not set, reclaim efficiency is recorded for @memcg, and + * only in-kernel users are notified. + * * This function does not return any value. */ -void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); @@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, if (!scanned) return; - spin_lock(&vmpr->sr_lock); - vmpr->scanned += scanned; - vmpr->reclaimed += reclaimed; - scanned = vmpr->scanned; - spin_unlock(&vmpr->sr_lock); + if (tree) { + spin_lock(&vmpr->sr_lock); + vmpr->tree_scanned += scanned; + vmpr->tree_reclaimed += reclaimed; + scanned = vmpr->scanned; + spin_unlock(&vmpr->sr_lock); - if (scanned < vmpressure_win) - return; - schedule_work(&vmpr->work); + if (scanned < vmpressure_win) + return; + schedule_work(&vmpr->work); + } else { + enum vmpressure_levels level; + + /* For now, no users for root-level efficiency */ + if (memcg == root_mem_cgroup) + return; + + spin_lock(&vmpr->sr_lock); + scanned = vmpr->scanned += scanned; + reclaimed = vmpr->reclaimed += reclaimed; + if (scanned < vmpressure_win) { + spin_unlock(&vmpr->sr_lock); + return; + } + vmpr->scanned = vmpr->reclaimed = 0; + spin_unlock(&vmpr->sr_lock); + + level = vmpressure_calc_level(scanned, reclaimed); + + if (level > VMPRESSURE_LOW) { + /* + * Let the socket buffer allocator know that + * we are having trouble reclaiming LRU pages. + * + * For hysteresis keep the pressure state + * asserted for a second in which subsequent + * pressure events can occur. + */ + memcg->socket_pressure = jiffies + HZ; + } + } } /** @@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * to the vmpressure() basically means that we signal 'critical' * level. */ - vmpressure(gfp, memcg, vmpressure_win, 0); + vmpressure(gfp, memcg, true, vmpressure_win, 0); } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index e36d766dade97e668f8987c5c7a80b2f3af02fec..bb0cbd4c9f01f913a4779fd6ab3daf8bba4b207c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2396,6 +2396,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg = mem_cgroup_iter(root, NULL, &reclaim); do { unsigned long lru_pages; + unsigned long reclaimed; unsigned long scanned; struct lruvec *lruvec; int swappiness; @@ -2408,6 +2409,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); + reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; shrink_lruvec(lruvec, swappiness, sc, &lru_pages); @@ -2418,6 +2420,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg, sc->nr_scanned - scanned, lru_pages); + /* Record the group's reclaim efficiency */ + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + /* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the @@ -2449,7 +2456,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, reclaim_state->reclaimed_slab = 0; } - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + /* Record the subtree's reclaim efficiency */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed);