Commit 6e543d57, authored by Lisa Du, committed by Linus Torvalds

mm: vmscan: fix do_try_to_free_pages() livelock

This patch is based on KOSAKI's work, with a little more description added;
please refer to https://lkml.org/lkml/2012/6/14/74.

I found that the system can enter a state where a zone has plenty of free
pages, but only order-0 and order-1 pages, which means the zone is heavily
fragmented.  A high-order allocation can then cause a long stall (e.g. 60
seconds) in the direct reclaim path, especially on systems with no swap and
no compaction.  This problem was hit on v3.4, but the issue still seems to
exist in the current tree; the reason is that do_try_to_free_pages() enters
a livelock:

kswapd goes to sleep if the zones have been fully scanned and are still not
balanced, since it sees little point in trying all over again and wants to
avoid an infinite loop.  Instead it drops the reclaim order from high-order
to order-0, because kswapd considers order-0 the most important; see commit
73ce02e9 for the details.  If the order-0 watermarks are OK, kswapd goes
back to sleep and may leave zone->all_unreclaimable = 0, assuming that
high-order users can still perform direct reclaim if they wish.
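
A minimal userspace model of that decision (the struct and helper names
here are made up for illustration; this is not the kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-in for the state kswapd looks at for a node. */
    struct node_state {
        bool fully_scanned;        /* all zones scanned down to the lowest priority */
        bool balanced_high_order;  /* watermarks ok for the requested high order */
        bool balanced_order0;      /* watermarks ok for order-0 */
    };

    /*
     * Models the 73ce02e9 behaviour: rather than loop forever on a high
     * order it cannot satisfy, kswapd falls back to order-0 and goes back
     * to sleep once order-0 watermarks are met.  Nothing on this path
     * sets zone->all_unreclaimable.
     */
    static int kswapd_next_order(const struct node_state *n, int order)
    {
        if (order > 0 && n->fully_scanned && !n->balanced_high_order)
            return 0;               /* retry as an order-0 balance */
        return order;
    }

    int main(void)
    {
        struct node_state n = { true, false, true };

        if (kswapd_next_order(&n, 2) == 0 && n.balanced_order0)
            puts("kswapd sleeps; all_unreclaimable stays 0");
        return 0;
    }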

Direct reclaim keeps reclaiming for a high order that is not a costly order
(order <= PAGE_ALLOC_COSTLY_ORDER), without invoking the OOM killer, until
kswapd sets zone->all_unreclaimable.  This is done to avoid a premature
OOM kill, but it means direct reclaim depends on kswapd to break out of
this loop.
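
The resulting dependency can be modelled like this (again an illustrative
userspace sketch, not the kernel's functions):

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_ALLOC_COSTLY_ORDER 3

    /* In the old scheme only kswapd ever set this flag. */
    static bool all_unreclaimable;

    /* Pretend reclaim: frees nothing useful for the high order, but keeps
     * reporting "progress" as long as the flag is clear. */
    static bool do_try_to_free_pages(void)
    {
        return !all_unreclaimable;
    }

    int main(void)
    {
        int order = 2;          /* <= PAGE_ALLOC_COSTLY_ORDER: no OOM kill */
        long attempts = 0;

        while (order <= PAGE_ALLOC_COSTLY_ORDER && do_try_to_free_pages()) {
            if (++attempts == 1000000) {  /* the real loop has no such bound */
                puts("still retrying: kswapd is asleep and never sets the flag");
                break;
            }
        }
        return 0;
    }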

In the worst case, direct reclaim may keep reclaiming pages forever while
kswapd sleeps forever, until something like a watchdog detects the stall
and finally kills the process, as described in:
http://thread.gmane.org/gmane.linux.kernel.mm/103737

We can't set zone->all_unreclaimable from the direct reclaim path, because
direct reclaim does not take any lock and doing it there would be racy.
This patch therefore removes the zone->all_unreclaimable field completely
and recalculates the zone's reclaimable state every time it is needed.
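
Concretely, "reclaimable" is now recomputed on demand from the LRU sizes
and the zone's scan counter; the check added in the diff below boils down
to:

    /*
     * A zone stops being considered reclaimable once it has been scanned
     * six times over without any page being freed back to it
     * (zone->pages_scanned is reset whenever pages are freed to the zone).
     */
    bool zone_reclaimable(struct zone *zone)
    {
        return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
    }

For example, a zone with roughly 1000 reclaimable LRU pages is reported as
unreclaimable once pages_scanned reaches 6000, and any successful free of
pages back into the zone makes it reclaimable again.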

Note: we can't adopt the idea of having direct reclaim look at
zone->pages_scanned directly while kswapd continues to use
zone->all_unreclaimable, because that is racy; commit 929bea7c ("vmscan:
all_unreclaimable() use zone->all_unreclaimable as a name") describes the
details.

[akpm@linux-foundation.org: uninline zone_reclaimable_pages() and zone_reclaimable()]
Cc: Aaditya Kumar <aaditya.kumar.30@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Nick Piggin <npiggin@gmail.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Bob Liu <lliubbo@gmail.com>
Cc: Neil Zhang <zhangwm@marvell.com>
Cc: Russell King - ARM Linux <linux@arm.linux.org.uk>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Lisa Du <cldu@marvell.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Parent: 7a8010cd
@@ -2,6 +2,7 @@
 #define LINUX_MM_INLINE_H

 #include <linux/huge_mm.h>
+#include <linux/swap.h>

 /**
  * page_is_file_cache - should the page be on a file LRU or anon LRU?
......
@@ -353,7 +353,6 @@ struct zone {
         * free areas of different sizes
         */
        spinlock_t              lock;
-       int                     all_unreclaimable; /* All pages pinned */
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* Set to true when the PG_migrate_skip bits should be cleared */
        bool                    compact_blockskip_flush;
......
@@ -143,7 +143,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 }

 extern unsigned long global_reclaimable_pages(void);
-extern unsigned long zone_reclaimable_pages(struct zone *zone);

 #ifdef CONFIG_NUMA
 /*
......
@@ -85,6 +85,8 @@ extern unsigned long highest_memmap_pfn;
  */
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
+extern unsigned long zone_reclaimable_pages(struct zone *zone);
+extern bool zone_reclaimable(struct zone *zone);

 /*
  * in mm/rmap.c:
......
@@ -1471,7 +1471,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
                if (!populated_zone(zone))
                        continue;

-               if (zone->all_unreclaimable)
+               if (!zone_reclaimable(zone))
                        continue;

                /* Avoid waking kswapd by allocating pages_to_migrate pages. */
......
@@ -36,8 +36,11 @@
 #include <linux/pagevec.h>
 #include <linux/timer.h>
 #include <linux/sched/rt.h>
+#include <linux/mm_inline.h>
 #include <trace/events/writeback.h>

+#include "internal.h"
+
 /*
  * Sleep at most 200ms at a time in balance_dirty_pages().
  */
......
@@ -56,6 +56,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/memcontrol.h>
 #include <linux/prefetch.h>
+#include <linux/mm_inline.h>
 #include <linux/migrate.h>
 #include <linux/page-debug-flags.h>
 #include <linux/hugetlb.h>
@@ -647,7 +648,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
        int to_free = count;

        spin_lock(&zone->lock);
-       zone->all_unreclaimable = 0;
        zone->pages_scanned = 0;

        while (to_free) {
@@ -696,7 +696,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
                                int migratetype)
 {
        spin_lock(&zone->lock);
-       zone->all_unreclaimable = 0;
        zone->pages_scanned = 0;

        __free_one_page(page, zone, order, migratetype);
@@ -3164,7 +3163,7 @@ void show_free_areas(unsigned int filter)
                        K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
                        K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
                        zone->pages_scanned,
-                       (zone->all_unreclaimable ? "yes" : "no")
+                       (!zone_reclaimable(zone) ? "yes" : "no")
                        );
                printk("lowmem_reserve[]:");
                for (i = 0; i < MAX_NR_ZONES; i++)
......
@@ -146,6 +146,25 @@ static bool global_reclaim(struct scan_control *sc)
 }
 #endif

+unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+       int nr;
+
+       nr = zone_page_state(zone, NR_ACTIVE_FILE) +
+            zone_page_state(zone, NR_INACTIVE_FILE);
+
+       if (get_nr_swap_pages() > 0)
+               nr += zone_page_state(zone, NR_ACTIVE_ANON) +
+                     zone_page_state(zone, NR_INACTIVE_ANON);
+
+       return nr;
+}
+
+bool zone_reclaimable(struct zone *zone)
+{
+       return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+
 static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
        if (!mem_cgroup_disabled())
@@ -1789,7 +1808,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
         * latencies, so it's better to scan a minimum amount there as
         * well.
         */
-       if (current_is_kswapd() && zone->all_unreclaimable)
+       if (current_is_kswapd() && !zone_reclaimable(zone))
                force_scan = true;
        if (!global_reclaim(sc))
                force_scan = true;
@@ -2244,8 +2263,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                if (global_reclaim(sc)) {
                        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                continue;
-                       if (zone->all_unreclaimable &&
-                                       sc->priority != DEF_PRIORITY)
+                       if (sc->priority != DEF_PRIORITY &&
+                           !zone_reclaimable(zone))
                                continue;       /* Let kswapd poll it */
                        if (IS_ENABLED(CONFIG_COMPACTION)) {
                                /*
@@ -2283,11 +2302,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
        return aborted_reclaim;
 }

-static bool zone_reclaimable(struct zone *zone)
-{
-       return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
-}
-
 /* All zones in zonelist are unreclaimable? */
 static bool all_unreclaimable(struct zonelist *zonelist,
                struct scan_control *sc)
@@ -2301,7 +2315,7 @@ static bool all_unreclaimable(struct zonelist *zonelist,
                        continue;
                if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                        continue;
-               if (!zone->all_unreclaimable)
+               if (zone_reclaimable(zone))
                        return false;
        }
@@ -2712,7 +2726,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
                 * DEF_PRIORITY. Effectively, it considers them balanced so
                 * they must be considered balanced here as well!
                 */
-               if (zone->all_unreclaimable) {
+               if (!zone_reclaimable(zone)) {
                        balanced_pages += zone->managed_pages;
                        continue;
                }
@@ -2773,7 +2787,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
                               unsigned long lru_pages,
                               unsigned long *nr_attempted)
 {
-       unsigned long nr_slab;
        int testorder = sc->order;
        unsigned long balance_gap;
        struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2818,15 +2831,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
        shrink_zone(zone, sc);

        reclaim_state->reclaimed_slab = 0;
-       nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
+       shrink_slab(&shrink, sc->nr_scanned, lru_pages);
        sc->nr_reclaimed += reclaim_state->reclaimed_slab;

        /* Account for the number of pages attempted to reclaim */
        *nr_attempted += sc->nr_to_reclaim;

-       if (nr_slab == 0 && !zone_reclaimable(zone))
-               zone->all_unreclaimable = 1;
-
        zone_clear_flag(zone, ZONE_WRITEBACK);

        /*
@@ -2835,7 +2845,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
         * BDIs but as pressure is relieved, speculatively avoid congestion
         * waits.
         */
-       if (!zone->all_unreclaimable &&
+       if (zone_reclaimable(zone) &&
            zone_balanced(zone, testorder, 0, classzone_idx)) {
                zone_clear_flag(zone, ZONE_CONGESTED);
                zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
@@ -2901,8 +2911,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                        if (!populated_zone(zone))
                                continue;

-                       if (zone->all_unreclaimable &&
-                           sc.priority != DEF_PRIORITY)
+                       if (sc.priority != DEF_PRIORITY &&
+                           !zone_reclaimable(zone))
                                continue;

                        /*
@@ -2980,8 +2990,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                        if (!populated_zone(zone))
                                continue;

-                       if (zone->all_unreclaimable &&
-                           sc.priority != DEF_PRIORITY)
+                       if (sc.priority != DEF_PRIORITY &&
+                           !zone_reclaimable(zone))
                                continue;

                        sc.nr_scanned = 0;
@@ -3265,20 +3275,6 @@ unsigned long global_reclaimable_pages(void)
        return nr;
 }

-unsigned long zone_reclaimable_pages(struct zone *zone)
-{
-       int nr;
-
-       nr = zone_page_state(zone, NR_ACTIVE_FILE) +
-            zone_page_state(zone, NR_INACTIVE_FILE);
-
-       if (get_nr_swap_pages() > 0)
-               nr += zone_page_state(zone, NR_ACTIVE_ANON) +
-                     zone_page_state(zone, NR_INACTIVE_ANON);
-
-       return nr;
-}
-
 #ifdef CONFIG_HIBERNATION
 /*
  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3576,7 +3572,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
            zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
                return ZONE_RECLAIM_FULL;

-       if (zone->all_unreclaimable)
+       if (!zone_reclaimable(zone))
                return ZONE_RECLAIM_FULL;

        /*
......
@@ -19,6 +19,9 @@
 #include <linux/math64.h>
 #include <linux/writeback.h>
 #include <linux/compaction.h>
+#include <linux/mm_inline.h>
+
+#include "internal.h"

 #ifdef CONFIG_VM_EVENT_COUNTERS
 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -1088,7 +1091,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                   "\n  all_unreclaimable: %u"
                   "\n  start_pfn:         %lu"
                   "\n  inactive_ratio:    %u",
-                  zone->all_unreclaimable,
+                  !zone_reclaimable(zone),
                   zone->zone_start_pfn,
                   zone->inactive_ratio);
        seq_putc(m, '\n');
......