mm: vmscan: block kswapd if it is encountering pages under writeback

Historically, kswapd used to congestion_wait() at higher priorities if it was not making forward progress. This made no sense as the failure to make progress could be completely independent of IO. It was later replaced by wait_iff_congested() and removed entirely by commit 258401a6 (mm: don't wait on congested zones in balance_pgdat()) as it was duplicating logic in shrink_inactive_list(). This is problematic. If kswapd encounters many pages under writeback and it continues to scan until it reaches the high watermark then it will quickly skip over the pages under writeback and reclaim clean young pages or push applications out to swap. The use of wait_iff_congested() is not suited to kswapd as it will only stall if the underlying BDI is really congested or a direct reclaimer was unable to write to the underlying BDI. kswapd bypasses the BDI congestion as it sets PF_SWAPWRITE but even if this was taken into account then it would cause direct reclaimers to stall on writeback which is not desirable. This patch sets a ZONE_WRITEBACK flag if direct reclaim or kswapd is encountering too many pages under writeback. If this flag is set and kswapd encounters a PageReclaim page under writeback then it'll assume that the LRU lists are being recycled too quickly before IO can complete and block waiting for some IO to complete. Signed-off-by: N Mel Gorman <mgorman@suse.de> Reviewed-by: N Michal Hocko <mhocko@suse.cz> Acked-by: N Rik van Riel <riel@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Tested-by: N Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Signed-off-by: N Andrew Morton <akpm@linux-foundation.org> Signed-off-by: N Linus Torvalds <torvalds@linux-foundation.org>

mm: vmscan: block kswapd if it is encountering pages under writeback
Historically, kswapd used to congestion_wait() at higher priorities if it was not making forward progress. This made no sense as the failure to make progress could be completely independent of IO. It was later replaced by wait_iff_congested() and removed entirely by commit 258401a6 (mm: don't wait on congested zones in balance_pgdat()) as it was duplicating logic in shrink_inactive_list(). This is problematic. If kswapd encounters many pages under writeback and it continues to scan until it reaches the high watermark then it will quickly skip over the pages under writeback and reclaim clean young pages or push applications out to swap. The use of wait_iff_congested() is not suited to kswapd as it will only stall if the underlying BDI is really congested or a direct reclaimer was unable to write to the underlying BDI. kswapd bypasses the BDI congestion as it sets PF_SWAPWRITE but even if this was taken into account then it would cause direct reclaimers to stall on writeback which is not desirable. This patch sets a ZONE_WRITEBACK flag if direct reclaim or kswapd is encountering too many pages under writeback. If this flag is set and kswapd encounters a PageReclaim page under writeback then it'll assume that the LRU lists are being recycled too quickly before IO can complete and block waiting for some IO to complete. Signed-off-by: N Mel Gorman <mgorman@suse.de> Reviewed-by: N Michal Hocko <mhocko@suse.cz> Acked-by: N Rik van Riel <riel@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Tested-by: N Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Signed-off-by: N Andrew Morton <akpm@linux-foundation.org> Signed-off-by: N Linus Torvalds <torvalds@linux-foundation.org>
283aba9f · Mel Gorman · Linus Torvalds · d43006d5 · 283aba9f · 283aba9f
显示空白变更内容
内联并排

Showing with 68 addition and 22 deletion

include/linux/mmzone.h include/linux/mmzone.h +8 -0

mm/vmscan.c mm/vmscan.c +60 -22

未找到文件。
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -499,6 +499,9 @@ typedef enum {
 					 * many dirty file pages at the tail
 					 * of the LRU.
 					 */
+	ZONE_WRITEBACK,			/* reclaim scanning has recently found
+					 * many pages under writeback
+					 */
 } zone_flags_t;
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -526,6 +529,11 @@ static inline int zone_is_reclaim_dirty(const struct zone *zone)
 	return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
 }
+static inline int zone_is_reclaim_writeback(const struct zone *zone)
+{
+	return test_bit(ZONE_WRITEBACK, &zone->flags);
+}
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -724,25 +724,55 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
-		if (PageWriteback(page)) {
 		/*
-			 * memcg doesn't have any dirty pages throttling so we
+		 * If a page at the tail of the LRU is under writeback, there
-			 * could easily OOM just because too many pages are in
+		 * are three cases to consider.
-			 * writeback and there is nothing else to reclaim.
+		 *
-			 *
+		 * 1) If reclaim is encountering an excessive number of pages
-			 * Check __GFP_IO, certainly because a loop driver
+		 *    under writeback and this page is both under writeback and
-			 * thread might enter reclaim, and deadlock if it waits
+		 *    PageReclaim then it indicates that pages are being queued
-			 * on a page for which it is needed to do the write
+		 *    for IO but are being recycled through the LRU before the
-			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
+		 *    IO can complete. Waiting on the page itself risks an
-			 * but more thought would probably show more reasons.
+		 *    indefinite stall if it is impossible to writeback the
-			 *
+		 *    page due to IO error or disconnected storage so instead
-			 * Don't require __GFP_FS, since we're not going into
+		 *    block for HZ/10 or until some IO completes then clear the
-			 * the FS, just waiting on its writeback completion.
+		 *    ZONE_WRITEBACK flag to recheck if the condition exists.
-			 * Worryingly, ext4 gfs2 and xfs allocate pages with
+		 *
-			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
+		 * 2) Global reclaim encounters a page, memcg encounters a
-			 * testing may_enter_fs here is liable to OOM on them.
+		 *    page that is not marked for immediate reclaim or
+		 *    the caller does not have __GFP_IO. In this case mark
+		 *    the page for immediate reclaim and continue scanning.
+		 *
+		 *    __GFP_IO is checked  because a loop driver thread might
+		 *    enter reclaim, and deadlock if it waits on a page for
+		 *    which it is needed to do the write (loop masks off
+		 *    __GFP_IO|__GFP_FS for this reason); but more thought
+		 *    would probably show more reasons.
+		 *
+		 *    Don't require __GFP_FS, since we're not going into the
+		 *    FS, just waiting on its writeback completion. Worryingly,
+		 *    ext4 gfs2 and xfs allocate pages with
+		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+		 *    may_enter_fs here is liable to OOM on them.
+		 *
+		 * 3) memcg encounters a page that is not already marked
+		 *    PageReclaim. memcg does not have any dirty pages
+		 *    throttling so we could easily OOM just because too many
+		 *    pages are in writeback and there is nothing else to
+		 *    reclaim. Wait for the writeback to complete.
 		 */
-			if (global_reclaim(sc) ||
+		if (PageWriteback(page)) {
+			/* Case 1 above */
+			if (current_is_kswapd() &&
+			    PageReclaim(page) &&
+			    zone_is_reclaim_writeback(zone)) {
+				unlock_page(page);
+				congestion_wait(BLK_RW_ASYNC, HZ/10);
+				zone_clear_flag(zone, ZONE_WRITEBACK);
+				goto keep;
+			/* Case 2 above */
+			} else if (global_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -757,10 +787,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 */
 				SetPageReclaim(page);
 				nr_writeback++;
 				goto keep_locked;
-			}
+			/* Case 3 above */
+			} else {
 				wait_on_page_writeback(page);
 			}
+		}
 		if (!force_reclaim)
 			references = page_check_references(page, sc);
@@ -1374,8 +1408,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 *                     isolated page is PageWriteback
 	 */
 	if (nr_writeback && nr_writeback >=
-			(nr_taken >> (DEF_PRIORITY - sc->priority)))
+			(nr_taken >> (DEF_PRIORITY - sc->priority))) {
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+		zone_set_flag(zone, ZONE_WRITEBACK);
+	}
 	/*
 	 * Similarly, if many dirty pages are encountered that are not
@@ -2669,8 +2705,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 * the high watermark.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
+ * reclaim or if the lack of progress was due to pages under writeback.
- * raised.
+ * This is used to determine if the scanning priority needs to be raised.
 */
 static bool kswapd_shrink_zone(struct zone *zone,
 			       struct scan_control *sc,
@@ -2697,6 +2733,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	if (nr_slab == 0 && !zone_reclaimable(zone))
 		zone->all_unreclaimable = 1;
+	zone_clear_flag(zone, ZONE_WRITEBACK);
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }