diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6be8fcb6f74f8cfbed43e825fa4d9e006a4837d1..512bf9a618c7ad884347fbad5ce81c6b289009c2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1457,47 +1457,171 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, return page; } -/* - * This is the 'heart' of the zoned buddy allocator. - */ -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +static inline int +should_alloc_retry(gfp_t gfp_mask, unsigned int order, + unsigned long pages_reclaimed) { - const gfp_t wait = gfp_mask & __GFP_WAIT; - enum zone_type high_zoneidx = gfp_zone(gfp_mask); - struct zoneref *z; - struct zone *zone; - struct page *page; - struct reclaim_state reclaim_state; - struct task_struct *p = current; - int do_retry; - int alloc_flags; - unsigned long did_some_progress; - unsigned long pages_reclaimed = 0; + /* Do not loop if specifically requested */ + if (gfp_mask & __GFP_NORETRY) + return 0; - lockdep_trace_alloc(gfp_mask); + /* + * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER + * means __GFP_NOFAIL, but that may not be true in other + * implementations. + */ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return 1; - might_sleep_if(wait); + /* + * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is + * specified, then we retry until we no longer reclaim any pages + * (above), or we've reclaimed an order of pages at least as + * large as the allocation's order. In both cases, if the + * allocation still fails, we stop retrying. + */ + if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) + return 1; - if (should_fail_alloc_page(gfp_mask, order)) - return NULL; + /* + * Don't let big-order allocations loop unless the caller + * explicitly requests that. + */ + if (gfp_mask & __GFP_NOFAIL) + return 1; - /* the list of zones suitable for gfp_mask */ - z = zonelist->_zonerefs; - if (unlikely(!z->zone)) { - /* - * Happens if we have an empty zonelist as a result of - * GFP_THISNODE being used on a memoryless node - */ + return 0; +} + +static inline struct page * +__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask) +{ + struct page *page; + + /* Acquire the OOM killer lock for the zones in zonelist */ + if (!try_set_zone_oom(zonelist, gfp_mask)) { + schedule_timeout_uninterruptible(1); return NULL; } -restart: - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); + /* + * Go through the zonelist yet one more time, keep very high watermark + * here, this is only to catch a parallel oom killing, we must fail if + * we're still under heavy pressure. + */ + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, + order, zonelist, high_zoneidx, + ALLOC_WMARK_HIGH|ALLOC_CPUSET); if (page) - goto got_pg; + goto out; + + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + + /* Exhausted what can be done so it's blamo time */ + out_of_memory(zonelist, gfp_mask, order); + +out: + clear_zonelist_oom(zonelist, gfp_mask); + return page; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress) +{ + struct page *page = NULL; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + + cond_resched(); + + /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); + + /* + * The task's cpuset might have expanded its set of allowable nodes + */ + p->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + + p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; + + cond_resched(); + + if (order != 0) + drain_all_pages(); + + if (likely(*did_some_progress)) + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, alloc_flags); + return page; +} + +static inline int +is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask) +{ + if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) + && !in_interrupt()) + return 1; + return 0; +} + +/* + * This is called in the allocator slow-path if the allocation request is of + * sufficient urgency to ignore watermarks and take other desperate measures + */ +static inline struct page * +__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask) +{ + struct page *page; + + do { + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); + + if (!page && gfp_mask & __GFP_NOFAIL) + congestion_wait(WRITE, HZ/50); + } while (!page && (gfp_mask & __GFP_NOFAIL)); + + return page; +} + +static inline +void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, + enum zone_type high_zoneidx) +{ + struct zoneref *z; + struct zone *zone; + + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order); +} + +static inline struct page * +__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask) +{ + const gfp_t wait = gfp_mask & __GFP_WAIT; + struct page *page = NULL; + int alloc_flags; + unsigned long pages_reclaimed = 0; + unsigned long did_some_progress; + struct task_struct *p = current; /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and @@ -1510,8 +1634,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wake_all_kswapd(order, zonelist, high_zoneidx); /* * OK, we're below the kswapd watermark and have kicked background @@ -1531,6 +1654,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (wait) alloc_flags |= ALLOC_CPUSET; +restart: /* * Go through the zonelist again. Let __GFP_HIGH and allocations * coming from realtime tasks go deeper into reserves. @@ -1544,23 +1668,18 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (page) goto got_pg; - /* This allocation should allow future memory freeing. */ - rebalance: - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) - && !in_interrupt()) { + /* Allocate without watermarks if the context allows */ + if (is_allocation_high_priority(p, gfp_mask)) { + /* Do not dip into emergency reserves if specified */ if (!(gfp_mask & __GFP_NOMEMALLOC)) { -nofail_alloc: - /* go through the zonelist yet again, ignoring mins */ - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); + page = __alloc_pages_high_priority(gfp_mask, order, + zonelist, high_zoneidx, nodemask); if (page) goto got_pg; - if (gfp_mask & __GFP_NOFAIL) { - congestion_wait(WRITE, HZ/50); - goto nofail_alloc; - } } + + /* Ensure no recursion into the allocator */ goto nopage; } @@ -1568,93 +1687,42 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (!wait) goto nopage; - cond_resched(); - - /* We now go into synchronous reclaim */ - cpuset_memory_pressure_bump(); - - p->flags |= PF_MEMALLOC; - - lockdep_set_current_reclaim_state(gfp_mask); - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; - - did_some_progress = try_to_free_pages(zonelist, order, - gfp_mask, nodemask); - - p->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + /* Try direct reclaim and then allocating */ + page = __alloc_pages_direct_reclaim(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, &did_some_progress); + if (page) + goto got_pg; - cond_resched(); + /* + * If we failed to make any progress reclaiming, then we are + * running out of options and have to consider going OOM + */ + if (!did_some_progress) { + if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + page = __alloc_pages_may_oom(gfp_mask, order, + zonelist, high_zoneidx, + nodemask); + if (page) + goto got_pg; - if (order != 0) - drain_all_pages(); + /* + * The OOM killer does not trigger for high-order allocations + * but if no progress is being made, there are no other + * options and retrying is unlikely to help + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto nopage; - if (likely(did_some_progress)) { - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, alloc_flags); - if (page) - goto got_pg; - } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { - if (!try_set_zone_oom(zonelist, gfp_mask)) { - schedule_timeout_uninterruptible(1); goto restart; } - - /* - * Go through the zonelist yet one more time, keep - * very high watermark here, this is only to catch - * a parallel oom killing, we must fail if we're still - * under heavy pressure. - */ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, - order, zonelist, high_zoneidx, - ALLOC_WMARK_HIGH|ALLOC_CPUSET); - if (page) { - clear_zonelist_oom(zonelist, gfp_mask); - goto got_pg; - } - - /* The OOM killer will not help higher order allocs so fail */ - if (order > PAGE_ALLOC_COSTLY_ORDER) { - clear_zonelist_oom(zonelist, gfp_mask); - goto nopage; - } - - out_of_memory(zonelist, gfp_mask, order); - clear_zonelist_oom(zonelist, gfp_mask); - goto restart; } - /* - * Don't let big-order allocations loop unless the caller explicitly - * requests that. Wait for some write requests to complete then retry. - * - * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER - * means __GFP_NOFAIL, but that may not be true in other - * implementations. - * - * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is - * specified, then we retry until we no longer reclaim any pages - * (above), or we've reclaimed an order of pages at least as - * large as the allocation's order. In both cases, if the - * allocation still fails, we stop retrying. - */ + /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; - do_retry = 0; - if (!(gfp_mask & __GFP_NORETRY)) { - if (order <= PAGE_ALLOC_COSTLY_ORDER) { - do_retry = 1; - } else { - if (gfp_mask & __GFP_REPEAT && - pages_reclaimed < (1 << order)) - do_retry = 1; - } - if (gfp_mask & __GFP_NOFAIL) - do_retry = 1; - } - if (do_retry) { + if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { + /* Wait for some write requests to complete then retry */ congestion_wait(WRITE, HZ/50); goto rebalance; } @@ -1669,6 +1737,41 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, } got_pg: return page; + +} + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct page *page; + + lockdep_trace_alloc(gfp_mask); + + might_sleep_if(gfp_mask & __GFP_WAIT); + + if (should_fail_alloc_page(gfp_mask, order)) + return NULL; + + /* + * Check the zones suitable for the gfp_mask contain at least one + * valid zone. It's possible to have an empty zonelist as a result + * of GFP_THISNODE and a memoryless node + */ + if (unlikely(!zonelist->_zonerefs->zone)) + return NULL; + + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, + zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (unlikely(!page)) + page = __alloc_pages_slowpath(gfp_mask, order, + zonelist, high_zoneidx, nodemask); + + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask);