diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 7cee90223d3a0624296b42d983d786b25462dc56..20d0d797f539ce51d18f903aa4aec942ead16643 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm: - drop-caches - zone_reclaim_mode - min_unmapped_ratio +- min_slab_ratio - panic_on_oom ============================================================== @@ -138,7 +139,6 @@ This is value ORed together of 1 = Zone reclaim on 2 = Zone reclaim writes dirty pages out 4 = Zone reclaim swaps pages -8 = Also do a global slab reclaim pass zone_reclaim_mode is set during bootup to 1 if it is determined that pages from remote zones will cause a measurable performance reduction. The @@ -162,18 +162,13 @@ Allowing regular swap effectively restricts allocations to the local node unless explicitly overridden by memory policies or cpuset configurations. -It may be advisable to allow slab reclaim if the system makes heavy -use of files and builds up large slab caches. However, the slab -shrink operation is global, may take a long time and free slabs -in all nodes of the system. - ============================================================= min_unmapped_ratio: This is available only on NUMA kernels. -A percentage of the file backed pages in each zone. Zone reclaim will only +A percentage of the total pages in each zone. Zone reclaim will only occur if more than this percentage of pages are file backed and unmapped. This is to insure that a minimal amount of local pages is still available for file I/O even if the node is overallocated. @@ -182,6 +177,24 @@ The default is 1 percent. ============================================================= +min_slab_ratio: + +This is available only on NUMA kernels. + +A percentage of the total pages in each zone. On Zone reclaim +(fallback from the local zone occurs) slabs will be reclaimed if more +than this percentage of pages in a zone are reclaimable slab pages. +This insures that the slab growth stays under control even in NUMA +systems that rarely perform global reclaim. + +The default is 5 percent. + +Note that slab reclaim is triggered in a per zone / node fashion. +The process of reclaiming slab memory is currently not node specific +and may not be fast. + +============================================================= + panic_on_oom This enables or disables panic on out-of-memory feature. If this is set to 1, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 08c41b9f92e0e8dc54621e1838be20564cd3c7ab..3693f1a5278839c23243691564a5085af0617313 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -171,6 +171,7 @@ struct zone { * zone reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; + unsigned long min_slab_pages; struct per_cpu_pageset *pageset[NR_CPUS]; #else struct per_cpu_pageset pageset[NR_CPUS]; @@ -448,6 +449,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file void __user *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, + struct file *, void __user *, size_t *, loff_t *); #include /* Returns the number of the current Node. */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 32db06c8ffe0801fe894afc10c9a8ba0200faee3..a2f5ad7c2d2e2ec700dd3d8b4e7dfac57f9e1136 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -193,6 +193,7 @@ extern long vm_total_pages; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; extern int sysctl_min_unmapped_ratio; +extern int sysctl_min_slab_ratio; extern int zone_reclaim(struct zone *, gfp_t, unsigned int); #else #define zone_reclaim_mode 0 diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 736ed917a4f8df2af5d25e1e8cc567cada1d863d..eca555781d05e1d67e62d070202ced6ac6bcfb84 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -191,6 +191,7 @@ enum VM_MIN_UNMAPPED=32, /* Set min percent of unmapped pages */ VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ VM_VDSO_ENABLED=34, /* map VDSO into new processes? */ + VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 362a0cc371381cceab3f4a77b03430a604ecdb71..fd43c3e6786b5b19c8ff173b45638c56392c7d86 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -943,6 +943,17 @@ static ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = VM_MIN_SLAB, + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #endif #ifdef CONFIG_X86_32 { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 47e98423b30dd6da8dea3e1bf03e8d7e3bf85cf0..cf913bdd433e29b3b371bd97383f2c3c152248ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2005,6 +2005,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, #ifdef CONFIG_NUMA zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) / 100; + zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; spin_lock_init(&zone->lock); @@ -2318,6 +2319,22 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, sysctl_min_unmapped_ratio) / 100; return 0; } + +int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int rc; + + rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (rc) + return rc; + + for_each_zone(zone) + zone->min_slab_pages = (zone->present_pages * + sysctl_min_slab_ratio) / 100; + return 0; +} #endif /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 349797ba4bac37fa476c5f33af8212afd33f3b3b..089e943c4d38f56c75309386dde288ab35eed7e8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1527,7 +1527,6 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ -#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ /* * Priority for ZONE_RECLAIM. This determines the fraction of pages @@ -1542,6 +1541,12 @@ int zone_reclaim_mode __read_mostly; */ int sysctl_min_unmapped_ratio = 1; +/* + * If the number of slab pages in a zone grows beyond this percentage then + * slab reclaim needs to occur. + */ +int sysctl_min_slab_ratio = 5; + /* * Try to free up some pages from this zone through reclaim. */ @@ -1573,29 +1578,37 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - /* - * Free memory by calling shrink zone with increasing priorities - * until we have enough memory freed. - */ - priority = ZONE_RECLAIM_PRIORITY; - do { - nr_reclaimed += shrink_zone(priority, zone, &sc); - priority--; - } while (priority >= 0 && nr_reclaimed < nr_pages); + if (zone_page_state(zone, NR_FILE_PAGES) - + zone_page_state(zone, NR_FILE_MAPPED) > + zone->min_unmapped_pages) { + /* + * Free memory by calling shrink zone with increasing + * priorities until we have enough memory freed. + */ + priority = ZONE_RECLAIM_PRIORITY; + do { + nr_reclaimed += shrink_zone(priority, zone, &sc); + priority--; + } while (priority >= 0 && nr_reclaimed < nr_pages); + } - if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + if (zone_page_state(zone, NR_SLAB_RECLAIMABLE) > zone->min_slab_pages) { /* * shrink_slab() does not currently allow us to determine how - * many pages were freed in this zone. So we just shake the slab - * a bit and then go off node for this particular allocation - * despite possibly having freed enough memory to allocate in - * this zone. If we freed local memory then the next - * allocations will be local again. + * many pages were freed in this zone. So we take the current + * number of slab pages and shake the slab until it is reduced + * by the same nr_pages that we used for reclaiming unmapped + * pages. * - * shrink_slab will free memory on all zones and may take - * a long time. + * Note that shrink_slab will free memory on all zones and may + * take a long time. */ - shrink_slab(sc.nr_scanned, gfp_mask, order); + unsigned long limit = zone_page_state(zone, + NR_SLAB_RECLAIMABLE) - nr_pages; + + while (shrink_slab(sc.nr_scanned, gfp_mask, order) && + zone_page_state(zone, NR_SLAB_RECLAIMABLE) > limit) + ; } p->reclaim_state = NULL; @@ -1609,7 +1622,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int node_id; /* - * Zone reclaim reclaims unmapped file backed pages. + * Zone reclaim reclaims unmapped file backed pages and + * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately @@ -1618,7 +1632,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * unmapped file backed pages. */ if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages) + zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages + && zone_page_state(zone, NR_SLAB_RECLAIMABLE) + <= zone->min_slab_pages) return 0; /*