提交 4eeab4f5 编写于 作者: A Andrew Shewmaker 提交者: Linus Torvalds

mm: replace hardcoded 3% with admin_reserve_kbytes knob

Add an admin_reserve_kbytes knob to allow admins to change the hardcoded
memory reserve to something other than 3%, which may be multiple
gigabytes on large memory systems.  Only about 8MB is necessary to
enable recovery in the default mode, and only a few hundred MB are
required even when overcommit is disabled.

This affects OVERCOMMIT_GUESS and OVERCOMMIT_NEVER.

admin_reserve_kbytes is initialized to min(3% of free pages, 8MB).

I arrived at 8MB by summing the RSS of sshd or login, bash, and top.

Please see first patch in this series for full background, motivation,
testing, and full changelog.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: make init_admin_reserve() static]
Signed-off-by: Andrew Shewmaker <agshew@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
上级 c9b1d098
...@@ -18,6 +18,7 @@ files can be found in mm/swap.c. ...@@ -18,6 +18,7 @@ files can be found in mm/swap.c.
Currently, these files are in /proc/sys/vm: Currently, these files are in /proc/sys/vm:
- admin_reserve_kbytes
- block_dump - block_dump
- compact_memory - compact_memory
- dirty_background_bytes - dirty_background_bytes
...@@ -59,6 +60,35 @@ Currently, these files are in /proc/sys/vm: ...@@ -59,6 +60,35 @@ Currently, these files are in /proc/sys/vm:
============================================================== ==============================================================
admin_reserve_kbytes
The amount of free memory in the system that should be reserved for users
with the capability cap_sys_admin.
admin_reserve_kbytes defaults to min(3% of free pages, 8MB).
That should provide enough for the admin to log in and kill a process,
if necessary, under the default overcommit 'guess' mode.
Systems running under overcommit 'never' should increase this to account
for the full Virtual Memory Size of programs used to recover. Otherwise,
root may not be able to log in to recover the system.
How do you calculate a minimum useful reserve?
sshd or login + bash (or some other shell) + top (or ps, kill, etc.)
For overcommit 'guess', we can sum resident set sizes (RSS).
On x86_64 this is about 8MB.
For overcommit 'never', we can take the max of their virtual sizes (VSZ)
and add the sum of their RSS.
On x86_64 this is about 128MB.
Changing this takes effect whenever an application requests memory.
==============================================================
block_dump block_dump
block_dump enables block I/O debugging when set to a nonzero value. More block_dump enables block I/O debugging when set to a nonzero value. More
......
...@@ -45,6 +45,7 @@ extern int sysctl_legacy_va_layout; ...@@ -45,6 +45,7 @@ extern int sysctl_legacy_va_layout;
#include <asm/processor.h> #include <asm/processor.h>
extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
......
...@@ -1436,6 +1436,13 @@ static struct ctl_table vm_table[] = { ...@@ -1436,6 +1436,13 @@ static struct ctl_table vm_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_doulongvec_minmax, .proc_handler = proc_doulongvec_minmax,
}, },
{
.procname = "admin_reserve_kbytes",
.data = &sysctl_admin_reserve_kbytes,
.maxlen = sizeof(sysctl_admin_reserve_kbytes),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
{ } { }
}; };
......
...@@ -85,6 +85,7 @@ int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic ove ...@@ -85,6 +85,7 @@ int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic ove
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
/* /*
* Make sure vm_committed_as in one cacheline and not cacheline shared with * Make sure vm_committed_as in one cacheline and not cacheline shared with
* other variables. It can be updated by several CPUs frequently. * other variables. It can be updated by several CPUs frequently.
...@@ -164,10 +165,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) ...@@ -164,10 +165,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free -= totalreserve_pages; free -= totalreserve_pages;
/* /*
* Leave the last 3% for root * Reserve some for root
*/ */
if (!cap_sys_admin) if (!cap_sys_admin)
free -= free / 32; free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages) if (free > pages)
return 0; return 0;
...@@ -178,10 +179,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) ...@@ -178,10 +179,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
allowed = (totalram_pages - hugetlb_total_pages()) allowed = (totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100; * sysctl_overcommit_ratio / 100;
/* /*
* Leave the last 3% for root * Reserve some for root
*/ */
if (!cap_sys_admin) if (!cap_sys_admin)
allowed -= allowed / 32; allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed += total_swap_pages; allowed += total_swap_pages;
/* /*
...@@ -3119,3 +3120,24 @@ static int __meminit init_user_reserve(void) ...@@ -3119,3 +3120,24 @@ static int __meminit init_user_reserve(void)
return 0; return 0;
} }
module_init(init_user_reserve) module_init(init_user_reserve)
/*
 * Set the boot-time default for sysctl_admin_reserve_kbytes.
 *
 * The admin reserve exists so that the sys admin can still log in and
 * kill a memory hogging process.
 *
 * The default is 1/32 (about 3%) of the memory that is free when this
 * runs, capped at 8MB (1UL << 13 kbytes).  The cap is reached once at
 * least 256MB is free; 8MB is enough to recover with sshd, bash, and
 * top in OVERCOMMIT_GUESS.  With less free memory the reserve stays at
 * the 3% figure.
 *
 * Always returns 0.
 */
static int __meminit init_admin_reserve(void)
{
	unsigned long free_kb, reserve_kb;

	/* Pages -> kbytes: a page is 2^(PAGE_SHIFT - 10) kbytes. */
	free_kb = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);

	/* About 3% of free memory, but never more than 8MB. */
	reserve_kb = free_kb / 32;
	if (reserve_kb > (1UL << 13))
		reserve_kb = 1UL << 13;

	sysctl_admin_reserve_kbytes = reserve_kb;
	return 0;
}
module_init(init_admin_reserve)
...@@ -64,6 +64,7 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ ...@@ -64,6 +64,7 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int heap_stack_gap = 0; int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated; atomic_long_t mmap_pages_allocated;
...@@ -1939,10 +1940,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) ...@@ -1939,10 +1940,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free -= totalreserve_pages; free -= totalreserve_pages;
/* /*
* Leave the last 3% for root * Reserve some for root
*/ */
if (!cap_sys_admin) if (!cap_sys_admin)
free -= free / 32; free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages) if (free > pages)
return 0; return 0;
...@@ -1952,10 +1953,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) ...@@ -1952,10 +1953,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
allowed = totalram_pages * sysctl_overcommit_ratio / 100; allowed = totalram_pages * sysctl_overcommit_ratio / 100;
/* /*
* Leave the last 3% for root * Reserve some for root
*/ */
if (!cap_sys_admin) if (!cap_sys_admin)
allowed -= allowed / 32; allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed += total_swap_pages; allowed += total_swap_pages;
/* /*
...@@ -2147,3 +2148,24 @@ static int __meminit init_user_reserve(void) ...@@ -2147,3 +2148,24 @@ static int __meminit init_user_reserve(void)
return 0; return 0;
} }
module_init(init_user_reserve) module_init(init_user_reserve)
/*
 * Set the boot-time default for sysctl_admin_reserve_kbytes.
 *
 * The admin reserve exists so that the sys admin can still log in and
 * kill a memory hogging process.
 *
 * The default is 1/32 (about 3%) of the memory that is free when this
 * runs, capped at 8MB (1UL << 13 kbytes).  The cap is reached once at
 * least 256MB is free; 8MB is enough to recover with sshd, bash, and
 * top in OVERCOMMIT_GUESS.  With less free memory the reserve stays at
 * the 3% figure.
 *
 * Always returns 0.
 */
static int __meminit init_admin_reserve(void)
{
	unsigned long avail_kbytes;
	unsigned long reserve_kbytes;

	/* Pages -> kbytes: a page is 2^(PAGE_SHIFT - 10) kbytes. */
	avail_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);

	/* Start from about 3% of free memory, then clamp to the 8MB cap. */
	reserve_kbytes = avail_kbytes / 32;
	if (reserve_kbytes > (1UL << 13))
		reserve_kbytes = 1UL << 13;

	sysctl_admin_reserve_kbytes = reserve_kbytes;
	return 0;
}
module_init(init_admin_reserve)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册