提交 b34a6b1d 编写于 作者: V Vasiliy Kulikov 提交者: Linus Torvalds

ipc: introduce shm_rmid_forced sysctl

Add support for the shm_rmid_forced sysctl.  If set to 1, all shared
memory objects in current ipc namespace will be automatically forced to
use IPC_RMID.

The POSIX way of handling shmem allows one to create shm objects and
call shmdt(), leaving shm object associated with no process, thus
consuming memory not counted via rlimits.

With shm_rmid_forced=1 the shared memory object is counted at least for
one process, so OOM killer may effectively kill the fat process holding
the shared memory.

It obviously breaks POSIX - some programs relying on the feature would
stop working.  So set shm_rmid_forced=1 only if you're sure nobody uses
"orphaned" memory.  Use shm_rmid_forced=0 by default for compatibility
reasons.

The feature was previously implemented in -ow as a configure option.

[akpm@linux-foundation.org: fix documentation, per Randy]
[akpm@linux-foundation.org: fix warning]
[akpm@linux-foundation.org: readability/conventionality tweaks]
[akpm@linux-foundation.org: fix shm_rmid_forced/shm_forced_rmid confusion, use standard comment layout]
Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Serge E. Hallyn" <serge.hallyn@canonical.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Solar Designer <solar@openwall.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
上级 d40dcdb0
...@@ -61,6 +61,7 @@ show up in /proc/sys/kernel: ...@@ -61,6 +61,7 @@ show up in /proc/sys/kernel:
- rtsig-nr - rtsig-nr
- sem - sem
- sg-big-buff [ generic SCSI device (sg) ] - sg-big-buff [ generic SCSI device (sg) ]
- shm_rmid_forced
- shmall - shmall
- shmmax [ sysv ipc ] - shmmax [ sysv ipc ]
- shmmni - shmmni
...@@ -518,6 +519,27 @@ kernel. This value defaults to SHMMAX. ...@@ -518,6 +519,27 @@ kernel. This value defaults to SHMMAX.
============================================================== ==============================================================
shm_rmid_forced:
Linux lets you set resource limits, including how much memory one
process can consume, via setrlimit(2). Unfortunately, shared memory
segments are allowed to exist without association with any process, and
thus might not be counted against any resource limits. If enabled,
shared memory segments are automatically destroyed when their attach
count becomes zero after a detach or a process termination. It will
also destroy segments that were created, but never attached to, on exit
from the process. The only use left for IPC_RMID is to immediately
destroy an unattached segment. Of course, this breaks the way things are
defined, so some applications might stop working. Note that this
feature will do you no good unless you also configure your resource
limits (in particular, RLIMIT_AS and RLIMIT_NPROC). Most systems don't
need this.
Note that if you change this from 0 to 1, already-created segments that
have no users and whose creating process has exited will be destroyed.
==============================================================
softlockup_thresh: softlockup_thresh:
This value can be used to lower the softlockup tolerance threshold. The This value can be used to lower the softlockup tolerance threshold. The
......
...@@ -44,6 +44,11 @@ struct ipc_namespace { ...@@ -44,6 +44,11 @@ struct ipc_namespace {
size_t shm_ctlall; size_t shm_ctlall;
int shm_ctlmni; int shm_ctlmni;
int shm_tot; int shm_tot;
/*
* Defines whether IPC_RMID is forced for _all_ shm segments regardless
* of shmctl()
*/
int shm_rmid_forced;
struct notifier_block ipcns_nb; struct notifier_block ipcns_nb;
...@@ -72,6 +77,7 @@ extern int register_ipcns_notifier(struct ipc_namespace *); ...@@ -72,6 +77,7 @@ extern int register_ipcns_notifier(struct ipc_namespace *);
extern int cond_register_ipcns_notifier(struct ipc_namespace *); extern int cond_register_ipcns_notifier(struct ipc_namespace *);
extern void unregister_ipcns_notifier(struct ipc_namespace *); extern void unregister_ipcns_notifier(struct ipc_namespace *);
extern int ipcns_notify(unsigned long); extern int ipcns_notify(unsigned long);
extern void shm_destroy_orphaned(struct ipc_namespace *ns);
#else /* CONFIG_SYSVIPC */ #else /* CONFIG_SYSVIPC */
static inline int register_ipcns_notifier(struct ipc_namespace *ns) static inline int register_ipcns_notifier(struct ipc_namespace *ns)
{ return 0; } { return 0; }
...@@ -79,6 +85,7 @@ static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns) ...@@ -79,6 +85,7 @@ static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns)
{ return 0; } { return 0; }
static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { } static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { }
static inline int ipcns_notify(unsigned long l) { return 0; } static inline int ipcns_notify(unsigned long l) { return 0; }
/* Built without CONFIG_SYSVIPC: there are no shm segments to sweep. */
static inline void shm_destroy_orphaned(struct ipc_namespace *ns)
{
}
#endif /* CONFIG_SYSVIPC */ #endif /* CONFIG_SYSVIPC */
#ifdef CONFIG_POSIX_MQUEUE #ifdef CONFIG_POSIX_MQUEUE
......
...@@ -106,6 +106,7 @@ struct shmid_kernel /* private to the kernel */ ...@@ -106,6 +106,7 @@ struct shmid_kernel /* private to the kernel */
#ifdef CONFIG_SYSVIPC #ifdef CONFIG_SYSVIPC
long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr); long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr);
extern int is_file_shm_hugepages(struct file *file); extern int is_file_shm_hugepages(struct file *file);
extern void exit_shm(struct task_struct *task);
#else #else
static inline long do_shmat(int shmid, char __user *shmaddr, static inline long do_shmat(int shmid, char __user *shmaddr,
int shmflg, unsigned long *addr) int shmflg, unsigned long *addr)
...@@ -116,6 +117,9 @@ static inline int is_file_shm_hugepages(struct file *file) ...@@ -116,6 +117,9 @@ static inline int is_file_shm_hugepages(struct file *file)
{ {
return 0; return 0;
} }
/* Built without CONFIG_SYSVIPC: no shm cleanup is needed at task exit. */
static inline void exit_shm(struct task_struct *task) { }
#endif #endif
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
......
...@@ -31,12 +31,37 @@ static int proc_ipc_dointvec(ctl_table *table, int write, ...@@ -31,12 +31,37 @@ static int proc_ipc_dointvec(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos) void __user *buffer, size_t *lenp, loff_t *ppos)
{ {
struct ctl_table ipc_table; struct ctl_table ipc_table;
memcpy(&ipc_table, table, sizeof(ipc_table)); memcpy(&ipc_table, table, sizeof(ipc_table));
ipc_table.data = get_ipc(table); ipc_table.data = get_ipc(table);
return proc_dointvec(&ipc_table, write, buffer, lenp, ppos); return proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
} }
/*
 * Integer sysctl handler for values stored in an IPC namespace.
 *
 * The handler never touches @table directly: it works on a stack copy whose
 * .data pointer is replaced by whatever get_ipc() returns for this entry,
 * and then hands that copy to proc_dointvec_minmax().  The helper's return
 * value is passed back unchanged.
 */
static int proc_ipc_dointvec_minmax(ctl_table *table, int write,
	void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table ns_table;

	memcpy(&ns_table, table, sizeof(ns_table));
	ns_table.data = get_ipc(table);

	return proc_dointvec_minmax(&ns_table, write, buffer, lenp, ppos);
}
/*
 * sysctl handler that additionally sweeps orphaned shm segments.
 *
 * The generic handler proc_ipc_dointvec_minmax() runs first.  A negative
 * result is returned as-is and nothing else happens.  On success, if the
 * calling task's IPC namespace has shm_rmid_forced set,
 * shm_destroy_orphaned() is run on that namespace before the handler's
 * result is returned.
 */
static int proc_ipc_dointvec_minmax_orphans(ctl_table *table, int write,
	void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
	int ret;

	ret = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret >= 0 && ns->shm_rmid_forced)
		shm_destroy_orphaned(ns);

	return ret;
}
static int proc_ipc_callback_dointvec(ctl_table *table, int write, static int proc_ipc_callback_dointvec(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos) void __user *buffer, size_t *lenp, loff_t *ppos)
{ {
...@@ -125,6 +150,8 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, ...@@ -125,6 +150,8 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
#else #else
#define proc_ipc_doulongvec_minmax NULL #define proc_ipc_doulongvec_minmax NULL
#define proc_ipc_dointvec NULL #define proc_ipc_dointvec NULL
#define proc_ipc_dointvec_minmax NULL
#define proc_ipc_dointvec_minmax_orphans NULL
#define proc_ipc_callback_dointvec NULL #define proc_ipc_callback_dointvec NULL
#define proc_ipcauto_dointvec_minmax NULL #define proc_ipcauto_dointvec_minmax NULL
#endif #endif
...@@ -154,6 +181,15 @@ static struct ctl_table ipc_kern_table[] = { ...@@ -154,6 +181,15 @@ static struct ctl_table ipc_kern_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_ipc_dointvec, .proc_handler = proc_ipc_dointvec,
}, },
{
.procname = "shm_rmid_forced",
.data = &init_ipc_ns.shm_rmid_forced,
.maxlen = sizeof(init_ipc_ns.shm_rmid_forced),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax_orphans,
.extra1 = &zero,
.extra2 = &one,
},
{ {
.procname = "msgmax", .procname = "msgmax",
.data = &init_ipc_ns.msg_ctlmax, .data = &init_ipc_ns.msg_ctlmax,
......
...@@ -74,6 +74,7 @@ void shm_init_ns(struct ipc_namespace *ns) ...@@ -74,6 +74,7 @@ void shm_init_ns(struct ipc_namespace *ns)
ns->shm_ctlmax = SHMMAX; ns->shm_ctlmax = SHMMAX;
ns->shm_ctlall = SHMALL; ns->shm_ctlall = SHMALL;
ns->shm_ctlmni = SHMMNI; ns->shm_ctlmni = SHMMNI;
ns->shm_rmid_forced = 0;
ns->shm_tot = 0; ns->shm_tot = 0;
ipc_init_ids(&shm_ids(ns)); ipc_init_ids(&shm_ids(ns));
} }
...@@ -186,6 +187,23 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) ...@@ -186,6 +187,23 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
ipc_rcu_putref(shp); ipc_rcu_putref(shp);
} }
/*
 * shm_may_destroy - decide whether a shm segment should be torn down now
 *
 * A segment that still has attachments (shm_nattch != 0) is never eligible.
 * Once the attach count is zero the segment is eligible when either:
 *
 * 1) sysctl kernel.shm_rmid_forced is set to 1 for the namespace, or
 *
 * 2) the segment is marked SHM_DEST, i.e. shmctl(id, IPC_RMID, NULL) was
 *    called for this shp.
 */
static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
	if (shp->shm_nattch != 0)
		return false;

	if (ns->shm_rmid_forced)
		return true;

	return (shp->shm_perm.mode & SHM_DEST) != 0;
}
/* /*
* remove the attach descriptor vma. * remove the attach descriptor vma.
* free memory for segment if it is marked destroyed. * free memory for segment if it is marked destroyed.
...@@ -206,11 +224,83 @@ static void shm_close(struct vm_area_struct *vma) ...@@ -206,11 +224,83 @@ static void shm_close(struct vm_area_struct *vma)
shp->shm_lprid = task_tgid_vnr(current); shp->shm_lprid = task_tgid_vnr(current);
shp->shm_dtim = get_seconds(); shp->shm_dtim = get_seconds();
shp->shm_nattch--; shp->shm_nattch--;
if(shp->shm_nattch == 0 && if (shm_may_destroy(ns, shp))
shp->shm_perm.mode & SHM_DEST) shm_destroy(ns, shp);
else
shm_unlock(shp);
up_write(&shm_ids(ns).rw_mutex);
}
/*
 * idr_for_each() callback used by exit_shm().
 *
 * @id:   id of the segment being visited
 * @p:    unused
 * @data: the struct ipc_namespace being walked
 *
 * Locks segment @id and destroys it only if its recorded creator pid
 * (shm_cprid) equals the current task's tgid and shm_may_destroy() agrees.
 * On every other path the segment is simply unlocked again.  A segment that
 * cannot be locked is skipped.
 *
 * Always returns 0 (presumably so the idr walk continues - confirm against
 * the idr_for_each() contract).
 */
static int shm_try_destroy_current(int id, void *p, void *data)
{
	struct ipc_namespace *ns = data;
	struct shmid_kernel *shp;

	shp = shm_lock(ns, id);
	if (IS_ERR(shp))
		return 0;

	if (shp->shm_cprid == task_tgid_vnr(current) &&
	    shm_may_destroy(ns, shp))
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);

	return 0;
}
/*
 * idr_for_each() callback used by shm_destroy_orphaned().
 *
 * @id:   id of the segment being visited
 * @p:    unused
 * @data: the struct ipc_namespace being walked
 *
 * Locks segment @id.  If a task with the segment's creator pid can still be
 * found, the segment is left alone.  Otherwise it is destroyed when
 * shm_may_destroy() allows it.  Each path performs exactly one of
 * shm_destroy() or shm_unlock() on the locked segment.
 *
 * Always returns 0.
 */
static int shm_try_destroy_orphaned(int id, void *p, void *data)
{
	struct ipc_namespace *ns = data;
	struct shmid_kernel *shp = shm_lock(ns, id);
	struct task_struct *task;

	if (IS_ERR(shp))
		return 0;

	/*
	 * We want to destroy segments without users and with already
	 * exit'ed originating process.
	 *
	 * XXX: the originating process may exist in another pid namespace.
	 */
	task = find_task_by_vpid(shp->shm_cprid);
	if (task != NULL) {
		shm_unlock(shp);
		return 0;
	}

	if (shm_may_destroy(ns, shp))
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);
	return 0;
}
/*
 * Walk every shm id in @ns and give shm_try_destroy_orphaned() a chance to
 * remove it.  The namespace's shm rw_mutex is held for writing for the
 * whole walk.
 */
void shm_destroy_orphaned(struct ipc_namespace *ns)
{
	down_write(&shm_ids(ns).rw_mutex);

	idr_for_each(&shm_ids(ns).ipcs_idr, shm_try_destroy_orphaned, ns);

	up_write(&shm_ids(ns).rw_mutex);
}
/*
 * exit_shm - shm cleanup hook run from do_exit()
 * @task: the exiting task
 *
 * Does nothing unless @task still has an nsproxy with an IPC namespace and
 * that namespace has shm_rmid_forced set.  In that case every shm id in the
 * namespace is offered to shm_try_destroy_current(), which removes segments
 * created by the current task that have no attachments.
 *
 * The namespace's shm rw_mutex is taken for writing around the walk and
 * released exactly once afterwards.
 */
void exit_shm(struct task_struct *task)
{
	struct nsproxy *nsp = task->nsproxy;
	struct ipc_namespace *ns;

	if (!nsp)
		return;
	ns = nsp->ipc_ns;
	if (!ns || !ns->shm_rmid_forced)
		return;

	/* Destroy segments this task created that have no attachments */
	down_write(&shm_ids(ns).rw_mutex);
	idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
	up_write(&shm_ids(ns).rw_mutex);
}
...@@ -950,8 +1040,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) ...@@ -950,8 +1040,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
shp = shm_lock(ns, shmid); shp = shm_lock(ns, shmid);
BUG_ON(IS_ERR(shp)); BUG_ON(IS_ERR(shp));
shp->shm_nattch--; shp->shm_nattch--;
if(shp->shm_nattch == 0 && if (shm_may_destroy(ns, shp))
shp->shm_perm.mode & SHM_DEST)
shm_destroy(ns, shp); shm_destroy(ns, shp);
else else
shm_unlock(shp); shm_unlock(shp);
......
...@@ -980,6 +980,7 @@ NORET_TYPE void do_exit(long code) ...@@ -980,6 +980,7 @@ NORET_TYPE void do_exit(long code)
trace_sched_process_exit(tsk); trace_sched_process_exit(tsk);
exit_sem(tsk); exit_sem(tsk);
exit_shm(tsk);
exit_files(tsk); exit_files(tsk);
exit_fs(tsk); exit_fs(tsk);
check_stack_usage(); check_stack_usage();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册