diff --git a/include/linux/sched.h b/include/linux/sched.h index b0bf326143a9500a1b5705822f0a7bdba7167785..1301c087537056949c225b1ad4a288bd90e360b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -25,6 +25,7 @@ #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ #define CLONE_NEWIPC 0x08000000 /* New ipcs */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#define CLONE_NEWPID 0x20000000 /* New pid namespace */ #define CLONE_NEWNET 0x40000000 /* New network namespace */ /* diff --git a/kernel/fork.c b/kernel/fork.c index bab34192799bf0dd04be9bbce60b1b7a86eb1643..f252784f93308077d483bca888593ec7fad36774 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -973,7 +973,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, - int __user *parent_tidptr, int __user *child_tidptr, struct pid *pid) { @@ -1043,11 +1042,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); - retval = -EFAULT; - if (clone_flags & CLONE_PARENT_SETTID) - if (put_user(p->pid, parent_tidptr)) - goto bad_fork_cleanup_delays_binfmt; - INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); p->vfork_done = NULL; @@ -1289,11 +1283,22 @@ static struct task_struct *copy_process(unsigned long clone_flags, __ptrace_link(p, current->parent); if (thread_group_leader(p)) { - p->signal->tty = current->signal->tty; - p->signal->pgrp = task_pgrp_nr(current); - set_task_session(p, task_session_nr(current)); - attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); - attach_pid(p, PIDTYPE_SID, task_session(current)); + if (clone_flags & CLONE_NEWPID) { + p->nsproxy->pid_ns->child_reaper = p; + p->signal->tty = NULL; + p->signal->pgrp = p->pid; + set_task_session(p, p->pid); + attach_pid(p, PIDTYPE_PGID, pid); + attach_pid(p, PIDTYPE_SID, pid); + } else { + p->signal->tty = current->signal->tty; + p->signal->pgrp = task_pgrp_nr(current); + set_task_session(p, task_session_nr(current)); + attach_pid(p, PIDTYPE_PGID, + task_pgrp(current)); + attach_pid(p, PIDTYPE_SID, + task_session(current)); + } list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; @@ -1339,7 +1344,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); -bad_fork_cleanup_delays_binfmt: delayacct_tsk_free(p); if (p->binfmt) module_put(p->binfmt->module); @@ -1366,7 +1370,7 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct task_struct *task; struct pt_regs regs; - task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, + task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, &init_struct_pid); if (!IS_ERR(task)) init_idle(task, cpu); @@ -1414,7 +1418,7 @@ long do_fork(unsigned long clone_flags, } p = copy_process(clone_flags, stack_start, regs, stack_size, - parent_tidptr, child_tidptr, NULL); + child_tidptr, NULL); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1422,7 +1426,16 @@ long do_fork(unsigned long clone_flags, if (!IS_ERR(p)) { struct completion vfork; - nr = pid_nr(task_pid(p)); + /* + * this is enough to call pid_nr_ns here, but this if + * improves optimisation of regular fork() + */ + nr = (clone_flags & CLONE_NEWPID) ? + task_pid_nr_ns(p, current->nsproxy->pid_ns) : + task_pid_vnr(p); + + if (clone_flags & CLONE_PARENT_SETTID) + put_user(nr, parent_tidptr); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index c8ef7c2992ed2e32689f09bcac40104805d3b7fe..79f871bc0ef40026e111df575622cbc6b11031c3 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -129,7 +129,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) get_nsproxy(old_ns); - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) { diff --git a/kernel/pid.c b/kernel/pid.c index 4b17acdb862f5b1e917b492bc9118d895a27c0a7..f76097c60475506646c54e375e91916e00d09ca3 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -18,6 +18,12 @@ * allocation scenario when all but one out of 1 million PIDs possible are * allocated already: the scanning of 32 list entries and at most PAGE_SIZE * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). + * + * Pid namespaces: + * (C) 2007 Pavel Emelyanov , OpenVZ, SWsoft Inc. + * (C) 2007 Sukadev Bhattiprolu , IBM + * Many thanks to Oleg Nesterov for comments and help + * */ #include @@ -456,8 +462,8 @@ static struct kmem_cache *create_pid_cachep(int nr_ids) snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); cachep = kmem_cache_create(pcache->name, - /* FIXME add numerical ids here */ - sizeof(struct pid), 0, SLAB_HWCACHE_ALIGN, NULL); + sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), + 0, SLAB_HWCACHE_ALIGN, NULL); if (cachep == NULL) goto err_cachep; @@ -475,19 +481,89 @@ static struct kmem_cache *create_pid_cachep(int nr_ids) return NULL; } +static struct pid_namespace *create_pid_namespace(int level) +{ + struct pid_namespace *ns; + int i; + + ns = kmalloc(sizeof(struct pid_namespace), GFP_KERNEL); + if (ns == NULL) + goto out; + + ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!ns->pidmap[0].page) + goto out_free; + + ns->pid_cachep = create_pid_cachep(level + 1); + if (ns->pid_cachep == NULL) + goto out_free_map; + + kref_init(&ns->kref); + ns->last_pid = 0; + ns->child_reaper = NULL; + ns->level = level; + + set_bit(0, ns->pidmap[0].page); + atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); + + for (i = 1; i < PIDMAP_ENTRIES; i++) { + ns->pidmap[i].page = 0; + atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); + } + + return ns; + +out_free_map: + kfree(ns->pidmap[0].page); +out_free: + kfree(ns); +out: + return ERR_PTR(-ENOMEM); +} + +static void destroy_pid_namespace(struct pid_namespace *ns) +{ + int i; + + for (i = 0; i < PIDMAP_ENTRIES; i++) + kfree(ns->pidmap[i].page); + kfree(ns); +} + struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) { + struct pid_namespace *new_ns; + BUG_ON(!old_ns); - get_pid_ns(old_ns); - return old_ns; + new_ns = get_pid_ns(old_ns); + if (!(flags & CLONE_NEWPID)) + goto out; + + new_ns = ERR_PTR(-EINVAL); + if (flags & CLONE_THREAD) + goto out_put; + + new_ns = create_pid_namespace(old_ns->level + 1); + if (!IS_ERR(new_ns)) + new_ns->parent = get_pid_ns(old_ns); + +out_put: + put_pid_ns(old_ns); +out: + return new_ns; } void free_pid_ns(struct kref *kref) { - struct pid_namespace *ns; + struct pid_namespace *ns, *parent; ns = container_of(kref, struct pid_namespace, kref); - kfree(ns); + + parent = ns->parent; + destroy_pid_namespace(ns); + + if (parent != NULL) + put_pid_ns(parent); } /*