/*
 * Pid namespaces
 *
 * Authors:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/pid.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/syscalls.h>
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
#include <linux/proc_ns.h>
#include <linux/reboot.h>
#include <linux/export.h>

/*
 * One kmem cache per distinct nesting depth: a pid at level N carries
 * N+1 struct upid entries, so its allocation size depends on the level.
 * Caches are created on demand and kept on a global list for reuse.
 */
struct pid_cache {
	int nr_ids;			/* number of upid entries per pid */
	char name[16];			/* cache name, "pid_<nr_ids>" */
	struct kmem_cache *cachep;	/* the slab cache itself */
	struct list_head list;		/* link on pid_caches_lh */
};

static LIST_HEAD(pid_caches_lh);	/* all pid_cache instances */
static DEFINE_MUTEX(pid_caches_mutex);	/* guards pid_caches_lh */
static struct kmem_cache *pid_ns_cachep;	/* cache for struct pid_namespace */

/*
 * creates the kmem cache to allocate pids from.
 * @nr_ids: the number of numerical ids this pid will have to carry
 */

static struct kmem_cache *create_pid_cachep(int nr_ids)
{
	struct pid_cache *pcache;
	struct kmem_cache *cachep;

	mutex_lock(&pid_caches_mutex);
	list_for_each_entry(pcache, &pid_caches_lh, list)
		if (pcache->nr_ids == nr_ids)
			goto out;

	pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
	if (pcache == NULL)
		goto err_alloc;

	snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
	cachep = kmem_cache_create(pcache->name,
			sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
			0, SLAB_HWCACHE_ALIGN, NULL);
	if (cachep == NULL)
		goto err_cachep;

	pcache->nr_ids = nr_ids;
	pcache->cachep = cachep;
	list_add(&pcache->list, &pid_caches_lh);
out:
	mutex_unlock(&pid_caches_mutex);
	return pcache->cachep;

err_cachep:
	kfree(pcache);
err_alloc:
	mutex_unlock(&pid_caches_mutex);
	return NULL;
}

73 74 75 76 77 78
static void proc_cleanup_work(struct work_struct *work)
{
	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
	pid_ns_release_proc(ns);
}

/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32

/*
 * Charge one pid namespace against the creating user's
 * UCOUNT_PID_NAMESPACES limit. Returns NULL when the limit is hit.
 */
static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
{
	return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
}

/* Undo inc_pid_namespaces(). */
static void dec_pid_namespaces(struct ucounts *ucounts)
{
	dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
}

92 93
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
	struct pid_namespace *parent_pid_ns)
94 95
{
	struct pid_namespace *ns;
96
	unsigned int level = parent_pid_ns->level + 1;
97
	struct ucounts *ucounts;
98 99 100
	int i;
	int err;

101
	err = -ENOSPC;
102 103 104 105
	if (level > MAX_PID_NS_LEVEL)
		goto out;
	ucounts = inc_pid_namespaces(user_ns);
	if (!ucounts)
106
		goto out;
107

108
	err = -ENOMEM;
109
	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
110
	if (ns == NULL)
111
		goto out_dec;
112 113 114 115 116 117 118 119 120

	ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!ns->pidmap[0].page)
		goto out_free;

	ns->pid_cachep = create_pid_cachep(level + 1);
	if (ns->pid_cachep == NULL)
		goto out_free_map;

A
Al Viro 已提交
121
	err = ns_alloc_inum(&ns->ns);
122 123
	if (err)
		goto out_free_map;
124
	ns->ns.ops = &pidns_operations;
125

126 127
	kref_init(&ns->kref);
	ns->level = level;
128
	ns->parent = get_pid_ns(parent_pid_ns);
129
	ns->user_ns = get_user_ns(user_ns);
130
	ns->ucounts = ucounts;
131
	ns->nr_hashed = PIDNS_HASH_ADDING;
132
	INIT_WORK(&ns->proc_work, proc_cleanup_work);
133 134 135 136

	set_bit(0, ns->pidmap[0].page);
	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);

137
	for (i = 1; i < PIDMAP_ENTRIES; i++)
138 139 140 141 142 143 144 145
		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);

	return ns;

out_free_map:
	kfree(ns->pidmap[0].page);
out_free:
	kmem_cache_free(pid_ns_cachep, ns);
146 147
out_dec:
	dec_pid_namespaces(ucounts);
148
out:
149
	return ERR_PTR(err);
150 151
}

152 153
static void delayed_free_pidns(struct rcu_head *p)
{
154 155 156 157 158 159
	struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu);

	dec_pid_namespaces(ns->ucounts);
	put_user_ns(ns->user_ns);

	kmem_cache_free(pid_ns_cachep, ns);
160 161
}

162 163 164 165
static void destroy_pid_namespace(struct pid_namespace *ns)
{
	int i;

A
Al Viro 已提交
166
	ns_free_inum(&ns->ns);
167 168
	for (i = 0; i < PIDMAP_ENTRIES; i++)
		kfree(ns->pidmap[i].page);
169
	call_rcu(&ns->rcu, delayed_free_pidns);
170 171
}

172 173
struct pid_namespace *copy_pid_ns(unsigned long flags,
	struct user_namespace *user_ns, struct pid_namespace *old_ns)
174 175
{
	if (!(flags & CLONE_NEWPID))
A
Alexey Dobriyan 已提交
176
		return get_pid_ns(old_ns);
177 178
	if (task_active_pid_ns(current) != old_ns)
		return ERR_PTR(-EINVAL);
179
	return create_pid_namespace(user_ns, old_ns);
180 181
}

182
static void free_pid_ns(struct kref *kref)
183
{
184
	struct pid_namespace *ns;
185 186 187

	ns = container_of(kref, struct pid_namespace, kref);
	destroy_pid_namespace(ns);
188
}
189

190 191 192 193 194 195 196 197 198 199
void put_pid_ns(struct pid_namespace *ns)
{
	struct pid_namespace *parent;

	while (ns != &init_pid_ns) {
		parent = ns->parent;
		if (!kref_put(&ns->kref, free_pid_ns))
			break;
		ns = parent;
	}
200
}
201
EXPORT_SYMBOL_GPL(put_pid_ns);
202 203 204 205 206

void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
	int nr;
	int rc;
207
	struct task_struct *task, *me = current;
208
	int init_pids = thread_group_leader(me) ? 1 : 2;
209

210 211 212
	/* Don't allow any more processes into the pid namespace */
	disable_pid_allocation(pid_ns);

213 214 215 216 217
	/*
	 * Ignore SIGCHLD causing any terminated children to autoreap.
	 * This speeds up the namespace shutdown, plus see the comment
	 * below.
	 */
218 219 220
	spin_lock_irq(&me->sighand->siglock);
	me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
	spin_unlock_irq(&me->sighand->siglock);
221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237

	/*
	 * The last thread in the cgroup-init thread group is terminating.
	 * Find remaining pid_ts in the namespace, signal and wait for them
	 * to exit.
	 *
	 * Note:  This signals each threads in the namespace - even those that
	 * 	  belong to the same thread group, To avoid this, we would have
	 * 	  to walk the entire tasklist looking a processes in this
	 * 	  namespace, but that could be unnecessarily expensive if the
	 * 	  pid namespace has just a few processes. Or we need to
	 * 	  maintain a tasklist for each pid namespace.
	 *
	 */
	read_lock(&tasklist_lock);
	nr = next_pidmap(pid_ns, 1);
	while (nr > 0) {
238 239 240
		rcu_read_lock();

		task = pid_task(find_vpid(nr), PIDTYPE_PID);
241 242
		if (task && !__fatal_signal_pending(task))
			send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
243 244 245

		rcu_read_unlock();

246 247 248 249
		nr = next_pidmap(pid_ns, nr);
	}
	read_unlock(&tasklist_lock);

250 251 252 253 254
	/*
	 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
	 * sys_wait4() will also block until our children traced from the
	 * parent namespace are detached and become EXIT_DEAD.
	 */
255 256 257 258 259
	do {
		clear_thread_flag(TIF_SIGPENDING);
		rc = sys_wait4(-1, NULL, __WALL, NULL);
	} while (rc != -ECHILD);

260
	/*
261 262 263 264 265 266 267 268 269 270 271 272 273 274
	 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
	 * really care, we could reparent them to the global init. We could
	 * exit and reap ->child_reaper even if it is not the last thread in
	 * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
	 * pid_ns can not go away until proc_kill_sb() drops the reference.
	 *
	 * But this ns can also have other tasks injected by setns()+fork().
	 * Again, ignoring the user visible semantics we do not really need
	 * to wait until they are all reaped, but they can be reparented to
	 * us and thus we need to ensure that pid->child_reaper stays valid
	 * until they all go away. See free_pid()->wake_up_process().
	 *
	 * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
	 * if reparented.
275 276
	 */
	for (;;) {
277
		set_current_state(TASK_UNINTERRUPTIBLE);
278
		if (pid_ns->nr_hashed == init_pids)
279 280 281
			break;
		schedule();
	}
282
	__set_current_state(TASK_RUNNING);
283

284 285 286
	if (pid_ns->reboot)
		current->signal->group_exit_code = pid_ns->reboot;

287
	acct_exit_ns(pid_ns);
288 289 290
	return;
}

#ifdef CONFIG_CHECKPOINT_RESTORE
292 293 294
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp, loff_t *ppos)
{
295
	struct pid_namespace *pid_ns = task_active_pid_ns(current);
296 297
	struct ctl_table tmp = *table;

298
	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
299 300 301 302 303 304 305 306
		return -EPERM;

	/*
	 * Writing directly to ns' last_pid field is OK, since this field
	 * is volatile in a living namespace anyway and a code writing to
	 * it should synchronize its usage with external means.
	 */

307
	tmp.data = &pid_ns->last_pid;
308
	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
309 310
}

311 312
extern int pid_max;
static int zero = 0;
313 314 315 316 317 318
static struct ctl_table pid_ns_ctl_table[] = {
	{
		.procname = "ns_last_pid",
		.maxlen = sizeof(int),
		.mode = 0666, /* permissions are checked in the handler */
		.proc_handler = pid_ns_ctl_handler,
319 320
		.extra1 = &zero,
		.extra2 = &pid_max,
321 322 323 324
	},
	{ }
};
static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
#endif	/* CONFIG_CHECKPOINT_RESTORE */
/*
 * Handle sys_reboot() from inside a pid namespace: record the signal
 * that encodes the requested action in pid_ns->reboot, SIGKILL the
 * namespace's init, and exit the calling task. Never returns for a
 * non-init namespace with a valid @cmd; returns 0 in init_pid_ns and
 * -EINVAL for unknown commands.
 */
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
	int sig;

	if (pid_ns == &init_pid_ns)
		return 0;

	switch (cmd) {
	case LINUX_REBOOT_CMD_RESTART:
	case LINUX_REBOOT_CMD_RESTART2:
		sig = SIGHUP;
		break;
	case LINUX_REBOOT_CMD_HALT:
	case LINUX_REBOOT_CMD_POWER_OFF:
		sig = SIGINT;
		break;
	default:
		return -EINVAL;
	}

	pid_ns->reboot = sig;

	read_lock(&tasklist_lock);
	force_sig(SIGKILL, pid_ns->child_reaper);
	read_unlock(&tasklist_lock);

	do_exit(0);

	/* Not reached */
	return 0;
}

356 357 358 359 360
static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
{
	return container_of(ns, struct pid_namespace, ns);
}

361
static struct ns_common *pidns_get(struct task_struct *task)
E
Eric W. Biederman 已提交
362 363 364 365
{
	struct pid_namespace *ns;

	rcu_read_lock();
366 367 368
	ns = task_active_pid_ns(task);
	if (ns)
		get_pid_ns(ns);
E
Eric W. Biederman 已提交
369 370
	rcu_read_unlock();

371
	return ns ? &ns->ns : NULL;
E
Eric W. Biederman 已提交
372 373
}

/* nsfs "put" hook: drop the reference taken by pidns_get(). */
static void pidns_put(struct ns_common *ns)
{
	put_pid_ns(to_pid_ns(ns));
}

379
static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
E
Eric W. Biederman 已提交
380 381
{
	struct pid_namespace *active = task_active_pid_ns(current);
382
	struct pid_namespace *ancestor, *new = to_pid_ns(ns);
E
Eric W. Biederman 已提交
383

384
	if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
385
	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
E
Eric W. Biederman 已提交
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404
		return -EPERM;

	/*
	 * Only allow entering the current active pid namespace
	 * or a child of the current active pid namespace.
	 *
	 * This is required for fork to return a usable pid value and
	 * this maintains the property that processes and their
	 * children can not escape their current pid namespace.
	 */
	if (new->level < active->level)
		return -EINVAL;

	ancestor = new;
	while (ancestor->level > active->level)
		ancestor = ancestor->parent;
	if (ancestor != active)
		return -EINVAL;

405 406
	put_pid_ns(nsproxy->pid_ns_for_children);
	nsproxy->pid_ns_for_children = get_pid_ns(new);
E
Eric W. Biederman 已提交
407 408 409
	return 0;
}

410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427
static struct ns_common *pidns_get_parent(struct ns_common *ns)
{
	struct pid_namespace *active = task_active_pid_ns(current);
	struct pid_namespace *pid_ns, *p;

	/* See if the parent is in the current namespace */
	pid_ns = p = to_pid_ns(ns)->parent;
	for (;;) {
		if (!p)
			return ERR_PTR(-EPERM);
		if (p == active)
			break;
		p = p->parent;
	}

	return &get_pid_ns(pid_ns)->ns;
}

428 429 430 431 432
static struct user_namespace *pidns_owner(struct ns_common *ns)
{
	return to_pid_ns(ns)->user_ns;
}

E
Eric W. Biederman 已提交
433 434 435 436 437 438
const struct proc_ns_operations pidns_operations = {
	.name		= "pid",
	.type		= CLONE_NEWPID,
	.get		= pidns_get,
	.put		= pidns_put,
	.install	= pidns_install,
439
	.owner		= pidns_owner,
440
	.get_parent	= pidns_get_parent,
E
Eric W. Biederman 已提交
441 442
};

443 444 445
static __init int pid_namespaces_init(void)
{
	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
446 447

#ifdef CONFIG_CHECKPOINT_RESTORE
448
	register_sysctl_paths(kern_path, pid_ns_ctl_table);
449
#endif
450 451 452 453
	return 0;
}

__initcall(pid_namespaces_init);