/* net/core/net_namespace.c — network namespace lifecycle and pernet ops. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

3 4 5 6 7 8
#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/delay.h>
9
#include <linux/sched.h>
10
#include <linux/idr.h>
11
#include <linux/rculist.h>
12
#include <linux/nsproxy.h>
13 14
#include <linux/proc_fs.h>
#include <linux/file.h>
15
#include <linux/export.h>
16
#include <linux/user_namespace.h>
17
#include <net/net_namespace.h>
18
#include <net/netns/generic.h>
19 20 21 22 23 24 25 26 27 28

/*
 *	Our network namespace constructor/destructor lists
 */

/* All registered pernet_operations, in registration order.  Subsystems are
 * inserted before first_device so they run before device handlers (see
 * register_pernet_subsys()/register_pernet_device()). */
static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
/* Serializes pernet (un)registration and namespace setup/teardown. */
static DEFINE_MUTEX(net_mutex);

/* Every live network namespace; additions/removals happen under rtnl_lock
 * with RCU-safe list primitives (see copy_net_ns()/cleanup_net()). */
LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);
30

31 32 33
/* The initial network namespace, set up at boot by net_ns_init(). */
struct net init_net = {
	.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
};
EXPORT_SYMBOL(init_net);
35

36 37
#define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */

/* Largest generic id handed out so far; grown by
 * register_pernet_operations() so new arrays fit every registered id. */
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;

/*
 * Allocate a zeroed net_generic array sized for max_gen_ptrs entries.
 * Returns NULL on allocation failure.
 */
static struct net_generic *net_alloc_generic(void)
{
	struct net_generic *ng;
	size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);

	ng = kzalloc(generic_size, GFP_KERNEL);
	if (ng)
		ng->len = max_gen_ptrs;

	return ng;
}

52 53 54 55 56 57 58
/*
 * Store @data at slot @id (1-based) of net->gen, growing the array first
 * when @id is beyond its current length.  Must be called with net_mutex
 * held.  Returns 0 or -ENOMEM.
 */
static int net_assign_generic(struct net *net, int id, void *data)
{
	struct net_generic *ng, *old_ng;

	BUG_ON(!mutex_is_locked(&net_mutex));
	BUG_ON(id == 0);	/* ids are 1-based; 0 is never allocated */

	old_ng = rcu_dereference_protected(net->gen,
					   lockdep_is_held(&net_mutex));
	ng = old_ng;
	if (old_ng->len >= id)
		goto assign;

	/* Array too small: replace it with a larger copy (RCU style). */
	ng = net_alloc_generic();
	if (ng == NULL)
		return -ENOMEM;

	/*
	 * Some synchronisation notes:
	 *
	 * The net_generic explores the net->gen array inside rcu
	 * read section. Besides once set the net->gen->ptr[x]
	 * pointer never changes (see rules in netns/generic.h).
	 *
	 * That said, we simply duplicate this array and schedule
	 * the old copy for kfree after a grace period.
	 */

	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));

	rcu_assign_pointer(net->gen, ng);
	kfree_rcu(old_ng, rcu);
assign:
	ng->ptr[id - 1] = data;
	return 0;
}

89 90
/*
 * Run @ops' per-namespace initialization for @net: allocate and register
 * the ops' private data (when ops->id/ops->size are set), then call
 * ops->init().  Returns 0 on success or a negative errno.
 */
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
	int err = -ENOMEM;
	void *data = NULL;

	if (ops->id && ops->size) {
		data = kzalloc(ops->size, GFP_KERNEL);
		if (!data)
			goto out;

		err = net_assign_generic(net, *ops->id, data);
		if (err)
			goto cleanup;
	}
	err = 0;
	if (ops->init)
		err = ops->init(net);
	if (!err)
		return 0;

cleanup:
	/* NOTE(review): after a failed ops->init() the freed @data is still
	 * reachable via net->gen->ptr[*ops->id - 1]; nothing here clears the
	 * slot — confirm callers never use the id after a failed init. */
	kfree(data);

out:
	return err;
}

/* Release the private per-net storage ops_init() attached for @ops. */
static void ops_free(const struct pernet_operations *ops, struct net *net)
{
	if (!ops->id || !ops->size)
		return;

	kfree(net_generic(net, *ops->id));
}

124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
/*
 * Invoke @ops' per-net exit hook on every namespace queued on
 * @net_exit_list, then the batched exit hook once for the whole list.
 */
static void ops_exit_list(const struct pernet_operations *ops,
			  struct list_head *net_exit_list)
{
	struct net *cur;

	if (ops->exit) {
		list_for_each_entry(cur, net_exit_list, exit_list)
			ops->exit(cur);
	}

	if (ops->exit_batch)
		ops->exit_batch(net_exit_list);
}

/* Free @ops' private per-net data for every namespace on @net_exit_list. */
static void ops_free_list(const struct pernet_operations *ops,
			  struct list_head *net_exit_list)
{
	struct net *cur;

	if (!ops->size || !ops->id)
		return;

	list_for_each_entry(cur, net_exit_list, exit_list)
		ops_free(ops, cur);
}

146 147 148
/*
 * setup_net runs the initializers for the network namespace object.
 */
static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
{
	/* Must be called with net_mutex held */
	const struct pernet_operations *ops, *saved_ops;
	int error = 0;
	LIST_HEAD(net_exit_list);

	atomic_set(&net->count, 1);
	atomic_set(&net->passive, 1);	/* dropped via net_drop_ns() */
	net->dev_base_seq = 1;
	net->user_ns = user_ns;

#ifdef NETNS_REFCNT_DEBUG
	atomic_set(&net->use_count, 0);
#endif

	/* Run every registered pernet init, in registration order. */
	list_for_each_entry(ops, &pernet_list, list) {
		error = ops_init(ops, net);
		if (error < 0)
			goto out_undo;
	}
out:
	return error;

out_undo:
	/* Walk through the list backwards calling the exit functions
	 * for the pernet modules whose init functions did not fail.
	 */
	list_add(&net->exit_list, &net_exit_list);
	saved_ops = ops;
	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
		ops_exit_list(ops, &net_exit_list);

	/* Second pass frees the per-net data only after all exits ran. */
	ops = saved_ops;
	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
		ops_free_list(ops, &net_exit_list);

	/* Wait for pending RCU callbacks before the caller frees @net. */
	rcu_barrier();
	goto out;
}

190

191 192 193 194
#ifdef CONFIG_NET_NS
/* Slab cache for struct net allocations; created in net_ns_init(). */
static struct kmem_cache *net_cachep;
/* Workqueue on which cleanup_net() runs (queued by __put_net()). */
static struct workqueue_struct *netns_wq;

195
static struct net *net_alloc(void)
196
{
197 198 199 200 201 202 203 204
	struct net *net = NULL;
	struct net_generic *ng;

	ng = net_alloc_generic();
	if (!ng)
		goto out;

	net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
205
	if (!net)
206
		goto out_free;
207

208 209 210 211 212 213 214 215 216 217 218
	rcu_assign_pointer(net->gen, ng);
out:
	return net;

out_free:
	kfree(ng);
	goto out;
}

/* Final release of a struct net; called once the passive count hits zero
 * (see net_drop_ns()). */
static void net_free(struct net *net)
{
#ifdef NETNS_REFCNT_DEBUG
	/* With refcount debugging on, refuse to free a namespace someone
	 * still holds debug references to — leak instead of use-after-free. */
	if (unlikely(atomic_read(&net->use_count) != 0)) {
		pr_emerg("network namespace not free! Usage: %d\n",
			 atomic_read(&net->use_count));
		return;
	}
#endif
	kfree(net->gen);
	kmem_cache_free(net_cachep, net);
}

230 231 232 233 234 235 236
/* Drop one passive reference on @p (a struct net, possibly NULL) and free
 * the namespace when the last one goes away. */
void net_drop_ns(void *p)
{
	struct net *net = p;

	if (!net)
		return;

	if (atomic_dec_and_test(&net->passive))
		net_free(net);
}

237 238
/*
 * Create a new network namespace when CLONE_NEWNET is set in @flags,
 * otherwise just take another reference on @old_net.  On success the new
 * namespace holds a reference on @user_ns.  Returns the namespace or an
 * ERR_PTR on failure.
 */
struct net *copy_net_ns(unsigned long flags,
			struct user_namespace *user_ns, struct net *old_net)
{
	struct net *net;
	int rv;

	if (!(flags & CLONE_NEWNET))
		return get_net(old_net);

	net = net_alloc();
	if (!net)
		return ERR_PTR(-ENOMEM);

	get_user_ns(user_ns);

	mutex_lock(&net_mutex);
	rv = setup_net(net, user_ns);
	if (rv == 0) {
		/* Publish on the global list; mutations need rtnl_lock. */
		rtnl_lock();
		list_add_tail_rcu(&net->list, &net_namespace_list);
		rtnl_unlock();
	}
	mutex_unlock(&net_mutex);
	if (rv < 0) {
		/* Undo the user_ns grab and the passive ref from net_alloc. */
		put_user_ns(user_ns);
		net_drop_ns(net);
		return ERR_PTR(rv);
	}
	return net;
}
267

268 269 270
static DEFINE_SPINLOCK(cleanup_list_lock);
static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */

/*
 * Work handler that tears down every namespace queued on cleanup_list.
 * Runs in process context on netns_wq (queued by __put_net()) so it may
 * sleep while holding net_mutex and rtnl_lock.
 */
static void cleanup_net(struct work_struct *work)
{
	const struct pernet_operations *ops;
	struct net *net, *tmp;
	LIST_HEAD(net_kill_list);
	LIST_HEAD(net_exit_list);

	/* Atomically snapshot the list of namespaces to cleanup */
	spin_lock_irq(&cleanup_list_lock);
	list_replace_init(&cleanup_list, &net_kill_list);
	spin_unlock_irq(&cleanup_list_lock);

	mutex_lock(&net_mutex);

	/* Don't let anyone else find us. */
	rtnl_lock();
	list_for_each_entry(net, &net_kill_list, cleanup_list) {
		list_del_rcu(&net->list);
		list_add_tail(&net->exit_list, &net_exit_list);
	}
	rtnl_unlock();

	/*
	 * Another CPU might be rcu-iterating the list, wait for it.
	 * This needs to be before calling the exit() notifiers, so
	 * the rcu_barrier() below isn't sufficient alone.
	 */
	synchronize_rcu();

	/* Run all of the network namespace exit methods */
	list_for_each_entry_reverse(ops, &pernet_list, list)
		ops_exit_list(ops, &net_exit_list);

	/* Free the net generic variables */
	list_for_each_entry_reverse(ops, &pernet_list, list)
		ops_free_list(ops, &net_exit_list);

	mutex_unlock(&net_mutex);

	/* Ensure there are no outstanding rcu callbacks using this
	 * network namespace.
	 */
	rcu_barrier();

	/* Finally it is safe to free my network namespace structure */
	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
		list_del_init(&net->exit_list);
		put_user_ns(net->user_ns);
		net_drop_ns(net);
	}
}
static DECLARE_WORK(net_cleanup_work, cleanup_net);
323 324 325 326

/*
 * Queue @net for asynchronous teardown.  Presumably called when the last
 * net->count reference is dropped (put_net() lives outside this file —
 * confirm against net_namespace.h); may run in any context, hence the
 * irqsave lock and the deferral to the workqueue.
 */
void __put_net(struct net *net)
{
	/* Cleanup the network namespace in process context */
	unsigned long flags;

	spin_lock_irqsave(&cleanup_list_lock, flags);
	list_add(&net->cleanup_list, &cleanup_list);
	spin_unlock_irqrestore(&cleanup_list_lock, flags);

	queue_work(netns_wq, &net_cleanup_work);
}
EXPORT_SYMBOL_GPL(__put_net);

337 338 339 340 341 342 343
/*
 * Resolve an open namespace file descriptor to its network namespace,
 * taking a reference.  Returns ERR_PTR(-EINVAL) when the fd refers to a
 * namespace of a different type, or an ERR_PTR from proc_ns_fget().
 */
struct net *get_net_ns_by_fd(int fd)
{
	struct proc_inode *ei;
	struct file *file;
	struct net *net;

	file = proc_ns_fget(fd);
	if (IS_ERR(file))
		return ERR_CAST(file);

	ei = PROC_I(file_inode(file));
	/* Only accept fds whose ns_ops identify a *network* namespace. */
	if (ei->ns_ops == &netns_operations)
		net = get_net(ei->ns);
	else
		net = ERR_PTR(-EINVAL);

	fput(file);
	return net;
}

357
#else
358 359 360 361
/* Without CONFIG_NET_NS there are no namespace fds to resolve. */
struct net *get_net_ns_by_fd(int fd)
{
	return ERR_PTR(-EINVAL);
}
362 363
#endif

364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
/*
 * Look up the network namespace of the task with (virtual) pid @pid,
 * taking a reference on it.  Returns ERR_PTR(-ESRCH) when no such task
 * exists or it has no nsproxy.
 */
struct net *get_net_ns_by_pid(pid_t pid)
{
	struct net *net = ERR_PTR(-ESRCH);
	struct task_struct *task;

	rcu_read_lock();
	task = find_task_by_vpid(pid);
	if (task) {
		struct nsproxy *proxy = task_nsproxy(task);

		if (proxy)
			net = get_net(proxy->net_ns);
	}
	rcu_read_unlock();

	return net;
}
EXPORT_SYMBOL_GPL(get_net_ns_by_pid);

384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
/* Allocate the proc inode number that identifies @net's namespace file. */
static __net_init int net_ns_net_init(struct net *net)
{
	return proc_alloc_inum(&net->proc_inum);
}

/* Release the proc inode number taken in net_ns_net_init(). */
static __net_exit void net_ns_net_exit(struct net *net)
{
	proc_free_inum(net->proc_inum);
}

/* Registered by net_ns_init() so every namespace gets a proc inum. */
static struct pernet_operations __net_initdata net_ns_ops = {
	.init = net_ns_net_init,
	.exit = net_ns_net_exit,
};

399 400
/*
 * Boot-time setup: create the struct-net slab cache and cleanup workqueue
 * (CONFIG_NET_NS only), then initialize and publish init_net.  Any
 * failure here panics — networking cannot proceed without init_net.
 */
static int __init net_ns_init(void)
{
	struct net_generic *ng;

#ifdef CONFIG_NET_NS
	net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
					SMP_CACHE_BYTES,
					SLAB_PANIC, NULL);

	/* Create workqueue for cleanup */
	netns_wq = create_singlethread_workqueue("netns");
	if (!netns_wq)
		panic("Could not create netns workq");
#endif

	/* init_net is static, so attach its generic array by hand. */
	ng = net_alloc_generic();
	if (!ng)
		panic("Could not allocate generic netns");

	rcu_assign_pointer(init_net.gen, ng);

	mutex_lock(&net_mutex);
	if (setup_net(&init_net, &init_user_ns))
		panic("Could not setup the initial network namespace");

	rtnl_lock();
	list_add_tail_rcu(&init_net.list, &net_namespace_list);
	rtnl_unlock();

	mutex_unlock(&net_mutex);

	register_pernet_subsys(&net_ns_ops);

	return 0;
}

pure_initcall(net_ns_init);

437
#ifdef CONFIG_NET_NS
/*
 * Add @ops to @list and run its init for every existing namespace.
 * On failure, unwind the namespaces already initialized.  Called with
 * net_mutex held (see register_pernet_subsys()/register_pernet_device()).
 */
static int __register_pernet_operations(struct list_head *list,
					struct pernet_operations *ops)
{
	struct net *net;
	int error;
	LIST_HEAD(net_exit_list);

	list_add_tail(&ops->list, list);
	/* Only bother walking namespaces when ops actually does per-net work. */
	if (ops->init || (ops->id && ops->size)) {
		for_each_net(net) {
			error = ops_init(ops, net);
			if (error)
				goto out_undo;
			list_add_tail(&net->exit_list, &net_exit_list);
		}
	}
	return 0;

out_undo:
	/* If I have an error cleanup all namespaces I initialized */
	list_del(&ops->list);
	ops_exit_list(ops, &net_exit_list);
	ops_free_list(ops, &net_exit_list);
	return error;
}

464
/*
 * Remove @ops from the pernet list and run its exit/free hooks for every
 * existing namespace.  Called with net_mutex held.
 */
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
	struct net *net;
	LIST_HEAD(net_exit_list);

	list_del(&ops->list);
	for_each_net(net)
		list_add_tail(&net->exit_list, &net_exit_list);
	ops_exit_list(ops, &net_exit_list);
	ops_free_list(ops, &net_exit_list);
}

476 477
#else

478 479
/* Without CONFIG_NET_NS only init_net exists, so just init that one. */
static int __register_pernet_operations(struct list_head *list,
					struct pernet_operations *ops)
{
	return ops_init(ops, &init_net);
}

484
/* Without CONFIG_NET_NS, run the exit/free path for init_net alone. */
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
	LIST_HEAD(net_exit_list);
	list_add(&init_net.exit_list, &net_exit_list);
	ops_exit_list(ops, &net_exit_list);
	ops_free_list(ops, &net_exit_list);
}
491 492

#endif /* CONFIG_NET_NS */
493

494 495
/* Allocator for net_generic slot ids; ids start at 1 (0 is reserved). */
static DEFINE_IDA(net_generic_ids);

/*
 * Allocate a generic id for @ops (when it wants one), grow max_gen_ptrs
 * to cover it, and register the operations.  Called with net_mutex held.
 * Returns 0 or a negative errno.
 */
static int register_pernet_operations(struct list_head *list,
				      struct pernet_operations *ops)
{
	int error;

	if (ops->id) {
again:
		/* ida_get_new_above() may need memory: preload and retry
		 * on -EAGAIN. */
		error = ida_get_new_above(&net_generic_ids, 1, ops->id);
		if (error < 0) {
			if (error == -EAGAIN) {
				ida_pre_get(&net_generic_ids, GFP_KERNEL);
				goto again;
			}
			return error;
		}
		max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
	}
	error = __register_pernet_operations(list, ops);
	if (error) {
		/* Let RCU readers of the per-net data finish before the
		 * id can be handed out again. */
		rcu_barrier();
		if (ops->id)
			ida_remove(&net_generic_ids, *ops->id);
	}

	return error;
}

static void unregister_pernet_operations(struct pernet_operations *ops)
{
	
	__unregister_pernet_operations(ops);
527
	rcu_barrier();
528 529 530 531
	if (ops->id)
		ida_remove(&net_generic_ids, *ops->id);
}

532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565
/**
 *      register_pernet_subsys - register a network namespace subsystem
 *	@ops:  pernet operations structure for the subsystem
 *
 *	Register a subsystem which has init and exit functions
 *	that are called when network namespaces are created and
 *	destroyed respectively.
 *
 *	When registered all network namespace init functions are
 *	called for every existing network namespace.  Allowing kernel
 *	modules to have a race free view of the set of network namespaces.
 *
 *	When a new network namespace is created all of the init
 *	methods are called in the order in which they were registered.
 *
 *	When a network namespace is destroyed all of the exit methods
 *	are called in the reverse of the order with which they were
 *	registered.
 */
int register_pernet_subsys(struct pernet_operations *ops)
{
	int error;

	/* Subsystems go before first_device so they init before devices. */
	mutex_lock(&net_mutex);
	error = register_pernet_operations(first_device, ops);
	mutex_unlock(&net_mutex);

	return error;
}
EXPORT_SYMBOL_GPL(register_pernet_subsys);

/**
 *      unregister_pernet_subsys - unregister a network namespace subsystem
 *	@ops: pernet operations structure to manipulate
 *
 *	Remove the pernet operations structure from the list to be
566
 *	used when network namespaces are created or destroyed.  In
567 568 569
 *	addition run the exit method for all existing network
 *	namespaces.
 */
570
void unregister_pernet_subsys(struct pernet_operations *ops)
{
	/* net_mutex serializes against namespace creation/teardown. */
	mutex_lock(&net_mutex);
	unregister_pernet_operations(ops);
	mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);

/**
 *      register_pernet_device - register a network namespace device
 *	@ops:  pernet operations structure for the subsystem
 *
 *	Register a device which has init and exit functions
 *	that are called when network namespaces are created and
 *	destroyed respectively.
 *
 *	When registered all network namespace init functions are
 *	called for every existing network namespace.  Allowing kernel
 *	modules to have a race free view of the set of network namespaces.
 *
 *	When a new network namespace is created all of the init
 *	methods are called in the order in which they were registered.
 *
 *	When a network namespace is destroyed all of the exit methods
 *	are called in the reverse of the order with which they were
 *	registered.
 */
int register_pernet_device(struct pernet_operations *ops)
{
	int error;
	mutex_lock(&net_mutex);
	error = register_pernet_operations(&pernet_list, ops);
	/* Remember the first device ops, so later subsystems can still be
	 * inserted ahead of all devices (see register_pernet_subsys()). */
	if (!error && (first_device == &pernet_list))
		first_device = &ops->list;
	mutex_unlock(&net_mutex);
	return error;
}
EXPORT_SYMBOL_GPL(register_pernet_device);

/**
 *      unregister_pernet_device - unregister a network namespace netdevice
 *	@ops: pernet operations structure to manipulate
 *
 *	Remove the pernet operations structure from the list to be
614
 *	used when network namespaces are created or destroyed.  In
615 616 617 618 619 620 621 622 623 624 625 626
 *	addition run the exit method for all existing network
 *	namespaces.
 */
void unregister_pernet_device(struct pernet_operations *ops)
{
	mutex_lock(&net_mutex);
	/* If this was the first device, move the subsystem/device boundary
	 * marker past it before it disappears from the list. */
	if (&ops->list == first_device)
		first_device = first_device->next;
	unregister_pernet_operations(ops);
	mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);
627 628 629 630

#ifdef CONFIG_NET_NS
/* proc_ns_operations .get hook: take a reference on @task's network
 * namespace, or return NULL when the task has no nsproxy. */
static void *netns_get(struct task_struct *task)
{
	struct net *net = NULL;
	struct nsproxy *nsproxy;

	rcu_read_lock();
	nsproxy = task_nsproxy(task);
	if (nsproxy)
		net = get_net(nsproxy->net_ns);
	rcu_read_unlock();

	return net;
}

/* proc_ns_operations .put hook: drop the reference netns_get() took. */
static void netns_put(void *ns)
{
	put_net(ns);
}

/* proc_ns_operations .install hook: switch @nsproxy's network namespace
 * to @ns.  Requires CAP_SYS_ADMIN over both the target namespace's user
 * namespace and the caller's own. */
static int netns_install(struct nsproxy *nsproxy, void *ns)
{
	struct net *net = ns;

	if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
	    !nsown_capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Swap references: release the old namespace, pin the new one. */
	put_net(nsproxy->net_ns);
	nsproxy->net_ns = get_net(net);
	return 0;
}

661 662 663 664 665 666
static unsigned int netns_inum(void *ns)
{
	struct net *net = ns;
	return net->proc_inum;
}

667 668 669 670 671 672
/* Hooks backing the per-task network-namespace proc file. */
const struct proc_ns_operations netns_operations = {
	.name		= "net",
	.type		= CLONE_NEWNET,
	.get		= netns_get,
	.put		= netns_put,
	.install	= netns_install,
	.inum		= netns_inum,
};
#endif