net_namespace.c 13.4 KB
Newer Older
1 2 3 4 5 6
#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/delay.h>
7
#include <linux/sched.h>
8
#include <linux/idr.h>
9
#include <linux/rculist.h>
10
#include <linux/nsproxy.h>
11
#include <net/net_namespace.h>
12
#include <net/netns/generic.h>
13 14 15 16 17 18 19 20 21 22

/*
 *	Our network namespace constructor/destructor lists
 */

static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
static DEFINE_MUTEX(net_mutex);

LIST_HEAD(net_namespace_list);
A
Alexey Dobriyan 已提交
23
EXPORT_SYMBOL_GPL(net_namespace_list);
24 25

struct net init_net;
26
EXPORT_SYMBOL(init_net);
27

28 29
#define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */

30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
/*
 * RCU callback: free the net_generic array that embeds @rcu.
 */
static void net_generic_release(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct net_generic, rcu));
}

/*
 * Store @data in slot @id (1-based) of @net's generic pointer array,
 * growing the array when @id does not fit.
 *
 * Must be called with net_mutex held; @id must be non-zero.
 * Returns 0 on success or -ENOMEM if a larger array cannot be allocated.
 */
static int net_assign_generic(struct net *net, int id, void *data)
{
	struct net_generic *cur, *grown;

	BUG_ON(!mutex_is_locked(&net_mutex));
	BUG_ON(id == 0);

	cur = rcu_dereference_protected(net->gen,
					lockdep_is_held(&net_mutex));

	/* The slot already exists: just fill it in. */
	if (cur->len >= id) {
		cur->ptr[id - 1] = data;
		return 0;
	}

	grown = kzalloc(sizeof(struct net_generic) +
			id * sizeof(void *), GFP_KERNEL);
	if (grown == NULL)
		return -ENOMEM;

	/*
	 * Some synchronisation notes:
	 *
	 * The net_generic explores the net->gen array inside rcu
	 * read section. Besides once set the net->gen->ptr[x]
	 * pointer never changes (see rules in netns/generic.h).
	 *
	 * That said, we simply duplicate this array and schedule
	 * the old copy for kfree after a grace period.
	 */
	grown->len = id;
	memcpy(&grown->ptr, &cur->ptr, cur->len * sizeof(void*));

	rcu_assign_pointer(net->gen, grown);
	call_rcu(&cur->rcu, net_generic_release);

	grown->ptr[id - 1] = data;
	return 0;
}

77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
	int err;
	if (ops->id && ops->size) {
		void *data = kzalloc(ops->size, GFP_KERNEL);
		if (!data)
			return -ENOMEM;

		err = net_assign_generic(net, *ops->id, data);
		if (err) {
			kfree(data);
			return err;
		}
	}
	if (ops->init)
		return ops->init(net);
	return 0;
}

/*
 * Free the generic buffer that @ops owns in @net.  Does nothing for
 * ops that lack an id or a size.
 */
static void ops_free(const struct pernet_operations *ops, struct net *net)
{
	if (!ops->id || !ops->size)
		return;

	kfree(net_generic(net, *ops->id));
}

104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
static void ops_exit_list(const struct pernet_operations *ops,
			  struct list_head *net_exit_list)
{
	struct net *net;
	if (ops->exit) {
		list_for_each_entry(net, net_exit_list, exit_list)
			ops->exit(net);
	}
	if (ops->exit_batch)
		ops->exit_batch(net_exit_list);
}

static void ops_free_list(const struct pernet_operations *ops,
			  struct list_head *net_exit_list)
{
	struct net *net;
	if (ops->size && ops->id) {
		list_for_each_entry(net, net_exit_list, exit_list)
			ops_free(ops, net);
	}
}

126 127 128
/*
 * setup_net runs the initializers for the network namespace object.
 */
129
static __net_init int setup_net(struct net *net)
130 131
{
	/* Must be called with net_mutex held */
132
	const struct pernet_operations *ops, *saved_ops;
133
	int error = 0;
134
	LIST_HEAD(net_exit_list);
135 136

	atomic_set(&net->count, 1);
137

138
#ifdef NETNS_REFCNT_DEBUG
139
	atomic_set(&net->use_count, 0);
140
#endif
141

142
	list_for_each_entry(ops, &pernet_list, list) {
143 144 145
		error = ops_init(ops, net);
		if (error < 0)
			goto out_undo;
146 147 148
	}
out:
	return error;
149

150 151 152 153
out_undo:
	/* Walk through the list backwards calling the exit functions
	 * for the pernet modules whose init functions did not fail.
	 */
154
	list_add(&net->exit_list, &net_exit_list);
155
	saved_ops = ops;
156 157 158
	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
		ops_exit_list(ops, &net_exit_list);

159 160
	ops = saved_ops;
	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
161
		ops_free_list(ops, &net_exit_list);
162 163

	rcu_barrier();
164 165 166
	goto out;
}

167
static struct net_generic *net_alloc_generic(void)
168
{
169 170 171 172 173 174 175 176 177
	struct net_generic *ng;
	size_t generic_size = sizeof(struct net_generic) +
		INITIAL_NET_GEN_PTRS * sizeof(void *);

	ng = kzalloc(generic_size, GFP_KERNEL);
	if (ng)
		ng->len = INITIAL_NET_GEN_PTRS;

	return ng;
178 179
}

180 181 182 183
#ifdef CONFIG_NET_NS
/* Slab cache that struct net objects come from; created in net_ns_init(). */
static struct kmem_cache *net_cachep;
/* Workqueue on which __put_net() queues cleanup work; created in net_ns_init(). */
static struct workqueue_struct *netns_wq;

184
static struct net *net_alloc(void)
185
{
186 187 188 189 190 191 192 193
	struct net *net = NULL;
	struct net_generic *ng;

	ng = net_alloc_generic();
	if (!ng)
		goto out;

	net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
194
	if (!net)
195
		goto out_free;
196

197 198 199 200 201 202 203 204 205 206 207
	rcu_assign_pointer(net->gen, ng);
out:
	return net;

out_free:
	kfree(ng);
	goto out;
}

static void net_free(struct net *net)
{
208
#ifdef NETNS_REFCNT_DEBUG
209 210 211 212 213
	if (unlikely(atomic_read(&net->use_count) != 0)) {
		printk(KERN_EMERG "network namespace not free! Usage: %d\n",
			atomic_read(&net->use_count));
		return;
	}
214
#endif
215
	kfree(net->gen);
216 217 218
	kmem_cache_free(net_cachep, net);
}

219
static struct net *net_create(void)
220
{
221 222
	struct net *net;
	int rv;
223

224 225 226
	net = net_alloc();
	if (!net)
		return ERR_PTR(-ENOMEM);
227
	mutex_lock(&net_mutex);
228 229
	rv = setup_net(net);
	if (rv == 0) {
230
		rtnl_lock();
231
		list_add_tail_rcu(&net->list, &net_namespace_list);
232 233
		rtnl_unlock();
	}
234
	mutex_unlock(&net_mutex);
235 236 237 238 239 240
	if (rv < 0) {
		net_free(net);
		return ERR_PTR(rv);
	}
	return net;
}
241

242 243 244 245 246
/*
 * Pick the network namespace for a new task: a freshly created one when
 * CLONE_NEWNET is set in @flags, otherwise @old_net passed through
 * get_net().
 */
struct net *copy_net_ns(unsigned long flags, struct net *old_net)
{
	if (flags & CLONE_NEWNET)
		return net_create();

	return get_net(old_net);
}

249 250 251
/* Guards cleanup_list; its users in this file take it with interrupts disabled. */
static DEFINE_SPINLOCK(cleanup_list_lock);
/* Namespaces queued by __put_net() and drained by cleanup_net(). */
static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */

252 253
static void cleanup_net(struct work_struct *work)
{
254
	const struct pernet_operations *ops;
255 256
	struct net *net, *tmp;
	LIST_HEAD(net_kill_list);
257
	LIST_HEAD(net_exit_list);
258

259 260 261 262
	/* Atomically snapshot the list of namespaces to cleanup */
	spin_lock_irq(&cleanup_list_lock);
	list_replace_init(&cleanup_list, &net_kill_list);
	spin_unlock_irq(&cleanup_list_lock);
263 264 265 266 267

	mutex_lock(&net_mutex);

	/* Don't let anyone else find us. */
	rtnl_lock();
268
	list_for_each_entry(net, &net_kill_list, cleanup_list) {
269
		list_del_rcu(&net->list);
270 271
		list_add_tail(&net->exit_list, &net_exit_list);
	}
272 273
	rtnl_unlock();

274 275 276 277 278 279 280
	/*
	 * Another CPU might be rcu-iterating the list, wait for it.
	 * This needs to be before calling the exit() notifiers, so
	 * the rcu_barrier() below isn't sufficient alone.
	 */
	synchronize_rcu();

281
	/* Run all of the network namespace exit methods */
282 283 284
	list_for_each_entry_reverse(ops, &pernet_list, list)
		ops_exit_list(ops, &net_exit_list);

285
	/* Free the net generic variables */
286 287
	list_for_each_entry_reverse(ops, &pernet_list, list)
		ops_free_list(ops, &net_exit_list);
288 289 290 291 292 293 294 295 296

	mutex_unlock(&net_mutex);

	/* Ensure there are no outstanding rcu callbacks using this
	 * network namespace.
	 */
	rcu_barrier();

	/* Finally it is safe to free my network namespace structure */
297 298
	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
		list_del_init(&net->exit_list);
299 300
		net_free(net);
	}
301
}
302
static DECLARE_WORK(net_cleanup_work, cleanup_net);
303 304 305 306

/*
 * Queue @net for destruction.  The namespace is put on cleanup_list and
 * the actual teardown is deferred to cleanup_net() on netns_wq, so that
 * it runs in process context.
 */
void __put_net(struct net *net)
{
	unsigned long irqflags;

	spin_lock_irqsave(&cleanup_list_lock, irqflags);
	list_add(&net->cleanup_list, &cleanup_list);
	spin_unlock_irqrestore(&cleanup_list_lock, irqflags);

	/* Cleanup the network namespace in process context */
	queue_work(netns_wq, &net_cleanup_work);
}
EXPORT_SYMBOL_GPL(__put_net);

#else
/*
 * Without CONFIG_NET_NS no new namespace can be created: a CLONE_NEWNET
 * request yields ERR_PTR(-EINVAL), anything else returns @old_net as is.
 */
struct net *copy_net_ns(unsigned long flags, struct net *old_net)
{
	return (flags & CLONE_NEWNET) ? ERR_PTR(-EINVAL) : old_net;
}
#endif

326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345
/*
 * Look up the network namespace of the task with virtual pid @pid.
 *
 * Returns the result of get_net() on the task's net_ns, or
 * ERR_PTR(-ESRCH) when no such task exists or it has no nsproxy.
 * The lookup is done inside an RCU read-side section.
 */
struct net *get_net_ns_by_pid(pid_t pid)
{
	struct net *net = ERR_PTR(-ESRCH);
	struct nsproxy *nsproxy = NULL;
	struct task_struct *tsk;

	rcu_read_lock();
	tsk = find_task_by_vpid(pid);
	if (tsk)
		nsproxy = task_nsproxy(tsk);
	if (nsproxy)
		net = get_net(nsproxy->net_ns);
	rcu_read_unlock();

	return net;
}
EXPORT_SYMBOL_GPL(get_net_ns_by_pid);

346 347
/*
 * Boot-time initialisation: with CONFIG_NET_NS create the struct net slab
 * cache and the cleanup workqueue, then give init_net its generic array,
 * run setup_net() on it and link it into net_namespace_list.
 *
 * Any failure here panics; the function otherwise returns 0.
 */
static int __init net_ns_init(void)
{
	struct net_generic *init_gen;

#ifdef CONFIG_NET_NS
	net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
					SMP_CACHE_BYTES,
					SLAB_PANIC, NULL);

	/* Create workqueue for cleanup */
	netns_wq = create_singlethread_workqueue("netns");
	if (netns_wq == NULL)
		panic("Could not create netns workq");
#endif

	init_gen = net_alloc_generic();
	if (init_gen == NULL)
		panic("Could not allocate generic netns");

	rcu_assign_pointer(init_net.gen, init_gen);

	mutex_lock(&net_mutex);

	if (setup_net(&init_net) != 0)
		panic("Could not setup the initial network namespace");

	rtnl_lock();
	list_add_tail_rcu(&init_net.list, &net_namespace_list);
	rtnl_unlock();

	mutex_unlock(&net_mutex);

	return 0;
}

pure_initcall(net_ns_init);

382
#ifdef CONFIG_NET_NS
383 384
static int __register_pernet_operations(struct list_head *list,
					struct pernet_operations *ops)
385
{
386
	struct net *net;
387
	int error;
388
	LIST_HEAD(net_exit_list);
389 390

	list_add_tail(&ops->list, list);
391
	if (ops->init || (ops->id && ops->size)) {
392
		for_each_net(net) {
393
			error = ops_init(ops, net);
394 395
			if (error)
				goto out_undo;
396
			list_add_tail(&net->exit_list, &net_exit_list);
397 398
		}
	}
399
	return 0;
400 401 402 403

out_undo:
	/* If I have an error cleanup all namespaces I initialized */
	list_del(&ops->list);
404 405
	ops_exit_list(ops, &net_exit_list);
	ops_free_list(ops, &net_exit_list);
406
	return error;
407 408
}

409
static void __unregister_pernet_operations(struct pernet_operations *ops)
410 411
{
	struct net *net;
412
	LIST_HEAD(net_exit_list);
413 414

	list_del(&ops->list);
415 416 417 418
	for_each_net(net)
		list_add_tail(&net->exit_list, &net_exit_list);
	ops_exit_list(ops, &net_exit_list);
	ops_free_list(ops, &net_exit_list);
419 420
}

421 422
#else

423 424
static int __register_pernet_operations(struct list_head *list,
					struct pernet_operations *ops)
425
{
426 427 428 429 430 431
	int err = 0;
	err = ops_init(ops, &init_net);
	if (err)
		ops_free(ops, &init_net);
	return err;
	
432 433
}

434
static void __unregister_pernet_operations(struct pernet_operations *ops)
435
{
436 437 438 439
	LIST_HEAD(net_exit_list);
	list_add(&init_net.exit_list, &net_exit_list);
	ops_exit_list(ops, &net_exit_list);
	ops_free_list(ops, &net_exit_list);
440
}
441 442

#endif /* CONFIG_NET_NS */
443

444 445
/* Allocator for the ids stored in pernet_operations->id; ids start at 1. */
static DEFINE_IDA(net_generic_ids);

446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
static int register_pernet_operations(struct list_head *list,
				      struct pernet_operations *ops)
{
	int error;

	if (ops->id) {
again:
		error = ida_get_new_above(&net_generic_ids, 1, ops->id);
		if (error < 0) {
			if (error == -EAGAIN) {
				ida_pre_get(&net_generic_ids, GFP_KERNEL);
				goto again;
			}
			return error;
		}
	}
	error = __register_pernet_operations(list, ops);
463 464 465 466 467
	if (error) {
		rcu_barrier();
		if (ops->id)
			ida_remove(&net_generic_ids, *ops->id);
	}
468 469 470 471 472 473 474 475

	return error;
}

static void unregister_pernet_operations(struct pernet_operations *ops)
{
	
	__unregister_pernet_operations(ops);
476
	rcu_barrier();
477 478 479 480
	if (ops->id)
		ida_remove(&net_generic_ids, *ops->id);
}

481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514
/**
 *      register_pernet_subsys - register a network namespace subsystem
 *	@ops:  pernet operations structure for the subsystem
 *
 *	Register a subsystem whose init and exit functions are called
 *	when network namespaces are created and destroyed respectively.
 *
 *	On registration the init function is called for every existing
 *	network namespace, which gives kernel modules a race free view
 *	of the set of network namespaces.
 *
 *	When a new network namespace is created all of the init
 *	methods are called in the order in which they were registered.
 *
 *	When a network namespace is destroyed all of the exit methods
 *	are called in the reverse of the order with which they were
 *	registered.
 *
 *	Returns 0 on success or a negative error code.
 */
int register_pernet_subsys(struct pernet_operations *ops)
{
	int rc;

	mutex_lock(&net_mutex);
	/* Subsystems are inserted in front of the first device. */
	rc = register_pernet_operations(first_device, ops);
	mutex_unlock(&net_mutex);

	return rc;
}
EXPORT_SYMBOL_GPL(register_pernet_subsys);

/**
 *      unregister_pernet_subsys - unregister a network namespace subsystem
 *	@ops: pernet operations structure to manipulate
 *
 *	Remove the pernet operations structure from the list to be
515
 *	used when network namespaces are created or destroyed.  In
516 517 518
 *	addition run the exit method for all existing network
 *	namespaces.
 */
519
void unregister_pernet_subsys(struct pernet_operations *ops)
520 521
{
	mutex_lock(&net_mutex);
522
	unregister_pernet_operations(ops);
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562
	mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);

/**
 *      register_pernet_device - register a network namespace device
 *	@ops:  pernet operations structure for the device
 *
 *	Register a device whose init and exit functions are called
 *	when network namespaces are created and destroyed respectively.
 *
 *	On registration the init function is called for every existing
 *	network namespace, which gives kernel modules a race free view
 *	of the set of network namespaces.
 *
 *	When a new network namespace is created all of the init
 *	methods are called in the order in which they were registered.
 *
 *	When a network namespace is destroyed all of the exit methods
 *	are called in the reverse of the order with which they were
 *	registered.
 *
 *	Returns 0 on success or a negative error code.
 */
int register_pernet_device(struct pernet_operations *ops)
{
	int rc;

	mutex_lock(&net_mutex);

	/* Devices go to the tail of the list, after all subsystems. */
	rc = register_pernet_operations(&pernet_list, ops);

	/* The first device registered becomes the subsystem insertion point. */
	if (rc == 0 && first_device == &pernet_list)
		first_device = &ops->list;

	mutex_unlock(&net_mutex);

	return rc;
}
EXPORT_SYMBOL_GPL(register_pernet_device);

/**
 *      unregister_pernet_device - unregister a network namespace netdevice
 *	@ops: pernet operations structure to manipulate
 *
 *	Remove the pernet operations structure from the list that is
 *	used when network namespaces are created or destroyed.  In
 *	addition run the exit method for all existing network
 *	namespaces.
 */
void unregister_pernet_device(struct pernet_operations *ops)
{
	mutex_lock(&net_mutex);

	/* If @ops is the first device, move the marker to its successor. */
	if (first_device == &ops->list)
		first_device = ops->list.next;

	unregister_pernet_operations(ops);

	mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);