/*
 * linux/ipc/shm.c
 * Copyright (C) 1992, 1993 Krishna Balasubramanian
 *	 Many improvements/fixes by Bruno Haible.
 * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
 * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
 *
 * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
 * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
 * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
 * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com>
 * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
 * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com>
 *
 * support for audit of ipc object properties and permission changes
 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
 *
 * namespaces support
 * OpenVZ, SWsoft Inc.
 * Pavel Emelianov <xemul@openvz.org>
 *
 * Better ipc lock (kern_ipc_perm.lock) handling
 * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013.
 */

#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/shmem_fs.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
38
#include <linux/capability.h>
S
Stephen Rothwell 已提交
39
#include <linux/ptrace.h>
40
#include <linux/seq_file.h>
N
Nadia Derbey 已提交
41
#include <linux/rwsem.h>
K
Kirill Korotaev 已提交
42
#include <linux/nsproxy.h>
43
#include <linux/mount.h>
44
#include <linux/ipc_namespace.h>
S
Stephen Rothwell 已提交
45

L
Linus Torvalds 已提交
46 47 48 49
#include <asm/uaccess.h>

#include "util.h"

50 51 52 53 54 55 56 57 58
/*
 * Per-attach state stored in ->private_data of the wrapper file whose
 * f_op is shm_file_operations / shm_file_operations_huge.
 */
struct shm_file_data {
	/* ipc id passed to shm_lock() by shm_open() and shm_close() */
	int id;
	/* namespace the id is looked up in; released in shm_release() */
	struct ipc_namespace *ns;
	/* backing file that the f_op wrappers forward their calls to */
	struct file *file;
	/* vm_ops installed by the backing ->mmap, saved by shm_mmap() */
	const struct vm_operations_struct *vm_ops;
};

/* Access file->private_data as an lvalue of type struct shm_file_data *. */
#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))

59
static const struct file_operations shm_file_operations;
60
static const struct vm_operations_struct shm_vm_ops;
L
Linus Torvalds 已提交
61

62
/* The IPC_SHM_IDS entry of a namespace's ids[] array. */
#define shm_ids(ns)	((ns)->ids[IPC_SHM_IDS])
L
Linus Torvalds 已提交
63

K
Kirill Korotaev 已提交
64 65
/* Unlock a segment by calling ipc_unlock() on its embedded shm_perm. */
#define shm_unlock(shp)			\
	ipc_unlock(&(shp)->shm_perm)
L
Linus Torvalds 已提交
66

N
Nadia Derbey 已提交
67
static int newseg(struct ipc_namespace *, struct ipc_params *);
68 69
static void shm_open(struct vm_area_struct *vma);
static void shm_close(struct vm_area_struct *vma);
K
Kirill Korotaev 已提交
70
static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp);
L
Linus Torvalds 已提交
71
#ifdef CONFIG_PROC_FS
72
static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
L
Linus Torvalds 已提交
73 74
#endif

75
void shm_init_ns(struct ipc_namespace *ns)
K
Kirill Korotaev 已提交
76 77 78 79
{
	ns->shm_ctlmax = SHMMAX;
	ns->shm_ctlall = SHMALL;
	ns->shm_ctlmni = SHMMNI;
80
	ns->shm_rmid_forced = 0;
K
Kirill Korotaev 已提交
81
	ns->shm_tot = 0;
W
WANG Cong 已提交
82
	ipc_init_ids(&shm_ids(ns));
K
Kirill Korotaev 已提交
83 84
}

N
Nadia Derbey 已提交
85
/*
D
Davidlohr Bueso 已提交
86 87
 * Called with shm_ids.rwsem (writer) and the shp structure locked.
 * Only shm_ids.rwsem remains locked on exit.
N
Nadia Derbey 已提交
88
 */
89
static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
K
Kirill Korotaev 已提交
90
{
91 92 93
	struct shmid_kernel *shp;
	shp = container_of(ipcp, struct shmid_kernel, shm_perm);

K
Kirill Korotaev 已提交
94 95 96 97 98 99 100 101 102
	if (shp->shm_nattch){
		shp->shm_perm.mode |= SHM_DEST;
		/* Do not find it any more */
		shp->shm_perm.key = IPC_PRIVATE;
		shm_unlock(shp);
	} else
		shm_destroy(ns, shp);
}

103
#ifdef CONFIG_IPC_NS
K
Kirill Korotaev 已提交
104 105
void shm_exit_ns(struct ipc_namespace *ns)
{
106
	free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
S
Serge E. Hallyn 已提交
107
	idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
K
Kirill Korotaev 已提交
108
}
109
#endif
L
Linus Torvalds 已提交
110

111
/*
 * Initialise the shm state of init_ipc_ns.  Always succeeds.
 */
static int __init ipc_ns_init(void)
{
	shm_init_ns(&init_ipc_ns);

	return 0;
}

pure_initcall(ipc_ns_init);

/*
 * Register the "sysvipc/shm" proc interface.  The column header is wider
 * when BITS_PER_LONG is above 32; sysvipc_shm_proc_show() is the row
 * callback handed to ipc_init_proc_interface().
 */
void __init shm_init(void)
{
	ipc_init_proc_interface("sysvipc/shm",
#if BITS_PER_LONG > 32
				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   rss                  swap\n",
#else
				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        rss       swap\n",
#endif
				IPC_SHM_IDS, sysvipc_shm_proc_show);
}

130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
/*
 * Look up a segment through ipc_obtain_object().  Returns the enclosing
 * shmid_kernel, or the lookup's error pointer re-typed with ERR_CAST().
 */
static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id)
{
	struct kern_ipc_perm *perm;

	perm = ipc_obtain_object(&shm_ids(ns), id);
	if (!IS_ERR(perm))
		return container_of(perm, struct shmid_kernel, shm_perm);

	return ERR_CAST(perm);
}

/*
 * Same as shm_obtain_object(), but the lookup goes through
 * ipc_obtain_object_check().
 */
static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id)
{
	struct kern_ipc_perm *perm;

	perm = ipc_obtain_object_check(&shm_ids(ns), id);
	if (!IS_ERR(perm))
		return container_of(perm, struct shmid_kernel, shm_perm);

	return ERR_CAST(perm);
}

N
Nadia Derbey 已提交
150
/*
D
Davidlohr Bueso 已提交
151
 * shm_lock_(check_) routines are called in the paths where the rwsem
N
Nadia Derbey 已提交
152
 * is not necessarily held.
N
Nadia Derbey 已提交
153
 */
154
static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
L
Linus Torvalds 已提交
155
{
N
Nadia Derbey 已提交
156 157
	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);

158 159 160
	if (IS_ERR(ipcp))
		return (struct shmid_kernel *)ipcp;

N
Nadia Derbey 已提交
161
	return container_of(ipcp, struct shmid_kernel, shm_perm);
162 163
}

164 165 166
static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
{
	rcu_read_lock();
167
	ipc_lock_object(&ipcp->shm_perm);
168 169
}

170 171 172
static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns,
						int id)
{
N
Nadia Derbey 已提交
173 174
	struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id);

175 176 177
	if (IS_ERR(ipcp))
		return (struct shmid_kernel *)ipcp;

N
Nadia Derbey 已提交
178
	return container_of(ipcp, struct shmid_kernel, shm_perm);
L
Linus Torvalds 已提交
179 180
}

N
Nadia Derbey 已提交
181
static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
L
Linus Torvalds 已提交
182
{
N
Nadia Derbey 已提交
183
	ipc_rmid(&shm_ids(ns), &s->shm_perm);
L
Linus Torvalds 已提交
184 185 186
}


187 188
/* This is called by fork, once for every shm attach. */
static void shm_open(struct vm_area_struct *vma)
K
Kirill Korotaev 已提交
189
{
190 191
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
L
Linus Torvalds 已提交
192 193
	struct shmid_kernel *shp;

194
	shp = shm_lock(sfd->ns, sfd->id);
195
	BUG_ON(IS_ERR(shp));
L
Linus Torvalds 已提交
196
	shp->shm_atim = get_seconds();
197
	shp->shm_lprid = task_tgid_vnr(current);
L
Linus Torvalds 已提交
198 199 200 201 202 203 204
	shp->shm_nattch++;
	shm_unlock(shp);
}

/*
 * shm_destroy - free the struct shmid_kernel
 *
N
Nadia Derbey 已提交
205
 * @ns: namespace
L
Linus Torvalds 已提交
206 207
 * @shp: struct to free
 *
D
Davidlohr Bueso 已提交
208
 * It has to be called with shp and shm_ids.rwsem (writer) locked,
L
Linus Torvalds 已提交
209 210
 * but returns with shp unlocked and freed.
 */
K
Kirill Korotaev 已提交
211
static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
L
Linus Torvalds 已提交
212
{
K
Kirill Korotaev 已提交
213
	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
N
Nadia Derbey 已提交
214
	shm_rmid(ns, shp);
L
Linus Torvalds 已提交
215 216 217
	shm_unlock(shp);
	if (!is_file_hugepages(shp->shm_file))
		shmem_lock(shp->shm_file, 0, shp->mlock_user);
218
	else if (shp->mlock_user)
A
Al Viro 已提交
219
		user_shm_unlock(file_inode(shp->shm_file)->i_size,
L
Linus Torvalds 已提交
220 221 222 223 224 225
						shp->mlock_user);
	fput (shp->shm_file);
	security_shm_free(shp);
	ipc_rcu_putref(shp);
}

226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
/*
 * shm_may_destroy - identifies whether shm segment should be destroyed now
 *
 * Returns true if and only if there are no active users of the segment and
 * one of the following is true:
 *
 * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
 *
 * 2) sysctl kernel.shm_rmid_forced is set to 1.
 */
static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
	return (shp->shm_nattch == 0) &&
	       (ns->shm_rmid_forced ||
		(shp->shm_perm.mode & SHM_DEST));
}

L
Linus Torvalds 已提交
243
/*
244
 * remove the attach descriptor vma.
L
Linus Torvalds 已提交
245 246 247 248
 * free memory for segment if it is marked destroyed.
 * The descriptor has already been removed from the current->mm->mmap list
 * and will later be kfree()d.
 */
249
static void shm_close(struct vm_area_struct *vma)
L
Linus Torvalds 已提交
250
{
251 252
	struct file * file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
L
Linus Torvalds 已提交
253
	struct shmid_kernel *shp;
254
	struct ipc_namespace *ns = sfd->ns;
K
Kirill Korotaev 已提交
255

D
Davidlohr Bueso 已提交
256
	down_write(&shm_ids(ns).rwsem);
L
Linus Torvalds 已提交
257
	/* remove from the list of attaches of the shm segment */
N
Nadia Derbey 已提交
258
	shp = shm_lock(ns, sfd->id);
259
	BUG_ON(IS_ERR(shp));
260
	shp->shm_lprid = task_tgid_vnr(current);
L
Linus Torvalds 已提交
261 262
	shp->shm_dtim = get_seconds();
	shp->shm_nattch--;
263 264 265 266
	if (shm_may_destroy(ns, shp))
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);
D
Davidlohr Bueso 已提交
267
	up_write(&shm_ids(ns).rwsem);
268 269
}

D
Davidlohr Bueso 已提交
270
/* Called with ns->shm_ids(ns).rwsem locked */
271 272 273
static int shm_try_destroy_current(int id, void *p, void *data)
{
	struct ipc_namespace *ns = data;
274 275
	struct kern_ipc_perm *ipcp = p;
	struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
276

277
	if (shp->shm_creator != current)
278 279 280 281 282 283 284 285 286 287 288 289 290
		return 0;

	/*
	 * Mark it as orphaned to destroy the segment when
	 * kernel.shm_rmid_forced is changed.
	 * It is noop if the following shm_may_destroy() returns true.
	 */
	shp->shm_creator = NULL;

	/*
	 * Don't even try to destroy it.  If shm_rmid_forced=0 and IPC_RMID
	 * is not set, it shouldn't be deleted here.
	 */
291
	if (!ns->shm_rmid_forced)
292 293
		return 0;

294 295
	if (shm_may_destroy(ns, shp)) {
		shm_lock_by_ptr(shp);
296
		shm_destroy(ns, shp);
297
	}
298 299 300
	return 0;
}

D
Davidlohr Bueso 已提交
301
/* Called with ns->shm_ids(ns).rwsem locked */
302 303 304
static int shm_try_destroy_orphaned(int id, void *p, void *data)
{
	struct ipc_namespace *ns = data;
305 306
	struct kern_ipc_perm *ipcp = p;
	struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
307 308 309 310

	/*
	 * We want to destroy segments without users and with already
	 * exit'ed originating process.
311
	 *
D
Davidlohr Bueso 已提交
312
	 * As shp->* are changed under rwsem, it's safe to skip shp locking.
313
	 */
314
	if (shp->shm_creator != NULL)
315 316
		return 0;

317 318
	if (shm_may_destroy(ns, shp)) {
		shm_lock_by_ptr(shp);
K
Kirill Korotaev 已提交
319
		shm_destroy(ns, shp);
320
	}
321 322 323 324 325
	return 0;
}

void shm_destroy_orphaned(struct ipc_namespace *ns)
{
D
Davidlohr Bueso 已提交
326
	down_write(&shm_ids(ns).rwsem);
V
Vasiliy Kulikov 已提交
327
	if (shm_ids(ns).in_use)
328
		idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
D
Davidlohr Bueso 已提交
329
	up_write(&shm_ids(ns).rwsem);
330 331 332 333 334
}


void exit_shm(struct task_struct *task)
{
335
	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
336

V
Vasiliy Kulikov 已提交
337 338 339
	if (shm_ids(ns).in_use == 0)
		return;

340
	/* Destroy all already created segments, but not mapped yet */
D
Davidlohr Bueso 已提交
341
	down_write(&shm_ids(ns).rwsem);
V
Vasiliy Kulikov 已提交
342
	if (shm_ids(ns).in_use)
343
		idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
D
Davidlohr Bueso 已提交
344
	up_write(&shm_ids(ns).rwsem);
L
Linus Torvalds 已提交
345 346
}

N
Nick Piggin 已提交
347
static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
348 349 350 351
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);

N
Nick Piggin 已提交
352
	return sfd->vm_ops->fault(vma, vmf);
353 354 355
}

#ifdef CONFIG_NUMA
A
Adrian Bunk 已提交
356
static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
357 358 359 360 361 362 363 364 365
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	int err = 0;
	if (sfd->vm_ops->set_policy)
		err = sfd->vm_ops->set_policy(vma, new);
	return err;
}

A
Adrian Bunk 已提交
366 367
static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
					unsigned long addr)
368 369 370 371 372 373 374
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	struct mempolicy *pol = NULL;

	if (sfd->vm_ops->get_policy)
		pol = sfd->vm_ops->get_policy(vma, addr);
375
	else if (vma->vm_policy)
376
		pol = vma->vm_policy;
377

378 379 380 381
	return pol;
}
#endif

L
Linus Torvalds 已提交
382 383
static int shm_mmap(struct file * file, struct vm_area_struct * vma)
{
384
	struct shm_file_data *sfd = shm_file_data(file);
385 386
	int ret;

387 388 389 390
	ret = sfd->file->f_op->mmap(sfd->file, vma);
	if (ret != 0)
		return ret;
	sfd->vm_ops = vma->vm_ops;
D
David Howells 已提交
391
#ifdef CONFIG_MMU
392
	BUG_ON(!sfd->vm_ops->fault);
D
David Howells 已提交
393
#endif
394 395
	vma->vm_ops = &shm_vm_ops;
	shm_open(vma);
396 397

	return ret;
L
Linus Torvalds 已提交
398 399
}

K
Kirill Korotaev 已提交
400 401
static int shm_release(struct inode *ino, struct file *file)
{
402
	struct shm_file_data *sfd = shm_file_data(file);
K
Kirill Korotaev 已提交
403

404 405 406
	put_ipc_ns(sfd->ns);
	shm_file_data(file) = NULL;
	kfree(sfd);
K
Kirill Korotaev 已提交
407 408 409
	return 0;
}

410
/*
 * Forward fsync to the backing file.  Returns -EINVAL when the backing
 * f_op has no fsync.
 */
static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct file *backing = shm_file_data(file)->file;

	if (backing->f_op->fsync)
		return backing->f_op->fsync(backing, start, end, datasync);

	return -EINVAL;
}

419 420 421 422 423 424 425 426 427 428
/*
 * Forward fallocate to the backing file's f_op.  Returns -EOPNOTSUPP when
 * the backing f_op has no fallocate.
 *
 * NOTE(review): the wrapper @file, not the backing file, is what gets
 * passed to the backing ->fallocate, unlike shm_fsync() and shm_mmap().
 * Confirm this is intended before changing it.
 */
static long shm_fallocate(struct file *file, int mode, loff_t offset,
			  loff_t len)
{
	struct file *backing = shm_file_data(file)->file;

	if (backing->f_op->fallocate)
		return backing->f_op->fallocate(file, mode, offset, len);

	return -EOPNOTSUPP;
}

429 430 431 432 433
static unsigned long shm_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len, unsigned long pgoff,
	unsigned long flags)
{
	struct shm_file_data *sfd = shm_file_data(file);
434 435
	return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
						pgoff, flags);
436 437
}

438
static const struct file_operations shm_file_operations = {
K
Kirill Korotaev 已提交
439
	.mmap		= shm_mmap,
440
	.fsync		= shm_fsync,
K
Kirill Korotaev 已提交
441
	.release	= shm_release,
D
David Howells 已提交
442 443 444
#ifndef CONFIG_MMU
	.get_unmapped_area	= shm_get_unmapped_area,
#endif
445
	.llseek		= noop_llseek,
446
	.fallocate	= shm_fallocate,
447 448 449 450 451 452
};

static const struct file_operations shm_file_operations_huge = {
	.mmap		= shm_mmap,
	.fsync		= shm_fsync,
	.release	= shm_release,
453
	.get_unmapped_area	= shm_get_unmapped_area,
454
	.llseek		= noop_llseek,
455
	.fallocate	= shm_fallocate,
L
Linus Torvalds 已提交
456 457
};

458 459 460 461 462
/* Non-zero iff @file's f_op is shm_file_operations_huge. */
int is_file_shm_hugepages(struct file *file)
{
	const struct file_operations *fops = file->f_op;

	return fops == &shm_file_operations_huge;
}

463
static const struct vm_operations_struct shm_vm_ops = {
L
Linus Torvalds 已提交
464 465
	.open	= shm_open,	/* callback for a new vm-area open */
	.close	= shm_close,	/* callback for when the vm-area is released */
466
	.fault	= shm_fault,
467 468 469
#if defined(CONFIG_NUMA)
	.set_policy = shm_set_policy,
	.get_policy = shm_get_policy,
L
Linus Torvalds 已提交
470 471 472
#endif
};

N
Nadia Derbey 已提交
473 474 475 476 477
/**
 * newseg - Create a new shared memory segment
 * @ns: namespace
 * @params: ptr to the structure that contains key, size and shmflg
 *
D
Davidlohr Bueso 已提交
478
 * Called with shm_ids.rwsem held as a writer.
N
Nadia Derbey 已提交
479 480
 */

N
Nadia Derbey 已提交
481
static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
L
Linus Torvalds 已提交
482
{
N
Nadia Derbey 已提交
483 484 485
	key_t key = params->key;
	int shmflg = params->flg;
	size_t size = params->u.size;
L
Linus Torvalds 已提交
486 487
	int error;
	struct shmid_kernel *shp;
488
	size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
L
Linus Torvalds 已提交
489 490 491
	struct file * file;
	char name[13];
	int id;
492
	vm_flags_t acctflag = 0;
L
Linus Torvalds 已提交
493

K
Kirill Korotaev 已提交
494
	if (size < SHMMIN || size > ns->shm_ctlmax)
L
Linus Torvalds 已提交
495 496
		return -EINVAL;

497
	if (ns->shm_tot + numpages > ns->shm_ctlall)
L
Linus Torvalds 已提交
498 499 500 501 502 503 504
		return -ENOSPC;

	shp = ipc_rcu_alloc(sizeof(*shp));
	if (!shp)
		return -ENOMEM;

	shp->shm_perm.key = key;
A
Andrew Morton 已提交
505
	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
L
Linus Torvalds 已提交
506 507 508 509 510 511 512 513 514
	shp->mlock_user = NULL;

	shp->shm_perm.security = NULL;
	error = security_shm_alloc(shp);
	if (error) {
		ipc_rcu_putref(shp);
		return error;
	}

515
	sprintf (name, "SYSV%08x", key);
L
Linus Torvalds 已提交
516
	if (shmflg & SHM_HUGETLB) {
517
		struct hstate *hs;
518 519
		size_t hugesize;

520
		hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
521 522 523 524 525
		if (!hs) {
			error = -EINVAL;
			goto no_file;
		}
		hugesize = ALIGN(size, huge_page_size(hs));
526

527 528 529
		/* hugetlb_file_setup applies strict accounting */
		if (shmflg & SHM_NORESERVE)
			acctflag = VM_NORESERVE;
530
		file = hugetlb_file_setup(name, hugesize, acctflag,
531 532
				  &shp->mlock_user, HUGETLB_SHMFS_INODE,
				(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
L
Linus Torvalds 已提交
533
	} else {
534 535 536 537 538 539
		/*
		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
	 	 * if it's asked for.
		 */
		if  ((shmflg & SHM_NORESERVE) &&
				sysctl_overcommit_memory != OVERCOMMIT_NEVER)
540
			acctflag = VM_NORESERVE;
541
		file = shmem_file_setup(name, size, acctflag);
L
Linus Torvalds 已提交
542 543 544 545 546
	}
	error = PTR_ERR(file);
	if (IS_ERR(file))
		goto no_file;

547
	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
548 549
	if (id < 0) {
		error = id;
L
Linus Torvalds 已提交
550
		goto no_id;
551
	}
L
Linus Torvalds 已提交
552

553
	shp->shm_cprid = task_tgid_vnr(current);
L
Linus Torvalds 已提交
554 555 556 557 558 559
	shp->shm_lprid = 0;
	shp->shm_atim = shp->shm_dtim = 0;
	shp->shm_ctim = get_seconds();
	shp->shm_segsz = size;
	shp->shm_nattch = 0;
	shp->shm_file = file;
560
	shp->shm_creator = current;
561

562 563 564 565
	/*
	 * shmid gets reported as "inode#" in /proc/pid/maps.
	 * proc-ps tools use this. Changing this will break them.
	 */
A
Al Viro 已提交
566
	file_inode(file)->i_ino = shp->shm_perm.id;
567

K
Kirill Korotaev 已提交
568
	ns->shm_tot += numpages;
N
Nadia Derbey 已提交
569
	error = shp->shm_perm.id;
570

571
	ipc_unlock_object(&shp->shm_perm);
572
	rcu_read_unlock();
N
Nadia Derbey 已提交
573
	return error;
L
Linus Torvalds 已提交
574 575

no_id:
576
	if (is_file_hugepages(file) && shp->mlock_user)
577
		user_shm_unlock(size, shp->mlock_user);
L
Linus Torvalds 已提交
578 579 580 581 582 583 584
	fput(file);
no_file:
	security_shm_free(shp);
	ipc_rcu_putref(shp);
	return error;
}

N
Nadia Derbey 已提交
585
/*
D
Davidlohr Bueso 已提交
586
 * Called with shm_ids.rwsem and ipcp locked.
N
Nadia Derbey 已提交
587
 */
N
Nadia Derbey 已提交
588
static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg)
N
Nadia Derbey 已提交
589
{
N
Nadia Derbey 已提交
590 591 592 593
	struct shmid_kernel *shp;

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
	return security_shm_associate(shp, shmflg);
N
Nadia Derbey 已提交
594 595
}

N
Nadia Derbey 已提交
596
/*
D
Davidlohr Bueso 已提交
597
 * Called with shm_ids.rwsem and ipcp locked.
N
Nadia Derbey 已提交
598
 */
N
Nadia Derbey 已提交
599 600
static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
				struct ipc_params *params)
N
Nadia Derbey 已提交
601
{
N
Nadia Derbey 已提交
602 603 604 605
	struct shmid_kernel *shp;

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
	if (shp->shm_segsz < params->u.size)
N
Nadia Derbey 已提交
606 607 608 609 610
		return -EINVAL;

	return 0;
}

611
/*
 * shmget(2): bundle the arguments and the shm callbacks and let ipcget()
 * do the lookup or creation in the caller's ipc namespace.
 */
SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
{
	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
	struct ipc_params shm_params;
	struct ipc_ops shm_ops;

	shm_params.u.size = size;
	shm_params.flg = shmflg;
	shm_params.key = key;

	shm_ops.more_checks = shm_more_checks;
	shm_ops.associate = shm_security;
	shm_ops.getnew = newseg;

	return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
}

static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version)
{
	switch(version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	    {
		struct shmid_ds out;

639
		memset(&out, 0, sizeof(out));
L
Linus Torvalds 已提交
640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655
		ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm);
		out.shm_segsz	= in->shm_segsz;
		out.shm_atime	= in->shm_atime;
		out.shm_dtime	= in->shm_dtime;
		out.shm_ctime	= in->shm_ctime;
		out.shm_cpid	= in->shm_cpid;
		out.shm_lpid	= in->shm_lpid;
		out.shm_nattch	= in->shm_nattch;

		return copy_to_user(buf, &out, sizeof(out));
	    }
	default:
		return -EINVAL;
	}
}

656 657
/*
 * Read a shmid description from user space into @out.  IPC_64 copies the
 * whole structure; IPC_OLD reads a struct shmid_ds and takes over only
 * uid, gid and mode.  Returns 0 on success, -EFAULT on a failed copy and
 * -EINVAL for any other version (both converted to unsigned long).
 */
static inline unsigned long
copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
{
	if (version == IPC_64)
		return copy_from_user(out, buf, sizeof(*out)) ? -EFAULT : 0;

	if (version == IPC_OLD) {
		struct shmid_ds old;

		if (copy_from_user(&old, buf, sizeof(old)))
			return -EFAULT;

		out->shm_perm.mode	= old.shm_perm.mode;
		out->shm_perm.gid	= old.shm_perm.gid;
		out->shm_perm.uid	= old.shm_perm.uid;

		return 0;
	}

	return -EINVAL;
}

/*
 * Copy a shminfo64 to user space in the layout selected by @version.
 * For IPC_OLD the values are narrowed into a struct shminfo, with shmmax
 * clamped to INT_MAX.  Returns copy_to_user()'s result, or -EINVAL
 * (converted to unsigned long) for any other version.
 */
static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version)
{
	if (version == IPC_64)
		return copy_to_user(buf, in, sizeof(*in));

	if (version == IPC_OLD) {
		struct shminfo out;

		out.shmmax	= (in->shmmax > INT_MAX) ? INT_MAX : (int)in->shmmax;
		out.shmmin	= in->shmmin;
		out.shmmni	= in->shmmni;
		out.shmseg	= in->shmseg;
		out.shmall	= in->shmall;

		return copy_to_user(buf, &out, sizeof(out));
	}

	return -EINVAL;
}

708 709
/*
 * Calculate and add used RSS and swap pages of a shm.
D
Davidlohr Bueso 已提交
710
 * Called with shm_ids.rwsem held as a reader
711 712 713 714 715 716
 */
static void shm_add_rss_swap(struct shmid_kernel *shp,
	unsigned long *rss_add, unsigned long *swp_add)
{
	struct inode *inode;

A
Al Viro 已提交
717
	inode = file_inode(shp->shm_file);
718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735

	if (is_file_hugepages(shp->shm_file)) {
		struct address_space *mapping = inode->i_mapping;
		struct hstate *h = hstate_file(shp->shm_file);
		*rss_add += pages_per_huge_page(h) * mapping->nrpages;
	} else {
#ifdef CONFIG_SHMEM
		struct shmem_inode_info *info = SHMEM_I(inode);
		spin_lock(&info->lock);
		*rss_add += inode->i_mapping->nrpages;
		*swp_add += info->swapped;
		spin_unlock(&info->lock);
#else
		*rss_add += inode->i_mapping->nrpages;
#endif
	}
}

N
Nadia Derbey 已提交
736
/*
D
Davidlohr Bueso 已提交
737
 * Called with shm_ids.rwsem held as a reader
N
Nadia Derbey 已提交
738
 */
K
Kirill Korotaev 已提交
739 740
static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
		unsigned long *swp)
L
Linus Torvalds 已提交
741
{
N
Nadia Derbey 已提交
742 743
	int next_id;
	int total, in_use;
L
Linus Torvalds 已提交
744 745 746 747

	*rss = 0;
	*swp = 0;

N
Nadia Derbey 已提交
748 749 750
	in_use = shm_ids(ns).in_use;

	for (total = 0, next_id = 0; total < in_use; next_id++) {
751
		struct kern_ipc_perm *ipc;
L
Linus Torvalds 已提交
752 753
		struct shmid_kernel *shp;

754 755
		ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
		if (ipc == NULL)
L
Linus Torvalds 已提交
756
			continue;
757
		shp = container_of(ipc, struct shmid_kernel, shm_perm);
L
Linus Torvalds 已提交
758

759
		shm_add_rss_swap(shp, rss, swp);
N
Nadia Derbey 已提交
760 761

		total++;
L
Linus Torvalds 已提交
762 763 764
	}
}

765
/*
 * This function handles some shmctl commands which require the rwsem
 * to be held in write mode.
 * NOTE: no locks must be held, the rwsem is taken inside this function.
 *
 * Handles IPC_RMID and IPC_SET; any other command yields -EINVAL.
 * Returns 0 on success or a negative errno.
 */
static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
		       struct shmid_ds __user *buf, int version)
{
	struct kern_ipc_perm *ipcp;
	struct shmid64_ds shmid64;
	struct shmid_kernel *shp;
	int err;

	/* fetch the user's data before any lock is taken */
	if (cmd == IPC_SET && copy_shmid_from_user(&shmid64, buf, version))
		return -EFAULT;

	down_write(&shm_ids(ns).rwsem);
	rcu_read_lock();

	ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd,
				      &shmid64.shm_perm, 0);
	if (IS_ERR(ipcp)) {
		err = PTR_ERR(ipcp);
		goto out_unlock1;
	}

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);

	err = security_shm_shmctl(shp, cmd);
	if (err)
		goto out_unlock1;

	if (cmd == IPC_RMID) {
		ipc_lock_object(&shp->shm_perm);
		/* do_shm_rmid unlocks the ipc object and rcu */
		do_shm_rmid(ns, ipcp);
		goto out_up;
	}

	if (cmd != IPC_SET) {
		err = -EINVAL;
		goto out_unlock1;
	}

	/* IPC_SET: ctime only changes when the permission update succeeds */
	ipc_lock_object(&shp->shm_perm);
	err = ipc_update_perm(&shmid64.shm_perm, ipcp);
	if (!err)
		shp->shm_ctim = get_seconds();
	ipc_unlock_object(&shp->shm_perm);

out_unlock1:
	rcu_read_unlock();
out_up:
	up_write(&shm_ids(ns).rwsem);
	return err;
}

826 827
/*
 * Handle the shmctl commands that never take the per-object lock:
 * IPC_INFO, SHM_INFO, SHM_STAT and IPC_STAT.
 *
 * IPC_INFO and SHM_INFO return ipc_get_maxid()'s result, with negative
 * values reported as 0.  SHM_STAT returns the segment's id and IPC_STAT
 * returns 0.  Failures return a negative errno; other commands -EINVAL.
 */
static int shmctl_nolock(struct ipc_namespace *ns, int shmid,
			 int cmd, int version, void __user *buf)
{
	struct shmid_kernel *shp;
	int err;

	/* preliminary security checks for *_INFO */
	if (cmd == IPC_INFO || cmd == SHM_INFO) {
		err = security_shm_shmctl(NULL, cmd);
		if (err)
			return err;
	}

	switch (cmd) {
	case IPC_INFO:
	{
		struct shminfo64 shminfo;

		memset(&shminfo, 0, sizeof(shminfo));
		shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni;
		shminfo.shmmax = ns->shm_ctlmax;
		shminfo.shmall = ns->shm_ctlall;
		shminfo.shmmin = SHMMIN;

		if (copy_shminfo_to_user(buf, &shminfo, version))
			return -EFAULT;

		down_read(&shm_ids(ns).rwsem);
		err = ipc_get_maxid(&shm_ids(ns));
		up_read(&shm_ids(ns).rwsem);

		return err < 0 ? 0 : err;
	}
	case SHM_INFO:
	{
		struct shm_info shm_info;

		memset(&shm_info, 0, sizeof(shm_info));

		down_read(&shm_ids(ns).rwsem);
		shm_info.used_ids = shm_ids(ns).in_use;
		shm_get_stat(ns, &shm_info.shm_rss, &shm_info.shm_swp);
		shm_info.shm_tot = ns->shm_tot;
		shm_info.swap_attempts = 0;
		shm_info.swap_successes = 0;
		err = ipc_get_maxid(&shm_ids(ns));
		up_read(&shm_ids(ns).rwsem);

		if (copy_to_user(buf, &shm_info, sizeof(shm_info)))
			return -EFAULT;

		return err < 0 ? 0 : err;
	}
	case SHM_STAT:
	case IPC_STAT:
	{
		struct shmid64_ds tbuf;
		int result;

		rcu_read_lock();

		/* SHM_STAT uses the unchecked lookup, IPC_STAT the checked one */
		if (cmd == SHM_STAT)
			shp = shm_obtain_object(ns, shmid);
		else
			shp = shm_obtain_object_check(ns, shmid);
		if (IS_ERR(shp)) {
			err = PTR_ERR(shp);
			goto out_unlock;
		}
		result = (cmd == SHM_STAT) ? shp->shm_perm.id : 0;

		if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) {
			err = -EACCES;
			goto out_unlock;
		}

		err = security_shm_shmctl(shp, cmd);
		if (err)
			goto out_unlock;

		/* snapshot under RCU; the user copy happens after unlocking */
		memset(&tbuf, 0, sizeof(tbuf));
		kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
		tbuf.shm_segsz	= shp->shm_segsz;
		tbuf.shm_atime	= shp->shm_atim;
		tbuf.shm_dtime	= shp->shm_dtim;
		tbuf.shm_ctime	= shp->shm_ctim;
		tbuf.shm_cpid	= shp->shm_cprid;
		tbuf.shm_lpid	= shp->shm_lprid;
		tbuf.shm_nattch	= shp->shm_nattch;
		rcu_read_unlock();

		if (copy_shmid_to_user(buf, &tbuf, version))
			return -EFAULT;

		return result;
	}
	default:
		return -EINVAL;
	}

out_unlock:
	rcu_read_unlock();
	return err;
}

/*
 * shmctl(2) entry point.
 *
 * The *_INFO and *_STAT commands go to shmctl_nolock(), IPC_RMID and
 * IPC_SET go to shmctl_down(), and SHM_LOCK / SHM_UNLOCK are handled here
 * under the per-object lock.  Any other command, or a negative cmd or
 * shmid, yields -EINVAL.
 *
 * For SHM_LOCK / SHM_UNLOCK a caller without CAP_IPC_LOCK must be the
 * segment's owner or creator, and for SHM_LOCK must have a non-zero
 * RLIMIT_MEMLOCK; otherwise -EPERM.  Hugepage-backed segments and
 * SHM_UNLOCK of a segment that is not locked are successful no-ops.
 */
SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
{
	struct shmid_kernel *shp;
	int err, version;
	struct ipc_namespace *ns;

	if (cmd < 0 || shmid < 0)
		return -EINVAL;

	version = ipc_parse_version(&cmd);
	ns = current->nsproxy->ipc_ns;

	switch (cmd) {
	case IPC_INFO:
	case SHM_INFO:
	case SHM_STAT:
	case IPC_STAT:
		return shmctl_nolock(ns, shmid, cmd, version, buf);
	case IPC_RMID:
	case IPC_SET:
		return shmctl_down(ns, shmid, cmd, buf, version);
	case SHM_LOCK:
	case SHM_UNLOCK:
	{
		struct file *shm_file;

		rcu_read_lock();
		shp = shm_obtain_object_check(ns, shmid);
		if (IS_ERR(shp)) {
			err = PTR_ERR(shp);
			goto out_unlock1;
		}

		audit_ipc_obj(&(shp->shm_perm));
		err = security_shm_shmctl(shp, cmd);
		if (err)
			goto out_unlock1;

		ipc_lock_object(&shp->shm_perm);
		if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
			kuid_t euid = current_euid();

			/*
			 * Set -EPERM only on the branches that actually fail.
			 * Presetting it here would leak into the success
			 * paths below that never assign err, so an owner's
			 * successful SHM_UNLOCK would report -EPERM.
			 */
			if (!uid_eq(euid, shp->shm_perm.uid) &&
			    !uid_eq(euid, shp->shm_perm.cuid)) {
				err = -EPERM;
				goto out_unlock0;
			}
			if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) {
				err = -EPERM;
				goto out_unlock0;
			}
		}

		/* nothing to do for hugepage files; err is still 0 here */
		shm_file = shp->shm_file;
		if (is_file_hugepages(shm_file))
			goto out_unlock0;

		if (cmd == SHM_LOCK) {
			struct user_struct *user = current_user();
			err = shmem_lock(shm_file, 1, user);
			if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
				shp->shm_perm.mode |= SHM_LOCKED;
				shp->mlock_user = user;
			}
			goto out_unlock0;
		}

		/* SHM_UNLOCK */
		if (!(shp->shm_perm.mode & SHM_LOCKED))
			goto out_unlock0;
		shmem_lock(shm_file, 0, shp->mlock_user);
		shp->shm_perm.mode &= ~SHM_LOCKED;
		shp->mlock_user = NULL;
		/* hold the file across the unlocked shmem_unlock_mapping() call */
		get_file(shm_file);
		ipc_unlock_object(&shp->shm_perm);
		rcu_read_unlock();
		shmem_unlock_mapping(shm_file->f_mapping);

		fput(shm_file);
		return err;
	}
	default:
		return -EINVAL;
	}

out_unlock0:
	ipc_unlock_object(&shp->shm_perm);
out_unlock1:
	rcu_read_unlock();
	return err;
}

/*
 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
 *
 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
 * "raddr" thing points to kernel space, and there has to be a wrapper around
 * this.
 */
W
Will Deacon 已提交
1035 1036
long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
	      unsigned long shmlba)
L
Linus Torvalds 已提交
1037 1038 1039 1040 1041 1042 1043 1044 1045
{
	struct shmid_kernel *shp;
	unsigned long addr;
	unsigned long size;
	struct file * file;
	int    err;
	unsigned long flags;
	unsigned long prot;
	int acc_mode;
K
Kirill Korotaev 已提交
1046
	struct ipc_namespace *ns;
1047 1048
	struct shm_file_data *sfd;
	struct path path;
1049
	fmode_t f_mode;
1050
	unsigned long populate = 0;
L
Linus Torvalds 已提交
1051

1052 1053
	err = -EINVAL;
	if (shmid < 0)
L
Linus Torvalds 已提交
1054
		goto out;
1055
	else if ((addr = (ulong)shmaddr)) {
W
Will Deacon 已提交
1056
		if (addr & (shmlba - 1)) {
L
Linus Torvalds 已提交
1057
			if (shmflg & SHM_RND)
W
Will Deacon 已提交
1058
				addr &= ~(shmlba - 1);	   /* round down */
L
Linus Torvalds 已提交
1059 1060 1061 1062
			else
#ifndef __ARCH_FORCE_SHMLBA
				if (addr & ~PAGE_MASK)
#endif
1063
					goto out;
L
Linus Torvalds 已提交
1064 1065 1066 1067
		}
		flags = MAP_SHARED | MAP_FIXED;
	} else {
		if ((shmflg & SHM_REMAP))
1068
			goto out;
L
Linus Torvalds 已提交
1069 1070 1071 1072 1073 1074 1075

		flags = MAP_SHARED;
	}

	if (shmflg & SHM_RDONLY) {
		prot = PROT_READ;
		acc_mode = S_IRUGO;
1076
		f_mode = FMODE_READ;
L
Linus Torvalds 已提交
1077 1078 1079
	} else {
		prot = PROT_READ | PROT_WRITE;
		acc_mode = S_IRUGO | S_IWUGO;
1080
		f_mode = FMODE_READ | FMODE_WRITE;
L
Linus Torvalds 已提交
1081 1082 1083 1084 1085 1086 1087 1088 1089 1090
	}
	if (shmflg & SHM_EXEC) {
		prot |= PROT_EXEC;
		acc_mode |= S_IXUGO;
	}

	/*
	 * We cannot rely on the fs check since SYSV IPC does have an
	 * additional creator id...
	 */
K
Kirill Korotaev 已提交
1091
	ns = current->nsproxy->ipc_ns;
1092 1093
	rcu_read_lock();
	shp = shm_obtain_object_check(ns, shmid);
1094 1095
	if (IS_ERR(shp)) {
		err = PTR_ERR(shp);
1096
		goto out_unlock;
1097
	}
1098 1099

	err = -EACCES;
1100
	if (ipcperms(ns, &shp->shm_perm, acc_mode))
1101
		goto out_unlock;
L
Linus Torvalds 已提交
1102 1103

	err = security_shm_shmat(shp, shmaddr, shmflg);
1104 1105 1106
	if (err)
		goto out_unlock;

1107
	ipc_lock_object(&shp->shm_perm);
1108 1109
	path = shp->shm_file->f_path;
	path_get(&path);
L
Linus Torvalds 已提交
1110
	shp->shm_nattch++;
1111
	size = i_size_read(path.dentry->d_inode);
1112 1113
	ipc_unlock_object(&shp->shm_perm);
	rcu_read_unlock();
L
Linus Torvalds 已提交
1114

1115 1116
	err = -ENOMEM;
	sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
1117 1118 1119 1120
	if (!sfd) {
		path_put(&path);
		goto out_nattch;
	}
1121

1122 1123
	file = alloc_file(&path, f_mode,
			  is_file_hugepages(shp->shm_file) ?
1124 1125
				&shm_file_operations_huge :
				&shm_file_operations);
1126
	err = PTR_ERR(file);
1127 1128 1129 1130 1131
	if (IS_ERR(file)) {
		kfree(sfd);
		path_put(&path);
		goto out_nattch;
	}
1132 1133 1134

	file->private_data = sfd;
	file->f_mapping = shp->shm_file->f_mapping;
N
Nadia Derbey 已提交
1135
	sfd->id = shp->shm_perm.id;
1136 1137 1138 1139
	sfd->ns = get_ipc_ns(ns);
	sfd->file = shp->shm_file;
	sfd->vm_ops = NULL;

1140 1141 1142 1143
	err = security_mmap_file(file, prot, flags);
	if (err)
		goto out_fput;

L
Linus Torvalds 已提交
1144 1145
	down_write(&current->mm->mmap_sem);
	if (addr && !(shmflg & SHM_REMAP)) {
1146
		err = -EINVAL;
L
Linus Torvalds 已提交
1147 1148 1149 1150 1151 1152 1153 1154 1155 1156
		if (find_vma_intersection(current->mm, addr, addr + size))
			goto invalid;
		/*
		 * If shm segment goes below stack, make sure there is some
		 * space left for the stack to grow (at least 4 pages).
		 */
		if (addr < current->mm->start_stack &&
		    addr > current->mm->start_stack - size - PAGE_SIZE * 5)
			goto invalid;
	}
1157

1158 1159
	addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
	*raddr = addr;
1160
	err = 0;
1161 1162
	if (IS_ERR_VALUE(addr))
		err = (long)addr;
L
Linus Torvalds 已提交
1163 1164
invalid:
	up_write(&current->mm->mmap_sem);
1165
	if (populate)
1166
		mm_populate(addr, populate);
L
Linus Torvalds 已提交
1167

1168
out_fput:
1169 1170 1171
	fput(file);

out_nattch:
D
Davidlohr Bueso 已提交
1172
	down_write(&shm_ids(ns).rwsem);
N
Nadia Derbey 已提交
1173
	shp = shm_lock(ns, shmid);
1174
	BUG_ON(IS_ERR(shp));
L
Linus Torvalds 已提交
1175
	shp->shm_nattch--;
1176
	if (shm_may_destroy(ns, shp))
K
Kirill Korotaev 已提交
1177
		shm_destroy(ns, shp);
L
Linus Torvalds 已提交
1178 1179
	else
		shm_unlock(shp);
D
Davidlohr Bueso 已提交
1180
	up_write(&shm_ids(ns).rwsem);
L
Linus Torvalds 已提交
1181
	return err;
1182 1183

out_unlock:
1184
	rcu_read_unlock();
1185 1186
out:
	return err;
L
Linus Torvalds 已提交
1187 1188
}

1189
/*
 * shmat(2) entry point: thin wrapper that calls do_shmat() with the
 * default SHMLBA alignment and returns the attach address on success.
 */
SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
{
	unsigned long attach_addr;
	long rc = do_shmat(shmid, shmaddr, shmflg, &attach_addr, SHMLBA);

	if (rc != 0)
		return rc;

	/*
	 * The value returned below is an address, not a status code.
	 * NOTE(review): force_successful_syscall_return() presumably keeps
	 * the arch return path from treating it as an errno - confirm.
	 */
	force_successful_syscall_return();
	return (long)attach_addr;
}

L
Linus Torvalds 已提交
1201 1202 1203 1204
/*
 * detach and kill segment if marked destroyed.
 * The work is done in shm_close.
 */
1205
SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
L
Linus Torvalds 已提交
1206 1207
{
	struct mm_struct *mm = current->mm;
1208
	struct vm_area_struct *vma;
L
Linus Torvalds 已提交
1209 1210
	unsigned long addr = (unsigned long)shmaddr;
	int retval = -EINVAL;
1211 1212 1213 1214
#ifdef CONFIG_MMU
	loff_t size = 0;
	struct vm_area_struct *next;
#endif
L
Linus Torvalds 已提交
1215

1216 1217 1218
	if (addr & ~PAGE_MASK)
		return retval;

L
Linus Torvalds 已提交
1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242
	down_write(&mm->mmap_sem);

	/*
	 * This function tries to be smart and unmap shm segments that
	 * were modified by partial mlock or munmap calls:
	 * - It first determines the size of the shm segment that should be
	 *   unmapped: It searches for a vma that is backed by shm and that
	 *   started at address shmaddr. It records it's size and then unmaps
	 *   it.
	 * - Then it unmaps all shm vmas that started at shmaddr and that
	 *   are within the initially determined size.
	 * Errors from do_munmap are ignored: the function only fails if
	 * it's called with invalid parameters or if it's called to unmap
	 * a part of a vma. Both calls in this function are for full vmas,
	 * the parameters are directly copied from the vma itself and always
	 * valid - therefore do_munmap cannot fail. (famous last words?)
	 */
	/*
	 * If it had been mremap()'d, the starting address would not
	 * match the usual checks anyway. So assume all vma's are
	 * above the starting address given.
	 */
	vma = find_vma(mm, addr);

1243
#ifdef CONFIG_MMU
L
Linus Torvalds 已提交
1244 1245 1246 1247 1248 1249 1250 1251
	while (vma) {
		next = vma->vm_next;

		/*
		 * Check if the starting address would match, i.e. it's
		 * a fragment created by mprotect() and/or munmap(), or it
		 * otherwise it starts at this address with no hassles.
		 */
1252
		if ((vma->vm_ops == &shm_vm_ops) &&
L
Linus Torvalds 已提交
1253 1254 1255
			(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {


A
Al Viro 已提交
1256
			size = file_inode(vma->vm_file)->i_size;
L
Linus Torvalds 已提交
1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273
			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
			/*
			 * We discovered the size of the shm segment, so
			 * break out of here and fall through to the next
			 * loop that uses the size information to stop
			 * searching for matching vma's.
			 */
			retval = 0;
			vma = next;
			break;
		}
		vma = next;
	}

	/*
	 * We need look no further than the maximum address a fragment
	 * could possibly have landed at. Also cast things to loff_t to
L
Lucas De Marchi 已提交
1274
	 * prevent overflows and make comparisons vs. equal-width types.
L
Linus Torvalds 已提交
1275
	 */
1276
	size = PAGE_ALIGN(size);
L
Linus Torvalds 已提交
1277 1278 1279 1280
	while (vma && (loff_t)(vma->vm_end - addr) <= size) {
		next = vma->vm_next;

		/* finding a matching vma now does not alter retval */
1281
		if ((vma->vm_ops == &shm_vm_ops) &&
L
Linus Torvalds 已提交
1282 1283 1284 1285 1286 1287
			(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff)

			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
		vma = next;
	}

1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298
#else /* CONFIG_MMU */
	/* under NOMMU conditions, the exact address to be destroyed must be
	 * given */
	retval = -EINVAL;
	if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
		do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
		retval = 0;
	}

#endif

L
Linus Torvalds 已提交
1299 1300 1301 1302 1303
	up_write(&mm->mmap_sem);
	return retval;
}

#ifdef CONFIG_PROC_FS
1304
static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
L
Linus Torvalds 已提交
1305
{
1306
	struct user_namespace *user_ns = seq_user_ns(s);
1307
	struct shmid_kernel *shp = it;
1308 1309 1310
	unsigned long rss = 0, swp = 0;

	shm_add_rss_swap(shp, &rss, &swp);
L
Linus Torvalds 已提交
1311

1312 1313 1314 1315 1316
#if BITS_PER_LONG <= 32
#define SIZE_SPEC "%10lu"
#else
#define SIZE_SPEC "%21lu"
#endif
L
Linus Torvalds 已提交
1317

1318 1319
	return seq_printf(s,
			  "%10d %10d  %4o " SIZE_SPEC " %5u %5u  "
1320 1321
			  "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
			  SIZE_SPEC " " SIZE_SPEC "\n",
1322
			  shp->shm_perm.key,
N
Nadia Derbey 已提交
1323
			  shp->shm_perm.id,
A
Andrew Morton 已提交
1324
			  shp->shm_perm.mode,
1325 1326 1327
			  shp->shm_segsz,
			  shp->shm_cprid,
			  shp->shm_lprid,
1328
			  shp->shm_nattch,
1329 1330 1331 1332
			  from_kuid_munged(user_ns, shp->shm_perm.uid),
			  from_kgid_munged(user_ns, shp->shm_perm.gid),
			  from_kuid_munged(user_ns, shp->shm_perm.cuid),
			  from_kgid_munged(user_ns, shp->shm_perm.cgid),
1333 1334
			  shp->shm_atim,
			  shp->shm_dtim,
1335 1336 1337
			  shp->shm_ctim,
			  rss * PAGE_SIZE,
			  swp * PAGE_SIZE);
L
Linus Torvalds 已提交
1338 1339
}
#endif