/*
 * linux/ipc/shm.c
 * Copyright (C) 1992, 1993 Krishna Balasubramanian
 *	 Many improvements/fixes by Bruno Haible.
 * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
 * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
 *
 * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
 * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
 * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
 * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com>
 * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
 * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com>
 *
 * support for audit of ipc object properties and permission changes
 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
 *
 * namespaces support
 * OpenVZ, SWsoft Inc.
 * Pavel Emelianov <xemul@openvz.org>
 *
 * Better ipc lock (kern_ipc_perm.lock) handling
 * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013.
 */

#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/shmem_fs.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
38
#include <linux/capability.h>
S
Stephen Rothwell 已提交
39
#include <linux/ptrace.h>
40
#include <linux/seq_file.h>
N
Nadia Derbey 已提交
41
#include <linux/rwsem.h>
K
Kirill Korotaev 已提交
42
#include <linux/nsproxy.h>
43
#include <linux/mount.h>
44
#include <linux/ipc_namespace.h>
S
Stephen Rothwell 已提交
45

L
Linus Torvalds 已提交
46 47 48 49
#include <asm/uaccess.h>

#include "util.h"

50 51 52 53 54 55 56 57 58
/*
 * Per-file state stored in ->private_data of a file that uses the shm
 * file operations below; shm_release() frees it.
 */
struct shm_file_data {
	int id;				/* ipc id handed to shm_lock() */
	struct ipc_namespace *ns;	/* namespace reference, dropped by put_ipc_ns() in shm_release() */
	struct file *file;		/* backing file whose f_op the shm_* wrappers forward to */
	const struct vm_operations_struct *vm_ops;	/* backing file's vm_ops, saved by shm_mmap() */
};

/* Typed lvalue view of file->private_data (can be read and assigned). */
#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))

59
static const struct file_operations shm_file_operations;
60
static const struct vm_operations_struct shm_vm_ops;
L
Linus Torvalds 已提交
61

62
/* The ids slot of @ns that holds the shared memory segments. */
#define shm_ids(ns)	((ns)->ids[IPC_SHM_IDS])
L
Linus Torvalds 已提交
63

K
Kirill Korotaev 已提交
64 65
/* Release a segment via ipc_unlock() on its embedded kern_ipc_perm. */
#define shm_unlock(shp)			\
	ipc_unlock(&(shp)->shm_perm)
L
Linus Torvalds 已提交
66

N
Nadia Derbey 已提交
67
static int newseg(struct ipc_namespace *, struct ipc_params *);
68 69
static void shm_open(struct vm_area_struct *vma);
static void shm_close(struct vm_area_struct *vma);
K
Kirill Korotaev 已提交
70
static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp);
L
Linus Torvalds 已提交
71
#ifdef CONFIG_PROC_FS
72
static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
L
Linus Torvalds 已提交
73 74
#endif

75
void shm_init_ns(struct ipc_namespace *ns)
K
Kirill Korotaev 已提交
76 77 78 79
{
	ns->shm_ctlmax = SHMMAX;
	ns->shm_ctlall = SHMALL;
	ns->shm_ctlmni = SHMMNI;
80
	ns->shm_rmid_forced = 0;
K
Kirill Korotaev 已提交
81
	ns->shm_tot = 0;
W
WANG Cong 已提交
82
	ipc_init_ids(&shm_ids(ns));
K
Kirill Korotaev 已提交
83 84
}

N
Nadia Derbey 已提交
85
/*
D
Davidlohr Bueso 已提交
86 87
 * Called with shm_ids.rwsem (writer) and the shp structure locked.
 * Only shm_ids.rwsem remains locked on exit.
N
Nadia Derbey 已提交
88
 */
89
static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
K
Kirill Korotaev 已提交
90
{
91 92 93
	struct shmid_kernel *shp;
	shp = container_of(ipcp, struct shmid_kernel, shm_perm);

K
Kirill Korotaev 已提交
94 95 96 97 98 99 100 101 102
	if (shp->shm_nattch){
		shp->shm_perm.mode |= SHM_DEST;
		/* Do not find it any more */
		shp->shm_perm.key = IPC_PRIVATE;
		shm_unlock(shp);
	} else
		shm_destroy(ns, shp);
}

103
#ifdef CONFIG_IPC_NS
K
Kirill Korotaev 已提交
104 105
void shm_exit_ns(struct ipc_namespace *ns)
{
106
	free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
S
Serge E. Hallyn 已提交
107
	idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
K
Kirill Korotaev 已提交
108
}
109
#endif
L
Linus Torvalds 已提交
110

111
/* Initcall: set up the shm state of the initial ipc namespace. */
static int __init ipc_ns_init(void)
{
	struct ipc_namespace *init_ns = &init_ipc_ns;

	shm_init_ns(init_ns);
	return 0;
}

pure_initcall(ipc_ns_init);

/*
 * Register the "sysvipc/shm" proc interface.  The column header is wider
 * when BITS_PER_LONG is above 32.
 */
void __init shm_init(void)
{
	ipc_init_proc_interface("sysvipc/shm",
#if BITS_PER_LONG <= 32
				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        rss       swap\n",
#else
				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   rss                  swap\n",
#endif
				IPC_SHM_IDS, sysvipc_shm_proc_show);
}

130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
/*
 * Look up segment @id through ipc_obtain_object().  Returns the enclosing
 * shmid_kernel, or the lookup's ERR_PTR unchanged.
 */
static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id)
{
	struct kern_ipc_perm *perm = ipc_obtain_object(&shm_ids(ns), id);

	if (!IS_ERR(perm))
		return container_of(perm, struct shmid_kernel, shm_perm);

	return ERR_CAST(perm);
}

/*
 * Same as shm_obtain_object(), but the lookup goes through
 * ipc_obtain_object_check().
 */
static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id)
{
	struct kern_ipc_perm *perm = ipc_obtain_object_check(&shm_ids(ns), id);

	if (!IS_ERR(perm))
		return container_of(perm, struct shmid_kernel, shm_perm);

	return ERR_CAST(perm);
}

N
Nadia Derbey 已提交
150
/*
D
Davidlohr Bueso 已提交
151
 * shm_lock_(check_) routines are called in the paths where the rwsem
N
Nadia Derbey 已提交
152
 * is not necessarily held.
N
Nadia Derbey 已提交
153
 */
154
static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
L
Linus Torvalds 已提交
155
{
N
Nadia Derbey 已提交
156 157
	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);

158 159 160
	if (IS_ERR(ipcp))
		return (struct shmid_kernel *)ipcp;

N
Nadia Derbey 已提交
161
	return container_of(ipcp, struct shmid_kernel, shm_perm);
162 163
}

164 165 166
static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
{
	rcu_read_lock();
167
	ipc_lock_object(&ipcp->shm_perm);
168 169
}

D
Davidlohr Bueso 已提交
170 171 172 173 174 175 176 177 178
/*
 * RCU callback for a segment: release its security blob, then hand the
 * memory to ipc_rcu_free().
 */
static void shm_rcu_free(struct rcu_head *head)
{
	struct ipc_rcu *rcu_hdr = container_of(head, struct ipc_rcu, rcu);
	struct shmid_kernel *seg = ipc_rcu_to_struct(rcu_hdr);

	security_shm_free(seg);
	ipc_rcu_free(head);
}

N
Nadia Derbey 已提交
179
static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
L
Linus Torvalds 已提交
180
{
N
Nadia Derbey 已提交
181
	ipc_rmid(&shm_ids(ns), &s->shm_perm);
L
Linus Torvalds 已提交
182 183 184
}


185 186
/* This is called by fork, once for every shm attach. */
static void shm_open(struct vm_area_struct *vma)
K
Kirill Korotaev 已提交
187
{
188 189
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
L
Linus Torvalds 已提交
190 191
	struct shmid_kernel *shp;

192
	shp = shm_lock(sfd->ns, sfd->id);
193
	BUG_ON(IS_ERR(shp));
L
Linus Torvalds 已提交
194
	shp->shm_atim = get_seconds();
195
	shp->shm_lprid = task_tgid_vnr(current);
L
Linus Torvalds 已提交
196 197 198 199 200 201 202
	shp->shm_nattch++;
	shm_unlock(shp);
}

/*
 * shm_destroy - free the struct shmid_kernel
 *
N
Nadia Derbey 已提交
203
 * @ns: namespace
L
Linus Torvalds 已提交
204 205
 * @shp: struct to free
 *
D
Davidlohr Bueso 已提交
206
 * It has to be called with shp and shm_ids.rwsem (writer) locked,
L
Linus Torvalds 已提交
207 208
 * but returns with shp unlocked and freed.
 */
K
Kirill Korotaev 已提交
209
static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
L
Linus Torvalds 已提交
210
{
G
Greg Thelen 已提交
211 212 213 214
	struct file *shm_file;

	shm_file = shp->shm_file;
	shp->shm_file = NULL;
K
Kirill Korotaev 已提交
215
	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
N
Nadia Derbey 已提交
216
	shm_rmid(ns, shp);
L
Linus Torvalds 已提交
217
	shm_unlock(shp);
G
Greg Thelen 已提交
218 219
	if (!is_file_hugepages(shm_file))
		shmem_lock(shm_file, 0, shp->mlock_user);
220
	else if (shp->mlock_user)
G
Greg Thelen 已提交
221 222
		user_shm_unlock(file_inode(shm_file)->i_size, shp->mlock_user);
	fput(shm_file);
D
Davidlohr Bueso 已提交
223
	ipc_rcu_putref(shp, shm_rcu_free);
L
Linus Torvalds 已提交
224 225
}

226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
/*
 * shm_may_destroy - identifies whether shm segment should be destroyed now
 *
 * Returns true if and only if there are no active users of the segment and
 * one of the following is true:
 *
 * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
 *
 * 2) sysctl kernel.shm_rmid_forced is set to 1.
 */
static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
	return (shp->shm_nattch == 0) &&
	       (ns->shm_rmid_forced ||
		(shp->shm_perm.mode & SHM_DEST));
}

L
Linus Torvalds 已提交
243
/*
244
 * remove the attach descriptor vma.
L
Linus Torvalds 已提交
245 246 247 248
 * free memory for segment if it is marked destroyed.
 * The descriptor has already been removed from the current->mm->mmap list
 * and will later be kfree()d.
 */
249
static void shm_close(struct vm_area_struct *vma)
L
Linus Torvalds 已提交
250
{
251 252
	struct file * file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
L
Linus Torvalds 已提交
253
	struct shmid_kernel *shp;
254
	struct ipc_namespace *ns = sfd->ns;
K
Kirill Korotaev 已提交
255

D
Davidlohr Bueso 已提交
256
	down_write(&shm_ids(ns).rwsem);
L
Linus Torvalds 已提交
257
	/* remove from the list of attaches of the shm segment */
N
Nadia Derbey 已提交
258
	shp = shm_lock(ns, sfd->id);
259
	BUG_ON(IS_ERR(shp));
260
	shp->shm_lprid = task_tgid_vnr(current);
L
Linus Torvalds 已提交
261 262
	shp->shm_dtim = get_seconds();
	shp->shm_nattch--;
263 264 265 266
	if (shm_may_destroy(ns, shp))
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);
D
Davidlohr Bueso 已提交
267
	up_write(&shm_ids(ns).rwsem);
268 269
}

D
Davidlohr Bueso 已提交
270
/* Called with ns->shm_ids(ns).rwsem locked */
271 272 273
static int shm_try_destroy_current(int id, void *p, void *data)
{
	struct ipc_namespace *ns = data;
274 275
	struct kern_ipc_perm *ipcp = p;
	struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
276

277
	if (shp->shm_creator != current)
278 279 280 281 282 283 284 285 286 287 288 289 290
		return 0;

	/*
	 * Mark it as orphaned to destroy the segment when
	 * kernel.shm_rmid_forced is changed.
	 * It is noop if the following shm_may_destroy() returns true.
	 */
	shp->shm_creator = NULL;

	/*
	 * Don't even try to destroy it.  If shm_rmid_forced=0 and IPC_RMID
	 * is not set, it shouldn't be deleted here.
	 */
291
	if (!ns->shm_rmid_forced)
292 293
		return 0;

294 295
	if (shm_may_destroy(ns, shp)) {
		shm_lock_by_ptr(shp);
296
		shm_destroy(ns, shp);
297
	}
298 299 300
	return 0;
}

D
Davidlohr Bueso 已提交
301
/* Called with ns->shm_ids(ns).rwsem locked */
302 303 304
static int shm_try_destroy_orphaned(int id, void *p, void *data)
{
	struct ipc_namespace *ns = data;
305 306
	struct kern_ipc_perm *ipcp = p;
	struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
307 308 309 310

	/*
	 * We want to destroy segments without users and with already
	 * exit'ed originating process.
311
	 *
D
Davidlohr Bueso 已提交
312
	 * As shp->* are changed under rwsem, it's safe to skip shp locking.
313
	 */
314
	if (shp->shm_creator != NULL)
315 316
		return 0;

317 318
	if (shm_may_destroy(ns, shp)) {
		shm_lock_by_ptr(shp);
K
Kirill Korotaev 已提交
319
		shm_destroy(ns, shp);
320
	}
321 322 323 324 325
	return 0;
}

void shm_destroy_orphaned(struct ipc_namespace *ns)
{
D
Davidlohr Bueso 已提交
326
	down_write(&shm_ids(ns).rwsem);
V
Vasiliy Kulikov 已提交
327
	if (shm_ids(ns).in_use)
328
		idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
D
Davidlohr Bueso 已提交
329
	up_write(&shm_ids(ns).rwsem);
330 331 332 333 334
}


void exit_shm(struct task_struct *task)
{
335
	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
336

V
Vasiliy Kulikov 已提交
337 338 339
	if (shm_ids(ns).in_use == 0)
		return;

340
	/* Destroy all already created segments, but not mapped yet */
D
Davidlohr Bueso 已提交
341
	down_write(&shm_ids(ns).rwsem);
V
Vasiliy Kulikov 已提交
342
	if (shm_ids(ns).in_use)
343
		idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
D
Davidlohr Bueso 已提交
344
	up_write(&shm_ids(ns).rwsem);
L
Linus Torvalds 已提交
345 346
}

N
Nick Piggin 已提交
347
static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
348 349 350 351
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);

N
Nick Piggin 已提交
352
	return sfd->vm_ops->fault(vma, vmf);
353 354 355
}

#ifdef CONFIG_NUMA
A
Adrian Bunk 已提交
356
static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
357 358 359 360 361 362 363 364 365
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	int err = 0;
	if (sfd->vm_ops->set_policy)
		err = sfd->vm_ops->set_policy(vma, new);
	return err;
}

A
Adrian Bunk 已提交
366 367
static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
					unsigned long addr)
368 369 370 371 372 373 374
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	struct mempolicy *pol = NULL;

	if (sfd->vm_ops->get_policy)
		pol = sfd->vm_ops->get_policy(vma, addr);
375
	else if (vma->vm_policy)
376
		pol = vma->vm_policy;
377

378 379 380 381
	return pol;
}
#endif

L
Linus Torvalds 已提交
382 383
static int shm_mmap(struct file * file, struct vm_area_struct * vma)
{
384
	struct shm_file_data *sfd = shm_file_data(file);
385 386
	int ret;

387 388 389 390
	ret = sfd->file->f_op->mmap(sfd->file, vma);
	if (ret != 0)
		return ret;
	sfd->vm_ops = vma->vm_ops;
D
David Howells 已提交
391
#ifdef CONFIG_MMU
392
	BUG_ON(!sfd->vm_ops->fault);
D
David Howells 已提交
393
#endif
394 395
	vma->vm_ops = &shm_vm_ops;
	shm_open(vma);
396 397

	return ret;
L
Linus Torvalds 已提交
398 399
}

K
Kirill Korotaev 已提交
400 401
static int shm_release(struct inode *ino, struct file *file)
{
402
	struct shm_file_data *sfd = shm_file_data(file);
K
Kirill Korotaev 已提交
403

404 405 406
	put_ipc_ns(sfd->ns);
	shm_file_data(file) = NULL;
	kfree(sfd);
K
Kirill Korotaev 已提交
407 408 409
	return 0;
}

410
/*
 * Forward fsync to the backing file; -EINVAL if it has no fsync method.
 */
static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct file *backing = shm_file_data(file)->file;

	if (backing->f_op->fsync)
		return backing->f_op->fsync(backing, start, end, datasync);

	return -EINVAL;
}

419 420 421 422 423 424 425 426 427 428
/*
 * Forward fallocate to the backing file; -EOPNOTSUPP if it has no
 * fallocate method.
 *
 * The method belongs to the backing file's f_op, so it is handed the
 * backing file (sfd->file), exactly as shm_mmap(), shm_fsync() and
 * shm_get_unmapped_area() do, rather than the wrapper whose private_data
 * is a struct shm_file_data.
 */
static long shm_fallocate(struct file *file, int mode, loff_t offset,
			  loff_t len)
{
	struct shm_file_data *sfd = shm_file_data(file);

	if (!sfd->file->f_op->fallocate)
		return -EOPNOTSUPP;
	return sfd->file->f_op->fallocate(sfd->file, mode, offset, len);
}

429 430 431 432 433
static unsigned long shm_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len, unsigned long pgoff,
	unsigned long flags)
{
	struct shm_file_data *sfd = shm_file_data(file);
434 435
	return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
						pgoff, flags);
436 437
}

438
static const struct file_operations shm_file_operations = {
K
Kirill Korotaev 已提交
439
	.mmap		= shm_mmap,
440
	.fsync		= shm_fsync,
K
Kirill Korotaev 已提交
441
	.release	= shm_release,
D
David Howells 已提交
442 443 444
#ifndef CONFIG_MMU
	.get_unmapped_area	= shm_get_unmapped_area,
#endif
445
	.llseek		= noop_llseek,
446
	.fallocate	= shm_fallocate,
447 448 449 450 451 452
};

static const struct file_operations shm_file_operations_huge = {
	.mmap		= shm_mmap,
	.fsync		= shm_fsync,
	.release	= shm_release,
453
	.get_unmapped_area	= shm_get_unmapped_area,
454
	.llseek		= noop_llseek,
455
	.fallocate	= shm_fallocate,
L
Linus Torvalds 已提交
456 457
};

458 459 460 461 462
/* Non-zero iff @file uses the hugepage flavour of the shm file operations. */
int is_file_shm_hugepages(struct file *file)
{
	const struct file_operations *fops = file->f_op;

	return fops == &shm_file_operations_huge;
}

463
static const struct vm_operations_struct shm_vm_ops = {
L
Linus Torvalds 已提交
464 465
	.open	= shm_open,	/* callback for a new vm-area open */
	.close	= shm_close,	/* callback for when the vm-area is released */
466
	.fault	= shm_fault,
467 468 469
#if defined(CONFIG_NUMA)
	.set_policy = shm_set_policy,
	.get_policy = shm_get_policy,
L
Linus Torvalds 已提交
470 471 472
#endif
};

N
Nadia Derbey 已提交
473 474 475 476 477
/**
 * newseg - Create a new shared memory segment
 * @ns: namespace
 * @params: ptr to the structure that contains key, size and shmflg
 *
D
Davidlohr Bueso 已提交
478
 * Called with shm_ids.rwsem held as a writer.
N
Nadia Derbey 已提交
479 480
 */

N
Nadia Derbey 已提交
481
static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
L
Linus Torvalds 已提交
482
{
N
Nadia Derbey 已提交
483 484 485
	key_t key = params->key;
	int shmflg = params->flg;
	size_t size = params->u.size;
L
Linus Torvalds 已提交
486 487
	int error;
	struct shmid_kernel *shp;
488
	size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
L
Linus Torvalds 已提交
489 490 491
	struct file * file;
	char name[13];
	int id;
492
	vm_flags_t acctflag = 0;
L
Linus Torvalds 已提交
493

K
Kirill Korotaev 已提交
494
	if (size < SHMMIN || size > ns->shm_ctlmax)
L
Linus Torvalds 已提交
495 496
		return -EINVAL;

497
	if (ns->shm_tot + numpages > ns->shm_ctlall)
L
Linus Torvalds 已提交
498 499 500 501 502 503 504
		return -ENOSPC;

	shp = ipc_rcu_alloc(sizeof(*shp));
	if (!shp)
		return -ENOMEM;

	shp->shm_perm.key = key;
A
Andrew Morton 已提交
505
	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
L
Linus Torvalds 已提交
506 507 508 509 510
	shp->mlock_user = NULL;

	shp->shm_perm.security = NULL;
	error = security_shm_alloc(shp);
	if (error) {
D
Davidlohr Bueso 已提交
511
		ipc_rcu_putref(shp, ipc_rcu_free);
L
Linus Torvalds 已提交
512 513 514
		return error;
	}

515
	sprintf (name, "SYSV%08x", key);
L
Linus Torvalds 已提交
516
	if (shmflg & SHM_HUGETLB) {
517
		struct hstate *hs;
518 519
		size_t hugesize;

520
		hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
521 522 523 524 525
		if (!hs) {
			error = -EINVAL;
			goto no_file;
		}
		hugesize = ALIGN(size, huge_page_size(hs));
526

527 528 529
		/* hugetlb_file_setup applies strict accounting */
		if (shmflg & SHM_NORESERVE)
			acctflag = VM_NORESERVE;
530
		file = hugetlb_file_setup(name, hugesize, acctflag,
531 532
				  &shp->mlock_user, HUGETLB_SHMFS_INODE,
				(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
L
Linus Torvalds 已提交
533
	} else {
534 535 536 537 538 539
		/*
		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
	 	 * if it's asked for.
		 */
		if  ((shmflg & SHM_NORESERVE) &&
				sysctl_overcommit_memory != OVERCOMMIT_NEVER)
540
			acctflag = VM_NORESERVE;
541
		file = shmem_file_setup(name, size, acctflag);
L
Linus Torvalds 已提交
542 543 544 545 546
	}
	error = PTR_ERR(file);
	if (IS_ERR(file))
		goto no_file;

547
	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
548 549
	if (id < 0) {
		error = id;
L
Linus Torvalds 已提交
550
		goto no_id;
551
	}
L
Linus Torvalds 已提交
552

553
	shp->shm_cprid = task_tgid_vnr(current);
L
Linus Torvalds 已提交
554 555 556 557 558 559
	shp->shm_lprid = 0;
	shp->shm_atim = shp->shm_dtim = 0;
	shp->shm_ctim = get_seconds();
	shp->shm_segsz = size;
	shp->shm_nattch = 0;
	shp->shm_file = file;
560
	shp->shm_creator = current;
561

562 563 564 565
	/*
	 * shmid gets reported as "inode#" in /proc/pid/maps.
	 * proc-ps tools use this. Changing this will break them.
	 */
A
Al Viro 已提交
566
	file_inode(file)->i_ino = shp->shm_perm.id;
567

K
Kirill Korotaev 已提交
568
	ns->shm_tot += numpages;
N
Nadia Derbey 已提交
569
	error = shp->shm_perm.id;
570

571
	ipc_unlock_object(&shp->shm_perm);
572
	rcu_read_unlock();
N
Nadia Derbey 已提交
573
	return error;
L
Linus Torvalds 已提交
574 575

no_id:
576
	if (is_file_hugepages(file) && shp->mlock_user)
577
		user_shm_unlock(size, shp->mlock_user);
L
Linus Torvalds 已提交
578 579
	fput(file);
no_file:
D
Davidlohr Bueso 已提交
580
	ipc_rcu_putref(shp, shm_rcu_free);
L
Linus Torvalds 已提交
581 582 583
	return error;
}

N
Nadia Derbey 已提交
584
/*
D
Davidlohr Bueso 已提交
585
 * Called with shm_ids.rwsem and ipcp locked.
N
Nadia Derbey 已提交
586
 */
N
Nadia Derbey 已提交
587
static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg)
N
Nadia Derbey 已提交
588
{
N
Nadia Derbey 已提交
589 590 591 592
	struct shmid_kernel *shp;

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
	return security_shm_associate(shp, shmflg);
N
Nadia Derbey 已提交
593 594
}

N
Nadia Derbey 已提交
595
/*
D
Davidlohr Bueso 已提交
596
 * Called with shm_ids.rwsem and ipcp locked.
N
Nadia Derbey 已提交
597
 */
N
Nadia Derbey 已提交
598 599
static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
				struct ipc_params *params)
N
Nadia Derbey 已提交
600
{
N
Nadia Derbey 已提交
601 602 603 604
	struct shmid_kernel *shp;

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
	if (shp->shm_segsz < params->u.size)
N
Nadia Derbey 已提交
605 606 607 608 609
		return -EINVAL;

	return 0;
}

610
/*
 * shmget(2): bundle the shm callbacks and the caller's key/size/flags and
 * let the generic ipcget() find or create the segment in the caller's
 * ipc namespace.
 */
SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
{
	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
	struct ipc_ops shm_ops;
	struct ipc_params shm_params;

	shm_params.u.size = size;
	shm_params.flg = shmflg;
	shm_params.key = key;

	shm_ops.more_checks = shm_more_checks;
	shm_ops.associate = shm_security;
	shm_ops.getnew = newseg;

	return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
}

/*
 * Copy a shmid64_ds to user space in the layout selected by @version.
 * IPC_64 copies it verbatim; IPC_OLD first converts it to a zeroed
 * struct shmid_ds.  Returns the copy_to_user() result, or -EINVAL
 * (converted to unsigned long) for an unknown version.
 */
static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version)
{
	if (version == IPC_64)
		return copy_to_user(buf, in, sizeof(*in));

	if (version == IPC_OLD) {
		struct shmid_ds out;

		memset(&out, 0, sizeof(out));
		ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm);
		out.shm_segsz	= in->shm_segsz;
		out.shm_atime	= in->shm_atime;
		out.shm_dtime	= in->shm_dtime;
		out.shm_ctime	= in->shm_ctime;
		out.shm_cpid	= in->shm_cpid;
		out.shm_lpid	= in->shm_lpid;
		out.shm_nattch	= in->shm_nattch;

		return copy_to_user(buf, &out, sizeof(out));
	}

	return -EINVAL;
}

655 656
/*
 * Read a shmid structure from user space in the layout selected by
 * @version.  IPC_64 fills *out verbatim; IPC_OLD only transfers
 * shm_perm.uid, .gid and .mode.  Returns 0, -EFAULT on a failed copy, or
 * -EINVAL for an unknown version (both converted to unsigned long).
 */
static inline unsigned long
copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
{
	if (version == IPC_64) {
		if (copy_from_user(out, buf, sizeof(*out)))
			return -EFAULT;
		return 0;
	}

	if (version == IPC_OLD) {
		struct shmid_ds old;

		if (copy_from_user(&old, buf, sizeof(old)))
			return -EFAULT;

		out->shm_perm.uid	= old.shm_perm.uid;
		out->shm_perm.gid	= old.shm_perm.gid;
		out->shm_perm.mode	= old.shm_perm.mode;

		return 0;
	}

	return -EINVAL;
}

/*
 * Copy a shminfo64 to user space in the layout selected by @version.
 * For IPC_OLD the values are narrowed into a struct shminfo, with shmmax
 * clamped to INT_MAX.  Returns the copy_to_user() result, or -EINVAL
 * (converted to unsigned long) for an unknown version.
 */
static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version)
{
	if (version == IPC_64)
		return copy_to_user(buf, in, sizeof(*in));

	if (version == IPC_OLD) {
		struct shminfo out;

		out.shmmax	= (in->shmmax > INT_MAX) ? INT_MAX
							 : (int)in->shmmax;
		out.shmmin	= in->shmmin;
		out.shmmni	= in->shmmni;
		out.shmseg	= in->shmseg;
		out.shmall	= in->shmall;

		return copy_to_user(buf, &out, sizeof(out));
	}

	return -EINVAL;
}

707 708
/*
 * Calculate and add used RSS and swap pages of a shm.
D
Davidlohr Bueso 已提交
709
 * Called with shm_ids.rwsem held as a reader
710 711 712 713 714 715
 */
static void shm_add_rss_swap(struct shmid_kernel *shp,
	unsigned long *rss_add, unsigned long *swp_add)
{
	struct inode *inode;

A
Al Viro 已提交
716
	inode = file_inode(shp->shm_file);
717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734

	if (is_file_hugepages(shp->shm_file)) {
		struct address_space *mapping = inode->i_mapping;
		struct hstate *h = hstate_file(shp->shm_file);
		*rss_add += pages_per_huge_page(h) * mapping->nrpages;
	} else {
#ifdef CONFIG_SHMEM
		struct shmem_inode_info *info = SHMEM_I(inode);
		spin_lock(&info->lock);
		*rss_add += inode->i_mapping->nrpages;
		*swp_add += info->swapped;
		spin_unlock(&info->lock);
#else
		*rss_add += inode->i_mapping->nrpages;
#endif
	}
}

N
Nadia Derbey 已提交
735
/*
D
Davidlohr Bueso 已提交
736
 * Called with shm_ids.rwsem held as a reader
N
Nadia Derbey 已提交
737
 */
K
Kirill Korotaev 已提交
738 739
static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
		unsigned long *swp)
L
Linus Torvalds 已提交
740
{
N
Nadia Derbey 已提交
741 742
	int next_id;
	int total, in_use;
L
Linus Torvalds 已提交
743 744 745 746

	*rss = 0;
	*swp = 0;

N
Nadia Derbey 已提交
747 748 749
	in_use = shm_ids(ns).in_use;

	for (total = 0, next_id = 0; total < in_use; next_id++) {
750
		struct kern_ipc_perm *ipc;
L
Linus Torvalds 已提交
751 752
		struct shmid_kernel *shp;

753 754
		ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
		if (ipc == NULL)
L
Linus Torvalds 已提交
755
			continue;
756
		shp = container_of(ipc, struct shmid_kernel, shm_perm);
L
Linus Torvalds 已提交
757

758
		shm_add_rss_swap(shp, rss, swp);
N
Nadia Derbey 已提交
759 760

		total++;
L
Linus Torvalds 已提交
761 762 763
	}
}

764
/*
 * This function handles some shmctl commands which require the rwsem
 * to be held in write mode.
 * NOTE: no locks must be held, the rwsem is taken inside this function.
 *
 * Handles IPC_RMID and IPC_SET; any other command yields -EINVAL.
 */
static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
		       struct shmid_ds __user *buf, int version)
{
	struct kern_ipc_perm *ipcp;
	struct shmid64_ds shmid64;
	struct shmid_kernel *shp;
	int err;

	/* fetch the user's request before any lock is taken */
	if (cmd == IPC_SET && copy_shmid_from_user(&shmid64, buf, version))
		return -EFAULT;

	down_write(&shm_ids(ns).rwsem);
	rcu_read_lock();

	ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd,
				      &shmid64.shm_perm, 0);
	if (IS_ERR(ipcp)) {
		err = PTR_ERR(ipcp);
		goto out_unlock1;
	}

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);

	err = security_shm_shmctl(shp, cmd);
	if (err)
		goto out_unlock1;

	if (cmd == IPC_RMID) {
		ipc_lock_object(&shp->shm_perm);
		/* do_shm_rmid unlocks the ipc object and rcu */
		do_shm_rmid(ns, ipcp);
		goto out_up;
	}

	if (cmd != IPC_SET) {
		err = -EINVAL;
		goto out_unlock1;
	}

	ipc_lock_object(&shp->shm_perm);
	err = ipc_update_perm(&shmid64.shm_perm, ipcp);
	if (!err)
		shp->shm_ctim = get_seconds();
	ipc_unlock_object(&shp->shm_perm);

out_unlock1:
	rcu_read_unlock();
out_up:
	up_write(&shm_ids(ns).rwsem);
	return err;
}

825 826
/*
 * Handle the shmctl commands that never take the object lock:
 * IPC_INFO, SHM_INFO, SHM_STAT and IPC_STAT.
 *
 * The two *_INFO commands return the highest id in use (or 0 if
 * ipc_get_maxid() reports a negative value).  SHM_STAT returns the id of
 * the segment, IPC_STAT returns 0.  Failures are negative errnos.
 */
static int shmctl_nolock(struct ipc_namespace *ns, int shmid,
			 int cmd, int version, void __user *buf)
{
	int err;
	struct shmid_kernel *shp;

	/* preliminary security checks for *_INFO */
	if (cmd == IPC_INFO || cmd == SHM_INFO) {
		err = security_shm_shmctl(NULL, cmd);
		if (err)
			return err;
	}

	switch (cmd) {
	case IPC_INFO:
	{
		struct shminfo64 shminfo;

		memset(&shminfo, 0, sizeof(shminfo));
		shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni;
		shminfo.shmmax = ns->shm_ctlmax;
		shminfo.shmall = ns->shm_ctlall;
		shminfo.shmmin = SHMMIN;

		if (copy_shminfo_to_user(buf, &shminfo, version))
			return -EFAULT;

		down_read(&shm_ids(ns).rwsem);
		err = ipc_get_maxid(&shm_ids(ns));
		up_read(&shm_ids(ns).rwsem);

		return err < 0 ? 0 : err;
	}
	case SHM_INFO:
	{
		struct shm_info shm_info;

		memset(&shm_info, 0, sizeof(shm_info));

		down_read(&shm_ids(ns).rwsem);
		shm_info.used_ids = shm_ids(ns).in_use;
		shm_get_stat(ns, &shm_info.shm_rss, &shm_info.shm_swp);
		shm_info.shm_tot = ns->shm_tot;
		shm_info.swap_attempts = 0;
		shm_info.swap_successes = 0;
		err = ipc_get_maxid(&shm_ids(ns));
		up_read(&shm_ids(ns).rwsem);

		if (copy_to_user(buf, &shm_info, sizeof(shm_info)))
			return -EFAULT;

		return err < 0 ? 0 : err;
	}
	case SHM_STAT:
	case IPC_STAT:
	{
		struct shmid64_ds tbuf;
		int result;

		rcu_read_lock();
		if (cmd == SHM_STAT)
			shp = shm_obtain_object(ns, shmid);
		else
			shp = shm_obtain_object_check(ns, shmid);

		if (IS_ERR(shp)) {
			err = PTR_ERR(shp);
			goto out_unlock;
		}
		result = (cmd == SHM_STAT) ? shp->shm_perm.id : 0;

		err = -EACCES;
		if (ipcperms(ns, &shp->shm_perm, S_IRUGO))
			goto out_unlock;

		err = security_shm_shmctl(shp, cmd);
		if (err)
			goto out_unlock;

		/* snapshot the segment while still inside the RCU section */
		memset(&tbuf, 0, sizeof(tbuf));
		kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
		tbuf.shm_segsz	= shp->shm_segsz;
		tbuf.shm_atime	= shp->shm_atim;
		tbuf.shm_dtime	= shp->shm_dtim;
		tbuf.shm_ctime	= shp->shm_ctim;
		tbuf.shm_cpid	= shp->shm_cprid;
		tbuf.shm_lpid	= shp->shm_lprid;
		tbuf.shm_nattch	= shp->shm_nattch;
		rcu_read_unlock();

		if (copy_shmid_to_user(buf, &tbuf, version))
			return -EFAULT;

		return result;
	}
	default:
		return -EINVAL;
	}

out_unlock:
	rcu_read_unlock();
	return err;
}

/*
 * shmctl(2): the *_INFO/*_STAT commands go to shmctl_nolock(), IPC_RMID
 * and IPC_SET go to shmctl_down(), and SHM_LOCK/SHM_UNLOCK are handled
 * here.  Negative cmd/shmid and unknown commands yield -EINVAL.
 */
SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
{
	struct shmid_kernel *shp;
	struct file *shm_file;
	struct ipc_namespace *ns;
	int err, version;

	if (cmd < 0 || shmid < 0)
		return -EINVAL;

	version = ipc_parse_version(&cmd);
	ns = current->nsproxy->ipc_ns;

	switch (cmd) {
	case IPC_INFO:
	case SHM_INFO:
	case SHM_STAT:
	case IPC_STAT:
		return shmctl_nolock(ns, shmid, cmd, version, buf);
	case IPC_RMID:
	case IPC_SET:
		return shmctl_down(ns, shmid, cmd, buf, version);
	case SHM_LOCK:
	case SHM_UNLOCK:
		break;
	default:
		return -EINVAL;
	}

	/* SHM_LOCK / SHM_UNLOCK from here on */
	rcu_read_lock();
	shp = shm_obtain_object_check(ns, shmid);
	if (IS_ERR(shp)) {
		err = PTR_ERR(shp);
		goto out_unlock1;
	}

	audit_ipc_obj(&(shp->shm_perm));
	err = security_shm_shmctl(shp, cmd);
	if (err)
		goto out_unlock1;

	ipc_lock_object(&shp->shm_perm);
	if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
		kuid_t euid = current_euid();

		/*
		 * Without CAP_IPC_LOCK the caller must be owner or creator,
		 * and SHM_LOCK additionally needs a non-zero memlock limit.
		 */
		if ((!uid_eq(euid, shp->shm_perm.uid) &&
		     !uid_eq(euid, shp->shm_perm.cuid)) ||
		    (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK))) {
			err = -EPERM;
			goto out_unlock0;
		}
	}

	shm_file = shp->shm_file;

	/* check if shm_destroy() is tearing down shp */
	if (shm_file == NULL) {
		err = -EIDRM;
		goto out_unlock0;
	}

	/* nothing to do for hugepage segments */
	if (is_file_hugepages(shm_file))
		goto out_unlock0;

	if (cmd == SHM_LOCK) {
		struct user_struct *user = current_user();

		err = shmem_lock(shm_file, 1, user);
		if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
			shp->shm_perm.mode |= SHM_LOCKED;
			shp->mlock_user = user;
		}
		goto out_unlock0;
	}

	/* SHM_UNLOCK */
	if (!(shp->shm_perm.mode & SHM_LOCKED))
		goto out_unlock0;

	shmem_lock(shm_file, 0, shp->mlock_user);
	shp->shm_perm.mode &= ~SHM_LOCKED;
	shp->mlock_user = NULL;

	/* pin the file so the mapping can be used after the locks are gone */
	get_file(shm_file);
	ipc_unlock_object(&shp->shm_perm);
	rcu_read_unlock();

	shmem_unlock_mapping(shm_file->f_mapping);
	fput(shm_file);
	return err;

out_unlock0:
	ipc_unlock_object(&shp->shm_perm);
out_unlock1:
	rcu_read_unlock();
	return err;
}

/*
 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
 *
 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
 * "raddr" thing points to kernel space, and there has to be a wrapper around
 * this.
 */
W
Will Deacon 已提交
1044 1045
long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
	      unsigned long shmlba)
L
Linus Torvalds 已提交
1046 1047 1048 1049 1050 1051 1052 1053 1054
{
	struct shmid_kernel *shp;
	unsigned long addr;
	unsigned long size;
	struct file * file;
	int    err;
	unsigned long flags;
	unsigned long prot;
	int acc_mode;
K
Kirill Korotaev 已提交
1055
	struct ipc_namespace *ns;
1056 1057
	struct shm_file_data *sfd;
	struct path path;
1058
	fmode_t f_mode;
1059
	unsigned long populate = 0;
L
Linus Torvalds 已提交
1060

1061 1062
	err = -EINVAL;
	if (shmid < 0)
L
Linus Torvalds 已提交
1063
		goto out;
1064
	else if ((addr = (ulong)shmaddr)) {
W
Will Deacon 已提交
1065
		if (addr & (shmlba - 1)) {
L
Linus Torvalds 已提交
1066
			if (shmflg & SHM_RND)
W
Will Deacon 已提交
1067
				addr &= ~(shmlba - 1);	   /* round down */
L
Linus Torvalds 已提交
1068 1069 1070 1071
			else
#ifndef __ARCH_FORCE_SHMLBA
				if (addr & ~PAGE_MASK)
#endif
1072
					goto out;
L
Linus Torvalds 已提交
1073 1074 1075 1076
		}
		flags = MAP_SHARED | MAP_FIXED;
	} else {
		if ((shmflg & SHM_REMAP))
1077
			goto out;
L
Linus Torvalds 已提交
1078 1079 1080 1081 1082 1083 1084

		flags = MAP_SHARED;
	}

	if (shmflg & SHM_RDONLY) {
		prot = PROT_READ;
		acc_mode = S_IRUGO;
1085
		f_mode = FMODE_READ;
L
Linus Torvalds 已提交
1086 1087 1088
	} else {
		prot = PROT_READ | PROT_WRITE;
		acc_mode = S_IRUGO | S_IWUGO;
1089
		f_mode = FMODE_READ | FMODE_WRITE;
L
Linus Torvalds 已提交
1090 1091 1092 1093 1094 1095 1096 1097 1098 1099
	}
	if (shmflg & SHM_EXEC) {
		prot |= PROT_EXEC;
		acc_mode |= S_IXUGO;
	}

	/*
	 * We cannot rely on the fs check since SYSV IPC does have an
	 * additional creator id...
	 */
K
Kirill Korotaev 已提交
1100
	ns = current->nsproxy->ipc_ns;
1101 1102
	rcu_read_lock();
	shp = shm_obtain_object_check(ns, shmid);
1103 1104
	if (IS_ERR(shp)) {
		err = PTR_ERR(shp);
1105
		goto out_unlock;
1106
	}
1107 1108

	err = -EACCES;
1109
	if (ipcperms(ns, &shp->shm_perm, acc_mode))
1110
		goto out_unlock;
L
Linus Torvalds 已提交
1111 1112

	err = security_shm_shmat(shp, shmaddr, shmflg);
1113 1114 1115
	if (err)
		goto out_unlock;

1116
	ipc_lock_object(&shp->shm_perm);
G
Greg Thelen 已提交
1117 1118 1119 1120 1121 1122 1123 1124

	/* check if shm_destroy() is tearing down shp */
	if (shp->shm_file == NULL) {
		ipc_unlock_object(&shp->shm_perm);
		err = -EIDRM;
		goto out_unlock;
	}

1125 1126
	path = shp->shm_file->f_path;
	path_get(&path);
L
Linus Torvalds 已提交
1127
	shp->shm_nattch++;
1128
	size = i_size_read(path.dentry->d_inode);
1129 1130
	ipc_unlock_object(&shp->shm_perm);
	rcu_read_unlock();
L
Linus Torvalds 已提交
1131

1132 1133
	err = -ENOMEM;
	sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
1134 1135 1136 1137
	if (!sfd) {
		path_put(&path);
		goto out_nattch;
	}
1138

1139 1140
	file = alloc_file(&path, f_mode,
			  is_file_hugepages(shp->shm_file) ?
1141 1142
				&shm_file_operations_huge :
				&shm_file_operations);
1143
	err = PTR_ERR(file);
1144 1145 1146 1147 1148
	if (IS_ERR(file)) {
		kfree(sfd);
		path_put(&path);
		goto out_nattch;
	}
1149 1150 1151

	file->private_data = sfd;
	file->f_mapping = shp->shm_file->f_mapping;
N
Nadia Derbey 已提交
1152
	sfd->id = shp->shm_perm.id;
1153 1154 1155 1156
	sfd->ns = get_ipc_ns(ns);
	sfd->file = shp->shm_file;
	sfd->vm_ops = NULL;

1157 1158 1159 1160
	err = security_mmap_file(file, prot, flags);
	if (err)
		goto out_fput;

L
Linus Torvalds 已提交
1161 1162
	down_write(&current->mm->mmap_sem);
	if (addr && !(shmflg & SHM_REMAP)) {
1163
		err = -EINVAL;
L
Linus Torvalds 已提交
1164 1165 1166 1167 1168 1169 1170 1171 1172 1173
		if (find_vma_intersection(current->mm, addr, addr + size))
			goto invalid;
		/*
		 * If shm segment goes below stack, make sure there is some
		 * space left for the stack to grow (at least 4 pages).
		 */
		if (addr < current->mm->start_stack &&
		    addr > current->mm->start_stack - size - PAGE_SIZE * 5)
			goto invalid;
	}
1174

1175 1176
	addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
	*raddr = addr;
1177
	err = 0;
1178 1179
	if (IS_ERR_VALUE(addr))
		err = (long)addr;
L
Linus Torvalds 已提交
1180 1181
invalid:
	up_write(&current->mm->mmap_sem);
1182
	if (populate)
1183
		mm_populate(addr, populate);
L
Linus Torvalds 已提交
1184

1185
out_fput:
1186 1187 1188
	fput(file);

out_nattch:
D
Davidlohr Bueso 已提交
1189
	down_write(&shm_ids(ns).rwsem);
N
Nadia Derbey 已提交
1190
	shp = shm_lock(ns, shmid);
1191
	BUG_ON(IS_ERR(shp));
L
Linus Torvalds 已提交
1192
	shp->shm_nattch--;
1193
	if (shm_may_destroy(ns, shp))
K
Kirill Korotaev 已提交
1194
		shm_destroy(ns, shp);
L
Linus Torvalds 已提交
1195 1196
	else
		shm_unlock(shp);
D
Davidlohr Bueso 已提交
1197
	up_write(&shm_ids(ns).rwsem);
L
Linus Torvalds 已提交
1198
	return err;
1199 1200

out_unlock:
1201
	rcu_read_unlock();
1202 1203
out:
	return err;
L
Linus Torvalds 已提交
1204 1205
}

1206
/*
 * shmat() system call: attach with the default SHMLBA alignment and hand
 * the resulting address back to user space as the syscall return value.
 */
SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
{
	unsigned long attach_addr;
	long rc;

	rc = do_shmat(shmid, shmaddr, shmflg, &attach_addr, SHMLBA);
	if (rc != 0)
		return rc;

	/*
	 * The address is returned as a long; mark the call as successful
	 * explicitly before returning it.
	 */
	force_successful_syscall_return();
	return (long)attach_addr;
}

L
Linus Torvalds 已提交
1218 1219 1220 1221
/*
 * detach and kill segment if marked destroyed.
 * The work is done in shm_close.
 */
1222
SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
L
Linus Torvalds 已提交
1223 1224
{
	struct mm_struct *mm = current->mm;
1225
	struct vm_area_struct *vma;
L
Linus Torvalds 已提交
1226 1227
	unsigned long addr = (unsigned long)shmaddr;
	int retval = -EINVAL;
1228 1229 1230 1231
#ifdef CONFIG_MMU
	loff_t size = 0;
	struct vm_area_struct *next;
#endif
L
Linus Torvalds 已提交
1232

1233 1234 1235
	if (addr & ~PAGE_MASK)
		return retval;

L
Linus Torvalds 已提交
1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259
	down_write(&mm->mmap_sem);

	/*
	 * This function tries to be smart and unmap shm segments that
	 * were modified by partial mlock or munmap calls:
	 * - It first determines the size of the shm segment that should be
	 *   unmapped: It searches for a vma that is backed by shm and that
	 *   started at address shmaddr. It records it's size and then unmaps
	 *   it.
	 * - Then it unmaps all shm vmas that started at shmaddr and that
	 *   are within the initially determined size.
	 * Errors from do_munmap are ignored: the function only fails if
	 * it's called with invalid parameters or if it's called to unmap
	 * a part of a vma. Both calls in this function are for full vmas,
	 * the parameters are directly copied from the vma itself and always
	 * valid - therefore do_munmap cannot fail. (famous last words?)
	 */
	/*
	 * If it had been mremap()'d, the starting address would not
	 * match the usual checks anyway. So assume all vma's are
	 * above the starting address given.
	 */
	vma = find_vma(mm, addr);

1260
#ifdef CONFIG_MMU
L
Linus Torvalds 已提交
1261 1262 1263 1264 1265 1266 1267 1268
	while (vma) {
		next = vma->vm_next;

		/*
		 * Check if the starting address would match, i.e. it's
		 * a fragment created by mprotect() and/or munmap(), or it
		 * otherwise it starts at this address with no hassles.
		 */
1269
		if ((vma->vm_ops == &shm_vm_ops) &&
L
Linus Torvalds 已提交
1270 1271 1272
			(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {


A
Al Viro 已提交
1273
			size = file_inode(vma->vm_file)->i_size;
L
Linus Torvalds 已提交
1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290
			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
			/*
			 * We discovered the size of the shm segment, so
			 * break out of here and fall through to the next
			 * loop that uses the size information to stop
			 * searching for matching vma's.
			 */
			retval = 0;
			vma = next;
			break;
		}
		vma = next;
	}

	/*
	 * We need look no further than the maximum address a fragment
	 * could possibly have landed at. Also cast things to loff_t to
L
Lucas De Marchi 已提交
1291
	 * prevent overflows and make comparisons vs. equal-width types.
L
Linus Torvalds 已提交
1292
	 */
1293
	size = PAGE_ALIGN(size);
L
Linus Torvalds 已提交
1294 1295 1296 1297
	while (vma && (loff_t)(vma->vm_end - addr) <= size) {
		next = vma->vm_next;

		/* finding a matching vma now does not alter retval */
1298
		if ((vma->vm_ops == &shm_vm_ops) &&
L
Linus Torvalds 已提交
1299 1300 1301 1302 1303 1304
			(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff)

			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
		vma = next;
	}

1305 1306 1307
#else /* CONFIG_MMU */
	/* under NOMMU conditions, the exact address to be destroyed must be
	 * given */
1308
	if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
1309 1310 1311 1312 1313 1314
		do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
		retval = 0;
	}

#endif

L
Linus Torvalds 已提交
1315 1316 1317 1318 1319
	up_write(&mm->mmap_sem);
	return retval;
}

#ifdef CONFIG_PROC_FS
1320
static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
L
Linus Torvalds 已提交
1321
{
1322
	struct user_namespace *user_ns = seq_user_ns(s);
1323
	struct shmid_kernel *shp = it;
1324 1325 1326
	unsigned long rss = 0, swp = 0;

	shm_add_rss_swap(shp, &rss, &swp);
L
Linus Torvalds 已提交
1327

1328 1329 1330 1331 1332
#if BITS_PER_LONG <= 32
#define SIZE_SPEC "%10lu"
#else
#define SIZE_SPEC "%21lu"
#endif
L
Linus Torvalds 已提交
1333

1334 1335
	return seq_printf(s,
			  "%10d %10d  %4o " SIZE_SPEC " %5u %5u  "
1336 1337
			  "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
			  SIZE_SPEC " " SIZE_SPEC "\n",
1338
			  shp->shm_perm.key,
N
Nadia Derbey 已提交
1339
			  shp->shm_perm.id,
A
Andrew Morton 已提交
1340
			  shp->shm_perm.mode,
1341 1342 1343
			  shp->shm_segsz,
			  shp->shm_cprid,
			  shp->shm_lprid,
1344
			  shp->shm_nattch,
1345 1346 1347 1348
			  from_kuid_munged(user_ns, shp->shm_perm.uid),
			  from_kgid_munged(user_ns, shp->shm_perm.gid),
			  from_kuid_munged(user_ns, shp->shm_perm.cuid),
			  from_kgid_munged(user_ns, shp->shm_perm.cgid),
1349 1350
			  shp->shm_atim,
			  shp->shm_dtim,
1351 1352 1353
			  shp->shm_ctim,
			  rss * PAGE_SIZE,
			  swp * PAGE_SIZE);
L
Linus Torvalds 已提交
1354 1355
}
#endif