sys.c 57.5 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6
/*
 *  linux/kernel/sys.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

7
#include <linux/export.h>
L
Linus Torvalds 已提交
8 9 10 11 12 13 14
#include <linux/mm.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/reboot.h>
#include <linux/prctl.h>
#include <linux/highuid.h>
#include <linux/fs.h>
15
#include <linux/kmod.h>
16
#include <linux/perf_event.h>
17
#include <linux/resource.h>
18
#include <linux/kernel.h>
L
Linus Torvalds 已提交
19
#include <linux/workqueue.h>
20
#include <linux/capability.h>
L
Linus Torvalds 已提交
21 22 23 24 25 26 27 28
#include <linux/device.h>
#include <linux/key.h>
#include <linux/times.h>
#include <linux/posix-timers.h>
#include <linux/security.h>
#include <linux/dcookies.h>
#include <linux/suspend.h>
#include <linux/tty.h>
29
#include <linux/signal.h>
M
Matt Helsley 已提交
30
#include <linux/cn_proc.h>
31
#include <linux/getcpu.h>
32
#include <linux/task_io_accounting_ops.h>
33
#include <linux/seccomp.h>
M
Mark Lord 已提交
34
#include <linux/cpu.h>
35
#include <linux/personality.h>
36
#include <linux/ptrace.h>
37
#include <linux/fs_struct.h>
38 39
#include <linux/file.h>
#include <linux/mount.h>
40
#include <linux/gfp.h>
41
#include <linux/syscore_ops.h>
42 43
#include <linux/version.h>
#include <linux/ctype.h>
L
Linus Torvalds 已提交
44 45 46

#include <linux/compat.h>
#include <linux/syscalls.h>
47
#include <linux/kprobes.h>
48
#include <linux/user_namespace.h>
49
#include <linux/binfmts.h>
L
Linus Torvalds 已提交
50

51 52 53 54 55
#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/uidgid.h>
#include <linux/cred.h>

56
#include <linux/kmsg_dump.h>
57 58
/* Move somewhere else to avoid recompiling? */
#include <generated/utsrelease.h>
59

L
Linus Torvalds 已提交
60 61 62 63 64
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/unistd.h>

/*
 * Fallback definitions.  Each macro below is defined only when nothing
 * earlier in the translation unit (presumably an architecture header)
 * has already provided it, and the fallback simply evaluates to -EINVAL
 * without looking at its arguments.
 */
#ifndef SET_UNALIGN_CTL
# define SET_UNALIGN_CTL(a, b)	(-EINVAL)
#endif
#ifndef GET_UNALIGN_CTL
# define GET_UNALIGN_CTL(a, b)	(-EINVAL)
#endif
#ifndef SET_FPEMU_CTL
# define SET_FPEMU_CTL(a, b)	(-EINVAL)
#endif
#ifndef GET_FPEMU_CTL
# define GET_FPEMU_CTL(a, b)	(-EINVAL)
#endif
#ifndef SET_FPEXC_CTL
# define SET_FPEXC_CTL(a, b)	(-EINVAL)
#endif
#ifndef GET_FPEXC_CTL
# define GET_FPEXC_CTL(a, b)	(-EINVAL)
#endif
#ifndef GET_ENDIAN
# define GET_ENDIAN(a, b)	(-EINVAL)
#endif
#ifndef SET_ENDIAN
# define SET_ENDIAN(a, b)	(-EINVAL)
#endif
#ifndef GET_TSC_CTL
# define GET_TSC_CTL(a)		(-EINVAL)
#endif
#ifndef SET_TSC_CTL
# define SET_TSC_CTL(a)		(-EINVAL)
#endif
#ifndef MPX_ENABLE_MANAGEMENT
# define MPX_ENABLE_MANAGEMENT()	(-EINVAL)
#endif
#ifndef MPX_DISABLE_MANAGEMENT
# define MPX_DISABLE_MANAGEMENT()	(-EINVAL)
#endif
#ifndef GET_FP_MODE
# define GET_FP_MODE(a)		(-EINVAL)
#endif
#ifndef SET_FP_MODE
# define SET_FP_MODE(a, b)	(-EINVAL)
#endif
L
Linus Torvalds 已提交
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128

/*
 * this is where the system-wide overflow UID and GID are defined, for
 * architectures that now have 32-bit UID/GID but didn't in the past
 */

int overflowuid = DEFAULT_OVERFLOWUID;
int overflowgid = DEFAULT_OVERFLOWGID;

EXPORT_SYMBOL(overflowuid);
EXPORT_SYMBOL(overflowgid);

/*
 * the same as above, but for filesystems which can only store a 16-bit
 * UID and GID. as such, this is needed on all architectures
 */

int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
int fs_overflowgid = DEFAULT_FS_OVERFLOWUID;

EXPORT_SYMBOL(fs_overflowuid);
EXPORT_SYMBOL(fs_overflowgid);

129 130 131 132 133 134 135 136 137 138
/*
 * Returns true if current's euid is same as p's uid or euid,
 * or has CAP_SYS_NICE to p's user_ns.
 *
 * Called with rcu_read_lock, creds are safe
 */
static bool set_one_prio_perm(struct task_struct *p)
{
	const struct cred *cred = current_cred(), *pcred = __task_cred(p);

139 140
	if (uid_eq(pcred->uid,  cred->euid) ||
	    uid_eq(pcred->euid, cred->euid))
141
		return true;
142
	if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
143 144 145 146
		return true;
	return false;
}

147 148 149 150
/*
 * set the priority of a task
 * - the caller must hold the RCU read lock
 */
L
Linus Torvalds 已提交
151 152 153 154
static int set_one_prio(struct task_struct *p, int niceval, int error)
{
	int no_nice;

155
	if (!set_one_prio_perm(p)) {
L
Linus Torvalds 已提交
156 157 158
		error = -EPERM;
		goto out;
	}
M
Matt Mackall 已提交
159
	if (niceval < task_nice(p) && !can_nice(p, niceval)) {
L
Linus Torvalds 已提交
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
		error = -EACCES;
		goto out;
	}
	no_nice = security_task_setnice(p, niceval);
	if (no_nice) {
		error = no_nice;
		goto out;
	}
	if (error == -ESRCH)
		error = 0;
	set_user_nice(p, niceval);
out:
	return error;
}

175
/*
 * setpriority - set the nice value of a task, a process group or all
 * tasks of a user.
 *
 * @which:   PRIO_PROCESS, PRIO_PGRP or PRIO_USER; a value outside
 *           PRIO_PROCESS..PRIO_USER yields -EINVAL
 * @who:     id interpreted according to @which; 0 selects the caller
 *           (current, current's process group, or cred->uid)
 * @niceval: requested nice value, clamped to MIN_NICE..MAX_NICE
 *
 * The result starts out as -ESRCH and is threaded through set_one_prio()
 * for every matching task, so -ESRCH is returned when nothing matched.
 */
SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
{
	struct task_struct *g, *p;
	struct user_struct *user;
	const struct cred *cred = current_cred();
	int error = -EINVAL;
	struct pid *pgrp;
	kuid_t uid;

	if (which > PRIO_USER || which < PRIO_PROCESS)
		goto out;

	/* normalize: avoid signed division (rounding problems) */
	error = -ESRCH;
	if (niceval < MIN_NICE)
		niceval = MIN_NICE;
	if (niceval > MAX_NICE)
		niceval = MAX_NICE;

	rcu_read_lock();
	read_lock(&tasklist_lock);
	switch (which) {
	case PRIO_PROCESS:
		if (who)
			p = find_task_by_vpid(who);
		else
			p = current;
		if (p)
			error = set_one_prio(p, niceval, error);
		break;
	case PRIO_PGRP:
		if (who)
			pgrp = find_vpid(who);
		else
			pgrp = task_pgrp(current);
		do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
			error = set_one_prio(p, niceval, error);
		} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
		break;
	case PRIO_USER:
		uid = make_kuid(cred->user_ns, who);
		user = cred->user;
		if (!who)
			uid = cred->uid;
		else if (!uid_eq(uid, cred->uid)) {
			user = find_user(uid);
			if (!user)
				goto out_unlock;	/* No processes for this user */
		}
		do_each_thread(g, p) {
			if (uid_eq(task_uid(p), uid))
				error = set_one_prio(p, niceval, error);
		} while_each_thread(g, p);
		/* Same condition as the find_user() call above. */
		if (!uid_eq(uid, cred->uid))
			free_uid(user);		/* For find_user() */
		break;
	}
out_unlock:
	read_unlock(&tasklist_lock);
	rcu_read_unlock();
out:
	return error;
}

/*
 * Ugh. To avoid negative return values, "getpriority()" will
 * not return the normal nice-value, but a negated value that
 * has been offset by 20 (ie it returns 40..1 instead of -20..19)
 * to stay compatible.
 */
245
SYSCALL_DEFINE2(getpriority, int, which, int, who)
L
Linus Torvalds 已提交
246 247 248
{
	struct task_struct *g, *p;
	struct user_struct *user;
249
	const struct cred *cred = current_cred();
L
Linus Torvalds 已提交
250
	long niceval, retval = -ESRCH;
251
	struct pid *pgrp;
252
	kuid_t uid;
L
Linus Torvalds 已提交
253

254
	if (which > PRIO_USER || which < PRIO_PROCESS)
L
Linus Torvalds 已提交
255 256
		return -EINVAL;

257
	rcu_read_lock();
L
Linus Torvalds 已提交
258 259
	read_lock(&tasklist_lock);
	switch (which) {
V
vishnu.ps 已提交
260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
	case PRIO_PROCESS:
		if (who)
			p = find_task_by_vpid(who);
		else
			p = current;
		if (p) {
			niceval = nice_to_rlimit(task_nice(p));
			if (niceval > retval)
				retval = niceval;
		}
		break;
	case PRIO_PGRP:
		if (who)
			pgrp = find_vpid(who);
		else
			pgrp = task_pgrp(current);
		do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
			niceval = nice_to_rlimit(task_nice(p));
			if (niceval > retval)
				retval = niceval;
		} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
		break;
	case PRIO_USER:
		uid = make_kuid(cred->user_ns, who);
		user = cred->user;
		if (!who)
			uid = cred->uid;
		else if (!uid_eq(uid, cred->uid)) {
			user = find_user(uid);
			if (!user)
				goto out_unlock;	/* No processes for this user */
		}
		do_each_thread(g, p) {
			if (uid_eq(task_uid(p), uid)) {
294
				niceval = nice_to_rlimit(task_nice(p));
L
Linus Torvalds 已提交
295 296 297
				if (niceval > retval)
					retval = niceval;
			}
V
vishnu.ps 已提交
298 299 300 301
		} while_each_thread(g, p);
		if (!uid_eq(uid, cred->uid))
			free_uid(user);		/* for find_user() */
		break;
L
Linus Torvalds 已提交
302 303 304
	}
out_unlock:
	read_unlock(&tasklist_lock);
305
	rcu_read_unlock();
L
Linus Torvalds 已提交
306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322

	return retval;
}

/*
 * Unprivileged users may change the real gid to the effective gid
 * or vice versa.  (BSD-style)
 *
 * If you set the real gid at all, or set the effective gid to a value not
 * equal to the real gid, then the saved gid is set to the new effective gid.
 *
 * This makes it possible for a setgid program to completely drop its
 * privileges, which is often a useful assertion to make when you are doing
 * a security audit over a program.
 *
 * The general idea is that a program which uses just setregid() will be
 * 100% compatible with BSD.  A program which uses just setgid() will be
 * 100% compatible with POSIX with saved IDs.
 *
 * SMP: There are no races; the GIDs are checked only by filesystem
 *      operations (as far as semantic preservation is concerned).
 */
328
#ifdef CONFIG_MULTIUSER
329
SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
L
Linus Torvalds 已提交
330
{
331
	struct user_namespace *ns = current_user_ns();
D
David Howells 已提交
332 333
	const struct cred *old;
	struct cred *new;
L
Linus Torvalds 已提交
334
	int retval;
335 336 337 338 339 340 341 342 343
	kgid_t krgid, kegid;

	krgid = make_kgid(ns, rgid);
	kegid = make_kgid(ns, egid);

	if ((rgid != (gid_t) -1) && !gid_valid(krgid))
		return -EINVAL;
	if ((egid != (gid_t) -1) && !gid_valid(kegid))
		return -EINVAL;
L
Linus Torvalds 已提交
344

D
David Howells 已提交
345 346 347 348 349 350
	new = prepare_creds();
	if (!new)
		return -ENOMEM;
	old = current_cred();

	retval = -EPERM;
L
Linus Torvalds 已提交
351
	if (rgid != (gid_t) -1) {
352 353
		if (gid_eq(old->gid, krgid) ||
		    gid_eq(old->egid, krgid) ||
354
		    ns_capable(old->user_ns, CAP_SETGID))
355
			new->gid = krgid;
L
Linus Torvalds 已提交
356
		else
D
David Howells 已提交
357
			goto error;
L
Linus Torvalds 已提交
358 359
	}
	if (egid != (gid_t) -1) {
360 361 362
		if (gid_eq(old->gid, kegid) ||
		    gid_eq(old->egid, kegid) ||
		    gid_eq(old->sgid, kegid) ||
363
		    ns_capable(old->user_ns, CAP_SETGID))
364
			new->egid = kegid;
365
		else
D
David Howells 已提交
366
			goto error;
L
Linus Torvalds 已提交
367
	}
D
David Howells 已提交
368

L
Linus Torvalds 已提交
369
	if (rgid != (gid_t) -1 ||
370
	    (egid != (gid_t) -1 && !gid_eq(kegid, old->gid)))
D
David Howells 已提交
371 372 373 374 375 376 377 378
		new->sgid = new->egid;
	new->fsgid = new->egid;

	return commit_creds(new);

error:
	abort_creds(new);
	return retval;
L
Linus Torvalds 已提交
379 380 381
}

/*
V
vishnu.ps 已提交
382
 * setgid() is implemented like SysV w/ SAVED_IDS
L
Linus Torvalds 已提交
383 384 385
 *
 * SMP: Same implicit races as above.
 */
386
SYSCALL_DEFINE1(setgid, gid_t, gid)
L
Linus Torvalds 已提交
387
{
388
	struct user_namespace *ns = current_user_ns();
D
David Howells 已提交
389 390
	const struct cred *old;
	struct cred *new;
L
Linus Torvalds 已提交
391
	int retval;
392 393 394 395 396
	kgid_t kgid;

	kgid = make_kgid(ns, gid);
	if (!gid_valid(kgid))
		return -EINVAL;
L
Linus Torvalds 已提交
397

D
David Howells 已提交
398 399 400 401 402 403
	new = prepare_creds();
	if (!new)
		return -ENOMEM;
	old = current_cred();

	retval = -EPERM;
404
	if (ns_capable(old->user_ns, CAP_SETGID))
405 406 407
		new->gid = new->egid = new->sgid = new->fsgid = kgid;
	else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
		new->egid = new->fsgid = kgid;
L
Linus Torvalds 已提交
408
	else
D
David Howells 已提交
409
		goto error;
L
Linus Torvalds 已提交
410

D
David Howells 已提交
411 412 413 414 415
	return commit_creds(new);

error:
	abort_creds(new);
	return retval;
L
Linus Torvalds 已提交
416
}
417

D
David Howells 已提交
418 419 420 421
/*
 * change the user struct in a credentials set to match the new UID
 */
static int set_user(struct cred *new)
L
Linus Torvalds 已提交
422 423 424
{
	struct user_struct *new_user;

425
	new_user = alloc_uid(new->uid);
L
Linus Torvalds 已提交
426 427 428
	if (!new_user)
		return -EAGAIN;

429 430 431 432 433 434 435
	/*
	 * We don't fail in case of NPROC limit excess here because too many
	 * poorly written programs don't check set*uid() return code, assuming
	 * it never fails if called by root.  We may still enforce NPROC limit
	 * for programs doing set*uid()+execve() by harmlessly deferring the
	 * failure to the execve() stage.
	 */
436
	if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
437 438 439 440
			new_user != INIT_USER)
		current->flags |= PF_NPROC_EXCEEDED;
	else
		current->flags &= ~PF_NPROC_EXCEEDED;
L
Linus Torvalds 已提交
441

D
David Howells 已提交
442 443
	free_uid(new->user);
	new->user = new_user;
L
Linus Torvalds 已提交
444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459
	return 0;
}

/*
 * Unprivileged users may change the real uid to the effective uid
 * or vice versa.  (BSD-style)
 *
 * If you set the real uid at all, or set the effective uid to a value not
 * equal to the real uid, then the saved uid is set to the new effective uid.
 *
 * This makes it possible for a setuid program to completely drop its
 * privileges, which is often a useful assertion to make when you are doing
 * a security audit over a program.
 *
 * The general idea is that a program which uses just setreuid() will be
 * 100% compatible with BSD.  A program which uses just setuid() will be
V
vishnu.ps 已提交
460
 * 100% compatible with POSIX with saved IDs.
L
Linus Torvalds 已提交
461
 */
462
SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
L
Linus Torvalds 已提交
463
{
464
	struct user_namespace *ns = current_user_ns();
D
David Howells 已提交
465 466
	const struct cred *old;
	struct cred *new;
L
Linus Torvalds 已提交
467
	int retval;
468 469 470 471 472 473 474 475 476
	kuid_t kruid, keuid;

	kruid = make_kuid(ns, ruid);
	keuid = make_kuid(ns, euid);

	if ((ruid != (uid_t) -1) && !uid_valid(kruid))
		return -EINVAL;
	if ((euid != (uid_t) -1) && !uid_valid(keuid))
		return -EINVAL;
L
Linus Torvalds 已提交
477

D
David Howells 已提交
478 479 480 481 482 483
	new = prepare_creds();
	if (!new)
		return -ENOMEM;
	old = current_cred();

	retval = -EPERM;
L
Linus Torvalds 已提交
484
	if (ruid != (uid_t) -1) {
485 486 487
		new->uid = kruid;
		if (!uid_eq(old->uid, kruid) &&
		    !uid_eq(old->euid, kruid) &&
488
		    !ns_capable(old->user_ns, CAP_SETUID))
D
David Howells 已提交
489
			goto error;
L
Linus Torvalds 已提交
490 491 492
	}

	if (euid != (uid_t) -1) {
493 494 495 496
		new->euid = keuid;
		if (!uid_eq(old->uid, keuid) &&
		    !uid_eq(old->euid, keuid) &&
		    !uid_eq(old->suid, keuid) &&
497
		    !ns_capable(old->user_ns, CAP_SETUID))
D
David Howells 已提交
498
			goto error;
L
Linus Torvalds 已提交
499 500
	}

501
	if (!uid_eq(new->uid, old->uid)) {
502 503 504 505
		retval = set_user(new);
		if (retval < 0)
			goto error;
	}
L
Linus Torvalds 已提交
506
	if (ruid != (uid_t) -1 ||
507
	    (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
D
David Howells 已提交
508 509
		new->suid = new->euid;
	new->fsuid = new->euid;
L
Linus Torvalds 已提交
510

D
David Howells 已提交
511 512 513
	retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
	if (retval < 0)
		goto error;
L
Linus Torvalds 已提交
514

D
David Howells 已提交
515
	return commit_creds(new);
L
Linus Torvalds 已提交
516

D
David Howells 已提交
517 518 519 520
error:
	abort_creds(new);
	return retval;
}
V
vishnu.ps 已提交
521

L
Linus Torvalds 已提交
522
/*
V
vishnu.ps 已提交
523 524
 * setuid() is implemented like SysV with SAVED_IDS
 *
L
Linus Torvalds 已提交
525
 * Note that SAVED_ID's is deficient in that a setuid root program
V
vishnu.ps 已提交
526
 * like sendmail, for example, cannot set its uid to be a normal
L
Linus Torvalds 已提交
527 528 529 530
 * user and then switch back, because if you're root, setuid() sets
 * the saved uid too.  If you don't like this, blame the bright people
 * in the POSIX committee and/or USG.  Note that the BSD-style setreuid()
 * will allow a root program to temporarily drop privileges and be able to
V
vishnu.ps 已提交
531
 * regain them by swapping the real and effective uid.
L
Linus Torvalds 已提交
532
 */
533
SYSCALL_DEFINE1(setuid, uid_t, uid)
L
Linus Torvalds 已提交
534
{
535
	struct user_namespace *ns = current_user_ns();
D
David Howells 已提交
536 537
	const struct cred *old;
	struct cred *new;
L
Linus Torvalds 已提交
538
	int retval;
539 540 541 542 543
	kuid_t kuid;

	kuid = make_kuid(ns, uid);
	if (!uid_valid(kuid))
		return -EINVAL;
L
Linus Torvalds 已提交
544

D
David Howells 已提交
545 546 547 548 549 550
	new = prepare_creds();
	if (!new)
		return -ENOMEM;
	old = current_cred();

	retval = -EPERM;
551
	if (ns_capable(old->user_ns, CAP_SETUID)) {
552 553
		new->suid = new->uid = kuid;
		if (!uid_eq(kuid, old->uid)) {
554 555 556
			retval = set_user(new);
			if (retval < 0)
				goto error;
D
David Howells 已提交
557
		}
558
	} else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
D
David Howells 已提交
559
		goto error;
L
Linus Torvalds 已提交
560 561
	}

562
	new->fsuid = new->euid = kuid;
D
David Howells 已提交
563 564 565 566

	retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
	if (retval < 0)
		goto error;
L
Linus Torvalds 已提交
567

D
David Howells 已提交
568
	return commit_creds(new);
L
Linus Torvalds 已提交
569

D
David Howells 已提交
570 571 572
error:
	abort_creds(new);
	return retval;
L
Linus Torvalds 已提交
573 574 575 576 577 578 579
}


/*
 * This function implements a generic ability to update ruid, euid,
 * and suid.  This allows you to implement the 4.4 compatible seteuid().
 */
580
SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
L
Linus Torvalds 已提交
581
{
582
	struct user_namespace *ns = current_user_ns();
D
David Howells 已提交
583 584
	const struct cred *old;
	struct cred *new;
L
Linus Torvalds 已提交
585
	int retval;
586 587 588 589 590 591 592 593 594 595 596 597 598 599
	kuid_t kruid, keuid, ksuid;

	kruid = make_kuid(ns, ruid);
	keuid = make_kuid(ns, euid);
	ksuid = make_kuid(ns, suid);

	if ((ruid != (uid_t) -1) && !uid_valid(kruid))
		return -EINVAL;

	if ((euid != (uid_t) -1) && !uid_valid(keuid))
		return -EINVAL;

	if ((suid != (uid_t) -1) && !uid_valid(ksuid))
		return -EINVAL;
L
Linus Torvalds 已提交
600

D
David Howells 已提交
601 602 603 604 605
	new = prepare_creds();
	if (!new)
		return -ENOMEM;

	old = current_cred();
L
Linus Torvalds 已提交
606

D
David Howells 已提交
607
	retval = -EPERM;
608
	if (!ns_capable(old->user_ns, CAP_SETUID)) {
609 610
		if (ruid != (uid_t) -1        && !uid_eq(kruid, old->uid) &&
		    !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
D
David Howells 已提交
611
			goto error;
612 613
		if (euid != (uid_t) -1        && !uid_eq(keuid, old->uid) &&
		    !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
D
David Howells 已提交
614
			goto error;
615 616
		if (suid != (uid_t) -1        && !uid_eq(ksuid, old->uid) &&
		    !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
D
David Howells 已提交
617
			goto error;
L
Linus Torvalds 已提交
618
	}
D
David Howells 已提交
619

L
Linus Torvalds 已提交
620
	if (ruid != (uid_t) -1) {
621 622
		new->uid = kruid;
		if (!uid_eq(kruid, old->uid)) {
623 624 625 626
			retval = set_user(new);
			if (retval < 0)
				goto error;
		}
L
Linus Torvalds 已提交
627
	}
D
David Howells 已提交
628
	if (euid != (uid_t) -1)
629
		new->euid = keuid;
L
Linus Torvalds 已提交
630
	if (suid != (uid_t) -1)
631
		new->suid = ksuid;
D
David Howells 已提交
632
	new->fsuid = new->euid;
L
Linus Torvalds 已提交
633

D
David Howells 已提交
634 635 636
	retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
	if (retval < 0)
		goto error;
L
Linus Torvalds 已提交
637

D
David Howells 已提交
638
	return commit_creds(new);
L
Linus Torvalds 已提交
639

D
David Howells 已提交
640 641 642
error:
	abort_creds(new);
	return retval;
L
Linus Torvalds 已提交
643 644
}

645
/*
 * getresuid - copy the caller's real, effective and saved uid to user
 * space.
 *
 * @ruidp, @euidp, @suidp: user pointers receiving the three ids, each
 *                         converted with from_kuid_munged() for the
 *                         caller's user namespace
 *
 * The ids are written in that order and the first non-zero put_user()
 * result stops the sequence and is returned; 0 means all three were
 * written.
 */
SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
{
	const struct cred *cred = current_cred();
	int retval;
	uid_t ruid, euid, suid;

	ruid = from_kuid_munged(cred->user_ns, cred->uid);
	euid = from_kuid_munged(cred->user_ns, cred->euid);
	suid = from_kuid_munged(cred->user_ns, cred->suid);

	retval = put_user(ruid, ruidp);
	if (!retval) {
		retval = put_user(euid, euidp);
		if (!retval)
			retval = put_user(suid, suidp);
	}

	return retval;
}

/*
 * Same as above, but for rgid, egid, sgid.
 */
667
SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
L
Linus Torvalds 已提交
668
{
669
	struct user_namespace *ns = current_user_ns();
D
David Howells 已提交
670 671
	const struct cred *old;
	struct cred *new;
L
Linus Torvalds 已提交
672
	int retval;
673 674 675 676 677 678 679 680 681 682 683 684
	kgid_t krgid, kegid, ksgid;

	krgid = make_kgid(ns, rgid);
	kegid = make_kgid(ns, egid);
	ksgid = make_kgid(ns, sgid);

	if ((rgid != (gid_t) -1) && !gid_valid(krgid))
		return -EINVAL;
	if ((egid != (gid_t) -1) && !gid_valid(kegid))
		return -EINVAL;
	if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
		return -EINVAL;
L
Linus Torvalds 已提交
685

D
David Howells 已提交
686 687 688 689 690 691
	new = prepare_creds();
	if (!new)
		return -ENOMEM;
	old = current_cred();

	retval = -EPERM;
692
	if (!ns_capable(old->user_ns, CAP_SETGID)) {
693 694
		if (rgid != (gid_t) -1        && !gid_eq(krgid, old->gid) &&
		    !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
D
David Howells 已提交
695
			goto error;
696 697
		if (egid != (gid_t) -1        && !gid_eq(kegid, old->gid) &&
		    !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
D
David Howells 已提交
698
			goto error;
699 700
		if (sgid != (gid_t) -1        && !gid_eq(ksgid, old->gid) &&
		    !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
D
David Howells 已提交
701
			goto error;
L
Linus Torvalds 已提交
702
	}
D
David Howells 已提交
703

L
Linus Torvalds 已提交
704
	if (rgid != (gid_t) -1)
705
		new->gid = krgid;
D
David Howells 已提交
706
	if (egid != (gid_t) -1)
707
		new->egid = kegid;
L
Linus Torvalds 已提交
708
	if (sgid != (gid_t) -1)
709
		new->sgid = ksgid;
D
David Howells 已提交
710
	new->fsgid = new->egid;
L
Linus Torvalds 已提交
711

D
David Howells 已提交
712 713 714 715 716
	return commit_creds(new);

error:
	abort_creds(new);
	return retval;
L
Linus Torvalds 已提交
717 718
}

719
/*
 * getresgid - copy the caller's real, effective and saved gid to user
 * space.
 *
 * @rgidp, @egidp, @sgidp: user pointers receiving the three ids, each
 *                         converted with from_kgid_munged() for the
 *                         caller's user namespace
 *
 * The ids are written in that order and the first non-zero put_user()
 * result stops the sequence and is returned; 0 means all three were
 * written.
 */
SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
{
	const struct cred *cred = current_cred();
	int retval;
	gid_t rgid, egid, sgid;

	rgid = from_kgid_munged(cred->user_ns, cred->gid);
	egid = from_kgid_munged(cred->user_ns, cred->egid);
	sgid = from_kgid_munged(cred->user_ns, cred->sgid);

	retval = put_user(rgid, rgidp);
	if (!retval) {
		retval = put_user(egid, egidp);
		if (!retval)
			retval = put_user(sgid, sgidp);
	}

	return retval;
}


/*
 * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
 * is used for "access()" and for the NFS daemon (letting nfsd stay at
 * whatever uid it wants to). It normally shadows "euid", except when
 * explicitly set by setfsuid() or for access..
 */
746
SYSCALL_DEFINE1(setfsuid, uid_t, uid)
L
Linus Torvalds 已提交
747
{
D
David Howells 已提交
748 749 750
	const struct cred *old;
	struct cred *new;
	uid_t old_fsuid;
751 752 753 754 755 756 757 758
	kuid_t kuid;

	old = current_cred();
	old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);

	kuid = make_kuid(old->user_ns, uid);
	if (!uid_valid(kuid))
		return old_fsuid;
L
Linus Torvalds 已提交
759

D
David Howells 已提交
760 761
	new = prepare_creds();
	if (!new)
762
		return old_fsuid;
L
Linus Torvalds 已提交
763

764 765
	if (uid_eq(kuid, old->uid)  || uid_eq(kuid, old->euid)  ||
	    uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
766
	    ns_capable(old->user_ns, CAP_SETUID)) {
767 768
		if (!uid_eq(kuid, old->fsuid)) {
			new->fsuid = kuid;
D
David Howells 已提交
769 770
			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
				goto change_okay;
L
Linus Torvalds 已提交
771 772 773
		}
	}

D
David Howells 已提交
774 775
	abort_creds(new);
	return old_fsuid;
L
Linus Torvalds 已提交
776

D
David Howells 已提交
777 778
change_okay:
	commit_creds(new);
L
Linus Torvalds 已提交
779 780 781 782
	return old_fsuid;
}

/*
783
 * Samma på svenska..
L
Linus Torvalds 已提交
784
 */
785
SYSCALL_DEFINE1(setfsgid, gid_t, gid)
L
Linus Torvalds 已提交
786
{
D
David Howells 已提交
787 788 789
	const struct cred *old;
	struct cred *new;
	gid_t old_fsgid;
790 791 792 793 794 795 796 797
	kgid_t kgid;

	old = current_cred();
	old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);

	kgid = make_kgid(old->user_ns, gid);
	if (!gid_valid(kgid))
		return old_fsgid;
D
David Howells 已提交
798 799 800

	new = prepare_creds();
	if (!new)
801
		return old_fsgid;
L
Linus Torvalds 已提交
802

803 804
	if (gid_eq(kgid, old->gid)  || gid_eq(kgid, old->egid)  ||
	    gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
805
	    ns_capable(old->user_ns, CAP_SETGID)) {
806 807
		if (!gid_eq(kgid, old->fsgid)) {
			new->fsgid = kgid;
D
David Howells 已提交
808
			goto change_okay;
L
Linus Torvalds 已提交
809 810
		}
	}
D
David Howells 已提交
811 812 813 814 815 816

	abort_creds(new);
	return old_fsgid;

change_okay:
	commit_creds(new);
L
Linus Torvalds 已提交
817 818
	return old_fsgid;
}
819
#endif /* CONFIG_MULTIUSER */
L
Linus Torvalds 已提交
820

821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881
/**
 * sys_getpid - report the caller's thread group id
 *
 * The name is historical: the value handed back is the tgid, not the
 * per-thread pid.  The two only differ for threads created with
 * CLONE_THREAD, all of which share the tgid of their group.
 *
 * SMP safe, because current->tgid never changes.
 */
SYSCALL_DEFINE0(getpid)
{
	struct task_struct *self = current;

	return task_tgid_vnr(self);
}

/* Thread ID, i.e. what the kernel internally calls the "pid" */
SYSCALL_DEFINE0(gettid)
{
	struct task_struct *self = current;

	return task_pid_vnr(self);
}

/*
 * ->real_parent may change underneath us, so reading it is not SMP-safe
 * as such.  A stale value is still usable while rcu_read_lock() is held;
 * see release_task()->call_rcu(delayed_put_task_struct).
 */
SYSCALL_DEFINE0(getppid)
{
	struct task_struct *parent;
	int ppid;

	rcu_read_lock();
	parent = rcu_dereference(current->real_parent);
	ppid = task_tgid_vnr(parent);
	rcu_read_unlock();

	return ppid;
}

/* Real uid of the caller, as seen from its own user namespace. */
SYSCALL_DEFINE0(getuid)
{
	/* SMP safe: only the task itself changes this value */
	struct user_namespace *ns = current_user_ns();

	return from_kuid_munged(ns, current_uid());
}

/* Effective uid of the caller, as seen from its own user namespace. */
SYSCALL_DEFINE0(geteuid)
{
	/* SMP safe: only the task itself changes this value */
	struct user_namespace *ns = current_user_ns();

	return from_kuid_munged(ns, current_euid());
}

/* Real gid of the caller, as seen from its own user namespace. */
SYSCALL_DEFINE0(getgid)
{
	/* SMP safe: only the task itself changes this value */
	struct user_namespace *ns = current_user_ns();

	return from_kgid_munged(ns, current_gid());
}

/* Effective gid of the caller, as seen from its own user namespace. */
SYSCALL_DEFINE0(getegid)
{
	/* SMP safe: only the task itself changes this value */
	struct user_namespace *ns = current_user_ns();

	return from_kgid_munged(ns, current_egid());
}

882 883
void do_sys_times(struct tms *tms)
{
884
	cputime_t tgutime, tgstime, cutime, cstime;
885

886
	thread_group_cputime_adjusted(current, &tgutime, &tgstime);
887 888
	cutime = current->signal->cutime;
	cstime = current->signal->cstime;
889 890
	tms->tms_utime = cputime_to_clock_t(tgutime);
	tms->tms_stime = cputime_to_clock_t(tgstime);
891 892 893 894
	tms->tms_cutime = cputime_to_clock_t(cutime);
	tms->tms_cstime = cputime_to_clock_t(cstime);
}

895
/*
 * times - report process times and the current tick count.
 *
 * @tbuf: optional user pointer; when non-NULL it receives the struct tms
 *        produced by do_sys_times()
 *
 * Returns -EFAULT when the copy to @tbuf fails, otherwise the 64-bit
 * jiffies counter converted with jiffies_64_to_clock_t() and cast to long.
 */
SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
{
	if (tbuf) {
		struct tms tmp;

		do_sys_times(&tmp);
		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
			return -EFAULT;
	}
	/*
	 * NOTE(review): presumably needed because the returned tick count
	 * may look like a negative errno on some architectures - confirm.
	 */
	force_successful_syscall_return();
	return (long) jiffies_64_to_clock_t(get_jiffies_64());
}

/*
 * This needs some heavy checking ...
 * I just haven't the stomach for it. I also don't fully
 * understand sessions/pgrp etc. Let somebody who does explain it.
 *
 * OK, I think I have the protection semantics right.... this is really
 * only important on a multi-user system anyway, to make sure one user
 * can't send a signal to a process owned by another.  -TYT, 12/12/91
 *
O
Oleg Nesterov 已提交
917
 * !PF_FORKNOEXEC check to conform completely to POSIX.
L
Linus Torvalds 已提交
918
 */
919
SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
L
Linus Torvalds 已提交
920 921
{
	struct task_struct *p;
922
	struct task_struct *group_leader = current->group_leader;
923 924
	struct pid *pgrp;
	int err;
L
Linus Torvalds 已提交
925 926

	if (!pid)
927
		pid = task_pid_vnr(group_leader);
L
Linus Torvalds 已提交
928 929 930 931
	if (!pgid)
		pgid = pid;
	if (pgid < 0)
		return -EINVAL;
932
	rcu_read_lock();
L
Linus Torvalds 已提交
933 934 935 936 937 938 939

	/* From this point forward we keep holding onto the tasklist lock
	 * so that our parent does not change from under us. -DaveM
	 */
	write_lock_irq(&tasklist_lock);

	err = -ESRCH;
940
	p = find_task_by_vpid(pid);
L
Linus Torvalds 已提交
941 942 943 944 945 946 947
	if (!p)
		goto out;

	err = -EINVAL;
	if (!thread_group_leader(p))
		goto out;

948
	if (same_thread_group(p->real_parent, group_leader)) {
L
Linus Torvalds 已提交
949
		err = -EPERM;
950
		if (task_session(p) != task_session(group_leader))
L
Linus Torvalds 已提交
951 952
			goto out;
		err = -EACCES;
O
Oleg Nesterov 已提交
953
		if (!(p->flags & PF_FORKNOEXEC))
L
Linus Torvalds 已提交
954 955 956
			goto out;
	} else {
		err = -ESRCH;
957
		if (p != group_leader)
L
Linus Torvalds 已提交
958 959 960 961 962 963 964
			goto out;
	}

	err = -EPERM;
	if (p->signal->leader)
		goto out;

965
	pgrp = task_pid(p);
L
Linus Torvalds 已提交
966
	if (pgid != pid) {
967
		struct task_struct *g;
L
Linus Torvalds 已提交
968

969 970
		pgrp = find_vpid(pgid);
		g = pid_task(pgrp, PIDTYPE_PGID);
971
		if (!g || task_session(g) != task_session(group_leader))
972
			goto out;
L
Linus Torvalds 已提交
973 974 975 976 977 978
	}

	err = security_task_setpgid(p, pgid);
	if (err)
		goto out;

979
	if (task_pgrp(p) != pgrp)
980
		change_pid(p, PIDTYPE_PGID, pgrp);
L
Linus Torvalds 已提交
981 982 983 984 985

	err = 0;
out:
	/* All paths lead to here, thus we are safe. -DaveM */
	write_unlock_irq(&tasklist_lock);
986
	rcu_read_unlock();
L
Linus Torvalds 已提交
987 988 989
	return err;
}

990
SYSCALL_DEFINE1(getpgid, pid_t, pid)
L
Linus Torvalds 已提交
991
{
992 993 994 995 996
	struct task_struct *p;
	struct pid *grp;
	int retval;

	rcu_read_lock();
997
	if (!pid)
998
		grp = task_pgrp(current);
999
	else {
L
Linus Torvalds 已提交
1000
		retval = -ESRCH;
1001 1002 1003 1004 1005 1006 1007 1008 1009 1010
		p = find_task_by_vpid(pid);
		if (!p)
			goto out;
		grp = task_pgrp(p);
		if (!grp)
			goto out;

		retval = security_task_getpgid(p);
		if (retval)
			goto out;
L
Linus Torvalds 已提交
1011
	}
1012 1013 1014 1015
	retval = pid_vnr(grp);
out:
	rcu_read_unlock();
	return retval;
L
Linus Torvalds 已提交
1016 1017 1018 1019
}

#ifdef __ARCH_WANT_SYS_GETPGRP

1020
SYSCALL_DEFINE0(getpgrp)
L
Linus Torvalds 已提交
1021
{
1022
	return sys_getpgid(0);
L
Linus Torvalds 已提交
1023 1024 1025 1026
}

#endif

1027
SYSCALL_DEFINE1(getsid, pid_t, pid)
L
Linus Torvalds 已提交
1028
{
1029 1030 1031 1032 1033
	struct task_struct *p;
	struct pid *sid;
	int retval;

	rcu_read_lock();
1034
	if (!pid)
1035
		sid = task_session(current);
1036
	else {
L
Linus Torvalds 已提交
1037
		retval = -ESRCH;
1038 1039 1040 1041 1042 1043 1044 1045 1046 1047
		p = find_task_by_vpid(pid);
		if (!p)
			goto out;
		sid = task_session(p);
		if (!sid)
			goto out;

		retval = security_task_getsid(p);
		if (retval)
			goto out;
L
Linus Torvalds 已提交
1048
	}
1049 1050 1051 1052
	retval = pid_vnr(sid);
out:
	rcu_read_unlock();
	return retval;
L
Linus Torvalds 已提交
1053 1054
}

1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065
static void set_special_pids(struct pid *pid)
{
	struct task_struct *curr = current->group_leader;

	if (task_session(curr) != pid)
		change_pid(curr, PIDTYPE_SID, pid);

	if (task_pgrp(curr) != pid)
		change_pid(curr, PIDTYPE_PGID, pid);
}

1066
SYSCALL_DEFINE0(setsid)
L
Linus Torvalds 已提交
1067
{
1068
	struct task_struct *group_leader = current->group_leader;
1069 1070
	struct pid *sid = task_pid(group_leader);
	pid_t session = pid_vnr(sid);
L
Linus Torvalds 已提交
1071 1072 1073
	int err = -EPERM;

	write_lock_irq(&tasklist_lock);
1074 1075 1076 1077
	/* Fail if I am already a session leader */
	if (group_leader->signal->leader)
		goto out;

1078 1079
	/* Fail if a process group id already exists that equals the
	 * proposed session id.
1080
	 */
1081
	if (pid_task(sid, PIDTYPE_PGID))
L
Linus Torvalds 已提交
1082 1083
		goto out;

1084
	group_leader->signal->leader = 1;
1085
	set_special_pids(sid);
1086

A
Alan Cox 已提交
1087
	proc_clear_tty(group_leader);
1088

1089
	err = session;
L
Linus Torvalds 已提交
1090 1091
out:
	write_unlock_irq(&tasklist_lock);
1092
	if (err > 0) {
1093
		proc_sid_connector(group_leader);
1094 1095
		sched_autogroup_create_attach(group_leader);
	}
L
Linus Torvalds 已提交
1096 1097 1098 1099 1100
	return err;
}

/* Taken by the uname/hostname/domainname syscalls around utsname() accesses. */
DECLARE_RWSEM(uts_sem);

/*
 * override_architecture(name): when COMPAT_UTS_MACHINE is defined and the
 * caller runs with the PER_LINUX32 personality, overwrite the user-space
 * name->machine field with COMPAT_UTS_MACHINE.  Evaluates to non-zero when
 * that copy_to_user() reports a failure, and to 0 otherwise.
 */
#ifdef COMPAT_UTS_MACHINE
#define override_architecture(name) \
	(personality(current->personality) == PER_LINUX32 && \
	 copy_to_user((name)->machine, COMPAT_UTS_MACHINE, \
		      sizeof(COMPAT_UTS_MACHINE)))
#else
#define override_architecture(name)	0
#endif

1110 1111 1112
/*
 * Work around broken programs that cannot handle "Linux 3.0".
 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
J
Jon DeVree 已提交
1113
 * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60.
1114
 */
1115
static int override_release(char __user *release, size_t len)
1116 1117 1118 1119
{
	int ret = 0;

	if (current->personality & UNAME26) {
1120 1121
		const char *rest = UTS_RELEASE;
		char buf[65] = { 0 };
1122 1123
		int ndots = 0;
		unsigned v;
1124
		size_t copy;
1125 1126 1127 1128 1129 1130 1131 1132

		while (*rest) {
			if (*rest == '.' && ++ndots >= 3)
				break;
			if (!isdigit(*rest) && *rest != '.')
				break;
			rest++;
		}
J
Jon DeVree 已提交
1133
		v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60;
K
Kees Cook 已提交
1134
		copy = clamp_t(size_t, len, 1, sizeof(buf));
1135 1136
		copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
		ret = copy_to_user(release, buf, copy + 1);
1137 1138 1139 1140
	}
	return ret;
}

1141
/*
 * newuname(name): copy the current utsname() data to user space, then apply
 * the release and architecture overrides.
 *
 * The data is snapshotted into a local buffer under uts_sem and copied to
 * user space only after the semaphore is released, so a slow or blocked
 * user-space page fault cannot keep uts_sem held.
 *
 * Returns 0 on success or -EFAULT if any user-space copy fails.
 */
SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
{
	struct new_utsname tmp;

	down_read(&uts_sem);
	memcpy(&tmp, utsname(), sizeof(tmp));
	up_read(&uts_sem);

	if (copy_to_user(name, &tmp, sizeof(tmp)))
		return -EFAULT;

	if (override_release(name->release, sizeof(name->release)))
		return -EFAULT;
	if (override_architecture(name))
		return -EFAULT;
	return 0;
}

C
Christoph Hellwig 已提交
1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172
#ifdef __ARCH_WANT_SYS_OLD_UNAME
/*
 * Old cruft
 *
 * uname(name): like newuname() but fills a struct old_utsname.  A NULL
 * @name is rejected with -EFAULT.  The first sizeof(struct old_utsname)
 * bytes of utsname() are snapshotted under uts_sem and copied to user
 * space only after the semaphore is released, so a blocked user-space
 * fault cannot keep uts_sem held.
 *
 * Returns 0 on success or -EFAULT if any user-space copy fails.
 */
SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
{
	struct old_utsname tmp;

	if (!name)
		return -EFAULT;

	down_read(&uts_sem);
	memcpy(&tmp, utsname(), sizeof(tmp));
	up_read(&uts_sem);

	if (copy_to_user(name, &tmp, sizeof(tmp)))
		return -EFAULT;

	if (override_release(name->release, sizeof(name->release)))
		return -EFAULT;
	if (override_architecture(name))
		return -EFAULT;
	return 0;
}

/*
 * olduname(name): fill a struct oldold_utsname, whose fields carry only the
 * first __OLD_UTS_LEN bytes of each utsname() string followed by a NUL.
 *
 * The truncated strings are assembled in a zeroed local buffer under
 * uts_sem (the zero fill supplies the terminating NULs) and written to user
 * space with a single copy_to_user() after the semaphore is released, so a
 * blocked user-space fault cannot keep uts_sem held.
 *
 * Returns 0 on success, -EFAULT for a NULL @name or a failed copy.
 */
SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
{
	struct oldold_utsname tmp;

	if (!name)
		return -EFAULT;

	memset(&tmp, 0, sizeof(tmp));

	down_read(&uts_sem);
	memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN);
	memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN);
	memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN);
	memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN);
	memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN);
	up_read(&uts_sem);

	if (copy_to_user(name, &tmp, sizeof(tmp)))
		return -EFAULT;

	if (override_architecture(name))
		return -EFAULT;
	if (override_release(name->release, sizeof(name->release)))
		return -EFAULT;
	return 0;
}
#endif

1215
/*
 * sethostname(name, len): replace utsname()->nodename with the @len bytes at
 * user address @name, zero-filling the remainder of the field.
 *
 * Requires CAP_SYS_ADMIN in the user namespace of the caller's UTS
 * namespace (-EPERM otherwise).  @len must be within 0..__NEW_UTS_LEN
 * (-EINVAL otherwise).  The user buffer is copied in before uts_sem is
 * taken, so a blocked user-space fault cannot keep the semaphore held
 * for writing.
 *
 * Returns 0 on success or -EFAULT if the user buffer cannot be read.
 */
SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
{
	struct new_utsname *u;
	char tmp[__NEW_UTS_LEN];

	if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	if (len < 0 || len > __NEW_UTS_LEN)
		return -EINVAL;

	if (copy_from_user(tmp, name, len))
		return -EFAULT;

	down_write(&uts_sem);
	u = utsname();
	memcpy(u->nodename, tmp, len);
	memset(u->nodename + len, 0, sizeof(u->nodename) - len);
	uts_proc_notify(UTS_PROC_HOSTNAME);
	up_write(&uts_sem);
	return 0;
}

#ifdef __ARCH_WANT_SYS_GETHOSTNAME

1241
/*
 * gethostname(name, len): copy utsname()->nodename, including its NUL when
 * it fits, to user address @name.  At most @len bytes are written, so the
 * result is silently truncated (and possibly unterminated) when @len is
 * too small.
 *
 * The string is snapshotted into a local buffer under uts_sem and copied to
 * user space only after the semaphore is released, so a blocked user-space
 * fault cannot keep uts_sem held.
 *
 * Returns 0 on success, -EINVAL for a negative @len, -EFAULT on copy failure.
 */
SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
{
	int i;
	struct new_utsname *u;
	char tmp[__NEW_UTS_LEN + 1];

	if (len < 0)
		return -EINVAL;

	down_read(&uts_sem);
	u = utsname();
	i = 1 + strlen(u->nodename);
	if (i > len)
		i = len;
	memcpy(tmp, u->nodename, i);
	up_read(&uts_sem);

	if (copy_to_user(name, tmp, i))
		return -EFAULT;
	return 0;
}

#endif

/*
 * Only setdomainname; getdomainname can be implemented by calling
 * uname()
 */
1266
SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
L
Linus Torvalds 已提交
1267 1268 1269 1270
{
	int errno;
	char tmp[__NEW_UTS_LEN];

1271
	if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
L
Linus Torvalds 已提交
1272 1273 1274 1275 1276 1277 1278
		return -EPERM;
	if (len < 0 || len > __NEW_UTS_LEN)
		return -EINVAL;

	down_write(&uts_sem);
	errno = -EFAULT;
	if (!copy_from_user(tmp, name, len)) {
1279 1280 1281 1282
		struct new_utsname *u = utsname();

		memcpy(u->domainname, tmp, len);
		memset(u->domainname + len, 0, sizeof(u->domainname) - len);
L
Linus Torvalds 已提交
1283
		errno = 0;
1284
		uts_proc_notify(UTS_PROC_DOMAINNAME);
L
Linus Torvalds 已提交
1285 1286 1287 1288 1289
	}
	up_write(&uts_sem);
	return errno;
}

1290
/*
 * getrlimit(resource, rlim): read the calling task's limit for @resource via
 * do_prlimit() and copy it to user address @rlim.
 *
 * Returns 0 on success, the error from do_prlimit(), or -EFAULT when the
 * copy to user space fails.
 */
SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
{
	struct rlimit cur;
	int err = do_prlimit(current, resource, NULL, &cur);

	if (err)
		return err;
	if (copy_to_user(rlim, &cur, sizeof(*rlim)))
		return -EFAULT;
	return 0;
}

#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT

/*
 *	Back compatibility for getrlimit. Needed for some apps.
 */
1307 1308
SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
		struct rlimit __user *, rlim)
L
Linus Torvalds 已提交
1309 1310 1311 1312 1313 1314 1315 1316
{
	struct rlimit x;
	if (resource >= RLIM_NLIMITS)
		return -EINVAL;

	task_lock(current->group_leader);
	x = current->signal->rlim[resource];
	task_unlock(current->group_leader);
1317
	if (x.rlim_cur > 0x7FFFFFFF)
L
Linus Torvalds 已提交
1318
		x.rlim_cur = 0x7FFFFFFF;
1319
	if (x.rlim_max > 0x7FFFFFFF)
L
Linus Torvalds 已提交
1320
		x.rlim_max = 0x7FFFFFFF;
V
vishnu.ps 已提交
1321
	return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
L
Linus Torvalds 已提交
1322 1323 1324 1325
}

#endif

1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358
/*
 * Tell whether a 64-bit limit must be treated as "infinity".  When
 * BITS_PER_LONG is below 64 every value from ULONG_MAX upwards counts;
 * otherwise only RLIM64_INFINITY itself does.
 */
static inline bool rlim64_is_infinity(__u64 rlim64)
{
#if BITS_PER_LONG < 64
	const __u64 threshold = ULONG_MAX;

	return rlim64 >= threshold;
#else
	return rlim64 == RLIM64_INFINITY;
#endif
}

/*
 * Widen a native rlimit into its 64-bit form, translating RLIM_INFINITY
 * into RLIM64_INFINITY for both the soft and the hard limit.
 */
static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
{
	rlim64->rlim_cur = (rlim->rlim_cur == RLIM_INFINITY) ?
				RLIM64_INFINITY : rlim->rlim_cur;
	rlim64->rlim_max = (rlim->rlim_max == RLIM_INFINITY) ?
				RLIM64_INFINITY : rlim->rlim_max;
}

/*
 * Narrow a 64-bit rlimit into the native form.  Values that
 * rlim64_is_infinity() classifies as infinite become RLIM_INFINITY; all
 * others are cast to unsigned long.
 */
static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
{
	rlim->rlim_cur = rlim64_is_infinity(rlim64->rlim_cur) ?
				RLIM_INFINITY : (unsigned long)rlim64->rlim_cur;
	rlim->rlim_max = rlim64_is_infinity(rlim64->rlim_max) ?
				RLIM_INFINITY : (unsigned long)rlim64->rlim_max;
}

1359
/* make sure you are allowed to change @tsk limits before calling this */
1360 1361
int do_prlimit(struct task_struct *tsk, unsigned int resource,
		struct rlimit *new_rlim, struct rlimit *old_rlim)
L
Linus Torvalds 已提交
1362
{
1363
	struct rlimit *rlim;
1364
	int retval = 0;
L
Linus Torvalds 已提交
1365 1366 1367

	if (resource >= RLIM_NLIMITS)
		return -EINVAL;
1368 1369 1370 1371 1372 1373 1374
	if (new_rlim) {
		if (new_rlim->rlim_cur > new_rlim->rlim_max)
			return -EINVAL;
		if (resource == RLIMIT_NOFILE &&
				new_rlim->rlim_max > sysctl_nr_open)
			return -EPERM;
	}
L
Linus Torvalds 已提交
1375

1376 1377 1378 1379 1380 1381 1382
	/* protect tsk->signal and tsk->sighand from disappearing */
	read_lock(&tasklist_lock);
	if (!tsk->sighand) {
		retval = -ESRCH;
		goto out;
	}

1383
	rlim = tsk->signal->rlim + resource;
1384
	task_lock(tsk->group_leader);
1385
	if (new_rlim) {
1386 1387
		/* Keep the capable check against init_user_ns until
		   cgroups can contain all limits */
1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408
		if (new_rlim->rlim_max > rlim->rlim_max &&
				!capable(CAP_SYS_RESOURCE))
			retval = -EPERM;
		if (!retval)
			retval = security_task_setrlimit(tsk->group_leader,
					resource, new_rlim);
		if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
			/*
			 * The caller is asking for an immediate RLIMIT_CPU
			 * expiry.  But we use the zero value to mean "it was
			 * never set".  So let's cheat and make it one second
			 * instead
			 */
			new_rlim->rlim_cur = 1;
		}
	}
	if (!retval) {
		if (old_rlim)
			*old_rlim = *rlim;
		if (new_rlim)
			*rlim = *new_rlim;
1409
	}
J
Jiri Slaby 已提交
1410
	task_unlock(tsk->group_leader);
L
Linus Torvalds 已提交
1411

1412 1413 1414 1415 1416 1417
	/*
	 * RLIMIT_CPU handling.   Note that the kernel fails to return an error
	 * code if it rejected the user's attempt to set RLIMIT_CPU.  This is a
	 * very long-standing error, and fixing it now risks breakage of
	 * applications, so we live with it
	 */
1418 1419 1420
	 if (!retval && new_rlim && resource == RLIMIT_CPU &&
			 new_rlim->rlim_cur != RLIM_INFINITY)
		update_rlimit_cpu(tsk, new_rlim->rlim_cur);
A
Andrew Morton 已提交
1421
out:
1422
	read_unlock(&tasklist_lock);
1423
	return retval;
L
Linus Torvalds 已提交
1424 1425
}

1426 1427 1428 1429 1430
/* rcu lock must be held */
static int check_prlimit_permission(struct task_struct *task)
{
	const struct cred *cred = current_cred(), *tcred;

1431 1432
	if (current == task)
		return 0;
1433

1434
	tcred = __task_cred(task);
1435 1436 1437 1438 1439 1440
	if (uid_eq(cred->uid, tcred->euid) &&
	    uid_eq(cred->uid, tcred->suid) &&
	    uid_eq(cred->uid, tcred->uid)  &&
	    gid_eq(cred->gid, tcred->egid) &&
	    gid_eq(cred->gid, tcred->sgid) &&
	    gid_eq(cred->gid, tcred->gid))
1441
		return 0;
1442
	if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
1443 1444 1445
		return 0;

	return -EPERM;
1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489
}

/*
 * prlimit64(pid, resource, new_rlim, old_rlim): get and/or set the limit
 * @resource of task @pid (the caller when @pid is 0), using 64-bit limits
 * at the user interface.
 *
 * Returns 0 on success; -EFAULT when a user buffer cannot be accessed;
 * -ESRCH when @pid names no task; the error from
 * check_prlimit_permission() or do_prlimit() otherwise.
 */
SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
		const struct rlimit64 __user *, new_rlim,
		struct rlimit64 __user *, old_rlim)
{
	struct rlimit64 old64, new64;
	struct rlimit old, new;
	struct task_struct *tsk;
	int ret;

	if (new_rlim) {
		if (copy_from_user(&new64, new_rlim, sizeof(new64)))
			return -EFAULT;
		rlim64_to_rlim(&new64, &new);
	}

	/*
	 * Look the task up and check permission under RCU; take a reference
	 * only when both succeed so it can be used after rcu_read_unlock().
	 */
	rcu_read_lock();
	tsk = pid ? find_task_by_vpid(pid) : current;
	ret = tsk ? check_prlimit_permission(tsk) : -ESRCH;
	if (!ret)
		get_task_struct(tsk);
	rcu_read_unlock();
	if (ret)
		return ret;

	ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
			old_rlim ? &old : NULL);

	if (!ret && old_rlim) {
		rlim_to_rlim64(&old, &old64);
		if (copy_to_user(old_rlim, &old64, sizeof(old64)))
			ret = -EFAULT;
	}

	put_task_struct(tsk);
	return ret;
}

J
Jiri Slaby 已提交
1490 1491 1492 1493 1494 1495
/*
 * setrlimit(resource, rlim): install the limit read from user address @rlim
 * as the calling task's limit for @resource.
 *
 * Returns -EFAULT when @rlim cannot be read, otherwise the result of
 * do_prlimit().
 */
SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
{
	struct rlimit requested;

	if (copy_from_user(&requested, rlim, sizeof(*rlim)))
		return -EFAULT;

	return do_prlimit(current, resource, &requested, NULL);
}

L
Linus Torvalds 已提交
1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513
/*
 * It would make sense to put struct rusage in the task_struct,
 * except that would make the task_struct be *really big*.  After
 * task_struct gets moved into malloc'ed memory, it would
 * make sense to do this.  It will make moving the rest of the information
 * a lot simpler!  (Which we're not doing right now because we're not
 * measuring them yet).
 *
 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
 * races with threads incrementing their own counters.  But since word
 * reads are atomic, we either get new values or old values and we don't
 * care which for the sums.  We always take the siglock to protect reading
 * the c* fields from p->signal from races with exit.c updating those
 * fields when reaping, so a sample either gets all the additions of a
 * given child after it's reaped, or none so this sample is before reaping.
 *
 * Locking:
 * We need to take the siglock for CHILDREN, SELF and BOTH
 * for the cases current multithreaded, non-current single threaded
 * non-current multithreaded.  Thread traversal is now safe with
 * the siglock held.
 * Strictly speaking, we do not need to take the siglock if we are current and
 * single threaded, as no one else can take our signal_struct away, no one
 * else can reap the children to update signal->c* counters, and no one else
 * can race with the signal-> fields. If we do not take any lock, the
 * signal-> fields could be read out of order while another thread was just
 * exiting. So we should place a read memory barrier when we avoid the lock.
 * On the writer side, write memory barrier is implied in __exit_signal
 * as __exit_signal releases the siglock spinlock after updating the signal->
 * fields. But we don't do this yet to keep things simple.
 *
 */

1532
static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
S
Sripathi Kodi 已提交
1533 1534 1535 1536 1537 1538 1539 1540 1541
{
	r->ru_nvcsw += t->nvcsw;
	r->ru_nivcsw += t->nivcsw;
	r->ru_minflt += t->min_flt;
	r->ru_majflt += t->maj_flt;
	r->ru_inblock += task_io_get_inblock(t);
	r->ru_oublock += task_io_get_oublock(t);
}

L
Linus Torvalds 已提交
1542 1543 1544 1545
static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
{
	struct task_struct *t;
	unsigned long flags;
1546
	cputime_t tgutime, tgstime, utime, stime;
J
Jiri Pirko 已提交
1547
	unsigned long maxrss = 0;
L
Linus Torvalds 已提交
1548

V
vishnu.ps 已提交
1549
	memset((char *)r, 0, sizeof (*r));
1550
	utime = stime = 0;
L
Linus Torvalds 已提交
1551

S
Sripathi Kodi 已提交
1552
	if (who == RUSAGE_THREAD) {
1553
		task_cputime_adjusted(current, &utime, &stime);
1554
		accumulate_thread_rusage(p, r);
J
Jiri Pirko 已提交
1555
		maxrss = p->signal->maxrss;
S
Sripathi Kodi 已提交
1556 1557 1558
		goto out;
	}

1559
	if (!lock_task_sighand(p, &flags))
1560
		return;
O
Oleg Nesterov 已提交
1561

L
Linus Torvalds 已提交
1562
	switch (who) {
V
vishnu.ps 已提交
1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575
	case RUSAGE_BOTH:
	case RUSAGE_CHILDREN:
		utime = p->signal->cutime;
		stime = p->signal->cstime;
		r->ru_nvcsw = p->signal->cnvcsw;
		r->ru_nivcsw = p->signal->cnivcsw;
		r->ru_minflt = p->signal->cmin_flt;
		r->ru_majflt = p->signal->cmaj_flt;
		r->ru_inblock = p->signal->cinblock;
		r->ru_oublock = p->signal->coublock;
		maxrss = p->signal->cmaxrss;

		if (who == RUSAGE_CHILDREN)
L
Linus Torvalds 已提交
1576
			break;
O
Oleg Nesterov 已提交
1577

V
vishnu.ps 已提交
1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597
	case RUSAGE_SELF:
		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
		utime += tgutime;
		stime += tgstime;
		r->ru_nvcsw += p->signal->nvcsw;
		r->ru_nivcsw += p->signal->nivcsw;
		r->ru_minflt += p->signal->min_flt;
		r->ru_majflt += p->signal->maj_flt;
		r->ru_inblock += p->signal->inblock;
		r->ru_oublock += p->signal->oublock;
		if (maxrss < p->signal->maxrss)
			maxrss = p->signal->maxrss;
		t = p;
		do {
			accumulate_thread_rusage(t, r);
		} while_each_thread(p, t);
		break;

	default:
		BUG();
L
Linus Torvalds 已提交
1598
	}
1599 1600
	unlock_task_sighand(p, &flags);

S
Sripathi Kodi 已提交
1601
out:
O
Oleg Nesterov 已提交
1602 1603
	cputime_to_timeval(utime, &r->ru_utime);
	cputime_to_timeval(stime, &r->ru_stime);
J
Jiri Pirko 已提交
1604 1605 1606

	if (who != RUSAGE_CHILDREN) {
		struct mm_struct *mm = get_task_mm(p);
V
vishnu.ps 已提交
1607

J
Jiri Pirko 已提交
1608 1609 1610 1611 1612 1613
		if (mm) {
			setmax_mm_hiwater_rss(&maxrss, mm);
			mmput(mm);
		}
	}
	r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
L
Linus Torvalds 已提交
1614 1615 1616 1617 1618
}

/*
 * Collect the resource usage of @p selected by @who with k_getrusage() and
 * copy it to user address @ru.  Returns 0, or -EFAULT if the copy fails.
 */
int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
{
	struct rusage usage;

	k_getrusage(p, who, &usage);
	if (copy_to_user(ru, &usage, sizeof(usage)))
		return -EFAULT;
	return 0;
}

1624
/*
 * getrusage(who, ru): user entry point.  Only RUSAGE_SELF, RUSAGE_CHILDREN
 * and RUSAGE_THREAD are accepted; anything else yields -EINVAL.  The usage
 * reported is always that of the calling task.
 */
SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
{
	switch (who) {
	case RUSAGE_SELF:
	case RUSAGE_CHILDREN:
	case RUSAGE_THREAD:
		return getrusage(current, who, ru);
	default:
		return -EINVAL;
	}
}

1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645
#ifdef CONFIG_COMPAT
/*
 * Compat getrusage(who, ru): same @who validation as the native call, with
 * the result written through put_compat_rusage().
 */
COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
{
	struct rusage usage;

	switch (who) {
	case RUSAGE_SELF:
	case RUSAGE_CHILDREN:
	case RUSAGE_THREAD:
		break;
	default:
		return -EINVAL;
	}

	k_getrusage(current, who, &usage);
	return put_compat_rusage(&usage, ru);
}
#endif

1646
/*
 * umask(mask): store @mask, restricted to the S_IRWXUGO bits, as the
 * caller's umask and return the value it replaces (via xchg()).
 */
SYSCALL_DEFINE1(umask, int, mask)
{
	return xchg(&current->fs->umask, mask & S_IRWXUGO);
}
1651

1652
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1653
{
1654
	struct fd exe;
1655
	struct file *old_exe, *exe_file;
A
Al Viro 已提交
1656
	struct inode *inode;
1657
	int err;
1658

1659 1660
	exe = fdget(fd);
	if (!exe.file)
1661 1662
		return -EBADF;

A
Al Viro 已提交
1663
	inode = file_inode(exe.file);
1664 1665 1666 1667 1668 1669 1670

	/*
	 * Because the original mm->exe_file points to executable file, make
	 * sure that this one is executable as well, to avoid breaking an
	 * overall picture.
	 */
	err = -EACCES;
1671
	if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
1672 1673
		goto exit;

A
Al Viro 已提交
1674
	err = inode_permission(inode, MAY_EXEC);
1675 1676 1677
	if (err)
		goto exit;

1678
	/*
1679
	 * Forbid mm->exe_file change if old file still mapped.
1680
	 */
1681
	exe_file = get_mm_exe_file(mm);
1682
	err = -EBUSY;
1683
	if (exe_file) {
1684 1685
		struct vm_area_struct *vma;

1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (!vma->vm_file)
				continue;
			if (path_equal(&vma->vm_file->f_path,
				       &exe_file->f_path))
				goto exit_err;
		}

		up_read(&mm->mmap_sem);
		fput(exe_file);
1697 1698
	}

1699 1700 1701 1702 1703 1704
	/*
	 * The symlink can be changed only once, just to disallow arbitrary
	 * transitions malicious software might bring in. This means one
	 * could make a snapshot over all processes running and monitor
	 * /proc/pid/exe changes to notice unusual activity if needed.
	 */
1705 1706
	err = -EPERM;
	if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1707
		goto exit;
1708

1709
	err = 0;
1710 1711 1712 1713 1714
	/* set the new file, lockless */
	get_file(exe.file);
	old_exe = xchg(&mm->exe_file, exe.file);
	if (old_exe)
		fput(old_exe);
1715
exit:
1716
	fdput(exe);
1717
	return err;
1718 1719 1720 1721
exit_err:
	up_read(&mm->mmap_sem);
	fput(exe_file);
	goto exit;
1722 1723
}

1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818
/*
 * WARNING: we don't require any capability here so be very careful
 * in what is allowed for modification from userspace.
 */
static int validate_prctl_map(struct prctl_mm_map *prctl_map)
{
	unsigned long mmap_max_addr = TASK_SIZE;
	struct mm_struct *mm = current->mm;
	int error = -EINVAL, i;

	static const unsigned char offsets[] = {
		offsetof(struct prctl_mm_map, start_code),
		offsetof(struct prctl_mm_map, end_code),
		offsetof(struct prctl_mm_map, start_data),
		offsetof(struct prctl_mm_map, end_data),
		offsetof(struct prctl_mm_map, start_brk),
		offsetof(struct prctl_mm_map, brk),
		offsetof(struct prctl_mm_map, start_stack),
		offsetof(struct prctl_mm_map, arg_start),
		offsetof(struct prctl_mm_map, arg_end),
		offsetof(struct prctl_mm_map, env_start),
		offsetof(struct prctl_mm_map, env_end),
	};

	/*
	 * Make sure the members are not somewhere outside
	 * of allowed address space.
	 */
	for (i = 0; i < ARRAY_SIZE(offsets); i++) {
		u64 val = *(u64 *)((char *)prctl_map + offsets[i]);

		if ((unsigned long)val >= mmap_max_addr ||
		    (unsigned long)val < mmap_min_addr)
			goto out;
	}

	/*
	 * Make sure the pairs are ordered.
	 */
#define __prctl_check_order(__m1, __op, __m2)				\
	((unsigned long)prctl_map->__m1 __op				\
	 (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
	error  = __prctl_check_order(start_code, <, end_code);
	error |= __prctl_check_order(start_data, <, end_data);
	error |= __prctl_check_order(start_brk, <=, brk);
	error |= __prctl_check_order(arg_start, <=, arg_end);
	error |= __prctl_check_order(env_start, <=, env_end);
	if (error)
		goto out;
#undef __prctl_check_order

	error = -EINVAL;

	/*
	 * @brk should be after @end_data in traditional maps.
	 */
	if (prctl_map->start_brk <= prctl_map->end_data ||
	    prctl_map->brk <= prctl_map->end_data)
		goto out;

	/*
	 * Neither we should allow to override limits if they set.
	 */
	if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
			      prctl_map->start_brk, prctl_map->end_data,
			      prctl_map->start_data))
			goto out;

	/*
	 * Someone is trying to cheat the auxv vector.
	 */
	if (prctl_map->auxv_size) {
		if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
			goto out;
	}

	/*
	 * Finally, make sure the caller has the rights to
	 * change /proc/pid/exe link: only local root should
	 * be allowed to.
	 */
	if (prctl_map->exe_fd != (u32)-1) {
		struct user_namespace *ns = current_user_ns();
		const struct cred *cred = current_cred();

		if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
		    !gid_eq(cred->gid, make_kgid(ns, 0)))
			goto out;
	}

	error = 0;
out:
	return error;
}

1819
#ifdef CONFIG_CHECKPOINT_RESTORE
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856
/*
 * Handle PR_SET_MM_MAP and PR_SET_MM_MAP_SIZE.
 *
 * PR_SET_MM_MAP_SIZE stores sizeof(struct prctl_mm_map) at user address
 * @addr.  PR_SET_MM_MAP reads a struct prctl_mm_map of exactly @data_size
 * bytes from @addr, validates it, optionally switches the exe file and
 * loads a new auxv, and then installs all address fields into current->mm.
 *
 * Returns 0 on success; -EINVAL for a size mismatch or failed validation;
 * -EFAULT when user memory cannot be read; or the error from
 * prctl_set_mm_exe_file().  A failure of the exe-file step returns before
 * mmap_sem is taken.
 */
static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
{
	struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
	unsigned long user_auxv[AT_VECTOR_SIZE];
	struct mm_struct *mm = current->mm;
	int error;

	BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
	BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);

	if (opt == PR_SET_MM_MAP_SIZE)
		return put_user((unsigned int)sizeof(prctl_map),
				(unsigned int __user *)addr);

	if (data_size != sizeof(prctl_map))
		return -EINVAL;

	if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
		return -EFAULT;

	error = validate_prctl_map(&prctl_map);
	if (error)
		return error;

	if (prctl_map.auxv_size) {
		memset(user_auxv, 0, sizeof(user_auxv));
		if (copy_from_user(user_auxv,
				   (const void __user *)prctl_map.auxv,
				   prctl_map.auxv_size))
			return -EFAULT;

		/* Last entry must be AT_NULL as specification requires */
		user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
		user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
	}

	if (prctl_map.exe_fd != (u32)-1) {
		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
		if (error)
			return error;
	}

	down_read(&mm->mmap_sem);

	/*
	 * We don't validate if these members are pointing to
	 * real present VMAs because application may have correspond
	 * VMAs already unmapped and kernel uses these members for statistics
	 * output in procfs mostly, except
	 *
	 *  - @start_brk/@brk which are used in do_brk but kernel lookups
	 *    for VMAs when updating these members so anything wrong written
	 *    here cause kernel to swear at userspace program but won't lead
	 *    to any problem in kernel itself
	 */

	mm->start_code	= prctl_map.start_code;
	mm->end_code	= prctl_map.end_code;
	mm->start_data	= prctl_map.start_data;
	mm->end_data	= prctl_map.end_data;
	mm->start_brk	= prctl_map.start_brk;
	mm->brk		= prctl_map.brk;
	mm->start_stack	= prctl_map.start_stack;
	mm->arg_start	= prctl_map.arg_start;
	mm->arg_end	= prctl_map.arg_end;
	mm->env_start	= prctl_map.env_start;
	mm->env_end	= prctl_map.env_end;

	/*
	 * Note this update of @saved_auxv is lockless thus
	 * if someone reads this member in procfs while we're
	 * updating -- it may get partly updated results. It's
	 * known and acceptable trade off: we leave it as is to
	 * not introduce additional locks here making the kernel
	 * more complex.
	 */
	if (prctl_map.auxv_size)
		memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));

	up_read(&mm->mmap_sem);
	return 0;
}
#endif /* CONFIG_CHECKPOINT_RESTORE */

1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933
/*
 * Overwrite the first @len bytes of mm->saved_auxv with data read from user
 * address @addr.
 *
 * Returns 0 on success, -EINVAL when @len exceeds the size of the vector,
 * -EFAULT when the user buffer cannot be read.
 */
static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
			  unsigned long len)
{
	/*
	 * This doesn't move the auxiliary vector itself since it's pinned to
	 * mm_struct, but it permits filling the vector with new values.  It's
	 * up to the caller to provide sane values here, otherwise userspace
	 * tools which use this vector might be unhappy.
	 */
	unsigned long staged[AT_VECTOR_SIZE];

	BUILD_BUG_ON(sizeof(staged) != sizeof(mm->saved_auxv));

	if (len > sizeof(staged))
		return -EINVAL;

	if (copy_from_user(staged, (const void __user *)addr, len))
		return -EFAULT;

	/* Make sure the last entry is always AT_NULL */
	staged[AT_VECTOR_SIZE - 2] = 0;
	staged[AT_VECTOR_SIZE - 1] = 0;

	/* Publish only the @len bytes supplied, under the task lock. */
	task_lock(current);
	memcpy(mm->saved_auxv, staged, len);
	task_unlock(current);

	return 0;
}

1934 1935 1936 1937
static int prctl_set_mm(int opt, unsigned long addr,
			unsigned long arg4, unsigned long arg5)
{
	struct mm_struct *mm = current->mm;
1938
	struct prctl_mm_map prctl_map;
1939 1940
	struct vm_area_struct *vma;
	int error;
1941

1942 1943 1944
	if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
			      opt != PR_SET_MM_MAP &&
			      opt != PR_SET_MM_MAP_SIZE)))
1945 1946
		return -EINVAL;

1947 1948 1949 1950 1951
#ifdef CONFIG_CHECKPOINT_RESTORE
	if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
		return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
#endif

1952
	if (!capable(CAP_SYS_RESOURCE))
1953 1954
		return -EPERM;

1955 1956
	if (opt == PR_SET_MM_EXE_FILE)
		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
1957

1958 1959 1960
	if (opt == PR_SET_MM_AUXV)
		return prctl_set_auxv(mm, addr, arg4);

1961
	if (addr >= TASK_SIZE || addr < mmap_min_addr)
1962 1963
		return -EINVAL;

1964 1965
	error = -EINVAL;

1966 1967 1968
	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr);

1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983
	prctl_map.start_code	= mm->start_code;
	prctl_map.end_code	= mm->end_code;
	prctl_map.start_data	= mm->start_data;
	prctl_map.end_data	= mm->end_data;
	prctl_map.start_brk	= mm->start_brk;
	prctl_map.brk		= mm->brk;
	prctl_map.start_stack	= mm->start_stack;
	prctl_map.arg_start	= mm->arg_start;
	prctl_map.arg_end	= mm->arg_end;
	prctl_map.env_start	= mm->env_start;
	prctl_map.env_end	= mm->env_end;
	prctl_map.auxv		= NULL;
	prctl_map.auxv_size	= 0;
	prctl_map.exe_fd	= -1;

1984 1985
	switch (opt) {
	case PR_SET_MM_START_CODE:
1986
		prctl_map.start_code = addr;
1987
		break;
1988
	case PR_SET_MM_END_CODE:
1989
		prctl_map.end_code = addr;
1990 1991
		break;
	case PR_SET_MM_START_DATA:
1992
		prctl_map.start_data = addr;
1993
		break;
1994
	case PR_SET_MM_END_DATA:
1995 1996 1997 1998
		prctl_map.end_data = addr;
		break;
	case PR_SET_MM_START_STACK:
		prctl_map.start_stack = addr;
1999 2000
		break;
	case PR_SET_MM_START_BRK:
2001
		prctl_map.start_brk = addr;
2002 2003
		break;
	case PR_SET_MM_BRK:
2004
		prctl_map.brk = addr;
2005
		break;
2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
	case PR_SET_MM_ARG_START:
		prctl_map.arg_start = addr;
		break;
	case PR_SET_MM_ARG_END:
		prctl_map.arg_end = addr;
		break;
	case PR_SET_MM_ENV_START:
		prctl_map.env_start = addr;
		break;
	case PR_SET_MM_ENV_END:
		prctl_map.env_end = addr;
		break;
	default:
		goto out;
	}

	error = validate_prctl_map(&prctl_map);
	if (error)
		goto out;
2025

2026
	switch (opt) {
2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042
	/*
	 * If command line arguments and environment
	 * are placed somewhere else on stack, we can
	 * set them up here, ARG_START/END to setup
	 * command line argumets and ENV_START/END
	 * for environment.
	 */
	case PR_SET_MM_START_STACK:
	case PR_SET_MM_ARG_START:
	case PR_SET_MM_ARG_END:
	case PR_SET_MM_ENV_START:
	case PR_SET_MM_ENV_END:
		if (!vma) {
			error = -EFAULT;
			goto out;
		}
2043 2044
	}

2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056
	mm->start_code	= prctl_map.start_code;
	mm->end_code	= prctl_map.end_code;
	mm->start_data	= prctl_map.start_data;
	mm->end_data	= prctl_map.end_data;
	mm->start_brk	= prctl_map.start_brk;
	mm->brk		= prctl_map.brk;
	mm->start_stack	= prctl_map.start_stack;
	mm->arg_start	= prctl_map.arg_start;
	mm->arg_end	= prctl_map.arg_end;
	mm->env_start	= prctl_map.env_start;
	mm->env_end	= prctl_map.env_end;

2057 2058 2059 2060 2061
	error = 0;
out:
	up_read(&mm->mmap_sem);
	return error;
}
2062

2063
#ifdef CONFIG_CHECKPOINT_RESTORE
2064 2065 2066 2067
static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
{
	return put_user(me->clear_child_tid, tid_addr);
}
2068
#else
2069 2070 2071 2072
static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
{
	return -EINVAL;
}
2073 2074
#endif

2075 2076
/*
 * prctl - operations on the calling task, selected by @option.
 *
 * @arg2..@arg5 are interpreted per option; several options insist that
 * the arguments they do not use are zero and fail with -EINVAL otherwise.
 *
 * The security hook is consulted first: any answer other than -ENOSYS is
 * returned verbatim.  Otherwise the option is handled below; unknown
 * options yield -EINVAL.  "GET" style options either return the value
 * directly or store it through a user pointer, depending on the option.
 */
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
		unsigned long, arg4, unsigned long, arg5)
{
	struct task_struct *me = current;
	unsigned char comm[sizeof(me->comm)];
	long error;

	/* Let the security module claim the request before we look at it. */
	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
	if (error != -ENOSYS)
		return error;

	error = 0;
	switch (option) {
	case PR_SET_PDEATHSIG:
		if (valid_signal(arg2))
			me->pdeath_signal = arg2;
		else
			error = -EINVAL;
		break;
	case PR_GET_PDEATHSIG:
		error = put_user(me->pdeath_signal, (int __user *)arg2);
		break;

	case PR_GET_DUMPABLE:
		error = get_dumpable(me->mm);
		break;
	case PR_SET_DUMPABLE:
		/* Only the "disable" and "user" dump modes may be requested. */
		if (arg2 == SUID_DUMP_DISABLE || arg2 == SUID_DUMP_USER)
			set_dumpable(me->mm, arg2);
		else
			error = -EINVAL;
		break;

	/* Options forwarded to the *_CTL / GET_* / SET_* macros. */
	case PR_SET_UNALIGN:
		error = SET_UNALIGN_CTL(me, arg2);
		break;
	case PR_GET_UNALIGN:
		error = GET_UNALIGN_CTL(me, arg2);
		break;
	case PR_SET_FPEMU:
		error = SET_FPEMU_CTL(me, arg2);
		break;
	case PR_GET_FPEMU:
		error = GET_FPEMU_CTL(me, arg2);
		break;
	case PR_SET_FPEXC:
		error = SET_FPEXC_CTL(me, arg2);
		break;
	case PR_GET_FPEXC:
		error = GET_FPEXC_CTL(me, arg2);
		break;

	case PR_GET_TIMING:
		error = PR_TIMING_STATISTICAL;
		break;
	case PR_SET_TIMING:
		/* Statistical timing is the only mode accepted here. */
		error = (arg2 == PR_TIMING_STATISTICAL) ? 0 : -EINVAL;
		break;

	case PR_SET_NAME:
		/*
		 * Terminate the buffer up front; the copy below is limited
		 * to sizeof(comm) - 1 bytes and so never touches the last
		 * byte.
		 */
		comm[sizeof(comm) - 1] = 0;
		if (strncpy_from_user(comm, (char __user *)arg2,
				      sizeof(comm) - 1) < 0)
			return -EFAULT;
		set_task_comm(me, comm);
		proc_comm_connector(me);
		break;
	case PR_GET_NAME:
		get_task_comm(comm, me);
		if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
			return -EFAULT;
		break;

	case PR_GET_ENDIAN:
		error = GET_ENDIAN(me, arg2);
		break;
	case PR_SET_ENDIAN:
		error = SET_ENDIAN(me, arg2);
		break;

	case PR_GET_SECCOMP:
		error = prctl_get_seccomp();
		break;
	case PR_SET_SECCOMP:
		error = prctl_set_seccomp(arg2, (char __user *)arg3);
		break;

	case PR_GET_TSC:
		error = GET_TSC_CTL(arg2);
		break;
	case PR_SET_TSC:
		error = SET_TSC_CTL(arg2);
		break;

	case PR_TASK_PERF_EVENTS_DISABLE:
		error = perf_event_task_disable();
		break;
	case PR_TASK_PERF_EVENTS_ENABLE:
		error = perf_event_task_enable();
		break;

	case PR_GET_TIMERSLACK:
		error = me->timer_slack_ns;
		break;
	case PR_SET_TIMERSLACK:
		/* A zero argument restores the task's default slack. */
		me->timer_slack_ns = arg2 ? arg2 : me->default_timer_slack_ns;
		break;

	case PR_MCE_KILL:
		if (arg4 || arg5)
			return -EINVAL;

		if (arg2 == PR_MCE_KILL_CLEAR) {
			if (arg3)
				return -EINVAL;
			me->flags &= ~PF_MCE_PROCESS;
		} else if (arg2 == PR_MCE_KILL_SET) {
			/*
			 * PF_MCE_PROCESS is set before arg3 is examined, so
			 * it stays set even when arg3 is rejected below.
			 */
			me->flags |= PF_MCE_PROCESS;
			switch (arg3) {
			case PR_MCE_KILL_EARLY:
				me->flags |= PF_MCE_EARLY;
				break;
			case PR_MCE_KILL_LATE:
				me->flags &= ~PF_MCE_EARLY;
				break;
			case PR_MCE_KILL_DEFAULT:
				me->flags &= ~(PF_MCE_EARLY | PF_MCE_PROCESS);
				break;
			default:
				return -EINVAL;
			}
		} else {
			return -EINVAL;
		}
		break;
	case PR_MCE_KILL_GET:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;

		if (!(me->flags & PF_MCE_PROCESS))
			error = PR_MCE_KILL_DEFAULT;
		else if (me->flags & PF_MCE_EARLY)
			error = PR_MCE_KILL_EARLY;
		else
			error = PR_MCE_KILL_LATE;
		break;

	case PR_SET_MM:
		error = prctl_set_mm(arg2, arg3, arg4, arg5);
		break;
	case PR_GET_TID_ADDRESS:
		error = prctl_get_tid_address(me, (int __user **)arg2);
		break;

	case PR_SET_CHILD_SUBREAPER:
		/* Any non-zero argument enables the flag. */
		me->signal->is_child_subreaper = !!arg2;
		break;
	case PR_GET_CHILD_SUBREAPER:
		error = put_user(me->signal->is_child_subreaper,
				 (int __user *)arg2);
		break;

	case PR_SET_NO_NEW_PRIVS:
		/* Only the value 1 is accepted; there is no way to clear it here. */
		if (arg2 != 1 || arg3 || arg4 || arg5)
			return -EINVAL;
		task_set_no_new_privs(me);
		break;
	case PR_GET_NO_NEW_PRIVS:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;
		error = task_no_new_privs(me) ? 1 : 0;
		break;

	case PR_GET_THP_DISABLE:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;
		error = (me->mm->def_flags & VM_NOHUGEPAGE) ? 1 : 0;
		break;
	case PR_SET_THP_DISABLE:
		if (arg3 || arg4 || arg5)
			return -EINVAL;
		/* def_flags is updated with mmap_sem held for writing. */
		down_write(&me->mm->mmap_sem);
		if (arg2)
			me->mm->def_flags |= VM_NOHUGEPAGE;
		else
			me->mm->def_flags &= ~VM_NOHUGEPAGE;
		up_write(&me->mm->mmap_sem);
		break;

	case PR_MPX_ENABLE_MANAGEMENT:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;
		error = MPX_ENABLE_MANAGEMENT();
		break;
	case PR_MPX_DISABLE_MANAGEMENT:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;
		error = MPX_DISABLE_MANAGEMENT();
		break;

	case PR_SET_FP_MODE:
		error = SET_FP_MODE(me, arg2);
		break;
	case PR_GET_FP_MODE:
		error = GET_FP_MODE(me);
		break;

	default:
		error = -EINVAL;
		break;
	}

	return error;
}
2275

2276 2277
/*
 * getcpu - report the CPU the caller is running on and that CPU's node.
 *
 * @cpup and @nodep are each optional; a NULL pointer skips that output.
 * @unused is ignored.  Both stores are attempted even if the first one
 * faults; the call returns -EFAULT if either failed, 0 otherwise.
 */
SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
		struct getcpu_cache __user *, unused)
{
	int cpu = raw_smp_processor_id();
	int failed = 0;

	if (cpup != NULL)
		failed |= put_user(cpu, cpup);
	if (nodep != NULL)
		failed |= put_user(cpu_to_node(cpu), nodep);

	if (failed)
		return -EFAULT;
	return 0;
}
2288

2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300
/**
 * do_sysinfo - fill in sysinfo struct
 * @info: pointer to buffer to fill
 */
static int do_sysinfo(struct sysinfo *info)
{
	unsigned long mem_total, sav_total;
	unsigned int mem_unit, bitcount;
	struct timespec tp;

	memset(info, 0, sizeof(struct sysinfo));

2301
	get_monotonic_boottime(&tp);
2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393
	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);

	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);

	info->procs = nr_threads;

	si_meminfo(info);
	si_swapinfo(info);

	/*
	 * If the sum of all the available memory (i.e. ram + swap)
	 * is less than can be stored in a 32 bit unsigned long then
	 * we can be binary compatible with 2.2.x kernels.  If not,
	 * well, in that case 2.2.x was broken anyways...
	 *
	 *  -Erik Andersen <andersee@debian.org>
	 */

	mem_total = info->totalram + info->totalswap;
	if (mem_total < info->totalram || mem_total < info->totalswap)
		goto out;
	bitcount = 0;
	mem_unit = info->mem_unit;
	while (mem_unit > 1) {
		bitcount++;
		mem_unit >>= 1;
		sav_total = mem_total;
		mem_total <<= 1;
		if (mem_total < sav_total)
			goto out;
	}

	/*
	 * If mem_total did not overflow, multiply all memory values by
	 * info->mem_unit and set it to 1.  This leaves things compatible
	 * with 2.2.x, and also retains compatibility with earlier 2.4.x
	 * kernels...
	 */

	info->mem_unit = 1;
	info->totalram <<= bitcount;
	info->freeram <<= bitcount;
	info->sharedram <<= bitcount;
	info->bufferram <<= bitcount;
	info->totalswap <<= bitcount;
	info->freeswap <<= bitcount;
	info->totalhigh <<= bitcount;
	info->freehigh <<= bitcount;

out:
	return 0;
}

/*
 * sysinfo - copy a freshly gathered struct sysinfo to user space.
 *
 * Returns 0 on success or -EFAULT if @info cannot be written.
 */
SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
{
	struct sysinfo snapshot;

	do_sysinfo(&snapshot);

	return copy_to_user(info, &snapshot, sizeof(snapshot)) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT
/*
 * Fixed-width layout of the sysinfo result as handed to callers of the
 * compat sysinfo() system call.  Field names mirror struct sysinfo; the
 * widths are pinned with explicit s32/u32/u16 types.
 */
struct compat_sysinfo {
	s32 uptime;
	u32 loads[3];
	u32 totalram;
	u32 freeram;
	u32 sharedram;
	u32 bufferram;
	u32 totalswap;
	u32 freeswap;
	u16 procs;
	u16 pad;	/* NOTE(review): presumably explicit alignment filler after procs - confirm */
	u32 totalhigh;
	u32 freehigh;
	u32 mem_unit;
	/* Trailing filler; size is 20 minus two u32s and one int. */
	char _f[20-2*sizeof(u32)-sizeof(int)];
};

/*
 * Compat sysinfo - report system information using the 32-bit layout.
 *
 * The native figures are gathered with do_sysinfo().  If total RAM or
 * total swap does not fit in 32 bits, mem_unit is raised (up to
 * PAGE_SIZE) and every memory value is scaled down by the same factor.
 *
 * The result is assembled in a zeroed struct compat_sysinfo on the stack
 * and copied out in one go, so the pad and _f members reach user space
 * as zeros instead of being left unwritten, matching what the native
 * call does for its own reserved bytes.
 *
 * Returns 0 on success or -EFAULT if @info cannot be written.
 */
COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
{
	struct sysinfo s;
	struct compat_sysinfo s_32;

	do_sysinfo(&s);

	/*
	 * Check to see if any memory value is too large for 32-bit and
	 * scale down if needed.
	 */
	if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
		int bitcount = 0;

		while (s.mem_unit < PAGE_SIZE) {
			s.mem_unit <<= 1;
			bitcount++;
		}

		s.totalram >>= bitcount;
		s.freeram >>= bitcount;
		s.sharedram >>= bitcount;
		s.bufferram >>= bitcount;
		s.totalswap >>= bitcount;
		s.freeswap >>= bitcount;
		s.totalhigh >>= bitcount;
		s.freehigh >>= bitcount;
	}

	/* Zero first so pad and _f are well defined for the caller. */
	memset(&s_32, 0, sizeof(s_32));
	s_32.uptime = s.uptime;
	s_32.loads[0] = s.loads[0];
	s_32.loads[1] = s.loads[1];
	s_32.loads[2] = s.loads[2];
	s_32.totalram = s.totalram;
	s_32.freeram = s.freeram;
	s_32.sharedram = s.sharedram;
	s_32.bufferram = s.bufferram;
	s_32.totalswap = s.totalswap;
	s_32.freeswap = s.freeswap;
	s_32.procs = s.procs;
	s_32.totalhigh = s.totalhigh;
	s_32.freehigh = s.freehigh;
	s_32.mem_unit = s.mem_unit;

	if (copy_to_user(info, &s_32, sizeof(s_32)))
		return -EFAULT;

	return 0;
}
#endif /* CONFIG_COMPAT */