exec.c 41.5 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 *  linux/fs/exec.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * #!-checking implemented by tytso.
 */
/*
 * Demand-loading implemented 01.12.91 - no need to read anything but
 * the header into memory. The inode of the executable is put into
 * "current->executable", and page faults do the actual loading. Clean.
 *
 * Once more I can proudly say that linux stood up to being changed: it
 * was less than 2 hours work to get demand-loading completely implemented.
 *
 * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
 * current->executable is only used by the procfs.  This allows a dispatch
 * table to check for several different types  of binary formats.  We keep
 * trying until we recognize the file or we run out of supported binary
 * formats. 
 */

#include <linux/slab.h>
#include <linux/file.h>
A
Al Viro 已提交
27
#include <linux/fdtable.h>
H
Hugh Dickins 已提交
28
#include <linux/mm.h>
D
Davidlohr Bueso 已提交
29
#include <linux/vmacache.h>
L
Linus Torvalds 已提交
30 31
#include <linux/stat.h>
#include <linux/fcntl.h>
H
Hugh Dickins 已提交
32
#include <linux/swap.h>
33
#include <linux/string.h>
L
Linus Torvalds 已提交
34
#include <linux/init.h>
35
#include <linux/pagemap.h>
36
#include <linux/perf_event.h>
L
Linus Torvalds 已提交
37 38 39 40 41 42
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/utsname.h>
43
#include <linux/pid_namespace.h>
L
Linus Torvalds 已提交
44 45 46 47 48
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
49
#include <linux/tsacct_kern.h>
M
Matt Helsley 已提交
50
#include <linux/cn_proc.h>
A
Al Viro 已提交
51
#include <linux/audit.h>
R
Roland McGrath 已提交
52
#include <linux/tracehook.h>
J
Johannes Berg 已提交
53
#include <linux/kmod.h>
54
#include <linux/fsnotify.h>
55
#include <linux/fs_struct.h>
56
#include <linux/pipe_fs_i.h>
Y
Ying Han 已提交
57
#include <linux/oom.h>
58
#include <linux/compat.h>
59
#include <linux/vmalloc.h>
L
Linus Torvalds 已提交
60 61 62

#include <asm/uaccess.h>
#include <asm/mmu_context.h>
63
#include <asm/tlb.h>
64 65

#include <trace/events/task.h>
66
#include "internal.h"
L
Linus Torvalds 已提交
67

68 69
#include <trace/events/sched.h>

A
Alan Cox 已提交
70 71
int suid_dumpable = 0;

A
Alexey Dobriyan 已提交
72
static LIST_HEAD(formats);
L
Linus Torvalds 已提交
73 74
static DEFINE_RWLOCK(binfmt_lock);

A
Al Viro 已提交
75
void __register_binfmt(struct linux_binfmt * fmt, int insert)
L
Linus Torvalds 已提交
76
{
A
Al Viro 已提交
77
	BUG_ON(!fmt);
78 79
	if (WARN_ON(!fmt->load_binary))
		return;
L
Linus Torvalds 已提交
80
	write_lock(&binfmt_lock);
I
Ivan Kokshaysky 已提交
81 82
	insert ? list_add(&fmt->lh, &formats) :
		 list_add_tail(&fmt->lh, &formats);
L
Linus Torvalds 已提交
83 84 85
	write_unlock(&binfmt_lock);
}

I
Ivan Kokshaysky 已提交
86
EXPORT_SYMBOL(__register_binfmt);
L
Linus Torvalds 已提交
87

88
void unregister_binfmt(struct linux_binfmt * fmt)
L
Linus Torvalds 已提交
89 90
{
	write_lock(&binfmt_lock);
A
Alexey Dobriyan 已提交
91
	list_del(&fmt->lh);
L
Linus Torvalds 已提交
92 93 94 95 96 97 98 99 100 101
	write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(unregister_binfmt);

/* Release the module reference associated with @fmt. */
static inline void put_binfmt(struct linux_binfmt * fmt)
{
	struct module *owner = fmt->module;

	module_put(owner);
}

102 103 104 105 106 107
/*
 * Return true when @path must not be executed from: either its mount
 * carries MNT_NOEXEC or the mount's superblock carries SB_I_NOEXEC.
 */
bool path_noexec(const struct path *path)
{
	if (path->mnt->mnt_flags & MNT_NOEXEC)
		return true;
	if (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC)
		return true;
	return false;
}

108
#ifdef CONFIG_USELIB
L
Linus Torvalds 已提交
109 110 111 112 113 114
/*
 * Note that a shared library must be both readable and executable due to
 * security reasons.
 *
 * Also note that the load address is taken from the file itself.
 */
115
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
	/* The library is opened read-only and must be readable + executable. */
	static const struct open_flags uselib_flags = {
		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
		.acc_mode = MAY_READ | MAY_EXEC,
		.intent = LOOKUP_OPEN,
		.lookup_flags = LOOKUP_FOLLOW,
	};
	struct filename *libname = getname(library);
	struct linux_binfmt *fmt;
	struct file *lib;
	int ret = PTR_ERR(libname);

	if (IS_ERR(libname))
		goto out;

	lib = do_filp_open(AT_FDCWD, libname, &uselib_flags);
	putname(libname);
	ret = PTR_ERR(lib);
	if (IS_ERR(lib))
		goto out;

	/* Only regular files can be loaded as a library. */
	ret = -EINVAL;
	if (!S_ISREG(file_inode(lib)->i_mode))
		goto out_fput;

	/* Refuse files reached through a no-exec mount or superblock. */
	ret = -EACCES;
	if (path_noexec(&lib->f_path))
		goto out_fput;

	fsnotify_open(lib);

	/*
	 * Offer the file to every registered format that has a load_shlib
	 * handler, stopping at the first one that does not answer -ENOEXEC.
	 * binfmt_lock is dropped around the handler call; the module
	 * reference taken by try_module_get() is held across it.
	 */
	ret = -ENOEXEC;

	read_lock(&binfmt_lock);
	list_for_each_entry(fmt, &formats, lh) {
		if (!fmt->load_shlib || !try_module_get(fmt->module))
			continue;
		read_unlock(&binfmt_lock);
		ret = fmt->load_shlib(lib);
		read_lock(&binfmt_lock);
		put_binfmt(fmt);
		if (ret != -ENOEXEC)
			break;
	}
	read_unlock(&binfmt_lock);
out_fput:
	fput(lib);
out:
	return ret;
}
168
#endif /* #ifdef CONFIG_USELIB */
L
Linus Torvalds 已提交
169

170
#ifdef CONFIG_MMU
O
Oleg Nesterov 已提交
171 172 173 174 175 176
/*
 * The nascent bprm->mm is not visible until exec_mmap() but it can
 * use a lot of memory, account these pages in current->mm temporarily
 * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
 * change the counter back via acct_arg_size(0).
 */
177
static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
178 179 180 181 182 183 184 185 186 187 188
{
	struct mm_struct *mm = current->mm;
	long diff = (long)(pages - bprm->vma_pages);

	if (!mm || !diff)
		return;

	bprm->vma_pages = pages;
	add_mm_counter(mm, MM_ANONPAGES, diff);
}

189
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
190 191 192 193 194 195 196
		int write)
{
	struct page *page;
	int ret;

#ifdef CONFIG_STACK_GROWSUP
	if (write) {
197
		ret = expand_downwards(bprm->vma, pos);
198 199 200 201 202 203 204 205 206 207 208
		if (ret < 0)
			return NULL;
	}
#endif
	ret = get_user_pages(current, bprm->mm, pos,
			1, write, 1, &page, NULL);
	if (ret <= 0)
		return NULL;

	if (write) {
		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
209 210
		struct rlimit *rlim;

211 212
		acct_arg_size(bprm, size / PAGE_SIZE);

213 214 215 216 217 218
		/*
		 * We've historically supported up to 32 pages (ARG_MAX)
		 * of argument strings even with small stacks
		 */
		if (size <= ARG_MAX)
			return page;
219 220 221 222 223 224 225 226

		/*
		 * Limit to 1/4-th the stack size for the argv+env strings.
		 * This ensures that:
		 *  - the remaining binfmt code will not run out of stack space,
		 *  - the program will have a reasonable amount of stack left
		 *    to work from.
		 */
227
		rlim = current->signal->rlim;
J
Jiri Slaby 已提交
228
		if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257
			put_page(page);
			return NULL;
		}
	}

	return page;
}

/* Drop the page reference on @page by calling put_page(). */
static void put_arg_page(struct page *page)
{
	put_page(page);
}

/* Empty in the CONFIG_MMU build: @bprm and @i are ignored. */
static void free_arg_page(struct linux_binprm *bprm, int i)
{
}

/* Empty in the CONFIG_MMU build: @bprm is ignored. */
static void free_arg_pages(struct linux_binprm *bprm)
{
}

/*
 * Call flush_cache_page() on bprm->vma for address @pos, passing the
 * page frame number of @page.
 */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
		struct page *page)
{
	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}

static int __bprm_mm_init(struct linux_binprm *bprm)
{
258
	int err;
259 260 261 262 263
	struct vm_area_struct *vma = NULL;
	struct mm_struct *mm = bprm->mm;

	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma)
264
		return -ENOMEM;
265 266 267 268 269 270 271 272 273 274

	down_write(&mm->mmap_sem);
	vma->vm_mm = mm;

	/*
	 * Place the stack at the largest stack address the architecture
	 * supports. Later, we'll move this to an appropriate place. We don't
	 * use STACK_TOP because that can depend on attributes which aren't
	 * configured yet.
	 */
275
	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
276 277
	vma->vm_end = STACK_TOP_MAX;
	vma->vm_start = vma->vm_end - PAGE_SIZE;
278
	vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
279
	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
280
	INIT_LIST_HEAD(&vma->anon_vma_chain);
281

282
	err = insert_vm_struct(mm, vma);
283
	if (err)
284 285 286
		goto err;

	mm->stack_vm = mm->total_vm = 1;
287
	arch_bprm_mm_init(mm, vma);
288 289 290 291
	up_write(&mm->mmap_sem);
	bprm->p = vma->vm_end - sizeof(void *);
	return 0;
err:
292 293 294
	up_write(&mm->mmap_sem);
	bprm->vma = NULL;
	kmem_cache_free(vm_area_cachep, vma);
295 296 297 298 299 300 301 302 303 304
	return err;
}

/*
 * Return true when @len does not exceed MAX_ARG_STRLEN.  @bprm is not
 * consulted in this variant.
 */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
	return len <= MAX_ARG_STRLEN;
}

#else

305
/* Argument-page accounting is a no-op when CONFIG_MMU is not set. */
static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
}

309
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
		int write)
{
	struct page *page;

	page = bprm->page[pos / PAGE_SIZE];
	if (!page && write) {
		page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
		if (!page)
			return NULL;
		bprm->page[pos / PAGE_SIZE] = page;
	}

	return page;
}

/* Empty when CONFIG_MMU is not set: @page is ignored. */
static void put_arg_page(struct page *page)
{
}

/* Free the page stored in bprm->page[@i], if any, and clear the slot. */
static void free_arg_page(struct linux_binprm *bprm, int i)
{
	if (!bprm->page[i])
		return;

	__free_page(bprm->page[i]);
	bprm->page[i] = NULL;
}

/* Release every one of the MAX_ARG_PAGES slots of bprm->page[]. */
static void free_arg_pages(struct linux_binprm *bprm)
{
	int slot = 0;

	while (slot < MAX_ARG_PAGES) {
		free_arg_page(bprm, slot);
		slot++;
	}
}

/* Empty when CONFIG_MMU is not set: all arguments are ignored. */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
		struct page *page)
{
}

/*
 * Set bprm->p to the end of a MAX_ARG_PAGES-page area minus the size of
 * one pointer.  Always returns 0.
 */
static int __bprm_mm_init(struct linux_binprm *bprm)
{
	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
	return 0;
}

/* Return true when @len is no larger than the current value of bprm->p. */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
	return len <= bprm->p;
}

#endif /* CONFIG_MMU */

/*
 * Create a new mm_struct and populate it with a temporary stack
 * vm_area_struct.  We don't have enough context at this point to set the stack
 * flags, permissions, and offset, so we use temporary values.  We'll update
 * them later in setup_arg_pages().
 */
369
static int bprm_mm_init(struct linux_binprm *bprm)
370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393
{
	int err;
	struct mm_struct *mm = NULL;

	bprm->mm = mm = mm_alloc();
	err = -ENOMEM;
	if (!mm)
		goto err;

	err = __bprm_mm_init(bprm);
	if (err)
		goto err;

	return 0;

err:
	if (mm) {
		bprm->mm = NULL;
		mmdrop(mm);
	}

	return err;
}

394
struct user_arg_ptr {
395 396 397 398 399 400
#ifdef CONFIG_COMPAT
	bool is_compat;
#endif
	union {
		const char __user *const __user *native;
#ifdef CONFIG_COMPAT
A
Al Viro 已提交
401
		const compat_uptr_t __user *compat;
402 403
#endif
	} ptr;
404 405 406
};

/*
 * Fetch entry @nr of the pointer array described by @argv.  Returns the
 * string pointer stored there, or ERR_PTR(-EFAULT) if the entry could
 * not be read.
 */
static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
{
	const char __user *entry;

#ifdef CONFIG_COMPAT
	if (unlikely(argv.is_compat)) {
		compat_uptr_t entry32;

		if (get_user(entry32, &argv.ptr.compat[nr]))
			return ERR_PTR(-EFAULT);

		return compat_ptr(entry32);
	}
#endif

	if (get_user(entry, &argv.ptr.native[nr]))
		return ERR_PTR(-EFAULT);

	return entry;
}

L
Linus Torvalds 已提交
427 428 429
/*
 * count() counts the number of strings in array ARGV.
 */
430
static int count(struct user_arg_ptr argv, int max)
L
Linus Torvalds 已提交
431 432 433
{
	int i = 0;

434
	if (argv.ptr.native != NULL) {
L
Linus Torvalds 已提交
435
		for (;;) {
436
			const char __user *p = get_user_arg_ptr(argv, i);
L
Linus Torvalds 已提交
437 438 439

			if (!p)
				break;
440 441 442 443

			if (IS_ERR(p))
				return -EFAULT;

444
			if (i >= max)
L
Linus Torvalds 已提交
445
				return -E2BIG;
446
			++i;
447 448 449

			if (fatal_signal_pending(current))
				return -ERESTARTNOHAND;
L
Linus Torvalds 已提交
450 451 452 453 454 455 456
			cond_resched();
		}
	}
	return i;
}

/*
457 458 459
 * 'copy_strings()' copies argument/environment strings from the old
 * process's memory to the new process's stack.  The call to get_user_pages()
 * ensures the destination page is created and not swapped out.
L
Linus Torvalds 已提交
460
 */
461
static int copy_strings(int argc, struct user_arg_ptr argv,
A
Adrian Bunk 已提交
462
			struct linux_binprm *bprm)
L
Linus Torvalds 已提交
463 464 465
{
	struct page *kmapped_page = NULL;
	char *kaddr = NULL;
466
	unsigned long kpos = 0;
L
Linus Torvalds 已提交
467 468 469
	int ret;

	while (argc-- > 0) {
470
		const char __user *str;
L
Linus Torvalds 已提交
471 472 473
		int len;
		unsigned long pos;

474 475 476
		ret = -EFAULT;
		str = get_user_arg_ptr(argv, argc);
		if (IS_ERR(str))
L
Linus Torvalds 已提交
477 478
			goto out;

479 480 481 482 483 484
		len = strnlen_user(str, MAX_ARG_STRLEN);
		if (!len)
			goto out;

		ret = -E2BIG;
		if (!valid_arg_len(bprm, len))
L
Linus Torvalds 已提交
485 486
			goto out;

487
		/* We're going to work our way backwords. */
L
Linus Torvalds 已提交
488
		pos = bprm->p;
489 490
		str += len;
		bprm->p -= len;
L
Linus Torvalds 已提交
491 492 493 494

		while (len > 0) {
			int offset, bytes_to_copy;

495 496 497 498
			if (fatal_signal_pending(current)) {
				ret = -ERESTARTNOHAND;
				goto out;
			}
499 500
			cond_resched();

L
Linus Torvalds 已提交
501
			offset = pos % PAGE_SIZE;
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517
			if (offset == 0)
				offset = PAGE_SIZE;

			bytes_to_copy = offset;
			if (bytes_to_copy > len)
				bytes_to_copy = len;

			offset -= bytes_to_copy;
			pos -= bytes_to_copy;
			str -= bytes_to_copy;
			len -= bytes_to_copy;

			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
				struct page *page;

				page = get_arg_page(bprm, pos, 1);
L
Linus Torvalds 已提交
518
				if (!page) {
519
					ret = -E2BIG;
L
Linus Torvalds 已提交
520 521 522
					goto out;
				}

523 524
				if (kmapped_page) {
					flush_kernel_dcache_page(kmapped_page);
L
Linus Torvalds 已提交
525
					kunmap(kmapped_page);
526 527
					put_arg_page(kmapped_page);
				}
L
Linus Torvalds 已提交
528 529
				kmapped_page = page;
				kaddr = kmap(kmapped_page);
530 531
				kpos = pos & PAGE_MASK;
				flush_arg_page(bprm, kpos, kmapped_page);
L
Linus Torvalds 已提交
532
			}
533
			if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
L
Linus Torvalds 已提交
534 535 536 537 538 539 540
				ret = -EFAULT;
				goto out;
			}
		}
	}
	ret = 0;
out:
541 542
	if (kmapped_page) {
		flush_kernel_dcache_page(kmapped_page);
L
Linus Torvalds 已提交
543
		kunmap(kmapped_page);
544 545
		put_arg_page(kmapped_page);
	}
L
Linus Torvalds 已提交
546 547 548 549 550 551
	return ret;
}

/*
 * Like copy_strings, but get argv and its values from kernel memory.
 */
552
int copy_strings_kernel(int argc, const char *const *__argv,
553
			struct linux_binprm *bprm)
L
Linus Torvalds 已提交
554 555 556
{
	int r;
	mm_segment_t oldfs = get_fs();
557
	struct user_arg_ptr argv = {
558
		.ptr.native = (const char __user *const  __user *)__argv,
559 560
	};

L
Linus Torvalds 已提交
561
	set_fs(KERNEL_DS);
562
	r = copy_strings(argc, argv, bprm);
L
Linus Torvalds 已提交
563
	set_fs(oldfs);
564

L
Linus Torvalds 已提交
565 566 567 568 569
	return r;
}
EXPORT_SYMBOL(copy_strings_kernel);

#ifdef CONFIG_MMU
570

L
Linus Torvalds 已提交
571
/*
572 573 574
 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
 * the binfmt code determines where the new stack should reside, we shift it to
 * its final location.  The process proceeds as follows:
L
Linus Torvalds 已提交
575
 *
576 577 578 579 580 581
 * 1) Use shift to calculate the new vma endpoints.
 * 2) Extend vma to cover both the old and new ranges.  This ensures the
 *    arguments passed to subsequent functions are consistent.
 * 3) Move vma's page tables to the new range.
 * 4) Free up any cleared pgd range.
 * 5) Shrink the vma to cover only the new range.
L
Linus Torvalds 已提交
582
 */
583
static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
L
Linus Torvalds 已提交
584 585
{
	struct mm_struct *mm = vma->vm_mm;
586 587 588 589 590
	unsigned long old_start = vma->vm_start;
	unsigned long old_end = vma->vm_end;
	unsigned long length = old_end - old_start;
	unsigned long new_start = old_start - shift;
	unsigned long new_end = old_end - shift;
P
Peter Zijlstra 已提交
591
	struct mmu_gather tlb;
L
Linus Torvalds 已提交
592

593
	BUG_ON(new_start > new_end);
L
Linus Torvalds 已提交
594

595 596 597 598 599 600 601 602 603 604
	/*
	 * ensure there are no vmas between where we want to go
	 * and where we are
	 */
	if (vma != find_vma(mm, new_start))
		return -EFAULT;

	/*
	 * cover the whole range: [new_start, old_end)
	 */
605 606
	if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
		return -ENOMEM;
607 608 609 610 611 612

	/*
	 * move the page tables downwards, on failure we rely on
	 * process cleanup to remove whatever mess we made.
	 */
	if (length != move_page_tables(vma, old_start,
613
				       vma, new_start, length, false))
614 615 616
		return -ENOMEM;

	lru_add_drain();
617
	tlb_gather_mmu(&tlb, mm, old_start, old_end);
618 619 620 621
	if (new_end > old_start) {
		/*
		 * when the old and new regions overlap clear from new_end.
		 */
P
Peter Zijlstra 已提交
622
		free_pgd_range(&tlb, new_end, old_end, new_end,
623
			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
624 625 626 627 628 629 630
	} else {
		/*
		 * otherwise, clean from old_start; this is done to not touch
		 * the address space in [new_end, old_start) some architectures
		 * have constraints on va-space that make this illegal (IA64) -
		 * for the others its just a little faster.
		 */
P
Peter Zijlstra 已提交
631
		free_pgd_range(&tlb, old_start, old_end, new_end,
632
			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
L
Linus Torvalds 已提交
633
	}
634
	tlb_finish_mmu(&tlb, old_start, old_end);
635 636

	/*
637
	 * Shrink the vma to just the new range.  Always succeeds.
638 639 640 641
	 */
	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);

	return 0;
L
Linus Torvalds 已提交
642 643
}

644 645 646 647
/*
 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 * the stack is optionally relocated, and some extra space is added.
 */
L
Linus Torvalds 已提交
648 649 650 651
int setup_arg_pages(struct linux_binprm *bprm,
		    unsigned long stack_top,
		    int executable_stack)
{
652 653
	unsigned long ret;
	unsigned long stack_shift;
L
Linus Torvalds 已提交
654
	struct mm_struct *mm = current->mm;
655 656 657 658
	struct vm_area_struct *vma = bprm->vma;
	struct vm_area_struct *prev = NULL;
	unsigned long vm_flags;
	unsigned long stack_base;
659 660 661
	unsigned long stack_size;
	unsigned long stack_expand;
	unsigned long rlim_stack;
L
Linus Torvalds 已提交
662 663

#ifdef CONFIG_STACK_GROWSUP
664
	/* Limit stack size */
J
Jiri Slaby 已提交
665
	stack_base = rlimit_max(RLIMIT_STACK);
666 667
	if (stack_base > STACK_SIZE_MAX)
		stack_base = STACK_SIZE_MAX;
L
Linus Torvalds 已提交
668

669 670 671
	/* Add space for stack randomization. */
	stack_base += (STACK_RND_MASK << PAGE_SHIFT);

672 673 674
	/* Make sure we didn't let the argument array grow too large. */
	if (vma->vm_end - vma->vm_start > stack_base)
		return -ENOMEM;
L
Linus Torvalds 已提交
675

676
	stack_base = PAGE_ALIGN(stack_top - stack_base);
L
Linus Torvalds 已提交
677

678 679 680
	stack_shift = vma->vm_start - stack_base;
	mm->arg_start = bprm->p - stack_shift;
	bprm->p = vma->vm_end - stack_shift;
L
Linus Torvalds 已提交
681
#else
682 683
	stack_top = arch_align_stack(stack_top);
	stack_top = PAGE_ALIGN(stack_top);
684 685 686 687 688

	if (unlikely(stack_top < mmap_min_addr) ||
	    unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
		return -ENOMEM;

689 690 691
	stack_shift = vma->vm_end - stack_top;

	bprm->p -= stack_shift;
L
Linus Torvalds 已提交
692 693 694 695
	mm->arg_start = bprm->p;
#endif

	if (bprm->loader)
696 697
		bprm->loader -= stack_shift;
	bprm->exec -= stack_shift;
L
Linus Torvalds 已提交
698 699

	down_write(&mm->mmap_sem);
700
	vm_flags = VM_STACK_FLAGS;
701 702 703 704 705 706 707 708 709 710 711

	/*
	 * Adjust stack execute permissions; explicitly enable for
	 * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
	 * (arch default) otherwise.
	 */
	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
		vm_flags |= VM_EXEC;
	else if (executable_stack == EXSTACK_DISABLE_X)
		vm_flags &= ~VM_EXEC;
	vm_flags |= mm->def_flags;
712
	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
713 714 715 716 717 718 719 720 721 722

	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
			vm_flags);
	if (ret)
		goto out_unlock;
	BUG_ON(prev != vma);

	/* Move stack pages down in memory. */
	if (stack_shift) {
		ret = shift_arg_pages(vma, stack_shift);
723 724
		if (ret)
			goto out_unlock;
L
Linus Torvalds 已提交
725 726
	}

727 728 729
	/* mprotect_fixup is overkill to remove the temporary stack flags */
	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;

730
	stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
731 732 733 734 735 736
	stack_size = vma->vm_end - vma->vm_start;
	/*
	 * Align this down to a page boundary as expand_stack
	 * will align it up.
	 */
	rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
737
#ifdef CONFIG_STACK_GROWSUP
738 739 740 741
	if (stack_size + stack_expand > rlim_stack)
		stack_base = vma->vm_start + rlim_stack;
	else
		stack_base = vma->vm_end + stack_expand;
742
#else
743 744 745 746
	if (stack_size + stack_expand > rlim_stack)
		stack_base = vma->vm_end - rlim_stack;
	else
		stack_base = vma->vm_start - stack_expand;
747
#endif
748
	current->mm->start_stack = bprm->p;
749 750 751 752 753
	ret = expand_stack(vma, stack_base);
	if (ret)
		ret = -EFAULT;

out_unlock:
L
Linus Torvalds 已提交
754
	up_write(&mm->mmap_sem);
755
	return ret;
L
Linus Torvalds 已提交
756 757 758 759 760
}
EXPORT_SYMBOL(setup_arg_pages);

#endif /* CONFIG_MMU */

761
static struct file *do_open_execat(int fd, struct filename *name, int flags)
L
Linus Torvalds 已提交
762 763
{
	struct file *file;
764
	int err;
765
	struct open_flags open_exec_flags = {
766
		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
A
Al Viro 已提交
767
		.acc_mode = MAY_EXEC,
768 769
		.intent = LOOKUP_OPEN,
		.lookup_flags = LOOKUP_FOLLOW,
770
	};
L
Linus Torvalds 已提交
771

772 773 774 775 776 777 778 779
	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
		return ERR_PTR(-EINVAL);
	if (flags & AT_SYMLINK_NOFOLLOW)
		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
	if (flags & AT_EMPTY_PATH)
		open_exec_flags.lookup_flags |= LOOKUP_EMPTY;

	file = do_filp_open(fd, name, &open_exec_flags);
780
	if (IS_ERR(file))
781 782 783
		goto out;

	err = -EACCES;
A
Al Viro 已提交
784
	if (!S_ISREG(file_inode(file)->i_mode))
785
		goto exit;
786

787
	if (path_noexec(&file->f_path))
788
		goto exit;
789 790

	err = deny_write_access(file);
791 792
	if (err)
		goto exit;
L
Linus Torvalds 已提交
793

794 795 796
	if (name->name[0] != '\0')
		fsnotify_open(file);

797
out:
798 799
	return file;

800 801
exit:
	fput(file);
802 803
	return ERR_PTR(err);
}
804 805 806

/*
 * Open the executable named by the kernel string @name, resolved
 * relative to the current working directory.  Returns the file or an
 * ERR_PTR().
 */
struct file *open_exec(const char *name)
{
	struct filename *fname = getname_kernel(name);
	struct file *file;

	if (IS_ERR(fname))
		return ERR_CAST(fname);

	file = do_open_execat(AT_FDCWD, fname, 0);
	putname(fname);
	return file;
}
L
Linus Torvalds 已提交
816 817
EXPORT_SYMBOL(open_exec);

M
Mimi Zohar 已提交
818 819
/*
 * Read up to @count bytes from @file at @offset into the kernel buffer
 * @addr.  Returns whatever vfs_read() returns, as an int.
 */
int kernel_read(struct file *file, loff_t offset,
		char *addr, unsigned long count)
{
	mm_segment_t saved_fs = get_fs();
	int nread;

	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	nread = vfs_read(file, (void __user *)addr, count, &offset);
	set_fs(saved_fs);

	return nread;
}

EXPORT_SYMBOL(kernel_read);

835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886
/*
 * Read the whole of the regular file @file into a newly vmalloc()ed
 * buffer.
 *
 * @file:     file to read; must be a regular file
 * @buf:      receives the buffer on success; set to NULL when the read or
 *            the post-read security hook fails
 * @size:     receives the number of bytes read on success
 * @max_size: largest acceptable file size, or 0 for no limit; negative
 *            values are rejected
 *
 * Returns 0 on success.  Errors: -EINVAL for a non-regular file, a
 * negative @max_size or an empty file; -EFBIG when the file is larger
 * than @max_size or does not fit in size_t; -ENOMEM when the buffer
 * cannot be allocated; -EIO on a short read; otherwise the error from
 * kernel_read() or security_kernel_post_read_file().
 */
int kernel_read_file(struct file *file, void **buf, loff_t *size,
		     loff_t max_size)
{
	loff_t i_size, pos;
	ssize_t bytes = 0;
	int ret;

	if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
		return -EINVAL;

	i_size = i_size_read(file_inode(file));
	/*
	 * vmalloc() takes an unsigned long.  A loff_t size that does not fit
	 * in size_t would be truncated in the allocation below while the read
	 * loop still copies i_size bytes, overrunning the buffer.
	 */
	if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size))
		return -EFBIG;
	if (i_size <= 0)
		return -EINVAL;

	*buf = vmalloc(i_size);
	if (!*buf)
		return -ENOMEM;

	/* kernel_read() may return short counts; loop until done or EOF. */
	pos = 0;
	while (pos < i_size) {
		bytes = kernel_read(file, pos, (char *)(*buf) + pos,
				    i_size - pos);
		if (bytes < 0) {
			ret = bytes;
			goto out;
		}

		if (bytes == 0)
			break;
		pos += bytes;
	}

	/* The file shrank (or the read stopped early) underneath us. */
	if (pos != i_size) {
		ret = -EIO;
		goto out;
	}

	ret = security_kernel_post_read_file(file, *buf, i_size);
	if (!ret)
		*size = pos;

out:
	if (ret < 0) {
		vfree(*buf);
		*buf = NULL;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(kernel_read_file);

A
Al Viro 已提交
887 888
/*
 * Read @len bytes of @file at @pos to address @addr and, if anything was
 * read, flush the instruction cache over [@addr, @addr + @len).
 * Returns the result of vfs_read().
 */
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
	ssize_t nread;

	nread = vfs_read(file, (void __user *)addr, len, &pos);
	if (nread <= 0)
		return nread;

	flush_icache_range(addr, addr + len);
	return nread;
}
EXPORT_SYMBOL(read_code);

L
Linus Torvalds 已提交
896 897 898
static int exec_mmap(struct mm_struct *mm)
{
	struct task_struct *tsk;
D
Davidlohr Bueso 已提交
899
	struct mm_struct *old_mm, *active_mm;
L
Linus Torvalds 已提交
900 901 902 903 904 905 906

	/* Notify parent that we're no longer interested in the old VM */
	tsk = current;
	old_mm = current->mm;
	mm_release(tsk, old_mm);

	if (old_mm) {
907
		sync_mm_rss(old_mm);
L
Linus Torvalds 已提交
908 909 910 911
		/*
		 * Make sure that if there is a core dump in progress
		 * for the old mm, we get out and die instead of going
		 * through with the exec.  We must hold mmap_sem around
912
		 * checking core_state and changing tsk->mm.
L
Linus Torvalds 已提交
913 914
		 */
		down_read(&old_mm->mmap_sem);
915
		if (unlikely(old_mm->core_state)) {
L
Linus Torvalds 已提交
916 917 918 919 920 921 922 923 924
			up_read(&old_mm->mmap_sem);
			return -EINTR;
		}
	}
	task_lock(tsk);
	active_mm = tsk->active_mm;
	tsk->mm = mm;
	tsk->active_mm = mm;
	activate_mm(active_mm, mm);
D
Davidlohr Bueso 已提交
925 926
	tsk->mm->vmacache_seqnum = 0;
	vmacache_flush(tsk);
L
Linus Torvalds 已提交
927 928 929
	task_unlock(tsk);
	if (old_mm) {
		up_read(&old_mm->mmap_sem);
930
		BUG_ON(active_mm != old_mm);
931
		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
932
		mm_update_next_owner(old_mm);
L
Linus Torvalds 已提交
933 934 935 936 937 938 939 940 941 942 943 944 945
		mmput(old_mm);
		return 0;
	}
	mmdrop(active_mm);
	return 0;
}

/*
 * This function makes sure the current process has its own signal table,
 * so that flush_signal_handlers can later reset the handlers without
 * disturbing other processes.  (Other processes might share the signal
 * table via the CLONE_SIGHAND option to clone().)
 */
946
static int de_thread(struct task_struct *tsk)
L
Linus Torvalds 已提交
947 948
{
	struct signal_struct *sig = tsk->signal;
949
	struct sighand_struct *oldsighand = tsk->sighand;
L
Linus Torvalds 已提交
950 951
	spinlock_t *lock = &oldsighand->siglock;

952
	if (thread_group_empty(tsk))
L
Linus Torvalds 已提交
953 954 955 956 957 958
		goto no_thread_group;

	/*
	 * Kill all other threads in the thread group.
	 */
	spin_lock_irq(lock);
959
	if (signal_group_exit(sig)) {
L
Linus Torvalds 已提交
960 961 962 963 964 965 966
		/*
		 * Another group action in progress, just
		 * return so that the signal is processed.
		 */
		spin_unlock_irq(lock);
		return -EAGAIN;
	}
967

968
	sig->group_exit_task = tsk;
969 970 971
	sig->notify_count = zap_other_threads(tsk);
	if (!thread_group_leader(tsk))
		sig->notify_count--;
L
Linus Torvalds 已提交
972

973
	while (sig->notify_count) {
O
Oleg Nesterov 已提交
974
		__set_current_state(TASK_KILLABLE);
L
Linus Torvalds 已提交
975 976
		spin_unlock_irq(lock);
		schedule();
O
Oleg Nesterov 已提交
977 978
		if (unlikely(__fatal_signal_pending(tsk)))
			goto killed;
L
Linus Torvalds 已提交
979 980 981 982 983 984 985 986 987
		spin_lock_irq(lock);
	}
	spin_unlock_irq(lock);

	/*
	 * At this point all other threads have exited, all we have to
	 * do is to wait for the thread group leader to become inactive,
	 * and to assume its PID:
	 */
988
	if (!thread_group_leader(tsk)) {
989
		struct task_struct *leader = tsk->group_leader;
990 991

		for (;;) {
992
			threadgroup_change_begin(tsk);
993
			write_lock_irq(&tasklist_lock);
994 995 996 997 998
			/*
			 * Do this under tasklist_lock to ensure that
			 * exit_notify() can't miss ->group_exit_task
			 */
			sig->notify_count = -1;
999 1000
			if (likely(leader->exit_state))
				break;
O
Oleg Nesterov 已提交
1001
			__set_current_state(TASK_KILLABLE);
1002
			write_unlock_irq(&tasklist_lock);
1003
			threadgroup_change_end(tsk);
1004
			schedule();
O
Oleg Nesterov 已提交
1005 1006
			if (unlikely(__fatal_signal_pending(tsk)))
				goto killed;
1007
		}
L
Linus Torvalds 已提交
1008

1009 1010 1011 1012 1013 1014 1015 1016 1017 1018
		/*
		 * The only record we have of the real-time age of a
		 * process, regardless of execs it's done, is start_time.
		 * All the past CPU time is accumulated in signal_struct
		 * from sister threads now dead.  But in this non-leader
		 * exec, nothing survives from the original leader thread,
		 * whose birth marks the true age of this process now.
		 * When we take on its identity by switching to its PID, we
		 * also take its birthdate (always earlier than our own).
		 */
1019
		tsk->start_time = leader->start_time;
1020
		tsk->real_start_time = leader->real_start_time;
1021

1022 1023
		BUG_ON(!same_thread_group(leader, tsk));
		BUG_ON(has_group_leader_pid(tsk));
L
Linus Torvalds 已提交
1024 1025 1026 1027 1028 1029
		/*
		 * An exec() starts a new thread group with the
		 * TGID of the previous thread group. Rehash the
		 * two threads with a switched PID, and release
		 * the former thread group leader:
		 */
1030 1031

		/* Become a process group leader with the old leader's pid.
1032 1033
		 * The old leader becomes a thread of the this thread group.
		 * Note: The old leader also uses this pid until release_task
1034 1035
		 *       is called.  Odd but simple and correct.
		 */
1036
		tsk->pid = leader->pid;
1037
		change_pid(tsk, PIDTYPE_PID, task_pid(leader));
1038 1039
		transfer_pid(leader, tsk, PIDTYPE_PGID);
		transfer_pid(leader, tsk, PIDTYPE_SID);
1040

1041
		list_replace_rcu(&leader->tasks, &tsk->tasks);
1042
		list_replace_init(&leader->sibling, &tsk->sibling);
L
Linus Torvalds 已提交
1043

1044 1045
		tsk->group_leader = tsk;
		leader->group_leader = tsk;
1046

1047
		tsk->exit_signal = SIGCHLD;
1048
		leader->exit_signal = -1;
1049 1050 1051

		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
		leader->exit_state = EXIT_DEAD;
1052 1053 1054 1055 1056 1057 1058 1059

		/*
		 * We are going to release_task()->ptrace_unlink() silently,
		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
		 * the tracer wont't block again waiting for this thread.
		 */
		if (unlikely(leader->ptrace))
			__wake_up_parent(leader, leader->parent);
L
Linus Torvalds 已提交
1060
		write_unlock_irq(&tasklist_lock);
1061
		threadgroup_change_end(tsk);
1062 1063

		release_task(leader);
1064
	}
L
Linus Torvalds 已提交
1065

1066 1067
	sig->group_exit_task = NULL;
	sig->notify_count = 0;
L
Linus Torvalds 已提交
1068 1069

no_thread_group:
1070 1071 1072
	/* we have changed execution domain */
	tsk->exit_signal = SIGCHLD;

L
Linus Torvalds 已提交
1073
	exit_itimers(sig);
1074
	flush_itimer_signals();
1075

1076 1077
	if (atomic_read(&oldsighand->count) != 1) {
		struct sighand_struct *newsighand;
L
Linus Torvalds 已提交
1078
		/*
1079 1080
		 * This ->sighand is shared with the CLONE_SIGHAND
		 * but not CLONE_THREAD task, switch to the new one.
L
Linus Torvalds 已提交
1081
		 */
1082 1083 1084 1085
		newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
		if (!newsighand)
			return -ENOMEM;

L
Linus Torvalds 已提交
1086 1087 1088 1089 1090 1091
		atomic_set(&newsighand->count, 1);
		memcpy(newsighand->action, oldsighand->action,
		       sizeof(newsighand->action));

		write_lock_irq(&tasklist_lock);
		spin_lock(&oldsighand->siglock);
1092
		rcu_assign_pointer(tsk->sighand, newsighand);
L
Linus Torvalds 已提交
1093 1094 1095
		spin_unlock(&oldsighand->siglock);
		write_unlock_irq(&tasklist_lock);

1096
		__cleanup_sighand(oldsighand);
L
Linus Torvalds 已提交
1097 1098
	}

1099
	BUG_ON(!thread_group_leader(tsk));
L
Linus Torvalds 已提交
1100
	return 0;
O
Oleg Nesterov 已提交
1101 1102 1103 1104 1105 1106 1107 1108

killed:
	/* protects against exit_notify() and __exit_signal() */
	read_lock(&tasklist_lock);
	sig->group_exit_task = NULL;
	sig->notify_count = 0;
	read_unlock(&tasklist_lock);
	return -EAGAIN;
L
Linus Torvalds 已提交
1109
}
O
Oleg Nesterov 已提交
1110

1111
char *get_task_comm(char *buf, struct task_struct *tsk)
L
Linus Torvalds 已提交
1112 1113 1114 1115 1116
{
	/* buf must be at least sizeof(tsk->comm) in size */
	task_lock(tsk);
	strncpy(buf, tsk->comm, sizeof(tsk->comm));
	task_unlock(tsk);
1117
	return buf;
L
Linus Torvalds 已提交
1118
}
1119
EXPORT_SYMBOL_GPL(get_task_comm);
L
Linus Torvalds 已提交
1120

1121 1122 1123 1124 1125
/*
 * These functions flush out all traces of the currently running executable
 * so that a new one can be started
 */

1126
/*
 * Set @tsk's command name to @buf, truncated to fit tsk->comm.
 *
 * The rename tracepoint is called before the name is overwritten; both
 * happen under task_lock(@tsk). perf is notified after the lock has
 * been dropped, with @exec passed through unchanged.
 */
void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
	task_lock(tsk);
	/* Trace first: tsk->comm still holds the previous name here. */
	trace_task_rename(tsk, buf);
	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
	task_unlock(tsk);

	perf_event_comm(tsk, exec);
}

int flush_old_exec(struct linux_binprm * bprm)
{
1137
	int retval;
L
Linus Torvalds 已提交
1138 1139 1140 1141 1142 1143 1144 1145 1146

	/*
	 * Make sure we have a private signal table and that
	 * we are unassociated from the previous thread group.
	 */
	retval = de_thread(current);
	if (retval)
		goto out;

1147 1148 1149 1150 1151
	/*
	 * Must be called _before_ exec_mmap() as bprm->mm is
	 * not visibile until then. This also enables the update
	 * to be lockless.
	 */
M
Matt Helsley 已提交
1152
	set_mm_exe_file(bprm->mm, bprm->file);
1153

L
Linus Torvalds 已提交
1154 1155 1156
	/*
	 * Release all of the old mmap stuff
	 */
1157
	acct_arg_size(bprm, 0);
L
Linus Torvalds 已提交
1158 1159
	retval = exec_mmap(bprm->mm);
	if (retval)
1160
		goto out;
L
Linus Torvalds 已提交
1161 1162

	bprm->mm = NULL;		/* We're using it now */
1163

1164
	set_fs(USER_DS);
1165 1166
	current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
					PF_NOFREEZE | PF_NO_SETAFFINITY);
1167 1168 1169
	flush_thread();
	current->personality &= ~bprm->per_clear;

1170 1171 1172 1173 1174 1175 1176
	return 0;

out:
	return retval;
}
EXPORT_SYMBOL(flush_old_exec);

1177 1178
void would_dump(struct linux_binprm *bprm, struct file *file)
{
A
Al Viro 已提交
1179
	if (inode_permission(file_inode(file), MAY_READ) < 0)
1180 1181 1182 1183
		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
}
EXPORT_SYMBOL(would_dump);

1184 1185 1186
void setup_new_exec(struct linux_binprm * bprm)
{
	arch_pick_mmap_layout(current->mm);
L
Linus Torvalds 已提交
1187 1188 1189 1190

	/* This is the point of no return */
	current->sas_ss_sp = current->sas_ss_size = 0;

1191
	if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
1192
		set_dumpable(current->mm, SUID_DUMP_USER);
A
Alan Cox 已提交
1193
	else
1194
		set_dumpable(current->mm, suid_dumpable);
A
Alan Cox 已提交
1195

1196
	perf_event_exec();
1197
	__set_task_comm(current, kbasename(bprm->filename), true);
L
Linus Torvalds 已提交
1198

1199 1200 1201 1202 1203 1204
	/* Set the new mm task size. We have to do that late because it may
	 * depend on TIF_32BIT which is only updated in flush_thread() on
	 * some architectures like powerpc
	 */
	current->mm->task_size = TASK_SIZE;

1205
	/* install the new credentials */
1206 1207
	if (!uid_eq(bprm->cred->uid, current_euid()) ||
	    !gid_eq(bprm->cred->gid, current_egid())) {
1208
		current->pdeath_signal = 0;
1209 1210 1211 1212
	} else {
		would_dump(bprm, bprm->file);
		if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
			set_dumpable(current->mm, suid_dumpable);
L
Linus Torvalds 已提交
1213 1214 1215 1216 1217 1218
	}

	/* An exec changes our domain. We are no longer part of the thread
	   group */
	current->self_exec_id++;
	flush_signal_handlers(current, 0);
1219
	do_close_on_exec(current->files);
L
Linus Torvalds 已提交
1220
}
1221
EXPORT_SYMBOL(setup_new_exec);
L
Linus Torvalds 已提交
1222

1223 1224 1225 1226 1227 1228 1229 1230
/*
 * Prepare credentials and lock ->cred_guard_mutex.
 * install_exec_creds() commits the new creds and drops the lock.
 * Or, if exec fails before, free_bprm() should release ->cred and
 * and unlock.
 */
int prepare_bprm_creds(struct linux_binprm *bprm)
{
1231
	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1232 1233 1234 1235 1236 1237
		return -ERESTARTNOINTR;

	bprm->cred = prepare_exec_creds();
	if (likely(bprm->cred))
		return 0;

1238
	mutex_unlock(&current->signal->cred_guard_mutex);
1239 1240 1241
	return -ENOMEM;
}

1242
static void free_bprm(struct linux_binprm *bprm)
1243 1244 1245
{
	free_arg_pages(bprm);
	if (bprm->cred) {
1246
		mutex_unlock(&current->signal->cred_guard_mutex);
1247 1248
		abort_creds(bprm->cred);
	}
1249 1250 1251 1252
	if (bprm->file) {
		allow_write_access(bprm->file);
		fput(bprm->file);
	}
1253 1254 1255
	/* If a binfmt changed the interp, free it. */
	if (bprm->interp != bprm->filename)
		kfree(bprm->interp);
1256 1257 1258
	kfree(bprm);
}

1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270
/*
 * Replace bprm->interp with a freshly allocated copy of @interp.
 *
 * The copy is made before the previous value is released, so the call
 * stays safe even when @interp points into the old bprm->interp, and on
 * allocation failure bprm->interp keeps its previous value instead of
 * becoming NULL.
 *
 * Returns 0 on success or -ENOMEM if the copy cannot be allocated.
 */
int bprm_change_interp(char *interp, struct linux_binprm *bprm)
{
	char *copy = kstrdup(interp, GFP_KERNEL);

	if (!copy)
		return -ENOMEM;

	/* If a binfmt changed the interp earlier, free that copy now. */
	if (bprm->interp != bprm->filename)
		kfree(bprm->interp);
	bprm->interp = copy;

	return 0;
}
EXPORT_SYMBOL(bprm_change_interp);

1271 1272 1273 1274 1275 1276 1277 1278 1279
/*
 * install the new credentials for this executable
 */
void install_exec_creds(struct linux_binprm *bprm)
{
	security_bprm_committing_creds(bprm);

	commit_creds(bprm->cred);
	bprm->cred = NULL;
1280 1281 1282 1283 1284 1285 1286 1287 1288

	/*
	 * Disable monitoring for regular users
	 * when executing setuid binaries. Must
	 * wait until new credentials are committed
	 * by commit_creds() above
	 */
	if (get_dumpable(current->mm) != SUID_DUMP_USER)
		perf_event_exit_task(current);
1289 1290
	/*
	 * cred_guard_mutex must be held at least to this point to prevent
1291
	 * ptrace_attach() from altering our determination of the task's
1292 1293
	 * credentials; any time after this it may be unlocked.
	 */
1294
	security_bprm_committed_creds(bprm);
1295
	mutex_unlock(&current->signal->cred_guard_mutex);
1296 1297 1298 1299 1300
}
EXPORT_SYMBOL(install_exec_creds);

/*
 * determine how safe it is to execute the proposed program
1301
 * - the caller must hold ->cred_guard_mutex to protect against
1302
 *   PTRACE_ATTACH or seccomp thread-sync
1303
 */
1304
static void check_unsafe_exec(struct linux_binprm *bprm)
1305
{
D
David Howells 已提交
1306
	struct task_struct *p = current, *t;
1307
	unsigned n_fs;
1308

T
Tejun Heo 已提交
1309 1310 1311 1312 1313 1314
	if (p->ptrace) {
		if (p->ptrace & PT_PTRACE_CAP)
			bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
		else
			bprm->unsafe |= LSM_UNSAFE_PTRACE;
	}
1315

1316 1317 1318 1319
	/*
	 * This isn't strictly necessary, but it makes it harder for LSMs to
	 * mess up.
	 */
1320
	if (task_no_new_privs(current))
1321 1322
		bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;

1323
	t = p;
D
David Howells 已提交
1324
	n_fs = 1;
N
Nick Piggin 已提交
1325
	spin_lock(&p->fs->lock);
1326
	rcu_read_lock();
1327
	while_each_thread(p, t) {
D
David Howells 已提交
1328 1329 1330
		if (t->fs == p->fs)
			n_fs++;
	}
1331
	rcu_read_unlock();
D
David Howells 已提交
1332

1333
	if (p->fs->users > n_fs)
1334
		bprm->unsafe |= LSM_UNSAFE_SHARE;
1335 1336
	else
		p->fs->in_exec = 1;
N
Nick Piggin 已提交
1337
	spin_unlock(&p->fs->lock);
1338 1339
}

1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362
/*
 * Decide which effective uid/gid bprm->cred should carry, based on the
 * set-user-ID / set-group-ID bits of the file being executed.
 *
 * The ids are first reset to the caller's current effective ids. They
 * are replaced by the inode's owner/group only when the mount is not
 * MNT_NOSUID, the task has not set no_new_privs, and both inode ids
 * have a mapping in the cred's user namespace.
 */
static void bprm_fill_uid(struct linux_binprm *bprm)
{
	const unsigned int sgid_exec = S_ISGID | S_IXGRP;
	struct inode *inode;
	unsigned int mode;
	kuid_t owner;
	kgid_t group;

	/* clear any previous set[ug]id data from a previous binary */
	bprm->cred->euid = current_euid();
	bprm->cred->egid = current_egid();

	if ((bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) ||
	    task_no_new_privs(current))
		return;

	inode = file_inode(bprm->file);
	mode = READ_ONCE(inode->i_mode);
	if (!(mode & (S_ISUID | S_ISGID)))
		return;

	/* Be careful if suid/sgid is set */
	inode_lock(inode);

	/* reload atomically mode/uid/gid now that lock held */
	mode = inode->i_mode;
	owner = inode->i_uid;
	group = inode->i_gid;
	inode_unlock(inode);

	/* We ignore suid/sgid if there are no mappings for them in the ns */
	if (!kuid_has_mapping(bprm->cred->user_ns, owner) ||
	    !kgid_has_mapping(bprm->cred->user_ns, group))
		return;

	if (mode & S_ISUID) {
		bprm->per_clear |= PER_CLEAR_ON_SETID;
		bprm->cred->euid = owner;
	}

	/* Set-group-ID is honoured only together with group execute. */
	if ((mode & sgid_exec) == sgid_exec) {
		bprm->per_clear |= PER_CLEAR_ON_SETID;
		bprm->cred->egid = group;
	}
}

1387 1388
/*
 * Fill the binprm structure from the inode.
 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
 *
 * This may be called multiple times for binary chains (scripts for example).
 *
 * Returns the error from security_bprm_set_creds() if it fails,
 * otherwise the result of kernel_read() into bprm->buf.
 */
int prepare_binprm(struct linux_binprm *bprm)
{
	int err;

	bprm_fill_uid(bprm);

	/* fill in binprm security blob */
	err = security_bprm_set_creds(bprm);
	if (err)
		return err;
	bprm->cred_prepared = 1;

	/* Zero the buffer first, then read from offset 0 of the file. */
	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
	return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
}

EXPORT_SYMBOL(prepare_binprm);

N
Nick Piggin 已提交
1411 1412 1413 1414 1415
/*
 * Arguments are '\0' separated strings found at the location bprm->p
 * points to; chop off the first by relocating brpm->p to right after
 * the first '\0' encountered.
 */
1416
int remove_arg_zero(struct linux_binprm *bprm)
L
Linus Torvalds 已提交
1417
{
1418 1419 1420 1421
	int ret = 0;
	unsigned long offset;
	char *kaddr;
	struct page *page;
N
Nick Piggin 已提交
1422

1423 1424
	if (!bprm->argc)
		return 0;
L
Linus Torvalds 已提交
1425

1426 1427 1428 1429 1430 1431 1432
	do {
		offset = bprm->p & ~PAGE_MASK;
		page = get_arg_page(bprm, bprm->p, 0);
		if (!page) {
			ret = -EFAULT;
			goto out;
		}
1433
		kaddr = kmap_atomic(page);
N
Nick Piggin 已提交
1434

1435 1436 1437
		for (; offset < PAGE_SIZE && kaddr[offset];
				offset++, bprm->p++)
			;
N
Nick Piggin 已提交
1438

1439
		kunmap_atomic(kaddr);
1440
		put_arg_page(page);
N
Nick Piggin 已提交
1441

1442 1443 1444
		if (offset == PAGE_SIZE)
			free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
	} while (offset == PAGE_SIZE);
N
Nick Piggin 已提交
1445

1446 1447 1448
	bprm->p++;
	bprm->argc--;
	ret = 0;
N
Nick Piggin 已提交
1449

1450 1451
out:
	return ret;
L
Linus Torvalds 已提交
1452 1453 1454
}
EXPORT_SYMBOL(remove_arg_zero);

1455
/* True for tab, newline, or a byte in 0x20..0x7e; evaluates c more than once. */
#define printable(c) ((c) == '\t' || (c) == '\n' || ((c) >= 0x20 && (c) <= 0x7e))
L
Linus Torvalds 已提交
1456 1457 1458
/*
 * cycle the list of binary formats handler, until one recognizes the image
 */
1459
int search_binary_handler(struct linux_binprm *bprm)
L
Linus Torvalds 已提交
1460
{
1461
	bool need_retry = IS_ENABLED(CONFIG_MODULES);
L
Linus Torvalds 已提交
1462
	struct linux_binfmt *fmt;
1463
	int retval;
L
Linus Torvalds 已提交
1464

1465
	/* This allows 4 levels of binfmt rewrites before failing hard. */
1466
	if (bprm->recursion_depth > 5)
1467 1468
		return -ELOOP;

L
Linus Torvalds 已提交
1469 1470 1471 1472 1473
	retval = security_bprm_check(bprm);
	if (retval)
		return retval;

	retval = -ENOENT;
1474 1475 1476 1477 1478 1479 1480 1481
 retry:
	read_lock(&binfmt_lock);
	list_for_each_entry(fmt, &formats, lh) {
		if (!try_module_get(fmt->module))
			continue;
		read_unlock(&binfmt_lock);
		bprm->recursion_depth++;
		retval = fmt->load_binary(bprm);
1482 1483
		read_lock(&binfmt_lock);
		put_binfmt(fmt);
1484
		bprm->recursion_depth--;
1485 1486 1487 1488 1489 1490 1491 1492
		if (retval < 0 && !bprm->mm) {
			/* we got to flush_old_exec() and failed after it */
			read_unlock(&binfmt_lock);
			force_sigsegv(SIGSEGV, current);
			return retval;
		}
		if (retval != -ENOEXEC || !bprm->file) {
			read_unlock(&binfmt_lock);
1493
			return retval;
L
Linus Torvalds 已提交
1494 1495
		}
	}
1496 1497
	read_unlock(&binfmt_lock);

1498
	if (need_retry) {
1499 1500 1501
		if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
		    printable(bprm->buf[2]) && printable(bprm->buf[3]))
			return retval;
1502 1503
		if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
			return retval;
1504 1505 1506 1507
		need_retry = false;
		goto retry;
	}

L
Linus Torvalds 已提交
1508 1509 1510 1511
	return retval;
}
EXPORT_SYMBOL(search_binary_handler);

1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524
/*
 * Run the binary handlers for @bprm and, when they succeed, emit the
 * exec notifications (audit, tracepoint, ptrace event, proc connector).
 *
 * Returns whatever search_binary_handler() returned.
 */
static int exec_binprm(struct linux_binprm *bprm)
{
	pid_t old_pid, old_vpid;
	int ret;

	/* Need to fetch pid before load_binary changes it */
	old_pid = current->pid;
	rcu_read_lock();
	old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
	rcu_read_unlock();

	ret = search_binary_handler(bprm);
	if (ret < 0)
		return ret;

	audit_bprm(bprm);
	trace_sched_process_exec(current, old_pid, bprm);
	ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
	proc_exec_connector(current);

	return ret;
}

L
Linus Torvalds 已提交
1534 1535 1536
/*
 * sys_execve() executes a new program.
 */
1537 1538 1539 1540
static int do_execveat_common(int fd, struct filename *filename,
			      struct user_arg_ptr argv,
			      struct user_arg_ptr envp,
			      int flags)
L
Linus Torvalds 已提交
1541
{
1542
	char *pathbuf = NULL;
L
Linus Torvalds 已提交
1543 1544
	struct linux_binprm *bprm;
	struct file *file;
1545
	struct files_struct *displaced;
L
Linus Torvalds 已提交
1546
	int retval;
1547

1548 1549 1550
	if (IS_ERR(filename))
		return PTR_ERR(filename);

1551 1552 1553 1554 1555 1556 1557
	/*
	 * We move the actual failure in case of RLIMIT_NPROC excess from
	 * set*uid() to execve() because too many poorly written programs
	 * don't check setuid() return code.  Here we additionally recheck
	 * whether NPROC limit is still exceeded.
	 */
	if ((current->flags & PF_NPROC_EXCEEDED) &&
1558
	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
1559 1560 1561 1562 1563 1564 1565
		retval = -EAGAIN;
		goto out_ret;
	}

	/* We're below the limit (still or again), so we don't want to make
	 * further execve() calls fail. */
	current->flags &= ~PF_NPROC_EXCEEDED;
L
Linus Torvalds 已提交
1566

1567
	retval = unshare_files(&displaced);
1568 1569 1570
	if (retval)
		goto out_ret;

L
Linus Torvalds 已提交
1571
	retval = -ENOMEM;
1572
	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
L
Linus Torvalds 已提交
1573
	if (!bprm)
1574
		goto out_files;
L
Linus Torvalds 已提交
1575

1576 1577
	retval = prepare_bprm_creds(bprm);
	if (retval)
1578
		goto out_free;
A
Al Viro 已提交
1579

1580
	check_unsafe_exec(bprm);
1581
	current->in_execve = 1;
1582

1583
	file = do_open_execat(fd, filename, flags);
L
Linus Torvalds 已提交
1584 1585
	retval = PTR_ERR(file);
	if (IS_ERR(file))
A
Al Viro 已提交
1586
		goto out_unmark;
L
Linus Torvalds 已提交
1587 1588 1589 1590

	sched_exec();

	bprm->file = file;
1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612
	if (fd == AT_FDCWD || filename->name[0] == '/') {
		bprm->filename = filename->name;
	} else {
		if (filename->name[0] == '\0')
			pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd);
		else
			pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s",
					    fd, filename->name);
		if (!pathbuf) {
			retval = -ENOMEM;
			goto out_unmark;
		}
		/*
		 * Record that a name derived from an O_CLOEXEC fd will be
		 * inaccessible after exec. Relies on having exclusive access to
		 * current->files (due to unshare_files above).
		 */
		if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
			bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
		bprm->filename = pathbuf;
	}
	bprm->interp = bprm->filename;
L
Linus Torvalds 已提交
1613

1614 1615
	retval = bprm_mm_init(bprm);
	if (retval)
1616
		goto out_unmark;
L
Linus Torvalds 已提交
1617

1618
	bprm->argc = count(argv, MAX_ARG_STRINGS);
L
Linus Torvalds 已提交
1619
	if ((retval = bprm->argc) < 0)
1620
		goto out;
L
Linus Torvalds 已提交
1621

1622
	bprm->envc = count(envp, MAX_ARG_STRINGS);
L
Linus Torvalds 已提交
1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642
	if ((retval = bprm->envc) < 0)
		goto out;

	retval = prepare_binprm(bprm);
	if (retval < 0)
		goto out;

	retval = copy_strings_kernel(1, &bprm->filename, bprm);
	if (retval < 0)
		goto out;

	bprm->exec = bprm->p;
	retval = copy_strings(bprm->envc, envp, bprm);
	if (retval < 0)
		goto out;

	retval = copy_strings(bprm->argc, argv, bprm);
	if (retval < 0)
		goto out;

1643
	retval = exec_binprm(bprm);
1644 1645
	if (retval < 0)
		goto out;
L
Linus Torvalds 已提交
1646

1647
	/* execve succeeded */
A
Al Viro 已提交
1648
	current->fs->in_exec = 0;
1649
	current->in_execve = 0;
1650
	acct_update_integrals(current);
1651
	task_numa_free(current);
1652
	free_bprm(bprm);
1653
	kfree(pathbuf);
1654
	putname(filename);
1655 1656 1657
	if (displaced)
		put_files_struct(displaced);
	return retval;
L
Linus Torvalds 已提交
1658

1659
out:
1660 1661 1662 1663
	if (bprm->mm) {
		acct_arg_size(bprm, 0);
		mmput(bprm->mm);
	}
L
Linus Torvalds 已提交
1664

A
Al Viro 已提交
1665
out_unmark:
1666
	current->fs->in_exec = 0;
1667
	current->in_execve = 0;
1668 1669

out_free:
1670
	free_bprm(bprm);
1671
	kfree(pathbuf);
L
Linus Torvalds 已提交
1672

1673
out_files:
1674 1675
	if (displaced)
		reset_files_struct(displaced);
L
Linus Torvalds 已提交
1676
out_ret:
1677
	putname(filename);
L
Linus Torvalds 已提交
1678 1679 1680
	return retval;
}

1681
int do_execve(struct filename *filename,
1682
	const char __user *const __user *__argv,
1683
	const char __user *const __user *__envp)
1684
{
1685 1686
	struct user_arg_ptr argv = { .ptr.native = __argv };
	struct user_arg_ptr envp = { .ptr.native = __envp };
1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698
	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}

/*
 * Execute @filename with native argument and environment vectors,
 * passing @fd and @flags through to do_execveat_common().
 */
int do_execveat(int fd, struct filename *filename,
		const char __user *const __user *__argv,
		const char __user *const __user *__envp,
		int flags)
{
	struct user_arg_ptr args = { .ptr.native = __argv };
	struct user_arg_ptr env = { .ptr.native = __envp };

	return do_execveat_common(fd, filename, args, env, flags);
}

#ifdef CONFIG_COMPAT
1702
static int compat_do_execve(struct filename *filename,
A
Al Viro 已提交
1703
	const compat_uptr_t __user *__argv,
1704
	const compat_uptr_t __user *__envp)
1705 1706 1707 1708 1709 1710 1711 1712 1713
{
	struct user_arg_ptr argv = {
		.is_compat = true,
		.ptr.compat = __argv,
	};
	struct user_arg_ptr envp = {
		.is_compat = true,
		.ptr.compat = __envp,
	};
1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730
	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}

/*
 * Compat counterpart of do_execveat(): both vectors are marked
 * is_compat and carry compat_uptr_t pointers; @fd and @flags are passed
 * through unchanged.
 */
static int compat_do_execveat(int fd, struct filename *filename,
			      const compat_uptr_t __user *__argv,
			      const compat_uptr_t __user *__envp,
			      int flags)
{
	struct user_arg_ptr args = {
		.is_compat = true,
		.ptr.compat = __argv,
	};
	struct user_arg_ptr env = {
		.is_compat = true,
		.ptr.compat = __envp,
	};

	return do_execveat_common(fd, filename, args, env, flags);
}
1732
#endif
1733

1734
void set_binfmt(struct linux_binfmt *new)
L
Linus Torvalds 已提交
1735
{
1736 1737 1738 1739
	struct mm_struct *mm = current->mm;

	if (mm->binfmt)
		module_put(mm->binfmt->module);
L
Linus Torvalds 已提交
1740

1741
	mm->binfmt = new;
1742 1743
	if (new)
		__module_get(new->module);
L
Linus Torvalds 已提交
1744 1745 1746
}
EXPORT_SYMBOL(set_binfmt);

1747
/*
1748
 * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
1749 1750 1751
 */
void set_dumpable(struct mm_struct *mm, int value)
{
1752 1753
	unsigned long old, new;

1754 1755 1756
	if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
		return;

1757 1758
	do {
		old = ACCESS_ONCE(mm->flags);
1759
		new = (old & ~MMF_DUMPABLE_MASK) | value;
1760
	} while (cmpxchg(&mm->flags, old, new) != old);
1761 1762
}

A
Al Viro 已提交
1763 1764 1765 1766 1767
/* execve(2) entry point: look up the user-supplied name and hand off to do_execve(). */
SYSCALL_DEFINE3(execve,
		const char __user *, filename,
		const char __user *const __user *, argv,
		const char __user *const __user *, envp)
{
	struct filename *name = getname(filename);

	return do_execve(name, argv, envp);
}
1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783

/*
 * execveat(2) entry point. AT_EMPTY_PATH in @flags is translated to
 * LOOKUP_EMPTY for the name lookup; @flags itself is passed on as is.
 */
SYSCALL_DEFINE5(execveat,
		int, fd, const char __user *, filename,
		const char __user *const __user *, argv,
		const char __user *const __user *, envp,
		int, flags)
{
	struct filename *name;
	int lookup_flags = 0;

	if (flags & AT_EMPTY_PATH)
		lookup_flags = LOOKUP_EMPTY;

	name = getname_flags(filename, lookup_flags, NULL);
	return do_execveat(fd, name, argv, envp, flags);
}

A
Al Viro 已提交
1784
#ifdef CONFIG_COMPAT
1785 1786 1787
/* Compat execve entry point: look up the name and hand off to compat_do_execve(). */
COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
	const compat_uptr_t __user *, argv,
	const compat_uptr_t __user *, envp)
{
	struct filename *name = getname(filename);

	return compat_do_execve(name, argv, envp);
}
1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803

/*
 * Compat execveat entry point. AT_EMPTY_PATH in @flags is translated to
 * LOOKUP_EMPTY for the name lookup; @flags itself is passed on as is.
 */
COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
		       const char __user *, filename,
		       const compat_uptr_t __user *, argv,
		       const compat_uptr_t __user *, envp,
		       int,  flags)
{
	struct filename *name;
	int lookup_flags = 0;

	if (flags & AT_EMPTY_PATH)
		lookup_flags = LOOKUP_EMPTY;

	name = getname_flags(filename, lookup_flags, NULL);
	return compat_do_execveat(fd, name, argv, envp, flags);
}
A
Al Viro 已提交
1804
#endif