coredump.c 28.6 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
2 3 4
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
5
#include <linux/freezer.h>
6 7 8 9
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
10
#include <linux/ctype.h>
11 12 13 14 15 16 17 18 19
#include <linux/string.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
20
#include <linux/coredump.h>
21
#include <linux/sched/coredump.h>
22
#include <linux/sched/signal.h>
23
#include <linux/sched/task_stack.h>
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
41 42
#include <linux/fs.h>
#include <linux/path.h>
43
#include <linux/timekeeping.h>
44
#include <linux/elf.h>
45

46
#include <linux/uaccess.h>
47 48 49 50 51 52 53 54 55 56 57
#include <asm/mmu_context.h>
#include <asm/tlb.h>
#include <asm/exec.h>

#include <trace/events/task.h>
#include "internal.h"

#include <trace/events/sched.h>

/*
 * When non-zero and core_pattern contains no %p, format_corename() appends
 * ".<tgid>" to a (non-pipe) core file name.
 */
int core_uses_pid;
/*
 * When non-zero, do_coredump() refuses to start a pipe helper once more than
 * this many piped dumps are in flight, and waits for the helper to finish
 * reading after each dump.
 */
unsigned int core_pipe_limit;
/* Template expanded by format_corename(); a leading '|' selects pipe mode. */
char core_pattern[CORENAME_MAX_SIZE] = "core";
/* Initial allocation size for a core name; raised by expand_corename(). */
static int core_name_size = CORENAME_MAX_SIZE;
60 61 62 63 64 65 66 67

/*
 * Growable buffer in which the expanded core_pattern (the core file name or
 * the pipe helper's argument strings) is assembled.
 */
struct core_name {
	/* buffer obtained from krealloc() in expand_corename() */
	char *corename;
	/* used: bytes already written; size: usable bytes in corename */
	int used, size;
};

/* The maximal length of core_pattern is also specified in sysctl.c */

68
static int expand_corename(struct core_name *cn, int size)
69
{
70
	char *corename = krealloc(cn->corename, size, GFP_KERNEL);
71

72
	if (!corename)
73 74
		return -ENOMEM;

75 76 77 78
	if (size > core_name_size) /* racy but harmless */
		core_name_size = size;

	cn->size = ksize(corename);
79
	cn->corename = corename;
80 81 82
	return 0;
}

83 84
/*
 * Append formatted text to @cn, enlarging the buffer as often as needed.
 *
 * @arg is never consumed directly: each attempt formats from a fresh copy so
 * that the call can be retried after the buffer has been grown.
 *
 * Returns 0 on success or -ENOMEM if the buffer could not be enlarged.
 */
static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt,
				     va_list arg)
{
	for (;;) {
		va_list ap;
		int room = cn->size - cn->used;
		int len;

		va_copy(ap, arg);
		len = vsnprintf(cn->corename + cn->used, room, fmt, ap);
		va_end(ap);

		/* Fits, including the terminating NUL: account for it. */
		if (len < room) {
			cn->used += len;
			return 0;
		}

		/* Make room for the missing bytes plus the terminator. */
		if (expand_corename(cn, cn->size + len - room + 1))
			return -ENOMEM;
	}
}

107
/*
 * printf-style front end to cn_vprintf(): append formatted text to @cn.
 * Returns 0 on success or the error reported by cn_vprintf().
 */
static __printf(2, 3) int cn_printf(struct core_name *cn, const char *fmt, ...)
{
	va_list ap;
	int err;

	va_start(ap, fmt);
	err = cn_vprintf(cn, fmt, ap);
	va_end(ap);

	return err;
}

119 120
/*
 * Append one formatted name component to @cn and neutralise anything in it
 * that could change the directory the core file ends up in:
 *
 *  - a component that is exactly "." or ".." gets its first '.' replaced
 *    by '!';
 *  - an empty component is replaced by "!" so that no "//" can be produced;
 *  - every '/' inside the component is replaced by '!'.
 *
 * Returns 0 on success or a negative error from cn_vprintf()/cn_printf().
 */
static __printf(2, 3)
int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
{
	const int start = cn->used;
	va_list ap;
	int err;
	int i;

	va_start(ap, fmt);
	err = cn_vprintf(cn, fmt, ap);
	va_end(ap);

	if (!err) {
		int len = cn->used - start;

		if (len == 0) {
			/* Enforce components of at least one character. */
			err = cn_printf(cn, "!");
		} else if (cn->corename[start] == '.' &&
			   (len == 1 ||
			    (len == 2 && cn->corename[start + 1] == '.'))) {
			/* The component must not be "." or "..". */
			cn->corename[start] = '!';
		}
	}

	/* Path separators are never allowed inside a component. */
	for (i = start; i < cn->used; i++) {
		if (cn->corename[i] == '/')
			cn->corename[i] = '!';
	}
	return err;
}

157
/*
 * Append the path of the current task's executable to @cn, escaped through
 * cn_esc_printf(). With @name_only set, only the part after the last '/'
 * is used.
 *
 * If the mm has no exe file, "<comm> (path unknown)" is appended instead.
 * Returns 0 on success or a negative errno.
 */
static int cn_print_exe_file(struct core_name *cn, bool name_only)
{
	struct file *exe;
	char *buf, *name;
	int err = -ENOMEM;

	exe = get_mm_exe_file(current->mm);
	if (exe == NULL)
		return cn_esc_printf(cn, "%s (path unknown)", current->comm);

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (buf == NULL)
		goto out_put;

	name = file_path(exe, buf, PATH_MAX);
	if (IS_ERR(name)) {
		err = PTR_ERR(name);
		goto out_free;
	}

	if (name_only) {
		char *slash = strrchr(name, '/');

		if (slash != NULL)
			name = slash + 1;
	}
	err = cn_esc_printf(cn, "%s", name);

out_free:
	kfree(buf);
out_put:
	fput(exe);
	return err;
}

/* format_corename will inspect the pattern parameter, and output a
 * name into corename, which must have space for at least
 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
 */
197 198
static int format_corename(struct core_name *cn, struct coredump_params *cprm,
			   size_t **argv, int *argc)
199 200 201 202
{
	const struct cred *cred = current_cred();
	const char *pat_ptr = core_pattern;
	int ispipe = (*pat_ptr == '|');
203
	bool was_space = false;
204 205 206
	int pid_in_pattern = 0;
	int err = 0;

207
	cn->used = 0;
208 209
	cn->corename = NULL;
	if (expand_corename(cn, core_name_size))
210
		return -ENOMEM;
211 212
	cn->corename[0] = '\0';

213 214 215 216 217 218
	if (ispipe) {
		int argvs = sizeof(core_pattern) / 2;
		(*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL);
		if (!(*argv))
			return -ENOMEM;
		(*argv)[(*argc)++] = 0;
219
		++pat_ptr;
220 221
		if (!(*pat_ptr))
			return -ENOMEM;
222
	}
223 224 225 226

	/* Repeat as long as we have more pattern to process and more output
	   space */
	while (*pat_ptr) {
227 228 229 230 231 232
		/*
		 * Split on spaces before doing template expansion so that
		 * %e and %E don't get split if they have spaces in them
		 */
		if (ispipe) {
			if (isspace(*pat_ptr)) {
233 234
				if (cn->used != 0)
					was_space = true;
235 236 237 238 239 240 241 242 243 244
				pat_ptr++;
				continue;
			} else if (was_space) {
				was_space = false;
				err = cn_printf(cn, "%c", '\0');
				if (err)
					return err;
				(*argv)[(*argc)++] = cn->used;
			}
		}
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
		if (*pat_ptr != '%') {
			err = cn_printf(cn, "%c", *pat_ptr++);
		} else {
			switch (*++pat_ptr) {
			/* single % at the end, drop that */
			case 0:
				goto out;
			/* Double percent, output one percent */
			case '%':
				err = cn_printf(cn, "%c", '%');
				break;
			/* pid */
			case 'p':
				pid_in_pattern = 1;
				err = cn_printf(cn, "%d",
					      task_tgid_vnr(current));
				break;
262 263 264 265 266
			/* global pid */
			case 'P':
				err = cn_printf(cn, "%d",
					      task_tgid_nr(current));
				break;
267 268 269 270 271 272 273 274
			case 'i':
				err = cn_printf(cn, "%d",
					      task_pid_vnr(current));
				break;
			case 'I':
				err = cn_printf(cn, "%d",
					      task_pid_nr(current));
				break;
275 276
			/* uid */
			case 'u':
277 278 279
				err = cn_printf(cn, "%u",
						from_kuid(&init_user_ns,
							  cred->uid));
280 281 282
				break;
			/* gid */
			case 'g':
283 284 285
				err = cn_printf(cn, "%u",
						from_kgid(&init_user_ns,
							  cred->gid));
286
				break;
287 288 289 290
			case 'd':
				err = cn_printf(cn, "%d",
					__get_dumpable(cprm->mm_flags));
				break;
291 292
			/* signal that caused the coredump */
			case 's':
293 294
				err = cn_printf(cn, "%d",
						cprm->siginfo->si_signo);
295 296 297
				break;
			/* UNIX time of coredump */
			case 't': {
298 299 300 301
				time64_t time;

				time = ktime_get_real_seconds();
				err = cn_printf(cn, "%lld", time);
302 303 304
				break;
			}
			/* hostname */
305
			case 'h':
306
				down_read(&uts_sem);
307
				err = cn_esc_printf(cn, "%s",
308 309 310
					      utsname()->nodename);
				up_read(&uts_sem);
				break;
311
			/* executable, could be changed by prctl PR_SET_NAME etc */
312 313
			case 'e':
				err = cn_esc_printf(cn, "%s", current->comm);
314
				break;
315 316 317 318
			/* file name of executable */
			case 'f':
				err = cn_print_exe_file(cn, true);
				break;
319
			case 'E':
320
				err = cn_print_exe_file(cn, false);
321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
				break;
			/* core limit size */
			case 'c':
				err = cn_printf(cn, "%lu",
					      rlimit(RLIMIT_CORE));
				break;
			default:
				break;
			}
			++pat_ptr;
		}

		if (err)
			return err;
	}

337
out:
338 339 340 341 342 343 344 345 346 347 348 349 350
	/* Backward compatibility with core_uses_pid:
	 *
	 * If core_pattern does not include a %p (as is the default)
	 * and core_uses_pid is set, then .%pid will be appended to
	 * the filename. Do not do this for piped commands. */
	if (!ispipe && !pid_in_pattern && core_uses_pid) {
		err = cn_printf(cn, ".%d", task_tgid_vnr(current));
		if (err)
			return err;
	}
	return ispipe;
}

351
static int zap_process(struct task_struct *start, int exit_code, int flags)
352 353 354 355
{
	struct task_struct *t;
	int nr = 0;

356 357
	/* ignore all signals except SIGKILL, see prepare_signal() */
	start->signal->flags = SIGNAL_GROUP_COREDUMP | flags;
358 359 360
	start->signal->group_exit_code = exit_code;
	start->signal->group_stop_count = 0;

361
	for_each_thread(start, t) {
362 363 364 365 366 367
		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
		if (t != current && t->mm) {
			sigaddset(&t->pending.signal, SIGKILL);
			signal_wake_up(t, 1);
			nr++;
		}
368
	}
369 370 371 372

	return nr;
}

373 374
static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
			struct core_state *core_state, int exit_code)
375 376 377 378 379 380 381 382
{
	struct task_struct *g, *p;
	unsigned long flags;
	int nr = -EAGAIN;

	spin_lock_irq(&tsk->sighand->siglock);
	if (!signal_group_exit(tsk->signal)) {
		mm->core_state = core_state;
383
		tsk->signal->group_exit_task = tsk;
384
		nr = zap_process(tsk, exit_code, 0);
385
		clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
386 387 388 389 390
	}
	spin_unlock_irq(&tsk->sighand->siglock);
	if (unlikely(nr < 0))
		return nr;

391
	tsk->flags |= PF_DUMPCORE;
392 393 394 395 396 397 398 399 400 401 402 403 404 405 406
	if (atomic_read(&mm->mm_users) == nr + 1)
		goto done;
	/*
	 * We should find and kill all tasks which use this mm, and we should
	 * count them correctly into ->nr_threads. We don't take tasklist
	 * lock, but this is safe wrt:
	 *
	 * fork:
	 *	None of sub-threads can fork after zap_process(leader). All
	 *	processes which were created before this point should be
	 *	visible to zap_threads() because copy_process() adds the new
	 *	process to the tail of init_task.tasks list, and lock/unlock
	 *	of ->siglock provides a memory barrier.
	 *
	 * do_exit:
407
	 *	The caller holds mm->mmap_lock. This means that the task which
408 409 410 411 412 413 414
	 *	uses this mm can't pass exit_mm(), so it can't exit or clear
	 *	its ->mm.
	 *
	 * de_thread:
	 *	It does list_replace_rcu(&leader->tasks, &current->tasks),
	 *	we must see either old or new leader, this does not matter.
	 *	However, it can change p->sighand, so lock_task_sighand(p)
415
	 *	must be used. Since p->mm != NULL and we hold ->mmap_lock
416 417 418 419 420 421 422 423 424 425 426 427 428 429
	 *	it can't fail.
	 *
	 *	Note also that "g" can be the old leader with ->mm == NULL
	 *	and already unhashed and thus removed from ->thread_group.
	 *	This is OK, __unhash_process()->list_del_rcu() does not
	 *	clear the ->next pointer, we will find the new leader via
	 *	next_thread().
	 */
	rcu_read_lock();
	for_each_process(g) {
		if (g == tsk->group_leader)
			continue;
		if (g->flags & PF_KTHREAD)
			continue;
430 431 432 433 434 435 436 437 438

		for_each_thread(g, p) {
			if (unlikely(!p->mm))
				continue;
			if (unlikely(p->mm == mm)) {
				lock_task_sighand(p, &flags);
				nr += zap_process(p, exit_code,
							SIGNAL_GROUP_EXIT);
				unlock_task_sighand(p, &flags);
439
			}
440 441
			break;
		}
442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458
	}
	rcu_read_unlock();
done:
	atomic_set(&core_state->nr_threads, nr);
	return nr;
}

static int coredump_wait(int exit_code, struct core_state *core_state)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	int core_waiters = -EBUSY;

	init_completion(&core_state->startup);
	core_state->dumper.task = tsk;
	core_state->dumper.next = NULL;

459
	if (mmap_write_lock_killable(mm))
460 461
		return -EINTR;

462 463
	if (!mm->core_state)
		core_waiters = zap_threads(tsk, mm, core_state, exit_code);
464
	mmap_write_unlock(mm);
465 466 467 468

	if (core_waiters > 0) {
		struct core_thread *ptr;

469
		freezer_do_not_count();
470
		wait_for_completion(&core_state->startup);
471
		freezer_count();
472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
		/*
		 * Wait for all the threads to become inactive, so that
		 * all the thread context (extended register state, like
		 * fpu etc) gets copied to the memory.
		 */
		ptr = core_state->dumper.next;
		while (ptr != NULL) {
			wait_task_inactive(ptr->task, 0);
			ptr = ptr->next;
		}
	}

	return core_waiters;
}

487
/*
 * Undo coredump_wait(): mark the group as exiting, release every thread
 * queued on mm->core_state and detach the core_state from @mm.
 *
 * If @core_dumped is set and no fatal signal is pending, bit 0x80 is OR-ed
 * into the group exit code.
 */
static void coredump_finish(struct mm_struct *mm, bool core_dumped)
{
	struct signal_struct *sig = current->signal;
	struct core_thread *curr, *next;
	struct task_struct *task;

	spin_lock_irq(&current->sighand->siglock);
	if (core_dumped && !__fatal_signal_pending(current))
		sig->group_exit_code |= 0x80;
	sig->group_exit_task = NULL;
	sig->flags = SIGNAL_GROUP_EXIT;
	spin_unlock_irq(&current->sighand->siglock);

	for (curr = mm->core_state->dumper.next; curr != NULL; curr = next) {
		/* Fetch everything we need before letting the thread go. */
		next = curr->next;
		task = curr->task;
		/*
		 * see exit_mm(), curr->task must not see
		 * ->task == NULL before we read ->next.
		 */
		smp_mb();
		curr->task = NULL;
		wake_up_process(task);
	}

	mm->core_state = NULL;
}

515 516 517 518 519 520 521 522 523 524 525
/*
 * Tell whether the dump in progress should be abandoned, i.e. whether the
 * current task has a signal pending.
 */
static bool dump_interrupted(void)
{
	/*
	 * SIGKILL or freezing() interrupt the coredumping. Perhaps we
	 * can do try_to_freeze() and check __fatal_signal_pending(),
	 * but then we need to teach dump_write() to restart and clear
	 * TIF_SIGPENDING.
	 */
	return !!signal_pending(current);
}

526 527
static void wait_for_dump_helpers(struct file *file)
{
528
	struct pipe_inode_info *pipe = file->private_data;
529 530 531 532

	pipe_lock(pipe);
	pipe->readers++;
	pipe->writers--;
533
	wake_up_interruptible_sync(&pipe->rd_wait);
534 535
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	pipe_unlock(pipe);
536

537 538 539 540
	/*
	 * We actually want wait_event_freezable() but then we need
	 * to clear TIF_SIGPENDING and improve dump_interrupted().
	 */
541
	wait_event_interruptible(pipe->rd_wait, pipe->readers == 1);
542

543
	pipe_lock(pipe);
544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569
	pipe->readers--;
	pipe->writers++;
	pipe_unlock(pipe);
}

/*
 * umh_pipe_setup
 * helper function to customize the process used
 * to collect the core in userspace.  Specifically
 * it sets up a pipe and installs it as fd 0 (stdin)
 * for the process.  Returns 0 on success, or
 * PTR_ERR on failure.
 * Note that it also sets the core limit to 1.  This
 * is a special value that we use to trap recursive
 * core dumps
 */
static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
{
	struct file *files[2];
	struct coredump_params *cp = (struct coredump_params *)info->data;
	int err = create_pipe_files(files, 0);
	if (err)
		return err;

	cp->file = files[1];

A
Al Viro 已提交
570 571
	err = replace_fd(0, files[0], 0);
	fput(files[0]);
572 573 574
	/* and disallow core files too */
	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};

A
Al Viro 已提交
575
	return err;
576 577
}

578
void do_coredump(const kernel_siginfo_t *siginfo)
579 580 581 582 583 584 585 586 587
{
	struct core_state core_state;
	struct core_name cn;
	struct mm_struct *mm = current->mm;
	struct linux_binfmt * binfmt;
	const struct cred *old_cred;
	struct cred *cred;
	int retval = 0;
	int ispipe;
588 589
	size_t *argv = NULL;
	int argc = 0;
590
	struct files_struct *displaced;
591 592
	/* require nonrelative corefile path and be extra careful */
	bool need_suid_safe = false;
593
	bool core_dumped = false;
594 595
	static atomic_t core_dump_count = ATOMIC_INIT(0);
	struct coredump_params cprm = {
596
		.siginfo = siginfo,
597
		.regs = signal_pt_regs(),
598 599 600 601 602 603 604 605 606
		.limit = rlimit(RLIMIT_CORE),
		/*
		 * We must use the same mm->flags while dumping core to avoid
		 * inconsistency of bit flags, since this flag is not protected
		 * by any locks.
		 */
		.mm_flags = mm->flags,
	};

607
	audit_core_dumps(siginfo->si_signo);
608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623

	binfmt = mm->binfmt;
	if (!binfmt || !binfmt->core_dump)
		goto fail;
	if (!__get_dumpable(cprm.mm_flags))
		goto fail;

	cred = prepare_creds();
	if (!cred)
		goto fail;
	/*
	 * We cannot trust fsuid as being the "true" uid of the process
	 * nor do we know its entire history. We only know it was tainted
	 * so we dump it as root in mode 2, and only into a controlled
	 * environment (pipe handler or fully qualified path).
	 */
624
	if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
625 626
		/* Setuid core dump mode */
		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */
627
		need_suid_safe = true;
628 629
	}

630
	retval = coredump_wait(siginfo->si_signo, &core_state);
631 632 633 634 635
	if (retval < 0)
		goto fail_creds;

	old_cred = override_creds(cred);

636
	ispipe = format_corename(&cn, &cprm, &argv, &argc);
637

638
	if (ispipe) {
639
		int argi;
640 641
		int dump_count;
		char **helper_argv;
642
		struct subprocess_info *sub_info;
643 644 645 646

		if (ispipe < 0) {
			printk(KERN_WARNING "format_corename failed\n");
			printk(KERN_WARNING "Aborting core\n");
647
			goto fail_unlock;
648 649 650 651 652 653 654
		}

		if (cprm.limit == 1) {
			/* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
			 *
			 * Normally core limits are irrelevant to pipes, since
			 * we're not writing to the file system, but we use
B
Bastien Nocera 已提交
655
			 * cprm.limit of 1 here as a special value, this is a
656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681
			 * consistent way to catch recursive crashes.
			 * We can still crash if the core_pattern binary sets
			 * RLIM_CORE = !1, but it runs as root, and can do
			 * lots of stupid things.
			 *
			 * Note that we use task_tgid_vnr here to grab the pid
			 * of the process group leader.  That way we get the
			 * right pid if a thread in a multi-threaded
			 * core_pattern process dies.
			 */
			printk(KERN_WARNING
				"Process %d(%s) has RLIMIT_CORE set to 1\n",
				task_tgid_vnr(current), current->comm);
			printk(KERN_WARNING "Aborting core\n");
			goto fail_unlock;
		}
		cprm.limit = RLIM_INFINITY;

		dump_count = atomic_inc_return(&core_dump_count);
		if (core_pipe_limit && (core_pipe_limit < dump_count)) {
			printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
			       task_tgid_vnr(current), current->comm);
			printk(KERN_WARNING "Skipping core dump\n");
			goto fail_dropcount;
		}

682 683
		helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
					    GFP_KERNEL);
684 685 686 687 688
		if (!helper_argv) {
			printk(KERN_WARNING "%s failed to allocate memory\n",
			       __func__);
			goto fail_dropcount;
		}
689 690 691
		for (argi = 0; argi < argc; argi++)
			helper_argv[argi] = cn.corename + argv[argi];
		helper_argv[argi] = NULL;
692

693 694 695 696 697 698 699 700
		retval = -ENOMEM;
		sub_info = call_usermodehelper_setup(helper_argv[0],
						helper_argv, NULL, GFP_KERNEL,
						umh_pipe_setup, NULL, &cprm);
		if (sub_info)
			retval = call_usermodehelper_exec(sub_info,
							  UMH_WAIT_EXEC);

701
		kfree(helper_argv);
702
		if (retval) {
703
			printk(KERN_INFO "Core dump to |%s pipe failed\n",
704 705
			       cn.corename);
			goto close_fail;
706
		}
707 708
	} else {
		struct inode *inode;
709 710
		int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
				 O_LARGEFILE | O_EXCL;
711 712 713 714

		if (cprm.limit < binfmt->min_coredump)
			goto fail_unlock;

715
		if (need_suid_safe && cn.corename[0] != '/') {
716 717 718 719 720 721 722
			printk(KERN_WARNING "Pid %d(%s) can only dump core "\
				"to fully qualified path!\n",
				task_tgid_vnr(current), current->comm);
			printk(KERN_WARNING "Skipping core dump\n");
			goto fail_unlock;
		}

723 724 725 726 727 728 729 730 731 732
		/*
		 * Unlink the file if it exists unless this is a SUID
		 * binary - in that case, we're running around with root
		 * privs and don't want to unlink another user's coredump.
		 */
		if (!need_suid_safe) {
			/*
			 * If it doesn't exist, that's fine. If there's some
			 * other problem, we'll catch it at the filp_open().
			 */
733
			do_unlinkat(AT_FDCWD, getname_kernel(cn.corename));
734 735 736 737 738 739 740 741 742 743
		}

		/*
		 * There is a race between unlinking and creating the
		 * file, but if that causes an EEXIST here, that's
		 * fine - another process raced with us while creating
		 * the corefile, and the other process won. To userspace,
		 * what matters is that at least one of the two processes
		 * writes its coredump successfully, not which one.
		 */
744 745 746 747 748 749 750 751 752 753 754 755 756 757 758
		if (need_suid_safe) {
			/*
			 * Using user namespaces, normal user tasks can change
			 * their current->fs->root to point to arbitrary
			 * directories. Since the intention of the "only dump
			 * with a fully qualified path" rule is to control where
			 * coredumps may be placed using root privileges,
			 * current->fs->root must not be used. Instead, use the
			 * root directory of init_task.
			 */
			struct path root;

			task_lock(&init_task);
			get_fs_root(init_task.fs, &root);
			task_unlock(&init_task);
A
Al Viro 已提交
759 760
			cprm.file = file_open_root(&root, cn.corename,
						   open_flags, 0600);
761 762 763 764
			path_put(&root);
		} else {
			cprm.file = filp_open(cn.corename, open_flags, 0600);
		}
765 766 767
		if (IS_ERR(cprm.file))
			goto fail_unlock;

A
Al Viro 已提交
768
		inode = file_inode(cprm.file);
769 770 771 772 773 774 775 776 777 778 779
		if (inode->i_nlink > 1)
			goto close_fail;
		if (d_unhashed(cprm.file->f_path.dentry))
			goto close_fail;
		/*
		 * AK: actually i see no reason to not allow this for named
		 * pipes etc, but keep the previous behaviour for now.
		 */
		if (!S_ISREG(inode->i_mode))
			goto close_fail;
		/*
780 781 782 783
		 * Don't dump core if the filesystem changed owner or mode
		 * of the file during file creation. This is an issue when
		 * a process dumps core while its cwd is e.g. on a vfat
		 * filesystem.
784 785 786
		 */
		if (!uid_eq(inode->i_uid, current_fsuid()))
			goto close_fail;
787 788
		if ((inode->i_mode & 0677) != 0600)
			goto close_fail;
A
Al Viro 已提交
789
		if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
790 791 792 793 794 795 796 797 798 799 800
			goto close_fail;
		if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
			goto close_fail;
	}

	/* get us an unshared descriptor table; almost always a no-op */
	retval = unshare_files(&displaced);
	if (retval)
		goto close_fail;
	if (displaced)
		put_files_struct(displaced);
801
	if (!dump_interrupted()) {
802 803 804 805 806 807 808 809
		/*
		 * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
		 * have this set to NULL.
		 */
		if (!cprm.file) {
			pr_info("Core dump to |%s disabled\n", cn.corename);
			goto close_fail;
		}
810 811 812 813
		file_start_write(cprm.file);
		core_dumped = binfmt->core_dump(&cprm);
		file_end_write(cprm.file);
	}
814 815 816 817 818 819 820 821 822
	if (ispipe && core_pipe_limit)
		wait_for_dump_helpers(cprm.file);
close_fail:
	if (cprm.file)
		filp_close(cprm.file, NULL);
fail_dropcount:
	if (ispipe)
		atomic_dec(&core_dump_count);
fail_unlock:
823
	kfree(argv);
824
	kfree(cn.corename);
825
	coredump_finish(mm, core_dumped);
826 827 828 829 830 831 832 833 834 835 836 837
	revert_creds(old_cred);
fail_creds:
	put_cred(cred);
fail:
	return;
}

/*
 * Core dumping helper functions.  These are the only things you should
 * do on a core-file: use only these functions to write out all the
 * necessary info.
 */
A
Al Viro 已提交
838 839 840
int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
{
	struct file *file = cprm->file;
841 842
	loff_t pos = file->f_pos;
	ssize_t n;
843
	if (cprm->written + nr > cprm->limit)
A
Al Viro 已提交
844
		return 0;
845 846 847 848 849 850 851 852 853 854 855


	if (dump_interrupted())
		return 0;
	n = __kernel_write(file, addr, nr, &pos);
	if (n != nr)
		return 0;
	file->f_pos = pos;
	cprm->written += n;
	cprm->pos += n;

A
Al Viro 已提交
856 857 858 859
	return 1;
}
EXPORT_SYMBOL(dump_emit);

860
/*
 * Advance the core file by @nr bytes without providing data for them.
 *
 * A file with a usable ->llseek is simply seeked forward; otherwise the gap
 * is filled by writing zeroes, one page at a time.
 *
 * Returns 1 on success and 0 on failure or interruption.
 */
int dump_skip(struct coredump_params *cprm, size_t nr)
{
	static char zeroes[PAGE_SIZE];
	struct file *file = cprm->file;
	bool seekable = file->f_op->llseek && file->f_op->llseek != no_llseek;

	if (!seekable) {
		/* Emit full pages first, then whatever is left over. */
		for (; nr > PAGE_SIZE; nr -= PAGE_SIZE) {
			if (!dump_emit(cprm, zeroes, PAGE_SIZE))
				return 0;
		}
		return dump_emit(cprm, zeroes, nr);
	}

	if (dump_interrupted() ||
	    file->f_op->llseek(file, nr, SEEK_CUR) < 0)
		return 0;

	cprm->pos += nr;
	return 1;
}
879
EXPORT_SYMBOL(dump_skip);
A
Al Viro 已提交
880

881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901
#ifdef CONFIG_ELF_CORE
int dump_user_range(struct coredump_params *cprm, unsigned long start,
		    unsigned long len)
{
	unsigned long addr;

	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
		struct page *page;
		int stop;

		/*
		 * To avoid having to allocate page tables for virtual address
		 * ranges that have never been used yet, and also to make it
		 * easy to generate sparse core files, use a helper that returns
		 * NULL when encountering an empty page table entry that would
		 * otherwise have been filled with the zero page.
		 */
		page = get_dump_page(addr);
		if (page) {
			void *kaddr = kmap(page);

902
			current->flags |= PF_COREDUMP_MCS;
903
			stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
904
			current->flags &= ~PF_COREDUMP_MCS;
905 906 907 908 909 910 911 912 913 914 915 916
			kunmap(page);
			put_page(page);
		} else {
			stop = !dump_skip(cprm, PAGE_SIZE);
		}
		if (stop)
			return 0;
	}
	return 1;
}
#endif

A
Al Viro 已提交
917 918
int dump_align(struct coredump_params *cprm, int align)
{
919
	unsigned mod = cprm->pos & (align - 1);
A
Al Viro 已提交
920
	if (align & (align - 1))
A
Al Viro 已提交
921 922
		return 0;
	return mod ? dump_skip(cprm, align - mod) : 1;
A
Al Viro 已提交
923 924
}
EXPORT_SYMBOL(dump_align);
925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942

/*
 * Ensures that the file size is big enough to contain the current file
 * position. This prevents gdb from complaining about a truncated file
 * if the last "write" to the file was dump_skip.
 *
 * Files without a usable ->llseek are left alone.
 */
void dump_truncate(struct coredump_params *cprm)
{
	struct file *file = cprm->file;
	loff_t offset;

	if (!file->f_op->llseek || file->f_op->llseek == no_llseek)
		return;

	/* Seeking by 0 from the current position just reports it. */
	offset = file->f_op->llseek(file, 0, SEEK_CUR);
	if (i_size_read(file->f_mapping->host) < offset)
		do_truncate(file->f_path.dentry, offset, 0, file);
}
EXPORT_SYMBOL(dump_truncate);
943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974

/*
 * The purpose of always_dump_vma() is to make sure that special kernel mappings
 * that are useful for post-mortem analysis are included in every core dump.
 * In that way we ensure that the core dump is fully interpretable later
 * without matching up the same kernel and hardware config to see what PC values
 * meant. These special mappings include - vDSO, vsyscall, and other
 * architecture specific mappings
 */
static bool always_dump_vma(struct vm_area_struct *vma)
{
	/* Any vsyscall mappings? */
	if (vma == get_gate_vma(vma->vm_mm))
		return true;

	/*
	 * Assume that all vmas with a .name op should always be dumped.
	 * If this changes, a new vm_ops field can easily be added.
	 */
	if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
		return true;

	/*
	 * arch_vma_name() returns non-NULL for special architecture mappings,
	 * such as vDSO sections.
	 */
	if (arch_vma_name(vma))
		return true;

	return false;
}

975 976
#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1

977 978 979
/*
 * Decide how much of @vma's contents should be included in a core dump.
 */
980 981
static unsigned long vma_dump_size(struct vm_area_struct *vma,
				   unsigned long mm_flags)
982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035
{
#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))

	/* always dump the vdso and vsyscall sections */
	if (always_dump_vma(vma))
		goto whole;

	if (vma->vm_flags & VM_DONTDUMP)
		return 0;

	/* support for DAX */
	if (vma_is_dax(vma)) {
		if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
			goto whole;
		if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
			goto whole;
		return 0;
	}

	/* Hugetlb memory check */
	if (is_vm_hugetlb_page(vma)) {
		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
			goto whole;
		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
			goto whole;
		return 0;
	}

	/* Do not dump I/O mapped devices or special mappings */
	if (vma->vm_flags & VM_IO)
		return 0;

	/* By default, dump shared memory if mapped from an anonymous file. */
	if (vma->vm_flags & VM_SHARED) {
		if (file_inode(vma->vm_file)->i_nlink == 0 ?
		    FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
			goto whole;
		return 0;
	}

	/* Dump segments that have been written to.  */
	if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE))
		goto whole;
	if (vma->vm_file == NULL)
		return 0;

	if (FILTER(MAPPED_PRIVATE))
		goto whole;

	/*
	 * If this is the beginning of an executable file mapping,
	 * dump the first page to aid in determining what was mapped here.
	 */
	if (FILTER(ELF_HEADERS) &&
1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049
	    vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
		if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
			return PAGE_SIZE;

		/*
		 * ELF libraries aren't always executable.
		 * We'll want to check whether the mapping starts with the ELF
		 * magic, but not now - we're holding the mmap lock,
		 * so copy_from_user() doesn't work here.
		 * Use a placeholder instead, and fix it up later in
		 * dump_vma_snapshot().
		 */
		return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER;
	}
1050 1051 1052 1053 1054 1055 1056 1057

#undef	FILTER

	return 0;

whole:
	return vma->vm_end - vma->vm_start;
}
1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127

/*
 * Start of a vma walk that also covers the gate vma: the first vma of
 * @tsk's mm, or @gate_vma if the mm has no vmas at all.
 */
static struct vm_area_struct *first_vma(struct task_struct *tsk,
					struct vm_area_struct *gate_vma)
{
	struct vm_area_struct *head = tsk->mm->mmap;

	return head ? head : gate_vma;
}

/*
 * Step function for a walk over a vma list. Once the regular list is
 * exhausted, `gate_vma' is returned as the final element; after that (or if
 * there is no gate vma) the walk ends with NULL.
 */
static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
				       struct vm_area_struct *gate_vma)
{
	struct vm_area_struct *succ = this_vma->vm_next;

	/* End of the list: visit the gate vma unless we just did. */
	if (!succ && this_vma != gate_vma)
		succ = gate_vma;

	return succ;
}

/*
 * Under the mmap_lock, take a snapshot of relevant information about the
 * task's VMAs.
 *
 * On success, *@vma_count holds the number of entries, *@vma_meta points to
 * a kvmalloc'ed array describing them (to be released with kvfree() by the
 * caller) and *@vma_data_size_ptr is the sum of all dump sizes.
 *
 * Returns 0 on success, -EINTR if the lock could not be taken, -ENOMEM if
 * the array could not be allocated, or -EFAULT if the number of VMAs found
 * does not match the expected count (the array is freed in that case).
 */
int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
		      struct core_vma_metadata **vma_meta,
		      size_t *vma_data_size_ptr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *gate_vma;
	struct core_vma_metadata *meta;
	size_t total = 0;
	int i;

	/*
	 * Once the stack expansion code is fixed to not change VMA bounds
	 * under mmap_lock in read mode, this can be changed to take the
	 * mmap_lock in read mode.
	 */
	if (mmap_write_lock_killable(mm))
		return -EINTR;

	gate_vma = get_gate_vma(mm);
	*vma_count = mm->map_count + (gate_vma ? 1 : 0);

	*vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL);
	if (!*vma_meta) {
		mmap_write_unlock(mm);
		return -ENOMEM;
	}

	/* First pass, under the lock: record what each VMA looks like. */
	i = 0;
	vma = first_vma(current, gate_vma);
	while (vma != NULL) {
		meta = (*vma_meta) + i;

		meta->start = vma->vm_start;
		meta->end = vma->vm_end;
		meta->flags = vma->vm_flags;
		meta->dump_size = vma_dump_size(vma, cprm->mm_flags);

		vma = next_vma(vma, gate_vma);
		i++;
	}

	mmap_write_unlock(mm);

	if (WARN_ON(i != *vma_count)) {
		kvfree(*vma_meta);
		return -EFAULT;
	}

	/*
	 * Second pass, without the lock: resolve the placeholders left by
	 * vma_dump_size() by checking for the ELF magic, and add up the
	 * total amount of data.
	 */
	for (i = 0; i < *vma_count; i++) {
		meta = (*vma_meta) + i;

		if (meta->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) {
			char elfmag[SELFMAG];
			bool is_elf;

			is_elf = !copy_from_user(elfmag,
						 (void __user *)meta->start,
						 SELFMAG) &&
				 memcmp(elfmag, ELFMAG, SELFMAG) == 0;
			meta->dump_size = is_elf ? PAGE_SIZE : 0;
		}

		total += meta->dump_size;
	}

	*vma_data_size_ptr = total;
	return 0;
}