process_64.c 21.0 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

A
Ashok Raj 已提交
19
#include <linux/cpu.h>
L
Linus Torvalds 已提交
20 21
#include <linux/errno.h>
#include <linux/sched.h>
22
#include <linux/fs.h>
L
Linus Torvalds 已提交
23 24 25 26 27 28 29
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
30
#include <linux/utsname.h>
L
Linus Torvalds 已提交
31
#include <linux/delay.h>
32
#include <linux/module.h>
L
Linus Torvalds 已提交
33 34
#include <linux/ptrace.h>
#include <linux/random.h>
A
Andi Kleen 已提交
35
#include <linux/notifier.h>
36
#include <linux/kprobes.h>
37
#include <linux/kdebug.h>
38
#include <linux/tick.h>
39
#include <linux/prctl.h>
40 41
#include <linux/uaccess.h>
#include <linux/io.h>
L
Linus Torvalds 已提交
42 43 44 45 46 47 48 49 50 51 52

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
A
Andi Kleen 已提交
53
#include <asm/idle.h>
54
#include <asm/syscalls.h>
L
Linus Torvalds 已提交
55 56 57 58 59

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

60
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
A
Andi Kleen 已提交
61 62 63

void idle_notifier_register(struct notifier_block *n)
{
64
	atomic_notifier_chain_register(&idle_notifier, n);
A
Andi Kleen 已提交
65 66 67 68
}

void enter_idle(void)
{
A
Andi Kleen 已提交
69
	write_pda(isidle, 1);
70
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
A
Andi Kleen 已提交
71 72 73 74
}

static void __exit_idle(void)
{
A
Andi Kleen 已提交
75
	if (test_and_clear_bit_pda(0, isidle) == 0)
A
Andi Kleen 已提交
76
		return;
77
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
A
Andi Kleen 已提交
78 79 80 81 82
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
A
Andi Kleen 已提交
83 84
	/* idle loop has pid 0 */
	if (current->pid)
A
Andi Kleen 已提交
85 86 87 88
		return;
	__exit_idle();
}

A
Ashok Raj 已提交
89 90 91
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

92
#include <linux/nmi.h>
93
/* We halt the CPU with physical CPU hotplug */
A
Ashok Raj 已提交
94 95 96
static inline void play_dead(void)
{
	idle_task_exit();
97 98
	c1e_remove_cpu(raw_smp_processor_id());

A
Ashok Raj 已提交
99 100 101 102
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

103
	local_irq_disable();
104 105
	/* mask all interrupts, flush any and all caches, and halt */
	wbinvd_halt();
A
Ashok Raj 已提交
106 107 108 109 110 111 112 113
}
#else
/*
 * Without CONFIG_HOTPLUG_CPU a CPU is never expected to go offline, so
 * reaching this stub is treated as a kernel bug.
 */
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

L
Linus Torvalds 已提交
114 115 116 117 118 119
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
P
Pavel Machek 已提交
120
void cpu_idle(void)
L
Linus Torvalds 已提交
121
{
122
	current_thread_info()->status |= TS_POLLING;
L
Linus Torvalds 已提交
123 124
	/* endless idle loop with no priority at all */
	while (1) {
125
		tick_nohz_stop_sched_tick(1);
L
Linus Torvalds 已提交
126 127 128
		while (!need_resched()) {

			rmb();
T
Thomas Gleixner 已提交
129

A
Ashok Raj 已提交
130 131
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
132 133 134 135 136 137
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
A
Andi Kleen 已提交
138
			enter_idle();
139 140
			/* Don't trace irqs off for idle */
			stop_critical_timings();
T
Thomas Gleixner 已提交
141
			pm_idle();
142
			start_critical_timings();
A
Andi Kleen 已提交
143 144 145
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
A
Andi Kleen 已提交
146
			__exit_idle();
L
Linus Torvalds 已提交
147 148
		}

149
		tick_nohz_restart_sched_tick();
150
		preempt_enable_no_resched();
L
Linus Torvalds 已提交
151
		schedule();
152
		preempt_disable();
L
Linus Torvalds 已提交
153 154 155
	}
}

156
/* Prints also some state that isn't saved in the pt_regs */
157
void __show_regs(struct pt_regs *regs)
L
Linus Torvalds 已提交
158 159
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
160
	unsigned long d0, d1, d2, d3, d6, d7;
161 162
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
L
Linus Torvalds 已提交
163 164 165

	printk("\n");
	print_modules();
166
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
167
		current->pid, current->comm, print_tainted(),
168 169 170
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
171
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
172
	printk_address(regs->ip, 1);
173 174 175
	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
176
	       regs->ax, regs->bx, regs->cx);
177
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
178
	       regs->dx, regs->si, regs->di);
179
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
180
	       regs->bp, regs->r8, regs->r9);
181
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
182
	       regs->r10, regs->r11, regs->r12);
183
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
184
	       regs->r13, regs->r14, regs->r15);
L
Linus Torvalds 已提交
185

186 187 188
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
L
Linus Torvalds 已提交
189 190 191 192
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
193 194
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
L
Linus Torvalds 已提交
195

196 197 198 199
	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();
L
Linus Torvalds 已提交
200

201
	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
202
	       fs, fsindex, gs, gsindex, shadowgs);
203 204 205 206
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);
207 208 209 210

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
211
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
212 213 214
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
215
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
L
Linus Torvalds 已提交
216 217 218 219
}

void show_regs(struct pt_regs *regs)
{
220
	printk(KERN_INFO "CPU %d:", smp_processor_id());
L
Linus Torvalds 已提交
221
	__show_regs(regs);
222
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
L
Linus Torvalds 已提交
223 224 225 226 227 228 229 230 231
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;
232

233
	if (me->thread.io_bitmap_ptr) {
L
Linus Torvalds 已提交
234 235 236 237
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
238
		clear_thread_flag(TIF_IO_BITMAP);
L
Linus Torvalds 已提交
239 240 241 242 243 244 245
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
M
Markus Metzger 已提交
246 247 248 249 250 251 252 253
#ifdef CONFIG_X86_DS
	/* Free any DS contexts that have not been properly released. */
	if (unlikely(t->ds_ctx)) {
		/* we clear debugctl to make sure DS is not used. */
		update_debugctlmsr(0);
		ds_free(t->ds_ctx);
	}
#endif /* CONFIG_X86_DS */
L
Linus Torvalds 已提交
254 255 256 257 258 259
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

260 261 262 263 264 265
	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
266
			current_thread_info()->status |= TS_COMPAT;
267
		}
268
	}
269
	clear_tsk_thread_flag(tsk, TIF_DEBUG);
L
Linus Torvalds 已提交
270 271 272 273 274 275 276

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
277
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
L
Linus Torvalds 已提交
278 279 280
	/*
	 * Forget coprocessor state..
	 */
281
	tsk->fpu_counter = 0;
L
Linus Torvalds 已提交
282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

/*
 * Fill TLS slot @tls of task @t with a 32-bit, page-granular descriptor
 * whose base is @addr and whose limit is 0xfffff pages.
 */
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
R
Roland McGrath 已提交
315
	return get_desc_base(&t->thread.tls_array[tls]);
L
Linus Torvalds 已提交
316 317 318 319 320 321 322 323 324 325 326
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 *
 * The only work done here is unlazy_fpu() on @tsk.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

327
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
L
Linus Torvalds 已提交
328
		unsigned long unused,
329
	struct task_struct *p, struct pt_regs *regs)
L
Linus Torvalds 已提交
330 331
{
	int err;
332
	struct pt_regs *childregs;
L
Linus Torvalds 已提交
333 334
	struct task_struct *me = current;

335
	childregs = ((struct pt_regs *)
A
Al Viro 已提交
336
			(THREAD_SIZE + task_stack_page(p))) - 1;
L
Linus Torvalds 已提交
337 338
	*childregs = *regs;

339 340 341 342
	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;
L
Linus Torvalds 已提交
343

344 345 346
	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;
L
Linus Torvalds 已提交
347

A
Al Viro 已提交
348
	set_tsk_thread_flag(p, TIF_FORK);
L
Linus Torvalds 已提交
349 350 351 352

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

353 354 355 356
	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
L
Linus Torvalds 已提交
357

358
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
L
Linus Torvalds 已提交
359 360 361 362 363
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
364 365
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
366
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
367
	}
L
Linus Torvalds 已提交
368 369 370 371 372 373 374

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
R
Roland McGrath 已提交
375
			err = do_set_thread_area(p, -1,
376
				(struct user_desc __user *)childregs->si, 0);
377 378 379 380
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
L
Linus Torvalds 已提交
381 382 383 384 385 386 387 388 389 390 391
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

I
Ingo Molnar 已提交
392 393 394
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
395 396 397
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
I
Ingo Molnar 已提交
398 399 400 401 402 403 404 405
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	regs->flags		= 0x200;
	set_fs(USER_DS);
406 407 408 409
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
I
Ingo Molnar 已提交
410 411 412
}
EXPORT_SYMBOL_GPL(start_thread);

413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434
/* Set the TSD bit in CR4 on the current CPU. */
static void hard_disable_TSC(void)
{
	unsigned long cr4 = read_cr4();

	write_cr4(cr4 | X86_CR4_TSD);
}

/*
 * Set TIF_NOTSC on the current thread; if it was previously clear, also
 * set CR4.TSD.  Preemption is disabled around both steps.
 */
void disable_TSC(void)
{
	preempt_disable();
	if (test_and_set_thread_flag(TIF_NOTSC) == 0) {
		/*
		 * The flag just changed, so the CPU state has to change
		 * with it, in this same running context.
		 */
		hard_disable_TSC();
	}
	preempt_enable();
}

/* Clear the TSD bit in CR4 on the current CPU. */
static void hard_enable_TSC(void)
{
	unsigned long cr4 = read_cr4();

	write_cr4(cr4 & ~X86_CR4_TSD);
}

I
Ingo Molnar 已提交
435
static void enable_TSC(void)
436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

/*
 * Store the current thread's TSC mode at user address @adr:
 * PR_TSC_SIGSEGV when TIF_NOTSC is set, PR_TSC_ENABLE otherwise.
 * Returns the result of put_user().
 */
int get_tsc_mode(unsigned long adr)
{
	unsigned int val = test_thread_flag(TIF_NOTSC) ? PR_TSC_SIGSEGV
						       : PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

/*
 * Select the current thread's TSC mode.
 * Returns 0 on success, -EINVAL for any @val other than PR_TSC_SIGSEGV
 * or PR_TSC_ENABLE.
 */
int set_tsc_mode(unsigned int val)
{
	switch (val) {
	case PR_TSC_SIGSEGV:
		disable_TSC();
		return 0;
	case PR_TSC_ENABLE:
		enable_TSC();
		return 0;
	default:
		return -EINVAL;
	}
}

L
Linus Torvalds 已提交
471 472 473
/*
 * This special macro can be used to load a debugging register:
 * it writes (thread)->debugreg<r> into hardware debug register <r>.
 * @r must be a literal register number because it is token-pasted.
 */
#define loaddebug(thread, r) set_debugreg((thread)->debugreg ## r, r)

476
static inline void __switch_to_xtra(struct task_struct *prev_p,
477 478
				    struct task_struct *next_p,
				    struct tss_struct *tss)
479 480
{
	struct thread_struct *prev, *next;
481
	unsigned long debugctl;
482 483 484 485

	prev = &prev_p->thread,
	next = &next_p->thread;

486
	debugctl = prev->debugctlmsr;
M
Markus Metzger 已提交
487 488

#ifdef CONFIG_X86_DS
I
Ingo Molnar 已提交
489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505
	{
		unsigned long ds_prev = 0, ds_next = 0;

		if (prev->ds_ctx)
			ds_prev = (unsigned long)prev->ds_ctx->ds;
		if (next->ds_ctx)
			ds_next = (unsigned long)next->ds_ctx->ds;

		if (ds_next != ds_prev) {
			/*
			 * We clear debugctl to make sure DS
			 * is not in use when we change it:
			 */
			debugctl = 0;
			update_debugctlmsr(0);
			wrmsrl(MSR_IA32_DS_AREA, ds_next);
		}
506
	}
M
Markus Metzger 已提交
507
#endif /* CONFIG_X86_DS */
508 509

	if (next->debugctlmsr != debugctl)
510
		update_debugctlmsr(next->debugctlmsr);
R
Roland McGrath 已提交
511

512 513 514 515 516 517 518 519 520 521
	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

522 523 524 525 526 527 528 529 530
	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

531 532 533 534 535 536 537 538 539 540 541 542 543
	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
544

M
Markus Metzger 已提交
545
#ifdef CONFIG_X86_PTRACE_BTS
546 547 548 549 550
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
M
Markus Metzger 已提交
551
#endif /* CONFIG_X86_PTRACE_BTS */
552 553
}

L
Linus Torvalds 已提交
554 555 556
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
557
 * This could still be optimized:
L
Linus Torvalds 已提交
558 559
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
560 561
 *
 * Kprobes not supported here. Set the probe on schedule instead.
L
Linus Torvalds 已提交
562
 */
563
struct task_struct *
564
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
L
Linus Torvalds 已提交
565
{
566 567
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
568
	int cpu = smp_processor_id();
L
Linus Torvalds 已提交
569
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
570
	unsigned fsindex, gsindex;
L
Linus Torvalds 已提交
571

572
	/* we're going to use this soon, after a few expensive things */
573
	if (next_p->fpu_counter > 5)
574
		prefetch(next->xstate);
575

L
Linus Torvalds 已提交
576 577 578
	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
579
	load_sp0(tss, next);
L
Linus Torvalds 已提交
580

581
	/*
L
Linus Torvalds 已提交
582 583 584
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
585
	savesegment(es, prev->es);
L
Linus Torvalds 已提交
586
	if (unlikely(next->es | prev->es))
587
		loadsegment(es, next->es);
588 589

	savesegment(ds, prev->ds);
L
Linus Torvalds 已提交
590 591 592
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

593 594 595 596 597 598 599 600 601

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

L
Linus Torvalds 已提交
602 603
	load_TLS(next, cpu);

604 605 606 607 608 609 610 611 612
	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

613
	/*
L
Linus Torvalds 已提交
614
	 * Switch FS and GS.
615 616 617 618
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
L
Linus Torvalds 已提交
619
	 */
620 621
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
622
		/*
623 624 625 626 627
		 * Check if the user used a selector != 0; if yes
		 *  clear 64bit base, since overloaded base is always
		 *  mapped to the Null selector
		 */
		if (fsindex)
628
			prev->fs = 0;
629 630 631 632 633 634 635 636 637
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
638
			prev->gs = 0;
L
Linus Torvalds 已提交
639
	}
640 641 642
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;
L
Linus Torvalds 已提交
643

A
Andi Kleen 已提交
644 645 646
	/* Must be after DS reload */
	unlazy_fpu(prev_p);

647
	/*
648
	 * Switch the PDA and FPU contexts.
L
Linus Torvalds 已提交
649
	 */
650 651
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
652
	write_pda(pcurrent, next_p);
653

654
	write_pda(kernelstack,
655 656
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
657 658 659 660 661 662 663 664
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif
L
Linus Torvalds 已提交
665 666

	/*
667
	 * Now maybe reload the debug registers and handle I/O bitmaps
L
Linus Torvalds 已提交
668
	 */
669 670
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
671
		__switch_to_xtra(prev_p, next_p, tss);
L
Linus Torvalds 已提交
672

673 674 675
	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
676 677 678
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
679
	 */
680
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
681
		math_state_restore();
L
Linus Torvalds 已提交
682 683 684 685 686 687
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
688
asmlinkage
L
Linus Torvalds 已提交
689
long sys_execve(char __user *name, char __user * __user *argv,
690
		char __user * __user *envp, struct pt_regs *regs)
L
Linus Torvalds 已提交
691 692
{
	long error;
693
	char *filename;
L
Linus Torvalds 已提交
694 695 696

	filename = getname(name);
	error = PTR_ERR(filename);
697
	if (IS_ERR(filename))
L
Linus Torvalds 已提交
698
		return error;
699
	error = do_execve(filename, argv, envp, regs);
L
Linus Torvalds 已提交
700 701 702 703 704 705 706 707 708
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
709
	clear_thread_flag(TIF_IA32);
L
Linus Torvalds 已提交
710 711 712 713

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
714
	   32bit childs are affected again. */
L
Linus Torvalds 已提交
715 716 717 718 719
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
720
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
L
Linus Torvalds 已提交
721 722
}

723 724 725
/*
 * clone(): forwards to do_fork().  A @newsp of 0 means "keep using the
 * caller's current stack pointer".
 */
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
744
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
L
Linus Torvalds 已提交
745 746 747 748 749 750
		    NULL, NULL);
}

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
751
	u64 fp, ip;
L
Linus Torvalds 已提交
752 753
	int count = 0;

754 755
	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
A
Al Viro 已提交
756
	stack = (unsigned long)task_stack_page(p);
757
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
L
Linus Torvalds 已提交
758
		return 0;
759
	fp = *(u64 *)(p->thread.sp);
760
	do {
761 762
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
763
			return 0;
764 765 766
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
767 768
		fp = *(u64 *)fp;
	} while (count++ < 16);
L
Linus Torvalds 已提交
769 770 771 772
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
773 774
{
	int ret = 0;
L
Linus Torvalds 已提交
775 776 777
	int doit = task == current;
	int cpu;

778
	switch (code) {
L
Linus Torvalds 已提交
779
	case ARCH_SET_GS:
780
		if (addr >= TASK_SIZE_OF(task))
781
			return -EPERM;
L
Linus Torvalds 已提交
782
		cpu = get_cpu();
783
		/* handle small bases via the GDT because that's faster to
L
Linus Torvalds 已提交
784
		   switch. */
785 786 787
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
L
Linus Torvalds 已提交
788
				load_TLS(&task->thread, cpu);
789
				load_gs_index(GS_TLS_SEL);
L
Linus Torvalds 已提交
790
			}
791
			task->thread.gsindex = GS_TLS_SEL;
L
Linus Torvalds 已提交
792
			task->thread.gs = 0;
793
		} else {
L
Linus Torvalds 已提交
794 795 796
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
797 798
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
799
			}
L
Linus Torvalds 已提交
800 801 802 803 804 805
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
806
		if (addr >= TASK_SIZE_OF(task))
807
			return -EPERM;
L
Linus Torvalds 已提交
808
		cpu = get_cpu();
809
		/* handle small bases via the GDT because that's faster to
L
Linus Torvalds 已提交
810
		   switch. */
811
		if (addr <= 0xffffffff) {
L
Linus Torvalds 已提交
812
			set_32bit_tls(task, FS_TLS, addr);
813 814
			if (doit) {
				load_TLS(&task->thread, cpu);
815
				loadsegment(fs, FS_TLS_SEL);
L
Linus Torvalds 已提交
816 817 818
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
819
		} else {
L
Linus Torvalds 已提交
820 821 822 823 824
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
825
				loadsegment(fs, 0);
826
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
L
Linus Torvalds 已提交
827 828 829 830
			}
		}
		put_cpu();
		break;
831 832
	case ARCH_GET_FS: {
		unsigned long base;
L
Linus Torvalds 已提交
833 834
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
835
		else if (doit)
L
Linus Torvalds 已提交
836
			rdmsrl(MSR_FS_BASE, base);
837
		else
L
Linus Torvalds 已提交
838
			base = task->thread.fs;
839 840
		ret = put_user(base, (unsigned long __user *)addr);
		break;
L
Linus Torvalds 已提交
841
	}
842
	case ARCH_GET_GS: {
L
Linus Torvalds 已提交
843
		unsigned long base;
844
		unsigned gsindex;
L
Linus Torvalds 已提交
845 846
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
847
		else if (doit) {
848
			savesegment(gs, gsindex);
849 850 851 852
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
853
		} else
L
Linus Torvalds 已提交
854
			base = task->thread.gs;
855
		ret = put_user(base, (unsigned long __user *)addr);
L
Linus Torvalds 已提交
856 857 858 859 860 861
		break;
	}

	default:
		ret = -EINVAL;
		break;
862
	}
L
Linus Torvalds 已提交
863

864 865
	return ret;
}
L
Linus Torvalds 已提交
866 867 868 869 870 871 872 873

/* arch_prctl() entry point: do_arch_prctl() applied to the current task. */
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

unsigned long arch_align_stack(unsigned long sp)
{
874
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
L
Linus Torvalds 已提交
875 876 877
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
J
Jiri Kosina 已提交
878 879 880 881 882 883

/*
 * Ask randomize_range() for an address between mm->brk and
 * mm->brk + 0x02000000 (32 MiB).  Falls back to mm->brk when
 * randomize_range() returns 0.
 */
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long picked = randomize_range(mm->brk,
					       mm->brk + 0x02000000, 0);

	if (picked)
		return picked;
	return mm->brk;
}