/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
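
/*
 * Example (illustrative sketch): how kernel code might hook the notifier
 * chain registered above. The callback and notifier_block names here are
 * hypothetical; the chain itself is invoked with IDLE_START/IDLE_END from
 * enter_idle()/__exit_idle() below.
 */
static int example_idle_event(struct notifier_block *nb, unsigned long action,
			      void *data)
{
	if (action == IDLE_START) {
		/* this CPU is about to run its idle routine */
	} else if (action == IDLE_END) {
		/* this CPU left the idle loop, e.g. because of an interrupt */
	}
	return NOTIFY_OK;
}

static struct notifier_block example_idle_nb = {
	.notifier_call = example_idle_event,
};
/* registration: idle_notifier_register(&example_idle_nb); */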

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

/*
 * We use this if we don't have any better
 * idle routine..
 */
void default_idle(void)
{
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	local_irq_disable();
	if (!need_resched()) {
		safe_halt();	/* enables interrupts racelessly */
		local_irq_disable();
	}
	local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	local_irq_enable();
	cpu_relax();
}

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick();
		while (!need_resched()) {
			void (*idle)(void);

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

static void do_nothing(void *unused)
{
}

/*
 * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
 * pm_idle and update to new pm_idle value. Required while changing pm_idle
 * handler on SMP systems.
 *
 * Caller must have changed pm_idle to the new value before the call. Old
 * pm_idle value will not be used by any CPU after the return of this function.
 */
void cpu_idle_wait(void)
{
	smp_mb();
	/* kick all the CPUs so that they exit out of pm_idle */
	smp_call_function(do_nothing, NULL, 0, 1);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
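
/*
 * Example (illustrative sketch): the calling pattern cpu_idle_wait()
 * expects. A caller swapping in a different idle handler publishes it
 * through pm_idle first and only then waits for every CPU to stop using
 * the old routine. The function name below is hypothetical.
 */
static void example_install_idle_handler(void (*new_idle)(void))
{
	pm_idle = new_idle;	/* publish the new handler first */
	cpu_idle_wait();	/* no CPU uses the old routine after this */
}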

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(ax, cx);
	}
}

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		local_irq_enable();
	}
}

static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
	if (force_mwait)
		return 1;
	/* Any C1 states supported? */
	return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int selected;

	if (selected)
		return;
#ifdef CONFIG_X86_SMP
	if (pm_idle == poll_idle && smp_num_siblings > 1) {
		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
			" performance may degrade.\n");
	}
#endif
	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => all CPUs support mwait
		 */
		if (!pm_idle) {
			printk(KERN_INFO "using mwait in idle threads.\n");
			pm_idle = mwait_idle;
		}
	}
	selected = 1;
}

static int __init idle_setup(char *str)
{
	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	} else if (!strcmp(str, "mwait"))
		force_mwait = 1;
	else
		return -1;

	boot_option_idle_override = 1;
	return 0;
}
early_param("idle", idle_setup);

/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
		regs->flags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	regs->flags		= 0x200;
	set_fs(USER_DS);
}
EXPORT_SYMBOL_GPL(start_thread);

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/* we clear debugctl to make sure DS
		 * is not in use when we change it */
		debugctl = 0;
		update_debugctlmsr(0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
	}

	if (next->debugctlmsr != debugctl)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

#ifdef X86_BTS
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
#endif
}

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}
	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		    NULL, NULL);
}

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		}
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
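
/*
 * Userspace usage sketch (illustrative): a 64-bit task reaches the cases
 * handled above through the arch_prctl(2) entry point, for example via
 * the raw syscall:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);	(read this thread's FS base)
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, base);	(move the FS base; checked against TASK_SIZE_OF())
 */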

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}