process_64.c 22.2 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
6
 *
L
Linus Torvalds 已提交
7 8
 *  X86-64 port
 *	Andi Kleen.
A
Ashok Raj 已提交
9 10
 *
 *	CPU hotplug support - ashok.raj@intel.com
L
Linus Torvalds 已提交
11 12 13 14 15 16 17 18
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

A
Ashok Raj 已提交
19
#include <linux/cpu.h>
L
Linus Torvalds 已提交
20 21
#include <linux/errno.h>
#include <linux/sched.h>
22
#include <linux/fs.h>
L
Linus Torvalds 已提交
23 24 25 26 27 28 29
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
30
#include <linux/utsname.h>
L
Linus Torvalds 已提交
31
#include <linux/delay.h>
32
#include <linux/module.h>
L
Linus Torvalds 已提交
33 34
#include <linux/ptrace.h>
#include <linux/random.h>
A
Andi Kleen 已提交
35
#include <linux/notifier.h>
36
#include <linux/kprobes.h>
37
#include <linux/kdebug.h>
38
#include <linux/tick.h>
L
Linus Torvalds 已提交
39 40 41 42 43 44 45 46 47 48 49 50 51

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
A
Andi Kleen 已提交
52
#include <asm/idle.h>
L
Linus Torvalds 已提交
53 54 55 56 57 58 59 60 61 62 63 64

/* Declared here only; the definition lives outside this file. */
asmlinkage extern void ret_from_fork(void);

/* Flag word initialised to CLONE_VM | CLONE_UNTRACED. */
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

/* Set to 1 by idle_setup() once an "idle=" boot option has been accepted. */
unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power-management idle function, if any.
 */
void (*pm_idle)(void);
65
EXPORT_SYMBOL(pm_idle);
L
Linus Torvalds 已提交
66

67
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
A
Andi Kleen 已提交
68 69 70

void idle_notifier_register(struct notifier_block *n)
{
71
	atomic_notifier_chain_register(&idle_notifier, n);
A
Andi Kleen 已提交
72 73 74 75
}

void enter_idle(void)
{
A
Andi Kleen 已提交
76
	write_pda(isidle, 1);
77
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
A
Andi Kleen 已提交
78 79 80 81
}

static void __exit_idle(void)
{
A
Andi Kleen 已提交
82
	if (test_and_clear_bit_pda(0, isidle) == 0)
A
Andi Kleen 已提交
83
		return;
84
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
A
Andi Kleen 已提交
85 86 87 88 89
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
A
Andi Kleen 已提交
90 91
	/* idle loop has pid 0 */
	if (current->pid)
A
Andi Kleen 已提交
92 93 94 95
		return;
	__exit_idle();
}

L
Linus Torvalds 已提交
96 97 98 99
/*
 * We use this if we don't have any better
 * idle routine..
 */
100
void default_idle(void)
L
Linus Torvalds 已提交
101
{
102
	current_thread_info()->status &= ~TS_POLLING;
103 104 105 106 107
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
108 109
	local_irq_disable();
	if (!need_resched()) {
110 111 112 113 114 115 116 117 118 119
		ktime_t t0, t1;
		u64 t0n, t1n;

		t0 = ktime_get();
		t0n = ktime_to_ns(t0);
		safe_halt();	/* enables interrupts racelessly */
		local_irq_disable();
		t1 = ktime_get();
		t1n = ktime_to_ns(t1);
		sched_clock_idle_wakeup_event(t1n - t0n);
120 121
	}
	local_irq_enable();
122
	current_thread_info()->status |= TS_POLLING;
L
Linus Torvalds 已提交
123 124 125 126 127 128 129
}

/*
 * Polling idle routine: re-enable interrupts and relax the CPU once;
 * the caller's loop keeps re-checking need_resched().
 *
 * On SMP this is slightly faster (but much more power-consuming!) than
 * waiting for the cross-CPU IPI to arrive.  Use this option with caution.
 */
static void poll_idle(void)
{
	local_irq_enable();
	cpu_relax();
}

A
Ashok Raj 已提交
136 137 138 139
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
140
/* We halt the CPU with physical CPU hotplug */
A
Ashok Raj 已提交
141 142 143 144 145 146 147 148
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

149
	local_irq_disable();
A
Ashok Raj 已提交
150
	while (1)
151
		halt();
A
Ashok Raj 已提交
152 153 154 155 156 157 158 159
}
#else
/* Built without CONFIG_HOTPLUG_CPU: reaching this point is a bug. */
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

L
Linus Torvalds 已提交
160 161 162 163 164 165
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
P
Pavel Machek 已提交
166
void cpu_idle(void)
L
Linus Torvalds 已提交
167
{
168
	current_thread_info()->status |= TS_POLLING;
L
Linus Torvalds 已提交
169 170
	/* endless idle loop with no priority at all */
	while (1) {
171
		tick_nohz_stop_sched_tick();
L
Linus Torvalds 已提交
172 173 174 175 176 177 178
		while (!need_resched()) {
			void (*idle)(void);

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
A
Ashok Raj 已提交
179 180
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
181 182 183 184 185 186
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
A
Andi Kleen 已提交
187
			enter_idle();
L
Linus Torvalds 已提交
188
			idle();
A
Andi Kleen 已提交
189 190 191
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
A
Andi Kleen 已提交
192
			__exit_idle();
L
Linus Torvalds 已提交
193 194
		}

195
		tick_nohz_restart_sched_tick();
196
		preempt_enable_no_resched();
L
Linus Torvalds 已提交
197
		schedule();
198
		preempt_disable();
L
Linus Torvalds 已提交
199 200 201
	}
}

202 203 204 205
/* Empty callback; cpu_idle_wait() passes it to smp_call_function(). */
static void do_nothing(void *unused)
{
	(void)unused;
}

V
Venki Pallipadi 已提交
206 207 208 209 210 211 212 213
/*
 * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
 * pm_idle and update to new pm_idle value. Required while changing pm_idle
 * handler on SMP systems.
 *
 * Caller must have changed pm_idle to the new value before the call. Old
 * pm_idle value will not be used by any CPU after the return of this function.
 */
214 215
void cpu_idle_wait(void)
{
V
Venki Pallipadi 已提交
216 217 218
	smp_mb();
	/* kick all the CPUs so that they exit out of pm_idle */
	smp_call_function(do_nothing, NULL, 0, 1);
219 220 221
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

L
Linus Torvalds 已提交
222 223 224 225 226 227
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
228 229 230
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
L
Linus Torvalds 已提交
231
 */
232
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
L
Linus Torvalds 已提交
233
{
234
	if (!need_resched()) {
235 236
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
237
		if (!need_resched())
238
			__mwait(ax, cx);
L
Linus Torvalds 已提交
239 240 241
	}
}

242 243 244
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
245 246 247 248 249 250 251 252 253 254
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		local_irq_enable();
	}
255 256
}

257

258
/*
 * Return nonzero when MWAIT may be used for idling on CPU @c: always
 * when force_mwait is set, otherwise only if CPUID leaf 5 exists and
 * bits 7:4 of its EDX are nonzero.
 */
static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
	if (force_mwait)
		return 1;

	/* Any C1 states supported? */
	if (c->cpuid_level < 5)
		return 0;
	return ((cpuid_edx(5) >> 4) & 0xf) > 0;
}

266
/*
 * Choose the idle routine once; later calls return immediately.
 * Installs mwait_idle as pm_idle when the CPU has a usable MWAIT and no
 * idle routine has been set yet, and warns on SMP builds about polling
 * idle combined with more than one sibling.
 */
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int selected;

	if (selected)
		return;

#ifdef CONFIG_X86_SMP
	if (pm_idle == poll_idle && smp_num_siblings > 1) {
		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
			" performance may degrade.\n");
	}
#endif

	/*
	 * Skip, if setup has overridden idle.
	 * One CPU supports mwait => All CPUs supports mwait
	 */
	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c) && !pm_idle) {
		printk(KERN_INFO "using mwait in idle threads.\n");
		pm_idle = mwait_idle;
	}

	selected = 1;
}

291
/*
 * Parse the "idle=" boot parameter.
 *
 *   "poll"  - install poll_idle as pm_idle
 *   "mwait" - set force_mwait
 *
 * On success boot_option_idle_override is set to 1 and 0 is returned.
 * Returns -EINVAL when the option carries no value (@str is NULL, e.g.
 * a bare "idle" on the command line) and -1 for an unrecognised value.
 */
static int __init idle_setup(char *str)
{
	/* a value-less option hands us NULL; never pass that to strcmp() */
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	} else if (!strcmp(str, "mwait")) {
		force_mwait = 1;
	} else {
		return -1;
	}

	boot_option_idle_override = 1;
	return 0;
}
304
early_param("idle", idle_setup);
L
Linus Torvalds 已提交
305

306
/* Prints also some state that isn't saved in the pt_regs */
L
Linus Torvalds 已提交
307 308 309
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
310
	unsigned long d0, d1, d2, d3, d6, d7;
311 312
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
L
Linus Torvalds 已提交
313 314 315

	printk("\n");
	print_modules();
316 317
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
318 319 320
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
321
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
322
	printk_address(regs->ip, 1);
323 324
	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
		regs->flags);
L
Linus Torvalds 已提交
325
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
326
	       regs->ax, regs->bx, regs->cx);
L
Linus Torvalds 已提交
327
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
328
	       regs->dx, regs->si, regs->di);
L
Linus Torvalds 已提交
329
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
330
	       regs->bp, regs->r8, regs->r9);
L
Linus Torvalds 已提交
331 332 333 334 335 336 337 338 339 340 341 342 343 344 345
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12); 
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15); 

	asm("movl %%ds,%0" : "=r" (ds)); 
	asm("movl %%cs,%0" : "=r" (cs)); 
	asm("movl %%es,%0" : "=r" (es)); 
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs); 
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 

346 347 348 349
	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();
L
Linus Torvalds 已提交
350 351 352 353 354

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 
	       fs,fsindex,gs,gsindex,shadowgs); 
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
355 356 357 358 359 360 361 362 363

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
L
Linus Torvalds 已提交
364 365 366 367
}

void show_regs(struct pt_regs *regs)
{
368
	printk("CPU %d:", smp_processor_id());
L
Linus Torvalds 已提交
369
	__show_regs(regs);
370
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
L
Linus Torvalds 已提交
371 372 373 374 375 376 377 378 379
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;
380

381
	if (me->thread.io_bitmap_ptr) {
L
Linus Torvalds 已提交
382 383 384 385
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
386
		clear_thread_flag(TIF_IO_BITMAP);
L
Linus Torvalds 已提交
387 388 389 390 391 392 393 394 395 396 397 398 399
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

400 401 402 403 404 405
	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
406
			current_thread_info()->status |= TS_COMPAT;
407
		}
408
	}
409
	clear_tsk_thread_flag(tsk, TIF_DEBUG);
L
Linus Torvalds 已提交
410 411 412 413 414 415 416

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
417
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
L
Linus Torvalds 已提交
418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}

/*
 * Sanity check on a dead task: if it still has an mm whose LDT size is
 * nonzero, print a warning and BUG().
 */
void release_thread(struct task_struct *dead_task)
{
	if (!dead_task->mm)
		return;
	if (!dead_task->mm->context.size)
		return;

	printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
			dead_task->comm,
			dead_task->mm->context.ldt,
			dead_task->mm->context.size);
	BUG();
}

/*
 * Fill slot @tls of task @t's TLS array with a 32-bit, page-granular
 * descriptor whose base is @addr and whose limit is 0xfffff.
 */
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *slot = &t->thread.tls_array[tls];

	fill_ldt(slot, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
R
Roland McGrath 已提交
454
	return get_desc_base(&t->thread.tls_array[tls]);
L
Linus Torvalds 已提交
455 456 457 458 459 460 461 462 463 464 465
}

/*
 * Called before a new thread is allocated and the current task is
 * copied into it; calls unlazy_fpu() on @tsk.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

466
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
L
Linus Torvalds 已提交
467 468 469 470 471 472 473
		unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

474
	childregs = ((struct pt_regs *)
A
Al Viro 已提交
475
			(THREAD_SIZE + task_stack_page(p))) - 1;
L
Linus Torvalds 已提交
476 477
	*childregs = *regs;

478 479 480 481
	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;
L
Linus Torvalds 已提交
482

483 484 485
	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;
L
Linus Torvalds 已提交
486

A
Al Viro 已提交
487
	set_tsk_thread_flag(p, TIF_FORK);
L
Linus Torvalds 已提交
488 489 490 491

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

492 493 494 495
	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));
L
Linus Torvalds 已提交
496

497
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
L
Linus Torvalds 已提交
498 499 500 501 502
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
503 504
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
505
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
506
	}
L
Linus Torvalds 已提交
507 508 509 510 511 512 513

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
R
Roland McGrath 已提交
514
			err = do_set_thread_area(p, -1,
515
				(struct user_desc __user *)childregs->si, 0);
L
Linus Torvalds 已提交
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530
		else 			
#endif	 
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 
		if (err) 
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

I
Ingo Molnar 已提交
531 532 533 534 535 536 537 538 539 540 541 542 543 544 545
/*
 * Prepare @regs so that the frame resumes at @new_ip with stack pointer
 * @new_sp: the FS, ES, DS and GS selectors are loaded with 0, the PDA's
 * oldrsp is set to @new_sp, CS/SS are set to __USER_CS/__USER_DS, the
 * saved flags become 0x200 and the address limit is set to USER_DS.
 */
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	/* null selectors in fs, es and ds; gs goes through load_gs_index() */
	asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	/* 0x200 is the x86 EFLAGS interrupt-enable (IF) bit */
	regs->flags		= 0x200;
	set_fs(USER_DS);
}
EXPORT_SYMBOL_GPL(start_thread);

L
Linus Torvalds 已提交
546 547 548
/*
 * This special macro can be used to load a debugging register:
 * loaddebug(thread, N) writes thread->debugregN into debug register N.
 * The pointer argument is parenthesised so that any pointer expression
 * can be passed safely.
 */
#define loaddebug(thread, r) set_debugreg((thread)->debugreg ## r, r)

551
static inline void __switch_to_xtra(struct task_struct *prev_p,
552 553
				    struct task_struct *next_p,
				    struct tss_struct *tss)
554 555
{
	struct thread_struct *prev, *next;
556
	unsigned long debugctl;
557 558 559 560

	prev = &prev_p->thread,
	next = &next_p->thread;

561 562 563 564 565 566 567 568 569 570
	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/* we clear debugctl to make sure DS
		 * is not in use when we change it */
		debugctl = 0;
		wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
	}

	if (next->debugctlmsr != debugctl)
R
Roland McGrath 已提交
571 572
		wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);

573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595
	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
596

597
#ifdef X86_BTS
598 599 600 601 602
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
603
#endif
604 605
}

L
Linus Torvalds 已提交
606 607 608
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
609
 * This could still be optimized:
L
Linus Torvalds 已提交
610 611
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
612 613
 *
 * Kprobes not supported here. Set the probe on schedule instead.
L
Linus Torvalds 已提交
614
 */
615
struct task_struct *
616
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
L
Linus Torvalds 已提交
617 618 619
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
620
	int cpu = smp_processor_id();
L
Linus Torvalds 已提交
621 622
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

623 624 625 626
	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter>5)
		prefetch(&next->i387.fxsave);

L
Linus Torvalds 已提交
627 628 629
	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
630
	load_sp0(tss, next);
L
Linus Torvalds 已提交
631 632 633 634 635

	/* 
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
636
	asm volatile("mov %%es,%0" : "=m" (prev->es));
L
Linus Torvalds 已提交
637 638 639
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es); 
	
640
	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
L
Linus Torvalds 已提交
641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/* 
	 * Switch FS and GS.
	 */
	{ 
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex)); 
		/* segment register != 0 always requires a reload. 
		   also reload when it has changed. 
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
	                 * if yes clear 64bit base, since overloaded base
                         * is always mapped to the Null selector
                         */
			if (fsindex)
			prev->fs = 0;				
		}
		/* when next process has a 64bit base use it */
		if (next->fs) 
			wrmsrl(MSR_FS_BASE, next->fs); 
		prev->fsindex = fsindex;
	}
	{ 
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex)); 
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
			prev->gs = 0;				
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 
		prev->gsindex = gsindex;
	}

A
Andi Kleen 已提交
683 684 685
	/* Must be after DS reload */
	unlazy_fpu(prev_p);

L
Linus Torvalds 已提交
686
	/* 
687
	 * Switch the PDA and FPU contexts.
L
Linus Torvalds 已提交
688
	 */
689 690
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
L
Linus Torvalds 已提交
691
	write_pda(pcurrent, next_p); 
692

693
	write_pda(kernelstack,
694
	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
695 696 697 698 699 700 701 702
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif
L
Linus Torvalds 已提交
703 704

	/*
705
	 * Now maybe reload the debug registers and handle I/O bitmaps
L
Linus Torvalds 已提交
706
	 */
707 708
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
709
		__switch_to_xtra(prev_p, next_p, tss);
L
Linus Torvalds 已提交
710

711 712 713 714 715 716
	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter>5)
		math_state_restore();
L
Linus Torvalds 已提交
717 718 719 720 721 722
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
723
asmlinkage
L
Linus Torvalds 已提交
724
long sys_execve(char __user *name, char __user * __user *argv,
725
		char __user * __user *envp, struct pt_regs *regs)
L
Linus Torvalds 已提交
726 727 728 729 730 731
{
	long error;
	char * filename;

	filename = getname(name);
	error = PTR_ERR(filename);
732
	if (IS_ERR(filename))
L
Linus Torvalds 已提交
733
		return error;
734
	error = do_execve(filename, argv, envp, regs);
L
Linus Torvalds 已提交
735 736 737 738 739 740 741 742 743
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
744
	clear_thread_flag(TIF_IA32);
L
Linus Torvalds 已提交
745 746 747 748

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
749
	   32bit childs are affected again. */
L
Linus Torvalds 已提交
750 751 752 753 754
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
755
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
L
Linus Torvalds 已提交
756 757
}

758 759 760
/*
 * clone(): forward to do_fork().  A @newsp of 0 means "use the caller's
 * current stack pointer".
 */
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	unsigned long child_sp = newsp ? newsp : regs->sp;

	return do_fork(clone_flags, child_sp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
779
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
L
Linus Torvalds 已提交
780 781 782 783 784 785
		    NULL, NULL);
}

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
786
	u64 fp,ip;
L
Linus Torvalds 已提交
787 788 789 790
	int count = 0;

	if (!p || p == current || p->state==TASK_RUNNING)
		return 0; 
A
Al Viro 已提交
791
	stack = (unsigned long)task_stack_page(p);
792
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
L
Linus Torvalds 已提交
793
		return 0;
794
	fp = *(u64 *)(p->thread.sp);
L
Linus Torvalds 已提交
795
	do { 
796 797
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
L
Linus Torvalds 已提交
798
			return 0; 
799 800 801
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
L
Linus Torvalds 已提交
802 803 804 805 806 807 808 809 810 811 812 813 814
		fp = *(u64 *)fp; 
	} while (count++ < 16); 
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{ 
	int ret = 0; 
	int doit = task == current;
	int cpu;

	switch (code) { 
	case ARCH_SET_GS:
815
		if (addr >= TASK_SIZE_OF(task))
L
Linus Torvalds 已提交
816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831
			return -EPERM; 
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to 
		   switch. */
		if (addr <= 0xffffffff) {  
			set_32bit_tls(task, GS_TLS, addr); 
			if (doit) { 
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL); 
			}
			task->thread.gsindex = GS_TLS_SEL; 
			task->thread.gs = 0;
		} else { 
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
832 833
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
L
Linus Torvalds 已提交
834 835 836 837 838 839 840
			} 
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
841
		if (addr >= TASK_SIZE_OF(task))
842
			return -EPERM;
L
Linus Torvalds 已提交
843
		cpu = get_cpu();
844
		/* handle small bases via the GDT because that's faster to
L
Linus Torvalds 已提交
845
		   switch. */
846
		if (addr <= 0xffffffff) {
L
Linus Torvalds 已提交
847
			set_32bit_tls(task, FS_TLS, addr);
848 849
			if (doit) {
				load_TLS(&task->thread, cpu);
850
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
L
Linus Torvalds 已提交
851 852 853
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
854
		} else {
L
Linus Torvalds 已提交
855 856 857 858 859
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
860 861
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
L
Linus Torvalds 已提交
862 863 864 865
			}
		}
		put_cpu();
		break;
866 867
	case ARCH_GET_FS: {
		unsigned long base;
L
Linus Torvalds 已提交
868 869
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
870
		else if (doit)
L
Linus Torvalds 已提交
871
			rdmsrl(MSR_FS_BASE, base);
872
		else
L
Linus Torvalds 已提交
873
			base = task->thread.fs;
874 875
		ret = put_user(base, (unsigned long __user *)addr);
		break;
L
Linus Torvalds 已提交
876
	}
877
	case ARCH_GET_GS: {
L
Linus Torvalds 已提交
878
		unsigned long base;
879
		unsigned gsindex;
L
Linus Torvalds 已提交
880 881
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
882
		else if (doit) {
883
			asm("movl %%gs,%0" : "=r" (gsindex));
884 885 886 887 888
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		}
889
		else
L
Linus Torvalds 已提交
890
			base = task->thread.gs;
891
		ret = put_user(base, (unsigned long __user *)addr);
L
Linus Torvalds 已提交
892 893 894 895 896 897
		break;
	}

	default:
		ret = -EINVAL;
		break;
898
	}
L
Linus Torvalds 已提交
899

900 901
	return ret;
}
L
Linus Torvalds 已提交
902 903 904 905 906 907 908 909

/* arch_prctl() system call: do_arch_prctl() applied to the current task. */
long sys_arch_prctl(int code, unsigned long addr)
{
	struct task_struct *self = current;

	return do_arch_prctl(self, code, addr);
}

unsigned long arch_align_stack(unsigned long sp)
{
910
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
L
Linus Torvalds 已提交
911 912 913
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
J
Jiri Kosina 已提交
914 915 916 917 918 919

/*
 * Pick a randomized brk in the 0x02000000-byte window that starts at
 * mm->brk.  Falls back to mm->brk when randomize_range() returns 0.
 */
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	unsigned long picked = randomize_range(mm->brk, range_end, 0);

	if (picked)
		return picked;
	return mm->brk;
}