#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
	.x86_tss = {
		.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
		.ss0 = __KERNEL_DS,
		.ss1 = __KERNEL_CS,
		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
#endif
	 },
#ifdef CONFIG_X86_32
	 /*
	  * Note that the .io_bitmap member must be extra-big. This is because
	  * the CPU will access an additional byte beyond the end of the IO
	  * permission bitmap. The extra byte must be all 1 bits, and must
	  * be within the limit.
	  */
	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);

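/*
 * On x86-64 an atomic notifier chain lets other code be told when a CPU
 * enters (IDLE_START) and leaves (IDLE_END) the idle loop.
 */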
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif

/*
 * This gets called so that we can store lazy FPU state into memory and
 * copy the current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	*dst = *src;

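	/*
	 * The new task does not own any FPU state yet; allocate and copy
	 * an fpstate only if the source task has actually used the FPU.
	 */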
	dst->thread.fpu.counter = 0;
	dst->thread.fpu.has_fpu = 0;
	dst->thread.fpu.state = NULL;
	task_disable_lazy_fpu_restore(dst);
	if (tsk_used_math(src)) {
		int err = fpstate_alloc(&dst->thread.fpu);
		if (err)
			return err;
		fpu_copy(dst, src);
	}
	return 0;
}

void arch_release_task_struct(struct task_struct *tsk)
{
	fpstate_free(&tsk->thread.fpu);
}

void arch_task_cache_init(void)
{
	fpstate_cache_init();
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;
	unsigned long *bp = t->io_bitmap_ptr;

	if (bp) {
		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	drop_fpu(me);
}

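/* Called on exec: drop hardware breakpoints, TLS entries and FPU state. */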
void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	fpu__flush_thread(tsk);
}

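/*
 * CR4.TSD makes RDTSC/RDTSCP privileged, so a task running with
 * TIF_NOTSC set takes a #GP (seen as SIGSEGV) when reading the TSC.
 */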
static void hard_disable_TSC(void)
{
	cr4_set_bits(X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	cr4_clear_bits(X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

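/*
 * Back-ends for the PR_GET_TSC/PR_SET_TSC prctl() calls: query or change
 * whether RDTSC is allowed for the current task.
 */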
int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}

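/*
 * Slow path of the context switch: handle the TIF flags that need extra
 * work (block-step via DEBUGCTLMSR_BTF, TSC enable/disable, the IO
 * permission bitmap) and propagate user-return notifiers.
 */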
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
		unsigned long debugctl = get_debugctlmsr();

		debugctl &= ~DEBUGCTLMSR_BTF;
		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
			debugctl |= DEBUGCTLMSR_BTF;

		update_debugctlmsr(debugctl);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
	propagate_user_return_notify(prev_p, next_p);
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

#ifdef CONFIG_X86_64
void enter_idle(void)
{
	this_cpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
#endif

void arch_cpu_idle_enter(void)
{
	local_touch_nmi();
	enter_idle();
}

void arch_cpu_idle_exit(void)
{
	__exit_idle();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	x86_idle();
}

/*
 * We use this if we don't have any better idle routine.
 */
void default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}
#endif
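/*
 * Park the calling CPU: mark it offline, disable its local APIC and
 * halt forever. Used when stopping all CPUs, e.g. for reboot or panic.
 */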
void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();

	for (;;)
		halt();
}

bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

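/* A CPU is going away: forget whether it was switched to broadcast mode. */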
void amd_e400_remove_cpu(int cpu)
{
	if (amd_e400_c1e_mask != NULL)
		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}

/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
 * pending message MSR. If we detect C1E, then we handle it the same
 * way as C3 power states (local apic timer and TSC stop)
 */
static void amd_e400_idle(void)
{
	if (!amd_e400_c1e_detected) {
		u32 lo, hi;

		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
			amd_e400_c1e_detected = true;
			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
				mark_tsc_unstable("TSC halt in AMD C1E");
			pr_info("System has AMD C1E enabled\n");
		}
	}

	if (amd_e400_c1e_detected) {
		int cpu = smp_processor_id();

		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
			/* Force broadcast so ACPI can not interfere. */
			tick_broadcast_force();
			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
		}
		tick_broadcast_enter();

		default_idle();

		/*
		 * The switch back from broadcast mode needs to be
		 * called with interrupts disabled.
		 */
		local_irq_disable();
		tick_broadcast_exit();
		local_irq_enable();
	} else
		default_idle();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
	if (c->x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (!cpu_has(c, X86_FEATURE_MWAIT))
		return 0;

	return 1;
}

/*
 * MONITOR/MWAIT with no hints, used for the default C1 state.
 * This invokes MWAIT with interrupts enabled and no flags,
 * which is backwards compatible with the original MWAIT implementation.
 */

static void mwait_idle(void)
{
	if (!current_set_polling_and_test()) {
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			smp_mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			smp_mb(); /* quirk */
		}

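		/*
		 * Arm the monitor on this task's flags word: MWAIT will then
		 * return on the next interrupt or when TIF_NEED_RESCHED is
		 * set (which writes the monitored cache line).
		 */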
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		local_irq_enable();
	}
	__current_clr_polling();
}

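/*
 * Pick the boot-time idle routine: AMD parts with the E400 erratum get
 * amd_e400_idle(), Intel parts with MWAIT get mwait_idle(), everything
 * else keeps default_idle() (HLT).
 */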
void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
		/* E400: APIC timer interrupt does not wake up CPU from C1e */
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else if (prefer_mwait_c1_over_halt(c)) {
		pr_info("using mwait in idle threads\n");
		x86_idle = mwait_idle;
	} else
		x86_idle = default_idle;
}

void __init init_amd_e400_c1e_mask(void)
{
	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
	if (x86_idle == amd_e400_idle)
		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option idle=halt is given, halt is forced
		 * to be used for CPU idle; the CPU C2/C3 states won't be
		 * used again. The CPU idle driver is still allowed to
		 * load.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * If the boot option "idle=nomwait" is given, MWAIT is
		 * disabled for the CPU C2/C3 states; the idle routine
		 * selection itself is left alone.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);

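/*
 * Randomize the initial stack pointer by up to 8 kB (16-byte aligned),
 * unless randomization is disabled for this task.
 */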
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

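/* Place the heap break at a random address within 32 MB above mm->brk. */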
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}