1. 04 2月, 2015 1 次提交
  2. 11 12月, 2014 1 次提交
    • A
      x86_64, switch_to(): Load TLS descriptors before switching DS and ES · f647d7c1
      Andy Lutomirski 提交于
      Otherwise, if buggy user code points DS or ES into the TLS
      array, they would be corrupted after a context switch.
      
      This also significantly improves the comments and documents some
      gotchas in the code.
      
      Before this patch, both tests below failed.  With this
      patch, the es test passes, although the gsbase test still fails.
      
       ----- begin es test -----
      
      /*
       * Copyright (c) 2014 Andy Lutomirski
       * GPL v2
       */
      
      static unsigned short GDT3(int idx)
      {
      	return (idx << 3) | 3;
      }
      
      static int create_tls(int idx, unsigned int base)
      {
      	struct user_desc desc = {
      		.entry_number    = idx,
      		.base_addr       = base,
      		.limit           = 0xfffff,
      		.seg_32bit       = 1,
      		.contents        = 0, /* Data, grow-up */
      		.read_exec_only  = 0,
      		.limit_in_pages  = 1,
      		.seg_not_present = 0,
      		.useable         = 0,
      	};
      
      	if (syscall(SYS_set_thread_area, &desc) != 0)
      		err(1, "set_thread_area");
      
      	return desc.entry_number;
      }
      
      int main()
      {
      	int idx = create_tls(-1, 0);
      	printf("Allocated GDT index %d\n", idx);
      
      	unsigned short orig_es;
      	asm volatile ("mov %%es,%0" : "=rm" (orig_es));
      
      	int errors = 0;
      	int total = 1000;
      	for (int i = 0; i < total; i++) {
      		asm volatile ("mov %0,%%es" : : "rm" (GDT3(idx)));
      		usleep(100);
      
      		unsigned short es;
      		asm volatile ("mov %%es,%0" : "=rm" (es));
      		asm volatile ("mov %0,%%es" : : "rm" (orig_es));
      		if (es != GDT3(idx)) {
      			if (errors == 0)
      				printf("[FAIL]\tES changed from 0x%hx to 0x%hx\n",
      				       GDT3(idx), es);
      			errors++;
      		}
      	}
      
      	if (errors) {
      		printf("[FAIL]\tES was corrupted %d/%d times\n", errors, total);
      		return 1;
      	} else {
      		printf("[OK]\tES was preserved\n");
      		return 0;
      	}
      }
      
       ----- end es test -----
      
       ----- begin gsbase test -----
      
      /*
       * gsbase.c, a gsbase test
       * Copyright (c) 2014 Andy Lutomirski
       * GPL v2
       */
      
      static unsigned char *testptr, *testptr2;
      
      static unsigned char read_gs_testvals(void)
      {
      	unsigned char ret;
      	asm volatile ("movb %%gs:%1, %0" : "=r" (ret) : "m" (*testptr));
      	return ret;
      }
      
      int main()
      {
      	int errors = 0;
      
      	testptr = mmap((void *)0x200000000UL, 1, PROT_READ | PROT_WRITE,
      		       MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
      	if (testptr == MAP_FAILED)
      		err(1, "mmap");
      
      	testptr2 = mmap((void *)0x300000000UL, 1, PROT_READ | PROT_WRITE,
      		       MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
      	if (testptr2 == MAP_FAILED)
      		err(1, "mmap");
      
      	*testptr = 0;
      	*testptr2 = 1;
      
      	if (syscall(SYS_arch_prctl, ARCH_SET_GS,
      		    (unsigned long)testptr2 - (unsigned long)testptr) != 0)
      		err(1, "ARCH_SET_GS");
      
      	usleep(100);
      
      	if (read_gs_testvals() == 1) {
      		printf("[OK]\tARCH_SET_GS worked\n");
      	} else {
      		printf("[FAIL]\tARCH_SET_GS failed\n");
      		errors++;
      	}
      
      	asm volatile ("mov %0,%%gs" : : "r" (0));
      
      	if (read_gs_testvals() == 0) {
      		printf("[OK]\tWriting 0 to gs worked\n");
      	} else {
      		printf("[FAIL]\tWriting 0 to gs failed\n");
      		errors++;
      	}
      
      	usleep(100);
      
      	if (read_gs_testvals() == 0) {
      		printf("[OK]\tgsbase is still zero\n");
      	} else {
      		printf("[FAIL]\tgsbase was corrupted\n");
      		errors++;
      	}
      
      	return errors == 0 ? 0 : 1;
      }
      
       ----- end gsbase test -----
      Signed-off-by: Andy Lutomirski <luto@amacapital.net>
      Cc: <stable@vger.kernel.org>
      Cc: Andi Kleen <andi@firstfloor.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Link: http://lkml.kernel.org/r/509d27c9fec78217691c3dad91cec87e1006b34a.1418075657.git.luto@amacapital.net
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
      f647d7c1
  3. 03 9月, 2014 2 次提交
  4. 06 5月, 2014 1 次提交
  5. 01 5月, 2014 1 次提交
  6. 13 11月, 2013 2 次提交
    • V
      x86: move fpu_counter into ARCH specific thread_struct · c375f15a
      Vineet Gupta 提交于
      Only a couple of arches (sh/x86) use fpu_counter in task_struct so it can
      be moved out into ARCH specific thread_struct, reducing the size of
      task_struct for other arches.
      
      Compile tested i386_defconfig + gcc 4.7.3
      Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
      Acked-by: Ingo Molnar <mingo@kernel.org>
      Cc: Paul Mundt <paul.mundt@gmail.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
      c375f15a
    • J
      x86/dumpstack: Fix printk_address for direct addresses · 5f01c988
      Jiri Slaby 提交于
      Consider a kernel crash in a module, simulated the following way:
      
       static int my_init(void)
       {
               char *map = (void *)0x5;
               *map = 3;
               return 0;
       }
       module_init(my_init);
      
      When we turn off FRAME_POINTERs, the very first instruction in
      that function causes a BUG. The problem is that we print IP in
      the BUG report using %pB (from printk_address). And %pB
      decrements the pointer by one to fix printing addresses of
      functions with tail calls.
      
      This was added in commit 71f9e598 ("x86, dumpstack: Use
      %pB format specifier for stack trace") to fix the call stack
      printouts.
      
      So instead of correct output:
      
        BUG: unable to handle kernel NULL pointer dereference at 0000000000000005
        IP: [<ffffffffa01ac000>] my_init+0x0/0x10 [pb173]
      
      We get:
      
        BUG: unable to handle kernel NULL pointer dereference at 0000000000000005
        IP: [<ffffffffa0152000>] 0xffffffffa0151fff
      
      To fix that, we use %pB only for stack addresses printouts (via
      newly added printk_stack_address) and %pS for regs->ip (via
      printk_address). I.e. we revert to the old behaviour for all
      except call stacks. And since reliable is 1 for all of those
      callers, we remove that parameter from printk_address.
      Signed-off-by: Jiri Slaby <jslaby@suse.cz>
      Cc: Namhyung Kim <namhyung@gmail.com>
      Cc: Frederic Weisbecker <fweisbec@gmail.com>
      Cc: Ingo Molnar <mingo@elte.hu>
      Cc: "H. Peter Anvin" <hpa@zytor.com>
      Cc: joe@perches.com
      Cc: jirislaby@gmail.com
      Link: http://lkml.kernel.org/r/1382706418-8435-1-git-send-email-jslaby@suse.cz
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
      5f01c988
  7. 25 9月, 2013 1 次提交
  8. 07 8月, 2013 2 次提交
  9. 26 6月, 2013 1 次提交
  10. 19 6月, 2013 1 次提交
  11. 01 5月, 2013 1 次提交
    • T
      dump_stack: unify debug information printed by show_regs() · a43cb95d
      Tejun Heo 提交于
      show_regs() is inherently arch-dependent but it does make sense to print
      generic debug information and some archs already do albeit in slightly
      different forms.  This patch introduces a generic function to print debug
      information from show_regs() so that different archs print out the same
      information and it's much easier to modify what's printed.
      
      show_regs_print_info() prints out the same debug info as dump_stack()
      does plus task and thread_info pointers.
      
      * Archs which didn't print debug info now do.
      
        alpha, arc, blackfin, c6x, cris, frv, h8300, hexagon, ia64, m32r,
        metag, microblaze, mn10300, openrisc, parisc, score, sh64, sparc,
        um, xtensa
      
      * Already prints debug info.  Replaced with show_regs_print_info().
        The printed information is superset of what used to be there.
      
        arm, arm64, avr32, mips, powerpc, sh32, tile, unicore32, x86
      
      * s390 is special in that it used to print arch-specific information
        along with generic debug info.  Heiko and Martin think that the
        arch-specific extra isn't worth keeping s390 specific implementation.
        Converted to use the generic version.
      
      Note that now all archs print the debug info before actual register
      dumps.
      
      An example BUG() dump follows.
      
       kernel BUG at /work/os/work/kernel/workqueue.c:4841!
       invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
       Modules linked in:
       CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.9.0-rc1-work+ #7
       Hardware name: empty empty/S3992, BIOS 080011  10/26/2007
       task: ffff88007c85e040 ti: ffff88007c860000 task.ti: ffff88007c860000
       RIP: 0010:[<ffffffff8234a07e>]  [<ffffffff8234a07e>] init_workqueues+0x4/0x6
       RSP: 0000:ffff88007c861ec8  EFLAGS: 00010246
       RAX: ffff88007c861fd8 RBX: ffffffff824466a8 RCX: 0000000000000001
       RDX: 0000000000000046 RSI: 0000000000000001 RDI: ffffffff8234a07a
       RBP: ffff88007c861ec8 R08: 0000000000000000 R09: 0000000000000000
       R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff8234a07a
       R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
       FS:  0000000000000000(0000) GS:ffff88007dc00000(0000) knlGS:0000000000000000
       CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
       CR2: ffff88015f7ff000 CR3: 00000000021f1000 CR4: 00000000000007f0
       DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
       DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
       Stack:
        ffff88007c861ef8 ffffffff81000312 ffffffff824466a8 ffff88007c85e650
        0000000000000003 0000000000000000 ffff88007c861f38 ffffffff82335e5d
        ffff88007c862080 ffffffff8223d8c0 ffff88007c862080 ffffffff81c47760
       Call Trace:
        [<ffffffff81000312>] do_one_initcall+0x122/0x170
        [<ffffffff82335e5d>] kernel_init_freeable+0x9b/0x1c8
        [<ffffffff81c47760>] ? rest_init+0x140/0x140
        [<ffffffff81c4776e>] kernel_init+0xe/0xf0
        [<ffffffff81c6be9c>] ret_from_fork+0x7c/0xb0
        [<ffffffff81c47760>] ? rest_init+0x140/0x140
        ...
      
      v2: Typo fix in x86-32.
      
      v3: CPU number dropped from show_regs_print_info() as
          dump_stack_print_info() has been updated to print it.  s390
          specific implementation dropped as requested by s390 maintainers.
      Signed-off-by: Tejun Heo <tj@kernel.org>
      Acked-by: David S. Miller <davem@davemloft.net>
      Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
      Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
      Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
      Cc: Bjorn Helgaas <bhelgaas@google.com>
      Cc: Fengguang Wu <fengguang.wu@intel.com>
      Cc: Mike Frysinger <vapier@gentoo.org>
      Cc: Vineet Gupta <vgupta@synopsys.com>
      Cc: Sam Ravnborg <sam@ravnborg.org>
      Acked-by: Chris Metcalf <cmetcalf@tilera.com>		[tile bits]
      Acked-by: Richard Kuo <rkuo@codeaurora.org>		[hexagon bits]
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
      a43cb95d
  12. 25 1月, 2013 1 次提交
  13. 29 11月, 2012 2 次提交
  14. 01 10月, 2012 1 次提交
  15. 19 9月, 2012 1 次提交
    • S
      x86, fpu: use non-lazy fpu restore for processors supporting xsave · 304bceda
      Suresh Siddha 提交于
      Fundamental model of the current Linux kernel is to lazily init and
      restore FPU instead of restoring the task state during context switch.
      This changes that fundamental lazy model to the non-lazy model for
      the processors supporting xsave feature.
      
      Reasons driving this model change are:
      
      i. Newer processors support optimized state save/restore using xsaveopt and
      xrstor by tracking the INIT state and MODIFIED state during context-switch.
      This is faster than modifying the cr0.TS bit which has serializing semantics.
      
      ii. Newer glibc versions use SSE for some of the optimized copy/clear routines.
      With certain workloads (like boot, kernel-compilation etc), application
      completes its work within the first 5 task switches, thus taking up to 5 #DNA
      traps with the kernel not getting a chance to apply the above mentioned
      pre-load heuristic.
      
      iii. Some xstate features (like AMD's LWP feature) don't honor the cr0.TS bit
      and thus will not work correctly in the presence of lazy restore. Non-lazy
      state restore is needed for enabling such features.
      
      Some data on a two socket SNB system:
       * Saved 20K DNA exceptions during boot on a two socket SNB system.
       * Saved 50K DNA exceptions during kernel-compilation workload.
       * Improved throughput of the AVX based checksumming function inside the
         kernel by ~15% as xsave/xrstor is faster than the serializing clts/stts
         pair.
      
      Also now kernel_fpu_begin/end() relies on the patched
      alternative instructions. So move check_fpu() which uses the
      kernel_fpu_begin/end() after alternative_instructions().
      Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
      Link: http://lkml.kernel.org/r/1345842782-24175-7-git-send-email-suresh.b.siddha@intel.com
      Merge 32-bit boot fix from,
      Link: http://lkml.kernel.org/r/1347300665-6209-4-git-send-email-suresh.b.siddha@intel.com
      Cc: Jim Kukunas <james.t.kukunas@linux.intel.com>
      Cc: NeilBrown <neilb@suse.de>
      Cc: Avi Kivity <avi@redhat.com>
      Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
      304bceda
  16. 08 6月, 2012 1 次提交
  17. 06 6月, 2012 1 次提交
  18. 17 5月, 2012 1 次提交
    • S
      fork: move the real prepare_to_copy() users to arch_dup_task_struct() · 55ccf3fe
      Suresh Siddha 提交于
      Historical prepare_to_copy() is mostly a no-op, duplicated for majority of
      the architectures and the rest following the x86 model of flushing the extended
      register state like fpu there.
      
      Remove it and use the arch_dup_task_struct() instead.
      Suggested-by: Oleg Nesterov <oleg@redhat.com>
      Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
      Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
      Link: http://lkml.kernel.org/r/1336692811-30576-1-git-send-email-suresh.b.siddha@intel.com
      Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
      Cc: David Howells <dhowells@redhat.com>
      Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
      Cc: Paul Mackerras <paulus@samba.org>
      Cc: Paul Mundt <lethal@linux-sh.org>
      Cc: Chris Zankel <chris@zankel.net>
      Cc: Richard Henderson <rth@twiddle.net>
      Cc: Russell King <linux@arm.linux.org.uk>
      Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
      Cc: Mike Frysinger <vapier@gentoo.org>
      Cc: Mark Salter <msalter@redhat.com>
      Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
      Cc: Mikael Starvik <starvik@axis.com>
      Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
      Cc: Richard Kuo <rkuo@codeaurora.org>
      Cc: Tony Luck <tony.luck@intel.com>
      Cc: Michal Simek <monstr@monstr.eu>
      Cc: Ralf Baechle <ralf@linux-mips.org>
      Cc: Jonas Bonn <jonas@southpole.se>
      Cc: James E.J. Bottomley <jejb@parisc-linux.org>
      Cc: Helge Deller <deller@gmx.de>
      Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
      Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
      Cc: Chen Liqin <liqin.chen@sunplusct.com>
      Cc: Lennox Wu <lennox.wu@gmail.com>
      Cc: David S. Miller <davem@davemloft.net>
      Cc: Chris Metcalf <cmetcalf@tilera.com>
      Cc: Jeff Dike <jdike@addtoit.com>
      Cc: Richard Weinberger <richard@nod.at>
      Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
      Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
      55ccf3fe
  19. 15 5月, 2012 1 次提交
  20. 07 5月, 2012 1 次提交
  21. 29 3月, 2012 1 次提交
  22. 26 3月, 2012 1 次提交
  23. 01 3月, 2012 1 次提交
  24. 27 2月, 2012 1 次提交
  25. 26 2月, 2012 2 次提交
  26. 22 2月, 2012 1 次提交
    • L
      i387: Split up <asm/i387.h> into exported and internal interfaces · 1361b83a
      Linus Torvalds 提交于
      While various modules include <asm/i387.h> to get access to things we
      actually *intend* for them to use, most of that header file was really
      pretty low-level internal stuff that we really don't want to expose to
      others.
      
      So split the header file into two: the small exported interfaces remain
      in <asm/i387.h>, while the internal definitions that are only used by
      core architecture code are now in <asm/fpu-internal.h>.
      
      The guiding principle for this was to expose functions that we export to
      modules, and leave them in <asm/i387.h>, while stuff that is used by
      task switching or was marked GPL-only is in <asm/fpu-internal.h>.
      
      The fpu-internal.h file could be further split up too, especially since
      arch/x86/kvm/ uses some of the remaining stuff for its module.  But that
      kvm usage should probably be abstracted out a bit, and at least now the
      internal FPU accessor functions are much more contained.  Even if it
      isn't perhaps as contained as it _could_ be.
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
      Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1202211340330.5354@i5.linux-foundation.org
      Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
      1361b83a
  27. 21 2月, 2012 5 次提交
    • H
      x32: Handle process creation · d1a797f3
      H. Peter Anvin 提交于
      Allow an x32 process to be started.
      Originally-by: H. J. Lu <hjl.tools@gmail.com>
      Signed-off-by: H. Peter Anvin <hpa@zytor.com>
      Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
      d1a797f3
    • H
      x32: Add a thread flag for x32 processes · bb212724
      H. Peter Anvin 提交于
      An x32 process is *almost* the same thing as a 64-bit process with a
      32-bit address limit, but there are a few minor differences -- in
      particular core dumps are 32 bits and signal handling is different.
      Signed-off-by: H. Peter Anvin <hpa@zytor.com>
      bb212724
    • H
      x86: Factor out TIF_IA32 from 32-bit address space · 6bd33008
      H. Peter Anvin 提交于
      Factor out IA32 (compatibility instruction set) from 32-bit address
      space in the thread_info flags; this is a precondition patch for x32
      support.
      Originally-by: H. J. Lu <hjl.tools@gmail.com>
      Signed-off-by: H. Peter Anvin <hpa@zytor.com>
      Link: http://lkml.kernel.org/n/tip-4pr1xnnksprt7t0h3w5fw4rv@git.kernel.org
      6bd33008
    • L
      i387: support lazy restore of FPU state · 7e16838d
      Linus Torvalds 提交于
      This makes us recognize when we try to restore FPU state that matches
      what we already have in the FPU on this CPU, and avoids the restore
      entirely if so.
      
      To do this, we add two new data fields:
      
       - a percpu 'fpu_owner_task' variable that gets written any time we
         update the "has_fpu" field, and thus acts as a kind of back-pointer
         to the task that owns the CPU.  The exception is when we save the FPU
         state as part of a context switch - if the save can keep the FPU
         state around, we leave the 'fpu_owner_task' variable pointing at the
         task whose FP state still remains on the CPU.
      
       - a per-thread 'last_cpu' field, that indicates which CPU that thread
         used its FPU on last.  We update this on every context switch
         (writing an invalid CPU number if the last context switch didn't
         leave the FPU in a lazily usable state), so we know that *that*
         thread has done nothing else with the FPU since.
      
      These two fields together can be used when next switching back to the
      task to see if the CPU still matches: if 'fpu_owner_task' matches the
      task we are switching to, we know that no other task (or kernel FPU
      usage) touched the FPU on this CPU in the meantime, and if the current
      CPU number matches the 'last_cpu' field, we know that this thread did no
      other FP work on any other CPU, so the FPU state on the CPU must match
      what was saved on last context switch.
      
      In that case, we can avoid the 'f[x]rstor' entirely, and just clear the
      CR0.TS bit.
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
      7e16838d
    • L
      i387: fix up some fpu_counter confusion · cea20ca3
      Linus Torvalds 提交于
      This makes sure we clear the FPU usage counter for newly created tasks,
      just so that we start off in a known state (for example, don't try to
      preload the FPU state on the first task switch etc).
      
      It also fixes a thinko in when we increment the fpu_counter at task
      switch time, introduced by commit 34ddc81a ("i387: re-introduce FPU
      state preloading at context switch time").  We should increment the
      *new* task fpu_counter, not the old task, and only if we decide to use
      that state (whether lazily or preloaded).
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
      cea20ca3
  28. 19 2月, 2012 1 次提交
    • L
      i387: re-introduce FPU state preloading at context switch time · 34ddc81a
      Linus Torvalds 提交于
      After all the FPU state cleanups and finally finding the problem that
      caused all our FPU save/restore problems, this re-introduces the
      preloading of FPU state that was removed in commit b3b0870e ("i387:
      do not preload FPU state at task switch time").
      
      However, instead of simply reverting the removal, this reimplements
      preloading with several fixes, most notably
      
       - properly abstracted as a true FPU state switch, rather than as
         open-coded save and restore with various hacks.
      
         In particular, implementing it as a proper FPU state switch allows us
         to optimize the CR0.TS flag accesses: there is no reason to set the
         TS bit only to then almost immediately clear it again.  CR0 accesses
         are quite slow and expensive, don't flip the bit back and forth for
         no good reason.
      
       - Make sure that the same model works for both x86-32 and x86-64, so
         that there are no gratuitous differences between the two due to the
         way they save and restore segment state differently due to
         architectural differences that really don't matter to the FPU state.
      
       - Avoid exposing the "preload" state to the context switch routines,
         and in particular allow the concept of lazy state restore: if nothing
         else has used the FPU in the meantime, and the process is still on
         the same CPU, we can avoid restoring state from memory entirely, just
         re-expose the state that is still in the FPU unit.
      
         That optimized lazy restore isn't actually implemented here, but the
         infrastructure is set up for it.  Of course, older CPU's that use
         'fnsave' to save the state cannot take advantage of this, since the
         state saving also trashes the state.
      
      In other words, there is now an actual _design_ to the FPU state saving,
      rather than just random historical baggage.  Hopefully it's easier to
      follow as a result.
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
      34ddc81a
  29. 17 2月, 2012 2 次提交
    • L
      i387: move AMD K7/K8 fpu fxsave/fxrstor workaround from save to restore · 4903062b
      Linus Torvalds 提交于
      The AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
      pending.  In order to not leak FIP state from one process to another, we
      need to do a floating point load after the fxsave of the old process,
      and before the fxrstor of the new FPU state.  That resets the state to
      the (uninteresting) kernel load, rather than some potentially sensitive
      user information.
      
      We used to do this directly after the FPU state save, but that is
      actually very inconvenient, since it
      
       (a) corrupts what is potentially perfectly good FPU state that we might
           want to lazy avoid restoring later and
      
       (b) on x86-64 it resulted in a very annoying ordering constraint, where
           "__unlazy_fpu()" in the task switch needs to be delayed until after
           the DS segment has been reloaded just to get the new DS value.
      
      Coupling it to the fxrstor instead of the fxsave automatically avoids
      both of these issues, and also ensures that we only do it when actually
      necessary (the FP state after a save may never actually get used).  It's
      simply a much more natural place for the leaked state cleanup.
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
      4903062b
    • L
      i387: do not preload FPU state at task switch time · b3b0870e
      Linus Torvalds 提交于
      Yes, taking the trap to re-load the FPU/MMX state is expensive, but so
      is spending several days looking for a bug in the state save/restore
      code.  And the preload code has some rather subtle interactions with
      both paravirtualization support and segment state restore, so it's not
      nearly as simple as it should be.
      
      Also, now that we no longer necessarily depend on a single bit (ie
      TS_USEDFPU) for keeping track of the state of the FPU, we might be able
      to do better.  If we are really switching between two processes that
      keep touching the FP state, save/restore is inevitable, but in the case
      of having one process that does most of the FPU usage, we may actually
      be able to do much better than the preloading.
      
      In particular, we may be able to keep track of which CPU the process ran
      on last, and also per CPU keep track of which process' FP state that CPU
      has.  For modern CPU's that don't destroy the FPU contents on save time,
      that would allow us to do a lazy restore by just re-enabling the
      existing FPU state - with no restore cost at all!
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
      b3b0870e
  30. 12 12月, 2011 1 次提交
    • F
      x86: Enter rcu extended qs after idle notifier call · e37e112d
      Frederic Weisbecker 提交于
      The idle notifier, called by enter_idle(), enters into rcu read
      side critical section but at that time we already switched into
      the RCU-idle window (rcu_idle_enter() has been called). And it's
      illegal to use rcu_read_lock() in that state.
      
      This results in rcu reporting its bad mood:
      
      [    1.275635] WARNING: at include/linux/rcupdate.h:194 __atomic_notifier_call_chain+0xd2/0x110()
      [    1.275635] Hardware name: AMD690VM-FMH
      [    1.275635] Modules linked in:
      [    1.275635] Pid: 0, comm: swapper Not tainted 3.0.0-rc6+ #252
      [    1.275635] Call Trace:
      [    1.275635]  [<ffffffff81051c8a>] warn_slowpath_common+0x7a/0xb0
      [    1.275635]  [<ffffffff81051cd5>] warn_slowpath_null+0x15/0x20
      [    1.275635]  [<ffffffff817d6f22>] __atomic_notifier_call_chain+0xd2/0x110
      [    1.275635]  [<ffffffff817d6f71>] atomic_notifier_call_chain+0x11/0x20
      [    1.275635]  [<ffffffff810018a0>] enter_idle+0x20/0x30
      [    1.275635]  [<ffffffff81001995>] cpu_idle+0xa5/0x110
      [    1.275635]  [<ffffffff817a7465>] rest_init+0xe5/0x140
      [    1.275635]  [<ffffffff817a73c8>] ? rest_init+0x48/0x140
      [    1.275635]  [<ffffffff81cc5ca3>] start_kernel+0x3d1/0x3dc
      [    1.275635]  [<ffffffff81cc5321>] x86_64_start_reservations+0x131/0x135
      [    1.275635]  [<ffffffff81cc5412>] x86_64_start_kernel+0xed/0xf4
      [    1.275635] ---[ end trace a22d306b065d4a66 ]---
      
      Fix this by entering rcu extended quiescent state later, just before
      the CPU goes to sleep.
      Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
      Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
      Cc: Ingo Molnar <mingo@redhat.com>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: H. Peter Anvin <hpa@zytor.com>
      Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
      Reviewed-by: Josh Triplett <josh@joshtriplett.org>
      e37e112d