diff --git a/include/asm-x86_64/i387.h b/include/asm-x86_64/i387.h
index aa39cfd0e0016f212a124097bb4bf5e563251539..57f7e14338492df5c8c1b37f54658d6f61894afd 100644
--- a/include/asm-x86_64/i387.h
+++ b/include/asm-x86_64/i387.h
@@ -75,7 +75,8 @@ extern int set_fpregs(struct task_struct *tsk,
 static inline int restore_fpu_checking(struct i387_fxsave_struct *fx) 
 { 
 	int err;
-	asm volatile("1:  rex64 ; fxrstor (%[fx])\n\t"
+
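+	/* The "m" (*fx) input below tells gcc that the fxsave image itself
+	   is read by the asm, not just the pointer passed in [fx]. */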
+	asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
 		     "2:\n"
 		     ".section .fixup,\"ax\"\n"
 		     "3:  movl $-1,%[err]\n"
@@ -86,7 +87,11 @@ static inline int restore_fpu_checking(struct i387_fxsave_struct *fx)
 		     "   .quad  1b,3b\n"
 		     ".previous"
 		     : [err] "=r" (err)
-		     : [fx] "r" (fx), "0" (0)); 
+#if 0 /* See comment in __fxsave_clear() below. */
+		     : [fx] "r" (fx), "m" (*fx), "0" (0));
+#else
+		     : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
+#endif
 	if (unlikely(err))
 		init_fpu(current);
 	return err;
@@ -95,7 +100,8 @@ static inline int restore_fpu_checking(struct i387_fxsave_struct *fx)
 static inline int save_i387_checking(struct i387_fxsave_struct __user *fx) 
 { 
 	int err;
-	asm volatile("1:  rex64 ; fxsave (%[fx])\n\t"
+
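+	/* The "=m" (*fx) output below marks the user save area as written
+	   by the asm, not just addressed through the [fx] pointer. */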
+	asm volatile("1:  rex64/fxsave (%[fx])\n\t"
 		     "2:\n"
 		     ".section .fixup,\"ax\"\n"
 		     "3:  movl $-1,%[err]\n"
@@ -105,20 +111,53 @@ static inline int save_i387_checking(struct i387_fxsave_struct __user *fx)
 		     "   .align 8\n"
 		     "   .quad  1b,3b\n"
 		     ".previous"
-		     : [err] "=r" (err)
-		     : [fx] "r" (fx), "0" (0)); 
+		     : [err] "=r" (err), "=m" (*fx)
+#if 0 /* See comment in __fxsave_clear() below. */
+		     : [fx] "r" (fx), "0" (0));
+#else
+		     : [fx] "cdaSDb" (fx), "0" (0));
+#endif
 	if (unlikely(err))
 		__clear_user(fx, sizeof(struct i387_fxsave_struct));
 	return err;
 } 
 
+static inline void __fxsave_clear(struct task_struct *tsk)
+{
+	/* Using "rex64; fxsave %0" is broken because, if the memory operand
+	   uses any extended registers for addressing, a second REX prefix
+	   will be generated (to the assembler, rex64 followed by semicolon
+	   is a separate instruction), and hence the 64-bitness is lost. */
+#if 0
+	/* Using "fxsaveq %0" would be the ideal choice, but is only supported
+	   starting with gas 2.16. */
+	__asm__ __volatile__("fxsaveq %0"
+			     : "=m" (tsk->thread.i387.fxsave));
+#elif 0
+	/* The properly prefixed form below, used here as a workaround, isn't
+	   accepted by any binutils version released so far: the assembler
+	   complains that the same type of prefix is used twice if an
+	   extended register is needed for addressing (fix submitted to
+	   mainline 2005-11-21). */
+	__asm__ __volatile__("rex64/fxsave %0"
+			     : "=m" (tsk->thread.i387.fxsave));
+#else
+	/* This, however, we can work around by forcing the compiler to select
+	   an addressing mode that doesn't require extended registers. */
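+	/* "cdaSDb" restricts the address to one of the legacy registers
+	   rax/rbx/rcx/rdx/rsi/rdi (never r8-r15), and %P2 expands to the
+	   bare byte offset, so the operand never needs REX bits of its own
+	   and the explicit rex64 remains the only REX prefix. */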
+	__asm__ __volatile__("rex64/fxsave %P2(%1)"
+			     : "=m" (tsk->thread.i387.fxsave)
+			     : "cdaSDb" (tsk),
+				"i" (offsetof(__typeof__(*tsk),
+					      thread.i387.fxsave)));
+#endif
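+	/* As with the old "fxsave ; fnclex" pair, clear any pending x87
+	   exception flags once the state has been saved (fnclex is the
+	   non-waiting form of fclex). */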
+	__asm__ __volatile__("fnclex");
+}
+
 static inline void kernel_fpu_begin(void)
 {
 	struct thread_info *me = current_thread_info();
 	preempt_disable();
-	if (me->status & TS_USEDFPU) { 
-		asm volatile("rex64 ; fxsave %0 ; fnclex"
-			      : "=m" (me->task->thread.i387.fxsave));
+	if (me->status & TS_USEDFPU) {
+		__fxsave_clear(me->task);
 		me->status &= ~TS_USEDFPU;
 		return;
 	}
@@ -133,8 +172,7 @@ static inline void kernel_fpu_end(void)
 
 static inline void save_init_fpu( struct task_struct *tsk )
 {
-	asm volatile( "rex64 ; fxsave %0 ; fnclex"
-		      : "=m" (tsk->thread.i387.fxsave));
+	__fxsave_clear(tsk);
 	tsk->thread_info->status &= ~TS_USEDFPU;
 	stts();
 }