commit d0725992 authored by Thomas Gleixner

futex: Fix the write access fault problem for real

commit 64d1304a (futex: setup writeable mapping for futex ops which
modify user space data) addressed only half of the write access fault
problem.

The patch was based on two wrong assumptions:

1) access_ok(VERIFY_WRITE,...) would actually check write access.

   On x86 it does _NOT_. It's a pure address range check.

2) a RW mapped region can not go away under us.

   That's wrong as well. Nobody can prevent another thread from calling
   mprotect(PROT_READ) on the region where the futex resides. If that
   call hits between the get_user_pages_fast() verification and the
   actual write access in the atomic region, we are toast again.

The solution is not to rely on access_ok() and get_user() for any
write-access-related fault on private and shared futexes. Instead we
need to fault the page in with verification of write access.

There is no generic non destructive write mechanism which would fault
the user page in through a #PF, but as we already know that we will
fault, we might as well call get_user_pages() directly and avoid the
#PF overhead.

If get_user_pages() returns -EFAULT we know that we cannot fix it
anymore and need to bail out to user space.

Remove a bunch of confusing comments on this issue as well.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@kernel.org
parent c82e6d45
@@ -284,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)
 	drop_futex_key_refs(key);
 }
 
+/*
+ * fault_in_user_writeable - fault in user address and verify RW access
+ * @uaddr:	pointer to faulting user space address
+ *
+ * Slow path to fixup the fault we just took in the atomic write
+ * access to @uaddr.
+ *
+ * We have no generic implementation of a non destructive write to the
+ * user address. We know that we faulted in the atomic pagefault
+ * disabled section so we can as well avoid the #PF overhead by
+ * calling get_user_pages() right away.
+ */
+static int fault_in_user_writeable(u32 __user *uaddr)
+{
+	int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
+				 sizeof(*uaddr), 1, 0, NULL, NULL);
+	return ret < 0 ? ret : 0;
+}
+
 /**
  * futex_top_waiter() - Return the highest priority waiter on a futex
  * @hb:     the hash bucket the futex_q's reside in
@@ -896,7 +915,6 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 retry_private:
 	op_ret = futex_atomic_op_inuser(op, uaddr2);
 	if (unlikely(op_ret < 0)) {
-		u32 dummy;
 
 		double_unlock_hb(hb1, hb2);
@@ -914,7 +932,7 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 			goto out_put_keys;
 		}
 
-		ret = get_user(dummy, uaddr2);
+		ret = fault_in_user_writeable(uaddr2);
 		if (ret)
 			goto out_put_keys;
@@ -1204,7 +1222,7 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 			double_unlock_hb(hb1, hb2);
 			put_futex_key(fshared, &key2);
 			put_futex_key(fshared, &key1);
-			ret = get_user(curval2, uaddr2);
+			ret = fault_in_user_writeable(uaddr2);
 			if (!ret)
 				goto retry;
 			goto out;
@@ -1482,7 +1500,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 handle_fault:
 	spin_unlock(q->lock_ptr);
 
-	ret = get_user(uval, uaddr);
+	ret = fault_in_user_writeable(uaddr);
 
 	spin_lock(q->lock_ptr);
@@ -1807,7 +1825,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct futex_hash_bucket *hb;
-	u32 uval;
 	struct futex_q q;
 	int res, ret;
@@ -1909,16 +1926,9 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
-	/*
-	 * We have to r/w  *(int __user *)uaddr, and we have to modify it
-	 * atomically.  Therefore, if we continue to fault after get_user()
-	 * below, we need to handle the fault ourselves, while still holding
-	 * the mmap_sem.  This can occur if the uaddr is under contention as
-	 * we have to drop the mmap_sem in order to call get_user().
-	 */
 	queue_unlock(&q, hb);
 
-	ret = get_user(uval, uaddr);
+	ret = fault_in_user_writeable(uaddr);
 	if (ret)
 		goto out_put_key;
@@ -2013,17 +2023,10 @@ static int futex_unlock_pi(u32 __user *uaddr, int fshared)
 	return ret;
 
 pi_faulted:
-	/*
-	 * We have to r/w  *(int __user *)uaddr, and we have to modify it
-	 * atomically.  Therefore, if we continue to fault after get_user()
-	 * below, we need to handle the fault ourselves, while still holding
-	 * the mmap_sem.  This can occur if the uaddr is under contention as
-	 * we have to drop the mmap_sem in order to call get_user().
-	 */
 	spin_unlock(&hb->lock);
 	put_futex_key(fshared, &key);
 
-	ret = get_user(uval, uaddr);
+	ret = fault_in_user_writeable(uaddr);
 	if (!ret)
 		goto retry;