Merge branch 'ptr_ring-fixes'

Michael S. Tsirkin says: ==================== ptr_ring fixes This fixes a bunch of issues around ptr_ring use in net core. One of these: "tap: fix use-after-free" is also needed on net, but can't be backported cleanly. I will post a net patch separately. Lightly tested - Jason, could you pls confirm this addresses the security issue you saw with ptr_ring? Testing reports would be appreciated too. ==================== Signed-off-by: N David S. Miller <davem@davemloft.net> Tested-by: N Jason Wang <jasowang@redhat.com> Acked-by: N Jason Wang <jasowang@redhat.com>

Merge branch 'ptr_ring-fixes'
Michael S. Tsirkin says: ==================== ptr_ring fixes This fixes a bunch of issues around ptr_ring use in net core. One of these: "tap: fix use-after-free" is also needed on net, but can't be backported cleanly. I will post a net patch separately. Lightly tested - Jason, could you pls confirm this addresses the security issue you saw with ptr_ring? Testing reports would be appreciated too. ==================== Signed-off-by: N David S. Miller <davem@davemloft.net> Tested-by: N Jason Wang <jasowang@redhat.com> Acked-by: N Jason Wang <jasowang@redhat.com>
bfbe5bab · David S. Miller · 7ece54a6 · 491847f3 · bfbe5bab · bfbe5bab
7 changed file
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -330,9 +330,6 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 	if (!q)
 		return RX_HANDLER_PASS;

-	if (__ptr_ring_full(&q->ring))
-		goto drop;
-
 	skb_push(skb, ETH_HLEN);

 	/* Apply the forward feature mask so that we perform segmentation

--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -45,9 +45,10 @@ struct ptr_ring {
 };

 /* Note: callers invoking this in a loop must use a compiler barrier,
- * for example cpu_relax().  If ring is ever resized, callers must hold
- * producer_lock - see e.g. ptr_ring_full.  Otherwise, if callers don't hold
- * producer_lock, the next call to __ptr_ring_produce may fail.
+ * for example cpu_relax().
+ *
+ * NB: this is unlike __ptr_ring_empty in that callers must hold producer_lock:
+ * see e.g. ptr_ring_full.
 */
 static inline bool __ptr_ring_full(struct ptr_ring *r)
 {
@@ -113,7 +114,7 @@ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
 	/* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */
 	smp_wmb();

-	r->queue[r->producer++] = ptr;
+	WRITE_ONCE(r->queue[r->producer++], ptr);
 	if (unlikely(r->producer >= r->size))
 		r->producer = 0;
 	return 0;
@@ -169,32 +170,36 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr)
 	return ret;
 }

-/* Note: callers invoking this in a loop must use a compiler barrier,
- * for example cpu_relax(). Callers must take consumer_lock
- * if they dereference the pointer - see e.g. PTR_RING_PEEK_CALL.
- * If ring is never resized, and if the pointer is merely
- * tested, there's no need to take the lock - see e.g.  __ptr_ring_empty.
- * However, if called outside the lock, and if some other CPU
- * consumes ring entries at the same time, the value returned
- * is not guaranteed to be correct.
- * In this case - to avoid incorrectly detecting the ring
- * as empty - the CPU consuming the ring entries is responsible
- * for either consuming all ring entries until the ring is empty,
- * or synchronizing with some other CPU and causing it to
- * execute __ptr_ring_peek and/or consume the ring enteries
- * after the synchronization point.
- */
 static inline void *__ptr_ring_peek(struct ptr_ring *r)
 {
 	if (likely(r->size))
-		return r->queue[r->consumer_head];
+		return READ_ONCE(r->queue[r->consumer_head]);
 	return NULL;
 }

-/* See __ptr_ring_peek above for locking rules. */
+/*
+ * Test ring empty status without taking any locks.
+ *
+ * NB: This is only safe to call if ring is never resized.
+ *
+ * However, if some other CPU consumes ring entries at the same time, the value
+ * returned is not guaranteed to be correct.
+ *
+ * In this case - to avoid incorrectly detecting the ring
+ * as empty - the CPU consuming the ring entries is responsible
+ * for either consuming all ring entries until the ring is empty,
+ * or synchronizing with some other CPU and causing it to
+ * re-test __ptr_ring_empty and/or consume the ring enteries
+ * after the synchronization point.
+ *
+ * Note: callers invoking this in a loop must use a compiler barrier,
+ * for example cpu_relax().
+ */
 static inline bool __ptr_ring_empty(struct ptr_ring *r)
 {
-	return !__ptr_ring_peek(r);
+	if (likely(r->size))
+		return !r->queue[READ_ONCE(r->consumer_head)];
+	return true;
 }

 static inline bool ptr_ring_empty(struct ptr_ring *r)
@@ -248,22 +253,28 @@ static inline void __ptr_ring_discard_one(struct ptr_ring *r)
 	/* Fundamentally, what we want to do is update consumer
 	 * index and zero out the entry so producer can reuse it.
 	 * Doing it naively at each consume would be as simple as:
-	 *       r->queue[r->consumer++] = NULL;
-	 *       if (unlikely(r->consumer >= r->size))
-	 *               r->consumer = 0;
+	 *       consumer = r->consumer;
+	 *       r->queue[consumer++] = NULL;
+	 *       if (unlikely(consumer >= r->size))
+	 *               consumer = 0;
+	 *       r->consumer = consumer;
 	 * but that is suboptimal when the ring is full as producer is writing
 	 * out new entries in the same cache line.  Defer these updates until a
 	 * batch of entries has been consumed.
 	 */
-	int head = r->consumer_head++;
+	/* Note: we must keep consumer_head valid at all times for __ptr_ring_empty
+	 * to work correctly.
+	 */
+	int consumer_head = r->consumer_head;
+	int head = consumer_head++;

 	/* Once we have processed enough entries invalidate them in
 	 * the ring all at once so producer can reuse their space in the ring.
 	 * We also do this when we reach end of the ring - not mandatory
 	 * but helps keep the implementation simple.
 	 */
-	if (unlikely(r->consumer_head - r->consumer_tail >= r->batch ||
-		     r->consumer_head >= r->size)) {
+	if (unlikely(consumer_head - r->consumer_tail >= r->batch ||
+		     consumer_head >= r->size)) {
 		/* Zero out entries in the reverse order: this way we touch the
 		 * cache line that producer might currently be reading the last;
 		 * producer won't make progress and touch other cache lines
@@ -271,12 +282,14 @@ static inline void __ptr_ring_discard_one(struct ptr_ring *r)
 		 */
 		while (likely(head >= r->consumer_tail))
 			r->queue[head--] = NULL;
-		r->consumer_tail = r->consumer_head;
+		r->consumer_tail = consumer_head;
 	}
-	if (unlikely(r->consumer_head >= r->size)) {
-		r->consumer_head = 0;
+	if (unlikely(consumer_head >= r->size)) {
+		consumer_head = 0;
 		r->consumer_tail = 0;
 	}
+	/* matching READ_ONCE in __ptr_ring_empty for lockless tests */
+	WRITE_ONCE(r->consumer_head, consumer_head);
 }

 static inline void *__ptr_ring_consume(struct ptr_ring *r)
@@ -453,12 +466,7 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,

 static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
 {
-	/* Allocate an extra dummy element at end of ring to avoid consumer head
-	 * or produce head access past the end of the array. Possible when
-	 * producer/consumer operations and __ptr_ring_peek operations run in
-	 * parallel.
-	 */
-	return kcalloc(size + 1, sizeof(void *), gfp);
+	return kcalloc(size, sizeof(void *), gfp);
 }

 static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
@@ -532,7 +540,9 @@ static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n,
 			goto done;
 		}
 		r->queue[head] = batch[--n];
-		r->consumer_tail = r->consumer_head = head;
+		r->consumer_tail = head;
+		/* matching READ_ONCE in __ptr_ring_empty for lockless tests */
+		WRITE_ONCE(r->consumer_head, head);
 	}

 done:

--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -69,7 +69,7 @@ static inline int skb_array_produce_any(struct skb_array *a, struct sk_buff *skb
 */
 static inline bool __skb_array_empty(struct skb_array *a)
 {
-	return !__ptr_ring_peek(&a->ring);
+	return __ptr_ring_empty(&a->ring);
 }

 static inline struct sk_buff *__skb_array_peek(struct skb_array *a)

--- a/tools/virtio/linux/kernel.h
+++ b/tools/virtio/linux/kernel.h
@@ -118,7 +118,7 @@ static inline void free_page(unsigned long addr)
 #define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
 #define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)

-#define WARN_ON_ONCE(cond) ((cond) && fprintf (stderr, "WARNING\n"))
+#define WARN_ON_ONCE(cond) ((cond) ? fprintf (stderr, "WARNING\n") : 0)

 #define min(x, y) ({				\
 	typeof(x) _min1 = (x);			\

--- a/tools/virtio/linux/thread_info.h
+++ b/tools/virtio/linux/thread_info.h
+#define check_copy_size(A, B, C) (1)
--- a/tools/virtio/ringtest/main.h
+++ b/tools/virtio/ringtest/main.h
@@ -111,7 +111,7 @@ static inline void busy_wait(void)
 } 

 #if defined(__x86_64__) || defined(__i386__)
-#define smp_mb()     asm volatile("lock; addl $0,-128(%%rsp)" ::: "memory", "cc")
+#define smp_mb()     asm volatile("lock; addl $0,-132(%%rsp)" ::: "memory", "cc")
 #else
 /*
 * Not using __ATOMIC_SEQ_CST since gcc docs say they are only synchronized
@@ -134,4 +134,61 @@ static inline void busy_wait(void)
    barrier(); \
 } while (0)

+#if defined(__i386__) || defined(__x86_64__) || defined(__s390x__)
+#define smp_wmb() barrier()
+#else
+#define smp_wmb() smp_release()
+#endif
+
+#ifdef __alpha__
+#define smp_read_barrier_depends() smp_acquire()
+#else
+#define smp_read_barrier_depends() do {} while(0)
+#endif
+
+static __always_inline
+void __read_once_size(const volatile void *p, void *res, int size)
+{
+        switch (size) {                                                 \
+        case 1: *(unsigned char *)res = *(volatile unsigned char *)p; break;              \
+        case 2: *(unsigned short *)res = *(volatile unsigned short *)p; break;            \
+        case 4: *(unsigned int *)res = *(volatile unsigned int *)p; break;            \
+        case 8: *(unsigned long long *)res = *(volatile unsigned long long *)p; break;            \
+        default:                                                        \
+                barrier();                                              \
+                __builtin_memcpy((void *)res, (const void *)p, size);   \
+                barrier();                                              \
+        }                                                               \
+}
+
+static __always_inline void __write_once_size(volatile void *p, void *res, int size)
+{
+	switch (size) {
+	case 1: *(volatile unsigned char *)p = *(unsigned char *)res; break;
+	case 2: *(volatile unsigned short *)p = *(unsigned short *)res; break;
+	case 4: *(volatile unsigned int *)p = *(unsigned int *)res; break;
+	case 8: *(volatile unsigned long long *)p = *(unsigned long long *)res; break;
+	default:
+		barrier();
+		__builtin_memcpy((void *)p, (const void *)res, size);
+		barrier();
+	}
+}
+
+#define READ_ONCE(x) \
+({									\
+	union { typeof(x) __val; char __c[1]; } __u;			\
+	__read_once_size(&(x), __u.__c, sizeof(x));		\
+	smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \
+	__u.__val;							\
+})
+
+#define WRITE_ONCE(x, val) \
+({							\
+	union { typeof(x) __val; char __c[1]; } __u =	\
+		{ .__val = (typeof(x)) (val) }; \
+	__write_once_size(&(x), __u.__c, sizeof(x));	\
+	__u.__val;					\
+})
+
 #endif
--- a/tools/virtio/ringtest/ptr_ring.c
+++ b/tools/virtio/ringtest/ptr_ring.c
@@ -187,7 +187,7 @@ bool enable_kick()

 bool avail_empty()
 {
-	return !__ptr_ring_peek(&array);
+	return __ptr_ring_empty(&array);
 }

 bool use_buf(unsigned *lenp, void **bufp)