diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index e02e3f80d363bbb87d8db6c74b2a0d5370818419..84f58de08c2bdbff7ce98119ff2e360d7d5175a1 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -521,7 +521,8 @@ do {									\
 static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
                         const unsigned long __percpu *addr)
 {
-	unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
+	unsigned long __percpu *a =
+		(unsigned long __percpu *)addr + nr / BITS_PER_LONG;
 
 #ifdef CONFIG_X86_64
 	return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
@@ -538,7 +539,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
 	asm volatile("bt "__percpu_arg(2)",%1\n\t"
 			CC_SET(c)
 			: CC_OUT(c) (oldbit)
-			: "m" (*(unsigned long *)addr), "Ir" (nr));
+			: "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
 
 	return oldbit;
 }
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 4d9f233c4ba8343504b977e0273cc9a04cbbedb8..40e887068da213e595951f16caf8eb7ca83d3179 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void);
 #define PER_CPU_DEF_ATTRIBUTES
 #endif
 
+#define raw_cpu_generic_read(pcp)					\
+({									\
+	*raw_cpu_ptr(&(pcp));						\
+})
+
 #define raw_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	*raw_cpu_ptr(&(pcp)) op val;					\
@@ -72,34 +77,39 @@ do {									\
 
 #define raw_cpu_generic_add_return(pcp, val)				\
 ({									\
-	raw_cpu_add(pcp, val);						\
-	raw_cpu_read(pcp);						\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
+									\
+	*__p += val;							\
+	*__p;								\
 })
 
 #define raw_cpu_generic_xchg(pcp, nval)					\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = *__p;							\
+	*__p = nval;							\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg(pcp, oval, nval)			\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = *__p;							\
 	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+		*__p = nval;						\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
 ({									\
+	typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1));			\
+	typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2));			\
 	int __ret = 0;							\
-	if (raw_cpu_read(pcp1) == (oval1) &&				\
-			 raw_cpu_read(pcp2)  == (oval2)) {		\
-		raw_cpu_write(pcp1, nval1);				\
-		raw_cpu_write(pcp2, nval2);				\
+	if (*__p1 == (oval1) && *__p2  == (oval2)) {			\
+		*__p1 = nval1;						\
+		*__p2 = nval2;						\
 		__ret = 1;						\
 	}								\
 	(__ret);							\
@@ -109,7 +119,7 @@ do {									\
 ({									\
 	typeof(pcp) __ret;						\
 	preempt_disable();						\
-	__ret = *this_cpu_ptr(&(pcp));					\
+	__ret = raw_cpu_generic_read(pcp);				\
 	preempt_enable();						\
 	__ret;								\
 })
@@ -118,17 +128,17 @@ do {									\
 do {									\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	*raw_cpu_ptr(&(pcp)) op val;					\
+	raw_cpu_generic_to_op(pcp, val, op);				\
 	raw_local_irq_restore(__flags);					\
 } while (0)
 
+
 #define this_cpu_generic_add_return(pcp, val)				\
 ({									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	raw_cpu_add(pcp, val);						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = raw_cpu_generic_add_return(pcp, val);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -138,8 +148,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = raw_cpu_generic_xchg(pcp, nval);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -149,9 +158,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+	__ret = raw_cpu_generic_cmpxchg(pcp, oval, nval);		\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -168,16 +175,16 @@ do {									\
 })
 
 #ifndef raw_cpu_read_1
-#define raw_cpu_read_1(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_1(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_2
-#define raw_cpu_read_2(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_2(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_4
-#define raw_cpu_read_4(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_4(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_8
-#define raw_cpu_read_8(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_8(pcp)		raw_cpu_generic_read(pcp)
 #endif
 
 #ifndef raw_cpu_write_1
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 27fe74948882e9558ad8cc72dfa6da1b7051dce7..9ac959ef4cae972374540f34895465507e656d7b 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -33,6 +33,7 @@
 
 #define PERCPU_COUNT_BIAS	(1LU << (BITS_PER_LONG - 1))
 
+static DEFINE_SPINLOCK(percpu_ref_switch_lock);
 static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);
 
 static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
@@ -82,6 +83,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
 	atomic_long_set(&ref->count, start_count);
 
 	ref->release = release;
+	ref->confirm_switch = NULL;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(percpu_ref_init);
@@ -101,6 +103,8 @@ void percpu_ref_exit(struct percpu_ref *ref)
 	unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
 
 	if (percpu_count) {
+		/* non-NULL confirm_switch indicates switching in progress */
+		WARN_ON_ONCE(ref->confirm_switch);
 		free_percpu(percpu_count);
 		ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
 	}
@@ -161,66 +165,23 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref)
 static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
 					  percpu_ref_func_t *confirm_switch)
 {
-	if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) {
-		/* switching from percpu to atomic */
-		ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
-
-		/*
-		 * Non-NULL ->confirm_switch is used to indicate that
-		 * switching is in progress.  Use noop one if unspecified.
-		 */
-		WARN_ON_ONCE(ref->confirm_switch);
-		ref->confirm_switch =
-			confirm_switch ?: percpu_ref_noop_confirm_switch;
-
-		percpu_ref_get(ref);	/* put after confirmation */
-		call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
-	} else if (confirm_switch) {
-		/*
-		 * Somebody already set ATOMIC.  Switching may still be in
-		 * progress.  @confirm_switch must be invoked after the
-		 * switching is complete and a full sched RCU grace period
-		 * has passed.  Wait synchronously for the previous
-		 * switching and schedule @confirm_switch invocation.
-		 */
-		wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
-		ref->confirm_switch = confirm_switch;
-
-		percpu_ref_get(ref);	/* put after confirmation */
-		call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu);
+	if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) {
+		if (confirm_switch)
+			confirm_switch(ref);
+		return;
 	}
-}
 
-/**
- * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
- * @ref: percpu_ref to switch to atomic mode
- * @confirm_switch: optional confirmation callback
- *
- * There's no reason to use this function for the usual reference counting.
- * Use percpu_ref_kill[_and_confirm]().
- *
- * Schedule switching of @ref to atomic mode.  All its percpu counts will
- * be collected to the main atomic counter.  On completion, when all CPUs
- * are guaraneed to be in atomic mode, @confirm_switch, which may not
- * block, is invoked.  This function may be invoked concurrently with all
- * the get/put operations and can safely be mixed with kill and reinit
- * operations.  Note that @ref will stay in atomic mode across kill/reinit
- * cycles until percpu_ref_switch_to_percpu() is called.
- *
- * This function normally doesn't block and can be called from any context
- * but it may block if @confirm_kill is specified and @ref is already in
- * the process of switching to atomic mode.  In such cases, @confirm_switch
- * will be invoked after the switching is complete.
- *
- * Due to the way percpu_ref is implemented, @confirm_switch will be called
- * after at least one full sched RCU grace period has passed but this is an
- * implementation detail and must not be depended upon.
- */
-void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
-				 percpu_ref_func_t *confirm_switch)
-{
-	ref->force_atomic = true;
-	__percpu_ref_switch_to_atomic(ref, confirm_switch);
+	/* switching from percpu to atomic */
+	ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
+
+	/*
+	 * Non-NULL ->confirm_switch is used to indicate that switching is
+	 * in progress.  Use noop one if unspecified.
+	 */
+	ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch;
+
+	percpu_ref_get(ref);	/* put after confirmation */
+	call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
 }
 
 static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
@@ -233,8 +194,6 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 	if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
 		return;
 
-	wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
-
 	atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
 
 	/*
@@ -250,6 +209,58 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 			  ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
 }
 
+static void __percpu_ref_switch_mode(struct percpu_ref *ref,
+				     percpu_ref_func_t *confirm_switch)
+{
+	lockdep_assert_held(&percpu_ref_switch_lock);
+
+	/*
+	 * If the previous ATOMIC switching hasn't finished yet, wait for
+	 * its completion.  If the caller ensures that ATOMIC switching
+	 * isn't in progress, this function can be called from any context.
+	 */
+	wait_event_lock_irq(percpu_ref_switch_waitq, !ref->confirm_switch,
+			    percpu_ref_switch_lock);
+
+	if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD))
+		__percpu_ref_switch_to_atomic(ref, confirm_switch);
+	else
+		__percpu_ref_switch_to_percpu(ref);
+}
+
+/**
+ * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
+ * @ref: percpu_ref to switch to atomic mode
+ * @confirm_switch: optional confirmation callback
+ *
+ * There's no reason to use this function for the usual reference counting.
+ * Use percpu_ref_kill[_and_confirm]().
+ *
+ * Schedule switching of @ref to atomic mode.  All its percpu counts will
+ * be collected to the main atomic counter.  On completion, when all CPUs
+ * are guaraneed to be in atomic mode, @confirm_switch, which may not
+ * block, is invoked.  This function may be invoked concurrently with all
+ * the get/put operations and can safely be mixed with kill and reinit
+ * operations.  Note that @ref will stay in atomic mode across kill/reinit
+ * cycles until percpu_ref_switch_to_percpu() is called.
+ *
+ * This function may block if @ref is in the process of switching to atomic
+ * mode.  If the caller ensures that @ref is not in the process of
+ * switching to atomic mode, this function can be called from any context.
+ */
+void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
+				 percpu_ref_func_t *confirm_switch)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
+	ref->force_atomic = true;
+	__percpu_ref_switch_mode(ref, confirm_switch);
+
+	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+}
+
 /**
  * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
  * @ref: percpu_ref to switch to percpu mode
@@ -264,17 +275,20 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
  * dying or dead, the actual switching takes place on the following
  * percpu_ref_reinit().
  *
- * This function normally doesn't block and can be called from any context
- * but it may block if @ref is in the process of switching to atomic mode
- * by percpu_ref_switch_atomic().
+ * This function may block if @ref is in the process of switching to atomic
+ * mode.  If the caller ensures that @ref is not in the process of
+ * switching to atomic mode, this function can be called from any context.
  */
 void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
 	ref->force_atomic = false;
+	__percpu_ref_switch_mode(ref, NULL);
 
-	/* a dying or dead ref can't be switched to percpu mode w/o reinit */
-	if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD))
-		__percpu_ref_switch_to_percpu(ref);
+	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
 
 /**
@@ -290,21 +304,23 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
  *
  * This function normally doesn't block and can be called from any context
  * but it may block if @confirm_kill is specified and @ref is in the
- * process of switching to atomic mode by percpu_ref_switch_atomic().
- *
- * Due to the way percpu_ref is implemented, @confirm_switch will be called
- * after at least one full sched RCU grace period has passed but this is an
- * implementation detail and must not be depended upon.
+ * process of switching to atomic mode by percpu_ref_switch_to_atomic().
  */
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_kill)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
 	WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD,
 		  "%s called more than once on %pf!", __func__, ref->release);
 
 	ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
-	__percpu_ref_switch_to_atomic(ref, confirm_kill);
+	__percpu_ref_switch_mode(ref, confirm_kill);
 	percpu_ref_put(ref);
+
+	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
 EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
 
@@ -321,11 +337,16 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
  */
 void percpu_ref_reinit(struct percpu_ref *ref)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
 	WARN_ON_ONCE(!percpu_ref_is_zero(ref));
 
 	ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
 	percpu_ref_get(ref);
-	if (!ref->force_atomic)
-		__percpu_ref_switch_to_percpu(ref);
+	__percpu_ref_switch_mode(ref, NULL);
+
+	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
 EXPORT_SYMBOL_GPL(percpu_ref_reinit);
diff --git a/mm/percpu.c b/mm/percpu.c
index 9903830aaebbf00256fbb108891c2e5e09baecac..255714302394137e37e4d8bd5cf87d685732d9d6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1961,8 +1961,9 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 	void *base = (void *)ULONG_MAX;
 	void **areas = NULL;
 	struct pcpu_alloc_info *ai;
-	size_t size_sum, areas_size, max_distance;
-	int group, i, rc;
+	size_t size_sum, areas_size;
+	unsigned long max_distance;
+	int group, i, highest_group, rc;
 
 	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
 				   cpu_distance_fn);
@@ -1978,7 +1979,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 		goto out_free;
 	}
 
-	/* allocate, copy and determine base address */
+	/* allocate, copy and determine base address & max_distance */
+	highest_group = 0;
 	for (group = 0; group < ai->nr_groups; group++) {
 		struct pcpu_group_info *gi = &ai->groups[group];
 		unsigned int cpu = NR_CPUS;
@@ -1999,6 +2001,21 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 		areas[group] = ptr;
 
 		base = min(ptr, base);
+		if (ptr > areas[highest_group])
+			highest_group = group;
+	}
+	max_distance = areas[highest_group] - base;
+	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
+
+	/* warn if maximum distance is further than 75% of vmalloc space */
+	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
+		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
+				max_distance, VMALLOC_TOTAL);
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+		/* and fail if we have fallback */
+		rc = -EINVAL;
+		goto out_free_areas;
+#endif
 	}
 
 	/*
@@ -2023,23 +2040,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 	}
 
 	/* base address is now known, determine group base offsets */
-	max_distance = 0;
 	for (group = 0; group < ai->nr_groups; group++) {
 		ai->groups[group].base_offset = areas[group] - base;
-		max_distance = max_t(size_t, max_distance,
-				     ai->groups[group].base_offset);
-	}
-	max_distance += ai->unit_size;
-
-	/* warn if maximum distance is further than 75% of vmalloc space */
-	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
-		pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n",
-			max_distance, VMALLOC_TOTAL);
-#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
-		/* and fail if we have fallback */
-		rc = -EINVAL;
-		goto out_free;
-#endif
 	}
 
 	pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",