diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e02e3f80d363bbb87d8db6c74b2a0d5370818419..84f58de08c2bdbff7ce98119ff2e360d7d5175a1 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -521,7 +521,8 @@ do { \ static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr, const unsigned long __percpu *addr) { - unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; + unsigned long __percpu *a = + (unsigned long __percpu *)addr + nr / BITS_PER_LONG; #ifdef CONFIG_X86_64 return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0; @@ -538,7 +539,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, asm volatile("bt "__percpu_arg(2)",%1\n\t" CC_SET(c) : CC_OUT(c) (oldbit) - : "m" (*(unsigned long *)addr), "Ir" (nr)); + : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); return oldbit; } diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 4d9f233c4ba8343504b977e0273cc9a04cbbedb8..40e887068da213e595951f16caf8eb7ca83d3179 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void); #define PER_CPU_DEF_ATTRIBUTES #endif +#define raw_cpu_generic_read(pcp) \ +({ \ + *raw_cpu_ptr(&(pcp)); \ +}) + #define raw_cpu_generic_to_op(pcp, val, op) \ do { \ *raw_cpu_ptr(&(pcp)) op val; \ @@ -72,34 +77,39 @@ do { \ #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - raw_cpu_add(pcp, val); \ - raw_cpu_read(pcp); \ + typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \ + \ + *__p += val; \ + *__p; \ }) #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ + typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret = raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret = *__p; \ + *__p = nval; \ __ret; \ }) #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ + typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret = raw_cpu_read(pcp); \ + __ret = *__p; \ if (__ret == (oval)) \ - raw_cpu_write(pcp, nval); \ + *__p = nval; \ __ret; \ }) #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ ({ \ + typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1)); \ + typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2)); \ int __ret = 0; \ - if (raw_cpu_read(pcp1) == (oval1) && \ - raw_cpu_read(pcp2) == (oval2)) { \ - raw_cpu_write(pcp1, nval1); \ - raw_cpu_write(pcp2, nval2); \ + if (*__p1 == (oval1) && *__p2 == (oval2)) { \ + *__p1 = nval1; \ + *__p2 = nval2; \ __ret = 1; \ } \ (__ret); \ @@ -109,7 +119,7 @@ do { \ ({ \ typeof(pcp) __ret; \ preempt_disable(); \ - __ret = *this_cpu_ptr(&(pcp)); \ + __ret = raw_cpu_generic_read(pcp); \ preempt_enable(); \ __ret; \ }) @@ -118,17 +128,17 @@ do { \ do { \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - *raw_cpu_ptr(&(pcp)) op val; \ + raw_cpu_generic_to_op(pcp, val, op); \ raw_local_irq_restore(__flags); \ } while (0) + #define this_cpu_generic_add_return(pcp, val) \ ({ \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - raw_cpu_add(pcp, val); \ - __ret = raw_cpu_read(pcp); \ + __ret = raw_cpu_generic_add_return(pcp, val); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -138,8 +148,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret = raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret = raw_cpu_generic_xchg(pcp, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -149,9 +158,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret = raw_cpu_read(pcp); \ - if (__ret == (oval)) \ - raw_cpu_write(pcp, nval); \ + __ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -168,16 +175,16 @@ do { \ }) #ifndef raw_cpu_read_1 -#define raw_cpu_read_1(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_1(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_2 -#define raw_cpu_read_2(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_2(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_4 -#define raw_cpu_read_4(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_4(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_8 -#define raw_cpu_read_8(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_8(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_write_1 diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 27fe74948882e9558ad8cc72dfa6da1b7051dce7..9ac959ef4cae972374540f34895465507e656d7b 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -33,6 +33,7 @@ #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) +static DEFINE_SPINLOCK(percpu_ref_switch_lock); static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) @@ -82,6 +83,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, atomic_long_set(&ref->count, start_count); ref->release = release; + ref->confirm_switch = NULL; return 0; } EXPORT_SYMBOL_GPL(percpu_ref_init); @@ -101,6 +103,8 @@ void percpu_ref_exit(struct percpu_ref *ref) unsigned long __percpu *percpu_count = percpu_count_ptr(ref); if (percpu_count) { + /* non-NULL confirm_switch indicates switching in progress */ + WARN_ON_ONCE(ref->confirm_switch); free_percpu(percpu_count); ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } @@ -161,66 +165,23 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { - if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) { - /* switching from percpu to atomic */ - ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; - - /* - * Non-NULL ->confirm_switch is used to indicate that - * switching is in progress. Use noop one if unspecified. - */ - WARN_ON_ONCE(ref->confirm_switch); - ref->confirm_switch = - confirm_switch ?: percpu_ref_noop_confirm_switch; - - percpu_ref_get(ref); /* put after confirmation */ - call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); - } else if (confirm_switch) { - /* - * Somebody already set ATOMIC. Switching may still be in - * progress. @confirm_switch must be invoked after the - * switching is complete and a full sched RCU grace period - * has passed. Wait synchronously for the previous - * switching and schedule @confirm_switch invocation. - */ - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - ref->confirm_switch = confirm_switch; - - percpu_ref_get(ref); /* put after confirmation */ - call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu); + if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { + if (confirm_switch) + confirm_switch(ref); + return; } -} -/** - * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode - * @ref: percpu_ref to switch to atomic mode - * @confirm_switch: optional confirmation callback - * - * There's no reason to use this function for the usual reference counting. - * Use percpu_ref_kill[_and_confirm](). - * - * Schedule switching of @ref to atomic mode. All its percpu counts will - * be collected to the main atomic counter. On completion, when all CPUs - * are guaraneed to be in atomic mode, @confirm_switch, which may not - * block, is invoked. This function may be invoked concurrently with all - * the get/put operations and can safely be mixed with kill and reinit - * operations. Note that @ref will stay in atomic mode across kill/reinit - * cycles until percpu_ref_switch_to_percpu() is called. - * - * This function normally doesn't block and can be called from any context - * but it may block if @confirm_kill is specified and @ref is already in - * the process of switching to atomic mode. In such cases, @confirm_switch - * will be invoked after the switching is complete. - * - * Due to the way percpu_ref is implemented, @confirm_switch will be called - * after at least one full sched RCU grace period has passed but this is an - * implementation detail and must not be depended upon. - */ -void percpu_ref_switch_to_atomic(struct percpu_ref *ref, - percpu_ref_func_t *confirm_switch) -{ - ref->force_atomic = true; - __percpu_ref_switch_to_atomic(ref, confirm_switch); + /* switching from percpu to atomic */ + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + + /* + * Non-NULL ->confirm_switch is used to indicate that switching is + * in progress. Use noop one if unspecified. + */ + ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch; + + percpu_ref_get(ref); /* put after confirmation */ + call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); } static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) @@ -233,8 +194,6 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) return; - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); /* @@ -250,6 +209,58 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } +static void __percpu_ref_switch_mode(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + lockdep_assert_held(&percpu_ref_switch_lock); + + /* + * If the previous ATOMIC switching hasn't finished yet, wait for + * its completion. If the caller ensures that ATOMIC switching + * isn't in progress, this function can be called from any context. + */ + wait_event_lock_irq(percpu_ref_switch_waitq, !ref->confirm_switch, + percpu_ref_switch_lock); + + if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD)) + __percpu_ref_switch_to_atomic(ref, confirm_switch); + else + __percpu_ref_switch_to_percpu(ref); +} + +/** + * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode + * @ref: percpu_ref to switch to atomic mode + * @confirm_switch: optional confirmation callback + * + * There's no reason to use this function for the usual reference counting. + * Use percpu_ref_kill[_and_confirm](). + * + * Schedule switching of @ref to atomic mode. All its percpu counts will + * be collected to the main atomic counter. On completion, when all CPUs + * are guaraneed to be in atomic mode, @confirm_switch, which may not + * block, is invoked. This function may be invoked concurrently with all + * the get/put operations and can safely be mixed with kill and reinit + * operations. Note that @ref will stay in atomic mode across kill/reinit + * cycles until percpu_ref_switch_to_percpu() is called. + * + * This function may block if @ref is in the process of switching to atomic + * mode. If the caller ensures that @ref is not in the process of + * switching to atomic mode, this function can be called from any context. + */ +void percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + + ref->force_atomic = true; + __percpu_ref_switch_mode(ref, confirm_switch); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); +} + /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode * @ref: percpu_ref to switch to percpu mode @@ -264,17 +275,20 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) * dying or dead, the actual switching takes place on the following * percpu_ref_reinit(). * - * This function normally doesn't block and can be called from any context - * but it may block if @ref is in the process of switching to atomic mode - * by percpu_ref_switch_atomic(). + * This function may block if @ref is in the process of switching to atomic + * mode. If the caller ensures that @ref is not in the process of + * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + ref->force_atomic = false; + __percpu_ref_switch_mode(ref, NULL); - /* a dying or dead ref can't be switched to percpu mode w/o reinit */ - if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) - __percpu_ref_switch_to_percpu(ref); + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } /** @@ -290,21 +304,23 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) * * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the - * process of switching to atomic mode by percpu_ref_switch_atomic(). - * - * Due to the way percpu_ref is implemented, @confirm_switch will be called - * after at least one full sched RCU grace period has passed but this is an - * implementation detail and must not be depended upon. + * process of switching to atomic mode by percpu_ref_switch_to_atomic(). */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, "%s called more than once on %pf!", __func__, ref->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; - __percpu_ref_switch_to_atomic(ref, confirm_kill); + __percpu_ref_switch_mode(ref, confirm_kill); percpu_ref_put(ref); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); @@ -321,11 +337,16 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); */ void percpu_ref_reinit(struct percpu_ref *ref) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); - if (!ref->force_atomic) - __percpu_ref_switch_to_percpu(ref); + __percpu_ref_switch_mode(ref, NULL); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); diff --git a/mm/percpu.c b/mm/percpu.c index 9903830aaebbf00256fbb108891c2e5e09baecac..255714302394137e37e4d8bd5cf87d685732d9d6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1961,8 +1961,9 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai; - size_t size_sum, areas_size, max_distance; - int group, i, rc; + size_t size_sum, areas_size; + unsigned long max_distance; + int group, i, highest_group, rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, cpu_distance_fn); @@ -1978,7 +1979,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, goto out_free; } - /* allocate, copy and determine base address */ + /* allocate, copy and determine base address & max_distance */ + highest_group = 0; for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; unsigned int cpu = NR_CPUS; @@ -1999,6 +2001,21 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, areas[group] = ptr; base = min(ptr, base); + if (ptr > areas[highest_group]) + highest_group = group; + } + max_distance = areas[highest_group] - base; + max_distance += ai->unit_size * ai->groups[highest_group].nr_units; + + /* warn if maximum distance is further than 75% of vmalloc space */ + if (max_distance > VMALLOC_TOTAL * 3 / 4) { + pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n", + max_distance, VMALLOC_TOTAL); +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + /* and fail if we have fallback */ + rc = -EINVAL; + goto out_free_areas; +#endif } /* @@ -2023,23 +2040,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, } /* base address is now known, determine group base offsets */ - max_distance = 0; for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; - max_distance = max_t(size_t, max_distance, - ai->groups[group].base_offset); - } - max_distance += ai->unit_size; - - /* warn if maximum distance is further than 75% of vmalloc space */ - if (max_distance > VMALLOC_TOTAL * 3 / 4) { - pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n", - max_distance, VMALLOC_TOTAL); -#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK - /* and fail if we have fallback */ - rc = -EINVAL; - goto out_free; -#endif } pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",