diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h
index 9fdf21302fdf3b773e8dc459fccb23a26d3ab8bf..39ef8d3c54dcb99fa412a6e9bf22d09d80364659 100644
--- a/include/asm-generic/bitops/find.h
+++ b/include/asm-generic/bitops/find.h
@@ -97,4 +97,42 @@ extern unsigned long find_next_clump8(unsigned long *clump,
 #define find_first_clump8(clump, bits, size) \
 	find_next_clump8((clump), (bits), (size), 0)
 
+unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
+				unsigned long nbits, unsigned long start);
+
+#ifndef find_next_or_bit
+/**
+ * find_next_or_bit - find the next set bit in either memory region
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @size: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number for the next set bit.
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_next_or_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long size,
+		unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val;
+
+		if (unlikely(offset >= size))
+			return size;
+
+		val = (*addr1 | *addr2) & GENMASK(size - 1, offset);
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_next_or_bit(addr1, addr2, size, offset);
+}
+#endif
+
+#define for_each_or_bit(bit, addr1, addr2, size) \
+	for ((bit) = 0;								\
+	     (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
+	     (bit)++)
+
 #endif /*_ASM_GENERIC_BITOPS_FIND_H_ */
diff --git a/include/asm-generic/bitsperlong.h b/include/asm-generic/bitsperlong.h
index 3905c1c93dc206482627eda2a80902b4849d65e5..1023e2a4bd372fae7f4568f3d85c707518b2bb84 100644
--- a/include/asm-generic/bitsperlong.h
+++ b/include/asm-generic/bitsperlong.h
@@ -23,4 +23,16 @@
 #define BITS_PER_LONG_LONG 64
 #endif
 
+/*
+ * small_const_nbits(n) is true precisely when it is known at compile-time
+ * that BITMAP_SIZE(n) is 1, i.e. 1 <= n <= BITS_PER_LONG. This allows
+ * various bit/bitmap APIs to provide a fast inline implementation. Bitmaps
+ * of size 0 are very rare, and a compile-time-known size of 0 is most likely
+ * a sign of error. They will be handled correctly by the bit/bitmap APIs,
+ * but using the out-of-line functions, so that the inline implementations
+ * can unconditionally dereference the pointer(s).
+ */
+#define small_const_nbits(nbits) \
+	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
+
 #endif /* __ASM_GENERIC_BITS_PER_LONG */
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index e98da9994cdde3971de90780501dab6dd47ba00c..6f8803eaae25f9bf0cb9777a278a4f4ac4f79dcb 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -234,14 +234,6 @@ extern int bitmap_print_to_pagebuf(bool list, char *buf,
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
-/*
- * The static inlines below do not handle constant nbits==0 correctly,
- * so make such users (should any ever turn up) call the out-of-line
- * versions.
- */
-#define small_const_nbits(nbits) \
-	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
-
 static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
 	unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 0159986ac9ce7c81b480c8efed10f8d74de10359..7cdec529b1d956a4698d972d357de894fcecc335 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -91,10 +91,12 @@ extern struct cpumask __cpu_possible_mask;
 extern struct cpumask __cpu_online_mask;
 extern struct cpumask __cpu_present_mask;
 extern struct cpumask __cpu_active_mask;
+extern struct cpumask __cpu_dying_mask;
 #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
 #define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
 #define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
 #define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)
+#define cpu_dying_mask    ((const struct cpumask *)&__cpu_dying_mask)
 
 extern atomic_t __num_online_cpus;
 
@@ -118,6 +120,11 @@ static inline unsigned int num_online_cpus(void)
 #define cpu_possible(cpu)	cpumask_test_cpu((cpu), cpu_possible_mask)
 #define cpu_present(cpu)	cpumask_test_cpu((cpu), cpu_present_mask)
 #define cpu_active(cpu)		cpumask_test_cpu((cpu), cpu_active_mask)
+static inline int cpumask_test_cpu(int cpu, const struct cpumask *cpumask);
+static inline bool cpu_dying(unsigned int cpu)
+{
+	return cpumask_test_cpu(cpu, cpu_dying_mask);
+}
 #else
 #define num_online_cpus()	1U
 #define num_possible_cpus()	1U
@@ -127,6 +134,10 @@ static inline unsigned int num_online_cpus(void)
 #define cpu_possible(cpu)	((cpu) == 0)
 #define cpu_present(cpu)	((cpu) == 0)
 #define cpu_active(cpu)		((cpu) == 0)
+static inline bool cpu_dying(unsigned int cpu)
+{
+	return false;
+}
 #endif
 
 extern cpumask_t cpus_booted_once_mask;
@@ -314,6 +325,58 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool
 		(cpu) < nr_cpu_ids;)
 #endif /* SMP */
 
+/*
+ * We have several different "preferred sizes" for the cpumask
+ * operations, depending on operation.
+ *
+ * For example, the bitmap scanning and operating operations have
+ * optimized routines that work for the single-word case, but only when
+ * the size is constant. So if NR_CPUS fits in one single word, we are
+ * better off using that small constant, in order to trigger the
+ * optimized bit finding. That is 'small_cpumask_bits'.
+ *
+ * The clearing and copying operations will similarly perform better
+ * with a constant size, but we limit that size arbitrarily to four
+ * words. We call this 'large_cpumask_bits'.
+ *
+ * Finally, some operations just want the exact limit, either because
+ * they set bits or just don't have any faster fixed-sized versions. We
+ * call this just 'nr_cpumask_bits'.
+ *
+ * Note that these optional constants are always guaranteed to be at
+ * least as big as 'nr_cpu_ids' itself is, and all our cpumask
+ * allocations are at least that size (see cpumask_size()). The
+ * optimization comes from being able to potentially use a compile-time
+ * constant instead of a run-time generated exact number of CPUs.
+ */
+#if NR_CPUS <= BITS_PER_LONG
+  #define small_cpumask_bits ((unsigned int)NR_CPUS)
+  #define large_cpumask_bits ((unsigned int)NR_CPUS)
+#elif NR_CPUS <= 4*BITS_PER_LONG
+  #define small_cpumask_bits nr_cpu_ids
+  #define large_cpumask_bits ((unsigned int)NR_CPUS)
+#else
+  #define small_cpumask_bits nr_cpu_ids
+  #define large_cpumask_bits nr_cpu_ids
+#endif
+
+/**
+ * for_each_cpu_or - iterate over every cpu present in either mask
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask1: the first cpumask pointer
+ * @mask2: the second cpumask pointer
+ *
+ * This saves a temporary CPU mask in many places.  It is equivalent to:
+ *	struct cpumask tmp;
+ *	cpumask_or(&tmp, &mask1, &mask2);
+ *	for_each_cpu(cpu, &tmp)
+ *		...
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_or(cpu, mask1, mask2)				\
+	for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
+
 #define CPU_BITS_NONE						\
 {								\
 	[0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL			\
@@ -851,6 +914,14 @@ set_cpu_active(unsigned int cpu, bool active)
 		cpumask_clear_cpu(cpu, &__cpu_active_mask);
 }
 
+static inline void
+set_cpu_dying(unsigned int cpu, bool dying)
+{
+	if (dying)
+		cpumask_set_cpu(cpu, &__cpu_dying_mask);
+	else
+		cpumask_clear_cpu(cpu, &__cpu_dying_mask);
+}
 /**
  * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
  * @bitmap: the bitmap
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c06ced18f78ad37c7c93263de1ee729204c65606..2a81a9dae3a7b0d02fb02d6d042a5ea40c29f91d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -157,6 +157,9 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
 	int (*cb)(unsigned int cpu);
 	int ret, cnt;
 
+	if (cpu_dying(cpu) != !bringup)
+		set_cpu_dying(cpu, !bringup);
+
 	if (st->fail == state) {
 		st->fail = CPUHP_INVALID;
 		return -EAGAIN;
@@ -2502,6 +2505,9 @@ EXPORT_SYMBOL(__cpu_present_mask);
 struct cpumask __cpu_active_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_active_mask);
 
+struct cpumask __cpu_dying_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_dying_mask);
+
 atomic_t __num_online_cpus __read_mostly;
 EXPORT_SYMBOL(__num_online_cpus);
 
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 4a8751010d59ffb08f7bf8f25c82096ceaf12398..d9aef1da9929f6cd367517c5fd4d87faa8bd87a5 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -81,6 +81,43 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
 EXPORT_SYMBOL(find_next_bit);
 #endif
 
+/*
+ * Common helper for the find_next_bit() function family
+ * @FETCH: The expression that fetches and pre-processes each word of bitmap(s)
+ * @MUNGE: The expression that post-processes a word containing found bit (may be empty)
+ * @size: The bitmap size in bits
+ * @start: The bitnumber to start searching at
+ */
+#define FIND_NEXT_BIT(FETCH, MUNGE, size, start)				\
+({										\
+	unsigned long mask, idx, tmp, sz = (size), __start = (start);		\
+										\
+	if (unlikely(__start >= sz))						\
+		goto out;							\
+										\
+	mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start));				\
+	idx = __start / BITS_PER_LONG;						\
+										\
+	for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) {			\
+		if ((idx + 1) * BITS_PER_LONG >= sz)				\
+			goto out;						\
+		idx++;								\
+	}									\
+										\
+	sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz);			\
+out:										\
+	sz;									\
+})
+
+#ifndef find_next_or_bit
+unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
+				unsigned long nbits, unsigned long start)
+{
+	return FIND_NEXT_BIT(addr1[idx] | addr2[idx], /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_or_bit);
+#endif
+
 #ifndef find_next_zero_bit
 unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
 				 unsigned long offset)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 00f666d944867773f6c9d02587e9dcd449b5b303..8940ca14acca0789d832266952620ad58f636785 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -119,7 +119,15 @@ EXPORT_SYMBOL(percpu_counter_sync);
 
 /*
  * Add up all the per-cpu counts, return the result. This is a more accurate
- * but much slower version of percpu_counter_read_positive()
+ * but much slower version of percpu_counter_read_positive().
+ *
+ * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums
+ * from CPUs that are in the process of being taken offline. Dying CPUs have
+ * been removed from the online mask, but may not have had the hotplug dead
+ * notifier called to fold the percpu count back into the global counter sum.
+ * By including dying CPUs in the iteration mask, we avoid this race condition
+ * so __percpu_counter_sum() just does the right thing when CPUs are being taken
+ * offline.
  */
 s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
@@ -129,7 +137,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 
 	raw_spin_lock_irqsave(&fbc->lock, flags);
 	ret = fbc->count;
-	for_each_online_cpu(cpu) {
+	for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
 	}
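
A standalone illustration (editorial note, not part of the patch): the helper below, next_or_bit(), is a simplified bit-at-a-time stand-in for the word-at-a-time _find_next_or_bit() added above, and the names and sample bitmaps are invented for this sketch. It shows the semantics the new iterators provide, namely visiting every bit set in the union of two bitmaps without materialising a temporary bitmap, which is what for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) buys __percpu_counter_sum().

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/*
 * Simplified stand-in for _find_next_or_bit(): return the first bit >= start
 * that is set in a[] | b[], or nbits if no such bit exists. The real kernel
 * helper scans a word at a time; this sketch scans one bit at a time.
 */
static unsigned long next_or_bit(const unsigned long *a, const unsigned long *b,
				 unsigned long nbits, unsigned long start)
{
	for (unsigned long i = start; i < nbits; i++) {
		unsigned long word = a[i / BITS_PER_LONG] | b[i / BITS_PER_LONG];

		if (word & (1UL << (i % BITS_PER_LONG)))
			return i;
	}
	return nbits;
}

int main(void)
{
	unsigned long online[] = { 0x0bUL };	/* bits 0, 1, 3 set */
	unsigned long dying[]  = { 0x14UL };	/* bits 2, 4 set */

	/* Visits bits 0, 1, 2, 3, 4: the union of both masks, no temporary mask needed. */
	for (unsigned long bit = next_or_bit(online, dying, BITS_PER_LONG, 0);
	     bit < BITS_PER_LONG;
	     bit = next_or_bit(online, dying, BITS_PER_LONG, bit + 1))
		printf("bit %lu is set in online|dying\n", bit);

	return 0;
}

With the patch applied, the percpu_counter sum walks exactly this kind of union, so a count parked on a CPU that has already left cpu_online_mask but has not yet run its hotplug dead callback is still folded into the total.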