Commit 28e92f99 authored by Linus Torvalds

Merge branch 'core-rcu-2021.07.04' of...

Merge branch 'core-rcu-2021.07.04' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu

Pull RCU updates from Paul McKenney:

 - Bitmap parsing support for "all" as an alias for all bits

 - Documentation updates

 - Miscellaneous fixes, including some that overlap into mm and lockdep

 - kvfree_rcu() updates

 - mem_dump_obj() updates, with acks from one of the slab-allocator
   maintainers

 - RCU NOCB CPU updates, including limited deoffloading

 - SRCU updates

 - Tasks-RCU updates

 - Torture-test updates

* 'core-rcu-2021.07.04' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu: (78 commits)
  tasks-rcu: Make show_rcu_tasks_gp_kthreads() be static inline
  rcu-tasks: Make ksoftirqd provide RCU Tasks quiescent states
  rcu: Add missing __releases() annotation
  rcu: Remove obsolete rcu_read_unlock() deadlock commentary
  rcu: Improve comments describing RCU read-side critical sections
  rcu: Create an unrcu_pointer() to remove __rcu from a pointer
  srcu: Early test SRCU polling start
  rcu: Fix various typos in comments
  rcu/nocb: Unify timers
  rcu/nocb: Prepare for fine-grained deferred wakeup
  rcu/nocb: Only cancel nocb timer if not polling
  rcu/nocb: Delete bypass_timer upon nocb_gp wakeup
  rcu/nocb: Cancel nocb_timer upon nocb_gp wakeup
  rcu/nocb: Allow de-offloading rdp leader
  rcu/nocb: Directly call __wake_nocb_gp() from bypass timer
  rcu: Don't penalize priority boosting when there is nothing to boost
  rcu: Point to documentation of ordering guarantees
  rcu: Make rcu_gp_cleanup() be noinline for tracing
  rcu: Restrict RCU_STRICT_GRACE_PERIOD to at most four CPUs
  rcu: Make show_rcu_gp_kthreads() dump rcu_node structures blocking GP
  ...
......@@ -21,7 +21,7 @@ Any code that happens after the end of a given RCU grace period is guaranteed
to see the effects of all accesses prior to the beginning of that grace
period that are within RCU read-side critical sections.
Similarly, any code that happens before the beginning of a given RCU grace
period is guaranteed to see the effects of all accesses following the end
period is guaranteed to not see the effects of all accesses following the end
of that grace period that are within RCU read-side critical sections.
Note well that RCU-sched read-side critical sections include any region
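As a hedged illustration of the first guarantee documented in the hunk above: once synchronize_rcu() returns, the updater is guaranteed to see any access a pre-existing reader made inside its read-side critical section. The struct foo, gp, reader() and updater() names below are illustrative, not part of this patch.

/* Hedged sketch; assumes <linux/rcupdate.h> and <linux/slab.h>. */
struct foo {
	int a;
};
static struct foo __rcu *gp;

static void reader(void)
{
	struct foo *p;

	rcu_read_lock();
	p = rcu_dereference(gp);
	if (p)
		WRITE_ONCE(p->a, 1);	/* access made inside the critical section */
	rcu_read_unlock();
}

static void updater(struct foo *newp)
{
	struct foo *oldp;

	oldp = rcu_dereference_protected(gp, 1);	/* update side; "1" stands in for a real lockdep condition */
	rcu_assign_pointer(gp, newp);
	synchronize_rcu();	/* wait for pre-existing readers */
	/* Guaranteed to observe reader()'s write to oldp->a if that reader's
	 * critical section began before this grace period did. */
	kfree(oldp);
}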
......@@ -339,14 +339,14 @@ The diagram below shows the path of ordering if the leftmost
leftmost ``rcu_node`` structure offlines its last CPU and if the next
``rcu_node`` structure has no online CPUs).
.. kernel-figure:: TreeRCU-gp-init-1.svg
.. kernel-figure:: TreeRCU-gp-init-2.svg
The final ``rcu_gp_init()`` pass through the ``rcu_node`` tree traverses
breadth-first, setting each ``rcu_node`` structure's ``->gp_seq`` field
to the newly advanced value from the ``rcu_state`` structure, as shown
in the following diagram.
.. kernel-figure:: TreeRCU-gp-init-1.svg
.. kernel-figure:: TreeRCU-gp-init-3.svg
This change will also cause each CPU's next call to
``__note_gp_changes()`` to notice that a new grace period has started,
......
......@@ -76,6 +76,11 @@ to change, such as less cores in the CPU list, then N and any ranges using N
will also change. Use the same on a small 4 core system, and "16-N" becomes
"16-3" and now the same boot input will be flagged as invalid (start > end).
The special case-tolerant group name "all" has a meaning of selecting all CPUs,
so that "nohz_full=all" is the equivalent of "nohz_full=0-N".
The semantics of "N" and "all" is supported on a level of bitmaps and holds for
all users of bitmap_parse().
This document may not be entirely up to date and comprehensive. The command
"modinfo -p ${modulename}" shows a current list of all parameters of a loadable
......
......@@ -4354,6 +4354,11 @@
whole algorithm to behave better in low memory
condition.
rcutree.rcu_delay_page_cache_fill_msec= [KNL]
Set the page-cache refill delay (in milliseconds)
in response to low-memory conditions. The range
of permitted values is in the range 0:100000.
rcutree.jiffies_till_first_fqs= [KNL]
Set delay from grace-period initialization to
first attempt to force quiescent states.
......
......@@ -315,7 +315,7 @@ static inline int rcu_read_lock_any_held(void)
#define RCU_LOCKDEP_WARN(c, s) \
do { \
static bool __section(".data.unlikely") __warned; \
if (debug_lockdep_rcu_enabled() && !__warned && (c)) { \
if ((c) && debug_lockdep_rcu_enabled() && !__warned) { \
__warned = true; \
lockdep_rcu_suspicious(__FILE__, __LINE__, s); \
} \
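For context, a hedged usage sketch of RCU_LOCKDEP_WARN(); with the reordering above, the caller-supplied condition is now evaluated before lockdep state is consulted. The function name is illustrative.

/* Hedged sketch: complain (once) if called outside any RCU reader. */
static void example_requires_rcu_reader(void)
{
	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
			 "example_requires_rcu_reader() called without RCU protection");
}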
......@@ -373,7 +373,7 @@ static inline void rcu_preempt_sleep_check(void) { }
#define unrcu_pointer(p) \
({ \
typeof(*p) *_________p1 = (typeof(*p) *__force)(p); \
rcu_check_sparse(p, __rcu); \
rcu_check_sparse(p, __rcu); \
((typeof(*p) __force __kernel *)(_________p1)); \
})
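A hedged usage sketch of the new unrcu_pointer() (see "rcu: Create an unrcu_pointer() to remove __rcu from a pointer" in the shortlog above); struct foo, gp, and plain_pointer_api() are hypothetical names.

/* Hedged sketch: strip __rcu so sparse stays quiet when the pointer is
 * handed to an API that provides its own protection.  All names hypothetical. */
struct foo {
	int a;
};
void plain_pointer_api(struct foo *p);	/* hypothetical non-RCU consumer */

static struct foo __rcu *gp;

static void hand_off(void)
{
	/* Caller must guarantee that gp is stable across this call. */
	plain_pointer_api(unrcu_pointer(gp));
}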
......@@ -532,7 +532,12 @@ do { \
* @p: The pointer to read, prior to dereferencing
* @c: The conditions under which the dereference will take place
*
* This is the RCU-bh counterpart to rcu_dereference_check().
* This is the RCU-bh counterpart to rcu_dereference_check(). However,
* please note that starting in v5.0 kernels, vanilla RCU grace periods
* wait for local_bh_disable() regions of code in addition to regions of
* code demarked by rcu_read_lock() and rcu_read_unlock(). This means
* that synchronize_rcu(), call_rcu, and friends all take not only
* rcu_read_lock() but also rcu_read_lock_bh() into account.
*/
#define rcu_dereference_bh_check(p, c) \
__rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu)
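A hedged usage sketch of rcu_dereference_bh_check(): the dereference is legal either under rcu_read_lock_bh() or, on the update side, while holding the lock named in the condition. gp, gp_lock, and struct bar are hypothetical.

/* Hedged sketch; all names hypothetical. */
struct bar {
	int a;
};
static DEFINE_SPINLOCK(gp_lock);
static struct bar __rcu *gp;

static int read_a_under_bh(void)
{
	struct bar *p;
	int a = -1;

	rcu_read_lock_bh();
	p = rcu_dereference_bh_check(gp, lockdep_is_held(&gp_lock));
	if (p)
		a = p->a;
	rcu_read_unlock_bh();

	return a;
}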
......@@ -543,6 +548,11 @@ do { \
* @c: The conditions under which the dereference will take place
*
* This is the RCU-sched counterpart to rcu_dereference_check().
* However, please note that starting in v5.0 kernels, vanilla RCU grace
* periods wait for preempt_disable() regions of code in addition to
* regions of code demarked by rcu_read_lock() and rcu_read_unlock().
* This means that synchronize_rcu(), call_rcu, and friends all take not
* only rcu_read_lock() but also rcu_read_lock_sched() into account.
*/
#define rcu_dereference_sched_check(p, c) \
__rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \
......@@ -634,6 +644,12 @@ do { \
* sections, invocation of the corresponding RCU callback is deferred
* until after the all the other CPUs exit their critical sections.
*
* In v5.0 and later kernels, synchronize_rcu() and call_rcu() also
* wait for regions of code with preemption disabled, including regions of
* code with interrupts or softirqs disabled. In pre-v5.0 kernels, which
* define synchronize_sched(), only code enclosed within rcu_read_lock()
* and rcu_read_unlock() are guaranteed to be waited for.
*
* Note, however, that RCU callbacks are permitted to run concurrently
* with new RCU read-side critical sections. One way that this can happen
* is via the following sequence of events: (1) CPU 0 enters an RCU
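A hedged sketch of the consolidated v5.0+ behavior noted in the comment above: a preempt_disable() region now counts as an RCU read-side critical section for synchronize_rcu() and call_rcu(). The shared_counter variable and function names are hypothetical.

/* Hedged sketch; names hypothetical. */
static int shared_counter;
static int snapshot;

static void reader_preempt_off(void)
{
	preempt_disable();
	snapshot = READ_ONCE(shared_counter);	/* treated as an RCU reader in v5.0+ */
	preempt_enable();
}

static void updater_reset(void)
{
	WRITE_ONCE(shared_counter, 0);
	synchronize_rcu();	/* also waits out an in-flight reader_preempt_off() */
}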
......@@ -686,33 +702,12 @@ static __always_inline void rcu_read_lock(void)
/**
* rcu_read_unlock() - marks the end of an RCU read-side critical section.
*
* In most situations, rcu_read_unlock() is immune from deadlock.
* However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock()
* is responsible for deboosting, which it does via rt_mutex_unlock().
* Unfortunately, this function acquires the scheduler's runqueue and
* priority-inheritance spinlocks. This means that deadlock could result
* if the caller of rcu_read_unlock() already holds one of these locks or
* any lock that is ever acquired while holding them.
*
* That said, RCU readers are never priority boosted unless they were
* preempted. Therefore, one way to avoid deadlock is to make sure
* that preemption never happens within any RCU read-side critical
* section whose outermost rcu_read_unlock() is called with one of
* rt_mutex_unlock()'s locks held. Such preemption can be avoided in
* a number of ways, for example, by invoking preempt_disable() before
* critical section's outermost rcu_read_lock().
*
* Given that the set of locks acquired by rt_mutex_unlock() might change
* at any time, a somewhat more future-proofed approach is to make sure
* that that preemption never happens within any RCU read-side critical
* section whose outermost rcu_read_unlock() is called with irqs disabled.
* This approach relies on the fact that rt_mutex_unlock() currently only
* acquires irq-disabled locks.
*
* The second of these two approaches is best in most situations,
* however, the first approach can also be useful, at least to those
* developers willing to keep abreast of the set of locks acquired by
* rt_mutex_unlock().
* In almost all situations, rcu_read_unlock() is immune from deadlock.
* In recent kernels that have consolidated synchronize_sched() and
* synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity
* also extends to the scheduler's runqueue and priority-inheritance
* spinlocks, courtesy of the quiescent-state deferral that is carried
* out when rcu_read_unlock() is invoked with interrupts disabled.
*
* See rcu_read_lock() for more information.
*/
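A hedged sketch of the deadlock-immunity property described above: in consolidated-flavor kernels the outermost rcu_read_unlock() may safely run with interrupts disabled, even with CONFIG_RCU_BOOST, because deboosting is deferred. The lock name is hypothetical.

/* Hedged sketch; my_irq_lock is hypothetical. */
static DEFINE_RAW_SPINLOCK(my_irq_lock);

static void read_then_unlock_with_irqs_off(void)
{
	unsigned long flags;

	rcu_read_lock();
	raw_spin_lock_irqsave(&my_irq_lock, flags);
	/* ... read-side accesses ... */
	rcu_read_unlock();	/* safe: quiescent state is deferred, no rt_mutex here */
	raw_spin_unlock_irqrestore(&my_irq_lock, flags);
}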
......@@ -728,9 +723,11 @@ static inline void rcu_read_unlock(void)
/**
* rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
*
* This is equivalent of rcu_read_lock(), but also disables softirqs.
* Note that anything else that disables softirqs can also serve as
* an RCU read-side critical section.
* This is equivalent to rcu_read_lock(), but also disables softirqs.
* Note that anything else that disables softirqs can also serve as an RCU
* read-side critical section. However, please note that this equivalence
* applies only to v5.0 and later. Before v5.0, rcu_read_lock() and
* rcu_read_lock_bh() were unrelated.
*
* Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
* must occur in the same context, for example, it is illegal to invoke
......@@ -763,9 +760,12 @@ static inline void rcu_read_unlock_bh(void)
/**
* rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
*
* This is equivalent of rcu_read_lock(), but disables preemption.
* Read-side critical sections can also be introduced by anything else
* that disables preemption, including local_irq_disable() and friends.
* This is equivalent to rcu_read_lock(), but also disables preemption.
* Read-side critical sections can also be introduced by anything else that
* disables preemption, including local_irq_disable() and friends. However,
* please note that the equivalence to rcu_read_lock() applies only to
* v5.0 and later. Before v5.0, rcu_read_lock() and rcu_read_lock_sched()
* were unrelated.
*
* Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
* must occur in the same context, for example, it is illegal to invoke
......
......@@ -86,7 +86,6 @@ static inline void rcu_irq_enter(void) { }
static inline void rcu_irq_exit_irqson(void) { }
static inline void rcu_irq_enter_irqson(void) { }
static inline void rcu_irq_exit(void) { }
static inline void rcu_irq_exit_preempt(void) { }
static inline void rcu_irq_exit_check_preempt(void) { }
#define rcu_is_idle_cpu(cpu) \
(is_idle_task(current) && !in_nmi() && !in_irq() && !in_serving_softirq())
......
......@@ -49,7 +49,6 @@ void rcu_idle_enter(void);
void rcu_idle_exit(void);
void rcu_irq_enter(void);
void rcu_irq_exit(void);
void rcu_irq_exit_preempt(void);
void rcu_irq_enter_irqson(void);
void rcu_irq_exit_irqson(void);
bool rcu_is_idle_cpu(int cpu);
......
......@@ -64,6 +64,12 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);
#ifdef CONFIG_SRCU
void srcu_init(void);
#else /* #ifdef CONFIG_SRCU */
static inline void srcu_init(void) { }
#endif /* #else #ifdef CONFIG_SRCU */
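A hedged sketch of the polled SRCU grace-period interface declared just above (start_poll_synchronize_srcu()/poll_state_synchronize_srcu()); my_srcu and the wrapper function are illustrative.

/* Hedged sketch; my_srcu and poll_then_maybe_wait() are hypothetical. */
DEFINE_SRCU(my_srcu);

static void poll_then_maybe_wait(void)
{
	unsigned long cookie;

	cookie = start_poll_synchronize_srcu(&my_srcu);	/* kick off a GP if needed */

	/* ... unrelated work while the grace period proceeds ... */

	if (!poll_state_synchronize_srcu(&my_srcu, cookie))
		synchronize_srcu(&my_srcu);	/* not expired yet: block for it */
}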
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
......
......@@ -82,9 +82,7 @@ struct srcu_struct {
/* callback for the barrier */
/* operation. */
struct delayed_work work;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
};
/* Values for state variable (bottom bits of ->srcu_gp_seq). */
......
......@@ -192,8 +192,6 @@ extern int try_to_del_timer_sync(struct timer_list *timer);
#define del_singleshot_timer_sync(t) del_timer_sync(t)
extern bool timer_curr_running(struct timer_list *timer);
extern void init_timers(void);
struct hrtimer;
extern enum hrtimer_restart it_real_fn(struct hrtimer *);
......
......@@ -278,6 +278,7 @@ TRACE_EVENT_RCU(rcu_exp_funnel_lock,
* "WakeNot": Don't wake rcuo kthread.
* "WakeNotPoll": Don't wake rcuo kthread because it is polling.
* "WakeOvfIsDeferred": Wake rcuo kthread later, CB list is huge.
* "WakeBypassIsDeferred": Wake rcuo kthread later, bypass list is contended.
* "WokeEmpty": rcuo CB kthread woke to find empty list.
*/
TRACE_EVENT_RCU(rcu_nocb_wake,
......
......@@ -42,6 +42,7 @@
#include <linux/profile.h>
#include <linux/kfence.h>
#include <linux/rcupdate.h>
#include <linux/srcu.h>
#include <linux/moduleparam.h>
#include <linux/kallsyms.h>
#include <linux/writeback.h>
......@@ -1008,6 +1009,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
tick_init();
rcu_init_nohz();
init_timers();
srcu_init();
hrtimers_init();
softirq_init();
timekeeping_init();
......
......@@ -6506,6 +6506,7 @@ asmlinkage __visible void lockdep_sys_exit(void)
void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
int dl = READ_ONCE(debug_locks);
/* Note: the following can be executed concurrently, so be careful. */
pr_warn("\n");
......@@ -6515,11 +6516,12 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
pr_warn("-----------------------------\n");
pr_warn("%s:%d %s!\n", file, line, s);
pr_warn("\nother info that might help us debug this:\n\n");
pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
: "",
rcu_scheduler_active, debug_locks);
rcu_scheduler_active, dl,
dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n");
/*
* If a CPU is in the RCU-free window in idle (ie: in the section
......
......@@ -116,7 +116,7 @@ config RCU_EQS_DEBUG
config RCU_STRICT_GRACE_PERIOD
bool "Provide debug RCU implementation with short grace periods"
depends on DEBUG_KERNEL && RCU_EXPERT
depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4
default n
select PREEMPT_COUNT if PREEMPT=n
help
......
......@@ -308,6 +308,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
}
}
extern void rcu_init_geometry(void);
/* Returns a pointer to the first leaf rcu_node structure. */
#define rcu_first_leaf_node() (rcu_state.level[rcu_num_lvls - 1])
......@@ -422,12 +424,6 @@ do { \
#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
#ifdef CONFIG_SRCU
void srcu_init(void);
#else /* #ifdef CONFIG_SRCU */
static inline void srcu_init(void) { }
#endif /* #else #ifdef CONFIG_SRCU */
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
static inline bool rcu_gp_is_normal(void) { return true; }
......@@ -441,7 +437,11 @@ bool rcu_gp_is_expedited(void); /* Internal RCU use. */
void rcu_expedite_gp(void);
void rcu_unexpedite_gp(void);
void rcupdate_announce_bootup_oddness(void);
#ifdef CONFIG_TASKS_RCU_GENERIC
void show_rcu_tasks_gp_kthreads(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void show_rcu_tasks_gp_kthreads(void) {}
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
void rcu_request_urgent_qs_task(struct task_struct *t);
#endif /* #else #ifdef CONFIG_TINY_RCU */
......@@ -519,6 +519,7 @@ static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
static inline unsigned long
srcu_batches_completed(struct srcu_struct *sp) { return 0; }
static inline void rcu_force_quiescent_state(void) { }
static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; }
static inline void show_rcu_gp_kthreads(void) { }
static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
static inline void rcu_fwd_progress_check(unsigned long j) { }
......@@ -527,6 +528,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
unsigned long rcu_get_gp_seq(void);
unsigned long rcu_exp_batches_completed(void);
unsigned long srcu_batches_completed(struct srcu_struct *sp);
bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);
void show_rcu_gp_kthreads(void);
int rcu_get_gp_kthreads_prio(void);
void rcu_fwd_progress_check(unsigned long j);
......
......@@ -245,12 +245,6 @@ static const char *rcu_torture_writer_state_getname(void)
return rcu_torture_writer_state_names[i];
}
#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_PREEMPT_RT)
# define rcu_can_boost() 1
#else
# define rcu_can_boost() 0
#endif
#ifdef CONFIG_RCU_TRACE
static u64 notrace rcu_trace_clock_local(void)
{
......@@ -331,6 +325,7 @@ struct rcu_torture_ops {
void (*read_delay)(struct torture_random_state *rrsp,
struct rt_read_seg *rtrsp);
void (*readunlock)(int idx);
int (*readlock_held)(void);
unsigned long (*get_gp_seq)(void);
unsigned long (*gp_diff)(unsigned long new, unsigned long old);
void (*deferred_free)(struct rcu_torture *p);
......@@ -345,6 +340,7 @@ struct rcu_torture_ops {
void (*fqs)(void);
void (*stats)(void);
void (*gp_kthread_dbg)(void);
bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
int irq_capable;
int can_boost;
......@@ -359,6 +355,11 @@ static struct rcu_torture_ops *cur_ops;
* Definitions for rcu torture testing.
*/
static int torture_readlock_not_held(void)
{
return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();
}
static int rcu_torture_read_lock(void) __acquires(RCU)
{
rcu_read_lock();
......@@ -483,30 +484,32 @@ static void rcu_sync_torture_init(void)
}
static struct rcu_torture_ops rcu_ops = {
.ttype = RCU_FLAVOR,
.init = rcu_sync_torture_init,
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
.get_gp_seq = rcu_get_gp_seq,
.gp_diff = rcu_seq_diff,
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
.get_gp_state = get_state_synchronize_rcu,
.start_gp_poll = start_poll_synchronize_rcu,
.poll_gp_state = poll_state_synchronize_rcu,
.cond_sync = cond_synchronize_rcu,
.call = call_rcu,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.gp_kthread_dbg = show_rcu_gp_kthreads,
.stall_dur = rcu_jiffies_till_stall_check,
.irq_capable = 1,
.can_boost = rcu_can_boost(),
.extendables = RCUTORTURE_MAX_EXTEND,
.name = "rcu"
.ttype = RCU_FLAVOR,
.init = rcu_sync_torture_init,
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
.readlock_held = torture_readlock_not_held,
.get_gp_seq = rcu_get_gp_seq,
.gp_diff = rcu_seq_diff,
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
.get_gp_state = get_state_synchronize_rcu,
.start_gp_poll = start_poll_synchronize_rcu,
.poll_gp_state = poll_state_synchronize_rcu,
.cond_sync = cond_synchronize_rcu,
.call = call_rcu,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.gp_kthread_dbg = show_rcu_gp_kthreads,
.check_boost_failed = rcu_check_boost_fail,
.stall_dur = rcu_jiffies_till_stall_check,
.irq_capable = 1,
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
.name = "rcu"
};
/*
......@@ -540,6 +543,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock,
.readlock_held = torture_readlock_not_held,
.get_gp_seq = rcu_no_completed,
.deferred_free = rcu_busted_torture_deferred_free,
.sync = synchronize_rcu_busted,
......@@ -589,6 +593,11 @@ static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
srcu_read_unlock(srcu_ctlp, idx);
}
static int torture_srcu_read_lock_held(void)
{
return srcu_read_lock_held(srcu_ctlp);
}
static unsigned long srcu_torture_completed(void)
{
return srcu_batches_completed(srcu_ctlp);
......@@ -646,6 +655,7 @@ static struct rcu_torture_ops srcu_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
.readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
......@@ -681,6 +691,7 @@ static struct rcu_torture_ops srcud_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
.readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
......@@ -700,6 +711,7 @@ static struct rcu_torture_ops busted_srcud_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = srcu_torture_read_unlock,
.readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
......@@ -787,6 +799,7 @@ static struct rcu_torture_ops trivial_ops = {
.readlock = rcu_torture_read_lock_trivial,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock_trivial,
.readlock_held = torture_readlock_not_held,
.get_gp_seq = rcu_no_completed,
.sync = synchronize_rcu_trivial,
.exp_sync = synchronize_rcu_trivial,
......@@ -850,6 +863,7 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.readlock = tasks_tracing_torture_read_lock,
.read_delay = srcu_read_delay, /* just reuse srcu's version. */
.readunlock = tasks_tracing_torture_read_unlock,
.readlock_held = rcu_read_lock_trace_held,
.get_gp_seq = rcu_no_completed,
.deferred_free = rcu_tasks_tracing_torture_deferred_free,
.sync = synchronize_rcu_tasks_trace,
......@@ -871,32 +885,13 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
return cur_ops->gp_diff(new, old);
}
static bool __maybe_unused torturing_tasks(void)
{
return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops;
}
/*
* RCU torture priority-boost testing. Runs one real-time thread per
* CPU for moderate bursts, repeatedly registering RCU callbacks and
* spinning waiting for them to be invoked. If a given callback takes
* too long to be invoked, we assume that priority inversion has occurred.
* CPU for moderate bursts, repeatedly starting grace periods and waiting
* for them to complete. If a given grace period takes too long, we assume
* that priority inversion has occurred.
*/
struct rcu_boost_inflight {
struct rcu_head rcu;
int inflight;
};
static void rcu_torture_boost_cb(struct rcu_head *head)
{
struct rcu_boost_inflight *rbip =
container_of(head, struct rcu_boost_inflight, rcu);
/* Ensure RCU-core accesses precede clearing ->inflight */
smp_store_release(&rbip->inflight, 0);
}
static int old_rt_runtime = -1;
static void rcu_torture_disable_rt_throttle(void)
......@@ -923,49 +918,68 @@ static void rcu_torture_enable_rt_throttle(void)
old_rt_runtime = -1;
}
static bool rcu_torture_boost_failed(unsigned long start, unsigned long end)
static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *start)
{
int cpu;
static int dbg_done;
if (end - start > test_boost_duration * HZ - HZ / 2) {
unsigned long end = jiffies;
bool gp_done;
unsigned long j;
static unsigned long last_persist;
unsigned long lp;
unsigned long mininterval = test_boost_duration * HZ - HZ / 2;
if (end - *start > mininterval) {
// Recheck after checking time to avoid false positives.
smp_mb(); // Time check before grace-period check.
if (cur_ops->poll_gp_state(gp_state))
return false; // passed, though perhaps just barely
if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, &cpu)) {
// At most one persisted message per boost test.
j = jiffies;
lp = READ_ONCE(last_persist);
if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp)
pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
return false; // passed on a technicality
}
VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
n_rcu_torture_boost_failure++;
if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg)
if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) {
pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n",
current->rt_priority, gp_state, end - *start);
cur_ops->gp_kthread_dbg();
// Recheck after print to flag grace period ending during splat.
gp_done = cur_ops->poll_gp_state(gp_state);
pr_info("Boost inversion: GP %lu %s.\n", gp_state,
gp_done ? "ended already" : "still pending");
return true; /* failed */
}
return true; // failed
} else if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, NULL)) {
*start = jiffies;
}
return false; /* passed */
return false; // passed
}
static int rcu_torture_boost(void *arg)
{
unsigned long call_rcu_time;
unsigned long endtime;
unsigned long gp_state;
unsigned long gp_state_time;
unsigned long oldstarttime;
struct rcu_boost_inflight rbi = { .inflight = 0 };
VERBOSE_TOROUT_STRING("rcu_torture_boost started");
/* Set real-time priority. */
sched_set_fifo_low(current);
init_rcu_head_on_stack(&rbi.rcu);
/* Each pass through the following loop does one boost-test cycle. */
do {
bool failed = false; // Test failed already in this test interval
bool firsttime = true;
bool gp_initiated = false;
/* Increment n_rcu_torture_boosts once per boost-test */
while (!kthread_should_stop()) {
if (mutex_trylock(&boost_mutex)) {
n_rcu_torture_boosts++;
mutex_unlock(&boost_mutex);
break;
}
schedule_timeout_uninterruptible(1);
}
if (kthread_should_stop())
goto checkwait;
......@@ -979,33 +993,33 @@ static int rcu_torture_boost(void *arg)
goto checkwait;
}
/* Do one boost-test interval. */
// Do one boost-test interval.
endtime = oldstarttime + test_boost_duration * HZ;
while (time_before(jiffies, endtime)) {
/* If we don't have a callback in flight, post one. */
if (!smp_load_acquire(&rbi.inflight)) {
/* RCU core before ->inflight = 1. */
smp_store_release(&rbi.inflight, 1);
cur_ops->call(&rbi.rcu, rcu_torture_boost_cb);
/* Check if the boost test failed */
if (!firsttime && !failed)
failed = rcu_torture_boost_failed(call_rcu_time, jiffies);
call_rcu_time = jiffies;
firsttime = false;
// Has current GP gone too long?
if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
failed = rcu_torture_boost_failed(gp_state, &gp_state_time);
// If we don't have a grace period in flight, start one.
if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) {
gp_state = cur_ops->start_gp_poll();
gp_initiated = true;
gp_state_time = jiffies;
}
if (stutter_wait("rcu_torture_boost"))
if (stutter_wait("rcu_torture_boost")) {
sched_set_fifo_low(current);
// If the grace period already ended,
// we don't know when that happened, so
// start over.
if (cur_ops->poll_gp_state(gp_state))
gp_initiated = false;
}
if (torture_must_stop())
goto checkwait;
}
/*
* If boost never happened, then inflight will always be 1, in
* this case the boost check would never happen in the above
* loop so do another one here.
*/
if (!firsttime && !failed && smp_load_acquire(&rbi.inflight))
rcu_torture_boost_failed(call_rcu_time, jiffies);
// In case the grace period extended beyond the end of the loop.
if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
rcu_torture_boost_failed(gp_state, &gp_state_time);
/*
* Set the start time of the next test interval.
......@@ -1014,11 +1028,12 @@ static int rcu_torture_boost(void *arg)
* interval. Besides, we are running at RT priority,
* so delays should be relatively rare.
*/
while (oldstarttime == boost_starttime &&
!kthread_should_stop()) {
while (oldstarttime == boost_starttime && !kthread_should_stop()) {
if (mutex_trylock(&boost_mutex)) {
boost_starttime = jiffies +
test_boost_interval * HZ;
if (oldstarttime == boost_starttime) {
boost_starttime = jiffies + test_boost_interval * HZ;
n_rcu_torture_boosts++;
}
mutex_unlock(&boost_mutex);
break;
}
......@@ -1030,15 +1045,11 @@ checkwait: if (stutter_wait("rcu_torture_boost"))
sched_set_fifo_low(current);
} while (!torture_must_stop());
while (smp_load_acquire(&rbi.inflight))
schedule_timeout_uninterruptible(1); // rcu_barrier() deadlocks.
/* Clean up and exit. */
while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
while (!kthread_should_stop()) {
torture_shutdown_absorb("rcu_torture_boost");
schedule_timeout_uninterruptible(1);
}
destroy_rcu_head_on_stack(&rbi.rcu);
torture_kthread_stopping("rcu_torture_boost");
return 0;
}
......@@ -1553,11 +1564,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
srcu_read_lock_held(srcu_ctlp) ||
rcu_read_lock_trace_held() ||
torturing_tasks());
!cur_ops->readlock_held || cur_ops->readlock_held());
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
......@@ -1861,48 +1868,49 @@ rcu_torture_stats(void *arg)
torture_shutdown_absorb("rcu_torture_stats");
} while (!torture_must_stop());
torture_kthread_stopping("rcu_torture_stats");
{
struct rcu_head *rhp;
struct kmem_cache *kcp;
static int z;
kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
mem_dump_obj(ZERO_SIZE_PTR);
pr_alert("mem_dump_obj(NULL):");
mem_dump_obj(NULL);
pr_alert("mem_dump_obj(%px):", &rhp);
mem_dump_obj(&rhp);
pr_alert("mem_dump_obj(%px):", rhp);
mem_dump_obj(rhp);
pr_alert("mem_dump_obj(%px):", &rhp->func);
mem_dump_obj(&rhp->func);
pr_alert("mem_dump_obj(%px):", &z);
mem_dump_obj(&z);
kmem_cache_free(kcp, rhp);
kmem_cache_destroy(kcp);
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
pr_alert("mem_dump_obj(kmalloc %px):", rhp);
mem_dump_obj(rhp);
pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
mem_dump_obj(&rhp->func);
kfree(rhp);
rhp = vmalloc(4096);
pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
pr_alert("mem_dump_obj(vmalloc %px):", rhp);
mem_dump_obj(rhp);
pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
mem_dump_obj(&rhp->func);
vfree(rhp);
}
return 0;
}
/* Test mem_dump_obj() and friends. */
static void rcu_torture_mem_dump_obj(void)
{
struct rcu_head *rhp;
struct kmem_cache *kcp;
static int z;
kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
mem_dump_obj(ZERO_SIZE_PTR);
pr_alert("mem_dump_obj(NULL):");
mem_dump_obj(NULL);
pr_alert("mem_dump_obj(%px):", &rhp);
mem_dump_obj(&rhp);
pr_alert("mem_dump_obj(%px):", rhp);
mem_dump_obj(rhp);
pr_alert("mem_dump_obj(%px):", &rhp->func);
mem_dump_obj(&rhp->func);
pr_alert("mem_dump_obj(%px):", &z);
mem_dump_obj(&z);
kmem_cache_free(kcp, rhp);
kmem_cache_destroy(kcp);
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
pr_alert("mem_dump_obj(kmalloc %px):", rhp);
mem_dump_obj(rhp);
pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
mem_dump_obj(&rhp->func);
kfree(rhp);
rhp = vmalloc(4096);
pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
pr_alert("mem_dump_obj(vmalloc %px):", rhp);
mem_dump_obj(rhp);
pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
mem_dump_obj(&rhp->func);
vfree(rhp);
}
static void
rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
{
......@@ -2634,7 +2642,7 @@ static bool rcu_torture_can_boost(void)
if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2)
return false;
if (!cur_ops->call)
if (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)
return false;
prio = rcu_get_gp_kthreads_prio();
......@@ -2642,7 +2650,7 @@ static bool rcu_torture_can_boost(void)
return false;
if (prio < 2) {
if (boost_warn_once == 1)
if (boost_warn_once == 1)
return false;
pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME);
......@@ -2818,6 +2826,8 @@ rcu_torture_cleanup(void)
if (cur_ops->cleanup != NULL)
cur_ops->cleanup();
rcu_torture_mem_dump_obj();
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
if (err_segs_recorded) {
......@@ -3120,6 +3130,21 @@ rcu_torture_init(void)
if (firsterr < 0)
goto unwind;
rcutor_hp = firsterr;
// Testing RCU priority boosting requires rcutorture do
// some serious abuse. Counter this by running ksoftirqd
// at higher priority.
if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
for_each_online_cpu(cpu) {
struct sched_param sp;
struct task_struct *t;
t = per_cpu(ksoftirqd, cpu);
WARN_ON_ONCE(!t);
sp.sched_priority = 2;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
}
}
shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
......
......@@ -362,6 +362,111 @@ static struct ref_scale_ops rwsem_ops = {
.name = "rwsem"
};
// Definitions for global spinlock
static DEFINE_SPINLOCK(test_lock);
static void ref_lock_section(const int nloops)
{
int i;
preempt_disable();
for (i = nloops; i >= 0; i--) {
spin_lock(&test_lock);
spin_unlock(&test_lock);
}
preempt_enable();
}
static void ref_lock_delay_section(const int nloops, const int udl, const int ndl)
{
int i;
preempt_disable();
for (i = nloops; i >= 0; i--) {
spin_lock(&test_lock);
un_delay(udl, ndl);
spin_unlock(&test_lock);
}
preempt_enable();
}
static struct ref_scale_ops lock_ops = {
.readsection = ref_lock_section,
.delaysection = ref_lock_delay_section,
.name = "lock"
};
// Definitions for global irq-save spinlock
static void ref_lock_irq_section(const int nloops)
{
unsigned long flags;
int i;
preempt_disable();
for (i = nloops; i >= 0; i--) {
spin_lock_irqsave(&test_lock, flags);
spin_unlock_irqrestore(&test_lock, flags);
}
preempt_enable();
}
static void ref_lock_irq_delay_section(const int nloops, const int udl, const int ndl)
{
unsigned long flags;
int i;
preempt_disable();
for (i = nloops; i >= 0; i--) {
spin_lock_irqsave(&test_lock, flags);
un_delay(udl, ndl);
spin_unlock_irqrestore(&test_lock, flags);
}
preempt_enable();
}
static struct ref_scale_ops lock_irq_ops = {
.readsection = ref_lock_irq_section,
.delaysection = ref_lock_irq_delay_section,
.name = "lock-irq"
};
// Definitions acquire-release.
static DEFINE_PER_CPU(unsigned long, test_acqrel);
static void ref_acqrel_section(const int nloops)
{
unsigned long x;
int i;
preempt_disable();
for (i = nloops; i >= 0; i--) {
x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
}
preempt_enable();
}
static void ref_acqrel_delay_section(const int nloops, const int udl, const int ndl)
{
unsigned long x;
int i;
preempt_disable();
for (i = nloops; i >= 0; i--) {
x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
un_delay(udl, ndl);
smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
}
preempt_enable();
}
static struct ref_scale_ops acqrel_ops = {
.readsection = ref_acqrel_section,
.delaysection = ref_acqrel_delay_section,
.name = "acqrel"
};
static void rcu_scale_one_reader(void)
{
if (readdelay <= 0)
......@@ -653,8 +758,8 @@ ref_scale_init(void)
long i;
int firsterr = 0;
static struct ref_scale_ops *scale_ops[] = {
&rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
&refcnt_ops, &rwlock_ops, &rwsem_ops,
&rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops,
&rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops,
};
if (!torture_init_begin(scale_type, verbose))
......
......@@ -80,7 +80,7 @@ do { \
* srcu_read_unlock() running against them. So if the is_static parameter
* is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
*/
static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
static void init_srcu_struct_nodes(struct srcu_struct *ssp)
{
int cpu;
int i;
......@@ -90,6 +90,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
struct srcu_node *snp;
struct srcu_node *snp_first;
/* Initialize geometry if it has not already been initialized. */
rcu_init_geometry();
/* Work out the overall tree geometry. */
ssp->level[0] = &ssp->node[0];
for (i = 1; i < rcu_num_lvls; i++)
......@@ -148,14 +151,6 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
sdp->ssp = ssp;
sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
if (is_static)
continue;
/* Dynamically allocated, better be no srcu_read_locks()! */
for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
sdp->srcu_lock_count[i] = 0;
sdp->srcu_unlock_count[i] = 0;
}
}
}
......@@ -179,7 +174,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
ssp->sda = alloc_percpu(struct srcu_data);
if (!ssp->sda)
return -ENOMEM;
init_srcu_struct_nodes(ssp, is_static);
init_srcu_struct_nodes(ssp);
ssp->srcu_gp_seq_needed_exp = 0;
ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
......@@ -777,9 +772,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
spin_unlock_irqrestore_rcu_node(sdp, flags);
/*
* No local callbacks, so probabalistically probe global state.
* No local callbacks, so probabilistically probe global state.
* Exact information would require acquiring locks, which would
* kill scalability, hence the probabalistic nature of the probe.
* kill scalability, hence the probabilistic nature of the probe.
*/
/* First, see if enough time has passed since the last GP. */
......@@ -1000,6 +995,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
* synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
* passed the same srcu_struct structure.
*
* Implementation of these memory-ordering guarantees is similar to
* that of synchronize_rcu().
*
* If SRCU is likely idle, expedite the first request. This semantic
* was provided by Classic SRCU, and is relied upon by its users, so TREE
* SRCU must also provide it. Note that detecting idleness is heuristic
......@@ -1392,11 +1390,15 @@ void __init srcu_init(void)
{
struct srcu_struct *ssp;
/*
* Once that is set, call_srcu() can follow the normal path and
* queue delayed work. This must follow RCU workqueues creation
* and timers initialization.
*/
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
work.work.entry);
check_init_srcu_struct(ssp);
list_del_init(&ssp->work.work.entry);
queue_work(rcu_gp_wq, &ssp->work.work);
}
......
......@@ -94,9 +94,9 @@ static void rcu_sync_func(struct rcu_head *rhp)
rcu_sync_call(rsp);
} else {
/*
* We're at least a GP after the last rcu_sync_exit(); eveybody
* We're at least a GP after the last rcu_sync_exit(); everybody
* will now have observed the write side critical section.
* Let 'em rip!.
* Let 'em rip!
*/
WRITE_ONCE(rsp->gp_state, GP_IDLE);
}
......
......@@ -23,7 +23,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
* @cbs_head: Head of callback list.
* @cbs_tail: Tail pointer for callback list.
* @cbs_wq: Wait queue allowning new callback to get kthread's attention.
* @cbs_wq: Wait queue allowing new callback to get kthread's attention.
* @cbs_lock: Lock protecting callback list.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
* @gp_func: This flavor's grace-period-wait function.
......@@ -377,6 +377,46 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// Finally, this implementation does not support high call_rcu_tasks()
// rates from multiple CPUs. If this is required, per-CPU callback lists
// will be needed.
//
// The implementation uses rcu_tasks_wait_gp(), which relies on function
// pointers in the rcu_tasks structure. The rcu_spawn_tasks_kthread()
// function sets these function pointers up so that rcu_tasks_wait_gp()
// invokes these functions in this order:
//
// rcu_tasks_pregp_step():
// Invokes synchronize_rcu() in order to wait for all in-flight
// t->on_rq and t->nvcsw transitions to complete. This works because
// all such transitions are carried out with interrupts disabled.
// rcu_tasks_pertask(), invoked on every non-idle task:
// For every runnable non-idle task other than the current one, use
// get_task_struct() to pin down that task, snapshot that task's
// number of voluntary context switches, and add that task to the
// holdout list.
// rcu_tasks_postscan():
// Invoke synchronize_srcu() to ensure that all tasks that were
// in the process of exiting (and which thus might not know to
// synchronize with this RCU Tasks grace period) have completed
// exiting.
// check_all_holdout_tasks(), repeatedly until holdout list is empty:
// Scans the holdout list, attempting to identify a quiescent state
// for each task on the list. If there is a quiescent state, the
// corresponding task is removed from the holdout list.
// rcu_tasks_postgp():
// Invokes synchronize_rcu() in order to ensure that all prior
// t->on_rq and t->nvcsw transitions are seen by all CPUs and tasks
// to have happened before the end of this RCU Tasks grace period.
// Again, this works because all such transitions are carried out
// with interrupts disabled.
//
// For each exiting task, the exit_tasks_rcu_start() and
// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
// read-side critical sections waited for by rcu_tasks_postscan().
//
// Pre-grace-period update-side code is ordered before the grace via the
// ->cbs_lock and the smp_mb__after_spinlock(). Pre-grace-period read-side
// code is ordered before the grace period via synchronize_rcu() call
// in rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
// disabling.
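As a hedged usage-level sketch of the flavor described above (distinct from its internal machinery): Tasks RCU is typically used to free trampoline-like text only after every task has passed through a voluntary context switch or usermode execution. All names below are hypothetical.

/* Hedged sketch; struct my_tramp and unhook_my_tramp() are hypothetical. */
struct my_tramp {
	void *text;
};
void unhook_my_tramp(struct my_tramp *tp);	/* hypothetical */

static void retire_trampoline(struct my_tramp *tp)
{
	unhook_my_tramp(tp);		/* no new tasks can enter it after this */
	synchronize_rcu_tasks();	/* wait out tasks that might still be inside it */
	kfree(tp);
}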
/* Pre-grace-period preparation. */
static void rcu_tasks_pregp_step(void)
......@@ -504,7 +544,7 @@ DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
* or transition to usermode execution. As such, there are no read-side
* primitives analogous to rcu_read_lock() and rcu_read_unlock() because
* this primitive is intended to determine that all tasks have passed
* through a safe state, not so much for data-strcuture synchronization.
* through a safe state, not so much for data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
......@@ -605,8 +645,13 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
// passing an empty function to schedule_on_each_cpu(). This approach
// provides an asynchronous call_rcu_tasks_rude() API and batching
// of concurrent calls to the synchronous synchronize_rcu_rude() API.
// This sends IPIs far and wide and induces otherwise unnecessary context
// switches on all online CPUs, whether idle or not.
// This invokes schedule_on_each_cpu() in order to send IPIs far and wide
// and induces otherwise unnecessary context switches on all online CPUs,
// whether idle or not.
//
// Callback handling is provided by the rcu_tasks_kthread() function.
//
// Ordering is provided by the scheduler's context-switch code.
// Empty function to allow workqueues to force a context switch.
static void rcu_tasks_be_rude(struct work_struct *work)
......@@ -637,7 +682,7 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
* there are no read-side primitives analogous to rcu_read_lock() and
* rcu_read_unlock() because this primitive is intended to determine
* that all tasks have passed through a safe state, not so much for
* data-strcuture synchronization.
* data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
......@@ -1163,7 +1208,7 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t)
* there are no read-side primitives analogous to rcu_read_lock() and
* rcu_read_unlock() because this primitive is intended to determine
* that all tasks have passed through a safe state, not so much for
* data-strcuture synchronization.
* data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
......@@ -1356,5 +1401,4 @@ void __init rcu_init_tasks_generic(void)
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
void show_rcu_tasks_gp_kthreads(void) {}
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
......@@ -221,5 +221,4 @@ void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
rcu_early_boot_tests();
srcu_init();
}
......@@ -188,6 +188,17 @@ module_param(rcu_unlock_delay, int, 0444);
static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);
// A page shrinker can ask for pages to be freed to make them
// available for other parts of the system. This usually happens
// under low memory conditions, and in that case we should also
// defer page-cache filling for a short time period.
//
// The default value is 5 seconds, which is long enough to reduce
// interference with the shrinker while it asks other systems to
// drain their caches.
static int rcu_delay_page_cache_fill_msec = 5000;
module_param(rcu_delay_page_cache_fill_msec, int, 0444);
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
......@@ -204,7 +215,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
* the need for long delays to increase some race probabilities with the
* need for fast grace periods to increase other race probabilities.
*/
#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */
#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */
/*
* Compute the mask of online CPUs for the specified rcu_node structure.
......@@ -244,6 +255,7 @@ void rcu_softirq_qs(void)
{
rcu_qs();
rcu_preempt_deferred_qs(current);
rcu_tasks_qs(current, false);
}
/*
......@@ -835,28 +847,6 @@ void noinstr rcu_irq_exit(void)
rcu_nmi_exit();
}
/**
* rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq
* towards in kernel preemption
*
* Same as rcu_irq_exit() but has a sanity check that scheduling is safe
* from RCU point of view. Invoked from return from interrupt before kernel
* preemption.
*/
void rcu_irq_exit_preempt(void)
{
lockdep_assert_irqs_disabled();
rcu_nmi_exit();
RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
"RCU dynticks_nesting counter underflow/zero!");
RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
DYNTICK_IRQ_NONIDLE,
"Bad RCU dynticks_nmi_nesting counter\n");
RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
"RCU in extended quiescent state!");
}
#ifdef CONFIG_PROVE_RCU
/**
* rcu_irq_exit_check_preempt - Validate that scheduling is possible
......@@ -961,7 +951,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
*/
void noinstr rcu_user_exit(void)
{
rcu_eqs_exit(1);
rcu_eqs_exit(true);
}
/**
......@@ -1227,7 +1217,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
/*
* We are reporting a quiescent state on behalf of some other CPU, so
* When trying to report a quiescent state on behalf of some other CPU,
* it is our responsibility to check for and handle potential overflow
* of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
* After all, the CPU might be in deep idle state, and thus executing no
......@@ -2050,7 +2040,7 @@ static void rcu_gp_fqs_loop(void)
/*
* Clean up after the old grace period.
*/
static void rcu_gp_cleanup(void)
static noinline void rcu_gp_cleanup(void)
{
int cpu;
bool needgp = false;
......@@ -2491,7 +2481,7 @@ int rcutree_dead_cpu(unsigned int cpu)
/*
* Invoke any RCU callbacks that have made it to the end of their grace
* period. Thottle as specified by rdp->blimit.
* period. Throttle as specified by rdp->blimit.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
......@@ -2631,7 +2621,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
* state, for example, user mode or idle loop. It also schedules RCU
* core processing. If the current grace period has gone on too long,
* it will ask the scheduler to manufacture a context switch for the sole
* purpose of providing a providing the needed quiescent state.
* purpose of providing the needed quiescent state.
*/
void rcu_sched_clock_irq(int user)
{
......@@ -2913,7 +2903,6 @@ static int __init rcu_spawn_core_kthreads(void)
"%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
return 0;
}
early_initcall(rcu_spawn_core_kthreads);
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
......@@ -3084,12 +3073,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
* period elapses, in other words after all pre-existing RCU read-side
* critical sections have completed. However, the callback function
* might well execute concurrently with RCU read-side critical sections
* that started after call_rcu() was invoked. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
* may be nested. In addition, regions of code across which interrupts,
* preemption, or softirqs have been disabled also serve as RCU read-side
* critical sections. This includes hardware interrupt handlers, softirq
* handlers, and NMI handlers.
* that started after call_rcu() was invoked.
*
* RCU read-side critical sections are delimited by rcu_read_lock()
* and rcu_read_unlock(), and may be nested. In addition, but only in
* v5.0 and later, regions of code across which interrupts, preemption,
* or softirqs have been disabled also serve as RCU read-side critical
* sections. This includes hardware interrupt handlers, softirq handlers,
* and NMI handlers.
*
* Note that all CPUs must agree that the grace period extended beyond
* all pre-existing RCU read-side critical section. On systems with more
......@@ -3109,6 +3100,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
* between the call to call_rcu() and the invocation of "func()" -- even
* if CPU A and CPU B are the same CPU (but again only if the system has
* more than one CPU).
*
* Implementation of these memory-ordering guarantees is described here:
* Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
......@@ -3173,6 +3167,7 @@ struct kfree_rcu_cpu_work {
* Even though it is lockless an access has to be protected by the
* per-cpu lock.
* @page_cache_work: A work to refill the cache when it is empty
* @backoff_page_cache_fill: Delay cache refills
* @work_in_progress: Indicates that page_cache_work is running
* @hrtimer: A hrtimer for scheduling a page_cache_work
* @nr_bkv_objs: number of allocated objects at @bkvcache.
......@@ -3192,7 +3187,8 @@ struct kfree_rcu_cpu {
bool initialized;
int count;
struct work_struct page_cache_work;
struct delayed_work page_cache_work;
atomic_t backoff_page_cache_fill;
atomic_t work_in_progress;
struct hrtimer hrtimer;
......@@ -3239,7 +3235,7 @@ get_cached_bnode(struct kfree_rcu_cpu *krcp)
if (!krcp->nr_bkv_objs)
return NULL;
krcp->nr_bkv_objs--;
WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
return (struct kvfree_rcu_bulk_data *)
llist_del_first(&krcp->bkvcache);
}
......@@ -3253,14 +3249,33 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp,
return false;
llist_add((struct llist_node *) bnode, &krcp->bkvcache);
krcp->nr_bkv_objs++;
WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
return true;
}
static int
drain_page_cache(struct kfree_rcu_cpu *krcp)
{
unsigned long flags;
struct llist_node *page_list, *pos, *n;
int freed = 0;
raw_spin_lock_irqsave(&krcp->lock, flags);
page_list = llist_del_all(&krcp->bkvcache);
WRITE_ONCE(krcp->nr_bkv_objs, 0);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
llist_for_each_safe(pos, n, page_list) {
free_page((unsigned long)pos);
freed++;
}
return freed;
}
/*
* This function is invoked in workqueue context after a grace period.
* It frees all the objects queued on ->bhead_free or ->head_free.
* It frees all the objects queued on ->bkvhead_free or ->head_free.
*/
static void kfree_rcu_work(struct work_struct *work)
{
......@@ -3287,7 +3302,7 @@ static void kfree_rcu_work(struct work_struct *work)
krwp->head_free = NULL;
raw_spin_unlock_irqrestore(&krcp->lock, flags);
// Handle two first channels.
// Handle the first two channels.
for (i = 0; i < FREE_N_CHANNELS; i++) {
for (; bkvhead[i]; bkvhead[i] = bnext) {
bnext = bkvhead[i]->next;
......@@ -3325,9 +3340,11 @@ static void kfree_rcu_work(struct work_struct *work)
}
/*
* Emergency case only. It can happen under low memory
* condition when an allocation gets failed, so the "bulk"
* path can not be temporary maintained.
* This is used when the "bulk" path can not be used for the
* double-argument of kvfree_rcu(). This happens when the
* page-cache is empty, which means that objects are instead
* queued on a linked list through their rcu_head structures.
* This list is named "Channel 3".
*/
for (; head; head = next) {
unsigned long offset = (unsigned long)head->func;
......@@ -3347,34 +3364,31 @@ static void kfree_rcu_work(struct work_struct *work)
}
/*
* Schedule the kfree batch RCU work to run in workqueue context after a GP.
*
* This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
* timeout has been reached.
* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
*/
static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
static void kfree_rcu_monitor(struct work_struct *work)
{
struct kfree_rcu_cpu_work *krwp;
bool repeat = false;
struct kfree_rcu_cpu *krcp = container_of(work,
struct kfree_rcu_cpu, monitor_work.work);
unsigned long flags;
int i, j;
lockdep_assert_held(&krcp->lock);
raw_spin_lock_irqsave(&krcp->lock, flags);
// Attempt to start a new batch.
for (i = 0; i < KFREE_N_BATCHES; i++) {
krwp = &(krcp->krw_arr[i]);
struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
/*
* Try to detach bkvhead or head and attach it over any
* available corresponding free channel. It can be that
* a previous RCU batch is in progress, it means that
* immediately to queue another one is not possible so
* return false to tell caller to retry.
*/
// Try to detach bkvhead or head and attach it over any
// available corresponding free channel. It can be that
// a previous RCU batch is in progress, it means that
// immediately to queue another one is not possible so
// in that case the monitor work is rearmed.
if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
(krcp->head && !krwp->head_free)) {
// Channel 1 corresponds to SLAB ptrs.
// Channel 2 corresponds to vmalloc ptrs.
// Channel 1 corresponds to the SLAB-pointer bulk path.
// Channel 2 corresponds to vmalloc-pointer bulk path.
for (j = 0; j < FREE_N_CHANNELS; j++) {
if (!krwp->bkvhead_free[j]) {
krwp->bkvhead_free[j] = krcp->bkvhead[j];
......@@ -3382,7 +3396,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
}
}
// Channel 3 corresponds to emergency path.
// Channel 3 corresponds to both SLAB and vmalloc
// objects queued on the linked list.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
krcp->head = NULL;
......@@ -3390,65 +3405,35 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
WRITE_ONCE(krcp->count, 0);
/*
* One work is per one batch, so there are three
* "free channels", the batch can handle. It can
* be that the work is in the pending state when
* channels have been detached following by each
* other.
*/
// One work is per one batch, so there are three
// "free channels", the batch can handle. It can
// be that the work is in the pending state when
// channels have been detached following by each
// other.
queue_rcu_work(system_wq, &krwp->rcu_work);
}
// Repeat if any "free" corresponding channel is still busy.
if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
repeat = true;
}
return !repeat;
}
static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
unsigned long flags)
{
// Attempt to start a new batch.
krcp->monitor_todo = false;
if (queue_kfree_rcu_work(krcp)) {
// Success! Our job is done here.
raw_spin_unlock_irqrestore(&krcp->lock, flags);
return;
}
// If there is nothing to detach, it means that our job is
// successfully done here. In case of having at least one
// of the channels that is still busy we should rearm the
// work to repeat an attempt. Because previous batches are
// still in progress.
if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
krcp->monitor_todo = false;
else
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
// Previous RCU batch still in progress, try again later.
krcp->monitor_todo = true;
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
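/*
 * Editorial sketch, not part of this patch: the per-CPU batches drained by
 * kfree_rcu_monitor() above are filled by ordinary kvfree_rcu() calls such
 * as this one.  struct kvfree_demo and kvfree_demo_release() are
 * hypothetical.
 */
struct kvfree_demo {
	struct list_head list;
	struct rcu_head rh;
};

static void kvfree_demo_release(struct kvfree_demo *p)
{
	/* Lands on a bulk channel, or on "Channel 3" when pages are scarce. */
	kvfree_rcu(p, rh);
}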
/*
* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
* It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
*/
static void kfree_rcu_monitor(struct work_struct *work)
{
unsigned long flags;
struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
monitor_work.work);
raw_spin_lock_irqsave(&krcp->lock, flags);
if (krcp->monitor_todo)
kfree_rcu_drain_unlock(krcp, flags);
else
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
static enum hrtimer_restart
schedule_page_work_fn(struct hrtimer *t)
{
struct kfree_rcu_cpu *krcp =
container_of(t, struct kfree_rcu_cpu, hrtimer);
queue_work(system_highpri_wq, &krcp->page_cache_work);
queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
return HRTIMER_NORESTART;
}
......@@ -3457,12 +3442,16 @@ static void fill_page_cache_func(struct work_struct *work)
struct kvfree_rcu_bulk_data *bnode;
struct kfree_rcu_cpu *krcp =
container_of(work, struct kfree_rcu_cpu,
page_cache_work);
page_cache_work.work);
unsigned long flags;
int nr_pages;
bool pushed;
int i;
for (i = 0; i < rcu_min_cached_objs; i++) {
nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
1 : rcu_min_cached_objs;
for (i = 0; i < nr_pages; i++) {
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
......@@ -3479,6 +3468,7 @@ static void fill_page_cache_func(struct work_struct *work)
}
atomic_set(&krcp->work_in_progress, 0);
atomic_set(&krcp->backoff_page_cache_fill, 0);
}
static void
......@@ -3486,10 +3476,15 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
krcp->hrtimer.function = schedule_page_work_fn;
hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
if (atomic_read(&krcp->backoff_page_cache_fill)) {
queue_delayed_work(system_wq,
&krcp->page_cache_work,
msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
} else {
hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
krcp->hrtimer.function = schedule_page_work_fn;
hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
}
}
}
......@@ -3554,11 +3549,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
}
/*
* Queue a request for lazy invocation of appropriate free routine after a
* grace period. Please note there are three paths are maintained, two are the
* main ones that use array of pointers interface and third one is emergency
* one, that is used only when the main path can not be maintained temporary,
* due to memory pressure.
* Queue a request for lazy invocation of the appropriate free routine
* after a grace period. Please note that three paths are maintained,
* two for the common case using arrays of pointers and a third one that
* is used only when the main paths cannot be used, for example, due to
* memory pressure.
*
* Each kvfree_call_rcu() request is added to a batch. The batch will be drained
* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
......@@ -3647,6 +3642,8 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
count += READ_ONCE(krcp->count);
count += READ_ONCE(krcp->nr_bkv_objs);
atomic_set(&krcp->backoff_page_cache_fill, 1);
}
return count;
......@@ -3656,18 +3653,14 @@ static unsigned long
kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
int cpu, freed = 0;
unsigned long flags;
for_each_possible_cpu(cpu) {
int count;
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
count = krcp->count;
raw_spin_lock_irqsave(&krcp->lock, flags);
if (krcp->monitor_todo)
kfree_rcu_drain_unlock(krcp, flags);
else
raw_spin_unlock_irqrestore(&krcp->lock, flags);
count += drain_page_cache(krcp);
kfree_rcu_monitor(&krcp->monitor_work.work);
sc->nr_to_scan -= count;
freed += count;
......@@ -3695,7 +3688,8 @@ void __init kfree_rcu_scheduler_running(void)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
raw_spin_lock_irqsave(&krcp->lock, flags);
if (!krcp->head || krcp->monitor_todo) {
if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
krcp->monitor_todo) {
raw_spin_unlock_irqrestore(&krcp->lock, flags);
continue;
}
......@@ -3752,10 +3746,12 @@ static int rcu_blocking_is_gp(void)
* read-side critical sections have completed. Note, however, that
* upon return from synchronize_rcu(), the caller might well be executing
* concurrently with new RCU read-side critical sections that began while
* synchronize_rcu() was waiting. RCU read-side critical sections are
* delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
* In addition, regions of code across which interrupts, preemption, or
* softirqs have been disabled also serve as RCU read-side critical
* synchronize_rcu() was waiting.
*
* RCU read-side critical sections are delimited by rcu_read_lock()
* and rcu_read_unlock(), and may be nested. In addition, but only in
* v5.0 and later, regions of code across which interrupts, preemption,
* or softirqs have been disabled also serve as RCU read-side critical
* sections. This includes hardware interrupt handlers, softirq handlers,
* and NMI handlers.
*
......@@ -3776,6 +3772,9 @@ static int rcu_blocking_is_gp(void)
* to have executed a full memory barrier during the execution of
* synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
* again only if the system has more than one CPU).
*
* Implementation of these memory-ordering guarantees is described here:
* Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
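/*
 * Editorial sketch, not part of this patch: the classic updater pattern
 * that relies on the guarantee documented above.  struct sync_demo_node,
 * sync_demo_list, and sync_demo_lock are hypothetical.
 */
struct sync_demo_node {
	struct list_head list;
	int key;
};

static LIST_HEAD(sync_demo_list);
static DEFINE_SPINLOCK(sync_demo_lock);

static void sync_demo_remove(struct sync_demo_node *p)
{
	spin_lock(&sync_demo_lock);
	list_del_rcu(&p->list);		/* Unpublish the node. */
	spin_unlock(&sync_demo_lock);

	synchronize_rcu();		/* Wait for pre-existing readers. */
	kfree(p);			/* No reader can still reference it. */
}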
void synchronize_rcu(void)
{
......@@ -3846,11 +3845,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
/**
* poll_state_synchronize_rcu - Conditionally wait for an RCU grace period
*
* @oldstate: return from call to get_state_synchronize_rcu() or start_poll_synchronize_rcu()
* @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
*
* If a full RCU grace period has elapsed since the earlier call from
* which oldstate was obtained, return @true, otherwise return @false.
* If @false is returned, it is the caller's responsibilty to invoke this
* If @false is returned, it is the caller's responsibility to invoke this
* function later on until it does return @true. Alternatively, the caller
* can explicitly wait for a grace period, for example, by passing @oldstate
* to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
......@@ -3862,6 +3861,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
* (many hours even on 32-bit systems) should check them occasionally
* and either refresh them or set a flag indicating that the grace period
* has completed.
*
* This function provides the same memory-ordering guarantees that
* would be provided by a synchronize_rcu() that was invoked at the call
* to the function that provided @oldstate, and that returned at the end
* of this function.
*/
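/*
 * Editorial sketch, not part of this patch: a non-blocking user of the
 * polling interface documented above.  poll_demo_cookie and
 * poll_demo_try_reclaim() are hypothetical.
 */
static unsigned long poll_demo_cookie;

static void poll_demo_begin(void)
{
	/* Snapshot and kick off a grace period without blocking. */
	poll_demo_cookie = start_poll_synchronize_rcu();
}

static bool poll_demo_try_reclaim(void)
{
	if (!poll_state_synchronize_rcu(poll_demo_cookie))
		return false;	/* Grace period still in progress; retry later. */
	/* Safe to free anything unpublished before poll_demo_begin(). */
	return true;
}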
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
......@@ -3876,7 +3880,7 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
/**
* cond_synchronize_rcu - Conditionally wait for an RCU grace period
*
* @oldstate: return value from earlier call to get_state_synchronize_rcu()
* @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
*
* If a full RCU grace period has elapsed since the earlier call to
* get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
......@@ -3886,6 +3890,11 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
* counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!),
* so waiting for one additional grace period should be just fine.
*
* This function provides the same memory-ordering guarantees that
* would be provided by a synchronize_rcu() that was invoked at the call
* to the function that provided @oldstate, and that returned at the end
* of this function.
*/
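/*
 * Editorial sketch, not part of this patch: cond_synchronize_rcu() as the
 * blocking counterpart of the polling pattern above.  cond_demo_update()
 * is hypothetical.
 */
static void cond_demo_update(void)
{
	unsigned long cookie = get_state_synchronize_rcu();

	/* ... unpublish an element of an RCU-protected structure ... */

	cond_synchronize_rcu(cookie);	/* No-op if a full GP already elapsed. */

	/* ... now free the unpublished element ... */
}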
void cond_synchronize_rcu(unsigned long oldstate)
{
......@@ -3913,7 +3922,7 @@ static int rcu_pending(int user)
check_cpu_stall(rdp);
/* Does this CPU need a deferred NOCB wakeup? */
if (rcu_nocb_need_deferred_wakeup(rdp))
if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE))
return 1;
/* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
......@@ -4096,7 +4105,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
/*
* Propagate ->qsinitmask bits up the rcu_node tree to account for the
* first CPU in a given leaf rcu_node structure coming online. The caller
* must hold the corresponding leaf rcu_node ->lock with interrrupts
* must hold the corresponding leaf rcu_node ->lock with interrupts
* disabled.
*/
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
......@@ -4191,7 +4200,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcu_prepare_kthreads(cpu);
rcu_spawn_one_boost_kthread(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
......@@ -4474,6 +4483,7 @@ static int __init rcu_spawn_gp_kthread(void)
wake_up_process(t);
rcu_spawn_nocb_kthreads();
rcu_spawn_boost_kthreads();
rcu_spawn_core_kthreads();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
......@@ -4584,11 +4594,25 @@ static void __init rcu_init_one(void)
* replace the definitions in tree.h because those are needed to size
* the ->node array in the rcu_state structure.
*/
static void __init rcu_init_geometry(void)
void rcu_init_geometry(void)
{
ulong d;
int i;
static unsigned long old_nr_cpu_ids;
int rcu_capacity[RCU_NUM_LVLS];
static bool initialized;
if (initialized) {
/*
* Warn if setup_nr_cpu_ids() had not yet been invoked,
* unless nr_cpus_ids == NR_CPUS, in which case who cares?
*/
WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
return;
}
old_nr_cpu_ids = nr_cpu_ids;
initialized = true;
/*
* Initialize any unspecified boot parameters.
......@@ -4689,6 +4713,18 @@ static void __init kfree_rcu_batch_init(void)
int cpu;
int i;
/* Clamp it to [0:100] seconds interval. */
if (rcu_delay_page_cache_fill_msec < 0 ||
rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
rcu_delay_page_cache_fill_msec =
clamp(rcu_delay_page_cache_fill_msec, 0,
(int) (100 * MSEC_PER_SEC));
pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
rcu_delay_page_cache_fill_msec);
}
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
......@@ -4698,7 +4734,7 @@ static void __init kfree_rcu_batch_init(void)
}
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
if (register_shrinker(&kfree_rcu_shrinker))
......@@ -4732,12 +4768,11 @@ void __init rcu_init(void)
rcutree_online_cpu(cpu);
}
/* Create workqueue for expedited GPs and for Tree SRCU. */
/* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_par_gp_wq);
srcu_init();
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
......
......@@ -115,6 +115,7 @@ struct rcu_node {
/* boosting for this rcu_node structure. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
unsigned long n_boosts; /* Number of boosts for this rcu_node structure. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
......@@ -153,7 +154,7 @@ struct rcu_data {
unsigned long gp_seq; /* Track rsp->gp_seq counter. */
unsigned long gp_seq_needed; /* Track furthest future GP request. */
union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
bool core_needs_qs; /* Core waits for quiesc state. */
bool core_needs_qs; /* Core waits for quiescent state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
......@@ -218,7 +219,6 @@ struct rcu_data {
/* The following fields are used by GP kthread, hence own cacheline. */
raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp;
struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */
u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */
u8 nocb_gp_bypass; /* Found a bypass on last scan? */
u8 nocb_gp_gp; /* GP to wait for on last scan? */
......@@ -257,10 +257,10 @@ struct rcu_data {
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
#define RCU_NOCB_WAKE_OFF -1
#define RCU_NOCB_WAKE_NOT 0
#define RCU_NOCB_WAKE 1
#define RCU_NOCB_WAKE_FORCE 2
#define RCU_NOCB_WAKE_BYPASS 1
#define RCU_NOCB_WAKE 2
#define RCU_NOCB_WAKE_FORCE 3
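/*
 * Editorial note, not part of this patch: these levels are compared
 * numerically, for example by rcu_nocb_need_deferred_wakeup(rdp,
 * RCU_NOCB_WAKE), so the ordering below is assumed throughout.
 */
static_assert(RCU_NOCB_WAKE_NOT < RCU_NOCB_WAKE_BYPASS);
static_assert(RCU_NOCB_WAKE_BYPASS < RCU_NOCB_WAKE);
static_assert(RCU_NOCB_WAKE < RCU_NOCB_WAKE_FORCE);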
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
......@@ -417,8 +417,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static bool rcu_is_callbacks_kthread(void);
static void rcu_cpu_kthread_setup(unsigned int cpu);
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
static void __init rcu_spawn_boost_kthreads(void);
static void rcu_prepare_kthreads(int cpu);
static void rcu_cleanup_after_idle(void);
static void rcu_prepare_for_idle(void);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
......@@ -434,7 +434,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
bool *was_alldone, unsigned long flags);
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
......
......@@ -33,10 +33,6 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
return false;
}
static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
{
return (timer_curr_running(&rdp->nocb_timer) && !in_irq());
}
#else
static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
{
......@@ -48,11 +44,6 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
return false;
}
static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
{
return false;
}
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
......@@ -72,8 +63,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
rcu_lockdep_is_held_nocb(rdp) ||
(rdp == this_cpu_ptr(&rcu_data) &&
!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
rcu_current_is_nocb_kthread(rdp) ||
rcu_running_nocb_timer(rdp)),
rcu_current_is_nocb_kthread(rdp)),
"Unsafe read of RCU_NOCB offloaded state"
);
......@@ -1098,6 +1088,7 @@ static int rcu_boost(struct rcu_node *rnp)
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
rnp->n_boosts++;
return READ_ONCE(rnp->exp_tasks) != NULL ||
READ_ONCE(rnp->boost_tasks) != NULL;
......@@ -1197,22 +1188,16 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
*/
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
int rnp_index = rnp - rcu_get_root();
unsigned long flags;
int rnp_index = rnp - rcu_get_root();
struct sched_param sp;
struct task_struct *t;
if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
return;
if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
return;
rcu_state.boost = 1;
if (rnp->boost_kthread_task != NULL)
return;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (WARN_ON_ONCE(IS_ERR(t)))
......@@ -1264,17 +1249,8 @@ static void __init rcu_spawn_boost_kthreads(void)
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp)
rcu_spawn_one_boost_kthread(rnp);
}
static void rcu_prepare_kthreads(int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_node *rnp = rdp->mynode;
/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
if (rcu_scheduler_fully_active)
rcu_spawn_one_boost_kthread(rnp);
if (rcu_rnp_online_cpus(rnp))
rcu_spawn_one_boost_kthread(rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
......@@ -1294,15 +1270,15 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
static void __init rcu_spawn_boost_kthreads(void)
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
static void rcu_prepare_kthreads(int cpu)
static void __init rcu_spawn_boost_kthreads(void)
{
}
......@@ -1535,13 +1511,10 @@ static void rcu_cleanup_after_idle(void)
static int __init rcu_nocb_setup(char *str)
{
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
if (!strcasecmp(str, "all")) /* legacy: use "0-N" instead */
if (cpulist_parse(str, rcu_nocb_mask)) {
pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
cpumask_setall(rcu_nocb_mask);
else
if (cpulist_parse(str, rcu_nocb_mask)) {
pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
cpumask_setall(rcu_nocb_mask);
}
}
return 1;
}
__setup("rcu_nocbs=", rcu_nocb_setup);
......@@ -1692,43 +1665,50 @@ bool rcu_is_nocb_cpu(int cpu)
return false;
}
/*
* Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
* and this function releases it.
*/
static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
unsigned long flags)
__releases(rdp->nocb_lock)
static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
struct rcu_data *rdp,
bool force, unsigned long flags)
__releases(rdp_gp->nocb_gp_lock)
{
bool needwake = false;
struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
lockdep_assert_held(&rdp->nocb_lock);
if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("AlreadyAwake"));
return false;
}
if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) {
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
del_timer(&rdp->nocb_timer);
if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
del_timer(&rdp_gp->nocb_timer);
}
rcu_nocb_unlock_irqrestore(rdp, flags);
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
needwake = true;
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
}
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
if (needwake)
if (needwake) {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
wake_up_process(rdp_gp->nocb_gp_kthread);
}
return needwake;
}
/*
* Kick the GP kthread for this NOCB group.
*/
static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
{
unsigned long flags;
struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
return __wake_nocb_gp(rdp_gp, rdp, force, flags);
}
/*
* Arrange to wake the GP kthread for this NOCB group at some future
* time when it is safe to do so.
......@@ -1736,12 +1716,27 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
const char *reason)
{
if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF)
return;
if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
mod_timer(&rdp->nocb_timer, jiffies + 1);
if (rdp->nocb_defer_wakeup < waketype)
WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
unsigned long flags;
struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
/*
* Bypass wakeup overrides previous deferments. In case
* of callback storm, no need to wake up too early.
*/
if (waketype == RCU_NOCB_WAKE_BYPASS) {
mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
} else {
if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
if (rdp_gp->nocb_defer_wakeup < waketype)
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
}
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
}
......@@ -1940,7 +1935,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
}
/*
* Awaken the no-CBs grace-period kthead if needed, either due to it
* Awaken the no-CBs grace-period kthread if needed, either due to it
* legitimately being asleep or due to overload conditions.
*
* If warranted, also wake up the kthread servicing this CPUs queues.
......@@ -1968,13 +1963,14 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
rdp->qlen_last_fqs_check = len;
if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
wake_nocb_gp(rdp, false, flags);
rcu_nocb_unlock_irqrestore(rdp, flags);
wake_nocb_gp(rdp, false);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeEmpty"));
} else {
rcu_nocb_unlock_irqrestore(rdp, flags);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
TPS("WakeEmptyIsDeferred"));
rcu_nocb_unlock_irqrestore(rdp, flags);
}
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
/* ... or if many callbacks queued. */
......@@ -1989,10 +1985,14 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
smp_mb(); /* Enqueue before timer_pending(). */
if ((rdp->nocb_cb_sleep ||
!rcu_segcblist_ready_cbs(&rdp->cblist)) &&
!timer_pending(&rdp->nocb_bypass_timer))
!timer_pending(&rdp->nocb_timer)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
TPS("WakeOvfIsDeferred"));
rcu_nocb_unlock_irqrestore(rdp, flags);
} else {
rcu_nocb_unlock_irqrestore(rdp, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
}
} else {
rcu_nocb_unlock_irqrestore(rdp, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
......@@ -2000,18 +2000,6 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
return;
}
/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
{
unsigned long flags;
struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
rcu_nocb_lock_irqsave(rdp, flags);
smp_mb__after_spinlock(); /* Timer expire before wakeup. */
__call_rcu_nocb_wake(rdp, true, flags);
}
/*
* Check if we ignore this rdp.
*
......@@ -2118,11 +2106,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
bypass = true;
}
rnp = rdp->mynode;
if (bypass) { // Avoid race with first bypass CB.
WRITE_ONCE(my_rdp->nocb_defer_wakeup,
RCU_NOCB_WAKE_NOT);
del_timer(&my_rdp->nocb_timer);
}
// Advance callbacks if helpful and low contention.
needwake_gp = false;
if (!rcu_segcblist_restempty(&rdp->cblist,
......@@ -2168,12 +2152,12 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
my_rdp->nocb_gp_bypass = bypass;
my_rdp->nocb_gp_gp = needwait_gp;
my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
if (bypass && !rcu_nocb_poll) {
// At least one child with non-empty ->nocb_bypass, so set
// timer in order to avoid stranding its callbacks.
raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
TPS("WakeBypassIsDeferred"));
}
if (rcu_nocb_poll) {
/* Polling, so trace if first poll in the series. */
......@@ -2197,8 +2181,10 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
}
if (!rcu_nocb_poll) {
raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
if (bypass)
del_timer(&my_rdp->nocb_bypass_timer);
if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
del_timer(&my_rdp->nocb_timer);
}
WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
}
......@@ -2334,25 +2320,27 @@ static int rcu_nocb_cb_kthread(void *arg)
}
/* Is a deferred wakeup of rcu_nocb_kthread() required? */
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
{
return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT;
return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
struct rcu_data *rdp, int level,
unsigned long flags)
__releases(rdp_gp->nocb_gp_lock)
{
unsigned long flags;
int ndw;
int ret;
rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_nocb_need_deferred_wakeup(rdp)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
return false;
}
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
ndw = rdp_gp->nocb_defer_wakeup;
ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
return ret;
......@@ -2361,9 +2349,15 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
{
unsigned long flags;
struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
do_nocb_deferred_wakeup_common(rdp);
WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
smp_mb__after_spinlock(); /* Timer expire before wakeup. */
do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
}
/*
......@@ -2373,9 +2367,14 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
*/
static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
if (rcu_nocb_need_deferred_wakeup(rdp))
return do_nocb_deferred_wakeup_common(rdp);
return false;
unsigned long flags;
struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
return false;
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
}
void rcu_nocb_flush_deferred_wakeup(void)
......@@ -2443,17 +2442,15 @@ static long rcu_nocb_rdp_deoffload(void *arg)
swait_event_exclusive(rdp->nocb_state_wq,
!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
SEGCBLIST_KTHREAD_GP));
rcu_nocb_lock_irqsave(rdp, flags);
/* Make sure nocb timer won't stay around */
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF);
rcu_nocb_unlock_irqrestore(rdp, flags);
del_timer_sync(&rdp->nocb_timer);
/*
* Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY with CB unlocked
* and IRQs disabled but let's be paranoid.
* Lock one last time to acquire latest callback updates from kthreads
* so we can later handle callbacks locally without locking.
*/
rcu_nocb_lock_irqsave(rdp, flags);
/*
* Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb
* lock is released but how about being paranoid for once?
*/
rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
/*
* With SEGCBLIST_SOFTIRQ_ONLY, we can't use
......@@ -2473,10 +2470,6 @@ int rcu_nocb_cpu_deoffload(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int ret = 0;
if (rdp == rdp->nocb_gp_rdp) {
pr_info("Can't deoffload an rdp GP leader (yet)\n");
return -EINVAL;
}
mutex_lock(&rcu_state.barrier_mutex);
cpus_read_lock();
if (rcu_rdp_is_offloaded(rdp)) {
......@@ -2517,8 +2510,7 @@ static long rcu_nocb_rdp_offload(void *arg)
* SEGCBLIST_SOFTIRQ_ONLY mode.
*/
raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
/* Re-enable nocb timer */
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
/*
* We didn't take the nocb lock while working on the
* rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
......@@ -2626,7 +2618,6 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
raw_spin_lock_init(&rdp->nocb_bypass_lock);
raw_spin_lock_init(&rdp->nocb_gp_lock);
timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
rcu_cblist_init(&rdp->nocb_bypass);
}
......@@ -2785,13 +2776,12 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
{
struct rcu_node *rnp = rdp->mynode;
pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
rdp->cpu,
"kK"[!!rdp->nocb_gp_kthread],
"lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
"dD"[!!rdp->nocb_defer_wakeup],
"tT"[timer_pending(&rdp->nocb_timer)],
"bB"[timer_pending(&rdp->nocb_bypass_timer)],
"sS"[!!rdp->nocb_gp_sleep],
".W"[swait_active(&rdp->nocb_gp_wq)],
".W"[swait_active(&rnp->nocb_gp_wq[0])],
......@@ -2812,7 +2802,6 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
char bufr[20];
struct rcu_segcblist *rsclp = &rdp->cblist;
bool waslocked;
bool wastimer;
bool wassleep;
if (rdp->nocb_gp_rdp == rdp)
......@@ -2849,15 +2838,13 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
return;
waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
wastimer = timer_pending(&rdp->nocb_bypass_timer);
wassleep = swait_active(&rdp->nocb_gp_wq);
if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep)
return; /* Nothing untowards. */
if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
return; /* Nothing untoward. */
pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n",
pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
"lL"[waslocked],
"dD"[!!rdp->nocb_defer_wakeup],
"tT"[wastimer],
"sS"[!!rdp->nocb_gp_sleep],
".W"[wassleep]);
}
......@@ -2922,7 +2909,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
}
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
{
return false;
}
......
......@@ -314,6 +314,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
* tasks blocked within RCU read-side critical sections.
*/
static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
......@@ -716,6 +717,63 @@ static void check_cpu_stall(struct rcu_data *rdp)
// RCU forward-progress mechanisms, including of callback invocation.
/*
* Check to see if a failure to end RCU priority inversion was due to
* a CPU not passing through a quiescent state. When this happens, there
* is nothing that RCU priority boosting can do to help, so we shouldn't
* count this as an RCU priority boosting failure. A return of true says
* RCU priority boosting is to blame, and false says otherwise. If false
* is returned, the first of the CPUs to blame is stored through cpup.
* If there was no CPU blocking the current grace period, but also nothing
* in need of being boosted, *cpup is set to -1. This can happen in case
* of vCPU preemption while the last CPU is reporting its quiescent state,
* for example.
*
* If cpup is NULL, then a lockless quick check is carried out, suitable
* for high-rate usage. On the other hand, if cpup is non-NULL, each
* rcu_node structure's ->lock is acquired, ruling out high-rate usage.
*/
bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
{
bool atb = false;
int cpu;
unsigned long flags;
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp) {
if (!cpup) {
if (READ_ONCE(rnp->qsmask)) {
return false;
} else {
if (READ_ONCE(rnp->gp_tasks))
atb = true;
continue;
}
}
*cpup = -1;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->gp_tasks)
atb = true;
if (!rnp->qsmask) {
// No CPUs without quiescent states for this rnp.
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
continue;
}
// Find the first holdout CPU.
for_each_leaf_node_possible_cpu(rnp, cpu) {
if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
*cpup = cpu;
return false;
}
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
// Can't blame CPUs, so must blame RCU priority boosting.
return atb;
}
EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
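/*
 * Editorial sketch, not part of this patch: how a torture-test caller
 * might consume rcu_check_boost_fail() as documented above.
 * boost_demo_report() and its gp_state argument are hypothetical.
 */
static void boost_demo_report(unsigned long gp_state)
{
	int cpu;

	if (rcu_check_boost_fail(gp_state, &cpu))
		pr_info("RCU priority boosting is to blame\n");
	else if (cpu == -1)
		pr_info("Nothing blocking the grace period, nothing to boost\n");
	else
		pr_info("CPU %d has not yet reported a quiescent state\n", cpu);
}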
/*
* Show the state of the grace-period kthreads.
*/
......@@ -726,6 +784,7 @@ void show_rcu_gp_kthreads(void)
unsigned long j;
unsigned long ja;
unsigned long jr;
unsigned long js;
unsigned long jw;
struct rcu_data *rdp;
struct rcu_node *rnp;
......@@ -734,21 +793,30 @@ void show_rcu_gp_kthreads(void)
j = jiffies;
ja = j - data_race(rcu_state.gp_activity);
jr = j - data_race(rcu_state.gp_req_activity);
js = j - data_race(rcu_state.gp_start);
jw = j - data_race(rcu_state.gp_wake_time);
pr_info("%s: wait state: %s(%d) ->state: %#x delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
rcu_state.name, gp_state_getname(rcu_state.gp_state),
rcu_state.gp_state, t ? t->__state : 0x1ffff,
ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
rcu_state.gp_state, t ? t->__state : 0x1ffffL, t ? t->rt_priority : 0xffU,
js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
(long)data_race(rcu_state.gp_seq),
(long)data_race(rcu_get_root()->gp_seq_needed),
data_race(rcu_state.gp_max),
data_race(rcu_state.gp_flags));
rcu_for_each_node_breadth_first(rnp) {
if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
READ_ONCE(rnp->gp_seq_needed)))
if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
!data_race(rnp->qsmask) && !data_race(rnp->boost_tasks) &&
!data_race(rnp->exp_tasks) && !data_race(rnp->gp_tasks))
continue;
pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq),
(long)data_race(rnp->gp_seq_needed));
pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
rnp->grplo, rnp->grphi,
(long)data_race(rnp->gp_seq), (long)data_race(rnp->gp_seq_needed),
data_race(rnp->qsmask),
".b"[!!data_race(rnp->boost_kthread_task)],
".B"[!!data_race(rnp->boost_tasks)],
".E"[!!data_race(rnp->exp_tasks)],
".G"[!!data_race(rnp->gp_tasks)],
data_race(rnp->n_boosts));
if (!rcu_is_leaf_node(rnp))
continue;
for_each_leaf_node_possible_cpu(rnp, cpu) {
......
......@@ -277,7 +277,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
noinstr int notrace debug_lockdep_rcu_enabled(void)
{
return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && READ_ONCE(debug_locks) &&
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
......@@ -524,6 +524,7 @@ static void test_callback(struct rcu_head *r)
}
DEFINE_STATIC_SRCU(early_srcu);
static unsigned long early_srcu_cookie;
struct early_boot_kfree_rcu {
struct rcu_head rh;
......@@ -536,8 +537,10 @@ static void early_boot_test_call_rcu(void)
struct early_boot_kfree_rcu *rhp;
call_rcu(&head, test_callback);
if (IS_ENABLED(CONFIG_SRCU))
if (IS_ENABLED(CONFIG_SRCU)) {
early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
call_srcu(&early_srcu, &shead, test_callback);
}
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (!WARN_ON_ONCE(!rhp))
kfree_rcu(rhp, rh);
......@@ -563,6 +566,7 @@ static int rcu_verify_early_boot_tests(void)
if (IS_ENABLED(CONFIG_SRCU)) {
early_boot_test_counter++;
srcu_barrier(&early_srcu);
WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
}
}
if (rcu_self_test_counter != early_boot_test_counter) {
......
......@@ -1237,20 +1237,6 @@ int try_to_del_timer_sync(struct timer_list *timer)
}
EXPORT_SYMBOL(try_to_del_timer_sync);
bool timer_curr_running(struct timer_list *timer)
{
int i;
for (i = 0; i < NR_BASES; i++) {
struct timer_base *base = this_cpu_ptr(&timer_bases[i]);
if (base->running_timer == timer)
return true;
}
return false;
}
#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
......
......@@ -581,6 +581,14 @@ static const char *bitmap_parse_region(const char *str, struct region *r)
{
unsigned int lastbit = r->nbits - 1;
if (!strncasecmp(str, "all", 3)) {
r->start = 0;
r->end = lastbit;
str += 3;
goto check_pattern;
}
str = bitmap_getnum(str, &r->start, lastbit);
if (IS_ERR(str))
return str;
......@@ -595,6 +603,7 @@ static const char *bitmap_parse_region(const char *str, struct region *r)
if (IS_ERR(str))
return str;
check_pattern:
if (end_of_region(*str))
goto no_pattern;
......
......@@ -366,6 +366,13 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = {
{0, "0-31:1/3,1-31:1/3,2-31:1/3", &exp1[8 * step], 32, 0},
{0, "1-10:8/12,8-31:24/29,0-31:0/3", &exp1[9 * step], 32, 0},
{0, "all", &exp1[8 * step], 32, 0},
{0, "0, 1, all, ", &exp1[8 * step], 32, 0},
{0, "all:1/2", &exp1[4 * step], 32, 0},
{0, "ALL:1/2", &exp1[4 * step], 32, 0},
{-EINVAL, "al", NULL, 8, 0},
{-EINVAL, "alll", NULL, 8, 0},
{-EINVAL, "-1", NULL, 8, 0},
{-EINVAL, "-0", NULL, 8, 0},
{-EINVAL, "10-1", NULL, 8, 0},
......
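/*
 * Editorial sketch, not part of this patch: exercising the new "all"
 * keyword from C, mirroring the parselist test cases above.
 * bitmap_all_demo() is hypothetical.
 */
#include <linux/bitmap.h>
#include <linux/printk.h>

static void bitmap_all_demo(void)
{
	DECLARE_BITMAP(mask, 32);

	/* "all" selects every bit, equivalent to "0-31" here. */
	if (!bitmap_parselist("all", mask, 32))
		pr_info("weight after \"all\": %d\n", bitmap_weight(mask, 32));

	/* Group patterns still apply: every second bit of the full range. */
	if (!bitmap_parselist("all:1/2", mask, 32))
		pr_info("weight after \"all:1/2\": %d\n", bitmap_weight(mask, 32));
}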
......@@ -922,7 +922,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
continue;
}
/*
* No kthead_use_mm() user needs to read from the userspace so
* No kthread_use_mm() user needs to read from the userspace so
* we are ok to reap it.
*/
if (unlikely(p->flags & PF_KTHREAD))
......
......@@ -634,6 +634,7 @@ struct kmem_obj_info {
struct kmem_cache *kp_slab_cache;
void *kp_ret;
void *kp_stack[KS_ADDRS_COUNT];
void *kp_free_stack[KS_ADDRS_COUNT];
};
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
#endif
......
......@@ -575,7 +575,7 @@ EXPORT_SYMBOL_GPL(kmem_valid_obj);
* depends on the type of object and on how much debugging is enabled.
* For a slab-cache object, the fact that it is a slab object is printed,
* and, if available, the slab name, return address, and stack trace from
* the allocation of that object.
* the allocation and last free path of that object.
*
* This function will splat if passed a pointer to a non-slab object.
* If you are not sure what type of object you have, you should instead
......@@ -620,6 +620,16 @@ void kmem_dump_obj(void *object)
break;
pr_info(" %pS\n", kp.kp_stack[i]);
}
if (kp.kp_free_stack[0])
pr_cont(" Free path:\n");
for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
if (!kp.kp_free_stack[i])
break;
pr_info(" %pS\n", kp.kp_free_stack[i]);
}
}
EXPORT_SYMBOL_GPL(kmem_dump_obj);
#endif
......
......@@ -4045,6 +4045,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
!(s->flags & SLAB_STORE_USER))
return;
#ifdef CONFIG_SLUB_DEBUG
objp = fixup_red_left(s, objp);
trackp = get_track(s, objp, TRACK_ALLOC);
kpp->kp_ret = (void *)trackp->addr;
#ifdef CONFIG_STACKTRACE
......@@ -4053,6 +4054,13 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
if (!kpp->kp_stack[i])
break;
}
trackp = get_track(s, objp, TRACK_FREE);
for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
kpp->kp_free_stack[i] = (void *)trackp->addrs[i];
if (!kpp->kp_free_stack[i])
break;
}
#endif
#endif
}
......
......@@ -983,7 +983,7 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
* depends on the type of object and on how much debugging is enabled.
* For example, for a slab-cache object, the slab name is printed, and,
* if available, the return address and stack trace from the allocation
* of that object.
* and last free path of that object.
*/
void mem_dump_obj(void *object)
{
......
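/*
 * Editorial sketch, not part of this patch: with this series,
 * mem_dump_obj() reports the last-free path in addition to the allocation
 * stack, which is handy when chasing use-after-free reports.
 * dump_demo_suspect() and its argument are hypothetical.
 */
static void dump_demo_suspect(void *suspect)
{
	/* Prints slab-cache name, allocation stack and, if available, free path. */
	mem_dump_obj(suspect);
}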
#!/usr/bin/env drgn
# SPDX-License-Identifier: GPL-2.0+
#
# Dump out the number of RCU callbacks outstanding.
#
# On older kernels having multiple flavors of RCU, this dumps out the
# number of callbacks for the most heavily used flavor.
#
# Usage: sudo drgn rcu-cbs.py
#
# Copyright (C) 2021 Facebook, Inc.
#
# Authors: Paul E. McKenney <paulmck@kernel.org>
import sys
import drgn
from drgn import NULL, Object
from drgn.helpers.linux import *
def get_rdp0(prog):
try:
rdp0 = prog.variable('rcu_preempt_data', 'kernel/rcu/tree.c');
except LookupError:
rdp0 = NULL;
if rdp0 == NULL:
try:
rdp0 = prog.variable('rcu_sched_data',
'kernel/rcu/tree.c');
except LookupError:
rdp0 = NULL;
if rdp0 == NULL:
rdp0 = prog.variable('rcu_data', 'kernel/rcu/tree.c');
return rdp0.address_of_();
rdp0 = get_rdp0(prog);
# Sum up RCU callbacks.
sum = 0;
for cpu in for_each_possible_cpu(prog):
rdp = per_cpu_ptr(rdp0, cpu);
len = rdp.cblist.len.value_();
# print("CPU " + str(cpu) + " RCU callbacks: " + str(len));
sum += len;
print("Number of RCU callbacks in flight: " + str(sum));
......@@ -29,7 +29,7 @@ then
echo "Usage: $scriptname /path/to/old/run [ options ]"
exit 1
fi
if ! cp "$oldrun/batches" $T/batches.oldrun
if ! cp "$oldrun/scenarios" $T/scenarios.oldrun
then
# Later on, can reconstitute this from console.log files.
echo Prior run batches file does not exist: $oldrun/batches
......@@ -143,6 +143,8 @@ then
usage
fi
rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log
touch "$rundir/log"
echo $scriptname $args | tee -a "$rundir/log"
echo $oldrun > "$rundir/re-run"
if ! test -d "$rundir/../../bin"
then
......@@ -165,22 +167,12 @@ done
grep '^#' $i | sed -e 's/^# //' > $T/qemu-cmd-settings
. $T/qemu-cmd-settings
grep -v '^#' $T/batches.oldrun | awk '
BEGIN {
oldbatch = 1;
}
grep -v '^#' $T/scenarios.oldrun | awk '
{
if (oldbatch != $1) {
print "kvm-test-1-run-batch.sh" curbatch;
curbatch = "";
oldbatch = $1;
}
curbatch = curbatch " " $2;
}
END {
print "kvm-test-1-run-batch.sh" curbatch
curbatch = "";
for (i = 2; i <= NF; i++)
curbatch = curbatch " " $i;
print "kvm-test-1-run-batch.sh" curbatch;
}' > $T/runbatches.sh
if test -n "$dryrun"
......@@ -188,12 +180,5 @@ then
echo ---- Dryrun complete, directory: $rundir | tee -a "$rundir/log"
else
( cd "$rundir"; sh $T/runbatches.sh )
kcsan-collapse.sh "$rundir" | tee -a "$rundir/log"
echo | tee -a "$rundir/log"
echo ---- Results directory: $rundir | tee -a "$rundir/log"
kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1
ret=$?
cat $T/kvm-recheck.sh.out | tee -a "$rundir/log"
echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log"
exit $ret
kvm-end-run-stats.sh "$rundir" "$starttime"
fi
......@@ -40,8 +40,10 @@ if test $retval -gt 1
then
exit 2
fi
ncpus=`cpus2use.sh`
make -j$ncpus $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1
# Tell "make" to use double the number of real CPUs on the build system.
ncpus="`getconf _NPROCESSORS_ONLN`"
make -j$((2 * ncpus)) $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1
retval=$?
if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | egrep -q "Stop|Error|error:|warning:" || egrep -q "Stop|Error|error:" < $resdir/Make.out
then
......
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0+
#
# Check the status of the specified run.
#
# Usage: kvm-end-run-stats.sh /path/to/run starttime
#
# Copyright (C) 2021 Facebook, Inc.
#
# Authors: Paul E. McKenney <paulmck@kernel.org>
# scriptname=$0
# args="$*"
rundir="$1"
if ! test -d "$rundir"
then
echo kvm-end-run-stats.sh: Specified run directory does not exist: $rundir
exit 1
fi
T=${TMPDIR-/tmp}/kvm-end-run-stats.sh.$$
trap 'rm -rf $T' 0
mkdir $T
KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
PATH=${KVM}/bin:$PATH; export PATH
. functions.sh
default_starttime="`get_starttime`"
starttime="${2-$default_starttime}"
echo | tee -a "$rundir/log"
echo | tee -a "$rundir/log"
echo " --- `date` Test summary:" | tee -a "$rundir/log"
echo Results directory: $rundir | tee -a "$rundir/log"
kcsan-collapse.sh "$rundir" | tee -a "$rundir/log"
kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1
ret=$?
cat $T/kvm-recheck.sh.out | tee -a "$rundir/log"
echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log"
exit $ret
......@@ -43,7 +43,7 @@ then
else
echo No build errors.
fi
if grep -q -e "--buildonly" < ${rundir}/log
if grep -q -e "--build-\?only" < ${rundir}/log && ! test -f "${rundir}/remote-log"
then
echo Build-only run, no console logs to check.
exit $editorret
......
......@@ -31,7 +31,7 @@ then
echo "$configfile ------- " $stopstate
else
title="$configfile ------- $ngps GPs"
dur=`sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//' < $i/qemu-cmd 2> /dev/null`
dur=`grep -v '^#' $i/qemu-cmd | sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//'`
if test -z "$dur"
then
:
......
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0+
#
# Run a series of tests on remote systems under KVM.
#
# Usage: kvm-remote.sh "systems" [ <kvm.sh args> ]
# kvm-remote.sh "systems" /path/to/old/run [ <kvm-again.sh args> ]
#
# Copyright (C) 2021 Facebook, Inc.
#
# Authors: Paul E. McKenney <paulmck@kernel.org>
scriptname=$0
args="$*"
if ! test -d tools/testing/selftests/rcutorture/bin
then
echo $scriptname must be run from top-level directory of kernel source tree.
exit 1
fi
KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
PATH=${KVM}/bin:$PATH; export PATH
. functions.sh
starttime="`get_starttime`"
systems="$1"
if test -z "$systems"
then
echo $scriptname: Empty list of systems will go nowhere good, giving up.
exit 1
fi
shift
# Pathnames:
# T: /tmp/kvm-remote.sh.$$
# resdir: /tmp/kvm-remote.sh.$$/res
# rundir: /tmp/kvm-remote.sh.$$/res/$ds ("-remote" suffix)
# oldrun: `pwd`/tools/testing/.../res/$otherds
#
# Pathname segments:
# TD: kvm-remote.sh.$$
# ds: yyyy.mm.dd-hh.mm.ss-remote
TD=kvm-remote.sh.$$
T=${TMPDIR-/tmp}/$TD
trap 'rm -rf $T' 0
mkdir $T
resdir="$T/res"
ds=`date +%Y.%m.%d-%H.%M.%S`-remote
rundir=$resdir/$ds
echo Results directory: $rundir
echo $scriptname $args
if echo $1 | grep -q '^--'
then
# Fresh build. Create a datestamp unless the caller supplied one.
datestamp="`echo "$@" | awk -v ds="$ds" '{
for (i = 1; i < NF; i++) {
if ($i == "--datestamp") {
ds = "";
break;
}
}
if (ds != "")
print "--datestamp " ds;
}'`"
kvm.sh --remote "$@" $datestamp --buildonly > $T/kvm.sh.out 2>&1
ret=$?
if test "$ret" -ne 0
then
echo $scriptname: kvm.sh failed exit code $?
cat $T/kvm.sh.out
exit 2
fi
oldrun="`grep -m 1 "^Results directory: " $T/kvm.sh.out | awk '{ print $3 }'`"
touch "$oldrun/remote-log"
echo $scriptname $args >> "$oldrun/remote-log"
echo | tee -a "$oldrun/remote-log"
echo " ----" kvm.sh output: "(`date`)" | tee -a "$oldrun/remote-log"
cat $T/kvm.sh.out | tee -a "$oldrun/remote-log"
# We are going to run this, so remove the buildonly files.
rm -f "$oldrun"/*/buildonly
kvm-again.sh $oldrun --dryrun --remote --rundir "$rundir" > $T/kvm-again.sh.out 2>&1
ret=$?
if test "$ret" -ne 0
then
echo $scriptname: kvm-again.sh failed exit code $? | tee -a "$oldrun/remote-log"
cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log"
exit 2
fi
else
# Re-use old run.
oldrun="$1"
if ! echo $oldrun | grep -q '^/'
then
oldrun="`pwd`/$oldrun"
fi
shift
touch "$oldrun/remote-log"
echo $scriptname $args >> "$oldrun/remote-log"
kvm-again.sh "$oldrun" "$@" --dryrun --remote --rundir "$rundir" > $T/kvm-again.sh.out 2>&1
ret=$?
if test "$ret" -ne 0
then
echo $scriptname: kvm-again.sh failed exit code $? | tee -a "$oldrun/remote-log"
cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log"
exit 2
fi
cp -a "$rundir" "$KVM/res/"
oldrun="$KVM/res/$ds"
fi
echo | tee -a "$oldrun/remote-log"
echo " ----" kvm-again.sh output: "(`date`)" | tee -a "$oldrun/remote-log"
cat $T/kvm-again.sh.out
echo | tee -a "$oldrun/remote-log"
echo Remote run directory: $rundir | tee -a "$oldrun/remote-log"
echo Local build-side run directory: $oldrun | tee -a "$oldrun/remote-log"
# Create the kvm-remote-N.sh scripts in the bin directory.
awk < "$rundir"/scenarios -v dest="$T/bin" -v rundir="$rundir" '
{
n = $1;
sub(/\./, "", n);
fn = dest "/kvm-remote-" n ".sh"
scenarios = "";
for (i = 2; i <= NF; i++)
scenarios = scenarios " " $i;
print "kvm-test-1-run-batch.sh" scenarios > fn;
print "rm " rundir "/remote.run" >> fn;
}'
chmod +x $T/bin/kvm-remote-*.sh
( cd "`dirname $T`"; tar -chzf $T/binres.tgz "$TD/bin" "$TD/res" )
# Check first to avoid the need for cleanup for system-name typos
for i in $systems
do
ncpus="`ssh $i getconf _NPROCESSORS_ONLN 2> /dev/null`"
echo $i: $ncpus CPUs " " `date` | tee -a "$oldrun/remote-log"
ret=$?
if test "$ret" -ne 0
then
echo System $i unreachable, giving up. | tee -a "$oldrun/remote-log"
exit 4 | tee -a "$oldrun/remote-log"
fi
done
# Download and expand the tarball on all systems.
for i in $systems
do
echo Downloading tarball to $i `date` | tee -a "$oldrun/remote-log"
cat $T/binres.tgz | ssh $i "cd /tmp; tar -xzf -"
ret=$?
if test "$ret" -ne 0
then
echo Unable to download $T/binres.tgz to system $i, giving up. | tee -a "$oldrun/remote-log"
exit 10 | tee -a "$oldrun/remote-log"
fi
done
# Function to check for presence of a file on the specified system.
# Complain if the system cannot be reached, and retry after a wait.
# Currently just waits forever if a machine disappears.
#
# Usage: checkremotefile system pathname
checkremotefile () {
local ret
local sleeptime=60
while :
do
ssh $1 "test -f \"$2\""
ret=$?
if test "$ret" -ne 255
then
return $ret
fi
echo " ---" ssh failure to $1 checking for file $2, retry after $sleeptime seconds. `date`
sleep $sleeptime
done
}
# Function to start batches on idle remote $systems
#
# Usage: startbatches curbatch nbatches
#
# Batches are numbered starting at 1. Returns the next batch to start.
# Be careful to redirect all debug output to FD 2 (stderr).
startbatches () {
local curbatch="$1"
local nbatches="$2"
local ret
# Each pass through the following loop examines one system.
for i in $systems
do
if test "$curbatch" -gt "$nbatches"
then
echo $((nbatches + 1))
return 0
fi
if checkremotefile "$i" "$resdir/$ds/remote.run" 1>&2
then
continue # System still running last test, skip.
fi
ssh "$i" "cd \"$resdir/$ds\"; touch remote.run; PATH=\"$T/bin:$PATH\" nohup kvm-remote-$curbatch.sh > kvm-remote-$curbatch.sh.out 2>&1 &" 1>&2
ret=$?
if test "$ret" -ne 0
then
echo ssh $i failed: exitcode $ret 1>&2
exit 11
fi
echo " ----" System $i Batch `head -n $curbatch < "$rundir"/scenarios | tail -1` `date` 1>&2
curbatch=$((curbatch + 1))
done
echo $curbatch
}
# Launch all the scenarios.
nbatches="`wc -l "$rundir"/scenarios | awk '{ print $1 }'`"
curbatch=1
while test "$curbatch" -le "$nbatches"
do
startbatches $curbatch $nbatches > $T/curbatch 2> $T/startbatches.stderr
curbatch="`cat $T/curbatch`"
if test -s "$T/startbatches.stderr"
then
cat "$T/startbatches.stderr" | tee -a "$oldrun/remote-log"
fi
if test "$curbatch" -le "$nbatches"
then
sleep 30
fi
done
echo All batches started. `date`
# Wait for all remaining scenarios to complete and collect results.
for i in $systems
do
while checkremotefile "$i" "$resdir/$ds/remote.run"
do
sleep 30
done
( cd "$oldrun"; ssh $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu_pid */qemu-retval; rm -rf $T > /dev/null 2>&1" | tar -xzf - )
done
( kvm-end-run-stats.sh "$oldrun" "$starttime"; echo $? > $T/exitcode ) | tee -a "$oldrun/remote-log"
exit "`cat $T/exitcode`"
......@@ -20,6 +20,9 @@ mkdir $T
cd `dirname $scriptname`/../../../../../
# This script knows only English.
LANG=en_US.UTF-8; export LANG
dur=$((30*60))
dryrun=""
KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
......@@ -41,6 +44,7 @@ TORTURE_KCONFIG_KASAN_ARG=""
TORTURE_KCONFIG_KCSAN_ARG=""
TORTURE_KMAKE_ARG=""
TORTURE_QEMU_MEM=512
TORTURE_REMOTE=
TORTURE_SHUTDOWN_GRACE=180
TORTURE_SUITE=rcu
TORTURE_MOD=rcutorture
......@@ -64,7 +68,7 @@ usage () {
echo " --cpus N"
echo " --datestamp string"
echo " --defconfig string"
echo " --dryrun batches|sched|script"
echo " --dryrun batches|scenarios|sched|script"
echo " --duration minutes | <seconds>s | <hours>h | <days>d"
echo " --gdb"
echo " --help"
......@@ -77,6 +81,7 @@ usage () {
echo " --no-initrd"
echo " --qemu-args qemu-arguments"
echo " --qemu-cmd qemu-system-..."
echo " --remote"
echo " --results absolute-pathname"
echo " --torture lock|rcu|rcuscale|refscale|scf"
echo " --trust-make"
......@@ -112,10 +117,13 @@ do
checkarg --cpus "(number)" "$#" "$2" '^[0-9]*$' '^--'
cpus=$2
TORTURE_ALLOTED_CPUS="$2"
max_cpus="`identify_qemu_vcpus`"
if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus"
if test -z "$TORTURE_REMOTE"
then
TORTURE_ALLOTED_CPUS=$max_cpus
max_cpus="`identify_qemu_vcpus`"
if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus"
then
TORTURE_ALLOTED_CPUS=$max_cpus
fi
fi
shift
;;
......@@ -130,7 +138,7 @@ do
shift
;;
--dryrun)
checkarg --dryrun "batches|sched|script" $# "$2" 'batches\|sched\|script' '^--'
checkarg --dryrun "batches|sched|script" $# "$2" 'batches\|scenarios\|sched\|script' '^--'
dryrun=$2
shift
;;
......@@ -206,6 +214,9 @@ do
TORTURE_QEMU_CMD="$2"
shift
;;
--remote)
TORTURE_REMOTE=1
;;
--results)
checkarg --results "(absolute pathname)" "$#" "$2" '^/' '^error'
resdir=$2
......@@ -550,20 +561,7 @@ END {
if (ncpus != 0)
dump(first, i, batchnum);
}' >> $T/script
cat << '___EOF___' >> $T/script
echo | tee -a $TORTURE_RESDIR/log
echo | tee -a $TORTURE_RESDIR/log
echo " --- `date` Test summary:" | tee -a $TORTURE_RESDIR/log
___EOF___
cat << ___EOF___ >> $T/script
echo Results directory: $resdir/$ds | tee -a $resdir/$ds/log
kcsan-collapse.sh $resdir/$ds | tee -a $resdir/$ds/log
kvm-recheck.sh $resdir/$ds > $T/kvm-recheck.sh.out 2>&1
___EOF___
echo 'ret=$?' >> $T/script
echo "cat $T/kvm-recheck.sh.out | tee -a $resdir/$ds/log" >> $T/script
echo 'exit $ret' >> $T/script
echo kvm-end-run-stats.sh "$resdir/$ds" "$starttime" >> $T/script
# Extract the tests and their batches from the script.
egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" |
......@@ -577,6 +575,25 @@ egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" |
print batchno, $1, $2
}' > $T/batches
# As above, but one line per batch.
grep -v '^#' $T/batches | awk '
BEGIN {
oldbatch = 1;
}
{
if (oldbatch != $1) {
print ++n ". " curbatch;
curbatch = "";
oldbatch = $1;
}
curbatch = curbatch " " $2;
}
END {
print ++n ". " curbatch;
}' > $T/scenarios
if test "$dryrun" = script
then
cat $T/script
......@@ -597,13 +614,17 @@ elif test "$dryrun" = batches
then
cat $T/batches
exit 0
elif test "$dryrun" = scenarios
then
cat $T/scenarios
exit 0
else
# Not a dryrun. Record the batches and the number of CPUs, then run the script.
bash $T/script
ret=$?
cp $T/batches $resdir/$ds/batches
cp $T/scenarios $resdir/$ds/scenarios
echo '#' cpus=$cpus >> $resdir/$ds/batches
echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a $resdir/$ds/log
exit $ret
fi
......
......@@ -302,7 +302,7 @@ function torture_set {
kcsan_kmake_tag="--kmake-args"
cur_kcsan_kmake_args="$kcsan_kmake_args"
fi
torture_one $* --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan
torture_one "$@" --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" $kcsan_kmake_tag $cur_kcsan_kmake_args --kcsan
fi
}
......
CONFIG_SMP=y
CONFIG_NR_CPUS=16
CONFIG_PREEMPT_NONE=n
CONFIG_PREEMPT_VOLUNTARY=n
CONFIG_PREEMPT=y
#CHECK#CONFIG_PREEMPT_RCU=y
CONFIG_HZ_PERIODIC=y
CONFIG_NO_HZ_IDLE=n
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_TRACE=y
CONFIG_HOTPLUG_CPU=y
CONFIG_RCU_FANOUT=2
CONFIG_RCU_FANOUT_LEAF=2
CONFIG_RCU_NOCB_CPU=n
CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
CONFIG_RCU_EXPERT=y
rcutorture.test_boost=2
rcutorture.stutter=0
rcutree.gp_preinit_delay=12
rcutree.gp_init_delay=3
rcutree.gp_cleanup_delay=3
rcutree.kthread_prio=2
threadirqs
tree.use_softirq=0
......@@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
CONFIG_HOTPLUG_CPU=n
CONFIG_HOTPLUG_CPU=y
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_NOCB_CPU=n
......
......@@ -8,7 +8,7 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
CONFIG_HOTPLUG_CPU=n
CONFIG_HOTPLUG_CPU=y
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_FANOUT=3
......
......@@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
CONFIG_HOTPLUG_CPU=n
CONFIG_HOTPLUG_CPU=y
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_NOCB_CPU=n
......
......@@ -7,7 +7,7 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
CONFIG_HOTPLUG_CPU=n
CONFIG_HOTPLUG_CPU=y
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_NOCB_CPU=n
......
......@@ -174,7 +174,7 @@ static inline bool spin_trylock(spinlock_t *lock)
}
struct completion {
/* Hopefuly this won't overflow. */
/* Hopefully this won't overflow. */
unsigned int count;
};
......