path: root/kernel
diff options
author Paul E. McKenney <paulmck@linux.ibm.com> 2019-04-04 12:19:25 -0700
committer Paul E. McKenney <paulmck@linux.ibm.com> 2019-05-25 14:50:49 -0700
commit 0864f057b050bc6dd68106b3185e02db5140012d (patch)
tree ccc4c06e416b2a75b0add24d2c69e07b29e52e91 /kernel
parent 385b599e8c04fa843c4d7f785478827cc512d720 (diff)
rcu: Use irq_work to get scheduler's attention in clean context
When rcu_read_unlock_special() is invoked with interrupts disabled, is either not in an interrupt handler or is not using RCU_SOFTIRQ, is not the first RCU read-side critical section in the chain, and either there is an expedited grace period in flight or this is a NO_HZ_FULL kernel, the end of the grace period can be unduly delayed. The reason for this is that it is not safe to do wakeups in this situation.

This commit fixes this problem by using the irq_work subsystem to force a later interrupt handler in a clean environment. Because set_tsk_need_resched(current) and set_preempt_need_resched() are invoked prior to this, the scheduler will force a context switch upon return from this interrupt (though perhaps at the end of any interrupted preempt-disable or BH-disable region of code), which will invoke rcu_note_context_switch() (again in a clean environment), which will in turn give RCU the chance to report the deferred quiescent state.

Of course, by then this task might be within another RCU read-side critical section. But that will be detected at that time and reporting will be further deferred to the outermost rcu_read_unlock(). See rcu_preempt_need_deferred_qs() and rcu_preempt_deferred_qs() for more details on the checking.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Diffstat (limited to 'kernel')
2 files changed, 22 insertions, 0 deletions
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a1a72a1ecb02..21d740f0b8dc 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -161,6 +161,8 @@ struct rcu_data {
/* ticks this CPU has handled */
/* during and after the last grace */
/* period it is aware of. */
+ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
+ bool defer_qs_iw_pending; /* Scheduler attention pending? */
/* 2) batch handling */
struct rcu_segcblist cblist; /* Segmented callback list, with */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e1005f5e8094..58c7853f19e7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -588,6 +588,17 @@ static void rcu_preempt_deferred_qs(struct task_struct *t)
+ * Minimal handler to give the scheduler a chance to re-evaluate.
+ */
+static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
+ struct rcu_data *rdp;
+ rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
+ rdp->defer_qs_iw_pending = false;
* Handle special cases during rcu_read_unlock(), such as needing to
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
@@ -630,6 +641,15 @@ static void rcu_read_unlock_special(struct task_struct *t)
// Also if no expediting or NO_HZ_FULL, slow is OK.
+ !rdp->defer_qs_iw_pending && exp) {
+ // Get scheduler to re-evaluate and call hooks.
+ // If !IRQ_WORK, FQS scan will eventually IPI.
+ init_irq_work(&rdp->defer_qs_iw,
+ rcu_preempt_deferred_qs_handler);
+ rdp->defer_qs_iw_pending = true;
+ irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
+ }
t->rcu_read_unlock_special.b.deferred_qs = true;