sched/core: Allow kthreads to fall back to online && !active cpus

[cascardo/linux.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index d1f7149..51d7105 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -33,7 +33,7 @@
  #include <linux/init.h>
  #include <linux/uaccess.h>
  #include <linux/highmem.h>
-#include <asm/mmu_context.h>
+#include <linux/mmu_context.h>
  #include <linux/interrupt.h>
  #include <linux/capability.h>
  #include <linux/completion.h>
@@ -170,6 +170,71 @@ static struct rq *this_rq_lock(void)
         return rq;
  }
  
+/*
+ * __task_rq_lock - lock the rq @p resides on; caller must hold p->pi_lock.
+ */
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       lockdep_assert_held(&p->pi_lock);
+
+       for (;;) {
+               rq = task_rq(p);
+               raw_spin_lock(&rq->lock);
+               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+                       rf->cookie = lockdep_pin_lock(&rq->lock); /* for unpin */
+                       return rq;
+               }
+               raw_spin_unlock(&rq->lock); /* @p moved/migrating: retry */
+
+               while (unlikely(task_on_rq_migrating(p)))
+                       cpu_relax();
+       }
+}
+
+/*
+ * task_rq_lock - irqsave-lock p->pi_lock (rf->flags), then @p's rq->lock.
+ */
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+       __acquires(p->pi_lock)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       for (;;) {
+               raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
+               rq = task_rq(p);
+               raw_spin_lock(&rq->lock);
+               /*
+                *      move_queued_task()              task_rq_lock()
+                *
+                *      ACQUIRE (rq->lock)
+                *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
+                *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
+                *      [S] ->cpu = new_cpu             [L] task_rq()
+                *                                      [L] ->on_rq
+                *      RELEASE (rq->lock)
+                *
+                * If we observe the old cpu in task_rq_lock, the acquire of
+                * the old rq->lock will fully serialize against the stores.
+                *
+                * If we observe the new cpu in task_rq_lock, the acquire will
+                * pair with the WMB to ensure we must then also see migrating.
+                */
+               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+                       rf->cookie = lockdep_pin_lock(&rq->lock); /* for unpin */
+                       return rq;
+               }
+               raw_spin_unlock(&rq->lock); /* @p moved/migrating: retry */
+               raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+
+               while (unlikely(task_on_rq_migrating(p)))
+                       cpu_relax();
+       }
+}
+
  #ifdef CONFIG_SCHED_HRTICK
  /*
   * Use HR-timers to deliver accurate preemption points.
@@ -249,29 +314,6 @@ void hrtick_start(struct rq *rq, u64 delay)
         }
  }
  
-static int
-hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-       int cpu = (int)(long)hcpu;
-
-       switch (action) {
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-       case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               hrtick_clear(cpu_rq(cpu));
-               return NOTIFY_OK;
-       }
-
-       return NOTIFY_DONE;
-}
-
-static __init void init_hrtick(void)
-{
-       hotcpu_notifier(hotplug_hrtick, 0);
-}
  #else
  /*
   * Called to set the hrtick timer state.
@@ -288,10 +330,6 @@ void hrtick_start(struct rq *rq, u64 delay)
         hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
                       HRTIMER_MODE_REL_PINNED);
  }
-
-static inline void init_hrtick(void)
-{
-}
  #endif /* CONFIG_SMP */
  
  static void init_rq_hrtick(struct rq *rq)
@@ -315,10 +353,6 @@ static inline void hrtick_clear(struct rq *rq)
  static inline void init_rq_hrtick(struct rq *rq)
  {
  }
-
-static inline void init_hrtick(void)
-{
-}
  #endif /* CONFIG_SCHED_HRTICK */
  
  /*
@@ -400,7 +434,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
          * wakeup due to that.
          *
          * This cmpxchg() implies a full barrier, which pairs with the write
-        * barrier implied by the wakeup in wake_up_list().
+        * barrier implied by the wakeup in wake_up_q().
          */
         if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
                 return;
@@ -499,7 +533,10 @@ int get_nohz_timer_target(void)
         rcu_read_lock();
         for_each_domain(cpu, sd) {
                 for_each_cpu(i, sched_domain_span(sd)) {
-                       if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
+                       if (cpu == i)
+                               continue;
+
+                       if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
                                 cpu = i;
                                 goto unlock;
                         }
@@ -1085,12 +1122,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  static int __set_cpus_allowed_ptr(struct task_struct *p,
                                   const struct cpumask *new_mask, bool check)
  {
-       unsigned long flags;
-       struct rq *rq;
+       const struct cpumask *cpu_valid_mask = cpu_active_mask;
         unsigned int dest_cpu;
+       struct rq_flags rf;
+       struct rq *rq;
         int ret = 0;
  
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
+
+       if (p->flags & PF_KTHREAD) {
+               /*
+                * Kernel threads are allowed on online && !active CPUs
+                */
+               cpu_valid_mask = cpu_online_mask;
+       }
  
         /*
          * Must re-check here, to close a race against __kthread_bind(),
@@ -1104,22 +1149,32 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
         if (cpumask_equal(&p->cpus_allowed, new_mask))
                 goto out;
  
-       if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+       if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
                 ret = -EINVAL;
                 goto out;
         }
  
         do_set_cpus_allowed(p, new_mask);
  
+       if (p->flags & PF_KTHREAD) {
+               /*
+                * For kernel threads that do indeed end up on online &&
+                * !active we want to ensure they are strict per-cpu threads.
+                */
+               WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
+                       !cpumask_intersects(new_mask, cpu_active_mask) &&
+                       p->nr_cpus_allowed != 1);
+       }
+
         /* Can the task run on the task's current CPU? If so, we're done */
         if (cpumask_test_cpu(task_cpu(p), new_mask))
                 goto out;
  
-       dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+       dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
         if (task_running(rq, p) || p->state == TASK_WAKING) {
                 struct migration_arg arg = { p, dest_cpu };
                 /* Need help from migration thread: drop lock and wait. */
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                 tlb_migrate_finish(p->mm);
                 return 0;
@@ -1128,12 +1183,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
                  * OK, since we're going to drop the lock immediately
                  * afterwards anyway.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, rf.cookie);
                 rq = move_queued_task(rq, p, dest_cpu);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, rf.cookie);
         }
  out:
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  
         return ret;
  }
@@ -1317,8 +1372,8 @@ out:
   */
  unsigned long wait_task_inactive(struct task_struct *p, long match_state)
  {
-       unsigned long flags;
         int running, queued;
+       struct rq_flags rf;
         unsigned long ncsw;
         struct rq *rq;
  
@@ -1353,14 +1408,14 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                  * lock now, to be *sure*. If we're wrong, we'll
                  * just go back and repeat.
                  */
-               rq = task_rq_lock(p, &flags);
+               rq = task_rq_lock(p, &rf);
                 trace_sched_wait_task(p);
                 running = task_running(rq, p);
                 queued = task_on_rq_queued(p);
                 ncsw = 0;
                 if (!match_state || p->state == match_state)
                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
  
                 /*
                  * If it changed from the expected state, bail out now.
@@ -1434,6 +1489,25 @@ EXPORT_SYMBOL_GPL(kick_process);
  
  /*
   * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ *
+ * A few notes on cpu_active vs cpu_online:
+ *
+ *  - cpu_active must be a subset of cpu_online
+ *
+ *  - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
+ *    see __set_cpus_allowed_ptr(). At this point the newly online
+ *    cpu isn't yet part of the sched domains, and balancing will not
+ *    see it.
+ *
+ *  - on cpu-down we clear cpu_active() to mask the sched domains and
+ *    prevent the load balancer from placing new tasks on the cpu being
+ *    removed. Existing tasks will remain running there and will be
+ *    taken off.
+ *
+ * This means that fallback selection must not select !active CPUs,
+ * and can assume that any active CPU must be online. Conversely,
+ * select_task_rq() below may allow selection of !active CPUs in order
+ * to satisfy the above rules.
   */
  static int select_fallback_rq(int cpu, struct task_struct *p)
  {
@@ -1452,8 +1526,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
  
                 /* Look for allowed, online CPU in same node. */
                 for_each_cpu(dest_cpu, nodemask) {
-                       if (!cpu_online(dest_cpu))
-                               continue;
                         if (!cpu_active(dest_cpu))
                                 continue;
                         if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
@@ -1464,9 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
         for (;;) {
                 /* Any allowed, online CPU? */
                 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
-                       if (!cpu_online(dest_cpu))
+                       if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
                                 continue;
-                       if (!cpu_active(dest_cpu))
+                       if (!cpu_online(dest_cpu))
                                 continue;
                         goto out;
                 }
@@ -1515,8 +1587,10 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
  {
         lockdep_assert_held(&p->pi_lock);
  
-       if (p->nr_cpus_allowed > 1)
+       if (tsk_nr_cpus_allowed(p) > 1)
                 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+       else
+               cpu = cpumask_any(tsk_cpus_allowed(p));
  
         /*
          * In order not to call set_task_cpu() on a blocking task we need
@@ -1604,8 +1678,8 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
  /*
   * Mark the task runnable and perform wakeup-preemption.
   */
-static void
-ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
+                          struct pin_cookie cookie)
  {
         check_preempt_curr(rq, p, wake_flags);
         p->state = TASK_RUNNING;
@@ -1617,9 +1691,9 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
                  * Our task @p is fully woken up and running; so its safe to
                  * drop the rq->lock, hereafter rq is only used for statistics.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                 p->sched_class->task_woken(rq, p);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, cookie);
         }
  
         if (rq->idle_stamp) {
@@ -1637,17 +1711,23 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
  }
  
  static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+                struct pin_cookie cookie)
  {
+       int en_flags = ENQUEUE_WAKEUP;
+
         lockdep_assert_held(&rq->lock);
  
  #ifdef CONFIG_SMP
         if (p->sched_contributes_to_load)
                 rq->nr_uninterruptible--;
+
+       if (wake_flags & WF_MIGRATED)
+               en_flags |= ENQUEUE_MIGRATED;
  #endif
  
-       ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
-       ttwu_do_wakeup(rq, p, wake_flags);
+       ttwu_activate(rq, p, en_flags);
+       ttwu_do_wakeup(rq, p, wake_flags, cookie);
  }
  
  /*
@@ -1658,17 +1738,18 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
   */
  static int ttwu_remote(struct task_struct *p, int wake_flags)
  {
+       struct rq_flags rf;
         struct rq *rq;
         int ret = 0;
  
-       rq = __task_rq_lock(p);
+       rq = __task_rq_lock(p, &rf);
         if (task_on_rq_queued(p)) {
                 /* check_preempt_curr() may use rq clock */
                 update_rq_clock(rq);
-               ttwu_do_wakeup(rq, p, wake_flags);
+               ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
                 ret = 1;
         }
-       __task_rq_unlock(rq);
+       __task_rq_unlock(rq, &rf);
  
         return ret;
  }
@@ -1678,6 +1759,7 @@ void sched_ttwu_pending(void)
  {
         struct rq *rq = this_rq();
         struct llist_node *llist = llist_del_all(&rq->wake_list);
+       struct pin_cookie cookie;
         struct task_struct *p;
         unsigned long flags;
  
@@ -1685,15 +1767,21 @@ void sched_ttwu_pending(void)
                 return;
  
         raw_spin_lock_irqsave(&rq->lock, flags);
-       lockdep_pin_lock(&rq->lock);
+       cookie = lockdep_pin_lock(&rq->lock);
  
         while (llist) {
+               int wake_flags = 0;
+
                 p = llist_entry(llist, struct task_struct, wake_entry);
                 llist = llist_next(llist);
-               ttwu_do_activate(rq, p, 0);
+
+               if (p->sched_remote_wakeup)
+                       wake_flags = WF_MIGRATED;
+
+               ttwu_do_activate(rq, p, wake_flags, cookie);
         }
  
-       lockdep_unpin_lock(&rq->lock);
+       lockdep_unpin_lock(&rq->lock, cookie);
         raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
@@ -1735,10 +1823,12 @@ void scheduler_ipi(void)
         irq_exit();
  }
  
-static void ttwu_queue_remote(struct task_struct *p, int cpu)
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
  {
         struct rq *rq = cpu_rq(cpu);
  
+       p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+
         if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
                 if (!set_nr_if_polling(rq->idle))
                         smp_send_reschedule(cpu);
@@ -1777,22 +1867,23 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
  }
  #endif /* CONFIG_SMP */
  
-static void ttwu_queue(struct task_struct *p, int cpu)
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  {
         struct rq *rq = cpu_rq(cpu);
+       struct pin_cookie cookie;
  
  #if defined(CONFIG_SMP)
         if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
                 sched_clock_cpu(cpu); /* sync clocks x-cpu */
-               ttwu_queue_remote(p, cpu);
+               ttwu_queue_remote(p, cpu, wake_flags);
                 return;
         }
  #endif
  
         raw_spin_lock(&rq->lock);
-       lockdep_pin_lock(&rq->lock);
-       ttwu_do_activate(rq, p, 0);
-       lockdep_unpin_lock(&rq->lock);
+       cookie = lockdep_pin_lock(&rq->lock);
+       ttwu_do_activate(rq, p, wake_flags, cookie);
+       lockdep_unpin_lock(&rq->lock, cookie);
         raw_spin_unlock(&rq->lock);
  }
  
@@ -1961,9 +2052,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         p->sched_contributes_to_load = !!task_contributes_to_load(p);
         p->state = TASK_WAKING;
  
-       if (p->sched_class->task_waking)
-               p->sched_class->task_waking(p);
-
         cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
         if (task_cpu(p) != cpu) {
                 wake_flags |= WF_MIGRATED;
@@ -1971,7 +2059,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         }
  #endif /* CONFIG_SMP */
  
-       ttwu_queue(p, cpu);
+       ttwu_queue(p, cpu, wake_flags);
  stat:
         if (schedstat_enabled())
                 ttwu_stat(p, cpu, wake_flags);
@@ -1989,7 +2077,7 @@ out:
   * ensure that this_rq() is locked, @p is bound to this_rq() and not
   * the current task.
   */
-static void try_to_wake_up_local(struct task_struct *p)
+static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
  {
         struct rq *rq = task_rq(p);
  
@@ -2006,11 +2094,11 @@ static void try_to_wake_up_local(struct task_struct *p)
                  * disabled avoiding further scheduler activity on it and we've
                  * not yet picked a replacement task.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                 raw_spin_unlock(&rq->lock);
                 raw_spin_lock(&p->pi_lock);
                 raw_spin_lock(&rq->lock);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, cookie);
         }
  
         if (!(p->state & TASK_NORMAL))
@@ -2021,7 +2109,7 @@ static void try_to_wake_up_local(struct task_struct *p)
         if (!task_on_rq_queued(p))
                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
  
-       ttwu_do_wakeup(rq, p, 0);
+       ttwu_do_wakeup(rq, p, 0, cookie);
         if (schedstat_enabled())
                 ttwu_stat(p, smp_processor_id(), 0);
  out:
@@ -2167,9 +2255,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
  #endif
  #endif
  
+#ifdef CONFIG_SCHEDSTATS
+
  DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+static bool __initdata __sched_schedstats = false;
  
-#ifdef CONFIG_SCHEDSTATS
  static void set_schedstats(bool enabled)
  {
         if (enabled)
@@ -2192,11 +2282,16 @@ static int __init setup_schedstats(char *str)
         if (!str)
                 goto out;
  
+       /*
+        * This code is called before jump labels have been set up, so we can't
+        * change the static branch directly just yet.  Instead set a temporary
+        * variable so init_schedstats() can do it later.
+        */
         if (!strcmp(str, "enable")) {
-               set_schedstats(true);
+               __sched_schedstats = true;
                 ret = 1;
         } else if (!strcmp(str, "disable")) {
-               set_schedstats(false);
+               __sched_schedstats = false;
                 ret = 1;
         }
  out:
@@ -2207,6 +2302,11 @@ out:
  }
  __setup("schedstats=", setup_schedstats);
  
+static void __init init_schedstats(void)
+{
+       set_schedstats(__sched_schedstats); /* apply deferred "schedstats=" */
+}
+
  #ifdef CONFIG_PROC_SYSCTL
  int sysctl_schedstats(struct ctl_table *table, int write,
                          void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2227,8 +2327,10 @@ int sysctl_schedstats(struct ctl_table *table, int write,
                 set_schedstats(state);
         return err;
  }
-#endif
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
+#else  /* !CONFIG_SCHEDSTATS */
+static inline void init_schedstats(void) {}
+#endif /* CONFIG_SCHEDSTATS */
  
  /*
   * fork()/clone()-time setup:
@@ -2381,7 +2483,8 @@ static int dl_overflow(struct task_struct *p, int policy,
         u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
         int cpus, err = -1;
  
-       if (new_bw == p->dl.dl_bw)
+       /* !deadline task may carry old deadline bandwidth */
+       if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
                 return 0;
  
         /*
@@ -2420,12 +2523,12 @@ extern void init_dl_bw(struct dl_bw *dl_b);
   */
  void wake_up_new_task(struct task_struct *p)
  {
-       unsigned long flags;
+       struct rq_flags rf;
         struct rq *rq;
  
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
         /* Initialize new task's runnable average */
         init_entity_runnable_average(&p->se);
+       raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
  #ifdef CONFIG_SMP
         /*
          * Fork balancing, do it here and not earlier because:
@@ -2434,8 +2537,9 @@ void wake_up_new_task(struct task_struct *p)
          */
         set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
  #endif
+       rq = __task_rq_lock(p, &rf);
+       post_init_entity_util_avg(&p->se);
  
-       rq = __task_rq_lock(p);
         activate_task(rq, p, 0);
         p->on_rq = TASK_ON_RQ_QUEUED;
         trace_sched_wakeup_new(p);
@@ -2446,12 +2550,12 @@ void wake_up_new_task(struct task_struct *p)
                  * Nothing relies on rq->lock after this, so its fine to
                  * drop it.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, rf.cookie);
                 p->sched_class->task_woken(rq, p);
-               lockdep_pin_lock(&rq->lock);
+               lockdep_repin_lock(&rq->lock, rf.cookie);
         }
  #endif
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  }
  
  #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2713,7 +2817,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
   */
  static __always_inline struct rq *
  context_switch(struct rq *rq, struct task_struct *prev,
-              struct task_struct *next)
+              struct task_struct *next, struct pin_cookie cookie)
  {
         struct mm_struct *mm, *oldmm;
  
@@ -2733,7 +2837,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
                 atomic_inc(&oldmm->mm_count);
                 enter_lazy_tlb(oldmm, next);
         } else
-               switch_mm(oldmm, mm, next);
+               switch_mm_irqs_off(oldmm, mm, next);
  
         if (!prev->mm) {
                 prev->active_mm = NULL;
@@ -2745,7 +2849,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
          * of the scheduler it's an obvious special-case), so we
          * do an early lockdep release here:
          */
-       lockdep_unpin_lock(&rq->lock);
+       lockdep_unpin_lock(&rq->lock, cookie);
         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
  
         /* Here we just switch the register state and the stack. */
@@ -2867,7 +2971,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
   */
  unsigned long long task_sched_runtime(struct task_struct *p)
  {
-       unsigned long flags;
+       struct rq_flags rf;
         struct rq *rq;
         u64 ns;
  
@@ -2887,7 +2991,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
                 return p->se.sum_exec_runtime;
  #endif
  
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
         /*
          * Must be ->curr _and_ ->on_rq.  If dequeued, we would
          * project cycles that may never be accounted to this
@@ -2898,7 +3002,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
                 p->sched_class->update_curr(rq);
         }
         ns = p->se.sum_exec_runtime;
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  
         return ns;
  }
@@ -2918,7 +3022,7 @@ void scheduler_tick(void)
         raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
         curr->sched_class->task_tick(rq, curr, 0);
-       update_cpu_load_active(rq);
+       cpu_load_update_active(rq);
         calc_global_load_tick(rq);
         raw_spin_unlock(&rq->lock);
  
@@ -2961,6 +3065,20 @@ u64 scheduler_tick_max_deferment(void)
  
  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_PREEMPT_TRACER))
+/*
+ * If @val is equal to the current preempt count then we just disabled
+ * preemption. Start timing the latency via trace_preempt_off().
+ */
+static inline void preempt_latency_start(int val)
+{
+       if (preempt_count() == val) {
+               unsigned long ip = get_lock_parent_ip();
+#ifdef CONFIG_DEBUG_PREEMPT
+               current->preempt_disable_ip = ip;
+#endif
+               trace_preempt_off(CALLER_ADDR0, ip);
+       }
+}
  
  void preempt_count_add(int val)
  {
@@ -2979,17 +3097,21 @@ void preempt_count_add(int val)
         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                 PREEMPT_MASK - 10);
  #endif
-       if (preempt_count() == val) {
-               unsigned long ip = get_lock_parent_ip();
-#ifdef CONFIG_DEBUG_PREEMPT
-               current->preempt_disable_ip = ip;
-#endif
-               trace_preempt_off(CALLER_ADDR0, ip);
-       }
+       preempt_latency_start(val);
  }
  EXPORT_SYMBOL(preempt_count_add);
  NOKPROBE_SYMBOL(preempt_count_add);
  
+/*
+ * If @val is equal to the current preempt count then we are about to
+ * re-enable preemption. Stop timing the latency via trace_preempt_on().
+ */
+static inline void preempt_latency_stop(int val)
+{
+       if (preempt_count() == val)
+               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+}
+
  void preempt_count_sub(int val)
  {
  #ifdef CONFIG_DEBUG_PREEMPT
@@ -3006,13 +3128,15 @@ void preempt_count_sub(int val)
                 return;
  #endif
  
-       if (preempt_count() == val)
-               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+       preempt_latency_stop(val);
         __preempt_count_sub(val);
  }
  EXPORT_SYMBOL(preempt_count_sub);
  NOKPROBE_SYMBOL(preempt_count_sub);
  
+#else
+static inline void preempt_latency_start(int val) { }
+static inline void preempt_latency_stop(int val) { }
  #endif
  
  /*
@@ -3047,7 +3171,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
  static inline void schedule_debug(struct task_struct *prev)
  {
  #ifdef CONFIG_SCHED_STACK_END_CHECK
-       BUG_ON(task_stack_end_corrupted(prev));
+       if (task_stack_end_corrupted(prev))
+               panic("corrupted stack end detected inside scheduler\n");
  #endif
  
         if (unlikely(in_atomic_preempt_off())) {
@@ -3065,7 +3190,7 @@ static inline void schedule_debug(struct task_struct *prev)
   * Pick up the highest-prio task:
   */
  static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  {
         const struct sched_class *class = &fair_sched_class;
         struct task_struct *p;
@@ -3076,20 +3201,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
          */
         if (likely(prev->sched_class == class &&
                    rq->nr_running == rq->cfs.h_nr_running)) {
-               p = fair_sched_class.pick_next_task(rq, prev);
+               p = fair_sched_class.pick_next_task(rq, prev, cookie);
                 if (unlikely(p == RETRY_TASK))
                         goto again;
  
                 /* assumes fair_sched_class->next == idle_sched_class */
                 if (unlikely(!p))
-                       p = idle_sched_class.pick_next_task(rq, prev);
+                       p = idle_sched_class.pick_next_task(rq, prev, cookie);
  
                 return p;
         }
  
  again:
         for_each_class(class) {
-               p = class->pick_next_task(rq, prev);
+               p = class->pick_next_task(rq, prev, cookie);
                 if (p) {
                         if (unlikely(p == RETRY_TASK))
                                 goto again;
@@ -3143,6 +3268,7 @@ static void __sched notrace __schedule(bool preempt)
  {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
+       struct pin_cookie cookie;
         struct rq *rq;
         int cpu;
  
@@ -3176,7 +3302,7 @@ static void __sched notrace __schedule(bool preempt)
          */
         smp_mb__before_spinlock();
         raw_spin_lock(&rq->lock);
-       lockdep_pin_lock(&rq->lock);
+       cookie = lockdep_pin_lock(&rq->lock);
  
         rq->clock_skip_update <<= 1; /* promote REQ to ACT */
  
@@ -3198,7 +3324,7 @@ static void __sched notrace __schedule(bool preempt)
  
                                 to_wakeup = wq_worker_sleeping(prev);
                                 if (to_wakeup)
-                                       try_to_wake_up_local(to_wakeup);
+                                       try_to_wake_up_local(to_wakeup, cookie);
                         }
                 }
                 switch_count = &prev->nvcsw;
@@ -3207,7 +3333,7 @@ static void __sched notrace __schedule(bool preempt)
         if (task_on_rq_queued(prev))
                 update_rq_clock(rq);
  
-       next = pick_next_task(rq, prev);
+       next = pick_next_task(rq, prev, cookie);
         clear_tsk_need_resched(prev);
         clear_preempt_need_resched();
         rq->clock_skip_update = 0;
@@ -3218,9 +3344,9 @@ static void __sched notrace __schedule(bool preempt)
                 ++*switch_count;
  
                 trace_sched_switch(preempt, prev, next);
-               rq = context_switch(rq, prev, next); /* unlocks the rq */
+               rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
         } else {
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                 raw_spin_unlock_irq(&rq->lock);
         }
  
@@ -3287,8 +3413,23 @@ void __sched schedule_preempt_disabled(void)
  static void __sched notrace preempt_schedule_common(void)
  {
         do {
+               /*
+                * Because the function tracer can trace preempt_count_sub()
+                * and it also uses preempt_enable/disable_notrace(), if
+                * NEED_RESCHED is set, the preempt_enable_notrace() called
+                * by the function tracer will call this function again and
+                * cause infinite recursion.
+                *
+                * Preemption must be disabled here before the function
+                * tracer can trace. Break up preempt_disable() into two
+                * calls. One to disable preemption without fear of being
+                * traced. The other to still record the preemption latency,
+                * which can also be traced by the function tracer.
+                */
                 preempt_disable_notrace();
+               preempt_latency_start(1);
                 __schedule(true);
+               preempt_latency_stop(1);
                 preempt_enable_no_resched_notrace();
  
                 /*
@@ -3340,7 +3481,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                 return;
  
         do {
+               /*
+                * Because the function tracer can trace preempt_count_sub()
+                * and it also uses preempt_enable/disable_notrace(), if
+                * NEED_RESCHED is set, the preempt_enable_notrace() called
+                * by the function tracer will call this function again and
+                * cause infinite recursion.
+                *
+                * Preemption must be disabled here before the function
+                * tracer can trace. Break up preempt_disable() into two
+                * calls. One to disable preemption without fear of being
+                * traced. The other to still record the preemption latency,
+                * which can also be traced by the function tracer.
+                */
                 preempt_disable_notrace();
+               preempt_latency_start(1);
                 /*
                  * Needs preempt disabled in case user_exit() is traced
                  * and the tracer calls preempt_enable_notrace() causing
@@ -3350,6 +3505,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                 __schedule(true);
                 exception_exit(prev_ctx);
  
+               preempt_latency_stop(1);
                 preempt_enable_no_resched_notrace();
         } while (need_resched());
  }
@@ -3406,12 +3562,13 @@ EXPORT_SYMBOL(default_wake_function);
  void rt_mutex_setprio(struct task_struct *p, int prio)
  {
         int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
-       struct rq *rq;
         const struct sched_class *prev_class;
+       struct rq_flags rf;
+       struct rq *rq;
  
         BUG_ON(prio > MAX_PRIO);
  
-       rq = __task_rq_lock(p);
+       rq = __task_rq_lock(p, &rf);
  
         /*
          * Idle task boosting is a nono in general. There is one
@@ -3487,7 +3644,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
         preempt_disable(); /* avoid rq from going away on us */
-       __task_rq_unlock(rq);
+       __task_rq_unlock(rq, &rf);
  
         balance_callback(rq);
         preempt_enable();
@@ -3497,7 +3654,7 @@ out_unlock:
  void set_user_nice(struct task_struct *p, long nice)
  {
         int old_prio, delta, queued;
-       unsigned long flags;
+       struct rq_flags rf;
         struct rq *rq;
  
         if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
@@ -3506,7 +3663,7 @@ void set_user_nice(struct task_struct *p, long nice)
          * We have to be careful, if called from sys_setpriority(),
          * the task might be in the middle of scheduling on another CPU.
          */
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
         /*
          * The RT priorities are set via sched_setscheduler(), but we still
          * allow the 'normal' nice value to be set - but as expected
@@ -3537,7 +3694,7 @@ void set_user_nice(struct task_struct *p, long nice)
                         resched_curr(rq);
         }
  out_unlock:
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  }
  EXPORT_SYMBOL(set_user_nice);
  
@@ -3834,11 +3991,11 @@ static int __sched_setscheduler(struct task_struct *p,
                       MAX_RT_PRIO - 1 - attr->sched_priority;
         int retval, oldprio, oldpolicy = -1, queued, running;
         int new_effective_prio, policy = attr->sched_policy;
-       unsigned long flags;
         const struct sched_class *prev_class;
-       struct rq *rq;
+       struct rq_flags rf;
         int reset_on_fork;
         int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+       struct rq *rq;
  
         /* may grab non-irq protected spin_locks */
         BUG_ON(in_interrupt());
@@ -3933,13 +4090,13 @@ recheck:
          * To be able to change p->policy safely, the appropriate
          * runqueue lock must be held.
          */
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
  
         /*
          * Changing the policy of the stop threads its a very bad idea
          */
         if (p == rq->stop) {
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                 return -EINVAL;
         }
  
@@ -3956,7 +4113,7 @@ recheck:
                         goto change;
  
                 p->sched_reset_on_fork = reset_on_fork;
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                 return 0;
         }
  change:
@@ -3970,7 +4127,7 @@ change:
                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                 !task_group_is_autogroup(task_group(p))) {
-                       task_rq_unlock(rq, p, &flags);
+                       task_rq_unlock(rq, p, &rf);
                         return -EPERM;
                 }
  #endif
@@ -3985,7 +4142,7 @@ change:
                          */
                         if (!cpumask_subset(span, &p->cpus_allowed) ||
                             rq->rd->dl_bw.bw == 0) {
-                               task_rq_unlock(rq, p, &flags);
+                               task_rq_unlock(rq, p, &rf);
                                 return -EPERM;
                         }
                 }
@@ -3995,7 +4152,7 @@ change:
         /* recheck policy now with rq lock held */
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                 goto recheck;
         }
  
@@ -4005,7 +4162,7 @@ change:
          * is available.
          */
         if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
-               task_rq_unlock(rq, p, &flags);
+               task_rq_unlock(rq, p, &rf);
                 return -EBUSY;
         }
  
@@ -4050,7 +4207,7 @@ change:
  
         check_class_changed(rq, p, prev_class, oldprio);
         preempt_disable(); /* avoid rq from going away on us */
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  
         if (pi)
                 rt_mutex_adjust_pi(p);
@@ -4903,10 +5060,10 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
  {
         struct task_struct *p;
         unsigned int time_slice;
-       unsigned long flags;
+       struct rq_flags rf;
+       struct timespec t;
         struct rq *rq;
         int retval;
-       struct timespec t;
  
         if (pid < 0)
                 return -EINVAL;
@@ -4921,11 +5078,11 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
         if (retval)
                 goto out_unlock;
  
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
         time_slice = 0;
         if (p->sched_class->get_rr_interval)
                 time_slice = p->sched_class->get_rr_interval(rq, p);
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  
         rcu_read_unlock();
         jiffies_to_timespec(time_slice, &t);
@@ -4992,16 +5149,19 @@ void show_state_filter(unsigned long state_filter)
                 /*
                  * reset the NMI-timeout, listing all files on a slow
                  * console might take a lot of time:
+                * Also, reset softlockup watchdogs on all CPUs, because
+                * another CPU might be blocked waiting for us to process
+                * an IPI.
                  */
                 touch_nmi_watchdog();
+               touch_all_softlockup_watchdogs();
                 if (!state_filter || (p->state & state_filter))
                         sched_show_task(p);
         }
  
-       touch_all_softlockup_watchdogs();
-
  #ifdef CONFIG_SCHED_DEBUG
-       sysrq_sched_debug_show();
+       if (!state_filter)
+               sysrq_sched_debug_show();
  #endif
         rcu_read_unlock();
         /*
@@ -5163,6 +5323,8 @@ out:
  
  #ifdef CONFIG_SMP
  
+static bool sched_smp_initialized __read_mostly;
+
  #ifdef CONFIG_NUMA_BALANCING
  /* Migrate current task p to target_cpu */
  int migrate_task_to(struct task_struct *p, int target_cpu)
@@ -5188,11 +5350,11 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
   */
  void sched_setnuma(struct task_struct *p, int nid)
  {
-       struct rq *rq;
-       unsigned long flags;
         bool queued, running;
+       struct rq_flags rf;
+       struct rq *rq;
  
-       rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &rf);
         queued = task_on_rq_queued(p);
         running = task_current(rq, p);
  
@@ -5207,7 +5369,7 @@ void sched_setnuma(struct task_struct *p, int nid)
                 p->sched_class->set_curr_task(rq);
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE);
-       task_rq_unlock(rq, p, &flags);
+       task_rq_unlock(rq, p, &rf);
  }
  #endif /* CONFIG_NUMA_BALANCING */
  
@@ -5223,7 +5385,7 @@ void idle_task_exit(void)
         BUG_ON(cpu_online(smp_processor_id()));
  
         if (mm != &init_mm) {
-               switch_mm(mm, &init_mm, current);
+               switch_mm_irqs_off(mm, &init_mm, current);
                 finish_arch_post_lock_switch();
         }
         mmdrop(mm);
@@ -5271,6 +5433,7 @@ static void migrate_tasks(struct rq *dead_rq)
  {
         struct rq *rq = dead_rq;
         struct task_struct *next, *stop = rq->stop;
+       struct pin_cookie cookie;
         int dest_cpu;
  
         /*
@@ -5302,8 +5465,8 @@ static void migrate_tasks(struct rq *dead_rq)
                 /*
                  * pick_next_task assumes pinned rq->lock.
                  */
-               lockdep_pin_lock(&rq->lock);
-               next = pick_next_task(rq, &fake_task);
+               cookie = lockdep_pin_lock(&rq->lock);
+               next = pick_next_task(rq, &fake_task, cookie);
                 BUG_ON(!next);
                 next->sched_class->put_prev_task(rq, next);
  
@@ -5316,7 +5479,7 @@ static void migrate_tasks(struct rq *dead_rq)
                  * because !cpu_active at this point, which means load-balance
                  * will not interfere. Also, stop-machine.
                  */
-               lockdep_unpin_lock(&rq->lock);
+               lockdep_unpin_lock(&rq->lock, cookie);
                 raw_spin_unlock(&rq->lock);
                 raw_spin_lock(&next->pi_lock);
                 raw_spin_lock(&rq->lock);
@@ -5377,127 +5540,13 @@ static void set_rq_offline(struct rq *rq)
         }
  }
  
-/*
- * migration_call - callback that gets triggered when a CPU is added.
- * Here we can start up the necessary migration thread for the new CPU.
- */
-static int
-migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
+static void set_cpu_rq_start_time(unsigned int cpu)
  {
-       int cpu = (long)hcpu;
-       unsigned long flags;
         struct rq *rq = cpu_rq(cpu);
  
-       switch (action & ~CPU_TASKS_FROZEN) {
-
-       case CPU_UP_PREPARE:
-               rq->calc_load_update = calc_load_update;
-               account_reset_rq(rq);
-               break;
-
-       case CPU_ONLINE:
-               /* Update our root-domain */
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               if (rq->rd) {
-                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-
-                       set_rq_online(rq);
-               }
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-               break;
-
-#ifdef CONFIG_HOTPLUG_CPU
-       case CPU_DYING:
-               sched_ttwu_pending();
-               /* Update our root-domain */
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               if (rq->rd) {
-                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-                       set_rq_offline(rq);
-               }
-               migrate_tasks(rq);
-               BUG_ON(rq->nr_running != 1); /* the migration thread */
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-               break;
-
-       case CPU_DEAD:
-               calc_load_migrate(rq);
-               break;
-#endif
-       }
-
-       update_max_interval();
-
-       return NOTIFY_OK;
-}
-
-/*
- * Register at high priority so that task migration (migrate_all_tasks)
- * happens before everything else.  This has to be lower priority than
- * the notifier in the perf_event subsystem, though.
- */
-static struct notifier_block migration_notifier = {
-       .notifier_call = migration_call,
-       .priority = CPU_PRI_MIGRATION,
-};
-
-static void set_cpu_rq_start_time(void)
-{
-       int cpu = smp_processor_id();
-       struct rq *rq = cpu_rq(cpu);
         rq->age_stamp = sched_clock_cpu(cpu);
  }
  
-static int sched_cpu_active(struct notifier_block *nfb,
-                                     unsigned long action, void *hcpu)
-{
-       int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_STARTING:
-               set_cpu_rq_start_time();
-               return NOTIFY_OK;
-
-       case CPU_DOWN_FAILED:
-               set_cpu_active(cpu, true);
-               return NOTIFY_OK;
-
-       default:
-               return NOTIFY_DONE;
-       }
-}
-
-static int sched_cpu_inactive(struct notifier_block *nfb,
-                                       unsigned long action, void *hcpu)
-{
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DOWN_PREPARE:
-               set_cpu_active((long)hcpu, false);
-               return NOTIFY_OK;
-       default:
-               return NOTIFY_DONE;
-       }
-}
-
-static int __init migration_init(void)
-{
-       void *cpu = (void *)(long)smp_processor_id();
-       int err;
-
-       /* Initialize migration for the boot CPU */
-       err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
-       BUG_ON(err == NOTIFY_BAD);
-       migration_call(&migration_notifier, CPU_ONLINE, cpu);
-       register_cpu_notifier(&migration_notifier);
-
-       /* Register cpu active notifiers */
-       cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
-       cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
-
-       return 0;
-}
-early_initcall(migration_init);
-
  static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
  
  #ifdef CONFIG_SCHED_DEBUG
@@ -6645,10 +6694,10 @@ static void sched_init_numa(void)
         init_numa_topology_type();
  }
  
-static void sched_domains_numa_masks_set(int cpu)
+static void sched_domains_numa_masks_set(unsigned int cpu)
  {
-       int i, j;
         int node = cpu_to_node(cpu);
+       int i, j;
  
         for (i = 0; i < sched_domains_numa_levels; i++) {
                 for (j = 0; j < nr_node_ids; j++) {
@@ -6658,51 +6707,20 @@ static void sched_domains_numa_masks_set(int cpu)
         }
  }
  
-static void sched_domains_numa_masks_clear(int cpu)
+static void sched_domains_numa_masks_clear(unsigned int cpu)
  {
         int i, j;
+
         for (i = 0; i < sched_domains_numa_levels; i++) {
                 for (j = 0; j < nr_node_ids; j++)
                         cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
         }
  }
  
-/*
- * Update sched_domains_numa_masks[level][node] array when new cpus
- * are onlined.
- */
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
-                                          unsigned long action,
-                                          void *hcpu)
-{
-       int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_ONLINE:
-               sched_domains_numa_masks_set(cpu);
-               break;
-
-       case CPU_DEAD:
-               sched_domains_numa_masks_clear(cpu);
-               break;
-
-       default:
-               return NOTIFY_DONE;
-       }
-
-       return NOTIFY_OK;
-}
  #else
-static inline void sched_init_numa(void)
-{
-}
-
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
-                                          unsigned long action,
-                                          void *hcpu)
-{
-       return 0;
-}
+static inline void sched_init_numa(void) { }
+static void sched_domains_numa_masks_set(unsigned int cpu) { }
+static void sched_domains_numa_masks_clear(unsigned int cpu) { }
  #endif /* CONFIG_NUMA */
  
  static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -7092,13 +7110,9 @@ static int num_cpus_frozen;      /* used to mark begin/end of suspend/resume */
   * If we come here as part of a suspend/resume, don't touch cpusets because we
   * want to restore it back to its original state upon resume anyway.
   */
-static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
-                            void *hcpu)
+static void cpuset_cpu_active(void)
  {
-       switch (action) {
-       case CPU_ONLINE_FROZEN:
-       case CPU_DOWN_FAILED_FROZEN:
-
+       if (cpuhp_tasks_frozen) {
                 /*
                  * num_cpus_frozen tracks how many CPUs are involved in suspend
                  * resume sequence. As long as this is not the last online
@@ -7108,35 +7122,25 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
                 num_cpus_frozen--;
                 if (likely(num_cpus_frozen)) {
                         partition_sched_domains(1, NULL, NULL);
-                       break;
+                       return;
                 }
-
                 /*
                  * This is the last CPU online operation. So fall through and
                  * restore the original sched domains by considering the
                  * cpuset configurations.
                  */
-
-       case CPU_ONLINE:
-               cpuset_update_active_cpus(true);
-               break;
-       default:
-               return NOTIFY_DONE;
         }
-       return NOTIFY_OK;
+       cpuset_update_active_cpus(true);
  }
  
-static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
-                              void *hcpu)
+static int cpuset_cpu_inactive(unsigned int cpu)
  {
         unsigned long flags;
-       long cpu = (long)hcpu;
         struct dl_bw *dl_b;
         bool overflow;
         int cpus;
  
-       switch (action) {
-       case CPU_DOWN_PREPARE:
+       if (!cpuhp_tasks_frozen) {
                 rcu_read_lock_sched();
                 dl_b = dl_bw_of(cpu);
  
@@ -7148,19 +7152,120 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
                 rcu_read_unlock_sched();
  
                 if (overflow)
-                       return notifier_from_errno(-EBUSY);
+                       return -EBUSY;
                 cpuset_update_active_cpus(false);
-               break;
-       case CPU_DOWN_PREPARE_FROZEN:
+       } else {
                 num_cpus_frozen++;
                 partition_sched_domains(1, NULL, NULL);
-               break;
-       default:
-               return NOTIFY_DONE;
         }
-       return NOTIFY_OK;
+       return 0;
+}
+
+int sched_cpu_activate(unsigned int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       set_cpu_active(cpu, true);
+
+       if (sched_smp_initialized) {
+               sched_domains_numa_masks_set(cpu);
+               cpuset_cpu_active();
+       }
+
+       /*
+        * Put the rq online, if not already. This happens:
+        *
+        * 1) In the early boot process, because we build the real domains
+        *    after all cpus have been brought up.
+        *
+        * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
+        *    domains.
+        */
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       if (rq->rd) {
+               BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+               set_rq_online(rq);
+       }
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       update_max_interval();
+
+       return 0;
  }
  
+int sched_cpu_deactivate(unsigned int cpu)
+{
+       int ret;
+
+       set_cpu_active(cpu, false);
+       /*
+        * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
+        * users of this state to go away such that all new such users will
+        * observe it.
+        *
+        * For CONFIG_PREEMPT we have preemptible RCU and its synchronize_rcu()
+        * might not imply synchronize_sched(), so wait for both.
+        *
+        * Sync before parking the smpboot threads to handle the RCU boost case.
+        */
+       if (IS_ENABLED(CONFIG_PREEMPT))
+               synchronize_rcu_mult(call_rcu, call_rcu_sched);
+       else
+               synchronize_rcu();
+
+       if (!sched_smp_initialized)
+               return 0;
+
+       ret = cpuset_cpu_inactive(cpu);
+       if (ret) {
+               set_cpu_active(cpu, true);
+               return ret;
+       }
+       sched_domains_numa_masks_clear(cpu);
+       return 0;
+}
+
+static void sched_rq_cpu_starting(unsigned int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+       rq->calc_load_update = calc_load_update;
+       account_reset_rq(rq);
+       update_max_interval();
+}
+
+int sched_cpu_starting(unsigned int cpu)
+{
+       set_cpu_rq_start_time(cpu);
+       sched_rq_cpu_starting(cpu);
+       return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+int sched_cpu_dying(unsigned int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       /* Handle pending wakeups and then migrate everything off */
+       sched_ttwu_pending();
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       if (rq->rd) {
+               BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+               set_rq_offline(rq);
+       }
+       migrate_tasks(rq);
+       BUG_ON(rq->nr_running != 1);
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       calc_load_migrate(rq);
+       update_max_interval();
+       nohz_balance_exit_idle(cpu);
+       hrtick_clear(rq);
+       return 0;
+}
+#endif
+
  void __init sched_init_smp(void)
  {
         cpumask_var_t non_isolated_cpus;
@@ -7182,12 +7287,6 @@ void __init sched_init_smp(void)
                 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
         mutex_unlock(&sched_domains_mutex);
  
-       hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
-       hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
-       hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
-
-       init_hrtick();
-
         /* Move init over to a non-isolated CPU */
         if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
                 BUG();
@@ -7196,7 +7295,16 @@ void __init sched_init_smp(void)
  
         init_sched_rt_class();
         init_sched_dl_class();
+       sched_smp_initialized = true;
+}
+
+static int __init migration_init(void)
+{
+       sched_rq_cpu_starting(smp_processor_id());
+       return 0;
  }
+early_initcall(migration_init);
+
  #else
  void __init sched_init_smp(void)
  {
@@ -7331,8 +7439,6 @@ void __init sched_init(void)
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
                         rq->cpu_load[j] = 0;
  
-               rq->last_load_update_tick = jiffies;
-
  #ifdef CONFIG_SMP
                 rq->sd = NULL;
                 rq->rd = NULL;
@@ -7351,12 +7457,13 @@ void __init sched_init(void)
  
                 rq_attach_root(rq, &def_root_domain);
  #ifdef CONFIG_NO_HZ_COMMON
+               rq->last_load_update_tick = jiffies;
                 rq->nohz_flags = 0;
  #endif
  #ifdef CONFIG_NO_HZ_FULL
                 rq->last_sched_tick = 0;
  #endif
-#endif
+#endif /* CONFIG_SMP */
                 init_rq_hrtick(rq);
                 atomic_set(&rq->nr_iowait, 0);
         }
@@ -7394,10 +7501,12 @@ void __init sched_init(void)
         if (cpu_isolated_map == NULL)
                 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
         idle_thread_set_boot_cpu();
-       set_cpu_rq_start_time();
+       set_cpu_rq_start_time(smp_processor_id());
  #endif
         init_sched_fair_class();
  
+       init_schedstats();
+
         scheduler_running = 1;
  }
  
@@ -7639,10 +7748,10 @@ void sched_move_task(struct task_struct *tsk)
  {
         struct task_group *tg;
         int queued, running;
-       unsigned long flags;
+       struct rq_flags rf;
         struct rq *rq;
  
-       rq = task_rq_lock(tsk, &flags);
+       rq = task_rq_lock(tsk, &rf);
  
         running = task_current(rq, tsk);
         queued = task_on_rq_queued(tsk);
@@ -7674,7 +7783,7 @@ void sched_move_task(struct task_struct *tsk)
         if (queued)
                 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
  
-       task_rq_unlock(rq, tsk, &flags);
+       task_rq_unlock(rq, tsk, &rf);
  }
  #endif /* CONFIG_CGROUP_SCHED */
  
@@ -7894,7 +8003,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
  static int sched_rt_global_constraints(void)
  {
         unsigned long flags;
-       int i, ret = 0;
+       int i;
  
         raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
         for_each_possible_cpu(i) {
@@ -7906,7 +8015,7 @@ static int sched_rt_global_constraints(void)
         }
         raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
  
-       return ret;
+       return 0;
  }
  #endif /* CONFIG_RT_GROUP_SCHED */