From: Linus Torvalds Date: Fri, 10 Jun 2016 19:10:02 +0000 (-0700) Subject: Merge branch 'stacking-fixes' (vfs stacking fixes from Jann) X-Git-Tag: v4.7-rc3~11 X-Git-Url: http://git.cascardo.eti.br/?a=commitdiff_plain;h=f5364c150aa645b3d7daa21b5c0b9feaa1c9cd6d;hp=-c;p=cascardo%2Flinux.git Merge branch 'stacking-fixes' (vfs stacking fixes from Jann) Merge filesystem stacking fixes from Jann Horn. * emailed patches from Jann Horn : sched: panic on corrupted stack end ecryptfs: forbid opening files without mmap handler proc: prevent stacking filesystems on top --- f5364c150aa645b3d7daa21b5c0b9feaa1c9cd6d diff --combined fs/proc/root.c index 55bc7d6c8aac,ec649c92d270..06702783bf40 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@@ -121,6 -121,13 +121,13 @@@ static struct dentry *proc_mount(struc if (IS_ERR(sb)) return ERR_CAST(sb); + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + if (!proc_parse_options(options, ns)) { deactivate_locked_super(sb); return ERR_PTR(-EINVAL); @@@ -226,8 -233,8 +233,8 @@@ static int proc_root_readdir(struct fil */ static const struct file_operations proc_root_operations = { .read = generic_read_dir, - .iterate = proc_root_readdir, - .llseek = default_llseek, + .iterate_shared = proc_root_readdir, + .llseek = generic_file_llseek, }; /* diff --combined kernel/sched/core.c index 385c947482e1,11546a6ed5df..017d5394f5dc --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@@ -33,7 -33,7 +33,7 @@@ #include #include #include -#include +#include #include #include #include @@@ -170,71 -170,6 +170,71 @@@ static struct rq *this_rq_lock(void return rq; } +/* + * __task_rq_lock - lock the rq @p resides on. + */ +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, rf->flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + /* + * move_queued_task() task_rq_lock() + * + * ACQUIRE (rq->lock) + * [S] ->on_rq = MIGRATING [L] rq = task_rq() + * WMB (__set_task_cpu()) ACQUIRE (rq->lock); + * [S] ->cpu = new_cpu [L] task_rq() + * [L] ->on_rq + * RELEASE (rq->lock) + * + * If we observe the old cpu in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * + * If we observe the new cpu in task_rq_lock, the acquire will + * pair with the WMB to ensure we must then also see migrating. + */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. 
@@@ -314,6 -249,29 +314,6 @@@ void hrtick_start(struct rq *rq, u64 de } } -static int -hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - hrtick_clear(cpu_rq(cpu)); - return NOTIFY_OK; - } - - return NOTIFY_DONE; -} - -static __init void init_hrtick(void) -{ - hotcpu_notifier(hotplug_hrtick, 0); -} #else /* * Called to set the hrtick timer state. @@@ -330,6 -288,10 +330,6 @@@ void hrtick_start(struct rq *rq, u64 de hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED); } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) @@@ -353,6 -315,10 +353,6 @@@ static inline void hrtick_clear(struct static inline void init_rq_hrtick(struct rq *rq) { } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SCHED_HRTICK */ /* @@@ -434,7 -400,7 +434,7 @@@ void wake_q_add(struct wake_q_head *hea * wakeup due to that. * * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_list(). + * barrier implied by the wakeup in wake_up_q(). */ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; @@@ -533,10 -499,7 +533,10 @@@ int get_nohz_timer_target(void rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { + if (cpu == i) + continue; + + if (!idle_cpu(i) && is_housekeeping_cpu(i)) { cpu = i; goto unlock; } @@@ -1122,20 -1085,12 +1122,20 @@@ void do_set_cpus_allowed(struct task_st static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, bool check) { - unsigned long flags; - struct rq *rq; + const struct cpumask *cpu_valid_mask = cpu_active_mask; unsigned int dest_cpu; + struct rq_flags rf; + struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); + + if (p->flags & PF_KTHREAD) { + /* + * Kernel threads are allowed on online && !active CPUs + */ + cpu_valid_mask = cpu_online_mask; + } /* * Must re-check here, to close a race against __kthread_bind(), @@@ -1149,32 -1104,22 +1149,32 @@@ if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_active_mask)) { + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { ret = -EINVAL; goto out; } do_set_cpus_allowed(p, new_mask); + if (p->flags & PF_KTHREAD) { + /* + * For kernel threads that do indeed end up on online && + * !active we want to ensure they are strict per-cpu threads. + */ + WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && + !cpumask_intersects(new_mask, cpu_active_mask) && + p->nr_cpus_allowed != 1); + } + /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; @@@ -1183,12 -1128,12 +1183,12 @@@ * OK, since we're going to drop the lock immediately * afterwards anyway. 
*/ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); rq = move_queued_task(rq, p, dest_cpu); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } out: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ret; } @@@ -1372,8 -1317,8 +1372,8 @@@ out */ unsigned long wait_task_inactive(struct task_struct *p, long match_state) { - unsigned long flags; int running, queued; + struct rq_flags rf; unsigned long ncsw; struct rq *rq; @@@ -1408,14 -1353,14 +1408,14 @@@ * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); /* * If it changed from the expected state, bail out now. @@@ -1489,25 -1434,6 +1489,25 @@@ EXPORT_SYMBOL_GPL(kick_process) /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * + * A few notes on cpu_active vs cpu_online: + * + * - cpu_active must be a subset of cpu_online + * + * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, + * see __set_cpus_allowed_ptr(). At this point the newly online + * cpu isn't yet part of the sched domains, and balancing will not + * see it. + * + * - on cpu-down we clear cpu_active() to mask the sched domains and + * avoid the load balancer to place new tasks on the to be removed + * cpu. Existing tasks will remain running there and will be taken + * off. + * + * This means that fallback selection must not select !active CPUs. + * And can assume that any active CPU must be online. Conversely + * select_task_rq() below may allow selection of !active CPUs in order + * to satisfy the above rules. */ static int select_fallback_rq(int cpu, struct task_struct *p) { @@@ -1526,6 -1452,8 +1526,6 @@@ /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_online(dest_cpu)) - continue; if (!cpu_active(dest_cpu)) continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) @@@ -1536,6 -1464,8 +1536,6 @@@ for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) - continue; if (!cpu_active(dest_cpu)) continue; goto out; @@@ -1585,10 -1515,8 +1585,10 @@@ int select_task_rq(struct task_struct * { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + else + cpu = cpumask_any(tsk_cpus_allowed(p)); /* * In order not to call set_task_cpu() on a blocking task we need @@@ -1676,8 -1604,8 +1676,8 @@@ static inline void ttwu_activate(struc /* * Mark the task runnable and perform wakeup-preemption. */ -static void -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@@ -1689,9 -1617,9 +1689,9 @@@ * Our task @p is fully woken up and running; so its safe to * drop the rq->lock, hereafter rq is only used for statistics. 
*/ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (rq->idle_stamp) { @@@ -1709,23 -1637,17 +1709,23 @@@ } static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { + int en_flags = ENQUEUE_WAKEUP; + lockdep_assert_held(&rq->lock); #ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; + + if (wake_flags & WF_MIGRATED) + en_flags |= ENQUEUE_MIGRATED; #endif - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_activate(rq, p, en_flags); + ttwu_do_wakeup(rq, p, wake_flags, cookie); } /* @@@ -1736,18 -1658,17 +1736,18 @@@ */ static int ttwu_remote(struct task_struct *p, int wake_flags) { + struct rq_flags rf; struct rq *rq; int ret = 0; - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); ret = 1; } - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); return ret; } @@@ -1757,7 -1678,6 +1757,7 @@@ void sched_ttwu_pending(void { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); + struct pin_cookie cookie; struct task_struct *p; unsigned long flags; @@@ -1765,21 -1685,15 +1765,21 @@@ return; raw_spin_lock_irqsave(&rq->lock, flags); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); while (llist) { + int wake_flags = 0; + p = llist_entry(llist, struct task_struct, wake_entry); llist = llist_next(llist); - ttwu_do_activate(rq, p, 0); + + if (p->sched_remote_wakeup) + wake_flags = WF_MIGRATED; + + ttwu_do_activate(rq, p, wake_flags, cookie); } - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@@ -1821,12 -1735,10 +1821,12 @@@ void scheduler_ipi(void irq_exit(); } -static void ttwu_queue_remote(struct task_struct *p, int cpu) +static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { if (!set_nr_if_polling(rq->idle)) smp_send_reschedule(cpu); @@@ -1865,23 -1777,22 +1865,23 @@@ bool cpus_share_cache(int this_cpu, in } #endif /* CONFIG_SMP */ -static void ttwu_queue(struct task_struct *p, int cpu) +static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + struct pin_cookie cookie; #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ - ttwu_queue_remote(p, cpu); + ttwu_queue_remote(p, cpu, wake_flags); return; } #endif raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); - ttwu_do_activate(rq, p, 0); - lockdep_unpin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); + ttwu_do_activate(rq, p, wake_flags, cookie); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); } @@@ -2050,6 -1961,9 +2050,6 @@@ try_to_wake_up(struct task_struct *p, u p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; - if (p->sched_class->task_waking) - p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, 
SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; @@@ -2057,7 -1971,7 +2057,7 @@@ } #endif /* CONFIG_SMP */ - ttwu_queue(p, cpu); + ttwu_queue(p, cpu, wake_flags); stat: if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags); @@@ -2075,7 -1989,7 +2075,7 @@@ out * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ -static void try_to_wake_up_local(struct task_struct *p) +static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) { struct rq *rq = task_rq(p); @@@ -2092,11 -2006,11 +2092,11 @@@ * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (!(p->state & TASK_NORMAL)) @@@ -2107,7 -2021,7 +2107,7 @@@ if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, 0); + ttwu_do_wakeup(rq, p, 0, cookie); if (schedstat_enabled()) ttwu_stat(p, smp_processor_id(), 0); out: @@@ -2253,11 -2167,9 +2253,11 @@@ int sysctl_numa_balancing(struct ctl_ta #endif #endif +#ifdef CONFIG_SCHEDSTATS + DEFINE_STATIC_KEY_FALSE(sched_schedstats); +static bool __initdata __sched_schedstats = false; -#ifdef CONFIG_SCHEDSTATS static void set_schedstats(bool enabled) { if (enabled) @@@ -2280,16 -2192,11 +2280,16 @@@ static int __init setup_schedstats(cha if (!str) goto out; + /* + * This code is called before jump labels have been set up, so we can't + * change the static branch directly just yet. Instead set a temporary + * variable so init_schedstats() can do it later. + */ if (!strcmp(str, "enable")) { - set_schedstats(true); + __sched_schedstats = true; ret = 1; } else if (!strcmp(str, "disable")) { - set_schedstats(false); + __sched_schedstats = false; ret = 1; } out: @@@ -2300,11 -2207,6 +2300,11 @@@ } __setup("schedstats=", setup_schedstats); +static void __init init_schedstats(void) +{ + set_schedstats(__sched_schedstats); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_schedstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@@ -2325,10 -2227,8 +2325,10 @@@ set_schedstats(state); return err; } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#else /* !CONFIG_SCHEDSTATS */ +static inline void init_schedstats(void) {} +#endif /* CONFIG_SCHEDSTATS */ /* * fork()/clone()-time setup: @@@ -2481,8 -2381,7 +2481,8 @@@ static int dl_overflow(struct task_stru u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; int cpus, err = -1; - if (new_bw == p->dl.dl_bw) + /* !deadline task may carry old deadline bandwidth */ + if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) return 0; /* @@@ -2521,12 -2420,12 +2521,12 @@@ extern void init_dl_bw(struct dl_bw *dl */ void wake_up_new_task(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; - raw_spin_lock_irqsave(&p->pi_lock, flags); /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@@ -2535,10 -2434,8 +2535,10 @@@ */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif + /* Post initialize new task's util average when its cfs_rq is set */ + post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@@ -2549,12 -2446,12 +2549,12 @@@ * Nothing relies on rq->lock after this, so its fine to * drop it. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } #endif - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@@ -2816,7 -2713,7 +2816,7 @@@ asmlinkage __visible void schedule_tail */ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, struct pin_cookie cookie) { struct mm_struct *mm, *oldmm; @@@ -2836,7 -2733,7 +2836,7 @@@ atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else - switch_mm(oldmm, mm, next); + switch_mm_irqs_off(oldmm, mm, next); if (!prev->mm) { prev->active_mm = NULL; @@@ -2848,7 -2745,7 +2848,7 @@@ * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ @@@ -2970,7 -2867,7 +2970,7 @@@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat) */ unsigned long long task_sched_runtime(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; u64 ns; @@@ -2990,7 -2887,7 +2990,7 @@@ return p->se.sum_exec_runtime; #endif - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Must be ->curr _and_ ->on_rq. If dequeued, we would * project cycles that may never be accounted to this @@@ -3001,7 -2898,7 +3001,7 @@@ p->sched_class->update_curr(rq); } ns = p->se.sum_exec_runtime; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ns; } @@@ -3021,7 -2918,7 +3021,7 @@@ void scheduler_tick(void raw_spin_lock(&rq->lock); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); - update_cpu_load_active(rq); + cpu_load_update_active(rq); calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); @@@ -3064,20 -2961,6 +3064,20 @@@ u64 scheduler_tick_max_deferment(void #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) +/* + * If the value passed in is equal to the current preempt count + * then we just disabled preemption. Start timing the latency. 
+ */ +static inline void preempt_latency_start(int val) +{ + if (preempt_count() == val) { + unsigned long ip = get_lock_parent_ip(); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } +} void preempt_count_add(int val) { @@@ -3096,21 -2979,17 +3096,21 @@@ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) { - unsigned long ip = get_lock_parent_ip(); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } + preempt_latency_start(val); } EXPORT_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); +/* + * If the value passed in equals to the current preempt count + * then we just enabled preemption. Stop timing the latency. + */ +static inline void preempt_latency_stop(int val) +{ + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); +} + void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT @@@ -3127,15 -3006,13 +3127,15 @@@ return; #endif - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); + preempt_latency_stop(val); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); +#else +static inline void preempt_latency_start(int val) { } +static inline void preempt_latency_stop(int val) { } #endif /* @@@ -3170,7 -3047,8 +3170,8 @@@ static noinline void __schedule_bug(str static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(task_stack_end_corrupted(prev)); + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); #endif if (unlikely(in_atomic_preempt_off())) { @@@ -3188,7 -3066,7 +3189,7 @@@ * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { const struct sched_class *class = &fair_sched_class; struct task_struct *p; @@@ -3199,20 -3077,20 +3200,20 @@@ */ if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev); + p = fair_sched_class.pick_next_task(rq, prev, cookie); if (unlikely(p == RETRY_TASK)) goto again; /* assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev); + p = idle_sched_class.pick_next_task(rq, prev, cookie); return p; } again: for_each_class(class) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, cookie); if (p) { if (unlikely(p == RETRY_TASK)) goto again; @@@ -3266,7 -3144,6 +3267,7 @@@ static void __sched notrace __schedule( { struct task_struct *prev, *next; unsigned long *switch_count; + struct pin_cookie cookie; struct rq *rq; int cpu; @@@ -3300,7 -3177,7 +3301,7 @@@ */ smp_mb__before_spinlock(); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@@ -3322,7 -3199,7 +3323,7 @@@ to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) - try_to_wake_up_local(to_wakeup); + try_to_wake_up_local(to_wakeup, cookie); } } switch_count = &prev->nvcsw; @@@ -3331,7 -3208,7 +3332,7 @@@ if (task_on_rq_queued(prev)) update_rq_clock(rq); - next = pick_next_task(rq, prev); + next = pick_next_task(rq, prev, cookie); clear_tsk_need_resched(prev); 
clear_preempt_need_resched(); rq->clock_skip_update = 0; @@@ -3342,9 -3219,9 +3343,9 @@@ ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the rq */ + rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ } else { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irq(&rq->lock); } @@@ -3411,23 -3288,8 +3412,23 @@@ void __sched schedule_preempt_disabled( static void __sched notrace preempt_schedule_common(void) { do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); __schedule(true); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); /* @@@ -3479,21 -3341,7 +3480,21 @@@ asmlinkage __visible void __sched notra return; do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@@ -3503,7 -3351,6 +3504,7 @@@ __schedule(true); exception_exit(prev_ctx); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); } while (need_resched()); } @@@ -3560,13 -3407,12 +3561,13 @@@ EXPORT_SYMBOL(default_wake_function) void rt_mutex_setprio(struct task_struct *p, int prio) { int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; - struct rq *rq; const struct sched_class *prev_class; + struct rq_flags rf; + struct rq *rq; BUG_ON(prio > MAX_PRIO); - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); /* * Idle task boosting is a nono in general. There is one @@@ -3642,7 -3488,7 +3643,7 @@@ check_class_changed(rq, p, prev_class, oldprio); out_unlock: preempt_disable(); /* avoid rq from going away on us */ - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); balance_callback(rq); preempt_enable(); @@@ -3652,7 -3498,7 +3653,7 @@@ void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, queued; - unsigned long flags; + struct rq_flags rf; struct rq *rq; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@@ -3661,7 -3507,7 +3662,7 @@@ * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. 
*/ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@@ -3692,7 -3538,7 +3693,7 @@@ resched_curr(rq); } out_unlock: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); @@@ -3989,11 -3835,11 +3990,11 @@@ static int __sched_setscheduler(struct MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, queued, running; int new_effective_prio, policy = attr->sched_policy; - unsigned long flags; const struct sched_class *prev_class; - struct rq *rq; + struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + struct rq *rq; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@@ -4088,13 -3934,13 +4089,13 @@@ recheck * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EINVAL; } @@@ -4111,7 -3957,7 +4112,7 @@@ goto change; p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return 0; } change: @@@ -4125,7 -3971,7 +4126,7 @@@ if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } #endif @@@ -4140,7 -3986,7 +4141,7 @@@ */ if (!cpumask_subset(span, &p->cpus_allowed) || rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } } @@@ -4150,7 -3996,7 +4151,7 @@@ /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); goto recheck; } @@@ -4160,7 -4006,7 +4161,7 @@@ * is available. 
*/ if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EBUSY; } @@@ -4205,7 -4051,7 +4206,7 @@@ check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); if (pi) rt_mutex_adjust_pi(p); @@@ -5058,10 -4904,10 +5059,10 @@@ SYSCALL_DEFINE2(sched_rr_get_interval, { struct task_struct *p; unsigned int time_slice; - unsigned long flags; + struct rq_flags rf; + struct timespec t; struct rq *rq; int retval; - struct timespec t; if (pid < 0) return -EINVAL; @@@ -5076,11 -4922,11 +5077,11 @@@ if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); time_slice = 0; if (p->sched_class->get_rr_interval) time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); @@@ -5156,8 -5002,7 +5157,8 @@@ void show_state_filter(unsigned long st touch_all_softlockup_watchdogs(); #ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); + if (!state_filter) + sysrq_sched_debug_show(); #endif rcu_read_unlock(); /* @@@ -5319,8 -5164,6 +5320,8 @@@ out #ifdef CONFIG_SMP +static bool sched_smp_initialized __read_mostly; + #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ int migrate_task_to(struct task_struct *p, int target_cpu) @@@ -5346,11 -5189,11 +5347,11 @@@ */ void sched_setnuma(struct task_struct *p, int nid) { - struct rq *rq; - unsigned long flags; bool queued, running; + struct rq_flags rf; + struct rq *rq; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); running = task_current(rq, p); @@@ -5365,7 -5208,7 +5366,7 @@@ p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@@ -5381,7 -5224,7 +5382,7 @@@ void idle_task_exit(void BUG_ON(cpu_online(smp_processor_id())); if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); + switch_mm_irqs_off(mm, &init_mm, current); finish_arch_post_lock_switch(); } mmdrop(mm); @@@ -5429,7 -5272,6 +5430,7 @@@ static void migrate_tasks(struct rq *de { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; + struct pin_cookie cookie; int dest_cpu; /* @@@ -5461,8 -5303,8 +5462,8 @@@ /* * pick_next_task assumes pinned rq->lock. */ - lockdep_pin_lock(&rq->lock); - next = pick_next_task(rq, &fake_task); + cookie = lockdep_pin_lock(&rq->lock); + next = pick_next_task(rq, &fake_task, cookie); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@@ -5475,7 -5317,7 +5476,7 @@@ * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); @@@ -5536,13 -5378,127 +5537,13 @@@ static void set_rq_offline(struct rq *r } } -/* - * migration_call - callback that gets triggered when a CPU is added. - * Here we can start up the necessary migration thread for the new CPU. 
- */ -static int -migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +static void set_cpu_rq_start_time(unsigned int cpu) { - int cpu = (long)hcpu; - unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - rq->calc_load_update = calc_load_update; - account_reset_rq(rq); - break; - - case CPU_ONLINE: - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - - set_rq_online(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - sched_ttwu_pending(); - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - migrate_tasks(rq); - BUG_ON(rq->nr_running != 1); /* the migration thread */ - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - - case CPU_DEAD: - calc_load_migrate(rq); - break; -#endif - } - - update_max_interval(); - - return NOTIFY_OK; -} - -/* - * Register at high priority so that task migration (migrate_all_tasks) - * happens before everything else. This has to be lower priority than - * the notifier in the perf_event subsystem, though. - */ -static struct notifier_block migration_notifier = { - .notifier_call = migration_call, - .priority = CPU_PRI_MIGRATION, -}; - -static void set_cpu_rq_start_time(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); rq->age_stamp = sched_clock_cpu(cpu); } -static int sched_cpu_active(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - set_cpu_rq_start_time(); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - set_cpu_active(cpu, true); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - -static int sched_cpu_inactive(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -static int __init migration_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - /* Initialize migration for the boot CPU */ - err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); - migration_call(&migration_notifier, CPU_ONLINE, cpu); - register_cpu_notifier(&migration_notifier); - - /* Register cpu active notifiers */ - cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); - cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); - - return 0; -} -early_initcall(migration_init); - static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG @@@ -6690,10 -6646,10 +6691,10 @@@ static void sched_init_numa(void init_numa_topology_type(); } -static void sched_domains_numa_masks_set(int cpu) +static void sched_domains_numa_masks_set(unsigned int cpu) { - int i, j; int node = cpu_to_node(cpu); + int i, j; for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { @@@ -6703,20 -6659,51 +6704,20 @@@ } } -static void sched_domains_numa_masks_clear(int cpu) +static void sched_domains_numa_masks_clear(unsigned int cpu) { int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); } } -/* - * Update 
sched_domains_numa_masks[level][node] array when new cpus - * are onlined. - */ -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - sched_domains_numa_masks_set(cpu); - break; - - case CPU_DEAD: - sched_domains_numa_masks_clear(cpu); - break; - - default: - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} #else -static inline void sched_init_numa(void) -{ -} - -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - return 0; -} +static inline void sched_init_numa(void) { } +static void sched_domains_numa_masks_set(unsigned int cpu) { } +static void sched_domains_numa_masks_clear(unsigned int cpu) { } #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@@ -7106,9 -7093,13 +7107,9 @@@ static int num_cpus_frozen; /* used to * If we come here as part of a suspend/resume, don't touch cpusets because we * want to restore it back to its original state upon resume anyway. */ -static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static void cpuset_cpu_active(void) { - switch (action) { - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED_FROZEN: - + if (cpuhp_tasks_frozen) { /* * num_cpus_frozen tracks how many CPUs are involved in suspend * resume sequence. As long as this is not the last online @@@ -7118,25 -7109,35 +7119,25 @@@ num_cpus_frozen--; if (likely(num_cpus_frozen)) { partition_sched_domains(1, NULL, NULL); - break; + return; } - /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ - - case CPU_ONLINE: - cpuset_update_active_cpus(true); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + cpuset_update_active_cpus(true); } -static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int cpuset_cpu_inactive(unsigned int cpu) { unsigned long flags; - long cpu = (long)hcpu; struct dl_bw *dl_b; bool overflow; int cpus; - switch (action) { - case CPU_DOWN_PREPARE: + if (!cpuhp_tasks_frozen) { rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); @@@ -7148,119 -7149,18 +7149,119 @@@ rcu_read_unlock_sched(); if (overflow) - return notifier_from_errno(-EBUSY); + return -EBUSY; cpuset_update_active_cpus(false); - break; - case CPU_DOWN_PREPARE_FROZEN: + } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + return 0; +} + +int sched_cpu_activate(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + set_cpu_active(cpu, true); + + if (sched_smp_initialized) { + sched_domains_numa_masks_set(cpu); + cpuset_cpu_active(); + } + + /* + * Put the rq online, if not already. This happens: + * + * 1) In the early boot process, because we build the real domains + * after all cpus have been brought up. + * + * 2) At runtime, if cpuset_cpu_active() fails to rebuild the + * domains. 
+ */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + update_max_interval(); + + return 0; +} + +int sched_cpu_deactivate(unsigned int cpu) +{ + int ret; + + set_cpu_active(cpu, false); + /* + * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU + * users of this state to go away such that all new such users will + * observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so wait for both. + * + * Do sync before park smpboot threads to take care the rcu boost case. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); + + if (!sched_smp_initialized) + return 0; + + ret = cpuset_cpu_inactive(cpu); + if (ret) { + set_cpu_active(cpu, true); + return ret; + } + sched_domains_numa_masks_clear(cpu); + return 0; +} + +static void sched_rq_cpu_starting(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + rq->calc_load_update = calc_load_update; + account_reset_rq(rq); + update_max_interval(); +} + +int sched_cpu_starting(unsigned int cpu) +{ + set_cpu_rq_start_time(cpu); + sched_rq_cpu_starting(cpu); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +int sched_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Handle pending wakeups and then migrate everything off */ + sched_ttwu_pending(); + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + migrate_tasks(rq); + BUG_ON(rq->nr_running != 1); + raw_spin_unlock_irqrestore(&rq->lock, flags); + calc_load_migrate(rq); + update_max_interval(); + nohz_balance_exit_idle(cpu); + hrtick_clear(rq); + return 0; } +#endif void __init sched_init_smp(void) { @@@ -7283,6 -7183,12 +7284,6 @@@ cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); - hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); - hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - - init_hrtick(); - /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); @@@ -7291,16 -7197,7 +7292,16 @@@ init_sched_rt_class(); init_sched_dl_class(); + sched_smp_initialized = true; +} + +static int __init migration_init(void) +{ + sched_rq_cpu_starting(smp_processor_id()); + return 0; } +early_initcall(migration_init); + #else void __init sched_init_smp(void) { @@@ -7435,6 -7332,8 +7436,6 @@@ void __init sched_init(void for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; - rq->last_load_update_tick = jiffies; - #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@@ -7453,13 -7352,12 +7454,13 @@@ rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON + rq->last_load_update_tick = jiffies; rq->nohz_flags = 0; #endif #ifdef CONFIG_NO_HZ_FULL rq->last_sched_tick = 0; #endif -#endif +#endif /* CONFIG_SMP */ init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); } @@@ -7497,12 -7395,10 +7498,12 @@@ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); - set_cpu_rq_start_time(); + set_cpu_rq_start_time(smp_processor_id()); #endif init_sched_fair_class(); + init_schedstats(); + scheduler_running = 1; } @@@ -7744,10 -7640,10 +7745,10 @@@ void sched_move_task(struct task_struc { struct 
task_group *tg; int queued, running; - unsigned long flags; + struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(tsk, &flags); + rq = task_rq_lock(tsk, &rf); running = task_current(rq, tsk); queued = task_on_rq_queued(tsk); @@@ -7779,7 -7675,7 +7780,7 @@@ if (queued) enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); - task_rq_unlock(rq, tsk, &flags); + task_rq_unlock(rq, tsk, &rf); } #endif /* CONFIG_CGROUP_SCHED */ @@@ -7999,7 -7895,7 +8000,7 @@@ static int sched_rt_can_attach(struct t static int sched_rt_global_constraints(void) { unsigned long flags; - int i, ret = 0; + int i; raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { @@@ -8011,7 -7907,7 +8012,7 @@@ } raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return ret; + return 0; } #endif /* CONFIG_RT_GROUP_SCHED */
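
The schedule_debug() hunk above is the visible piece of the "sched: panic on corrupted stack end" patch carried by this merge: when the magic word at the very end of a task's stack has been overwritten, the scheduler now calls panic() instead of BUG_ON(), since a BUG oops may be survivable depending on configuration while a task whose stack end is corrupted cannot be trusted to keep running. What follows is a minimal user-space sketch of that end-of-stack canary scheme; the struct, constant and helper names are illustrative stand-ins rather than the kernel's own definitions (the kernel keeps its canary at the low end of the task stack and tests it via task_stack_end_corrupted()).

/*
 * Minimal sketch of an end-of-stack canary check, in the spirit of the
 * schedule_debug() change above.  All names and sizes here are invented
 * for illustration; only the overall idea matches the kernel.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define STACK_SIZE      8192
#define STACK_END_MAGIC 0x57AC6E9DUL   /* canary value, kernel uses a similar magic */

struct fake_task {
	unsigned long *stack;          /* lowest address of the stack area */
};

/* Plant the canary at the lowest address of the stack area. */
static void set_stack_end_magic(struct fake_task *t)
{
	t->stack[0] = STACK_END_MAGIC;
}

/* A corrupted canary means something wrote past the end of the stack. */
static int stack_end_corrupted(const struct fake_task *t)
{
	return t->stack[0] != STACK_END_MAGIC;
}

int main(void)
{
	struct fake_task t;

	t.stack = calloc(1, STACK_SIZE);
	if (!t.stack)
		return 1;
	set_stack_end_magic(&t);

	/* Simulate a stack overflow scribbling over the canary. */
	memset(t.stack, 0xAA, 64);

	if (stack_end_corrupted(&t)) {
		/* The patched scheduler panics here instead of BUG_ON(). */
		fprintf(stderr, "corrupted stack end detected\n");
		exit(EXIT_FAILURE);
	}

	free(t.stack);
	return 0;
}

Built with any C compiler, the sketch reports the corruption and exits non-zero, mirroring in spirit the hard stop the patched scheduler now enforces on every context switch when CONFIG_SCHED_STACK_END_CHECK is enabled.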