From: Linus Torvalds Date: Fri, 10 Jun 2016 19:10:02 +0000 (-0700) Subject: Merge branch 'stacking-fixes' (vfs stacking fixes from Jann) X-Git-Tag: v4.7-rc3~11 X-Git-Url: http://git.cascardo.eti.br/?a=commitdiff_plain;h=f5364c150aa645b3d7daa21b5c0b9feaa1c9cd6d;hp=-c;p=cascardo%2Flinux.git Merge branch 'stacking-fixes' (vfs stacking fixes from Jann) Merge filesystem stacking fixes from Jann Horn. * emailed patches from Jann Horn : sched: panic on corrupted stack end ecryptfs: forbid opening files without mmap handler proc: prevent stacking filesystems on top --- f5364c150aa645b3d7daa21b5c0b9feaa1c9cd6d diff --combined fs/proc/root.c index 55bc7d6c8aac,ec649c92d270..06702783bf40 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@@ -121,6 -121,13 +121,13 @@@ static struct dentry *proc_mount(struc if (IS_ERR(sb)) return ERR_CAST(sb); + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + if (!proc_parse_options(options, ns)) { deactivate_locked_super(sb); return ERR_PTR(-EINVAL); @@@ -226,8 -233,8 +233,8 @@@ static int proc_root_readdir(struct fil */ static const struct file_operations proc_root_operations = { .read = generic_read_dir, - .iterate = proc_root_readdir, - .llseek = default_llseek, + .iterate_shared = proc_root_readdir, + .llseek = generic_file_llseek, }; /* diff --combined kernel/sched/core.c index 385c947482e1,11546a6ed5df..017d5394f5dc --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@@ -33,7 -33,7 +33,7 @@@ #include #include #include -#include +#include #include #include #include @@@ -170,71 -170,6 +170,71 @@@ static struct rq *this_rq_lock(void return rq; } +/* + * __task_rq_lock - lock the rq @p resides on. + */ +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, rf->flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + /* + * move_queued_task() task_rq_lock() + * + * ACQUIRE (rq->lock) + * [S] ->on_rq = MIGRATING [L] rq = task_rq() + * WMB (__set_task_cpu()) ACQUIRE (rq->lock); + * [S] ->cpu = new_cpu [L] task_rq() + * [L] ->on_rq + * RELEASE (rq->lock) + * + * If we observe the old cpu in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * + * If we observe the new cpu in task_rq_lock, the acquire will + * pair with the WMB to ensure we must then also see migrating. + */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. 
@@@ -314,6 -249,29 +314,6 @@@ void hrtick_start(struct rq *rq, u64 de } } -static int -hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - hrtick_clear(cpu_rq(cpu)); - return NOTIFY_OK; - } - - return NOTIFY_DONE; -} - -static __init void init_hrtick(void) -{ - hotcpu_notifier(hotplug_hrtick, 0); -} #else /* * Called to set the hrtick timer state. @@@ -330,6 -288,10 +330,6 @@@ void hrtick_start(struct rq *rq, u64 de hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED); } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) @@@ -353,6 -315,10 +353,6 @@@ static inline void hrtick_clear(struct static inline void init_rq_hrtick(struct rq *rq) { } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SCHED_HRTICK */ /* @@@ -434,7 -400,7 +434,7 @@@ void wake_q_add(struct wake_q_head *hea * wakeup due to that. * * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_list(). + * barrier implied by the wakeup in wake_up_q(). */ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; @@@ -533,10 -499,7 +533,10 @@@ int get_nohz_timer_target(void rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { + if (cpu == i) + continue; + + if (!idle_cpu(i) && is_housekeeping_cpu(i)) { cpu = i; goto unlock; } @@@ -1122,20 -1085,12 +1122,20 @@@ void do_set_cpus_allowed(struct task_st static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, bool check) { - unsigned long flags; - struct rq *rq; + const struct cpumask *cpu_valid_mask = cpu_active_mask; unsigned int dest_cpu; + struct rq_flags rf; + struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); + + if (p->flags & PF_KTHREAD) { + /* + * Kernel threads are allowed on online && !active CPUs + */ + cpu_valid_mask = cpu_online_mask; + } /* * Must re-check here, to close a race against __kthread_bind(), @@@ -1149,32 -1104,22 +1149,32 @@@ if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_active_mask)) { + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { ret = -EINVAL; goto out; } do_set_cpus_allowed(p, new_mask); + if (p->flags & PF_KTHREAD) { + /* + * For kernel threads that do indeed end up on online && + * !active we want to ensure they are strict per-cpu threads. + */ + WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && + !cpumask_intersects(new_mask, cpu_active_mask) && + p->nr_cpus_allowed != 1); + } + /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; @@@ -1183,12 -1128,12 +1183,12 @@@ * OK, since we're going to drop the lock immediately * afterwards anyway. 
*/ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); rq = move_queued_task(rq, p, dest_cpu); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } out: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ret; } @@@ -1372,8 -1317,8 +1372,8 @@@ out */ unsigned long wait_task_inactive(struct task_struct *p, long match_state) { - unsigned long flags; int running, queued; + struct rq_flags rf; unsigned long ncsw; struct rq *rq; @@@ -1408,14 -1353,14 +1408,14 @@@ * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); /* * If it changed from the expected state, bail out now. @@@ -1489,25 -1434,6 +1489,25 @@@ EXPORT_SYMBOL_GPL(kick_process) /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * + * A few notes on cpu_active vs cpu_online: + * + * - cpu_active must be a subset of cpu_online + * + * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, + * see __set_cpus_allowed_ptr(). At this point the newly online + * cpu isn't yet part of the sched domains, and balancing will not + * see it. + * + * - on cpu-down we clear cpu_active() to mask the sched domains and + * avoid the load balancer to place new tasks on the to be removed + * cpu. Existing tasks will remain running there and will be taken + * off. + * + * This means that fallback selection must not select !active CPUs. + * And can assume that any active CPU must be online. Conversely + * select_task_rq() below may allow selection of !active CPUs in order + * to satisfy the above rules. */ static int select_fallback_rq(int cpu, struct task_struct *p) { @@@ -1526,6 -1452,8 +1526,6 @@@ /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_online(dest_cpu)) - continue; if (!cpu_active(dest_cpu)) continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) @@@ -1536,6 -1464,8 +1536,6 @@@ for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) - continue; if (!cpu_active(dest_cpu)) continue; goto out; @@@ -1585,10 -1515,8 +1585,10 @@@ int select_task_rq(struct task_struct * { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + else + cpu = cpumask_any(tsk_cpus_allowed(p)); /* * In order not to call set_task_cpu() on a blocking task we need @@@ -1676,8 -1604,8 +1676,8 @@@ static inline void ttwu_activate(struc /* * Mark the task runnable and perform wakeup-preemption. */ -static void -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@@ -1689,9 -1617,9 +1689,9 @@@ * Our task @p is fully woken up and running; so its safe to * drop the rq->lock, hereafter rq is only used for statistics. 
*/ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (rq->idle_stamp) { @@@ -1709,23 -1637,17 +1709,23 @@@ } static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { + int en_flags = ENQUEUE_WAKEUP; + lockdep_assert_held(&rq->lock); #ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; + + if (wake_flags & WF_MIGRATED) + en_flags |= ENQUEUE_MIGRATED; #endif - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_activate(rq, p, en_flags); + ttwu_do_wakeup(rq, p, wake_flags, cookie); } /* @@@ -1736,18 -1658,17 +1736,18 @@@ */ static int ttwu_remote(struct task_struct *p, int wake_flags) { + struct rq_flags rf; struct rq *rq; int ret = 0; - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); ret = 1; } - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); return ret; } @@@ -1757,7 -1678,6 +1757,7 @@@ void sched_ttwu_pending(void { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); + struct pin_cookie cookie; struct task_struct *p; unsigned long flags; @@@ -1765,21 -1685,15 +1765,21 @@@ return; raw_spin_lock_irqsave(&rq->lock, flags); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); while (llist) { + int wake_flags = 0; + p = llist_entry(llist, struct task_struct, wake_entry); llist = llist_next(llist); - ttwu_do_activate(rq, p, 0); + + if (p->sched_remote_wakeup) + wake_flags = WF_MIGRATED; + + ttwu_do_activate(rq, p, wake_flags, cookie); } - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@@ -1821,12 -1735,10 +1821,12 @@@ void scheduler_ipi(void irq_exit(); } -static void ttwu_queue_remote(struct task_struct *p, int cpu) +static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { if (!set_nr_if_polling(rq->idle)) smp_send_reschedule(cpu); @@@ -1865,23 -1777,22 +1865,23 @@@ bool cpus_share_cache(int this_cpu, in } #endif /* CONFIG_SMP */ -static void ttwu_queue(struct task_struct *p, int cpu) +static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + struct pin_cookie cookie; #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ - ttwu_queue_remote(p, cpu); + ttwu_queue_remote(p, cpu, wake_flags); return; } #endif raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); - ttwu_do_activate(rq, p, 0); - lockdep_unpin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); + ttwu_do_activate(rq, p, wake_flags, cookie); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); } @@@ -2050,6 -1961,9 +2050,6 @@@ try_to_wake_up(struct task_struct *p, u p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; - if (p->sched_class->task_waking) - p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, 
SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; @@@ -2057,7 -1971,7 +2057,7 @@@ } #endif /* CONFIG_SMP */ - ttwu_queue(p, cpu); + ttwu_queue(p, cpu, wake_flags); stat: if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags); @@@ -2075,7 -1989,7 +2075,7 @@@ out * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ -static void try_to_wake_up_local(struct task_struct *p) +static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) { struct rq *rq = task_rq(p); @@@ -2092,11 -2006,11 +2092,11 @@@ * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (!(p->state & TASK_NORMAL)) @@@ -2107,7 -2021,7 +2107,7 @@@ if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, 0); + ttwu_do_wakeup(rq, p, 0, cookie); if (schedstat_enabled()) ttwu_stat(p, smp_processor_id(), 0); out: @@@ -2253,11 -2167,9 +2253,11 @@@ int sysctl_numa_balancing(struct ctl_ta #endif #endif +#ifdef CONFIG_SCHEDSTATS + DEFINE_STATIC_KEY_FALSE(sched_schedstats); +static bool __initdata __sched_schedstats = false; -#ifdef CONFIG_SCHEDSTATS static void set_schedstats(bool enabled) { if (enabled) @@@ -2280,16 -2192,11 +2280,16 @@@ static int __init setup_schedstats(cha if (!str) goto out; + /* + * This code is called before jump labels have been set up, so we can't + * change the static branch directly just yet. Instead set a temporary + * variable so init_schedstats() can do it later. + */ if (!strcmp(str, "enable")) { - set_schedstats(true); + __sched_schedstats = true; ret = 1; } else if (!strcmp(str, "disable")) { - set_schedstats(false); + __sched_schedstats = false; ret = 1; } out: @@@ -2300,11 -2207,6 +2300,11 @@@ } __setup("schedstats=", setup_schedstats); +static void __init init_schedstats(void) +{ + set_schedstats(__sched_schedstats); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_schedstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@@ -2325,10 -2227,8 +2325,10 @@@ set_schedstats(state); return err; } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#else /* !CONFIG_SCHEDSTATS */ +static inline void init_schedstats(void) {} +#endif /* CONFIG_SCHEDSTATS */ /* * fork()/clone()-time setup: @@@ -2481,8 -2381,7 +2481,8 @@@ static int dl_overflow(struct task_stru u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; int cpus, err = -1; - if (new_bw == p->dl.dl_bw) + /* !deadline task may carry old deadline bandwidth */ + if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) return 0; /* @@@ -2521,12 -2420,12 +2521,12 @@@ extern void init_dl_bw(struct dl_bw *dl */ void wake_up_new_task(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; - raw_spin_lock_irqsave(&p->pi_lock, flags); /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@@ -2535,10 -2434,8 +2535,10 @@@ */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif + /* Post initialize new task's util average when its cfs_rq is set */ + post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@@ -2549,12 -2446,12 +2549,12 @@@ * Nothing relies on rq->lock after this, so its fine to * drop it. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } #endif - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@@ -2816,7 -2713,7 +2816,7 @@@ asmlinkage __visible void schedule_tail */ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, struct pin_cookie cookie) { struct mm_struct *mm, *oldmm; @@@ -2836,7 -2733,7 +2836,7 @@@ atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else - switch_mm(oldmm, mm, next); + switch_mm_irqs_off(oldmm, mm, next); if (!prev->mm) { prev->active_mm = NULL; @@@ -2848,7 -2745,7 +2848,7 @@@ * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ @@@ -2970,7 -2867,7 +2970,7 @@@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat) */ unsigned long long task_sched_runtime(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; u64 ns; @@@ -2990,7 -2887,7 +2990,7 @@@ return p->se.sum_exec_runtime; #endif - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Must be ->curr _and_ ->on_rq. If dequeued, we would * project cycles that may never be accounted to this @@@ -3001,7 -2898,7 +3001,7 @@@ p->sched_class->update_curr(rq); } ns = p->se.sum_exec_runtime; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ns; } @@@ -3021,7 -2918,7 +3021,7 @@@ void scheduler_tick(void raw_spin_lock(&rq->lock); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); - update_cpu_load_active(rq); + cpu_load_update_active(rq); calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); @@@ -3064,20 -2961,6 +3064,20 @@@ u64 scheduler_tick_max_deferment(void #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) +/* + * If the value passed in is equal to the current preempt count + * then we just disabled preemption. Start timing the latency. 
+ */ +static inline void preempt_latency_start(int val) +{ + if (preempt_count() == val) { + unsigned long ip = get_lock_parent_ip(); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } +} void preempt_count_add(int val) { @@@ -3096,21 -2979,17 +3096,21 @@@ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) { - unsigned long ip = get_lock_parent_ip(); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } + preempt_latency_start(val); } EXPORT_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); +/* + * If the value passed in equals to the current preempt count + * then we just enabled preemption. Stop timing the latency. + */ +static inline void preempt_latency_stop(int val) +{ + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); +} + void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT @@@ -3127,15 -3006,13 +3127,15 @@@ return; #endif - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); + preempt_latency_stop(val); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); +#else +static inline void preempt_latency_start(int val) { } +static inline void preempt_latency_stop(int val) { } #endif /* @@@ -3170,7 -3047,8 +3170,8 @@@ static noinline void __schedule_bug(str static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(task_stack_end_corrupted(prev)); + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); #endif if (unlikely(in_atomic_preempt_off())) { @@@ -3188,7 -3066,7 +3189,7 @@@ * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { const struct sched_class *class = &fair_sched_class; struct task_struct *p; @@@ -3199,20 -3077,20 +3200,20 @@@ */ if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev); + p = fair_sched_class.pick_next_task(rq, prev, cookie); if (unlikely(p == RETRY_TASK)) goto again; /* assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev); + p = idle_sched_class.pick_next_task(rq, prev, cookie); return p; } again: for_each_class(class) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, cookie); if (p) { if (unlikely(p == RETRY_TASK)) goto again; @@@ -3266,7 -3144,6 +3267,7 @@@ static void __sched notrace __schedule( { struct task_struct *prev, *next; unsigned long *switch_count; + struct pin_cookie cookie; struct rq *rq; int cpu; @@@ -3300,7 -3177,7 +3301,7 @@@ */ smp_mb__before_spinlock(); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@@ -3322,7 -3199,7 +3323,7 @@@ to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) - try_to_wake_up_local(to_wakeup); + try_to_wake_up_local(to_wakeup, cookie); } } switch_count = &prev->nvcsw; @@@ -3331,7 -3208,7 +3332,7 @@@ if (task_on_rq_queued(prev)) update_rq_clock(rq); - next = pick_next_task(rq, prev); + next = pick_next_task(rq, prev, cookie); clear_tsk_need_resched(prev); 
clear_preempt_need_resched(); rq->clock_skip_update = 0; @@@ -3342,9 -3219,9 +3343,9 @@@ ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the rq */ + rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ } else { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irq(&rq->lock); } @@@ -3411,23 -3288,8 +3412,23 @@@ void __sched schedule_preempt_disabled( static void __sched notrace preempt_schedule_common(void) { do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); __schedule(true); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); /* @@@ -3479,21 -3341,7 +3480,21 @@@ asmlinkage __visible void __sched notra return; do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@@ -3503,7 -3351,6 +3504,7 @@@ __schedule(true); exception_exit(prev_ctx); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); } while (need_resched()); } @@@ -3560,13 -3407,12 +3561,13 @@@ EXPORT_SYMBOL(default_wake_function) void rt_mutex_setprio(struct task_struct *p, int prio) { int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; - struct rq *rq; const struct sched_class *prev_class; + struct rq_flags rf; + struct rq *rq; BUG_ON(prio > MAX_PRIO); - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); /* * Idle task boosting is a nono in general. There is one @@@ -3642,7 -3488,7 +3643,7 @@@ check_class_changed(rq, p, prev_class, oldprio); out_unlock: preempt_disable(); /* avoid rq from going away on us */ - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); balance_callback(rq); preempt_enable(); @@@ -3652,7 -3498,7 +3653,7 @@@ void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, queued; - unsigned long flags; + struct rq_flags rf; struct rq *rq; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@@ -3661,7 -3507,7 +3662,7 @@@ * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. 
*/ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@@ -3692,7 -3538,7 +3693,7 @@@ resched_curr(rq); } out_unlock: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); @@@ -3989,11 -3835,11 +3990,11 @@@ static int __sched_setscheduler(struct MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, queued, running; int new_effective_prio, policy = attr->sched_policy; - unsigned long flags; const struct sched_class *prev_class; - struct rq *rq; + struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + struct rq *rq; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@@ -4088,13 -3934,13 +4089,13 @@@ recheck * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EINVAL; } @@@ -4111,7 -3957,7 +4112,7 @@@ goto change; p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return 0; } change: @@@ -4125,7 -3971,7 +4126,7 @@@ if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } #endif @@@ -4140,7 -3986,7 +4141,7 @@@ */ if (!cpumask_subset(span, &p->cpus_allowed) || rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } } @@@ -4150,7 -3996,7 +4151,7 @@@ /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); goto recheck; } @@@ -4160,7 -4006,7 +4161,7 @@@ * is available. 
*/ if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EBUSY; } @@@ -4205,7 -4051,7 +4206,7 @@@ check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); if (pi) rt_mutex_adjust_pi(p); @@@ -5058,10 -4904,10 +5059,10 @@@ SYSCALL_DEFINE2(sched_rr_get_interval, { struct task_struct *p; unsigned int time_slice; - unsigned long flags; + struct rq_flags rf; + struct timespec t; struct rq *rq; int retval; - struct timespec t; if (pid < 0) return -EINVAL; @@@ -5076,11 -4922,11 +5077,11 @@@ if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); time_slice = 0; if (p->sched_class->get_rr_interval) time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); @@@ -5156,8 -5002,7 +5157,8 @@@ void show_state_filter(unsigned long st touch_all_softlockup_watchdogs(); #ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); + if (!state_filter) + sysrq_sched_debug_show(); #endif rcu_read_unlock(); /* @@@ -5319,8 -5164,6 +5320,8 @@@ out #ifdef CONFIG_SMP +static bool sched_smp_initialized __read_mostly; + #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ int migrate_task_to(struct task_struct *p, int target_cpu) @@@ -5346,11 -5189,11 +5347,11 @@@ */ void sched_setnuma(struct task_struct *p, int nid) { - struct rq *rq; - unsigned long flags; bool queued, running; + struct rq_flags rf; + struct rq *rq; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); running = task_current(rq, p); @@@ -5365,7 -5208,7 +5366,7 @@@ p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@@ -5381,7 -5224,7 +5382,7 @@@ void idle_task_exit(void BUG_ON(cpu_online(smp_processor_id())); if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); + switch_mm_irqs_off(mm, &init_mm, current); finish_arch_post_lock_switch(); } mmdrop(mm); @@@ -5429,7 -5272,6 +5430,7 @@@ static void migrate_tasks(struct rq *de { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; + struct pin_cookie cookie; int dest_cpu; /* @@@ -5461,8 -5303,8 +5462,8 @@@ /* * pick_next_task assumes pinned rq->lock. */ - lockdep_pin_lock(&rq->lock); - next = pick_next_task(rq, &fake_task); + cookie = lockdep_pin_lock(&rq->lock); + next = pick_next_task(rq, &fake_task, cookie); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@@ -5475,7 -5317,7 +5476,7 @@@ * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); @@@ -5536,13 -5378,127 +5537,13 @@@ static void set_rq_offline(struct rq *r } } -/* - * migration_call - callback that gets triggered when a CPU is added. - * Here we can start up the necessary migration thread for the new CPU. 
- */ -static int -migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +static void set_cpu_rq_start_time(unsigned int cpu) { - int cpu = (long)hcpu; - unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - rq->calc_load_update = calc_load_update; - account_reset_rq(rq); - break; - - case CPU_ONLINE: - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - - set_rq_online(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - sched_ttwu_pending(); - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - migrate_tasks(rq); - BUG_ON(rq->nr_running != 1); /* the migration thread */ - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - - case CPU_DEAD: - calc_load_migrate(rq); - break; -#endif - } - - update_max_interval(); - - return NOTIFY_OK; -} - -/* - * Register at high priority so that task migration (migrate_all_tasks) - * happens before everything else. This has to be lower priority than - * the notifier in the perf_event subsystem, though. - */ -static struct notifier_block migration_notifier = { - .notifier_call = migration_call, - .priority = CPU_PRI_MIGRATION, -}; - -static void set_cpu_rq_start_time(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); rq->age_stamp = sched_clock_cpu(cpu); } -static int sched_cpu_active(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - set_cpu_rq_start_time(); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - set_cpu_active(cpu, true); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - -static int sched_cpu_inactive(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -static int __init migration_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - /* Initialize migration for the boot CPU */ - err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); - migration_call(&migration_notifier, CPU_ONLINE, cpu); - register_cpu_notifier(&migration_notifier); - - /* Register cpu active notifiers */ - cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); - cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); - - return 0; -} -early_initcall(migration_init); - static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG @@@ -6690,10 -6646,10 +6691,10 @@@ static void sched_init_numa(void init_numa_topology_type(); } -static void sched_domains_numa_masks_set(int cpu) +static void sched_domains_numa_masks_set(unsigned int cpu) { - int i, j; int node = cpu_to_node(cpu); + int i, j; for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { @@@ -6703,20 -6659,51 +6704,20 @@@ } } -static void sched_domains_numa_masks_clear(int cpu) +static void sched_domains_numa_masks_clear(unsigned int cpu) { int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); } } -/* - * Update 
sched_domains_numa_masks[level][node] array when new cpus - * are onlined. - */ -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - sched_domains_numa_masks_set(cpu); - break; - - case CPU_DEAD: - sched_domains_numa_masks_clear(cpu); - break; - - default: - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} #else -static inline void sched_init_numa(void) -{ -} - -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - return 0; -} +static inline void sched_init_numa(void) { } +static void sched_domains_numa_masks_set(unsigned int cpu) { } +static void sched_domains_numa_masks_clear(unsigned int cpu) { } #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@@ -7106,9 -7093,13 +7107,9 @@@ static int num_cpus_frozen; /* used to * If we come here as part of a suspend/resume, don't touch cpusets because we * want to restore it back to its original state upon resume anyway. */ -static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static void cpuset_cpu_active(void) { - switch (action) { - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED_FROZEN: - + if (cpuhp_tasks_frozen) { /* * num_cpus_frozen tracks how many CPUs are involved in suspend * resume sequence. As long as this is not the last online @@@ -7118,25 -7109,35 +7119,25 @@@ num_cpus_frozen--; if (likely(num_cpus_frozen)) { partition_sched_domains(1, NULL, NULL); - break; + return; } - /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ - - case CPU_ONLINE: - cpuset_update_active_cpus(true); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + cpuset_update_active_cpus(true); } -static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int cpuset_cpu_inactive(unsigned int cpu) { unsigned long flags; - long cpu = (long)hcpu; struct dl_bw *dl_b; bool overflow; int cpus; - switch (action) { - case CPU_DOWN_PREPARE: + if (!cpuhp_tasks_frozen) { rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); @@@ -7148,119 -7149,18 +7149,119 @@@ rcu_read_unlock_sched(); if (overflow) - return notifier_from_errno(-EBUSY); + return -EBUSY; cpuset_update_active_cpus(false); - break; - case CPU_DOWN_PREPARE_FROZEN: + } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + return 0; +} + +int sched_cpu_activate(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + set_cpu_active(cpu, true); + + if (sched_smp_initialized) { + sched_domains_numa_masks_set(cpu); + cpuset_cpu_active(); + } + + /* + * Put the rq online, if not already. This happens: + * + * 1) In the early boot process, because we build the real domains + * after all cpus have been brought up. + * + * 2) At runtime, if cpuset_cpu_active() fails to rebuild the + * domains. 
+ */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + update_max_interval(); + + return 0; +} + +int sched_cpu_deactivate(unsigned int cpu) +{ + int ret; + + set_cpu_active(cpu, false); + /* + * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU + * users of this state to go away such that all new such users will + * observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so wait for both. + * + * Do sync before park smpboot threads to take care the rcu boost case. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); + + if (!sched_smp_initialized) + return 0; + + ret = cpuset_cpu_inactive(cpu); + if (ret) { + set_cpu_active(cpu, true); + return ret; + } + sched_domains_numa_masks_clear(cpu); + return 0; +} + +static void sched_rq_cpu_starting(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + rq->calc_load_update = calc_load_update; + account_reset_rq(rq); + update_max_interval(); +} + +int sched_cpu_starting(unsigned int cpu) +{ + set_cpu_rq_start_time(cpu); + sched_rq_cpu_starting(cpu); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +int sched_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Handle pending wakeups and then migrate everything off */ + sched_ttwu_pending(); + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + migrate_tasks(rq); + BUG_ON(rq->nr_running != 1); + raw_spin_unlock_irqrestore(&rq->lock, flags); + calc_load_migrate(rq); + update_max_interval(); + nohz_balance_exit_idle(cpu); + hrtick_clear(rq); + return 0; } +#endif void __init sched_init_smp(void) { @@@ -7283,6 -7183,12 +7284,6 @@@ cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); - hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); - hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - - init_hrtick(); - /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); @@@ -7291,16 -7197,7 +7292,16 @@@ init_sched_rt_class(); init_sched_dl_class(); + sched_smp_initialized = true; +} + +static int __init migration_init(void) +{ + sched_rq_cpu_starting(smp_processor_id()); + return 0; } +early_initcall(migration_init); + #else void __init sched_init_smp(void) { @@@ -7435,6 -7332,8 +7436,6 @@@ void __init sched_init(void for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; - rq->last_load_update_tick = jiffies; - #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@@ -7453,13 -7352,12 +7454,13 @@@ rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON + rq->last_load_update_tick = jiffies; rq->nohz_flags = 0; #endif #ifdef CONFIG_NO_HZ_FULL rq->last_sched_tick = 0; #endif -#endif +#endif /* CONFIG_SMP */ init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); } @@@ -7497,12 -7395,10 +7498,12 @@@ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); - set_cpu_rq_start_time(); + set_cpu_rq_start_time(smp_processor_id()); #endif init_sched_fair_class(); + init_schedstats(); + scheduler_running = 1; } @@@ -7744,10 -7640,10 +7745,10 @@@ void sched_move_task(struct task_struc { struct 
task_group *tg; int queued, running; - unsigned long flags; + struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(tsk, &flags); + rq = task_rq_lock(tsk, &rf); running = task_current(rq, tsk); queued = task_on_rq_queued(tsk); @@@ -7779,7 -7675,7 +7780,7 @@@ if (queued) enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); - task_rq_unlock(rq, tsk, &flags); + task_rq_unlock(rq, tsk, &rf); } #endif /* CONFIG_CGROUP_SCHED */ @@@ -7999,7 -7895,7 +8000,7 @@@ static int sched_rt_can_attach(struct t static int sched_rt_global_constraints(void) { unsigned long flags; - int i, ret = 0; + int i; raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { @@@ -8011,7 -7907,7 +8012,7 @@@ } raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return ret; + return 0; } #endif /* CONFIG_RT_GROUP_SCHED */
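
The schedule_debug() hunk above is the visible piece of the "sched: panic on corrupted stack end" patch carried by this merge: when the magic word at the very end of a task's stack has been overwritten, the scheduler now calls panic() instead of BUG_ON(), since a BUG oops may be survivable depending on configuration while a task whose stack end is corrupted cannot be trusted to keep running. What follows is a minimal user-space sketch of that end-of-stack canary scheme; the struct, constant and helper names are illustrative stand-ins rather than the kernel's own definitions (the kernel keeps its canary at the low end of the task stack and tests it via task_stack_end_corrupted()).

/*
 * Minimal sketch of an end-of-stack canary check, in the spirit of the
 * schedule_debug() change above.  All names and sizes here are invented
 * for illustration; only the overall idea matches the kernel.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define STACK_SIZE      8192
#define STACK_END_MAGIC 0x57AC6E9DUL   /* canary value, kernel uses a similar magic */

struct fake_task {
	unsigned long *stack;          /* lowest address of the stack area */
};

/* Plant the canary at the lowest address of the stack area. */
static void set_stack_end_magic(struct fake_task *t)
{
	t->stack[0] = STACK_END_MAGIC;
}

/* A corrupted canary means something wrote past the end of the stack. */
static int stack_end_corrupted(const struct fake_task *t)
{
	return t->stack[0] != STACK_END_MAGIC;
}

int main(void)
{
	struct fake_task t;

	t.stack = calloc(1, STACK_SIZE);
	if (!t.stack)
		return 1;
	set_stack_end_magic(&t);

	/* Simulate a stack overflow scribbling over the canary. */
	memset(t.stack, 0xAA, 64);

	if (stack_end_corrupted(&t)) {
		/* The patched scheduler panics here instead of BUG_ON(). */
		fprintf(stderr, "corrupted stack end detected\n");
		exit(EXIT_FAILURE);
	}

	free(t.stack);
	return 0;
}

Built with any C compiler, the sketch reports the corruption and exits non-zero, mirroring in spirit the hard stop the patched scheduler now enforces on every context switch when CONFIG_SCHED_STACK_END_CHECK is enabled.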