Merge branch 'x86-trampoline-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[cascardo/linux.git] / arch / x86 / kernel / smpboot.c
index 757c4b1..f56f96d 100644
 /* State of each CPU */
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
-/* Store all idle threads, this can be reused instead of creating
-* a new thread. Also avoids complicated thread destroy functionality
-* for idle threads.
-*/
 #ifdef CONFIG_HOTPLUG_CPU
-/*
- * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
- * removed after init for !CONFIG_HOTPLUG_CPU.
- */
-static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
-#define get_idle_for_cpu(x)      (per_cpu(idle_thread_array, x))
-#define set_idle_for_cpu(x, p)   (per_cpu(idle_thread_array, x) = (p))
-
 /*
  * We need this to protect trampoline_base from concurrent accesses
  * when cores are being offlined and onlined in rapid succession.
@@ -99,20 +87,16 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
 
 void cpu_hotplug_driver_lock(void)
 {
-        mutex_lock(&x86_cpu_hotplug_driver_mutex);
+       mutex_lock(&x86_cpu_hotplug_driver_mutex);
 }
 
 void cpu_hotplug_driver_unlock(void)
 {
-        mutex_unlock(&x86_cpu_hotplug_driver_mutex);
+       mutex_unlock(&x86_cpu_hotplug_driver_mutex);
 }
 
 ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
 ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
-#else
-static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-#define get_idle_for_cpu(x)      (idle_thread_array[(x)])
-#define set_idle_for_cpu(x, p)   (idle_thread_array[(x)] = (p))
 #endif
 
 /* Number of siblings per CPU package */
@@ -317,59 +301,90 @@ void __cpuinit smp_store_cpu_info(int id)
                identify_secondary_cpu(c);
 }
 
-static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+static bool __cpuinit
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
 {
-       cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
-       cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
-       cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
-       cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
-       cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
-       cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+       int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+       return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+               "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+               "[node: %d != %d]. Ignoring dependency.\n",
+               cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
 }
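topology_sane() returns true when both CPUs sit on the same NUMA node; otherwise it WARNs once and the caller skips the sibling link. With hypothetical CPU and node numbers, the emitted warning would read:

    sched: CPU #4's smt-sibling CPU #5 is not on the same node! [node: 0 != 1]. Ignoring dependency.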
 
+#define link_mask(_m, c1, c2)                                          \
+do {                                                                   \
+       cpumask_set_cpu((c1), cpu_##_m##_mask(c2));                     \
+       cpumask_set_cpu((c2), cpu_##_m##_mask(c1));                     \
+} while (0)
+
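For reference, the token pasting in link_mask() resolves to the per-mask accessors; modulo the do/while (0) wrapper, link_mask(sibling, 2, 3) expands to:

    cpumask_set_cpu(2, cpu_sibling_mask(3));
    cpumask_set_cpu(3, cpu_sibling_mask(2));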
+static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+       if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+               int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+               if (c->phys_proc_id == o->phys_proc_id &&
+                   per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
+                   c->compute_unit_id == o->compute_unit_id)
+                       return topology_sane(c, o, "smt");
+
+       } else if (c->phys_proc_id == o->phys_proc_id &&
+                  c->cpu_core_id == o->cpu_core_id) {
+               return topology_sane(c, o, "smt");
+       }
+
+       return false;
+}
+
+static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+       int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+       if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
+           per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
+               return topology_sane(c, o, "llc");
+
+       return false;
+}
+
+static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+       if (c->phys_proc_id == o->phys_proc_id)
+               return topology_sane(c, o, "mc");
+
+       return false;
+}
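Taken together, the three matchers nest on a sane topology: SMT siblings share an LLC, and LLC sharers sit in the same package (assuming valid cpu_llc_id values). A sketch of the intended containment, for illustration only:

    /* match_smt(c, o)  =>  match_llc(c, o)  =>  match_mc(c, o) */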
 
 void __cpuinit set_cpu_sibling_map(int cpu)
 {
-       int i;
+       bool has_mc = boot_cpu_data.x86_max_cores > 1;
+       bool has_smt = smp_num_siblings > 1;
        struct cpuinfo_x86 *c = &cpu_data(cpu);
+       struct cpuinfo_x86 *o;
+       int i;
 
        cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
 
-       if (smp_num_siblings > 1) {
-               for_each_cpu(i, cpu_sibling_setup_mask) {
-                       struct cpuinfo_x86 *o = &cpu_data(i);
-
-                       if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
-                               if (c->phys_proc_id == o->phys_proc_id &&
-                                   per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
-                                   c->compute_unit_id == o->compute_unit_id)
-                                       link_thread_siblings(cpu, i);
-                       } else if (c->phys_proc_id == o->phys_proc_id &&
-                                  c->cpu_core_id == o->cpu_core_id) {
-                               link_thread_siblings(cpu, i);
-                       }
-               }
-       } else {
+       if (!has_smt && !has_mc) {
                cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
-       }
-
-       cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
-
-       if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
-               cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
+               cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+               cpumask_set_cpu(cpu, cpu_core_mask(cpu));
                c->booted_cores = 1;
                return;
        }
 
        for_each_cpu(i, cpu_sibling_setup_mask) {
-               if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
-                   per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
-                       cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
-                       cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
-               }
-               if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
-                       cpumask_set_cpu(i, cpu_core_mask(cpu));
-                       cpumask_set_cpu(cpu, cpu_core_mask(i));
+               o = &cpu_data(i);
+
+               if ((i == cpu) || (has_smt && match_smt(c, o)))
+                       link_mask(sibling, cpu, i);
+
+               if ((i == cpu) || (has_mc && match_llc(c, o)))
+                       link_mask(llc_shared, cpu, i);
+
+               if ((i == cpu) || (has_mc && match_mc(c, o))) {
+                       link_mask(core, cpu, i);
+
                        /*
                         *  Does this new CPU bring up a new core?
                         */
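The rewritten loop always links a CPU to itself via the i == cpu term and gates the cross-CPU checks on has_smt/has_mc. On a hypothetical single-socket part with two cores of two threads each (CPUs 0-3, shared LLC assumed), the masks for CPU 0 would come out as:

    cpu_sibling_mask(0)    = { 0, 1 }        /* SMT siblings            */
    cpu_llc_shared_mask(0) = { 0, 1, 2, 3 }  /* shared last-level cache */
    cpu_core_mask(0)       = { 0, 1, 2, 3 }  /* same physical package   */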
@@ -400,8 +415,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
         * For perf, we return last level cache shared map.
         * And for power savings, we return cpu_core_map
         */
-       if ((sched_mc_power_savings || sched_smt_power_savings) &&
-           !(cpu_has(c, X86_FEATURE_AMD_DCM)))
+       if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
                return cpu_core_mask(cpu);
        else
                return cpu_llc_shared_mask(cpu);
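This hunk comes from the removal of the power-aware scheduler knobs: with sched_mc_power_savings and sched_smt_power_savings gone, the AMD_DCM check (set on multi-node processors such as Magny-Cours) alone picks between the package mask and the LLC mask, and the second sentence of the comment above is stale as of this change.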
@@ -620,22 +634,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
        return (send_status | accept_status);
 }
 
-struct create_idle {
-       struct work_struct work;
-       struct task_struct *idle;
-       struct completion done;
-       int cpu;
-};
-
-static void __cpuinit do_fork_idle(struct work_struct *work)
-{
-       struct create_idle *c_idle =
-               container_of(work, struct create_idle, work);
-
-       c_idle->idle = fork_idle(c_idle->cpu);
-       complete(&c_idle->done);
-}
-
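The deleted create_idle/do_fork_idle machinery is obsolete because the generic hotplug core now forks one idle task per CPU up front, caches it, and hands it down through __cpu_up(). A sketch of the new call chain; the generic-side helper name is assumed from the companion smpboot series:

    struct task_struct *tidle = idle_thread_get(cpu); /* generic code, assumed */
    native_cpu_up(cpu, tidle);                        /* arch entry point      */
    /* ... which in turn calls do_boot_cpu(apicid, cpu, tidle) */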
 /* reduce the number of lines printed when booting a large cpu count system */
 static void __cpuinit announce_cpu(int cpu, int apicid)
 {
@@ -662,7 +660,7 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
  * Returns zero if CPU booted OK, else error code from
  * ->wakeup_secondary_cpu.
  */
-static int __cpuinit do_boot_cpu(int apicid, int cpu)
+static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 {
        volatile u32 *trampoline_status =
                (volatile u32 *) __va(real_mode_header->trampoline_status);
@@ -671,53 +669,26 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 
        unsigned long boot_error = 0;
        int timeout;
-       struct create_idle c_idle = {
-               .cpu    = cpu,
-               .done   = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
-       };
-
-       INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
 
        alternatives_smp_switch(1);
 
-       c_idle.idle = get_idle_for_cpu(cpu);
-
-       /*
-        * We can't use kernel_thread since we must avoid to
-        * reschedule the child.
-        */
-       if (c_idle.idle) {
-               c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
-                       (THREAD_SIZE +  task_stack_page(c_idle.idle))) - 1);
-               init_idle(c_idle.idle, cpu);
-               goto do_rest;
-       }
-
-       schedule_work(&c_idle.work);
-       wait_for_completion(&c_idle.done);
+       idle->thread.sp = (unsigned long) (((struct pt_regs *)
+                         (THREAD_SIZE +  task_stack_page(idle))) - 1);
+       per_cpu(current_task, cpu) = idle;
 
-       if (IS_ERR(c_idle.idle)) {
-               printk("failed fork for CPU %d\n", cpu);
-               destroy_work_on_stack(&c_idle.work);
-               return PTR_ERR(c_idle.idle);
-       }
-
-       set_idle_for_cpu(cpu, c_idle.idle);
-do_rest:
-       per_cpu(current_task, cpu) = c_idle.idle;
 #ifdef CONFIG_X86_32
        /* Stack for startup_32 can be just as for start_secondary onwards */
        irq_ctx_init(cpu);
 #else
-       clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+       clear_tsk_thread_flag(idle, TIF_FORK);
        initial_gs = per_cpu_offset(cpu);
        per_cpu(kernel_stack, cpu) =
-               (unsigned long)task_stack_page(c_idle.idle) -
+               (unsigned long)task_stack_page(idle) -
                KERNEL_STACK_OFFSET + THREAD_SIZE;
 #endif
        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
        initial_code = (unsigned long)start_secondary;
-       stack_start  = c_idle.idle->thread.sp;
+       stack_start  = idle->thread.sp;
 
        /* So we see what's up */
        announce_cpu(cpu, apicid);
@@ -815,12 +786,10 @@ do_rest:
                 */
                smpboot_restore_warm_reset_vector();
        }
-
-       destroy_work_on_stack(&c_idle.work);
        return boot_error;
 }
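For clarity, the thread.sp assignment in do_boot_cpu() parks the initial stack pointer just below a struct pt_regs carved off the top of the idle task's stack. A minimal sketch of the same arithmetic, assuming the standard THREAD_SIZE stack layout:

    void *stack_top = task_stack_page(idle) + THREAD_SIZE; /* one past the stack end */
    idle->thread.sp = (unsigned long)
                      ((struct pt_regs *)stack_top - 1);   /* leave room for pt_regs */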
 
-int __cpuinit native_cpu_up(unsigned int cpu)
+int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
        int apicid = apic->cpu_present_to_apicid(cpu);
        unsigned long flags;
@@ -853,7 +822,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
 
        per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 
-       err = do_boot_cpu(apicid, cpu);
+       err = do_boot_cpu(apicid, cpu, tidle);
        if (err) {
                pr_debug("do_boot_cpu failed %d\n", err);
                return -EIO;