Merge branch 'x86-trampoline-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[cascardo/linux.git] / arch / x86 / kernel / smpboot.c
index 757c4b1..f56f96d 100644
 /* State of each CPU */
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
-/* Store all idle threads, this can be reused instead of creating
-* a new thread. Also avoids complicated thread destroy functionality
-* for idle threads.
-*/
 #ifdef CONFIG_HOTPLUG_CPU
-/*
- * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
- * removed after init for !CONFIG_HOTPLUG_CPU.
- */
-static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
-#define get_idle_for_cpu(x)      (per_cpu(idle_thread_array, x))
-#define set_idle_for_cpu(x, p)   (per_cpu(idle_thread_array, x) = (p))
-
 /*
  * We need this to protect trampoline_base from concurrent accesses
  * when cores are being offlined and onlined in rapid succession.
@@ -99,20 +87,16 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
 
 void cpu_hotplug_driver_lock(void)
 {
-        mutex_lock(&x86_cpu_hotplug_driver_mutex);
+       mutex_lock(&x86_cpu_hotplug_driver_mutex);
 }
 
 void cpu_hotplug_driver_unlock(void)
 {
-        mutex_unlock(&x86_cpu_hotplug_driver_mutex);
+       mutex_unlock(&x86_cpu_hotplug_driver_mutex);
 }
 
 ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
 ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
-#else
-static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-#define get_idle_for_cpu(x)      (idle_thread_array[(x)])
-#define set_idle_for_cpu(x, p)   (idle_thread_array[(x)] = (p))
 #endif
 
 /* Number of siblings per CPU package */
@@ -317,59 +301,90 @@ void __cpuinit smp_store_cpu_info(int id)
                identify_secondary_cpu(c);
 }
 
-static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+static bool __cpuinit
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
 {
-       cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
-       cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
-       cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
-       cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
-       cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
-       cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+       int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+       return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+               "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+               "[node: %d != %d]. Ignoring dependency.\n",
+               cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
 }
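topology_sane() returns true when both CPUs sit on the same NUMA node; otherwise it WARNs once and the caller skips the sibling link. With hypothetical CPU and node numbers, the emitted warning would read:

    sched: CPU #4's smt-sibling CPU #5 is not on the same node! [node: 0 != 1]. Ignoring dependency.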
 
+#define link_mask(_m, c1, c2)                                          \
+do {                                                                   \
+       cpumask_set_cpu((c1), cpu_##_m##_mask(c2));                     \
+       cpumask_set_cpu((c2), cpu_##_m##_mask(c1));                     \
+} while (0)
+
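For reference, the token pasting in link_mask() resolves to the per-mask accessors; modulo the do/while (0) wrapper, link_mask(sibling, 2, 3) expands to:

    cpumask_set_cpu(2, cpu_sibling_mask(3));
    cpumask_set_cpu(3, cpu_sibling_mask(2));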
+static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+       if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+               int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+               if (c->phys_proc_id == o->phys_proc_id &&
+                   per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
+                   c->compute_unit_id == o->compute_unit_id)
+                       return topology_sane(c, o, "smt");
+
+       } else if (c->phys_proc_id == o->phys_proc_id &&
+                  c->cpu_core_id == o->cpu_core_id) {
+               return topology_sane(c, o, "smt");
+       }
+
+       return false;
+}
+
+static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+       int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+       if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
+           per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
+               return topology_sane(c, o, "llc");
+
+       return false;
+}
+
+static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+       if (c->phys_proc_id == o->phys_proc_id)
+               return topology_sane(c, o, "mc");
+
+       return false;
+}
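Taken together, the three matchers nest on a sane topology: SMT siblings share an LLC, and LLC sharers sit in the same package (assuming valid cpu_llc_id values). A sketch of the intended containment, for illustration only:

    /* match_smt(c, o)  =>  match_llc(c, o)  =>  match_mc(c, o) */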
 
 void __cpuinit set_cpu_sibling_map(int cpu)
 {
-       int i;
+       bool has_mc = boot_cpu_data.x86_max_cores > 1;
+       bool has_smt = smp_num_siblings > 1;
        struct cpuinfo_x86 *c = &cpu_data(cpu);
+       struct cpuinfo_x86 *o;
+       int i;
 
        cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
 
-       if (smp_num_siblings > 1) {
-               for_each_cpu(i, cpu_sibling_setup_mask) {
-                       struct cpuinfo_x86 *o = &cpu_data(i);
-
-                       if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
-                               if (c->phys_proc_id == o->phys_proc_id &&
-                                   per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
-                                   c->compute_unit_id == o->compute_unit_id)
-                                       link_thread_siblings(cpu, i);
-                       } else if (c->phys_proc_id == o->phys_proc_id &&
-                                  c->cpu_core_id == o->cpu_core_id) {
-                               link_thread_siblings(cpu, i);
-                       }
-               }
-       } else {
+       if (!has_smt && !has_mc) {
                cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
-       }
-
-       cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
-
-       if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
-               cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
+               cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+               cpumask_set_cpu(cpu, cpu_core_mask(cpu));
                c->booted_cores = 1;
                return;
        }
 
        for_each_cpu(i, cpu_sibling_setup_mask) {
-               if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
-                   per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
-                       cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
-                       cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
-               }
-               if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
-                       cpumask_set_cpu(i, cpu_core_mask(cpu));
-                       cpumask_set_cpu(cpu, cpu_core_mask(i));
+               o = &cpu_data(i);
+
+               if ((i == cpu) || (has_smt && match_smt(c, o)))
+                       link_mask(sibling, cpu, i);
+
+               if ((i == cpu) || (has_mc && match_llc(c, o)))
+                       link_mask(llc_shared, cpu, i);
+
+               if ((i == cpu) || (has_mc && match_mc(c, o))) {
+                       link_mask(core, cpu, i);
+
                        /*
                         *  Does this new CPU bring up a new core?
                         */
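The rewritten loop always links a CPU to itself via the i == cpu term and gates the cross-CPU checks on has_smt/has_mc. On a hypothetical single-socket part with two cores of two threads each (CPUs 0-3, shared LLC assumed), the masks for CPU 0 would come out as:

    cpu_sibling_mask(0)    = { 0, 1 }        /* SMT siblings            */
    cpu_llc_shared_mask(0) = { 0, 1, 2, 3 }  /* shared last-level cache */
    cpu_core_mask(0)       = { 0, 1, 2, 3 }  /* same physical package   */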
@@ -400,8 +415,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
         * For perf, we return last level cache shared map.
         * And for power savings, we return cpu_core_map
         */
-       if ((sched_mc_power_savings || sched_smt_power_savings) &&
-           !(cpu_has(c, X86_FEATURE_AMD_DCM)))
+       if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
                return cpu_core_mask(cpu);
        else
                return cpu_llc_shared_mask(cpu);
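This hunk comes from the removal of the power-aware scheduler knobs: with sched_mc_power_savings and sched_smt_power_savings gone, the AMD_DCM check (set on multi-node processors such as Magny-Cours) alone picks between the package mask and the LLC mask, and the second sentence of the comment above is stale as of this change.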
@@ -620,22 +634,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
        return (send_status | accept_status);
 }
 
-struct create_idle {
-       struct work_struct work;
-       struct task_struct *idle;
-       struct completion done;
-       int cpu;
-};
-
-static void __cpuinit do_fork_idle(struct work_struct *work)
-{
-       struct create_idle *c_idle =
-               container_of(work, struct create_idle, work);
-
-       c_idle->idle = fork_idle(c_idle->cpu);
-       complete(&c_idle->done);
-}
-
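The deleted create_idle/do_fork_idle machinery is obsolete because the generic hotplug core now forks one idle task per CPU up front, caches it, and hands it down through __cpu_up(). A sketch of the new call chain; the generic-side helper name is assumed from the companion smpboot series:

    struct task_struct *tidle = idle_thread_get(cpu); /* generic code, assumed */
    native_cpu_up(cpu, tidle);                        /* arch entry point      */
    /* ... which in turn calls do_boot_cpu(apicid, cpu, tidle) */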
 /* reduce the number of lines printed when booting a large cpu count system */
 static void __cpuinit announce_cpu(int cpu, int apicid)
 {
@@ -662,7 +660,7 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
  * Returns zero if CPU booted OK, else error code from
  * ->wakeup_secondary_cpu.
  */
-static int __cpuinit do_boot_cpu(int apicid, int cpu)
+static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 {
        volatile u32 *trampoline_status =
                (volatile u32 *) __va(real_mode_header->trampoline_status);
@@ -671,53 +669,26 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 
        unsigned long boot_error = 0;
        int timeout;
-       struct create_idle c_idle = {
-               .cpu    = cpu,
-               .done   = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
-       };
-
-       INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
 
        alternatives_smp_switch(1);
 
-       c_idle.idle = get_idle_for_cpu(cpu);
-
-       /*
-        * We can't use kernel_thread since we must avoid to
-        * reschedule the child.
-        */
-       if (c_idle.idle) {
-               c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
-                       (THREAD_SIZE +  task_stack_page(c_idle.idle))) - 1);
-               init_idle(c_idle.idle, cpu);
-               goto do_rest;
-       }
-
-       schedule_work(&c_idle.work);
-       wait_for_completion(&c_idle.done);
+       idle->thread.sp = (unsigned long) (((struct pt_regs *)
+                         (THREAD_SIZE +  task_stack_page(idle))) - 1);
+       per_cpu(current_task, cpu) = idle;
 
-       if (IS_ERR(c_idle.idle)) {
-               printk("failed fork for CPU %d\n", cpu);
-               destroy_work_on_stack(&c_idle.work);
-               return PTR_ERR(c_idle.idle);
-       }
-
-       set_idle_for_cpu(cpu, c_idle.idle);
-do_rest:
-       per_cpu(current_task, cpu) = c_idle.idle;
 #ifdef CONFIG_X86_32
        /* Stack for startup_32 can be just as for start_secondary onwards */
        irq_ctx_init(cpu);
 #else
-       clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+       clear_tsk_thread_flag(idle, TIF_FORK);
        initial_gs = per_cpu_offset(cpu);
        per_cpu(kernel_stack, cpu) =
-               (unsigned long)task_stack_page(c_idle.idle) -
+               (unsigned long)task_stack_page(idle) -
                KERNEL_STACK_OFFSET + THREAD_SIZE;
 #endif
        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
        initial_code = (unsigned long)start_secondary;
-       stack_start  = c_idle.idle->thread.sp;
+       stack_start  = idle->thread.sp;
 
        /* So we see what's up */
        announce_cpu(cpu, apicid);
@@ -815,12 +786,10 @@ do_rest:
                 */
                smpboot_restore_warm_reset_vector();
        }
-
-       destroy_work_on_stack(&c_idle.work);
        return boot_error;
 }
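For clarity, the thread.sp assignment in do_boot_cpu() parks the initial stack pointer just below a struct pt_regs carved off the top of the idle task's stack. A minimal sketch of the same arithmetic, assuming the standard THREAD_SIZE stack layout:

    void *stack_top = task_stack_page(idle) + THREAD_SIZE; /* one past the stack end */
    idle->thread.sp = (unsigned long)
                      ((struct pt_regs *)stack_top - 1);   /* leave room for pt_regs */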
 
-int __cpuinit native_cpu_up(unsigned int cpu)
+int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
        int apicid = apic->cpu_present_to_apicid(cpu);
        unsigned long flags;
@@ -853,7 +822,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
 
        per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 
-       err = do_boot_cpu(apicid, cpu);
+       err = do_boot_cpu(apicid, cpu, tidle);
        if (err) {
                pr_debug("do_boot_cpu failed %d\n", err);
                return -EIO;