sched: remove the 'u64 now' parameter from enqueue_task()

[cascardo/linux.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 93cf241..0ecfdd1 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -53,6 +53,7 @@
  #include <linux/percpu.h>
  #include <linux/kthread.h>
  #include <linux/seq_file.h>
+#include <linux/sysctl.h>
  #include <linux/syscalls.h>
  #include <linux/times.h>
  #include <linux/tsacct_kern.h>
@@ -263,8 +264,6 @@ struct rq {
         unsigned int clock_warps, clock_overflows;
         unsigned int clock_unstable_events;
  
-       struct sched_class *load_balance_class;
-
         atomic_t nr_iowait;
  
  #ifdef CONFIG_SMP
@@ -319,15 +318,19 @@ static inline int cpu_of(struct rq *rq)
  }
  
  /*
- * Per-runqueue clock, as finegrained as the platform can give us:
+ * Update the per-runqueue clock, as fine-grained as the platform can give
+ * us, but without assuming monotonicity, etc.:
   */
-static unsigned long long __rq_clock(struct rq *rq)
+static void __update_rq_clock(struct rq *rq)
  {
         u64 prev_raw = rq->prev_clock_raw;
         u64 now = sched_clock();
         s64 delta = now - prev_raw;
         u64 clock = rq->clock;
  
+#ifdef CONFIG_SCHED_DEBUG
+       WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+#endif
         /*
          * Protect against sched_clock() occasionally going backwards:
          */
@@ -350,18 +353,12 @@ static unsigned long long __rq_clock(struct rq *rq)
  
         rq->prev_clock_raw = now;
         rq->clock = clock;
-
-       return clock;
  }
  
-static inline unsigned long long rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
  {
-       int this_cpu = smp_processor_id();
-
-       if (this_cpu == cpu_of(rq))
-               return __rq_clock(rq);
-
-       return rq->clock;
+       if (likely(smp_processor_id() == cpu_of(rq)))
+               __update_rq_clock(rq);
  }
  
  /*
@@ -385,13 +382,15 @@ static inline unsigned long long rq_clock(struct rq *rq)
   */
  unsigned long long cpu_clock(int cpu)
  {
-       struct rq *rq = cpu_rq(cpu);
         unsigned long long now;
         unsigned long flags;
+       struct rq *rq;
  
-       spin_lock_irqsave(&rq->lock, flags);
-       now = rq_clock(rq);
-       spin_unlock_irqrestore(&rq->lock, flags);
+       local_irq_save(flags);
+       rq = cpu_rq(cpu);
+       update_rq_clock(rq);
+       now = rq->clock;
+       local_irq_restore(flags);
  
         return now;
  }
@@ -639,7 +638,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor)
  
  #define WMULT_SHIFT    32
  
-static inline unsigned long
+static unsigned long
  calc_delta_mine(unsigned long delta_exec, unsigned long weight,
                 struct load_weight *lw)
  {
@@ -659,7 +658,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
                 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
         }
  
-       return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
+       return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
  }
  
  static inline unsigned long
@@ -680,46 +679,6 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec)
         lw->inv_weight = 0;
  }
  
-static void __update_curr_load(struct rq *rq, struct load_stat *ls)
-{
-       if (rq->curr != rq->idle && ls->load.weight) {
-               ls->delta_exec += ls->delta_stat;
-               ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
-               ls->delta_stat = 0;
-       }
-}
-
-/*
- * Update delta_exec, delta_fair fields for rq.
- *
- * delta_fair clock advances at a rate inversely proportional to
- * total load (rq->ls.load.weight) on the runqueue, while
- * delta_exec advances at the same rate as wall-clock (provided
- * cpu is not idle).
- *
- * delta_exec / delta_fair is a measure of the (smoothened) load on this
- * runqueue over any given interval. This (smoothened) load is used
- * during load balance.
- *
- * This function is called /before/ updating rq->ls.load
- * and when switching tasks.
- */
-static void update_curr_load(struct rq *rq, u64 now)
-{
-       struct load_stat *ls = &rq->ls;
-       u64 start;
-
-       start = ls->load_update_start;
-       ls->load_update_start = now;
-       ls->delta_stat += now - start;
-       /*
-        * Stagger updates to ls->delta_fair. Very frequent updates
-        * can be expensive.
-        */
-       if (ls->delta_stat >= sysctl_sched_stat_granularity)
-               __update_curr_load(rq, ls);
-}
-
  /*
   * To aid in avoiding the subversion of "niceness" due to uneven distribution
   * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -729,19 +688,6 @@ static void update_curr_load(struct rq *rq, u64 now)
   * slice expiry etc.
   */
  
-/*
- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
- * If static_prio_timeslice() is ever changed to break this assumption then
- * this code will need modification
- */
-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
-#define load_weight(lp) \
-       (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
-#define PRIO_TO_LOAD_WEIGHT(prio) \
-       load_weight(static_prio_timeslice(prio))
-#define RTPRIO_TO_LOAD_WEIGHT(rp) \
-       (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
-
  #define WEIGHT_IDLEPRIO                2
  #define WMULT_IDLEPRIO         (1 << 31)
  
@@ -783,32 +729,6 @@ static const u32 prio_to_wmult[40] = {
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
  };
  
-static inline void
-inc_load(struct rq *rq, const struct task_struct *p, u64 now)
-{
-       update_curr_load(rq, now);
-       update_load_add(&rq->ls.load, p->se.load.weight);
-}
-
-static inline void
-dec_load(struct rq *rq, const struct task_struct *p, u64 now)
-{
-       update_curr_load(rq, now);
-       update_load_sub(&rq->ls.load, p->se.load.weight);
-}
-
-static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
-{
-       rq->nr_running++;
-       inc_load(rq, p, now);
-}
-
-static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
-{
-       rq->nr_running--;
-       dec_load(rq, p, now);
-}
-
  static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
  
  /*
@@ -826,8 +746,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                       unsigned long max_nr_move, unsigned long max_load_move,
                       struct sched_domain *sd, enum cpu_idle_type idle,
                       int *all_pinned, unsigned long *load_moved,
-                     int this_best_prio, int best_prio, int best_prio_seen,
-                     struct rq_iterator *iterator);
+                     int *this_best_prio, struct rq_iterator *iterator);
  
  #include "sched_stats.h"
  #include "sched_rt.c"
@@ -839,6 +758,70 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
  
  #define sched_class_highest (&rt_sched_class)
  
+static void __update_curr_load(struct rq *rq, struct load_stat *ls)
+{
+       if (rq->curr != rq->idle && ls->load.weight) {
+               ls->delta_exec += ls->delta_stat;
+               ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
+               ls->delta_stat = 0;
+       }
+}
+
+/*
+ * Update delta_exec, delta_fair fields for rq.
+ *
+ * delta_fair clock advances at a rate inversely proportional to
+ * total load (rq->ls.load.weight) on the runqueue, while
+ * delta_exec advances at the same rate as wall-clock (provided
+ * cpu is not idle).
+ *
+ * delta_exec / delta_fair is a measure of the (smoothened) load on this
+ * runqueue over any given interval. This (smoothened) load is used
+ * during load balance.
+ *
+ * This function is called /before/ updating rq->ls.load
+ * and when switching tasks.
+ */
+static void update_curr_load(struct rq *rq)
+{
+       struct load_stat *ls = &rq->ls;
+       u64 start;
+
+       start = ls->load_update_start;
+       ls->load_update_start = rq->clock;
+       ls->delta_stat += rq->clock - start;
+       /*
+        * Stagger updates to ls->delta_fair. Very frequent updates
+        * can be expensive.
+        */
+       if (ls->delta_stat >= sysctl_sched_stat_granularity)
+               __update_curr_load(rq, ls);
+}
+
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+       update_curr_load(rq);
+       update_load_add(&rq->ls.load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+       update_curr_load(rq);
+       update_load_sub(&rq->ls.load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
+{
+       rq->nr_running++;
+       inc_load(rq, p);
+}
+
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
+{
+       rq->nr_running--;
+       dec_load(rq, p);
+}
+
  static void set_load_weight(struct task_struct *p)
  {
         task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
@@ -863,18 +846,17 @@ static void set_load_weight(struct task_struct *p)
         p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
  }
  
-static void
-enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
+static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
         sched_info_queued(p);
-       p->sched_class->enqueue_task(rq, p, wakeup, now);
+       p->sched_class->enqueue_task(rq, p, wakeup);
         p->se.on_rq = 1;
  }
  
  static void
  dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
  {
-       p->sched_class->dequeue_task(rq, p, sleep, now);
+       p->sched_class->dequeue_task(rq, p, sleep);
         p->se.on_rq = 0;
  }
  
@@ -929,13 +911,16 @@ static int effective_prio(struct task_struct *p)
   */
  static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
-       u64 now = rq_clock(rq);
+       u64 now;
+
+       update_rq_clock(rq);
+       now = rq->clock;
  
         if (p->state == TASK_UNINTERRUPTIBLE)
                 rq->nr_uninterruptible--;
  
-       enqueue_task(rq, p, wakeup, now);
-       inc_nr_running(p, rq, now);
+       enqueue_task(rq, p, wakeup);
+       inc_nr_running(p, rq);
  }
  
  /*
@@ -943,27 +928,29 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
   */
  static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
  {
-       u64 now = rq_clock(rq);
+       u64 now;
+
+       update_rq_clock(rq);
+       now = rq->clock;
  
         if (p->state == TASK_UNINTERRUPTIBLE)
                 rq->nr_uninterruptible--;
  
-       enqueue_task(rq, p, 0, now);
-       inc_nr_running(p, rq, now);
+       enqueue_task(rq, p, 0);
+       inc_nr_running(p, rq);
  }
  
  /*
   * deactivate_task - remove a task from the runqueue.
   */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+static void
+deactivate_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
  {
-       u64 now = rq_clock(rq);
-
         if (p->state == TASK_UNINTERRUPTIBLE)
                 rq->nr_uninterruptible++;
  
         dequeue_task(rq, p, sleep, now);
-       dec_nr_running(p, rq, now);
+       dec_nr_running(p, rq);
  }
  
  /**
@@ -998,18 +985,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         u64 clock_offset, fair_clock_offset;
  
         clock_offset = old_rq->clock - new_rq->clock;
-       fair_clock_offset = old_rq->cfs.fair_clock -
-                                                new_rq->cfs.fair_clock;
-       if (p->se.wait_start)
-               p->se.wait_start -= clock_offset;
+       fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
+
         if (p->se.wait_start_fair)
                 p->se.wait_start_fair -= fair_clock_offset;
+       if (p->se.sleep_start_fair)
+               p->se.sleep_start_fair -= fair_clock_offset;
+
+#ifdef CONFIG_SCHEDSTATS
+       if (p->se.wait_start)
+               p->se.wait_start -= clock_offset;
         if (p->se.sleep_start)
                 p->se.sleep_start -= clock_offset;
         if (p->se.block_start)
                 p->se.block_start -= clock_offset;
-       if (p->se.sleep_start_fair)
-               p->se.sleep_start_fair -= fair_clock_offset;
+#endif
  
         __set_task_cpu(p, new_cpu);
  }
@@ -1570,17 +1560,19 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
  static void __sched_fork(struct task_struct *p)
  {
         p->se.wait_start_fair           = 0;
-       p->se.wait_start                = 0;
         p->se.exec_start                = 0;
         p->se.sum_exec_runtime          = 0;
         p->se.delta_exec                = 0;
         p->se.delta_fair_run            = 0;
         p->se.delta_fair_sleep          = 0;
         p->se.wait_runtime              = 0;
+       p->se.sleep_start_fair          = 0;
+
+#ifdef CONFIG_SCHEDSTATS
+       p->se.wait_start                = 0;
         p->se.sum_wait_runtime          = 0;
         p->se.sum_sleep_runtime         = 0;
         p->se.sleep_start               = 0;
-       p->se.sleep_start_fair          = 0;
         p->se.block_start               = 0;
         p->se.sleep_max                 = 0;
         p->se.block_max                 = 0;
@@ -1588,10 +1580,15 @@ static void __sched_fork(struct task_struct *p)
         p->se.wait_max                  = 0;
         p->se.wait_runtime_overruns     = 0;
         p->se.wait_runtime_underruns    = 0;
+#endif
  
         INIT_LIST_HEAD(&p->run_list);
         p->se.on_rq = 0;
  
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+       INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+
         /*
          * We mark the process as running here, but have not actually
          * inserted it onto the runqueue yet. This guarantees that
@@ -1652,15 +1649,20 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
         unsigned long flags;
         struct rq *rq;
         int this_cpu;
+       u64 now;
  
         rq = task_rq_lock(p, &flags);
         BUG_ON(p->state != TASK_RUNNING);
         this_cpu = smp_processor_id(); /* parent's CPU */
+       update_rq_clock(rq);
+       now = rq->clock;
  
         p->prio = effective_prio(p);
  
-       if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
-                       task_cpu(p) != this_cpu || !current->se.on_rq) {
+       if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
+                       (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
+                       !current->se.on_rq) {
+
                 activate_task(rq, p, 0);
         } else {
                 /*
@@ -1668,14 +1670,74 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                  * management (if any):
                  */
                 p->sched_class->task_new(rq, p);
+               inc_nr_running(p, rq);
         }
         check_preempt_curr(rq, p);
         task_rq_unlock(rq, &flags);
  }
  
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+       hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+       hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+       struct preempt_notifier *notifier;
+       struct hlist_node *node;
+
+       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+               notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+                                struct task_struct *next)
+{
+       struct preempt_notifier *notifier;
+       struct hlist_node *node;
+
+       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+               notifier->ops->sched_out(notifier, next);
+}
+
+#else
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+                                struct task_struct *next)
+{
+}
+
+#endif
+
  /**
   * prepare_task_switch - prepare to switch tasks
   * @rq: the runqueue preparing to switch
+ * @prev: the current task that is being switched out
   * @next: the task we are going to switch to.
   *
   * This is called with the rq lock held and interrupts off. It must
@@ -1685,8 +1747,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
   * prepare_task_switch sets up locking and calls architecture specific
   * hooks.
   */
-static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+                   struct task_struct *next)
  {
+       fire_sched_out_preempt_notifiers(prev, next);
         prepare_lock_switch(rq, next);
         prepare_arch_switch(next);
  }
@@ -1728,6 +1793,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
         prev_state = prev->state;
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
+       fire_sched_in_preempt_notifiers(current);
         if (mm)
                 mmdrop(mm);
         if (unlikely(prev_state == TASK_DEAD)) {
@@ -1768,7 +1834,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
  {
         struct mm_struct *mm, *oldmm;
  
-       prepare_task_switch(rq, next);
+       prepare_task_switch(rq, prev, next);
         mm = next->mm;
         oldmm = prev->active_mm;
         /*
@@ -1891,15 +1957,18 @@ static void update_cpu_load(struct rq *this_rq)
         unsigned long total_load = this_rq->ls.load.weight;
         unsigned long this_load =  total_load;
         struct load_stat *ls = &this_rq->ls;
-       u64 now = __rq_clock(this_rq);
+       u64 now;
         int i, scale;
  
+       __update_rq_clock(this_rq);
+       now = this_rq->clock;
+
         this_rq->nr_load_updates++;
         if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
                 goto do_avg;
  
         /* Update delta_fair/delta_exec fields first */
-       update_curr_load(this_rq, now);
+       update_curr_load(this_rq);
  
         fair_delta64 = ls->delta_fair + 1;
         ls->delta_fair = 0;
@@ -1907,8 +1976,8 @@ static void update_cpu_load(struct rq *this_rq)
         exec_delta64 = ls->delta_exec + 1;
         ls->delta_exec = 0;
  
-       sample_interval64 = now - ls->load_update_last;
-       ls->load_update_last = now;
+       sample_interval64 = this_rq->clock - ls->load_update_last;
+       ls->load_update_last = this_rq->clock;
  
         if ((s64)sample_interval64 < (s64)TICK_NSEC)
                 sample_interval64 = TICK_NSEC;
@@ -2059,7 +2128,8 @@ void sched_exec(void)
  static void pull_task(struct rq *src_rq, struct task_struct *p,
                       struct rq *this_rq, int this_cpu)
  {
-       deactivate_task(src_rq, p, 0);
+       update_rq_clock(src_rq);
+       deactivate_task(src_rq, p, 0, src_rq->clock);
         set_task_cpu(p, this_cpu);
         activate_task(this_rq, p, 0);
         /*
@@ -2103,8 +2173,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                       unsigned long max_nr_move, unsigned long max_load_move,
                       struct sched_domain *sd, enum cpu_idle_type idle,
                       int *all_pinned, unsigned long *load_moved,
-                     int this_best_prio, int best_prio, int best_prio_seen,
-                     struct rq_iterator *iterator)
+                     int *this_best_prio, struct rq_iterator *iterator)
  {
         int pulled = 0, pinned = 0, skip_for_load;
         struct task_struct *p;
@@ -2129,12 +2198,8 @@ next:
          */
         skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
                                                          SCHED_LOAD_SCALE_FUZZ;
-       if (skip_for_load && p->prio < this_best_prio)
-               skip_for_load = !best_prio_seen && p->prio == best_prio;
-       if (skip_for_load ||
+       if ((skip_for_load && p->prio >= *this_best_prio) ||
             !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
-
-               best_prio_seen |= p->prio == best_prio;
                 p = iterator->next(iterator->arg);
                 goto next;
         }
@@ -2148,8 +2213,8 @@ next:
          * and the prescribed amount of weighted load.
          */
         if (pulled < max_nr_move && rem_load_move > 0) {
-               if (p->prio < this_best_prio)
-                       this_best_prio = p->prio;
+               if (p->prio < *this_best_prio)
+                       *this_best_prio = p->prio;
                 p = iterator->next(iterator->arg);
                 goto next;
         }
@@ -2168,32 +2233,52 @@ out:
  }
  
  /*
- * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
- * load from busiest to this_rq, as part of a balancing operation within
- * "domain". Returns the number of tasks moved.
+ * move_tasks tries to move up to max_load_move weighted load from busiest to
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
   *
   * Called with both runqueues locked.
   */
  static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                     unsigned long max_nr_move, unsigned long max_load_move,
+                     unsigned long max_load_move,
                       struct sched_domain *sd, enum cpu_idle_type idle,
                       int *all_pinned)
  {
         struct sched_class *class = sched_class_highest;
-       unsigned long load_moved, total_nr_moved = 0, nr_moved;
-       long rem_load_move = max_load_move;
+       unsigned long total_load_moved = 0;
+       int this_best_prio = this_rq->curr->prio;
  
         do {
-               nr_moved = class->load_balance(this_rq, this_cpu, busiest,
-                               max_nr_move, (unsigned long)rem_load_move,
-                               sd, idle, all_pinned, &load_moved);
-               total_nr_moved += nr_moved;
-               max_nr_move -= nr_moved;
-               rem_load_move -= load_moved;
+               total_load_moved +=
+                       class->load_balance(this_rq, this_cpu, busiest,
+                               ULONG_MAX, max_load_move - total_load_moved,
+                               sd, idle, all_pinned, &this_best_prio);
                 class = class->next;
-       } while (class && max_nr_move && rem_load_move > 0);
+       } while (class && max_load_move > total_load_moved);
  
-       return total_nr_moved;
+       return total_load_moved > 0;
+}
+
+/*
+ * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * part of active balancing operations within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                        struct sched_domain *sd, enum cpu_idle_type idle)
+{
+       struct sched_class *class;
+       int this_best_prio = MAX_PRIO;
+
+       for (class = sched_class_highest; class; class = class->next)
+               if (class->load_balance(this_rq, this_cpu, busiest,
+                                       1, ULONG_MAX, sd, idle, NULL,
+                                       &this_best_prio))
+                       return 1;
+
+       return 0;
  }
  
  /*
@@ -2525,11 +2610,6 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
   */
  #define MAX_PINNED_INTERVAL    512
  
-static inline unsigned long minus_1_or_zero(unsigned long n)
-{
-       return n > 0 ? n - 1 : 0;
-}
-
  /*
   * Check this_cpu to ensure it is balanced within domain. Attempt to move
   * tasks if there is an imbalance.
@@ -2538,7 +2618,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum cpu_idle_type idle,
                         int *balance)
  {
-       int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+       int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
         struct sched_group *group;
         unsigned long imbalance;
         struct rq *busiest;
@@ -2579,18 +2659,17 @@ redo:
  
         schedstat_add(sd, lb_imbalance[idle], imbalance);
  
-       nr_moved = 0;
+       ld_moved = 0;
         if (busiest->nr_running > 1) {
                 /*
                  * Attempt to move tasks. If find_busiest_group has found
                  * an imbalance but busiest->nr_running <= 1, the group is
-                * still unbalanced. nr_moved simply stays zero, so it is
+                * still unbalanced. ld_moved simply stays zero, so it is
                  * correctly treated as an imbalance.
                  */
                 local_irq_save(flags);
                 double_rq_lock(this_rq, busiest);
-               nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                     minus_1_or_zero(busiest->nr_running),
+               ld_moved = move_tasks(this_rq, this_cpu, busiest,
                                       imbalance, sd, idle, &all_pinned);
                 double_rq_unlock(this_rq, busiest);
                 local_irq_restore(flags);
@@ -2598,7 +2677,7 @@ redo:
                 /*
                  * some other cpu did the load balance for us.
                  */
-               if (nr_moved && this_cpu != smp_processor_id())
+               if (ld_moved && this_cpu != smp_processor_id())
                         resched_cpu(this_cpu);
  
                 /* All tasks on this runqueue were pinned by CPU affinity */
@@ -2610,7 +2689,7 @@ redo:
                 }
         }
  
-       if (!nr_moved) {
+       if (!ld_moved) {
                 schedstat_inc(sd, lb_failed[idle]);
                 sd->nr_balance_failed++;
  
@@ -2659,10 +2738,10 @@ redo:
                         sd->balance_interval *= 2;
         }
  
-       if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+       if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 return -1;
-       return nr_moved;
+       return ld_moved;
  
  out_balanced:
         schedstat_inc(sd, lb_balanced[idle]);
@@ -2694,7 +2773,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
         struct sched_group *group;
         struct rq *busiest = NULL;
         unsigned long imbalance;
-       int nr_moved = 0;
+       int ld_moved = 0;
         int sd_idle = 0;
         int all_pinned = 0;
         cpumask_t cpus = CPU_MASK_ALL;
@@ -2729,12 +2808,11 @@ redo:
  
         schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
  
-       nr_moved = 0;
+       ld_moved = 0;
         if (busiest->nr_running > 1) {
                 /* Attempt to move tasks */
                 double_lock_balance(this_rq, busiest);
-               nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                       minus_1_or_zero(busiest->nr_running),
+               ld_moved = move_tasks(this_rq, this_cpu, busiest,
                                         imbalance, sd, CPU_NEWLY_IDLE,
                                         &all_pinned);
                 spin_unlock(&busiest->lock);
@@ -2746,7 +2824,7 @@ redo:
                 }
         }
  
-       if (!nr_moved) {
+       if (!ld_moved) {
                 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
                 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
                     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
@@ -2754,7 +2832,7 @@ redo:
         } else
                 sd->nr_balance_failed = 0;
  
-       return nr_moved;
+       return ld_moved;
  
  out_balanced:
         schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
@@ -2842,9 +2920,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
         if (likely(sd)) {
                 schedstat_inc(sd, alb_cnt);
  
-               if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
-                              RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
-                              NULL))
+               if (move_one_task(target_rq, target_cpu, busiest_rq,
+                                 sd, CPU_IDLE))
                         schedstat_inc(sd, alb_pushed);
                 else
                         schedstat_inc(sd, alb_failed);
@@ -3113,8 +3190,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                       unsigned long max_nr_move, unsigned long max_load_move,
                       struct sched_domain *sd, enum cpu_idle_type idle,
                       int *all_pinned, unsigned long *load_moved,
-                     int this_best_prio, int best_prio, int best_prio_seen,
-                     struct rq_iterator *iterator)
+                     int *this_best_prio, struct rq_iterator *iterator)
  {
         *load_moved = 0;
  
@@ -3140,7 +3216,8 @@ unsigned long long task_sched_runtime(struct task_struct *p)
         rq = task_rq_lock(p, &flags);
         ns = p->se.sum_exec_runtime;
         if (rq->curr == p) {
-               delta_exec = rq_clock(rq) - p->se.exec_start;
+               update_rq_clock(rq);
+               delta_exec = rq->clock - p->se.exec_start;
                 if ((s64)delta_exec > 0)
                         ns += delta_exec;
         }
@@ -3236,9 +3313,9 @@ void scheduler_tick(void)
         struct task_struct *curr = rq->curr;
  
         spin_lock(&rq->lock);
+       update_cpu_load(rq);
         if (curr != rq->idle) /* FIXME: needed? */
                 curr->sched_class->task_tick(rq, curr);
-       update_cpu_load(rq);
         spin_unlock(&rq->lock);
  
  #ifdef CONFIG_SMP
@@ -3320,7 +3397,7 @@ static inline void schedule_debug(struct task_struct *prev)
   * Pick up the highest-prio task:
   */
  static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
+pick_next_task(struct rq *rq, struct task_struct *prev)
  {
         struct sched_class *class;
         struct task_struct *p;
@@ -3330,14 +3407,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
          * the fair class we can call that function directly:
          */
         if (likely(rq->nr_running == rq->cfs.nr_running)) {
-               p = fair_sched_class.pick_next_task(rq, now);
+               p = fair_sched_class.pick_next_task(rq);
                 if (likely(p))
                         return p;
         }
  
         class = sched_class_highest;
         for ( ; ; ) {
-               p = class->pick_next_task(rq, now);
+               p = class->pick_next_task(rq);
                 if (p)
                         return p;
                 /*
@@ -3374,13 +3451,15 @@ need_resched_nonpreemptible:
  
         spin_lock_irq(&rq->lock);
         clear_tsk_need_resched(prev);
+       __update_rq_clock(rq);
+       now = rq->clock;
  
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
                                 unlikely(signal_pending(prev)))) {
                         prev->state = TASK_RUNNING;
                 } else {
-                       deactivate_task(rq, prev, 1);
+                       deactivate_task(rq, prev, 1, now);
                 }
                 switch_count = &prev->nvcsw;
         }
@@ -3388,9 +3467,8 @@ need_resched_nonpreemptible:
         if (unlikely(!rq->nr_running))
                 idle_balance(cpu, rq);
  
-       now = __rq_clock(rq);
-       prev->sched_class->put_prev_task(rq, prev, now);
-       next = pick_next_task(rq, prev, now);
+       prev->sched_class->put_prev_task(rq, prev);
+       next = pick_next_task(rq, prev);
  
         sched_info_switch(prev, next);
  
@@ -3838,7 +3916,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         BUG_ON(prio < 0 || prio > MAX_PRIO);
  
         rq = task_rq_lock(p, &flags);
-       now = rq_clock(rq);
+       update_rq_clock(rq);
+       now = rq->clock;
  
         oldprio = p->prio;
         on_rq = p->se.on_rq;
@@ -3853,7 +3932,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         p->prio = prio;
  
         if (on_rq) {
-               enqueue_task(rq, p, 0, now);
+               enqueue_task(rq, p, 0);
                 /*
                  * Reschedule if we are currently running on this runqueue and
                  * our priority decreased, or if we are not currently running on
@@ -3885,7 +3964,8 @@ void set_user_nice(struct task_struct *p, long nice)
          * the task might be in the middle of scheduling on another CPU.
          */
         rq = task_rq_lock(p, &flags);
-       now = rq_clock(rq);
+       update_rq_clock(rq);
+       now = rq->clock;
         /*
          * The RT priorities are set via sched_setscheduler(), but we still
          * allow the 'normal' nice value to be set - but as expected
@@ -3899,7 +3979,7 @@ void set_user_nice(struct task_struct *p, long nice)
         on_rq = p->se.on_rq;
         if (on_rq) {
                 dequeue_task(rq, p, 0, now);
-               dec_load(rq, p, now);
+               dec_load(rq, p);
         }
  
         p->static_prio = NICE_TO_PRIO(nice);
@@ -3909,8 +3989,8 @@ void set_user_nice(struct task_struct *p, long nice)
         delta = p->prio - old_prio;
  
         if (on_rq) {
-               enqueue_task(rq, p, 0, now);
-               inc_load(rq, p, now);
+               enqueue_task(rq, p, 0);
+               inc_load(rq, p);
                 /*
                  * If the task increased its priority or is running and
                  * lowered its priority, then reschedule its CPU:
@@ -4147,8 +4227,10 @@ recheck:
                 goto recheck;
         }
         on_rq = p->se.on_rq;
-       if (on_rq)
-               deactivate_task(rq, p, 0);
+       if (on_rq) {
+               update_rq_clock(rq);
+               deactivate_task(rq, p, 0, rq->clock);
+       }
         oldprio = p->prio;
         __setscheduler(rq, p, policy, param->sched_priority);
         if (on_rq) {
@@ -4401,10 +4483,8 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
  out_unlock:
         read_unlock(&tasklist_lock);
         mutex_unlock(&sched_hotcpu_mutex);
-       if (retval)
-               return retval;
  
-       return 0;
+       return retval;
  }
  
  /**
@@ -4902,8 +4982,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
                 goto out;
  
         on_rq = p->se.on_rq;
-       if (on_rq)
-               deactivate_task(rq_src, p, 0);
+       if (on_rq) {
+               update_rq_clock(rq_src);
+               deactivate_task(rq_src, p, 0, rq_src->clock);
+       }
         set_task_cpu(p, dest_cpu);
         if (on_rq) {
                 activate_task(rq_dest, p, 0);
@@ -5136,14 +5218,136 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
         for ( ; ; ) {
                 if (!rq->nr_running)
                         break;
-               next = pick_next_task(rq, rq->curr, rq_clock(rq));
+               update_rq_clock(rq);
+               next = pick_next_task(rq, rq->curr);
                 if (!next)
                         break;
                 migrate_dead(dead_cpu, next);
+
         }
  }
  #endif /* CONFIG_HOTPLUG_CPU */
  
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+/*
+ * "sched_domain" directory node.  Its .child is not set statically; it
+ * is assigned at runtime by init_sched_domain_sysctl().  The all-zero
+ * entry ends the table.
+ */
+static struct ctl_table sd_ctl_dir[] = {
+       {
+               .mode           = 0755,
+               .procname       = "sched_domain",
+       },
+       { 0 },
+};
+
+/*
+ * Root table passed to register_sysctl_table(): a "kernel" directory
+ * whose child table is sd_ctl_dir.  The all-zero entry ends the table.
+ */
+static struct ctl_table sd_ctl_root[] = {
+       {
+               .child          = sd_ctl_dir,
+               .mode           = 0755,
+               .procname       = "kernel",
+       },
+       { 0 },
+};
+
+/*
+ * Allocate a zero-filled array of @n ctl_table entries.
+ *
+ * kcalloc() zeroes the memory and fails cleanly if n * sizeof(entry)
+ * would overflow, instead of silently allocating a too-small buffer.
+ * A failed allocation trips BUG_ON(), so callers do not check for NULL.
+ */
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+       struct ctl_table *entry =
+               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+       BUG_ON(!entry);
+
+       return entry;
+}
+
+/*
+ * Populate the procname, data, maxlen, mode and proc_handler fields of
+ * @entry from the arguments.  No other field of @entry is touched.
+ */
+static void set_table_entry(struct ctl_table *entry, const char *procname,
+                           void *data, int maxlen, mode_t mode,
+                           proc_handler *proc_handler)
+{
+       entry->proc_handler = proc_handler;
+       entry->mode = mode;
+       entry->maxlen = maxlen;
+       entry->data = data;
+       entry->procname = procname;
+}
+
+/*
+ * Build the sysctl table exposing the tunable fields of one
+ * sched_domain.
+ *
+ * The entries must be contiguous: an all-zero slot ends a ctl_table, so
+ * a gap in the indices would hide every entry placed after it.  Eleven
+ * entries (0..10) are filled in; the twelfth is left as zeroed by
+ * sd_alloc_ctl_entry() and serves as the terminator.
+ *
+ * Returns the newly allocated table.
+ */
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+       struct ctl_table *table = sd_alloc_ctl_entry(12);
+
+       set_table_entry(&table[0], "min_interval", &sd->min_interval,
+               sizeof(long), 0644, proc_doulongvec_minmax);
+       set_table_entry(&table[1], "max_interval", &sd->max_interval,
+               sizeof(long), 0644, proc_doulongvec_minmax);
+       set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[9], "cache_nice_tries",
+               &sd->cache_nice_tries,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[10], "flags", &sd->flags,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       /* &table[11] stays zeroed and terminates the table */
+
+       return table;
+}
+
+/*
+ * Build the directory table for one CPU: a "domainN" entry (N counting
+ * up from 0) for each sched_domain that for_each_domain() yields for
+ * @cpu, each with the domain's tunables table as its child.  One extra
+ * zeroed slot is allocated after the last entry.
+ *
+ * NOTE(review): the kstrdup() result is not checked here.
+ */
+static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+       struct sched_domain *sd;
+       struct ctl_table *table;
+       char name[32];
+       int nr_domains = 0;
+       int idx = 0;
+
+       /* First pass: count the domains so the table can be sized. */
+       for_each_domain(cpu, sd)
+               nr_domains++;
+
+       table = sd_alloc_ctl_entry(nr_domains + 1);
+
+       /* Second pass: fill one directory slot per domain. */
+       for_each_domain(cpu, sd) {
+               struct ctl_table *slot = &table[idx];
+
+               snprintf(name, sizeof(name), "domain%d", idx);
+               slot->procname = kstrdup(name, GFP_KERNEL);
+               slot->mode = 0755;
+               slot->child = sd_alloc_ctl_domain_table(sd);
+               idx++;
+       }
+       return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+
+/*
+ * Create a "cpuN" directory under kernel/sched_domain for every online
+ * CPU, attach each CPU's domain table to it, and register the result.
+ *
+ * The table is sized by num_online_cpus(), but entries are named after
+ * and built for the real online CPU ids.  Those ids need not be the
+ * contiguous range 0..num_online_cpus()-1, so a plain counted loop
+ * would describe the wrong CPUs on a sparse online map.
+ *
+ * The handle returned by register_sysctl_table() is kept in
+ * sd_sysctl_header.
+ */
+static void init_sched_domain_sysctl(void)
+{
+       int cpu, cpu_num = num_online_cpus();
+       struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+       char buf[32];
+
+       sd_ctl_dir[0].child = entry;
+
+       for_each_online_cpu(cpu) {
+               snprintf(buf, sizeof(buf), "cpu%d", cpu);
+               entry->procname = kstrdup(buf, GFP_KERNEL);
+               entry->mode = 0755;
+               entry->child = sd_alloc_ctl_cpu_table(cpu);
+               entry++;
+       }
+       sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+#else
+/* No-op stub used unless both CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are set. */
+static void init_sched_domain_sysctl(void)
+{
+}
+#endif
+
  /*
   * migration_call - callback that gets triggered when a CPU is added.
   * Here we can start up the necessary migration thread for the new CPU.
@@ -5200,7 +5404,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 rq->migration_thread = NULL;
                 /* Idle task back to normal (off runqueue, low prio) */
                 rq = task_rq_lock(rq->idle, &flags);
-               deactivate_task(rq, rq->idle, 0);
+               update_rq_clock(rq);
+               deactivate_task(rq, rq->idle, 0, rq->clock);
                 rq->idle->static_prio = MAX_PRIO;
                 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
                 rq->idle->sched_class = &idle_sched_class;
@@ -6249,6 +6454,8 @@ void __init sched_init_smp(void)
         /* XXX: Theoretical race here - CPU may be hotplugged now */
         hotcpu_notifier(update_sched_domains, 0);
  
+       init_sched_domain_sysctl();
+
         /* Move init over to a non-isolated CPU */
         if (set_cpus_allowed(current, non_isolated_cpus) < 0)
                 BUG();
@@ -6335,6 +6542,10 @@ void __init sched_init(void)
  
         set_load_weight(&init_task);
  
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+       INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
  #ifdef CONFIG_SMP
         nr_cpu_ids = highest_cpu + 1;
         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
@@ -6400,12 +6611,14 @@ void normalize_rt_tasks(void)
         do_each_thread(g, p) {
                 p->se.fair_key                  = 0;
                 p->se.wait_runtime              = 0;
+               p->se.exec_start                = 0;
                 p->se.wait_start_fair           = 0;
+               p->se.sleep_start_fair          = 0;
+#ifdef CONFIG_SCHEDSTATS
                 p->se.wait_start                = 0;
-               p->se.exec_start                = 0;
                 p->se.sleep_start               = 0;
-               p->se.sleep_start_fair          = 0;
                 p->se.block_start               = 0;
+#endif
                 task_rq(p)->cfs.fair_clock      = 0;
                 task_rq(p)->clock               = 0;
  
@@ -6430,8 +6643,10 @@ void normalize_rt_tasks(void)
  #endif
  
                 on_rq = p->se.on_rq;
-               if (on_rq)
-                       deactivate_task(task_rq(p), p, 0);
+               if (on_rq) {
+                       update_rq_clock(task_rq(p));
+                       deactivate_task(task_rq(p), p, 0, task_rq(p)->clock);
+               }
                 __setscheduler(rq, p, SCHED_NORMAL, 0);
                 if (on_rq) {
                         activate_task(task_rq(p), p, 0);