sched/fair: Fix fairness issue on migration

[cascardo/linux.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index ecd81c4..d28d89d 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -719,7 +719,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
  {
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         struct sched_avg *sa = &se->avg;
-       long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2;
+       long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
  
         if (cap > 0) {
                 if (cfs_rq->avg.util_avg != 0) {
@@ -2602,6 +2602,16 @@ static const u32 runnable_avg_yN_sum[] = {
         17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
  };
  
+/*
+ * Precomputed \Sum y^k { 1<=k<=n, where n%32=0 }. Values are rounded down to
+ * lower integers. See Documentation/scheduler/sched-avg.txt for how these
+ * were generated:
+ */
+static const u32 __accumulated_sum_N32[] = {
+           0, 23371, 35056, 40899, 43820, 45281,
+       46011, 46376, 46559, 46650, 46696, 46719,
+};
+
  /*
   * Approximate:
   *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
@@ -2650,22 +2660,13 @@ static u32 __compute_runnable_contrib(u64 n)
         else if (unlikely(n >= LOAD_AVG_MAX_N))
                 return LOAD_AVG_MAX;
  
-       /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
-       do {
-               contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
-               contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
-
-               n -= LOAD_AVG_PERIOD;
-       } while (n > LOAD_AVG_PERIOD);
-
+       /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
+       contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
+       n %= LOAD_AVG_PERIOD;
         contrib = decay_load(contrib, n);
         return contrib + runnable_avg_yN_sum[n];
  }
  
-#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
-#error "load tracking assumes 2^10 as unit"
-#endif
-
  #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
  
  /*
@@ -3098,7 +3099,14 @@ static int idle_balance(struct rq *this_rq);
  
  #else /* CONFIG_SMP */
  
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline void update_load_avg(struct sched_entity *se, int not_used)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       struct rq *rq = rq_of(cfs_rq);
+
+       cpufreq_trigger_update(rq_clock(rq));
+}
+
  static inline void
  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
  static inline void
@@ -3246,10 +3254,41 @@ static inline void check_schedstat_required(void)
  #endif
  }
  
+
+/*
+ * MIGRATION
+ *
+ *     dequeue
+ *       update_curr()
+ *         update_min_vruntime()
+ *       vruntime -= min_vruntime
+ *
+ *     enqueue
+ *       update_curr()
+ *         update_min_vruntime()
+ *       vruntime += min_vruntime
+ *
+ * this way the vruntime transition between RQs is done when both
+ * min_vruntime are up-to-date.
+ *
+ * WAKEUP (remote)
+ *
+ *     ->migrate_task_rq_fair() (p->state == TASK_WAKING)
+ *       vruntime -= min_vruntime
+ *
+ *     enqueue
+ *       update_curr()
+ *         update_min_vruntime()
+ *       vruntime += min_vruntime
+ *
+ * this way we don't have the most up-to-date min_vruntime on the originating
+ * CPU, but we do have an up-to-date min_vruntime on the destination CPU.
+ */
+
  static void
  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
-       bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
+       bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
         bool curr = cfs_rq->curr == se;
  
         /*
@@ -3263,7 +3302,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  
         /*
          * Otherwise, renormalise after, such that we're placed at the current
-        * moment in time, instead of some random moment in the past.
+        * moment in time, instead of some random moment in the past. Being
+        * placed in the past could significantly boost this task to the
+        * fairness detriment of existing tasks.
          */
         if (renorm && !curr)
                 se->vruntime += cfs_rq->min_vruntime;
@@ -4491,7 +4532,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  }
  
  #ifdef CONFIG_SMP
-
+#ifdef CONFIG_NO_HZ_COMMON
  /*
   * per rq 'load' arrray crap; XXX kill this.
   */
@@ -4557,13 +4598,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
         }
         return load;
  }
+#endif /* CONFIG_NO_HZ_COMMON */
  
  /**
   * __cpu_load_update - update the rq->cpu_load[] statistics
   * @this_rq: The rq to update statistics for
   * @this_load: The current load
   * @pending_updates: The number of missed updates
- * @active: !0 for NOHZ_FULL
   *
   * Update rq->cpu_load[] statistics. This function is usually called every
   * scheduler tick (TICK_NSEC).
@@ -4592,12 +4633,12 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
   *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
   *
   * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term. See the @active paramter.
+ * term.
   */
-static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
-                             unsigned long pending_updates, int active)
+static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
+                           unsigned long pending_updates)
  {
-       unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
+       unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
         int i, scale;
  
         this_rq->nr_load_updates++;
@@ -4610,6 +4651,7 @@ static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
                 /* scale is effectively 1 << i now, and >> i divides by scale */
  
                 old_load = this_rq->cpu_load[i];
+#ifdef CONFIG_NO_HZ_COMMON
                 old_load = decay_load_missed(old_load, pending_updates - 1, i);
                 if (tickless_load) {
                         old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
@@ -4620,6 +4662,7 @@ static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
                          */
                         old_load += tickless_load;
                 }
+#endif
                 new_load = this_load;
                 /*
                  * Round up the averaging division if load is increasing. This
@@ -4642,10 +4685,23 @@ static unsigned long weighted_cpuload(const int cpu)
  }
  
  #ifdef CONFIG_NO_HZ_COMMON
-static void __cpu_load_update_nohz(struct rq *this_rq,
-                                  unsigned long curr_jiffies,
-                                  unsigned long load,
-                                  int active)
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we need to avoid the delta approach from the regular tick when
+ * possible since that would seriously skew the load calculation. This is why we
+ * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
+ * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
+ * loop exit, nohz_idle_balance, nohz full exit...)
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+static void cpu_load_update_nohz(struct rq *this_rq,
+                                unsigned long curr_jiffies,
+                                unsigned long load)
  {
         unsigned long pending_updates;
  
@@ -4657,23 +4713,10 @@ static void __cpu_load_update_nohz(struct rq *this_rq,
                  * In the NOHZ_FULL case, we were non-idle, we should consider
                  * its weighted load.
                  */
-               __cpu_load_update(this_rq, load, pending_updates, active);
+               cpu_load_update(this_rq, load, pending_updates);
         }
  }
  
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
  /*
   * Called from nohz_idle_balance() to update the load ratings before doing the
   * idle balance.
@@ -4686,26 +4729,59 @@ static void cpu_load_update_idle(struct rq *this_rq)
         if (weighted_cpuload(cpu_of(this_rq)))
                 return;
  
-       __cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
+       cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
  }
  
  /*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ * Record CPU load on nohz entry so we know the tickless load to account
+ * on nohz exit. cpu_load[0] happens then to be updated more frequently
+ * than other cpu_load[idx] but it should be fine as cpu_load readers
+ * shouldn't rely on synchronized cpu_load[*] updates.
   */
-void cpu_load_update_nohz(int active)
+void cpu_load_update_nohz_start(void)
  {
         struct rq *this_rq = this_rq();
+
+       /*
+        * This is all lockless but should be fine. If weighted_cpuload changes
+        * concurrently we'll exit nohz. And cpu_load write can race with
+        * cpu_load_update_idle() but both updaters would be writing the same value.
+        */
+       this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+}
+
+/*
+ * Account the tickless load at the end of a nohz frame.
+ */
+void cpu_load_update_nohz_stop(void)
+{
         unsigned long curr_jiffies = READ_ONCE(jiffies);
-       unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
+       struct rq *this_rq = this_rq();
+       unsigned long load;
  
         if (curr_jiffies == this_rq->last_load_update_tick)
                 return;
  
+       load = weighted_cpuload(cpu_of(this_rq));
         raw_spin_lock(&this_rq->lock);
-       __cpu_load_update_nohz(this_rq, curr_jiffies, load, active);
+       update_rq_clock(this_rq);
+       cpu_load_update_nohz(this_rq, curr_jiffies, load);
         raw_spin_unlock(&this_rq->lock);
  }
-#endif /* CONFIG_NO_HZ */
+#else /* !CONFIG_NO_HZ_COMMON */
+static inline void cpu_load_update_nohz(struct rq *this_rq,
+                                       unsigned long curr_jiffies,
+                                       unsigned long load) { }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+       /* See the mess around cpu_load_update_nohz(). */
+       this_rq->last_load_update_tick = READ_ONCE(jiffies);
+#endif
+       cpu_load_update(this_rq, load, 1);
+}
  
  /*
   * Called from scheduler_tick()
@@ -4713,11 +4789,11 @@ void cpu_load_update_nohz(int active)
  void cpu_load_update_active(struct rq *this_rq)
  {
         unsigned long load = weighted_cpuload(cpu_of(this_rq));
-       /*
-        * See the mess around cpu_load_update_idle() / cpu_load_update_nohz().
-        */
-       this_rq->last_load_update_tick = jiffies;
-       __cpu_load_update(this_rq, load, 1, 1);
+
+       if (tick_nohz_tick_stopped())
+               cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
+       else
+               cpu_load_update_periodic(this_rq, load);
  }
  
  /*
@@ -4775,46 +4851,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
         return 0;
  }
  
-static void record_wakee(struct task_struct *p)
-{
-       /*
-        * Rough decay (wiping) for cost saving, don't worry
-        * about the boundary, really active task won't care
-        * about the loss.
-        */
-       if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
-               current->wakee_flips >>= 1;
-               current->wakee_flip_decay_ts = jiffies;
-       }
-
-       if (current->last_wakee != p) {
-               current->last_wakee = p;
-               current->wakee_flips++;
-       }
-}
-
-static void task_waking_fair(struct task_struct *p)
-{
-       struct sched_entity *se = &p->se;
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
-       u64 min_vruntime_copy;
-
-       do {
-               min_vruntime_copy = cfs_rq->min_vruntime_copy;
-               smp_rmb();
-               min_vruntime = cfs_rq->min_vruntime;
-       } while (min_vruntime != min_vruntime_copy);
-#else
-       min_vruntime = cfs_rq->min_vruntime;
-#endif
-
-       se->vruntime -= min_vruntime;
-       record_wakee(p);
-}
-
  #ifdef CONFIG_FAIR_GROUP_SCHED
  /*
   * effective_load() calculates the load change as seen from the root_task_group
@@ -4930,17 +4966,39 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
  
  #endif
  
+static void record_wakee(struct task_struct *p)
+{
+       /*
+        * Only decay a single time; tasks that have less than 1 wakeup per
+        * jiffy will not have built up many flips.
+        */
+       if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
+               current->wakee_flips >>= 1;
+               current->wakee_flip_decay_ts = jiffies;
+       }
+
+       if (current->last_wakee != p) {
+               current->last_wakee = p;
+               current->wakee_flips++;
+       }
+}
+
  /*
   * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ *
   * A waker of many should wake a different task than the one last awakened
- * at a frequency roughly N times higher than one of its wakees.  In order
- * to determine whether we should let the load spread vs consolodating to
- * shared cache, we look for a minimum 'flip' frequency of llc_size in one
- * partner, and a factor of lls_size higher frequency in the other.  With
- * both conditions met, we can be relatively sure that the relationship is
- * non-monogamous, with partner count exceeding socket size.  Waker/wakee
- * being client/server, worker/dispatcher, interrupt source or whatever is
- * irrelevant, spread criteria is apparent partner count exceeds socket size.
+ * at a frequency roughly N times higher than one of its wakees.
+ *
+ * In order to determine whether we should let the load spread vs consolidating
+ * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of llc_size higher frequency in the other.
+ *
+ * With both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size.
+ *
+ * Waker/wakee being client/server, worker/dispatcher, interrupt source or
+ * whatever is irrelevant, spread criteria is apparent partner count exceeds
+ * socket size.
   */
  static int wake_wide(struct task_struct *p)
  {
@@ -5245,8 +5303,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
         int want_affine = 0;
         int sync = wake_flags & WF_SYNC;
  
-       if (sd_flag & SD_BALANCE_WAKE)
+       if (sd_flag & SD_BALANCE_WAKE) {
+               record_wakee(p);
                 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+       }
  
         rcu_read_lock();
         for_each_domain(cpu, tmp) {
@@ -5325,6 +5385,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
   */
  static void migrate_task_rq_fair(struct task_struct *p)
  {
+       /*
+        * As blocked tasks retain absolute vruntime the migration needs to
+        * deal with this by subtracting the old and adding the new
+        * min_vruntime -- the latter is done by enqueue_entity() when placing
+        * the task on the new runqueue.
+        */
+       if (p->state == TASK_WAKING) {
+               struct sched_entity *se = &p->se;
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+               u64 min_vruntime;
+
+#ifndef CONFIG_64BIT
+               u64 min_vruntime_copy;
+
+               do {
+                       min_vruntime_copy = cfs_rq->min_vruntime_copy;
+                       smp_rmb();
+                       min_vruntime = cfs_rq->min_vruntime;
+               } while (min_vruntime != min_vruntime_copy);
+#else
+               min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+               se->vruntime -= min_vruntime;
+       }
+
         /*
          * We are supposed to update the task to "current" time, then its up to date
          * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
@@ -5508,7 +5594,7 @@ preempt:
  }
  
  static struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  {
         struct cfs_rq *cfs_rq = &rq->cfs;
         struct sched_entity *se;
@@ -5621,9 +5707,9 @@ idle:
          * further scheduler activity on it and we're being very careful to
          * re-start the picking loop.
          */
-       lockdep_unpin_lock(&rq->lock);
+       lockdep_unpin_lock(&rq->lock, cookie);
         new_tasks = idle_balance(rq);
-       lockdep_pin_lock(&rq->lock);
+       lockdep_repin_lock(&rq->lock, cookie);
         /*
          * Because idle_balance() releases (and re-acquires) rq->lock, it is
          * possible for any higher priority task to appear. In that case we
@@ -6964,9 +7050,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         }
  
         /*
-        * In the presence of smp nice balancing, certain scenarios can have
-        * max load less than avg load(as we skip the groups at or below
-        * its cpu_capacity, while calculating max_load..)
+        * Avg load of busiest sg can be less and avg load of local sg can
+        * be greater than avg load across all sgs of sd because avg load
+        * factors in sg capacity and sgs with smaller group_type are
+        * skipped when updating the busiest sg:
          */
         if (busiest->avg_load <= sds->avg_load ||
             local->avg_load >= sds->avg_load) {
@@ -6980,7 +7067,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         if (busiest->group_type == group_overloaded &&
             local->group_type   == group_overloaded) {
                 load_above_capacity = busiest->sum_nr_running *
-                                       SCHED_LOAD_SCALE;
+                                     scale_load_down(NICE_0_LOAD);
                 if (load_above_capacity > busiest->group_capacity)
                         load_above_capacity -= busiest->group_capacity;
                 else
@@ -6991,9 +7078,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
          * We're trying to get all the cpus to the average_load, so we don't
          * want to push ourselves above the average load, nor do we wish to
          * reduce the max loaded cpu below the average load. At the same time,
-        * we also don't want to reduce the group load below the group capacity
-        * (so that we can implement power-savings policies etc). Thus we look
-        * for the minimum possible imbalance.
+        * we also don't want to reduce the group load below the group
+        * capacity. Thus we look for the minimum possible imbalance.
          */
         max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
  
@@ -7017,10 +7103,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
  
  /**
   * find_busiest_group - Returns the busiest group within the sched_domain
- * if there is an imbalance. If there isn't an imbalance, and
- * the user has opted for power-savings, it returns a group whose
- * CPUs can be put to idle by rebalancing those tasks elsewhere, if
- * such a group exists.
+ * if there is an imbalance.
   *
   * Also calculates the amount of weighted load which should be moved
   * to restore balance.
@@ -7028,9 +7111,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
   * @env: The load balancing environment.
   *
   * Return:     - The busiest group if imbalance exists.
- *             - If no imbalance and user has opted for power-savings balance,
- *                return the least loaded group whose CPUs can be
- *                put to idle by rebalancing its tasks onto our group.
   */
  static struct sched_group *find_busiest_group(struct lb_env *env)
  {
@@ -7785,7 +7865,7 @@ static void nohz_balancer_kick(void)
         return;
  }
  
-static inline void nohz_balance_exit_idle(int cpu)
+void nohz_balance_exit_idle(unsigned int cpu)
  {
         if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
                 /*
@@ -7858,18 +7938,6 @@ void nohz_balance_enter_idle(int cpu)
         atomic_inc(&nohz.nr_cpus);
         set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
  }
-
-static int sched_ilb_notifier(struct notifier_block *nfb,
-                                       unsigned long action, void *hcpu)
-{
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DYING:
-               nohz_balance_exit_idle(smp_processor_id());
-               return NOTIFY_OK;
-       default:
-               return NOTIFY_DONE;
-       }
-}
  #endif
  
  static DEFINE_SPINLOCK(balancing);
@@ -8613,7 +8681,6 @@ const struct sched_class fair_sched_class = {
         .rq_online              = rq_online_fair,
         .rq_offline             = rq_offline_fair,
  
-       .task_waking            = task_waking_fair,
         .task_dead              = task_dead_fair,
         .set_cpus_allowed       = set_cpus_allowed_common,
  #endif
@@ -8675,7 +8742,6 @@ __init void init_sched_fair_class(void)
  #ifdef CONFIG_NO_HZ_COMMON
         nohz.next_balance = jiffies;
         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
-       cpu_notifier(sched_ilb_notifier, 0);
  #endif
  #endif /* SMP */