sched/fair: Get rid of scaling utilization by capacity_orig
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 47ece22..047fd1c 100644
@@ -2515,6 +2515,8 @@ static u32 __compute_runnable_contrib(u64 n)
        return contrib + runnable_avg_yN_sum[n];
 }
 
+#define scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
 /*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series.  To do this we sub-divide our runnable
@@ -2547,10 +2549,11 @@ static __always_inline int
 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-       u64 delta, periods;
+       u64 delta, scaled_delta, periods;
        u32 contrib;
-       int delta_w, decayed = 0;
+       int delta_w, scaled_delta_w, decayed = 0;
        unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
+       unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 
        delta = now - sa->last_update_time;
        /*
@@ -2585,13 +2588,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                 * period and accrue it.
                 */
                delta_w = 1024 - delta_w;
+               scaled_delta_w = scale(delta_w, scale_freq);
                if (weight) {
-                       sa->load_sum += weight * delta_w;
-                       if (cfs_rq)
-                               cfs_rq->runnable_load_sum += weight * delta_w;
+                       sa->load_sum += weight * scaled_delta_w;
+                       if (cfs_rq) {
+                               cfs_rq->runnable_load_sum +=
+                                               weight * scaled_delta_w;
+                       }
                }
                if (running)
-                       sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;
+                       sa->util_sum += scale(scaled_delta_w, scale_cpu);
 
                delta -= delta_w;
 
@@ -2608,23 +2614,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 
                /* Efficiently calculate \sum (1..n_period) 1024*y^i */
                contrib = __compute_runnable_contrib(periods);
+               contrib = scale(contrib, scale_freq);
                if (weight) {
                        sa->load_sum += weight * contrib;
                        if (cfs_rq)
                                cfs_rq->runnable_load_sum += weight * contrib;
                }
                if (running)
-                       sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT;
+                       sa->util_sum += scale(contrib, scale_cpu);
        }
 
        /* Remainder of delta accrued against u_0` */
+       scaled_delta = scale(delta, scale_freq);
        if (weight) {
-               sa->load_sum += weight * delta;
+               sa->load_sum += weight * scaled_delta;
                if (cfs_rq)
-                       cfs_rq->runnable_load_sum += weight * delta;
+                       cfs_rq->runnable_load_sum += weight * scaled_delta;
        }
        if (running)
-               sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT;
+               sa->util_sum += scale(scaled_delta, scale_cpu);
 
        sa->period_contrib += delta;
 
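
Note: the fixed-point scaling above can be sanity-checked in isolation. The snippet below is a standalone userspace sketch, not kernel code; it only reuses the scale() helper and assumes SCHED_CAPACITY_SHIFT == 10 with made-up frequency/CPU capacity values. Load contributions are scaled once by the current frequency capacity, while utilization contributions are additionally scaled by the CPU's compute capacity.

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10

/* Same fixed-point helper the patch introduces for __update_load_avg(). */
#define scale(v, s)	((v)*(s) >> SCHED_CAPACITY_SHIFT)

int main(void)
{
	unsigned long delta      = 1024;	/* one full PELT segment */
	unsigned long scale_freq = 512;		/* running at ~50% of max frequency */
	unsigned long scale_cpu  = 768;		/* e.g. a little core at ~75% capacity */

	unsigned long scaled_delta = scale(delta, scale_freq);

	/* load_sum/runnable_load_sum grow by weight * scaled_delta (freq-scaled only). */
	printf("freq-scaled delta: %lu\n", scaled_delta);			/* 512 */

	/* util_sum additionally takes the CPU capacity into account. */
	printf("util contribution: %lu\n", scale(scaled_delta, scale_cpu));	/* 384 */

	return 0;
}
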
@@ -4854,32 +4862,39 @@ next:
 done:
        return target;
 }
+
 /*
- * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
  * tasks. The unit of the return value must be the one of capacity so we can
- * compare the usage with the capacity of the CPU that is available for CFS
- * task (ie cpu_capacity).
- * cfs.avg.util_avg is the sum of running time of runnable tasks on a
- * CPU. It represents the amount of utilization of a CPU in the range
- * [0..SCHED_LOAD_SCALE].  The usage of a CPU can't be higher than the full
- * capacity of the CPU because it's about the running time on this CPU.
- * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE
- * because of unfortunate rounding in util_avg or just
- * after migrating tasks until the average stabilizes with the new running
- * time. So we need to check that the usage stays into the range
- * [0..cpu_capacity_orig] and cap if necessary.
- * Without capping the usage, a group could be seen as overloaded (CPU0 usage
- * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS tasks (i.e. cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_cpu_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it is useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
  */
-static int get_cpu_usage(int cpu)
+static int cpu_util(int cpu)
 {
-       unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg;
+       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
        unsigned long capacity = capacity_orig_of(cpu);
 
-       if (usage >= SCHED_LOAD_SCALE)
-               return capacity;
-
-       return (usage * capacity) >> SCHED_LOAD_SHIFT;
+       return (util >= capacity) ? capacity : util;
 }
 
 /*
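
To make the change of unit concrete, here is a hedged userspace sketch contrasting the removed get_cpu_usage() rescaling with the new cpu_util() clamp. The helper signatures, the capacity of 768 and the utilization figures are illustrative only; SCHED_LOAD_SHIFT is assumed to be 10, as in the pre-patch code.

#include <stdio.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)

/* Old behaviour: util_avg was in [0..SCHED_LOAD_SCALE] and had to be rescaled. */
static unsigned long old_get_cpu_usage(unsigned long util, unsigned long capacity_orig)
{
	if (util >= SCHED_LOAD_SCALE)
		return capacity_orig;

	return (util * capacity_orig) >> SCHED_LOAD_SHIFT;
}

/* New behaviour: util_avg is already in capacity units, so it is only clamped. */
static unsigned long new_cpu_util(unsigned long util, unsigned long capacity_orig)
{
	return (util >= capacity_orig) ? capacity_orig : util;
}

int main(void)
{
	unsigned long capacity_orig = 768;	/* e.g. a little core */

	/* ~60% busy: the old code rescales, the new code reports the value as-is. */
	printf("old: %lu\n", old_get_cpu_usage(614, capacity_orig));	/* 460 */
	printf("new: %lu\n", new_cpu_util(460, capacity_orig));		/* 460 */

	/* Post-migration overshoot is capped at capacity_orig. */
	printf("capped: %lu\n", new_cpu_util(900, capacity_orig));	/* 768 */

	return 0;
}
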
@@ -5971,7 +5986,7 @@ struct sg_lb_stats {
        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
        unsigned long load_per_task;
        unsigned long group_capacity;
-       unsigned long group_usage; /* Total usage of the group */
+       unsigned long group_util; /* Total utilization of the group */
        unsigned int sum_nr_running; /* Nr tasks running in the group */
        unsigned int idle_cpus;
        unsigned int group_weight;
@@ -6047,19 +6062,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
        return load_idx;
 }
 
-static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
-       if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
-               return sd->smt_gain / sd->span_weight;
-
-       return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
-       return default_scale_cpu_capacity(sd, cpu);
-}
-
 static unsigned long scale_rt_capacity(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
@@ -6089,16 +6091,9 @@ static unsigned long scale_rt_capacity(int cpu)
 
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-       unsigned long capacity = SCHED_CAPACITY_SCALE;
+       unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
        struct sched_group *sdg = sd->groups;
 
-       if (sched_feat(ARCH_CAPACITY))
-               capacity *= arch_scale_cpu_capacity(sd, cpu);
-       else
-               capacity *= default_scale_cpu_capacity(sd, cpu);
-
-       capacity >>= SCHED_CAPACITY_SHIFT;
-
        cpu_rq(cpu)->cpu_capacity_orig = capacity;
 
        capacity *= scale_rt_capacity(cpu);
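
Sketching the simplified capacity computation with assumed numbers (the values stand in for what arch_scale_cpu_capacity() and scale_rt_capacity() might return on an asymmetric system; this is not kernel code):

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10

int main(void)
{
	/* Assumed return of arch_scale_cpu_capacity() for a little core. */
	unsigned long capacity_orig = 768;

	/* Assumed return of scale_rt_capacity(): ~12.5% stolen by RT/IRQ. */
	unsigned long rt_free = 896;		/* 1024 - 128 */

	unsigned long capacity = capacity_orig * rt_free >> SCHED_CAPACITY_SHIFT;

	printf("cpu_capacity_orig = %lu\n", capacity_orig);	/* 768 */
	printf("cpu_capacity      = %lu\n", capacity);		/* 672 */

	return 0;
}
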
@@ -6224,8 +6219,8 @@ static inline int sg_imbalanced(struct sched_group *group)
  * group_has_capacity returns true if the group has spare capacity that could
  * be used by some tasks.
  * We consider that a group has spare capacity if the number of tasks is
- * smaller than the number of CPUs or if the usage is lower than the available
- * capacity for CFS tasks.
+ * smaller than the number of CPUs or if the utilization is lower than the
+ * available capacity for CFS tasks.
  * For the latter, we use a threshold to stabilize the state, to take into
  * account the variance of the tasks' load and to return true if the available
  * capacity is meaningful for the load balancer.
@@ -6239,7 +6234,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
                return true;
 
        if ((sgs->group_capacity * 100) >
-                       (sgs->group_usage * env->sd->imbalance_pct))
+                       (sgs->group_util * env->sd->imbalance_pct))
                return true;
 
        return false;
@@ -6260,7 +6255,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
                return false;
 
        if ((sgs->group_capacity * 100) <
-                       (sgs->group_usage * env->sd->imbalance_pct))
+                       (sgs->group_util * env->sd->imbalance_pct))
                return true;
 
        return false;
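
A worked example of the two imbalance_pct tests, reduced to their utilization branch (the nr_running shortcuts are omitted). This is an illustrative sketch: 125 is only a common default for sd->imbalance_pct, and the capacity/utilization numbers are made up.

#include <stdbool.h>
#include <stdio.h>

/* Utilization branch of group_has_capacity(). */
static bool has_spare_capacity(unsigned long capacity, unsigned long util,
			       unsigned int imbalance_pct)
{
	return capacity * 100 > util * imbalance_pct;
}

/* Utilization branch of group_is_overloaded(). */
static bool is_overloaded(unsigned long capacity, unsigned long util,
			  unsigned int imbalance_pct)
{
	return capacity * 100 < util * imbalance_pct;
}

int main(void)
{
	unsigned long capacity = 1024;

	/* util = 850: 102400 < 106250, so the group counts as overloaded. */
	printf("util=850: spare=%d overloaded=%d\n",
	       has_spare_capacity(capacity, 850, 125),
	       is_overloaded(capacity, 850, 125));

	/* util = 800: 102400 > 100000, so there is still usable headroom. */
	printf("util=800: spare=%d overloaded=%d\n",
	       has_spare_capacity(capacity, 800, 125),
	       is_overloaded(capacity, 800, 125));

	return 0;
}
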
@@ -6308,7 +6303,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                        load = source_load(i, load_idx);
 
                sgs->group_load += load;
-               sgs->group_usage += get_cpu_usage(i);
+               sgs->group_util += cpu_util(i);
                sgs->sum_nr_running += rq->cfs.h_nr_running;
 
                if (rq->nr_running > 1)