return contrib + runnable_avg_yN_sum[n];
}
+#define scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u64 delta, periods;
+ u64 delta, scaled_delta, periods;
u32 contrib;
- int delta_w, decayed = 0;
+ int delta_w, scaled_delta_w, decayed = 0;
unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
delta = now - sa->last_update_time;
/*
* period and accrue it.
*/
delta_w = 1024 - delta_w;
+ scaled_delta_w = scale(delta_w, scale_freq);
if (weight) {
- sa->load_sum += weight * delta_w;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * delta_w;
+ sa->load_sum += weight * scaled_delta_w;
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum +=
+ weight * scaled_delta_w;
+ }
}
if (running)
- sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;
+ sa->util_sum += scale(scaled_delta_w, scale_cpu);
delta -= delta_w;
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
contrib = __compute_runnable_contrib(periods);
+ contrib = scale(contrib, scale_freq);
if (weight) {
sa->load_sum += weight * contrib;
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * contrib;
}
if (running)
- sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT;
+ sa->util_sum += scale(contrib, scale_cpu);
}
/* Remainder of delta accrued against u_0` */
+ scaled_delta = scale(delta, scale_freq);
if (weight) {
- sa->load_sum += weight * delta;
+ sa->load_sum += weight * scaled_delta;
if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * delta;
+ cfs_rq->runnable_load_sum += weight * scaled_delta;
}
if (running)
- sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT;
+ sa->util_sum += scale(scaled_delta, scale_cpu);
sa->period_contrib += delta;
done:
return target;
}
+
/*
- * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
* tasks. The unit of the return value must be the one of capacity so we can
- * compare the usage with the capacity of the CPU that is available for CFS
- * task (ie cpu_capacity).
- * cfs.avg.util_avg is the sum of running time of runnable tasks on a
- * CPU. It represents the amount of utilization of a CPU in the range
- * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
- * capacity of the CPU because it's about the running time on this CPU.
- * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE
- * because of unfortunate rounding in util_avg or just
- * after migrating tasks until the average stabilizes with the new running
- * time. So we need to check that the usage stays into the range
- * [0..cpu_capacity_orig] and cap if necessary.
- * Without capping the usage, a group could be seen as overloaded (CPU0 usage
- * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
*/
-static int get_cpu_usage(int cpu)
+static int cpu_util(int cpu)
{
- unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg;
+ unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu);
- if (usage >= SCHED_LOAD_SCALE)
- return capacity;
-
- return (usage * capacity) >> SCHED_LOAD_SHIFT;
+ return (util >= capacity) ? capacity : util;
}
/*
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity;
- unsigned long group_usage; /* Total usage of the group */
+ unsigned long group_util; /* Total utilization of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
unsigned int idle_cpus;
unsigned int group_weight;
return load_idx;
}
-static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
- if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
- return sd->smt_gain / sd->span_weight;
-
- return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
- return default_scale_cpu_capacity(sd, cpu);
-}
-
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = SCHED_CAPACITY_SCALE;
+ unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
- if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_cpu_capacity(sd, cpu);
- else
- capacity *= default_scale_cpu_capacity(sd, cpu);
-
- capacity >>= SCHED_CAPACITY_SHIFT;
-
cpu_rq(cpu)->cpu_capacity_orig = capacity;
capacity *= scale_rt_capacity(cpu);
* group_has_capacity returns true if the group has spare capacity that could
* be used by some tasks.
* We consider that a group has spare capacity if the * number of task is
- * smaller than the number of CPUs or if the usage is lower than the available
- * capacity for CFS tasks.
+ * smaller than the number of CPUs or if the utilization is lower than the
+ * available capacity for CFS tasks.
* For the latter, we use a threshold to stabilize the state, to take into
* account the variance of the tasks' load and to return true if the available
* capacity in meaningful for the load balancer.
return true;
if ((sgs->group_capacity * 100) >
- (sgs->group_usage * env->sd->imbalance_pct))
+ (sgs->group_util * env->sd->imbalance_pct))
return true;
return false;
return false;
if ((sgs->group_capacity * 100) <
- (sgs->group_usage * env->sd->imbalance_pct))
+ (sgs->group_util * env->sd->imbalance_pct))
return true;
return false;
load = source_load(i, load_idx);
sgs->group_load += load;
- sgs->group_usage += get_cpu_usage(i);
+ sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
if (rq->nr_running > 1)