summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Turquette <mturquette@deferred.io>2014-11-05 06:56:27 -0800
committerMichael Turquette <mturquette@deferred.io>2014-11-05 06:56:27 -0800
commite0496ee20818a396db34d96668025e87fec596fd (patch)
treefbd109391bc8083269da9e5b66fce08dff49e1fb
parentedabeb52dff9edc3f7c493f3cda963b53846042a (diff)
parentc722ab273ac7fb17936549b90728ca922959088f (diff)
Merge branch 'sched-for-mike' of git://git.linaro.org/people/vincent.guittot/kernel into eas-next
-rw-r--r--include/linux/sched.h21
-rw-r--r--kernel/sched/core.c15
-rw-r--r--kernel/sched/debug.c12
-rw-r--r--kernel/sched/fair.c369
-rw-r--r--kernel/sched/sched.h15
5 files changed, 276 insertions, 156 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bd7c14ba86c4..e60a100d8713 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1111,15 +1111,28 @@ struct load_weight {
};
struct sched_avg {
+ u64 last_runnable_update;
+ s64 decay_count;
+ /*
+ * utilization_avg_contrib describes the amount of time that a
+ * sched_entity is running on a CPU. It is based on running_avg_sum
+ * and is scaled in the range [0..SCHED_LOAD_SCALE].
+ * load_avg_contrib described the amount of time that a sched_entity
+ * is runnable on a rq. It is based on both runnable_avg_sum and the
+ * weight of the task.
+ */
+ unsigned long load_avg_contrib, utilization_avg_contrib;
/*
* These sums represent an infinite geometric series and so are bound
* above by 1024/(1-y). Thus we only need a u32 to store them for all
* choices of y < 1-2^(-32)*1024.
+ * running_avg_sum reflects the time that the sched_entity is
+ * effectively running on the CPU.
+ * runnable_avg_sum represents the amount of time a sched_entity is on
+ * a runqueue which includes the running time that is monitored by
+ * running_avg_sum.
*/
- u32 runnable_avg_sum, runnable_avg_period;
- u64 last_runnable_update;
- s64 decay_count;
- unsigned long load_avg_contrib;
+ u32 runnable_avg_sum, avg_period, running_avg_sum;
};
#ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 72d9d926a034..02ec4a5b5fd3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5460,17 +5460,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- /*
- * Even though we initialize ->capacity to something semi-sane,
- * we leave capacity_orig unset. This allows us to detect if
- * domain iteration is still funny without causing /0 traps.
- */
- if (!group->sgc->capacity_orig) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
- break;
- }
-
if (!cpumask_weight(sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
@@ -5955,7 +5944,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->capacity_orig = sg->sgc->capacity;
/*
* Make sure the first group of this domain contains the
@@ -6266,6 +6254,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
*/
if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
@@ -7229,7 +7218,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = SCHED_CAPACITY_SCALE;
+ rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 92cc52001e74..9dce8b5abeea 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
if (!se) {
struct sched_avg *avg = &cpu_rq(cpu)->avg;
P(avg->runnable_avg_sum);
- P(avg->runnable_avg_period);
+ P(avg->avg_period);
return;
}
@@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->load.weight);
#ifdef CONFIG_SMP
P(se->avg.runnable_avg_sum);
- P(se->avg.runnable_avg_period);
+ P(se->avg.running_avg_sum);
+ P(se->avg.avg_period);
P(se->avg.load_avg_contrib);
+ P(se->avg.utilization_avg_contrib);
P(se->avg.decay_count);
#endif
#undef PN
@@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->runnable_load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
cfs_rq->blocked_load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg",
+ cfs_rq->utilization_load_avg);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
cfs_rq->tg_load_contrib);
@@ -635,8 +639,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.load.weight);
#ifdef CONFIG_SMP
P(se.avg.runnable_avg_sum);
- P(se.avg.runnable_avg_period);
+ P(se.avg.running_avg_sum);
+ P(se.avg.avg_period);
P(se.avg.load_avg_contrib);
+ P(se.avg.utilization_avg_contrib);
P(se.avg.decay_count);
#endif
P(policy);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 826fdf326683..81d1d038c9a0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static inline void __update_task_entity_contrib(struct sched_entity *se);
+static inline void __update_task_entity_utilization(struct sched_entity *se);
/* Give new task start runnable values to heavy its load in infant time */
void init_task_runnable_average(struct task_struct *p)
@@ -678,9 +679,10 @@ void init_task_runnable_average(struct task_struct *p)
p->se.avg.decay_count = 0;
slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
- p->se.avg.runnable_avg_sum = slice;
- p->se.avg.runnable_avg_period = slice;
+ p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
+ p->se.avg.avg_period = slice;
__update_task_entity_contrib(&p->se);
+ __update_task_entity_utilization(&p->se);
}
#else
void init_task_runnable_average(struct task_struct *p)
@@ -1663,7 +1665,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
*period = now - p->last_task_numa_placement;
} else {
delta = p->se.avg.runnable_avg_sum;
- *period = p->se.avg.runnable_avg_period;
+ *period = p->se.avg.avg_period;
}
p->last_sum_exec_runtime = runtime;
@@ -2463,6 +2465,8 @@ static u32 __compute_runnable_contrib(u64 n)
return contrib + runnable_avg_yN_sum[n];
}
+unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu);
+
/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
@@ -2491,13 +2495,15 @@ static u32 __compute_runnable_contrib(u64 n)
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
-static __always_inline int __update_entity_runnable_avg(u64 now,
+static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
struct sched_avg *sa,
- int runnable)
+ int runnable,
+ int running)
{
u64 delta, periods;
u32 runnable_contrib;
int delta_w, decayed = 0;
+ unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
delta = now - sa->last_runnable_update;
/*
@@ -2519,7 +2525,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
sa->last_runnable_update = now;
/* delta_w is the amount already accumulated against our next period */
- delta_w = sa->runnable_avg_period % 1024;
+ delta_w = sa->avg_period % 1024;
if (delta + delta_w >= 1024) {
/* period roll-over */
decayed = 1;
@@ -2532,7 +2538,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
delta_w = 1024 - delta_w;
if (runnable)
sa->runnable_avg_sum += delta_w;
- sa->runnable_avg_period += delta_w;
+ if (running)
+ sa->running_avg_sum += delta_w * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += delta_w;
delta -= delta_w;
@@ -2542,20 +2551,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
periods + 1);
- sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ sa->running_avg_sum = decay_load(sa->running_avg_sum,
+ periods + 1);
+ sa->avg_period = decay_load(sa->avg_period,
periods + 1);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
runnable_contrib = __compute_runnable_contrib(periods);
if (runnable)
sa->runnable_avg_sum += runnable_contrib;
- sa->runnable_avg_period += runnable_contrib;
+ if (running)
+ sa->running_avg_sum += runnable_contrib * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += runnable_contrib;
}
/* Remainder of delta accrued against u_0` */
if (runnable)
sa->runnable_avg_sum += delta;
- sa->runnable_avg_period += delta;
+ if (running)
+ sa->running_avg_sum += delta * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += delta;
return decayed;
}
@@ -2571,6 +2588,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
return 0;
se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ se->avg.utilization_avg_contrib =
+ decay_load(se->avg.utilization_avg_contrib, decays);
se->avg.decay_count = 0;
return decays;
@@ -2607,7 +2626,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
/* The fraction of a cpu used by this cfs_rq */
contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
- sa->runnable_avg_period + 1);
+ sa->avg_period + 1);
contrib -= cfs_rq->tg_runnable_contrib;
if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
@@ -2660,7 +2679,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
- __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
+ __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
+ runnable, runnable);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -2678,7 +2698,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
- contrib /= (se->avg.runnable_avg_period + 1);
+ contrib /= (se->avg.avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
}
@@ -2697,6 +2717,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
return se->avg.load_avg_contrib - old_contrib;
}
+
+static inline void __update_task_entity_utilization(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+ contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
+ contrib /= (se->avg.avg_period + 1);
+ se->avg.utilization_avg_contrib = scale_load(contrib);
+}
+
+static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.utilization_avg_contrib;
+
+ if (entity_is_task(se))
+ __update_task_entity_utilization(se);
+ else
+ se->avg.utilization_avg_contrib =
+ group_cfs_rq(se)->utilization_load_avg;
+
+ return se->avg.utilization_avg_contrib - old_contrib;
+}
+
static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
long load_contrib)
{
@@ -2713,7 +2757,8 @@ static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- long contrib_delta;
+ long contrib_delta, utilization_delta;
+ int cpu = cpu_of(rq_of(cfs_rq));
u64 now;
/*
@@ -2725,18 +2770,22 @@ static inline void update_entity_load_avg(struct sched_entity *se,
else
now = cfs_rq_clock_task(group_cfs_rq(se));
- if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+ if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
+ cfs_rq->curr == se))
return;
contrib_delta = __update_entity_load_avg_contrib(se);
+ utilization_delta = __update_entity_utilization_avg_contrib(se);
if (!update_cfs_rq)
return;
- if (se->on_rq)
+ if (se->on_rq) {
cfs_rq->runnable_load_avg += contrib_delta;
- else
+ cfs_rq->utilization_load_avg += utilization_delta;
+ } else {
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+ }
}
/*
@@ -2811,6 +2860,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
}
cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}
@@ -2829,6 +2879,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
update_cfs_rq_blocked_load(cfs_rq, !sleep);
cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
if (sleep) {
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
@@ -3166,6 +3217,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
+ update_entity_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -4288,6 +4340,11 @@ static unsigned long capacity_of(int cpu)
return cpu_rq(cpu)->cpu_capacity;
}
+static unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -4701,6 +4758,33 @@ next:
done:
return target;
}
+/*
+ * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must capacity so we can compare the
+ * usage with the capacity of the CPU that is available for CFS task (ie
+ * cpu_capacity).
+ * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
+ * CPU. It represents the amount of utilization of a CPU in the range
+ * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
+ * capacity of the CPU because it's about the running time on this CPU.
+ * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
+ * because of unfortunate rounding in avg_period and running_load_avg or just
+ * after migrating tasks until the average stabilizes with the new running
+ * time. So we need to check that the usage stays into the range
+ * [0..cpu_capacity_orig] and cap if necessary.
+ * Without capping the usage, a group could be seen as overloaded (CPU0 usage
+ * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity/
+ */
+static int get_cpu_usage(int cpu)
+{
+ unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
+ unsigned long capacity = capacity_orig_of(cpu);
+
+ if (usage >= SCHED_LOAD_SCALE)
+ return capacity;
+
+ return (usage * capacity) >> SCHED_LOAD_SHIFT;
+}
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
@@ -5830,12 +5914,12 @@ struct sg_lb_stats {
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity;
+ unsigned long group_usage; /* Total usage of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
- unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
- int group_has_free_capacity;
+ int group_no_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -5932,7 +6016,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available, age_stamp, avg;
+ u64 total, used, age_stamp, avg;
s64 delta;
/*
@@ -5948,19 +6032,12 @@ static unsigned long scale_rt_capacity(int cpu)
total = sched_avg_period() + delta;
- if (unlikely(total < avg)) {
- /* Ensures that capacity won't end up being negative */
- available = 0;
- } else {
- available = total - avg;
- }
-
- if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
- total = SCHED_CAPACITY_SCALE;
+ used = div_u64(avg, total);
- total >>= SCHED_CAPACITY_SHIFT;
+ if (likely(used < SCHED_CAPACITY_SCALE))
+ return SCHED_CAPACITY_SCALE - used;
- return div_u64(available, total);
+ return 1;
}
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -5975,14 +6052,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
capacity >>= SCHED_CAPACITY_SHIFT;
- sdg->sgc->capacity_orig = capacity;
-
- if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_freq_capacity(sd, cpu);
- else
- capacity *= default_scale_capacity(sd, cpu);
-
- capacity >>= SCHED_CAPACITY_SHIFT;
+ cpu_rq(cpu)->cpu_capacity_orig = capacity;
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
@@ -5998,7 +6068,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, capacity_orig;
+ unsigned long capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -6010,7 +6080,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
return;
}
- capacity_orig = capacity = 0;
+ capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -6030,19 +6100,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
* Use capacity_of(), which is set irrespective of domains
* in update_cpu_capacity().
*
- * This avoids capacity/capacity_orig from being 0 and
+ * This avoids capacity from being 0 and
* causing divide-by-zero issues on boot.
- *
- * Runtime updates will correct capacity_orig.
*/
if (unlikely(!rq->sd)) {
- capacity_orig += capacity_of(cpu);
capacity += capacity_of(cpu);
continue;
}
sgc = rq->sd->groups->sgc;
- capacity_orig += sgc->capacity_orig;
capacity += sgc->capacity;
}
} else {
@@ -6053,39 +6119,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- capacity_orig += group->sgc->capacity_orig;
capacity += group->sgc->capacity;
group = group->next;
} while (group != child->groups);
}
- sdg->sgc->capacity_orig = capacity_orig;
sdg->sgc->capacity = capacity;
}
/*
- * Try and fix up capacity for tiny siblings, this is needed when
- * things like SD_ASYM_PACKING need f_b_g to select another sibling
- * which on its own isn't powerful enough.
- *
- * See update_sd_pick_busiest() and check_asym_packing().
+ * Check whether the capacity of the rq has been noticeably reduced by side
+ * activity. The imbalance_pct is used for the threshold.
+ * Return true is the capacity is reduced
*/
static inline int
-fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
- /*
- * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
- */
- if (!(sd->flags & SD_SHARE_CPUCAPACITY))
- return 0;
-
- /*
- * If ~90% of the cpu_capacity is still there, we're good.
- */
- if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
- return 1;
-
- return 0;
+ return ((rq->cpu_capacity * sd->imbalance_pct) <
+ (rq->cpu_capacity_orig * 100));
}
/*
@@ -6123,37 +6174,54 @@ static inline int sg_imbalanced(struct sched_group *group)
}
/*
- * Compute the group capacity factor.
- *
- * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
- * first dividing out the smt factor and computing the actual number of cores
- * and limit unit capacity with that.
+ * group_has_capacity returns true if the group has spare capacity that could
+ * be used by some tasks. We consider that a group has spare capacity if the
+ * number of task is smaller than the number of CPUs or if the usage is lower
+ * than the available capacity for CFS tasks. For the latter, we use a
+ * threshold to stabilize the state, to take into account the variance of the
+ * tasks' load and to return true if the available capacity in meaningful for
+ * the load balancer. As an example, an available capacity of 1% can appear
+ * but it doesn't make any benefit for the load balance.
*/
-static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
+static inline bool
+group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
{
- unsigned int capacity_factor, smt, cpus;
- unsigned int capacity, capacity_orig;
+ if ((sgs->group_capacity * 100) >
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
- capacity = group->sgc->capacity;
- capacity_orig = group->sgc->capacity_orig;
- cpus = group->group_weight;
+ if (sgs->sum_nr_running < sgs->group_weight)
+ return true;
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
- capacity_factor = cpus / smt; /* cores */
+ return false;
+}
- capacity_factor = min_t(unsigned,
- capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
+/*
+ * group_is_overloaded returns true if the group has more tasks than it can
+ * handle. We consider that a group is overloaded if the number of tasks is
+ * greater than the number of CPUs and the tasks already use all available
+ * capacity for CFS tasks. For the latter, we use a threshold to stabilize
+ * the state, to take into account the variance of tasks' load and to return
+ * true if available capacity is no more meaningful for load balancer
+ */
+static inline bool
+group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running <= sgs->group_weight)
+ return false;
+
+ if ((sgs->group_capacity * 100) <
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
- return capacity_factor;
+ return false;
}
-static enum group_type
-group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+static enum group_type group_classify(struct lb_env *env,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ if (sgs->group_no_capacity)
return group_overloaded;
if (sg_imbalanced(group))
@@ -6191,6 +6259,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
load = source_load(i, load_idx);
sgs->group_load += load;
+ sgs->group_usage += get_cpu_usage(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
if (rq->nr_running > 1)
@@ -6213,11 +6282,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
- sgs->group_capacity_factor = sg_capacity_factor(env, group);
- sgs->group_type = group_classify(group, sgs);
- if (sgs->group_capacity_factor > sgs->sum_nr_running)
- sgs->group_has_free_capacity = 1;
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(env, group, sgs);
}
/**
@@ -6339,17 +6406,20 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity factor to one so that we'll try
+ * first, lower the sg capacity so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
- * these excess tasks, i.e. nr_running < group_capacity_factor. The
- * extra check prevents the case where you always pull from the
- * heaviest group when it is already under-utilized (possible
- * with a large weight task outweighs the tasks on the system).
+ * these excess tasks. The extra check prevents the case where
+ * you always pull from the heaviest group when it is already
+ * under-utilized (possible with a large weight task outweighs
+ * the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_free_capacity)
- sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
+ group_has_capacity(env, &sds->local_stat) &&
+ (sgs->sum_nr_running > 1)) {
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_overloaded;
+ }
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -6528,11 +6598,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
- load_above_capacity =
- (busiest->sum_nr_running - busiest->group_capacity_factor);
-
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
- load_above_capacity /= busiest->group_capacity;
+ load_above_capacity = busiest->sum_nr_running *
+ SCHED_LOAD_SCALE;
+ if (load_above_capacity > busiest->group_capacity)
+ load_above_capacity -= busiest->group_capacity;
+ else
+ load_above_capacity = ~0UL;
}
/*
@@ -6595,6 +6666,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
local = &sds.local_stat;
busiest = &sds.busiest_stat;
+ /* ASYM feature bypasses nice load balance check */
if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
check_asym_packing(env, &sds))
return sds.busiest;
@@ -6615,8 +6687,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
- !busiest->group_has_free_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ busiest->group_no_capacity)
goto force_balance;
/*
@@ -6675,7 +6747,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long capacity, capacity_factor, wl;
+ unsigned long capacity, wl;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -6704,9 +6776,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
continue;
capacity = capacity_of(i);
- capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
wl = weighted_cpuload(i);
@@ -6714,7 +6783,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu capacity.
*/
- if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
+
+ if (rq->nr_running == 1 && wl > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
continue;
/*
@@ -6762,6 +6833,28 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ /*
+ * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
+ * It's worth migrating the task if the src_cpu's capacity is reduced
+ * because of other sched_class or IRQs whereas capacity stays
+ * available on dst_cpu.
+ */
+ if ((env->idle != CPU_NOT_IDLE) &&
+ (env->src_rq->cfs.h_nr_running == 1)) {
+ unsigned long src_eff_capacity, dst_eff_capacity;
+
+ dst_eff_capacity = 100;
+ dst_eff_capacity *= capacity_of(env->dst_cpu);
+ dst_eff_capacity *= capacity_orig_of(env->src_cpu);
+
+ src_eff_capacity = sd->imbalance_pct;
+ src_eff_capacity *= capacity_of(env->src_cpu);
+ src_eff_capacity *= capacity_orig_of(env->dst_cpu);
+
+ if (src_eff_capacity < dst_eff_capacity)
+ return 1;
+ }
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -6861,6 +6954,9 @@ redo:
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+
ld_moved = 0;
if (busiest->nr_running > 1) {
/*
@@ -6870,8 +6966,6 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
@@ -7571,22 +7665,25 @@ end:
/*
* Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu is the system.
+ * of an idle cpu in the system.
* - This rq has more than one task.
- * - At any scheduler domain level, this cpu's scheduler group has multiple
- * busy cpu's exceeding the group's capacity.
+ * - This rq has at least one CFS task and the capacity of the CPU is
+ * significantly reduced because of RT tasks or IRQs.
+ * - At parent of LLC scheduler domain level, this cpu's scheduler group has
+ * multiple busy cpu.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline int nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
+ bool kick = false;
if (unlikely(rq->idle_balance))
- return 0;
+ return false;
/*
* We may be recently in ticked or tickless idle mode. At the first
@@ -7600,38 +7697,44 @@ static inline int nohz_kick_needed(struct rq *rq)
* balancing.
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
- return 0;
+ return false;
if (time_before(now, nohz.next_balance))
- return 0;
+ return false;
if (rq->nr_running >= 2)
- goto need_kick;
+ return true;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
-
if (sd) {
sgc = sd->groups->sgc;
nr_busy = atomic_read(&sgc->nr_busy_cpus);
- if (nr_busy > 1)
- goto need_kick_unlock;
+ if (nr_busy > 1) {
+ kick = true;
+ goto unlock;
+ }
+
}
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
+ sd = rcu_dereference(rq->sd);
+ if (sd) {
+ if ((rq->cfs.h_nr_running >= 1) &&
+ check_cpu_capacity(rq, sd)) {
+ kick = true;
+ goto unlock;
+ }
+ }
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
sched_domain_span(sd)) < cpu))
- goto need_kick_unlock;
-
- rcu_read_unlock();
- return 0;
+ kick = true;
-need_kick_unlock:
+unlock:
rcu_read_unlock();
-need_kick:
- return 1;
+ return kick;
}
#else
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 31f1e4d2996a..d4c580f7f12c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -362,8 +362,14 @@ struct cfs_rq {
* Under CFS, load is tracked on a per-entity basis and aggregated up.
* This allows for the description of both thread and group usage (in
* the FAIR_GROUP_SCHED case).
+ * runnable_load_avg is the sum of the load_avg_contrib of the
+ * sched_entities on the rq.
+ * blocked_load_avg is similar to runnable_load_avg except that its
+ * the blocked sched_entities on the rq.
+ * utilization_load_avg is the sum of the average running time of the
+ * sched_entities on the rq.
*/
- unsigned long runnable_load_avg, blocked_load_avg;
+ unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
atomic64_t decay_counter;
u64 last_decay;
atomic_long_t removed_load;
@@ -598,6 +604,7 @@ struct rq {
struct sched_domain *sd;
unsigned long cpu_capacity;
+ unsigned long cpu_capacity_orig;
unsigned char idle_balance;
/* For active balancing */
@@ -789,7 +796,7 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity, capacity_orig;
+ unsigned int capacity;
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
@@ -1348,9 +1355,11 @@ static inline int hrtick_enabled(struct rq *rq)
#ifdef CONFIG_SMP
extern void sched_avg_update(struct rq *rq);
+extern unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu);
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
- rq->rt_avg += rt_delta;
+ rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
sched_avg_update(rq);
}
#else