Diffstat (limited to 'kernel/sched/topology.c')
 kernel/sched/topology.c | 236 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 204 insertions(+), 32 deletions(-)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 867d173dab48..c5de38dc192f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -39,9 +39,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- " has parent");
return -1;
}
@@ -154,8 +151,12 @@ static inline bool sched_debug(void)
static int sd_degenerate(struct sched_domain *sd)
{
- if (cpumask_weight(sched_domain_span(sd)) == 1)
- return 1;
+ if (cpumask_weight(sched_domain_span(sd)) == 1) {
+ if (sd->groups->sge)
+ sd->flags &= ~SD_LOAD_BALANCE;
+ else
+ return 1;
+ }
/* Following flags need at least 2 groups */
if (sd->flags & (SD_LOAD_BALANCE |
@@ -165,7 +166,8 @@ static int sd_degenerate(struct sched_domain *sd)
SD_SHARE_CPUCAPACITY |
SD_ASYM_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
+ SD_SHARE_POWERDOMAIN |
+ SD_SHARE_CAP_STATES)) {
if (sd->groups != sd->groups->next)
return 0;
}
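
The hunk above keeps a single-CPU domain alive when it carries energy data (sd->groups->sge) and only clears SD_LOAD_BALANCE instead of degenerating it. A minimal sketch of the consumer side under that assumption, mirroring the SD_LOAD_BALANCE check the periodic balancer already performs in kernel/sched/fair.c (illustrative fragment, not part of this patch):

        static void balance_domains_sketch(int cpu)
        {
                struct sched_domain *sd;

                rcu_read_lock();
                for_each_domain(cpu, sd) {
                        /* domains kept only for their energy data are skipped */
                        if (!(sd->flags & SD_LOAD_BALANCE))
                                continue;
                        /* ... regular load_balance() work for this level ... */
                }
                rcu_read_unlock();
        }
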
@@ -198,7 +200,12 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
+ SD_SHARE_POWERDOMAIN |
+ SD_SHARE_CAP_STATES);
+ if (parent->groups->sge) {
+ parent->flags &= ~SD_LOAD_BALANCE;
+ return 0;
+ }
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -294,6 +301,11 @@ static int init_rootdomain(struct root_domain *rd)
if (cpupri_init(&rd->cpupri) != 0)
goto free_cpudl;
+
+ rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
+
+ init_max_cpu_capacity(&rd->max_cpu_capacity);
+
return 0;
free_cpudl:
@@ -405,11 +417,15 @@ DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
+DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
static void update_top_cache_domain(int cpu)
{
struct sched_domain_shared *sds = NULL;
struct sched_domain *sd;
+ struct sched_domain *ea_sd = NULL;
int id = cpu;
int size = 1;
@@ -430,6 +446,32 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+
+ for_each_domain(cpu, sd) {
+ if (sd->groups->sge)
+ ea_sd = sd;
+ else
+ break;
+ }
+ rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+ sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+ rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
+}
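
sd_ea caches the highest domain that still has energy data attached, and sd_scs the highest domain whose CPUs share capacity states. A minimal sketch of how an energy-aware wakeup path might pick up the cached pointer, following the same RCU pattern as the existing sd_llc users (the helper name is invented for illustration):

        static inline struct sched_domain *find_energy_domain(int cpu)
        {
                /* caller must be inside an rcu_read_lock() section */
                return rcu_dereference(per_cpu(sd_ea, cpu));
        }
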
+
+static void update_asym_cpucapacity(int cpu)
+{
+ bool enable = false;
+
+ rcu_read_lock();
+ if (lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY))
+ enable = true;
+ rcu_read_unlock();
+
+ if (enable) {
+ /* The _cpuslocked variant expects the hotplug lock to already be held */
+ static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
+ }
}
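
Since the key is only ever enabled, readers can rely on the cheap static-branch test. A sketch of the intended fast-path check (later mainline kernels grew a helper along these lines; the wrapper below is illustrative, not part of this patch):

        static inline bool sched_asym_cpucap_active(void)
        {
                /* false (patched-out branch) until update_asym_cpucapacity() enables the key */
                return static_branch_unlikely(&sched_asym_cpucapacity);
        }
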
/*
@@ -714,6 +756,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
sg_span = sched_group_span(sg);
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}
static int
@@ -873,6 +916,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
return sg;
}
@@ -962,6 +1006,108 @@ next:
update_group_capacity(sd, cpu);
}
+#define cap_state_power(s, i) ((s)->cap_states[i].power)
+#define cap_state_cap(s, i) ((s)->cap_states[i].cap)
+#define idle_state_power(s, i) ((s)->idle_states[i].power)
+
+static inline bool sched_group_energy_equal(const struct sched_group_energy *a,
+ const struct sched_group_energy *b)
+{
+ int i;
+
+ /* check pointers first */
+ if (a == b)
+ return true;
+
+ /* check contents are equivalent */
+ if (a->nr_cap_states != b->nr_cap_states)
+ return false;
+ if (a->nr_idle_states != b->nr_idle_states)
+ return false;
+ for (i = 0; i < a->nr_cap_states; i++) {
+ if (cap_state_power(a, i) != cap_state_power(b, i))
+ return false;
+ if (cap_state_cap(a, i) != cap_state_cap(b, i))
+ return false;
+ }
+ for (i = 0; i < a->nr_idle_states; i++) {
+ if (idle_state_power(a, i) != idle_state_power(b, i))
+ return false;
+ }
+
+ return true;
+}
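
For reference, a hypothetical per-cluster table that the equality check above would accept. The struct and field names follow the accessor macros and the sched_group_energy layout used elsewhere in this EAS series (cap_states[].cap/.power, idle_states[].power, nr_cap_states, nr_idle_states); the numbers are made up, chosen so that capacity/power shrinks at every higher OPP as the monotonicity check further down requires:

        static struct capacity_state cluster_cap_states[] = {
                { .cap =  446, .power = 130 },  /* lowest OPP  */
                { .cap =  871, .power = 430 },
                { .cap = 1024, .power = 616 },  /* highest OPP */
        };

        static struct idle_state cluster_idle_states[] = {
                { .power = 10 },        /* WFI */
                { .power =  0 },        /* cluster off */
        };

        static struct sched_group_energy cluster_energy = {
                .nr_cap_states  = ARRAY_SIZE(cluster_cap_states),
                .cap_states     = cluster_cap_states,
                .nr_idle_states = ARRAY_SIZE(cluster_idle_states),
                .idle_states    = cluster_idle_states,
        };
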
+
+#define energy_eff(e, n) \
+ (((e)->cap_states[n].cap << SCHED_CAPACITY_SHIFT) / (e)->cap_states[n].power)
+
+static void init_sched_groups_energy(int cpu, struct sched_domain *sd,
+ sched_domain_energy_f fn)
+{
+ struct sched_group *sg = sd->groups;
+ const struct sched_group_energy *sge;
+ int i;
+
+ if (!(fn && fn(cpu)))
+ return;
+
+ if (cpu != group_balance_cpu(sg))
+ return;
+
+ if (sd->flags & SD_OVERLAP) {
+ pr_err("BUG: EAS does not support overlapping sd spans\n");
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" the %s domain has SD_OVERLAP set\n", sd->name);
+#endif
+ return;
+ }
+
+ if (sd->child && !sd->child->groups->sge) {
+ pr_err("BUG: EAS setup borken for CPU%d\n", cpu);
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" energy data on %s but not on %s domain\n",
+ sd->name, sd->child->name);
+#endif
+ return;
+ }
+
+ sge = fn(cpu);
+
+ /*
+ * Check that the per-cpu provided sd energy data is consistent for all
+ * cpus within the mask.
+ */
+ if (cpumask_weight(sched_group_span(sg)) > 1) {
+ struct cpumask mask;
+
+ cpumask_xor(&mask, sched_group_span(sg), get_cpu_mask(cpu));
+
+ for_each_cpu(i, &mask)
+ BUG_ON(!sched_group_energy_equal(sge, fn(i)));
+ }
+
+ /*
+ * Check that energy efficiency (capacity/power) is monotonically
+ * decreasing in the capacity state vector with higher indexes.
+ */
+ for (i = 0; i < (sge->nr_cap_states - 1); i++) {
+ if (energy_eff(sge, i) > energy_eff(sge, i+1))
+ continue;
+#ifdef CONFIG_SCHED_DEBUG
+ pr_warn("WARN: cpu=%d, domain=%s: incr. energy eff %lu[%d]->%lu[%d]\n",
+ cpu, sd->name, energy_eff(sge, i), i,
+ energy_eff(sge, i+1), i+1);
+#else
+ pr_warn("WARN: cpu=%d: incr. energy eff %lu[%d]->%lu[%d]\n",
+ cpu, energy_eff(sge, i), i, energy_eff(sge, i+1), i+1);
+#endif
+ }
+
+ sd->groups->sge = fn(cpu);
+}
+
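
energy_eff() computes efficiency in fixed point: (cap << SCHED_CAPACITY_SHIFT) / power, i.e. capacity per unit of power scaled by 1024. Working the check through the hypothetical cluster_energy table sketched earlier:

        /* state 0: (446  << 10) / 130 = 3513 */
        /* state 1: (871  << 10) / 430 = 2074 */
        /* state 2: (1024 << 10) / 616 = 1702 */
        /*
         * 3513 > 2074 > 1702: efficiency strictly decreases with the index,
         * so the loop above stays silent.  Had state 1 used power = 200
         * instead, (871 << 10) / 200 = 4459 > 3513 and the WARN would fire
         * for the 0 -> 1 transition.
         */
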
/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
@@ -1081,6 +1227,7 @@ static int sched_domains_curr_level;
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
* SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
+ * SD_SHARE_CAP_STATES - describes shared capacity states
*
* Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
@@ -1093,7 +1240,8 @@ static int sched_domains_curr_level;
SD_NUMA | \
SD_ASYM_PACKING | \
SD_ASYM_CPUCAPACITY | \
- SD_SHARE_POWERDOMAIN)
+ SD_SHARE_POWERDOMAIN | \
+ SD_SHARE_CAP_STATES)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
@@ -1141,7 +1289,7 @@ sd_init(struct sched_domain_topology_level *tl,
| 0*SD_SHARE_CPUCAPACITY
| 0*SD_SHARE_PKG_RESOURCES
| 0*SD_SERIALIZE
- | 0*SD_PREFER_SIBLING
+ | 1*SD_PREFER_SIBLING
| 0*SD_NUMA
| sd_flags
,
@@ -1161,18 +1309,43 @@ sd_init(struct sched_domain_topology_level *tl,
sd_id = cpumask_first(sched_domain_span(sd));
/*
+ * Check whether the cpu_map eclipses the CPU capacity asymmetry, i.e.
+ * all CPUs left in this domain's span have the same capacity.
+ */
+
+ if (sd->flags & SD_ASYM_CPUCAPACITY) {
+ unsigned long capacity = arch_scale_cpu_capacity(NULL, sd_id);
+ bool disable = true;
+ int i;
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (capacity != arch_scale_cpu_capacity(NULL, i)) {
+ disable = false;
+ break;
+ }
+ }
+
+ if (disable)
+ sd->flags &= ~SD_ASYM_CPUCAPACITY;
+ }
+
+ /*
* Convert topological properties into behaviour.
*/
if (sd->flags & SD_ASYM_CPUCAPACITY) {
struct sched_domain *t = sd;
+ /*
+ * Don't attempt to spread across cpus of different capacities.
+ */
+ if (sd->child)
+ sd->child->flags &= ~SD_PREFER_SIBLING;
+
for_each_lower_domain(t)
t->flags |= SD_BALANCE_WAKE;
}
if (sd->flags & SD_SHARE_CPUCAPACITY) {
- sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
@@ -1187,6 +1360,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->busy_idx = 3;
sd->idle_idx = 2;
+ sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
sd->flags &= ~(SD_BALANCE_EXEC |
@@ -1196,21 +1370,16 @@ sd_init(struct sched_domain_topology_level *tl,
#endif
} else {
- sd->flags |= SD_PREFER_SIBLING;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
sd->idle_idx = 1;
}
- /*
- * For all levels sharing cache; connect a sched_domain_shared
- * instance.
- */
- if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
- atomic_inc(&sd->shared->ref);
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ atomic_inc(&sd->shared->ref);
+
+ if (sd->flags & SD_SHARE_PKG_RESOURCES)
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
- }
sd->private = sdd;
@@ -1651,7 +1820,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
- struct rq *rq = NULL;
int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -1669,8 +1837,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
*per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP)
sd->flags |= SD_OVERLAP;
- if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- break;
}
}
@@ -1690,10 +1856,13 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ struct sched_domain_topology_level *tl = sched_domain_topology;
+
if (!cpumask_test_cpu(i, cpu_map))
continue;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
+ init_sched_groups_energy(i, sd, tl->energy);
claim_allocations(i, sd);
init_sched_groups_capacity(i, sd);
}
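
The lockstep walk above (tl advances together with sd->parent) assumes each topology level may provide an energy callback in tl->energy. A hedged sketch of how an arch table could wire one up; the callback name and table contents are illustrative, only the .mask/.energy fields and the callback shape implied by sched_domain_energy_f are taken from this series:

        static const struct sched_group_energy *cpu_cluster_energy(int cpu)
        {
                /* hypothetical: hand back the cluster_energy table sketched earlier */
                return &cluster_energy;
        }

        static struct sched_domain_topology_level my_topology[] = {
        #ifdef CONFIG_SCHED_MC
                { .mask = cpu_coregroup_mask, SD_INIT_NAME(MC) },
        #endif
                { .mask = cpu_cpu_mask, .energy = cpu_cluster_energy, SD_INIT_NAME(DIE) },
                { NULL, },
        };

An arch would install such a table with set_sched_topology(my_topology) before the domains are (re)built.
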
@@ -1702,21 +1871,25 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- rq = cpu_rq(i);
+ int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
+ int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
+
sd = *per_cpu_ptr(d.sd, i);
- /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
- WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+ if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
+ cpu_rq(max_cpu)->cpu_capacity_orig))
+ WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
+
+ if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
+ cpu_rq(min_cpu)->cpu_capacity_orig))
+ WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
cpu_attach_domain(sd, d.rd, i);
}
rcu_read_unlock();
- if (rq && sched_debug_enabled) {
- pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
- cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
- }
+ if (!cpumask_empty(cpu_map))
+ update_asym_cpucapacity(cpumask_first(cpu_map));
ret = 0;
error:
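
With the root-domain extremes recorded in the attach loop above, a consumer can look up the largest original CPU capacity without scanning every runqueue. A minimal sketch, assuming an invented helper name and the usual lifetime protections on rd:

        static unsigned long rd_max_cpu_capacity(struct root_domain *rd)
        {
                int cpu = READ_ONCE(rd->max_cap_orig_cpu);

                /* still -1 until build_sched_domains() has attached the domains */
                if (cpu < 0)
                        return SCHED_CAPACITY_SCALE;

                return cpu_rq(cpu)->cpu_capacity_orig;
        }
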
@@ -1928,4 +2101,3 @@ match2:
mutex_unlock(&sched_domains_mutex);
}
-