Diffstat (limited to 'kernel/sched/topology.c')
-rw-r--r-- | kernel/sched/topology.c | 236
1 files changed, 204 insertions, 32 deletions
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 867d173dab48..c5de38dc192f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -39,9 +39,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 	if (!(sd->flags & SD_LOAD_BALANCE)) {
 		printk("does not load-balance\n");
-		if (sd->parent)
-			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
-					" has parent");
 		return -1;
 	}
 
@@ -154,8 +151,12 @@ static inline bool sched_debug(void)
 
 static int sd_degenerate(struct sched_domain *sd)
 {
-	if (cpumask_weight(sched_domain_span(sd)) == 1)
-		return 1;
+	if (cpumask_weight(sched_domain_span(sd)) == 1) {
+		if (sd->groups->sge)
+			sd->flags &= ~SD_LOAD_BALANCE;
+		else
+			return 1;
+	}
 
 	/* Following flags need at least 2 groups */
 	if (sd->flags & (SD_LOAD_BALANCE |
@@ -165,7 +166,8 @@ static int sd_degenerate(struct sched_domain *sd)
 			 SD_SHARE_CPUCAPACITY |
 			 SD_ASYM_CPUCAPACITY |
 			 SD_SHARE_PKG_RESOURCES |
-			 SD_SHARE_POWERDOMAIN)) {
+			 SD_SHARE_POWERDOMAIN |
+			 SD_SHARE_CAP_STATES)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -198,7 +200,12 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_SHARE_CPUCAPACITY |
 				SD_SHARE_PKG_RESOURCES |
 				SD_PREFER_SIBLING |
-				SD_SHARE_POWERDOMAIN);
+				SD_SHARE_POWERDOMAIN |
+				SD_SHARE_CAP_STATES);
+		if (parent->groups->sge) {
+			parent->flags &= ~SD_LOAD_BALANCE;
+			return 0;
+		}
 		if (nr_node_ids == 1)
 			pflags &= ~SD_SERIALIZE;
 	}
@@ -294,6 +301,11 @@ static int init_rootdomain(struct root_domain *rd)
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
+
+	rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
+
+	init_max_cpu_capacity(&rd->max_cpu_capacity);
+
 	return 0;
 
 free_cpudl:
@@ -405,11 +417,15 @@ DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
+DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain_shared *sds = NULL;
 	struct sched_domain *sd;
+	struct sched_domain *ea_sd = NULL;
 	int id = cpu;
 	int size = 1;
 
@@ -430,6 +446,32 @@ static void update_top_cache_domain(int cpu)
 
 	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
 	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+
+	for_each_domain(cpu, sd) {
+		if (sd->groups->sge)
+			ea_sd = sd;
+		else
+			break;
+	}
+	rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+	sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+	rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
+}
+
+static void update_asym_cpucapacity(int cpu)
+{
+	int enable = false;
+
+	rcu_read_lock();
+	if (lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY))
+		enable = true;
+	rcu_read_unlock();
+
+	if (enable) {
+		/* This expects to be hotplug-safe */
+		static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
+	}
 }
 
 /*
@@ -714,6 +756,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
 	sg_span = sched_group_span(sg);
 	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
 	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
 }
 
 static int
@@ -873,6 +916,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
 
 	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
 	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
 
 	return sg;
 }
@@ -962,6 +1006,108 @@ next:
 	update_group_capacity(sd, cpu);
 }
 
+#define cap_state_power(s,i) (s->cap_states[i].power)
+#define cap_state_cap(s,i) (s->cap_states[i].cap)
+#define idle_state_power(s,i) (s->idle_states[i].power)
+
+static inline int sched_group_energy_equal(const struct sched_group_energy *a,
+					   const struct sched_group_energy *b)
+{
+	int i;
+
+	/* check pointers first */
+	if (a == b)
+		return true;
+
+	/* check contents are equivalent */
+	if (a->nr_cap_states != b->nr_cap_states)
+		return false;
+	if (a->nr_idle_states != b->nr_idle_states)
+		return false;
+	for (i=0;i<a->nr_cap_states;i++){
+		if (cap_state_power(a,i) !=
+			cap_state_power(b,i))
+			return false;
+		if (cap_state_cap(a,i) !=
+			cap_state_cap(b,i))
+			return false;
+	}
+	for (i=0;i<a->nr_idle_states;i++){
+		if (idle_state_power(a,i) !=
+			idle_state_power(b,i))
+			return false;
+	}
+
+	return true;
+}
+
+#define energy_eff(e, n) \
+    ((e->cap_states[n].cap << SCHED_CAPACITY_SHIFT)/e->cap_states[n].power)
+
+static void init_sched_groups_energy(int cpu, struct sched_domain *sd,
+				     sched_domain_energy_f fn)
+{
+	struct sched_group *sg = sd->groups;
+	const struct sched_group_energy *sge;
+	int i;
+
+	if (!(fn && fn(cpu)))
+		return;
+
+	if (cpu != group_balance_cpu(sg))
+		return;
+
+	if (sd->flags & SD_OVERLAP) {
+		pr_err("BUG: EAS does not support overlapping sd spans\n");
+#ifdef CONFIG_SCHED_DEBUG
+		pr_err("     the %s domain has SD_OVERLAP set\n", sd->name);
+#endif
+		return;
+	}
+
+	if (sd->child && !sd->child->groups->sge) {
+		pr_err("BUG: EAS setup borken for CPU%d\n", cpu);
+#ifdef CONFIG_SCHED_DEBUG
+		pr_err("     energy data on %s but not on %s domain\n",
+			sd->name, sd->child->name);
+#endif
+		return;
+	}
+
+	sge = fn(cpu);
+
+	/*
+	 * Check that the per-cpu provided sd energy data is consistent for all
+	 * cpus within the mask.
+	 */
+	if (cpumask_weight(sched_group_span(sg)) > 1) {
+		struct cpumask mask;
+
+		cpumask_xor(&mask, sched_group_span(sg), get_cpu_mask(cpu));
+
+		for_each_cpu(i, &mask)
+			BUG_ON(!sched_group_energy_equal(sge,fn(i)));
+	}
+
+	/* Check that energy efficiency (capacity/power) is monotonically
+	 * decreasing in the capacity state vector with higher indexes
+	 */
+	for (i = 0; i < (sge->nr_cap_states - 1); i++) {
+		if (energy_eff(sge, i) > energy_eff(sge, i+1))
+			continue;
+#ifdef CONFIG_SCHED_DEBUG
+		pr_warn("WARN: cpu=%d, domain=%s: incr. energy eff %lu[%d]->%lu[%d]\n",
+			cpu, sd->name, energy_eff(sge, i), i,
+			energy_eff(sge, i+1), i+1);
+#else
+		pr_warn("WARN: cpu=%d: incr. energy eff %lu[%d]->%lu[%d]\n",
+			cpu, energy_eff(sge, i), i, energy_eff(sge, i+1), i+1);
+#endif
+	}
+
+	sd->groups->sge = fn(cpu);
+}
+
 /*
  * Initializers for schedule domains
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
 */
@@ -1081,6 +1227,7 @@ static int sched_domains_curr_level;
  * SD_NUMA                - describes NUMA topologies
  * SD_SHARE_POWERDOMAIN   - describes shared power domain
  * SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
+ * SD_SHARE_CAP_STATES    - describes shared capacity states
  *
  * Odd one out, which beside describing the topology has a quirk also
  * prescribes the desired behaviour that goes along with it:
@@ -1093,7 +1240,8 @@ static int sched_domains_curr_level;
	 SD_NUMA |		\
	 SD_ASYM_PACKING |	\
	 SD_ASYM_CPUCAPACITY |	\
-	 SD_SHARE_POWERDOMAIN)
+	 SD_SHARE_POWERDOMAIN |	\
+	 SD_SHARE_CAP_STATES)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl,
@@ -1141,7 +1289,7 @@ sd_init(struct sched_domain_topology_level *tl,
 					| 0*SD_SHARE_CPUCAPACITY
 					| 0*SD_SHARE_PKG_RESOURCES
 					| 0*SD_SERIALIZE
-					| 0*SD_PREFER_SIBLING
+					| 1*SD_PREFER_SIBLING
 					| 0*SD_NUMA
 					| sd_flags
 					,
@@ -1161,18 +1309,43 @@ sd_init(struct sched_domain_topology_level *tl,
 	sd_id = cpumask_first(sched_domain_span(sd));
 
 	/*
+	 * Check if cpu_map eclipses cpu capacity asymmetry.
+	 */
+
+	if (sd->flags & SD_ASYM_CPUCAPACITY) {
+		long capacity = arch_scale_cpu_capacity(NULL, sd_id);
+		bool disable = true;
+		int i;
+
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (capacity != arch_scale_cpu_capacity(NULL, i)) {
+				disable = false;
+				break;
+			}
+		}
+
+		if (disable)
+			sd->flags &= ~SD_ASYM_CPUCAPACITY;
+	}
+
+	/*
 	 * Convert topological properties into behaviour.
 	 */
 
 	if (sd->flags & SD_ASYM_CPUCAPACITY) {
 		struct sched_domain *t = sd;
 
+		/*
+		 * Don't attempt to spread across cpus of different capacities.
+		 */
+		if (sd->child)
+			sd->child->flags &= ~SD_PREFER_SIBLING;
+
 		for_each_lower_domain(t)
 			t->flags |= SD_BALANCE_WAKE;
 	}
 
 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
-		sd->flags |= SD_PREFER_SIBLING;
 		sd->imbalance_pct = 110;
 		sd->smt_gain = 1178; /* ~15% */
 
@@ -1187,6 +1360,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->busy_idx = 3;
 		sd->idle_idx = 2;
 
+		sd->flags &= ~SD_PREFER_SIBLING;
 		sd->flags |= SD_SERIALIZE;
 		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
 			sd->flags &= ~(SD_BALANCE_EXEC |
@@ -1196,21 +1370,16 @@ sd_init(struct sched_domain_topology_level *tl,
 
 #endif
 	} else {
-		sd->flags |= SD_PREFER_SIBLING;
 		sd->cache_nice_tries = 1;
 		sd->busy_idx = 2;
 		sd->idle_idx = 1;
 	}
 
-	/*
-	 * For all levels sharing cache; connect a sched_domain_shared
-	 * instance.
-	 */
-	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
-		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
-		atomic_inc(&sd->shared->ref);
+	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+	atomic_inc(&sd->shared->ref);
+
+	if (sd->flags & SD_SHARE_PKG_RESOURCES)
 		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
-	}
 
 	sd->private = sdd;
 
@@ -1651,7 +1820,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	enum s_alloc alloc_state;
 	struct sched_domain *sd;
 	struct s_data d;
-	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 
 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -1669,8 +1837,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 			*per_cpu_ptr(d.sd, i) = sd;
 			if (tl->flags & SDTL_OVERLAP)
 				sd->flags |= SD_OVERLAP;
-			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
-				break;
 		}
 	}
 
@@ -1690,10 +1856,13 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 
 	/* Calculate CPU capacity for physical packages and nodes */
 	for (i = nr_cpumask_bits-1; i >= 0; i--) {
+		struct sched_domain_topology_level *tl = sched_domain_topology;
+
 		if (!cpumask_test_cpu(i, cpu_map))
 			continue;
 
-		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
+			init_sched_groups_energy(i, sd, tl->energy);
 			claim_allocations(i, sd);
 			init_sched_groups_capacity(i, sd);
 		}
@@ -1702,21 +1871,25 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
-		rq = cpu_rq(i);
+		int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
+		int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
+
 		sd = *per_cpu_ptr(d.sd, i);
 
-		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
-		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
-			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+		if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
+		    cpu_rq(max_cpu)->cpu_capacity_orig))
+			WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
+
+		if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
+		    cpu_rq(min_cpu)->cpu_capacity_orig))
+			WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
 
 		cpu_attach_domain(sd, d.rd, i);
 	}
 	rcu_read_unlock();
 
-	if (rq && sched_debug_enabled) {
-		pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
-			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
-	}
+	if (!cpumask_empty(cpu_map))
+		update_asym_cpucapacity(cpumask_first(cpu_map));
 
 	ret = 0;
 error:
@@ -1928,4 +2101,3 @@ match2:
 
 	mutex_unlock(&sched_domains_mutex);
 }
-
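
Note: the sketch below is not part of the patch. It is a minimal, user-space approximation of the energy-efficiency sanity check that the new init_sched_groups_energy() performs through the energy_eff() macro: for adjacent capacity states, (cap << SCHED_CAPACITY_SHIFT) / power is expected to decrease as the state index rises, and a warning is printed otherwise. The cap/power values in the table are invented sample numbers, not data from any real platform.

```c
/* Standalone illustration of the energy_eff() monotonicity check. */
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10

struct cap_state {
	unsigned long cap;	/* compute capacity at this state */
	unsigned long power;	/* power cost at this state */
};

/* Same formula as the kernel's energy_eff(e, n) macro */
static unsigned long energy_eff(const struct cap_state *s, int n)
{
	return (s[n].cap << SCHED_CAPACITY_SHIFT) / s[n].power;
}

int main(void)
{
	/* Hypothetical capacity states, ordered lowest to highest frequency */
	const struct cap_state states[] = {
		{ 235, 33 }, { 302, 46 }, { 368, 61 }, { 406, 76 }, { 447, 93 },
	};
	int nr = sizeof(states) / sizeof(states[0]);
	int i;

	for (i = 0; i < nr - 1; i++) {
		if (energy_eff(states, i) > energy_eff(states, i + 1))
			continue;	/* efficiency decreasing: expected */
		printf("WARN: incr. energy eff %lu[%d]->%lu[%d]\n",
		       energy_eff(states, i), i,
		       energy_eff(states, i + 1), i + 1);
	}
	return 0;
}
```

With the sample table above the efficiency falls from roughly 7292 at index 0 to roughly 4922 at index 4, so no warning is emitted; swapping two rows would trigger the same kind of "incr. energy eff" message the patch logs via pr_warn().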