Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 2787 |
1 file changed, 2609 insertions, 178 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b4e997fea1a..149db89c66d8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -37,6 +37,8 @@ #include <trace/events/sched.h> #include "sched.h" +#include "tune.h" +#include "walt.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -55,6 +57,15 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* + * Enable/disable honoring sync flag in energy-aware wakeups. + */ +unsigned int sysctl_sched_sync_hint_enable = 1; +/* + * Enable/disable using cstate knowledge in idle sibling selection + */ +unsigned int sysctl_sched_cstate_aware = 1; + +/* * The initial- and re-scaling of tunables is configurable * * Options are: @@ -100,6 +111,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); +#endif + #ifdef CONFIG_SMP /* * For asym packing, by default the lower numbered cpu has higher priority. @@ -710,14 +728,17 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); +static unsigned long capacity_of(int cpu); /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) { struct sched_avg *sa = &se->avg; - sa->last_update_time = 0; + memset(sa, 0, sizeof(*sa)); /* + * util_avg is initialized in post_init_entity_util_avg. + * util_est should start from zero. * sched_avg's period_contrib should be strictly less then 1024, so * we give it 1023 to make sure it is almost a period (1024us), and * will definitely be update (after enqueue). 
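Aside: the WALT tunables introduced in this hunk (sysctl_sched_use_walt_cpu_util, sysctl_sched_use_walt_task_util, sysctl_sched_walt_cpu_high_irqload) gate whether the task_util()/cpu_util() helpers added further down read utilization from WALT's windowed demand rather than from PELT. Below is a minimal userspace sketch of the scaling those helpers apply, demand / (walt_ravg_window >> SCHED_CAPACITY_SHIFT); the 20ms window and 10ms demand are made-up example values, not taken from this patch.

/* Illustration only, not part of the patch: maps a WALT demand sample
 * (busy nanoseconds per window) into the [0..1024] capacity range that
 * PELT consumers expect. */
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10

int main(void)
{
	unsigned long long walt_ravg_window = 20000000ULL;	/* assumed 20ms window */
	unsigned long long demand = 10000000ULL;		/* assumed 10ms of demand */
	unsigned long long util = demand / (walt_ravg_window >> SCHED_CAPACITY_SHIFT);

	printf("util = %llu of 1024\n", util);	/* prints 512, i.e. ~50% busy */
	return 0;
}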
@@ -732,11 +753,6 @@ void init_entity_runnable_average(struct sched_entity *se) if (entity_is_task(se)) sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - /* - * At this point, util_avg won't be used in select_task_rq_fair anyway - */ - sa->util_avg = 0; - sa->util_sum = 0; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } @@ -967,6 +983,7 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } trace_sched_stat_blocked(tsk, delta); + trace_sched_blocked_reason(tsk); /* * Blocking time is in units of nanosecs, so shift by @@ -1431,7 +1448,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(struct rq *rq); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long capacity_of(int cpu); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -2988,7 +3004,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, */ static __always_inline int ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) + unsigned long weight, int running, struct cfs_rq *cfs_rq, + struct rt_rq *rt_rq) { u64 delta; @@ -3044,21 +3061,83 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, } sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); + if (cfs_rq) + trace_sched_load_cfs_rq(cfs_rq); + else { + if (likely(!rt_rq)) + trace_sched_load_se(container_of(sa, struct sched_entity, avg)); + else + trace_sched_load_rt_rq(cpu, rt_rq); + } + return 1; } +/* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. + * This flag is used to synchronize util_avg updates with util_est updates. + * We map this information into the LSB bit of the utilization saved at + * dequeue time (i.e. util_est.dequeued). + */ +#define UTIL_AVG_UNCHANGED 0x1 + +static inline void cfs_se_util_change(struct sched_avg *avg) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Avoid store if the flag has been already set */ + enqueued = avg->util_est.enqueued; + if (!(enqueued & UTIL_AVG_UNCHANGED)) + return; + + /* Reset flag to report util_avg has been updated */ + enqueued &= ~UTIL_AVG_UNCHANGED; + WRITE_ONCE(avg->util_est.enqueued, enqueued); +} + static int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); + return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL, NULL); } static int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - return ___update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); + if (___update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL, NULL)) { + cfs_se_util_change(&se->avg); + +#ifdef UTIL_EST_DEBUG + /* + * Trace utilization only for actual tasks. + * + * These trace events are mostly useful to get easier to + * read plots for the estimated utilization, where we can + * compare it with the actual grow/decrease of the original + * PELT signal. + * Let's keep them disabled by default in "production kernels". 
+ */ + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + trace_sched_util_est_task(tsk, &se->avg); + + /* Trace utilization only for top level CFS RQ */ + cfs_rq = &(task_rq(tsk)->cfs); + trace_sched_util_est_cpu(cpu, cfs_rq); + } +#endif /* UTIL_EST_DEBUG */ + + return 1; + } + + return 0; } static int @@ -3066,7 +3145,7 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) { return ___update_load_avg(now, cpu, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->curr != NULL, cfs_rq); + cfs_rq->curr != NULL, cfs_rq, NULL); } /* @@ -3119,6 +3198,8 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } + + trace_sched_load_tg(cfs_rq); } /* @@ -3287,6 +3368,9 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) update_tg_cfs_util(cfs_rq, se); update_tg_cfs_load(cfs_rq, se); + trace_sched_load_cfs_rq(cfs_rq); + trace_sched_load_se(se); + return 1; } @@ -3401,6 +3485,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) return decayed || removed_load; } +int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running) +{ + int ret; + + ret = ___update_load_avg(now, cpu, &rt_rq->avg, running, running, NULL, rt_rq); + + return ret; +} + /* * Optional action to be done while updating the load average */ @@ -3448,6 +3541,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); + + trace_sched_load_cfs_rq(cfs_rq); } /** @@ -3468,6 +3563,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); + + trace_sched_load_cfs_rq(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -3564,6 +3661,157 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) static int idle_balance(struct rq *this_rq, struct rq_flags *rf); +static inline int task_fits_capacity(struct task_struct *p, long capacity); + +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return; + + if (!p) { + rq->misfit_task_load = 0; + return; + } + + if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + rq->misfit_task_load = 0; + return; + } + + rq->misfit_task_load = task_h_load(p); +} + +static inline unsigned long task_util(struct task_struct *p) +{ +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) + return (p->ravg.demand / + (walt_ravg_window >> SCHED_CAPACITY_SHIFT)); +#endif + return READ_ONCE(p->se.avg.util_avg); +} + +static inline unsigned long _task_util_est(struct task_struct *p) +{ + struct util_est ue = READ_ONCE(p->se.avg.util_est); + + return max(ue.ewma, ue.enqueued); +} + +static inline unsigned long task_util_est(struct task_struct *p) +{ +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) + return (p->ravg.demand / + (walt_ravg_window >> SCHED_CAPACITY_SHIFT)); +#endif + return max(task_util(p), _task_util_est(p)); +} + +static inline void util_est_enqueue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED); + 
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_task(p, &p->se.avg); + trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); +} + +/* + * Check if a (signed) value is within a specified (unsigned) margin, + * based on the observation that: + * + * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) + * + * NOTE: this only works when value + maring < INT_MAX. + */ +static inline bool within_margin(int value, int margin) +{ + return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); +} + +static void +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +{ + long last_ewma_diff; + struct util_est ue; + + if (!sched_feat(UTIL_EST)) + return; + + /* + * Update root cfs_rq's estimated utilization + * + * If *p is the last task then the root cfs_rq's estimated utilization + * of a CPU is 0 by definition. + */ + ue.enqueued = 0; + if (cfs_rq->nr_running) { + ue.enqueued = cfs_rq->avg.util_est.enqueued; + ue.enqueued -= min_t(unsigned int, ue.enqueued, + (_task_util_est(p) | UTIL_AVG_UNCHANGED)); + } + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + + trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + + /* + * Skip update of task's estimated utilization when the task has not + * yet completed an activation, e.g. being migrated. + */ + if (!task_sleep) + return; + + /* + * If the PELT values haven't changed since enqueue time, + * skip the util_est update. + */ + ue = p->se.avg.util_est; + if (ue.enqueued & UTIL_AVG_UNCHANGED) + return; + + /* + * Skip update of task's estimated utilization when its EWMA is + * already ~1% close to its last activation value. + */ + ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); + last_ewma_diff = ue.enqueued - ue.ewma; + if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) + return; + + /* + * Update Task's estimated utilization + * + * When *p completes an activation we can consolidate another sample + * of the task size. 
This is done by storing the current PELT value + * as ue.enqueued and by using this value to update the Exponential + * Weighted Moving Average (EWMA): + * + * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) + * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) + * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) + * = w * ( last_ewma_diff ) + ewma(t-1) + * = w * (last_ewma_diff + ewma(t-1) / w) + * + * Where 'w' is the weight of new samples, which is configured to be + * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) + */ + ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; + ue.ewma += last_ewma_diff; + ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; + WRITE_ONCE(p->se.avg.util_est, ue); + + trace_sched_util_est_task(p, &p->se.avg); +} + #else /* CONFIG_SMP */ static inline int @@ -3572,6 +3820,11 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) return 0; } +int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running) +{ + return 0; +} + #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 @@ -3596,6 +3849,15 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf) return 0; } +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} + +static inline void +util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} + #endif /* CONFIG_SMP */ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -4230,6 +4492,23 @@ static int tg_throttle_down(struct task_group *tg, void *data) return 0; } +#ifdef CONFIG_SCHED_WALT +static inline void walt_propagate_cumulative_runnable_avg(u64 *accumulated, + u64 value, bool add) +{ + if (add) + *accumulated += value; + else + *accumulated -= value; +} +#else +/* + * Provide a nop definition since cumulative_runnable_avg is not + * available in rq or cfs_rq when WALT is not enabled. + */ +#define walt_propagate_cumulative_runnable_avg(...) 
+#endif + static void throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -4255,13 +4534,21 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (dequeue) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; + walt_propagate_cumulative_runnable_avg( + &qcfs_rq->cumulative_runnable_avg, + cfs_rq->cumulative_runnable_avg, false); + if (qcfs_rq->load.weight) dequeue = 0; } - if (!se) + if (!se) { sub_nr_running(rq, task_delta); + walt_propagate_cumulative_runnable_avg( + &rq->cumulative_runnable_avg, + cfs_rq->cumulative_runnable_avg, false); + } cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); @@ -4295,6 +4582,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct sched_entity *se; int enqueue = 1; long task_delta; + struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4322,13 +4610,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (enqueue) enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); cfs_rq->h_nr_running += task_delta; + walt_propagate_cumulative_runnable_avg( + &cfs_rq->cumulative_runnable_avg, + tcfs_rq->cumulative_runnable_avg, true); if (cfs_rq_throttled(cfs_rq)) break; } - if (!se) + if (!se) { add_nr_running(rq, task_delta); + walt_propagate_cumulative_runnable_avg( + &rq->cumulative_runnable_avg, + tcfs_rq->cumulative_runnable_avg, true); + } /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -4790,6 +5085,30 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) rcu_read_unlock(); } +#ifdef CONFIG_SCHED_WALT +static void walt_fixup_cumulative_runnable_avg_fair(struct rq *rq, + struct task_struct *p, + u64 new_task_load) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + s64 task_load_delta = (s64)new_task_load - p->ravg.demand; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + cfs_rq->cumulative_runnable_avg += task_load_delta; + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Fix up rq only if we didn't find any throttled cfs_rq */ + if (!se) + walt_fixup_cumulative_runnable_avg(rq, p, new_task_load); +} + +#endif /* CONFIG_SCHED_WALT */ + #else /* CONFIG_CFS_BANDWIDTH */ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { @@ -4832,6 +5151,9 @@ static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} static inline void update_runtime_enabled(struct rq *rq) {} static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} +#define walt_fixup_cumulative_runnable_avg_fair \ + walt_fixup_cumulative_runnable_avg + #endif /* CONFIG_CFS_BANDWIDTH */ /************************************************** @@ -4886,6 +5208,46 @@ static inline void hrtick_update(struct rq *rq) } #endif +#ifdef CONFIG_SMP +static bool cpu_overutilized(int cpu); + +static bool sd_overutilized(struct sched_domain *sd) +{ + return sd->shared->overutilized; +} + +static void set_sd_overutilized(struct sched_domain *sd) +{ + trace_sched_overutilized(sd, sd->shared->overutilized, true); + sd->shared->overutilized = true; +} + +static void clear_sd_overutilized(struct sched_domain *sd) +{ + trace_sched_overutilized(sd, sd->shared->overutilized, false); + sd->shared->overutilized = false; +} + +static inline void update_overutilized_status(struct rq *rq) +{ + struct sched_domain *sd; + + rcu_read_lock(); + sd = rcu_dereference(rq->sd); + if (sd && !sd_overutilized(sd) && + cpu_overutilized(rq->cpu)) + set_sd_overutilized(sd); + rcu_read_unlock(); +} + +unsigned long 
boosted_cpu_util(int cpu); +#else + +#define update_overutilized_status(rq) do {} while (0) +#define boosted_cpu_util(cpu) cpu_util_freq(cpu) + +#endif /* CONFIG_SMP */ + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -4896,6 +5258,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int task_new = !(flags & ENQUEUE_WAKEUP); + + /* + * The code below (indirectly) updates schedutil which looks at + * the cfs_rq utilization to select a frequency. + * Let's add the task's estimated utilization to the cfs_rq's + * estimated utilization, before we update schedutil. + */ + util_est_enqueue(&rq->cfs, p); + + /* + * The code below (indirectly) updates schedutil which looks at + * the cfs_rq utilization to select a frequency. + * Let's update schedtune here to ensure the boost value of the + * current task is accounted for in the selection of the OPP. + * + * We do it also in the case where we enqueue a throttled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. + */ + schedtune_enqueue_task(p, cpu_of(rq)); /* * If in_iowait is set, the code below may not trigger any cpufreq @@ -4920,6 +5309,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); flags = ENQUEUE_WAKEUP; } @@ -4927,6 +5317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4935,8 +5326,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(se); } - if (!se) + if (!se) { add_nr_running(rq, 1); + if (!task_new) + update_overutilized_status(rq); + walt_inc_cumulative_runnable_avg(rq, p); + } hrtick_update(rq); } @@ -4954,6 +5349,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + /* + * The code below (indirectly) updates schedutil which looks at + * the cfs_rq utilization to select a frequency. + * Let's update schedtune here to ensure the boost value of the + * current task is not more accounted for in the selection of the OPP. 
+ */ + schedtune_dequeue_task(p, cpu_of(rq)); + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -4967,6 +5370,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4986,6 +5390,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4994,9 +5399,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(se); } - if (!se) + if (!se) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); + } + util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5304,16 +5712,6 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} - static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -5344,6 +5742,862 @@ static void record_wakee(struct task_struct *p) } /* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +unsigned long capacity_curr_of(int cpu) +{ + unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig; + unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); + + return cap_scale(max_cap, scale_freq); +} + +inline bool energy_aware(void) +{ + return sched_feat(ENERGY_AWARE); +} + +/* + * __cpu_norm_util() returns the cpu util relative to a specific capacity, + * i.e. it's busy ratio, in the range [0..SCHED_CAPACITY_SCALE] which is useful + * for energy calculations. Using the scale-invariant util returned by + * cpu_util() and approximating scale-invariant util by: + * + * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time + * + * the normalized util can be found using the specific capacity. + * + * capacity = capacity_orig * curr_freq/max_freq + * + * norm_util = running_time/time ~ util/capacity + */ +static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) +{ + if (util >= capacity) + return SCHED_CAPACITY_SCALE; + + return (util << SCHED_CAPACITY_SHIFT)/capacity; +} + +/* + * CPU candidates. + * + * These are labels to reference CPU candidates for an energy_diff. + * Currently we support only two possible candidates: the task's previous CPU + * and another candiate CPU. + * More advanced/aggressive EAS selection policies can consider more + * candidates. + */ +#define EAS_CPU_PRV 0 +#define EAS_CPU_NXT 1 +#define EAS_CPU_BKP 2 + +/* + * energy_diff - supports the computation of the estimated energy impact in + * moving a "task"'s "util_delta" between different CPU candidates. + */ +/* + * NOTE: When using or examining WALT task signals, all wakeup + * latency is included as busy time for task util. + * + * This is relevant here because: + * When debugging is enabled, it can take as much as 1ms to + * write the output to the trace buffer for each eenv + * scenario. For periodic tasks where the sleep time is of + * a similar order, the WALT task util can be inflated. 
+ * + * Further, and even without debugging enabled, + * task wakeup latency changes depending upon the EAS + * wakeup algorithm selected - FIND_BEST_TARGET only does + * energy calculations for up to 2 candidate CPUs. When + * NO_FIND_BEST_TARGET is configured, we can potentially + * do an energy calculation across all CPUS in the system. + * + * The impact to WALT task util on a Juno board + * running a periodic task which only sleeps for 200usec + * between 1ms activations has been measured. + * (i.e. the wakeup latency induced by energy calculation + * and debug output is double the desired sleep time and + * almost equivalent to the runtime which is more-or-less + * the worst case possible for this test) + * + * In this scenario, a task which has a PELT util of around + * 220 is inflated under WALT to have util around 400. + * + * This is simply a property of the way WALT includes + * wakeup latency in busy time while PELT does not. + * + * Hence - be careful when enabling DEBUG_EENV_DECISIONS + * expecially if WALT is the task signal. + */ +/*#define DEBUG_EENV_DECISIONS*/ + +#ifdef DEBUG_EENV_DECISIONS +/* max of 8 levels of sched groups traversed */ +#define EAS_EENV_DEBUG_LEVELS 16 + +struct _eenv_debug { + unsigned long cap; + unsigned long norm_util; + unsigned long cap_energy; + unsigned long idle_energy; + unsigned long this_energy; + unsigned long this_busy_energy; + unsigned long this_idle_energy; + cpumask_t group_cpumask; + unsigned long cpu_util[1]; +}; +#endif + +struct eenv_cpu { + /* CPU ID, must be in cpus_mask */ + int cpu_id; + + /* + * Index (into sched_group_energy::cap_states) of the OPP the + * CPU needs to run at if the task is placed on it. + * This includes the both active and blocked load, due to + * other tasks on this CPU, as well as the task's own + * utilization. + */ + int cap_idx; + int cap; + + /* Estimated system energy */ + unsigned long energy; + + /* Estimated energy variation wrt EAS_CPU_PRV */ + long nrg_delta; + +#ifdef DEBUG_EENV_DECISIONS + struct _eenv_debug *debug; + int debug_idx; +#endif /* DEBUG_EENV_DECISIONS */ +}; + +struct energy_env { + /* Utilization to move */ + struct task_struct *p; + unsigned long util_delta; + unsigned long util_delta_boosted; + + /* Mask of CPUs candidates to evaluate */ + cpumask_t cpus_mask; + + /* CPU candidates to evaluate */ + struct eenv_cpu *cpu; + int eenv_cpu_count; + +#ifdef DEBUG_EENV_DECISIONS + /* pointer to the memory block reserved + * for debug on this CPU - there will be + * sizeof(struct _eenv_debug) * + * (EAS_CPU_CNT * EAS_EENV_DEBUG_LEVELS) + * bytes allocated here. + */ + struct _eenv_debug *debug; +#endif + /* + * Index (into energy_env::cpu) of the morst energy efficient CPU for + * the specified energy_env::task + */ + int next_idx; + int max_cpu_count; + + /* Support data */ + struct sched_group *sg_top; + struct sched_group *sg_cap; + struct sched_group *sg; +}; + +/** + * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks + * @cpu: the CPU to get the utilization of + * + * The unit of the return value must be the one of capacity so we can compare + * the utilization with the capacity of the CPU that is available for CFS task + * (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. 
It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency, + * i.e. arch_scale_cpu_capacity(). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * The estimated utilization of a CPU is defined to be the maximum between its + * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks + * currently RUNNABLE on that CPU. + * This allows to properly represent the expected utilization of a CPU which + * has just got a big task running since a long sleep period. At the same time + * however it preserves the benefits of the "blocked utilization" in + * describing the potential for other tasks waking up on the same CPU. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + * + * Return: the (estimated) utilization for the specified CPU + */ +static inline unsigned long cpu_util(int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned int util; + +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) { + u64 walt_cpu_util = cpu_rq(cpu)->cumulative_runnable_avg; + + walt_cpu_util <<= SCHED_CAPACITY_SHIFT; + do_div(walt_cpu_util, walt_ravg_window); + + return min_t(unsigned long, walt_cpu_util, + capacity_orig_of(cpu)); + } +#endif + + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); + + if (sched_feat(UTIL_EST)) + util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + + return min_t(unsigned long, util, capacity_orig_of(cpu)); +} + +static inline unsigned long cpu_util_rt(int cpu) +{ + struct rt_rq *rt_rq = &(cpu_rq(cpu)->rt); + + return rt_rq->avg.util_avg; +} + +static inline unsigned long cpu_util_freq(int cpu) +{ +#ifdef CONFIG_SCHED_WALT + u64 walt_cpu_util; + + if (unlikely(walt_disabled || !sysctl_sched_use_walt_cpu_util)) { + return min(cpu_util(cpu) + cpu_util_rt(cpu), + capacity_orig_of(cpu)); + } + + walt_cpu_util = cpu_rq(cpu)->prev_runnable_sum; + walt_cpu_util <<= SCHED_CAPACITY_SHIFT; + do_div(walt_cpu_util, walt_ravg_window); + + return min_t(unsigned long, walt_cpu_util, capacity_orig_of(cpu)); +#else + return min(cpu_util(cpu) + cpu_util_rt(cpu), capacity_orig_of(cpu)); +#endif +} + +/* + * cpu_util_without: compute cpu utilization without any contributions from *p + * @cpu: the CPU which utilization is requested + * @p: the task which utilization should be discounted + * + * The utilization of a CPU is defined by the utilization of tasks currently + * enqueued on that CPU as well as tasks which are currently sleeping after an + * execution on that CPU. 
+ * + * This method returns the utilization of the specified CPU by discounting the + * utilization of the specified task, whenever the task is currently + * contributing to the CPU utilization. + */ +static unsigned long cpu_util_without(int cpu, struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + unsigned int util; + +#ifdef CONFIG_SCHED_WALT + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. + */ + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) + return cpu_util(cpu); +#endif + + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_util(cpu); + + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); + + /* Discount task's util from CPU's util */ + util -= min_t(unsigned int, util, task_util(p)); + + /* + * Covered cases: + * + * a) if *p is the only task sleeping on this CPU, then: + * cpu_util (== task_util) > util_est (== 0) + * and thus we return: + * cpu_util_without = (cpu_util - task_util) = 0 + * + * b) if other tasks are SLEEPING on this CPU, which is now exiting + * IDLE, then: + * cpu_util >= task_util + * cpu_util > util_est (== 0) + * and thus we discount *p's blocked utilization to return: + * cpu_util_without = (cpu_util - task_util) >= 0 + * + * c) if other tasks are RUNNABLE on that CPU and + * util_est > cpu_util + * then we use util_est since it returns a more restrictive + * estimation of the spare capacity on that CPU, by just + * considering the expected utilization of tasks already + * runnable on that CPU. + * + * Cases a) and b) are covered by the above code, while case c) is + * covered by the following code when estimated utilization is + * enabled. + */ + if (sched_feat(UTIL_EST)) { + unsigned int estimated = + READ_ONCE(cfs_rq->avg.util_est.enqueued); + + /* + * Despite the following checks we still have a small window + * for a possible race, when an execl's select_task_rq_fair() + * races with LB's detach_task(): + * + * detach_task() + * p->on_rq = TASK_ON_RQ_MIGRATING; + * ---------------------------------- A + * deactivate_task() \ + * dequeue_task() + RaceTime + * util_est_dequeue() / + * ---------------------------------- B + * + * The additional check on "current == p" it's required to + * properly fix the execl regression and it helps in further + * reducing the chances for the above race. + */ + if (unlikely(task_on_rq_queued(p) || current == p)) { + estimated -= min_t(unsigned int, estimated, + (_task_util_est(p) | UTIL_AVG_UNCHANGED)); + } + util = max(util, estimated); + } + + /* + * Utilization (estimated) can exceed the CPU capacity, thus let's + * clamp to the maximum CPU capacity to ensure consistency with + * the cpu_util call. + */ + return min_t(unsigned long, util, capacity_orig_of(cpu)); +} + +static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx) +{ + unsigned long max_util = 0; + unsigned long util; + int cpu; + + for_each_cpu(cpu, sched_group_span(eenv->sg_cap)) { + util = cpu_util_without(cpu, eenv->p); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. 
+ */ + if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id)) + util += eenv->util_delta_boosted; + + max_util = max(max_util, util); + } + + return max_util; +} + +/* + * group_norm_util() returns the approximated group util relative to it's + * current capacity (busy ratio) in the range [0..SCHED_CAPACITY_SCALE] for use + * in energy calculations. Since task executions may or may not overlap in time + * in the group the true normalized util is between max(cpu_norm_util(i)) and + * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The + * latter is used as the estimate as it leads to a more pessimistic energy + * estimate (more busy). + */ +static unsigned +long group_norm_util(struct energy_env *eenv, int cpu_idx) +{ + unsigned long capacity = eenv->cpu[cpu_idx].cap; + unsigned long util, util_sum = 0; + int cpu; + + for_each_cpu(cpu, sched_group_span(eenv->sg)) { + util = cpu_util_without(cpu, eenv->p); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id)) + util += eenv->util_delta; + + util_sum += __cpu_norm_util(util, capacity); + } + + if (util_sum > SCHED_CAPACITY_SCALE) + return SCHED_CAPACITY_SCALE; + return util_sum; +} + +static int find_new_capacity(struct energy_env *eenv, int cpu_idx) +{ + const struct sched_group_energy *sge = eenv->sg_cap->sge; + unsigned long util = group_max_util(eenv, cpu_idx); + int idx, cap_idx; + + cap_idx = sge->nr_cap_states - 1; + + for (idx = 0; idx < sge->nr_cap_states; idx++) { + if (sge->cap_states[idx].cap >= util) { + cap_idx = idx; + break; + } + } + /* Keep track of SG's capacity */ + eenv->cpu[cpu_idx].cap = sge->cap_states[cap_idx].cap; + eenv->cpu[cpu_idx].cap_idx = cap_idx; + + return cap_idx; +} + +static int group_idle_state(struct energy_env *eenv, int cpu_idx) +{ + struct sched_group *sg = eenv->sg; + int src_in_grp, dst_in_grp; + int i, state = INT_MAX; + int max_idle_state_idx; + long grp_util = 0; + int new_state; + + /* Find the shallowest idle state in the sched group. */ + for_each_cpu(i, sched_group_span(sg)) + state = min(state, idle_get_state_idx(cpu_rq(i))); + + /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ + state++; + /* + * Try to estimate if a deeper idle state is + * achievable when we move the task. + */ + for_each_cpu(i, sched_group_span(sg)) + grp_util += cpu_util(i); + + src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id, + sched_group_span(sg)); + dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id, + sched_group_span(sg)); + if (src_in_grp == dst_in_grp) { + /* + * both CPUs under consideration are in the same group or not in + * either group, migration should leave idle state the same. + */ + return state; + } + /* + * add or remove util as appropriate to indicate what group util + * will be (worst case - no concurrent execution) after moving the task + */ + grp_util += src_in_grp ? -eenv->util_delta : eenv->util_delta; + + if (grp_util > + ((long)sg->sgc->max_capacity * (int)sg->group_weight)) { + /* + * After moving, the group will be fully occupied + * so assume it will not be idle at all. + */ + return 0; + } + + /* + * after moving, this group is at most partly + * occupied, so it should have some idle time. 
+ */ + max_idle_state_idx = sg->sge->nr_idle_states - 2; + new_state = grp_util * max_idle_state_idx; + if (grp_util <= 0) { + /* group will have no util, use lowest state */ + new_state = max_idle_state_idx + 1; + } else { + /* + * for partially idle, linearly map util to idle + * states, excluding the lowest one. This does not + * correspond to the state we expect to enter in + * reality, but an indication of what might happen. + */ + new_state = min_t(int, max_idle_state_idx, + new_state / sg->sgc->max_capacity); + new_state = max_idle_state_idx - new_state; + } + return new_state; +} + +#ifdef DEBUG_EENV_DECISIONS +static struct _eenv_debug *eenv_debug_entry_ptr(struct _eenv_debug *base, int idx); + +static void store_energy_calc_debug_info(struct energy_env *eenv, int cpu_idx, int cap_idx, int idle_idx) +{ + int debug_idx = eenv->cpu[cpu_idx].debug_idx; + unsigned long sg_util, busy_energy, idle_energy; + const struct sched_group_energy *sge; + struct _eenv_debug *dbg; + int cpu; + + if (debug_idx < EAS_EENV_DEBUG_LEVELS) { + sge = eenv->sg->sge; + sg_util = group_norm_util(eenv, cpu_idx); + busy_energy = sge->cap_states[cap_idx].power; + busy_energy *= sg_util; + idle_energy = SCHED_CAPACITY_SCALE - sg_util; + idle_energy *= sge->idle_states[idle_idx].power; + /* should we use sg_cap or sg? */ + dbg = eenv_debug_entry_ptr(eenv->cpu[cpu_idx].debug, debug_idx); + dbg->cap = sge->cap_states[cap_idx].cap; + dbg->norm_util = sg_util; + dbg->cap_energy = sge->cap_states[cap_idx].power; + dbg->idle_energy = sge->idle_states[idle_idx].power; + dbg->this_energy = busy_energy + idle_energy; + dbg->this_busy_energy = busy_energy; + dbg->this_idle_energy = idle_energy; + + cpumask_copy(&dbg->group_cpumask, + sched_group_span(eenv->sg)); + + for_each_cpu(cpu, &dbg->group_cpumask) + dbg->cpu_util[cpu] = cpu_util(cpu); + + eenv->cpu[cpu_idx].debug_idx = debug_idx+1; + } +} +#else +#define store_energy_calc_debug_info(a,b,c,d) {} +#endif /* DEBUG_EENV_DECISIONS */ + +/* + * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg). + * + * This works in iterations to compute the SG's energy for each CPU + * candidate defined by the energy_env's cpu array. + */ +static void calc_sg_energy(struct energy_env *eenv) +{ + struct sched_group *sg = eenv->sg; + unsigned long busy_energy, idle_energy; + unsigned int busy_power, idle_power; + unsigned long total_energy = 0; + unsigned long sg_util; + int cap_idx, idle_idx; + int cpu_idx; + + for (cpu_idx = EAS_CPU_PRV; cpu_idx < eenv->max_cpu_count; ++cpu_idx) { + if (eenv->cpu[cpu_idx].cpu_id == -1) + continue; + + /* Compute ACTIVE energy */ + cap_idx = find_new_capacity(eenv, cpu_idx); + busy_power = sg->sge->cap_states[cap_idx].power; + sg_util = group_norm_util(eenv, cpu_idx); + busy_energy = sg_util * busy_power; + + /* Compute IDLE energy */ + idle_idx = group_idle_state(eenv, cpu_idx); + idle_power = sg->sge->idle_states[idle_idx].power; + idle_energy = SCHED_CAPACITY_SCALE - sg_util; + idle_energy *= idle_power; + + total_energy = busy_energy + idle_energy; + eenv->cpu[cpu_idx].energy += total_energy; + + store_energy_calc_debug_info(eenv, cpu_idx, cap_idx, idle_idx); + } +} + +/* + * compute_energy() computes the absolute variation in energy consumption by + * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT. + * + * NOTE: compute_energy() may fail when racing with sched_domain updates, in + * which case we abort by returning -EINVAL. 
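Aside: the per-group estimate that calc_sg_energy() accumulates above reduces to norm_util * busy_power + (SCHED_CAPACITY_SCALE - norm_util) * idle_power, in the unscaled units of the platform energy model. A small sketch with invented capacity-state and idle-state power numbers (they do not come from any real sched_group_energy table):

/* Illustration only, not part of the patch: the busy/idle split used by
 * calc_sg_energy() for one sched_group and one candidate CPU. */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024

int main(void)
{
	unsigned long busy_power = 500;	/* assumed cap_states[cap_idx].power   */
	unsigned long idle_power = 20;	/* assumed idle_states[idle_idx].power */
	unsigned long sg_util = 400;	/* group_norm_util(), range 0..1024    */

	unsigned long busy_energy = sg_util * busy_power;
	unsigned long idle_energy = (SCHED_CAPACITY_SCALE - sg_util) * idle_power;

	/* 200000 + 12480; callers only compare deltas of these sums */
	printf("group energy = %lu\n", busy_energy + idle_energy);
	return 0;
}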
+ */ +static int compute_energy(struct energy_env *eenv) +{ + struct sched_domain *sd; + int cpu; + struct cpumask visit_cpus; + struct sched_group *sg; + int cpu_count; + + WARN_ON(!eenv->sg_top->sge); + + cpumask_copy(&visit_cpus, sched_group_span(eenv->sg_top)); + /* If a cpu is hotplugged in while we are in this function, it does + * not appear in the existing visit_cpus mask which came from the + * sched_group pointer of the sched_domain pointed at by sd_ea for + * either the prev or next cpu and was dereferenced in + * select_energy_cpu_idx. + * Since we will dereference sd_scs later as we iterate through the + * CPUs we expect to visit, new CPUs can be present which are not in + * the visit_cpus mask. Guard this with cpu_count. + */ + cpu_count = cpumask_weight(&visit_cpus); + while (!cpumask_empty(&visit_cpus)) { + struct sched_group *sg_shared_cap = NULL; + + cpu = cpumask_first(&visit_cpus); + + /* + * Is the group utilization affected by cpus outside this + * sched_group? + * This sd may have groups with cpus which were not present + * when we took visit_cpus. + */ + sd = rcu_dereference(per_cpu(sd_scs, cpu)); + if (sd && sd->parent) + sg_shared_cap = sd->parent->groups; + + for_each_domain(cpu, sd) { + sg = sd->groups; + + /* Has this sched_domain already been visited? */ + if (sd->child && group_first_cpu(sg) != cpu) + break; + + do { + eenv->sg_cap = sg; + if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) + eenv->sg_cap = sg_shared_cap; + + /* + * Compute the energy for all the candidate + * CPUs in the current visited SG. + */ + eenv->sg = sg; + calc_sg_energy(eenv); + + /* remove CPUs we have just visited */ + if (!sd->child) { + /* + * cpu_count here is the number of + * cpus we expect to visit in this + * calculation. If we race against + * hotplug, we can have extra cpus + * added to the groups we are + * iterating which do not appear in + * the visit_cpus mask. In that case + * we are not able to calculate energy + * without restarting so we will bail + * out and use prev_cpu this time. 
+ */ + if (!cpu_count) + return -EINVAL; + cpumask_xor(&visit_cpus, &visit_cpus, sched_group_span(sg)); + cpu_count--; + } + + if (cpumask_equal(sched_group_span(sg), sched_group_span(eenv->sg_top)) && + sd->child) + goto next_cpu; + + } while (sg = sg->next, sg != sd->groups); + } +next_cpu: + cpumask_clear_cpu(cpu, &visit_cpus); + continue; + } + + return 0; +} + +static inline bool cpu_in_sg(struct sched_group *sg, int cpu) +{ + return cpu != -1 && cpumask_test_cpu(cpu, sched_group_span(sg)); +} + +#ifdef DEBUG_EENV_DECISIONS +static void dump_eenv_debug(struct energy_env *eenv) +{ + int cpu_idx, grp_idx; + char cpu_utils[(NR_CPUS*12)+10]="cpu_util: "; + char cpulist[64]; + + trace_printk("eenv scenario: task=%p %s task_util=%lu prev_cpu=%d", + eenv->p, eenv->p->comm, eenv->util_delta, eenv->cpu[EAS_CPU_PRV].cpu_id); + + for (cpu_idx=EAS_CPU_PRV; cpu_idx < eenv->max_cpu_count; cpu_idx++) { + if (eenv->cpu[cpu_idx].cpu_id == -1) + continue; + trace_printk("---Scenario %d: Place task on cpu %d energy=%lu (%d debug logs at %p)", + cpu_idx+1, eenv->cpu[cpu_idx].cpu_id, + eenv->cpu[cpu_idx].energy >> SCHED_CAPACITY_SHIFT, + eenv->cpu[cpu_idx].debug_idx, + eenv->cpu[cpu_idx].debug); + for (grp_idx = 0; grp_idx < eenv->cpu[cpu_idx].debug_idx; grp_idx++) { + struct _eenv_debug *debug; + int cpu, written=0; + + debug = eenv_debug_entry_ptr(eenv->cpu[cpu_idx].debug, grp_idx); + cpu = scnprintf(cpulist, sizeof(cpulist), "%*pbl", cpumask_pr_args(&debug->group_cpumask)); + + cpu_utils[0] = 0; + /* print out the relevant cpu_util */ + for_each_cpu(cpu, &(debug->group_cpumask)) { + char tmp[64]; + if (written > sizeof(cpu_utils)-10) { + cpu_utils[written]=0; + break; + } + written += snprintf(tmp, sizeof(tmp), "cpu%d(%lu) ", cpu, debug->cpu_util[cpu]); + strcat(cpu_utils, tmp); + } + /* trace the data */ + trace_printk(" | %s : cap=%lu nutil=%lu, cap_nrg=%lu, idle_nrg=%lu energy=%lu busy_energy=%lu idle_energy=%lu %s", + cpulist, debug->cap, debug->norm_util, + debug->cap_energy, debug->idle_energy, + debug->this_energy >> SCHED_CAPACITY_SHIFT, + debug->this_busy_energy >> SCHED_CAPACITY_SHIFT, + debug->this_idle_energy >> SCHED_CAPACITY_SHIFT, + cpu_utils); + + } + trace_printk("---"); + } + trace_printk("----- done"); + return; +} +#else +#define dump_eenv_debug(a) {} +#endif /* DEBUG_EENV_DECISIONS */ +/* + * select_energy_cpu_idx(): estimate the energy impact of changing the + * utilization distribution. + * + * The eenv parameter specifies the changes: utilization amount and a + * collection of possible CPU candidates. The number of candidates + * depends upon the selection algorithm used. + * + * If find_best_target was used to select candidate CPUs, there will + * be at most 3 including prev_cpu. If not, we used a brute force + * selection which will provide the union of: + * * CPUs belonging to the highest sd which is not overutilized + * * CPUs the task is allowed to run on + * * online CPUs + * + * This function returns the index of a CPU candidate specified by the + * energy_env which corresponds to the most energy efficient CPU. + * Thus, 0 (EAS_CPU_PRV) means that non of the CPU candidate is more energy + * efficient than running on prev_cpu. This is also the value returned in case + * of abort due to error conditions during the computations. The only + * exception to this if we fail to access the energy model via sd_ea, where + * we return -1 with the intent of asking the system to use a different + * wakeup placement algorithm. 
+ * + * A value greater than zero means that the most energy efficient CPU is the + * one represented by eenv->cpu[eenv->next_idx].cpu_id. + */ +static inline int select_energy_cpu_idx(struct energy_env *eenv) +{ + int last_cpu_idx = eenv->max_cpu_count - 1; + struct sched_domain *sd; + struct sched_group *sg; + int sd_cpu = -1; + int cpu_idx; + int margin; + + sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id; + sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); + if (!sd) + return -1; + + cpumask_clear(&eenv->cpus_mask); + for (cpu_idx = EAS_CPU_PRV; cpu_idx < eenv->max_cpu_count; ++cpu_idx) { + int cpu = eenv->cpu[cpu_idx].cpu_id; + + if (cpu < 0) + continue; + cpumask_set_cpu(cpu, &eenv->cpus_mask); + } + + sg = sd->groups; + do { + /* Skip SGs which do not contains a candidate CPU */ + if (!cpumask_intersects(&eenv->cpus_mask, sched_group_span(sg))) + continue; + + eenv->sg_top = sg; + if (compute_energy(eenv) == -EINVAL) + return EAS_CPU_PRV; + } while (sg = sg->next, sg != sd->groups); + /* remember - eenv energy values are unscaled */ + + /* + * Compute the dead-zone margin used to prevent too many task + * migrations with negligible energy savings. + * An energy saving is considered meaningful if it reduces the energy + * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56% + */ + margin = eenv->cpu[EAS_CPU_PRV].energy >> 6; + + /* + * By default the EAS_CPU_PRV CPU is considered the most energy + * efficient, with a 0 energy variation. + */ + eenv->next_idx = EAS_CPU_PRV; + eenv->cpu[EAS_CPU_PRV].nrg_delta = 0; + + dump_eenv_debug(eenv); + + /* + * Compare the other CPU candidates to find a CPU which can be + * more energy efficient then EAS_CPU_PRV + */ + if (sched_feat(FBT_STRICT_ORDER)) + last_cpu_idx = EAS_CPU_BKP; + + for(cpu_idx = EAS_CPU_NXT; cpu_idx <= last_cpu_idx; cpu_idx++) { + if (eenv->cpu[cpu_idx].cpu_id < 0) + continue; + eenv->cpu[cpu_idx].nrg_delta = + eenv->cpu[cpu_idx].energy - + eenv->cpu[EAS_CPU_PRV].energy; + + /* filter energy variations within the dead-zone margin */ + if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin) + eenv->cpu[cpu_idx].nrg_delta = 0; + /* update the schedule candidate with min(nrg_delta) */ + if (eenv->cpu[cpu_idx].nrg_delta < + eenv->cpu[eenv->next_idx].nrg_delta) { + eenv->next_idx = cpu_idx; + /* break out if we want to stop on first saving candidate */ + if (sched_feat(FBT_STRICT_ORDER)) + break; + } + } + + return eenv->next_idx; +} + +/* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * * A waker of many should wake a different task than the one last awakened @@ -5360,15 +6614,18 @@ static void record_wakee(struct task_struct *p) * whatever is irrelevant, spread criteria is apparent partner count exceeds * socket size. 
*/ -static int wake_wide(struct task_struct *p) +static int wake_wide(struct task_struct *p, int sibling_count_hint) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int llc_size = this_cpu_read(sd_llc_size); + + if (sibling_count_hint >= llc_size) + return 1; if (master < slave) swap(master, slave); - if (slave < factor || master < slave * factor) + if (slave < llc_size || master < slave * llc_size) return 0; return 1; } @@ -5454,17 +6711,112 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return affine; } -static inline int task_util(struct task_struct *p); -static int cpu_util_wake(int cpu, struct task_struct *p); +#ifdef CONFIG_SCHED_TUNE +struct reciprocal_value schedtune_spc_rdiv; + +static long +schedtune_margin(unsigned long signal, long boost) +{ + long long margin = 0; + + /* + * Signal proportional compensation (SPC) + * + * The Boost (B) value is used to compute a Margin (M) which is + * proportional to the complement of the original Signal (S): + * M = B * (SCHED_CAPACITY_SCALE - S) + * The obtained M could be used by the caller to "boost" S. + */ + if (boost >= 0) { + margin = SCHED_CAPACITY_SCALE - signal; + margin *= boost; + } else + margin = -signal * boost; + + margin = reciprocal_divide(margin, schedtune_spc_rdiv); + + if (boost < 0) + margin *= -1; + return margin; +} + +static inline int +schedtune_cpu_margin(unsigned long util, int cpu) +{ + int boost = schedtune_cpu_boost(cpu); + + if (boost == 0) + return 0; + + return schedtune_margin(util, boost); +} + +static inline long +schedtune_task_margin(struct task_struct *task) +{ + int boost = schedtune_task_boost(task); + unsigned long util; + long margin; + + if (boost == 0) + return 0; + + util = task_util_est(task); + margin = schedtune_margin(util, boost); + + return margin; +} + +#else /* CONFIG_SCHED_TUNE */ + +static inline int +schedtune_cpu_margin(unsigned long util, int cpu) +{ + return 0; +} + +static inline int +schedtune_task_margin(struct task_struct *task) +{ + return 0; +} + +#endif /* CONFIG_SCHED_TUNE */ + +unsigned long +boosted_cpu_util(int cpu) +{ + unsigned long util = cpu_util_freq(cpu); + long margin = schedtune_cpu_margin(util, cpu); + + trace_sched_boost_cpu(cpu, util, margin); + + return util + margin; +} + +static inline unsigned long +boosted_task_util(struct task_struct *task) +{ + unsigned long util = task_util_est(task); + long margin = schedtune_task_margin(task); + + trace_sched_boost_task(task, util, margin); + + return util + margin; +} + +static unsigned long cpu_util_without(int cpu, struct task_struct *p); -static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +static unsigned long capacity_spare_without(int cpu, struct task_struct *p) { - return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); + return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0); } /* * find_idlest_group finds and returns the least busy CPU group within the * domain. + * + * Assumes p is allowed on at least one CPU in sd. 
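Aside: schedtune_margin() above implements Signal Proportional Compensation: for a positive boost the margin is boost% of the headroom left above the current signal, so an already highly utilized task receives a proportionally smaller boost. A worked sketch using plain division in place of reciprocal_divide(schedtune_spc_rdiv); the util and boost values are invented:

/* Illustration only, not part of the patch: SPC margin for boost in
 * [-100..100], mirroring schedtune_margin() with ordinary division. */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024

static long spc_margin(unsigned long util, long boost)
{
	if (boost >= 0)
		return (long)(SCHED_CAPACITY_SCALE - util) * boost / 100;
	return (long)util * boost / 100;	/* negative margin for negative boost */
}

int main(void)
{
	unsigned long util = 300;	/* assumed task utilization     */
	long boost = 20;		/* assumed 20% schedtune boost  */

	/* (1024 - 300) * 20 / 100 = 144, so the boosted util is 444 */
	printf("boosted util = %ld\n", (long)util + spc_margin(util, boost));
	return 0;
}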
*/ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, @@ -5472,8 +6824,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0; - unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0; + unsigned long min_runnable_load = ULONG_MAX; + unsigned long this_runnable_load = ULONG_MAX; + unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; @@ -5516,7 +6869,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); - spare_cap = capacity_spare_wake(i, p); + spare_cap = capacity_spare_without(i, p); if (spare_cap > max_spare_cap) max_spare_cap = spare_cap; @@ -5594,10 +6947,10 @@ skip_spare: } /* - * find_idlest_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -5646,6 +6999,53 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, + int cpu, int prev_cpu, int sd_flag) +{ + int new_cpu = cpu; + + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + return prev_cpu; + + while (sd) { + struct sched_group *group; + struct sched_domain *tmp; + int weight; + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_group_cpu(group, p, cpu); + if (new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return new_cpu; +} + #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -5829,7 +7229,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t /* * Try and locate an idle core/thread in the LLC cache domain. */ -static int select_idle_sibling(struct task_struct *p, int prev, int target) +static inline int __select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; int i; @@ -5862,61 +7262,408 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; } -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). 
- * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static int cpu_util(int cpu) +static inline int select_idle_sibling_cstate_aware(struct task_struct *p, int prev, int target) +{ + struct sched_domain *sd; + struct sched_group *sg; + int best_idle_cpu = -1; + int best_idle_cstate = -1; + int best_idle_capacity = INT_MAX; + int i; + + /* + * Iterate the domains and find an elegible idle cpu. + */ + sd = rcu_dereference(per_cpu(sd_llc, target)); + for_each_lower_domain(sd) { + sg = sd->groups; + do { + if (!cpumask_intersects( + sched_group_span(sg), &p->cpus_allowed)) + goto next; + + for_each_cpu_and(i, &p->cpus_allowed, sched_group_span(sg)) { + int idle_idx; + unsigned long new_usage; + unsigned long capacity_orig; + + if (!idle_cpu(i)) + goto next; + + /* figure out if the task can fit here at all */ + new_usage = boosted_task_util(p); + capacity_orig = capacity_orig_of(i); + + if (new_usage > capacity_orig) + goto next; + + /* if the task fits without changing OPP and we + * intended to use this CPU, just proceed + */ + if (i == target && new_usage <= capacity_curr_of(target)) { + return target; + } + + /* otherwise select CPU with shallowest idle state + * to reduce wakeup latency. + */ + idle_idx = idle_get_state_idx(cpu_rq(i)); + + if (idle_idx < best_idle_cstate && + capacity_orig <= best_idle_capacity) { + best_idle_cpu = i; + best_idle_cstate = idle_idx; + best_idle_capacity = capacity_orig; + } + } + next: + sg = sg->next; + } while (sg != sd->groups); + } + + if (best_idle_cpu >= 0) + target = best_idle_cpu; + + return target; +} + +static int select_idle_sibling(struct task_struct *p, int prev, int target) { - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); + if (!sysctl_sched_cstate_aware) + return __select_idle_sibling(p, prev, target); - return (util >= capacity) ? capacity : util; + return select_idle_sibling_cstate_aware(p, prev, target); } -static inline int task_util(struct task_struct *p) +static inline int task_fits_capacity(struct task_struct *p, long capacity) { - return p->se.avg.util_avg; + return capacity * 1024 > boosted_task_util(p) * capacity_margin; } -/* - * cpu_util_wake: Compute cpu utilization with any contributions from - * the waking task p removed. 
- */ -static int cpu_util_wake(int cpu, struct task_struct *p) +static int start_cpu(bool boosted) { - unsigned long util, capacity; + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - /* Task has no contribution or is new */ - if (cpu != task_cpu(p) || !p->se.avg.last_update_time) - return cpu_util(cpu); + return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; +} - capacity = capacity_orig_of(cpu); - util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); +static inline int find_best_target(struct task_struct *p, int *backup_cpu, + bool boosted, bool prefer_idle) +{ + unsigned long min_util = boosted_task_util(p); + unsigned long target_capacity = ULONG_MAX; + unsigned long min_wake_util = ULONG_MAX; + unsigned long target_max_spare_cap = 0; + unsigned long target_util = ULONG_MAX; + unsigned long best_active_util = ULONG_MAX; + int best_idle_cstate = INT_MAX; + struct sched_domain *sd; + struct sched_group *sg; + int best_active_cpu = -1; + int best_idle_cpu = -1; + int target_cpu = -1; + int cpu, i; - return (util >= capacity) ? capacity : util; + *backup_cpu = -1; + + /* + * In most cases, target_capacity tracks capacity_orig of the most + * energy efficient CPU candidate, thus requiring to minimise + * target_capacity. For these cases target_capacity is already + * initialized to ULONG_MAX. + * However, for prefer_idle and boosted tasks we look for a high + * performance CPU, thus requiring to maximise target_capacity. In this + * case we initialise target_capacity to 0. + */ + if (prefer_idle && boosted) + target_capacity = 0; + + /* Find start CPU based on boost value */ + cpu = start_cpu(boosted); + if (cpu < 0) + return -1; + + /* Find SD for the start CPU */ + sd = rcu_dereference(per_cpu(sd_ea, cpu)); + if (!sd) + return -1; + + /* Scan CPUs in all SDs */ + sg = sd->groups; + do { + for_each_cpu_and(i, &p->cpus_allowed, sched_group_span(sg)) { + unsigned long capacity_curr = capacity_curr_of(i); + unsigned long capacity_orig = capacity_orig_of(i); + unsigned long wake_util, new_util; + long spare_cap; + int idle_idx = INT_MAX; + + if (!cpu_online(i)) + continue; + + if (walt_cpu_high_irqload(i)) + continue; + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + wake_util = cpu_util_without(i, p); + new_util = wake_util + task_util_est(p); + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + new_util = max(min_util, new_util); + if (new_util > capacity_orig) + continue; + + /* + * Pre-compute the maximum possible capacity we expect + * to have available on this CPU once the task is + * enqueued here. + */ + spare_cap = capacity_orig - new_util; + + if (idle_cpu(i)) + idle_idx = idle_get_state_idx(cpu_rq(i)); + + + /* + * Case A) Latency sensitive tasks + * + * Unconditionally favoring tasks that prefer idle CPU to + * improve latency. + * + * Looking for: + * - an idle CPU, whatever its idle_state is, since + * the first CPUs we explore are more likely to be + * reserved for latency sensitive tasks. + * - a non idle CPU where the task fits in its current + * capacity and has the maximum spare capacity. + * - a non idle CPU with lower contention from other + * tasks and running at the lowest possible OPP. 
+ * + * The last two goals try to favor a non idle CPU + * where the task can run as if it is "almost alone". + * A maximum spare capacity CPU is favoured since + * the task already fits into that CPU's capacity + * without waiting for an OPP chance. + * + * The following code path is the only one in the CPUs + * exploration loop which is always used by + * prefer_idle tasks. It exits the loop with either a + * best_active_cpu or a target_cpu which should + * represent an optimal choice for latency sensitive + * tasks. + */ + if (prefer_idle) { + + /* + * Case A.1: IDLE CPU + * Return the best IDLE CPU we find: + * - for boosted tasks: the CPU with the highest + * performance (i.e. biggest capacity_orig) + * - for !boosted tasks: the most energy + * efficient CPU (i.e. smallest capacity_orig) + */ + if (idle_cpu(i)) { + if (boosted && + capacity_orig < target_capacity) + continue; + if (!boosted && + capacity_orig > target_capacity) + continue; + if (capacity_orig == target_capacity && + sysctl_sched_cstate_aware && + best_idle_cstate <= idle_idx) + continue; + + target_capacity = capacity_orig; + best_idle_cstate = idle_idx; + best_idle_cpu = i; + continue; + } + if (best_idle_cpu != -1) + continue; +
+ /* + * Case A.2: Target ACTIVE CPU + * Favor CPUs with max spare capacity. + */ + if (capacity_curr > new_util && + spare_cap > target_max_spare_cap) { + target_max_spare_cap = spare_cap; + target_cpu = i; + continue; + } + if (target_cpu != -1) + continue; + + + /* + * Case A.3: Backup ACTIVE CPU + * Favor CPUs with: + * - lower utilization due to other tasks + * - lower utilization with the task in + */ + if (wake_util > min_wake_util) + continue; + if (new_util > best_active_util) + continue; + min_wake_util = wake_util; + best_active_util = new_util; + best_active_cpu = i; + continue; + } +
+ /* + * Enforce EAS mode + * + * For non latency sensitive tasks, skip CPUs that + * will be overutilized by moving the task there. + * + * The goal here is to remain in EAS mode as long as + * possible at least for !prefer_idle tasks. + */ + if ((new_util * capacity_margin) > + (capacity_orig * SCHED_CAPACITY_SCALE)) + continue; + + /* + * Favor CPUs with smaller capacity for non latency + * sensitive tasks. + */ + if (capacity_orig > target_capacity) + continue; +
+ /* + * Case B) Non latency sensitive tasks on IDLE CPUs. + * + * Find an optimal backup IDLE CPU for non latency + * sensitive tasks. + * + * Looking for: + * - minimizing the capacity_orig, + * i.e. preferring LITTLE CPUs + * - favoring shallowest idle states + * i.e. avoid waking up deep-idle CPUs + * + * The following code path is used by non latency + * sensitive tasks if IDLE CPUs are available. If at + * least one such CPU is available, it sets the + * best_idle_cpu to the most suitable idle CPU to be + * selected. + * + * If idle CPUs are available, favour these CPUs to + * improve performance by spreading tasks. + * Indeed, the energy_diff() computed by the caller + * will take care to ensure the minimization of energy + * consumption without affecting performance. + */ + if (idle_cpu(i)) { + /* + * Skip CPUs in deeper idle state, but only + * if they are also less energy efficient. + * IOW, prefer a deep IDLE LITTLE CPU vs a + * shallow idle big CPU.
+ */ + if (capacity_orig == target_capacity && + sysctl_sched_cstate_aware && + best_idle_cstate <= idle_idx) + continue; + + target_capacity = capacity_orig; + best_idle_cstate = idle_idx; + best_idle_cpu = i; + continue; + } +
+ /* + * Case C) Non latency sensitive tasks on ACTIVE CPUs. + * + * Pack tasks in the most energy efficient capacities. + * + * This task packing strategy prefers more energy + * efficient CPUs (i.e. pack on smaller maximum + * capacity CPUs) while also trying to spread tasks to + * run them all at the lower OPP. + * + * This assumes for example that it's more energy + * efficient to run two tasks on two CPUs at a lower + * OPP than packing both on a single CPU but running + * that CPU at a higher OPP. + * + * Thus, this case keeps track of the CPU with the + * smallest maximum capacity and highest spare maximum + * capacity. + */ + + /* Favor CPUs with maximum spare capacity */ + if (capacity_orig == target_capacity && + spare_cap < target_max_spare_cap) + continue; + + target_max_spare_cap = spare_cap; + target_capacity = capacity_orig; + target_util = new_util; + target_cpu = i; + } + + } while (sg = sg->next, sg != sd->groups); +
+ /* + * For non latency sensitive tasks, cases B and C in the previous loop, + * we pick the best IDLE CPU only if we were not able to find a target + * ACTIVE CPU. + * + * Policy priorities: + * + * - prefer_idle tasks: + * + * a) IDLE CPU available: best_idle_cpu + * b) ACTIVE CPU where task fits and has the biggest maximum spare + * capacity (i.e. target_cpu) + * c) ACTIVE CPU with less contention due to other tasks + * (i.e. best_active_cpu) + * + * - NON prefer_idle tasks: + * + * a) ACTIVE CPU: target_cpu + * b) IDLE CPU: best_idle_cpu + */ + + if (prefer_idle && (best_idle_cpu != -1)) { + trace_sched_find_best_target(p, prefer_idle, min_util, cpu, + best_idle_cpu, best_active_cpu, + best_idle_cpu); + + return best_idle_cpu; + } + + if (target_cpu == -1) + target_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; + else + *backup_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; + + trace_sched_find_best_target(p, prefer_idle, min_util, cpu, + best_idle_cpu, best_active_cpu, + target_cpu); + + /* it is possible for target and backup + * to select the same CPU - if so, drop the backup + */ + if (*backup_cpu == target_cpu) + *backup_cpu = -1; + + return target_cpu; } /*
@@ -5930,8 +7677,11 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) { long min_cap, max_cap; + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 0; + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); - max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val; /* Minimum capacity is close to max, no need to abort wake_affine */ if (max_cap - min_cap < max_cap >> 3) @@ -5940,7 +7690,297 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) /* Bring task utilization in sync with prev_cpu */ sync_entity_load_avg(&p->se); - return min_cap * 1024 < task_util(p) * capacity_margin; + return !task_fits_capacity(p, min_cap); +} + +static bool cpu_overutilized(int cpu) +{ + return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); +} + +DEFINE_PER_CPU(struct energy_env, eenv_cache); + +/* kernels often have NR_CPUS defined to be much + * larger than exist in practice on booted systems. + * Allocate the cpu array for eenv calculations + * at boot time to avoid massive overprovisioning.
+ */ +#ifdef DEBUG_EENV_DECISIONS +static inline int eenv_debug_size_per_dbg_entry(void) +{ + return sizeof(struct _eenv_debug) + (sizeof(unsigned long) * num_possible_cpus()); +} + +static inline int eenv_debug_size_per_cpu_entry(void) +{ + /* each cpu struct has an array of _eenv_debug structs + * which have an array of unsigned longs at the end - + * the allocation should be extended so that there are + * at least 'num_possible_cpus' entries in the array. + */ + return EAS_EENV_DEBUG_LEVELS * eenv_debug_size_per_dbg_entry(); +} +/* given a per-_eenv_cpu debug env ptr, get the ptr for a given index */ +static inline struct _eenv_debug *eenv_debug_entry_ptr(struct _eenv_debug *base, int idx) +{ + char *ptr = (char *)base; + ptr += (idx * eenv_debug_size_per_dbg_entry()); + return (struct _eenv_debug *)ptr; +} +/* given a pointer to the per-cpu global copy of _eenv_debug, get + * a pointer to the specified _eenv_cpu debug env. + */ +static inline struct _eenv_debug *eenv_debug_percpu_debug_env_ptr(struct _eenv_debug *base, int cpu_idx) +{ + char *ptr = (char *)base; + ptr += (cpu_idx * eenv_debug_size_per_cpu_entry()); + return (struct _eenv_debug *)ptr; +} + +static inline int eenv_debug_size(void) +{ + return num_possible_cpus() * eenv_debug_size_per_cpu_entry(); +} +#endif + +static inline void alloc_eenv(void) +{ + int cpu; + int cpu_count = num_possible_cpus(); + + for_each_possible_cpu(cpu) { + struct energy_env *eenv = &per_cpu(eenv_cache, cpu); + eenv->cpu = kmalloc(sizeof(struct eenv_cpu) * cpu_count, GFP_KERNEL); + eenv->eenv_cpu_count = cpu_count; +#ifdef DEBUG_EENV_DECISIONS + eenv->debug = (struct _eenv_debug *)kmalloc(eenv_debug_size(), GFP_KERNEL); +#endif + } +} + +static inline void reset_eenv(struct energy_env *eenv) +{ + int cpu_count; + struct eenv_cpu *cpu; +#ifdef DEBUG_EENV_DECISIONS + struct _eenv_debug *debug; + int cpu_idx; + debug = eenv->debug; +#endif + + cpu_count = eenv->eenv_cpu_count; + cpu = eenv->cpu; + memset(eenv, 0, sizeof(struct energy_env)); + eenv->cpu = cpu; + memset(eenv->cpu, 0, sizeof(struct eenv_cpu)*cpu_count); + eenv->eenv_cpu_count = cpu_count; + +#ifdef DEBUG_EENV_DECISIONS + memset(debug, 0, eenv_debug_size()); + eenv->debug = debug; + for(cpu_idx = 0; cpu_idx < eenv->eenv_cpu_count; cpu_idx++) + eenv->cpu[cpu_idx].debug = eenv_debug_percpu_debug_env_ptr(debug, cpu_idx); +#endif +} +/* + * get_eenv - reset the eenv struct cached for this CPU + * + * When the eenv is returned, it is configured to do + * energy calculations for the maximum number of CPUs + * the task can be placed on. The prev_cpu entry is + * filled in here. Callers are responsible for adding + * other CPU candidates up to eenv->max_cpu_count. 
+ */ +static inline struct energy_env *get_eenv(struct task_struct *p, int prev_cpu) +{ + struct energy_env *eenv; + cpumask_t cpumask_possible_cpus; + int cpu = smp_processor_id(); + int i; + + eenv = &(per_cpu(eenv_cache, cpu)); + reset_eenv(eenv); + + /* populate eenv */ + eenv->p = p; + /* use boosted task util for capacity selection + * during energy calculation, but unboosted task + * util for group utilization calculations + */ + eenv->util_delta = task_util_est(p); + eenv->util_delta_boosted = boosted_task_util(p); + + cpumask_and(&cpumask_possible_cpus, &p->cpus_allowed, cpu_online_mask); + eenv->max_cpu_count = cpumask_weight(&cpumask_possible_cpus); + + for (i=0; i < eenv->max_cpu_count; i++) + eenv->cpu[i].cpu_id = -1; + eenv->cpu[EAS_CPU_PRV].cpu_id = prev_cpu; + eenv->next_idx = EAS_CPU_PRV; + + return eenv; +} + +/* + * Needs to be called inside rcu_read_lock critical section. + * sd is a pointer to the sched domain we wish to use for an + * energy-aware placement option. + */ +static int find_energy_efficient_cpu(struct sched_domain *sd, + struct task_struct *p, + int cpu, int prev_cpu, + int sync) +{ + int use_fbt = sched_feat(FIND_BEST_TARGET); + int cpu_iter, eas_cpu_idx = EAS_CPU_NXT; + int target_cpu = -1; + struct energy_env *eenv; + + if (sysctl_sched_sync_hint_enable && sync) { + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { + return cpu; + } + } + + /* prepopulate energy diff environment */ + eenv = get_eenv(p, prev_cpu); + if (eenv->max_cpu_count < 2) + return -1; + + if(!use_fbt) { + /* + * using this function outside wakeup balance will not supply + * an sd ptr. Instead, fetch the highest level with energy data. + */ + if (!sd) + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); + + for_each_cpu_and(cpu_iter, &p->cpus_allowed, sched_domain_span(sd)) { + unsigned long spare; + + /* prev_cpu already in list */ + if (cpu_iter == prev_cpu) + continue; + + /* + * Consider only CPUs where the task is expected to + * fit without making the CPU overutilized. + */ + spare = capacity_spare_without(cpu_iter, p); + if (spare * 1024 < capacity_margin * task_util_est(p)) + continue; + + /* Add CPU candidate */ + eenv->cpu[eas_cpu_idx++].cpu_id = cpu_iter; + eenv->max_cpu_count = eas_cpu_idx; + + /* stop adding CPUs if we have no space left */ + if (eas_cpu_idx >= eenv->eenv_cpu_count) + break; + } + } else { + int boosted = (schedtune_task_boost(p) > 0); + int prefer_idle; + + /* + * give compiler a hint that if sched_features + * cannot be changed, it is safe to optimise out + * all if(prefer_idle) blocks. + */ + prefer_idle = sched_feat(EAS_PREFER_IDLE) ? + (schedtune_prefer_idle(p) > 0) : 0; + + eenv->max_cpu_count = EAS_CPU_BKP + 1; + + /* Find a cpu with sufficient capacity */ + target_cpu = find_best_target(p, &eenv->cpu[EAS_CPU_BKP].cpu_id, + boosted, prefer_idle); + + /* Immediately return a found idle CPU for a prefer_idle task */ + if (prefer_idle && target_cpu >= 0 && idle_cpu(target_cpu)) + return target_cpu; + + /* Place target into NEXT slot */ + eenv->cpu[EAS_CPU_NXT].cpu_id = target_cpu; + + /* take note if no backup was found */ + if (eenv->cpu[EAS_CPU_BKP].cpu_id < 0) + eenv->max_cpu_count = EAS_CPU_BKP; + + /* take note if no target was found */ + if (eenv->cpu[EAS_CPU_NXT].cpu_id < 0) + eenv->max_cpu_count = EAS_CPU_NXT; + } + + if (eenv->max_cpu_count == EAS_CPU_NXT) { + /* + * we did not find any energy-awareness + * candidates beyond prev_cpu, so we will + * fall-back to the regular slow-path. 
+ */ + return -1; + } + + /* find most energy-efficient CPU */ + target_cpu = select_energy_cpu_idx(eenv) < 0 ? -1 : + eenv->cpu[eenv->next_idx].cpu_id; + + return target_cpu; +} + +static inline bool nohz_kick_needed(struct rq *rq, bool only_update); +static void nohz_balancer_kick(bool only_update); + +/* + * wake_energy: Make the decision if we want to use an energy-aware + * wakeup task placement or not. This is limited to situations where + * we cannot use energy-awareness right now. + * + * Returns TRUE if we should attempt energy-aware wakeup, FALSE if not. + * + * Should only be called from select_task_rq_fair inside the RCU + * read-side critical section. + */ +static inline int wake_energy(struct task_struct *p, int prev_cpu, + int sd_flag, int wake_flags) +{ + struct sched_domain *sd = NULL; + int sync = wake_flags & WF_SYNC; + + sd = rcu_dereference_sched(cpu_rq(prev_cpu)->sd); + + /* + * Check all definite no-energy-awareness conditions + */ + if (!sd) + return false; + + if (!energy_aware()) + return false; + + if (sd_overutilized(sd)) + return false; + + /* + * we cannot do energy-aware wakeup placement sensibly + * for tasks with 0 utilization, so let them be placed + * according to the normal strategy. + * However if fbt is in use we may still benefit from + * the heuristics we use there in selecting candidate + * CPUs. + */ + if (unlikely(!sched_feat(FIND_BEST_TARGET) && !task_util_est(p))) + return false; + + if(!sched_feat(EAS_PREFER_IDLE)){ + /* + * Force prefer-idle tasks into the slow path, this may not happen + * if none of the sd flags matched. + */ + if (schedtune_prefer_idle(p) > 0 && !sync) + return false; + } + return true; } /* @@ -5956,24 +7996,31 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, + int sibling_count_hint) { - struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; + struct sched_domain *tmp, *affine_sd = NULL; + struct sched_domain *sd = NULL, *energy_sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; + int want_energy = 0; int sync = wake_flags & WF_SYNC; + rcu_read_lock(); + if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, &p->cpus_allowed); + want_energy = wake_energy(p, prev_cpu, sd_flag, wake_flags); + want_affine = !want_energy && + !wake_wide(p, sibling_count_hint) && + !wake_cap(p, cpu, prev_cpu) && + cpumask_test_cpu(cpu, &p->cpus_allowed); } - rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) - break; + continue; /* * If both cpu and prev_cpu are part of this domain, @@ -5985,9 +8032,22 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f break; } + /* + * If we are able to try an energy-aware wakeup, + * select the highest non-overutilized sched domain + * which includes this cpu and prev_cpu + * + * maybe want to not test prev_cpu and only consider + * the current one? 
+ */ + if (want_energy && + !sd_overutilized(tmp) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) + energy_sd = tmp; + if (tmp->flags & sd_flag) sd = tmp; - else if (!want_affine) + else if (!(want_affine || want_energy)) break; } @@ -6000,47 +8060,38 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = cpu; } + if (sd && !(sd_flag & SD_BALANCE_FORK)) { + /* + * We're going to need the task's util for capacity_spare_without + * in find_idlest_group. Sync it up to prev_cpu's + * last_update_time. + */ + sync_entity_load_avg(&p->se); + } + if (!sd) { - pick_cpu: +pick_cpu: if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } else while (sd) { - struct sched_group *group; - int weight; - - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } + } else { + if (energy_sd) + new_cpu = find_energy_efficient_cpu(energy_sd, p, cpu, prev_cpu, sync); - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ + /* if we did an energy-aware placement and had no choices available + * then fall back to the default find_idlest_cpu choice + */ + if (!energy_sd || (energy_sd && new_cpu == -1)) + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } + rcu_read_unlock(); +#ifdef CONFIG_NO_HZ_COMMON + if (nohz_kick_needed(cpu_rq(new_cpu), true)) + nohz_balancer_kick(true); +#endif + return new_cpu; } @@ -6355,6 +8406,8 @@ again: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + update_misfit_status(p, rq); + return p; simple: #endif @@ -6372,9 +8425,12 @@ simple: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + update_misfit_status(p, rq); + return p; idle: + update_misfit_status(NULL, rq); new_tasks = idle_balance(rq, rf); /* @@ -6580,6 +8636,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -6598,6 +8661,7 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; @@ -6608,6 +8672,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type src_grp_type; struct list_head tasks; }; @@ -6787,13 +8852,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * detach_task() -- detach the task for the migration specified in env */ -static void detach_task(struct task_struct *p, struct lb_env *env) +static void detach_task(struct task_struct *p, struct lb_env *env, + struct rq_flags *rf) { lockdep_assert_held(&env->src_rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); + rq_unpin_lock(env->src_rq, rf); + double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + double_unlock_balance(env->src_rq, 
env->dst_rq); + rq_repin_lock(env->src_rq, rf); } /* @@ -6802,7 +8872,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env) * * Returns a task if successful and NULL otherwise. */ -static struct task_struct *detach_one_task(struct lb_env *env) +static struct task_struct *detach_one_task(struct lb_env *env, + struct rq_flags *rf) { struct task_struct *p, *n; @@ -6812,7 +8883,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) if (!can_migrate_task(p, env)) continue; - detach_task(p, env); + detach_task(p, env, rf); /* * Right now, this is only the second place where @@ -6834,7 +8905,7 @@ static const unsigned int sched_nr_migrate_break = 32; * * Returns number of detached tasks if successful and 0 otherwise. */ -static int detach_tasks(struct lb_env *env) +static int detach_tasks(struct lb_env *env, struct rq_flags *rf) { struct list_head *tasks = &env->src_rq->cfs_tasks; struct task_struct *p; @@ -6879,7 +8950,7 @@ static int detach_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - detach_task(p, env); + detach_task(p, env, rf); list_add(&p->se.group_node, &env->tasks); detached++; @@ -6997,6 +9068,10 @@ static void update_blocked_averages(int cpu) if (se && !skip_blocked_update(se)) update_load_avg(se, 0); } + update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0); +#ifdef CONFIG_NO_HZ_COMMON + rq->last_blocked_load_update_tick = jiffies; +#endif rq_unlock_irqrestore(rq, &rf); } @@ -7056,6 +9131,10 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0); +#ifdef CONFIG_NO_HZ_COMMON + rq->last_blocked_load_update_tick = jiffies; +#endif rq_unlock_irqrestore(rq, &rf); } @@ -7067,12 +9146,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -7088,6 +9161,8 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + /* A cpu has a task too big for its capacity */ + unsigned long group_misfit_task_load; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -7104,6 +9179,7 @@ struct sd_lb_stats { unsigned long total_running; unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ + unsigned long total_util; /* Total util of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -7124,6 +9200,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .total_running = 0UL, .total_load = 0UL, .total_capacity = 0UL, + .total_util = 0UL, .busiest_stat = { .avg_load = 0UL, .sum_nr_running = 0, @@ -7187,13 +9264,47 @@ static unsigned long scale_rt_capacity(int cpu) return 1; } +void init_max_cpu_capacity(struct max_cpu_capacity *mcc) +{ + raw_spin_lock_init(&mcc->lock); + mcc->val = 0; + mcc->cpu = -1; +} + static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; + struct max_cpu_capacity *mcc; + unsigned long max_capacity; + int max_cap_cpu; + unsigned long flags; 
cpu_rq(cpu)->cpu_capacity_orig = capacity; + capacity *= arch_scale_max_freq_capacity(sd, cpu); + capacity >>= SCHED_CAPACITY_SHIFT; + + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; + + raw_spin_lock_irqsave(&mcc->lock, flags); + max_capacity = mcc->val; + max_cap_cpu = mcc->cpu; + + if ((max_capacity > capacity && max_cap_cpu == cpu) || + max_capacity < capacity) { + mcc->val = capacity; + mcc->cpu = cpu; +#ifdef CONFIG_SCHED_DEBUG + raw_spin_unlock_irqrestore(&mcc->lock, flags); + printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n", + cpu, capacity); + goto skip_unlock; +#endif + } + raw_spin_unlock_irqrestore(&mcc->lock, flags); + +skip_unlock: __attribute__ ((unused)); capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; @@ -7203,13 +9314,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, min_capacity; + unsigned long capacity, min_capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -7223,6 +9335,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; min_capacity = ULONG_MAX; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -7253,6 +9366,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } min_capacity = min(capacity, min_capacity); + max_capacity = max(capacity, max_capacity); } } else { /* @@ -7266,12 +9380,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity += sgc->capacity; min_capacity = min(sgc->min_capacity, min_capacity); + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = min_capacity; + sdg->sgc->max_capacity = max_capacity; } /* @@ -7367,16 +9483,40 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) } /* - * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller * per-CPU capacity than sched_group ref. */ static inline bool -group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { return sg->sgc->min_capacity * capacity_margin < ref->sgc->min_capacity * 1024; } +/* + * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity_orig than sched_group ref. + */ +static inline bool +group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity * capacity_margin < + ref->sgc->max_capacity * 1024; +} + +/* + * group_similar_cpu_capacity: Returns true if the minimum capacity of the + * compared groups differ by less than 12.5%. 
+ */ +static inline bool +group_similar_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + long diff = sg->sgc->min_capacity - ref->sgc->min_capacity; + long max = max(sg->sgc->min_capacity, ref->sgc->min_capacity); + + return abs(diff) < max >> 3; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -7387,6 +9527,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task_load) + return group_misfit_task; + return group_other; } @@ -7397,12 +9540,13 @@ group_type group_classify(struct sched_group *group, * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. - * @overload: Indicate more than one runnable task for any CPU. + * @overload: Indicate pullable load (e.g. >1 runnable task). + * @overutilized: Indicate overutilization for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload) + bool *overload, bool *overutilized, bool *misfit_task) { unsigned long load; int i, nr_running; @@ -7436,6 +9580,20 @@ static inline void update_sg_lb_stats(struct lb_env *env, */ if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; + + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load < rq->misfit_task_load) { + sgs->group_misfit_task_load = rq->misfit_task_load; + *overload = 1; + } + + + if (cpu_overutilized(i)) { + *overutilized = true; + + if (rq->misfit_task_load) + *misfit_task = true; + } } /* Adjust by relative CPU capacity of the group */ @@ -7471,6 +9629,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* + * Don't try to pull misfit tasks we can't help. + * We can use max_capacity here as reduction in capacity on some + * cpus in the group should either be possible to resolve + * internally or be covered by avg_load imbalance (eventually). + */ + if (sgs->group_type == group_misfit_task && + (!group_smaller_max_cpu_capacity(sg, sds->local) || + !group_has_capacity(env, &sds->local_stat))) + return false; + if (sgs->group_type > busiest->group_type) return true; @@ -7490,7 +9659,23 @@ static bool update_sd_pick_busiest(struct lb_env *env, * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_cpu_capacity(sds->local, sg)) + group_smaller_min_cpu_capacity(sds->local, sg)) + return false; + + /* + * Candidate sg doesn't face any severe imbalance issues so + * don't disturb unless the groups are of similar capacity + * where balancing is more harmless. + */ + if (sgs->group_type == group_other && + !group_similar_cpu_capacity(sds->local, sg)) + return false; + + /* + * If we have more than one misfit sg go with the biggest misfit. 
+ */ + if (sgs->group_type == group_misfit_task && + sgs->group_misfit_task_load < busiest->group_misfit_task_load) return false; asym_packing: @@ -7550,6 +9735,18 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) } #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_NO_HZ_COMMON +static struct { + cpumask_var_t idle_cpus_mask; + atomic_t nr_cpus; + unsigned long next_balance; /* in jiffy units */ + unsigned long next_update; /* in jiffy units */ +} nohz ____cacheline_aligned; +#endif + +#define lb_sd_parent(sd) \ + (sd->parent && sd->parent->groups != sd->parent->groups->next) + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. @@ -7561,11 +9758,33 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int load_idx, prefer_sibling = 0; - bool overload = false; + int load_idx; + bool overload = false, overutilized = false, misfit_task = false; + bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; - if (child && child->flags & SD_PREFER_SIBLING) - prefer_sibling = 1; +#ifdef CONFIG_NO_HZ_COMMON + if (env->idle == CPU_NEWLY_IDLE) { + int cpu; + + /* Update the stats of NOHZ idle CPUs in the sd */ + for_each_cpu_and(cpu, sched_domain_span(env->sd), + nohz.idle_cpus_mask) { + struct rq *rq = cpu_rq(cpu); + + /* ... Unless we've already done since the last tick */ + if (time_after(jiffies, + rq->last_blocked_load_update_tick)) + update_blocked_averages(cpu); + } + } + /* + * If we've just updated all of the NOHZ idle CPUs, then we can push + * back the next nohz.next_update, which will prevent an unnecessary + * wakeup for the nohz stats kick + */ + if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) + nohz.next_update = jiffies + LOAD_AVG_PERIOD; +#endif load_idx = get_sd_load_idx(env->sd, env->idle); @@ -7584,7 +9803,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + &overload, &overutilized, + &misfit_task); if (local_group) goto next_group; @@ -7616,6 +9836,7 @@ next_group: sds->total_running += sgs->sum_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; + sds->total_util += sgs->group_util; sg = sg->next; } while (sg != env->sd->groups); @@ -7623,11 +9844,52 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); - if (!env->sd->parent) { + env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; + + if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ - if (env->dst_rq->rd->overload != overload) - env->dst_rq->rd->overload = overload; + if (READ_ONCE(env->dst_rq->rd->overload) != overload) + WRITE_ONCE(env->dst_rq->rd->overload, overload); } + + if (overutilized) + set_sd_overutilized(env->sd); + else + clear_sd_overutilized(env->sd); + + /* + * If there is a misfit task in one cpu in this sched_domain + * it is likely that the imbalance cannot be sorted out among + * the cpu's in this sched_domain. In this case set the + * overutilized flag at the parent sched_domain. + */ + if (misfit_task) { + struct sched_domain *sd = env->sd->parent; + + /* + * In case of a misfit task, load balance at the parent + * sched domain level will make sense only if the the cpus + * have a different capacity. 
If cpus at a domain level have + * the same capacity, the misfit task cannot be well + * accommodated in any of the cpus and there is no point in + * trying a load balance at this level. + */ + while (sd) { + if (sd->flags & SD_ASYM_CPUCAPACITY) { + set_sd_overutilized(sd); + break; + } + sd = sd->parent; + } + } +
+ /* + * If the domain util is greater than domain capacity, load balancing + * needs to be done at the next sched domain level as well. + */ + if (lb_sd_parent(env->sd) && + sds->total_capacity * 1024 < sds->total_util * capacity_margin) + set_sd_overutilized(env->sd->parent); } /**
@@ -7743,7 +10005,22 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) capa_move /= SCHED_CAPACITY_SCALE; /* Move if we gain throughput */ - if (capa_move > capa_now) + if (capa_move > capa_now) { + env->imbalance = busiest->load_per_task; + return; + } + + /* We can't see throughput improvement with the load-based + * method, but it is possible depending upon group size and + * capacity range that there might still be an underutilized + * cpu available in an asymmetric capacity system. Do one last + * check just in case. + */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + busiest->group_type == group_overloaded && + busiest->sum_nr_running > busiest->group_weight && + local->sum_nr_running < local->group_weight && + local->group_capacity < busiest->group_capacity) + env->imbalance = busiest->load_per_task; }
@@ -7776,8 +10053,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * factors in sg capacity and sgs with smaller group_type are * skipped when updating the busiest sg: */ - if (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load) { + if (busiest->group_type != group_misfit_task && + (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load)) { env->imbalance = 0; return fix_small_imbalance(env, sds); }
@@ -7811,6 +10089,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow a misfit task to be balanced. + * Always do this if we are doing a NEWLY_IDLE balance + * on the assumption that any tasks we have must not be + * long-running (and hence we cannot rely upon load). + * However if we are not idle, we should assume the tasks + * we have are longer running and not override load-based + * calculations above unless we are sure that the local + * group is underutilized. + */ + if (busiest->group_type == group_misfit_task && + (env->idle == CPU_NEWLY_IDLE || + local->sum_nr_running < local->group_weight)) { + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task_load); + } + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -7846,6 +10140,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * this level.
*/ update_sd_lb_stats(env, &sds); + + if (energy_aware() && !sd_overutilized(env->sd)) + goto out_balanced; + local = &sds.local_stat; busiest = &sds.busiest_stat; @@ -7869,11 +10167,18 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + /* + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group + * capacities from resulting in underutilization due to avg_load. + */ + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -7911,6 +10216,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ + env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return sds.busiest; @@ -7958,8 +10264,31 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we simply + * seek the "biggest" misfit task. + */ + if (env->src_grp_type == group_misfit_task) { + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + continue; + } + capacity = capacity_of(i); + /* + * For ASYM_CPUCAPACITY domains, don't pick a cpu that could + * eventually lead to active_balancing high->low capacity. + * Higher per-cpu capacity is considered better than balancing + * average load. + */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + capacity_of(env->dst_cpu) < capacity && + rq->nr_running == 1) + continue; + wl = weighted_cpuload(rq); /* @@ -8027,6 +10356,21 @@ static int need_active_balance(struct lb_env *env) return 1; } + if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) && + env->src_rq->cfs.h_nr_running == 1 && + cpu_overutilized(env->src_cpu) && + !cpu_overutilized(env->dst_cpu)) { + return 1; + } + + if (env->src_grp_type == group_misfit_task) + return 1; + + if (env->src_grp_type == group_overloaded && + env->src_rq->misfit_task_load) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -8079,7 +10423,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; - struct sched_domain *sd_parent = sd->parent; + struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL; struct sched_group *group; struct rq *busiest; struct rq_flags rf; @@ -8145,7 +10489,7 @@ more_balance: * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations */ - cur_ld_moved = detach_tasks(&env); + cur_ld_moved = detach_tasks(&env, &rf); /* * We've detached some tasks from busiest_rq. Every @@ -8245,7 +10589,8 @@ more_balance: * excessive cache_hot migrations and active balances. 
*/ if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; + if (env.src_grp_nr_running > 1) + sd->nr_balance_failed++; if (need_active_balance(&env)) { unsigned long flags; @@ -8351,6 +10696,7 @@ static inline unsigned long get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) { unsigned long interval = sd->balance_interval; + unsigned int cpu; if (cpu_busy) interval *= sd->busy_factor; @@ -8359,6 +10705,24 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) interval = msecs_to_jiffies(interval); interval = clamp(interval, 1UL, max_load_balance_interval); + /* + * check if sched domain is marked as overutilized + * we ought to only do this on systems which have SD_ASYMCAPACITY + * but we want to do it for all sched domains in those systems + * So for now, just check if overutilized as a proxy. + */ + /* + * If we are overutilized and we have a misfit task, then + * we want to balance as soon as practically possible, so + * we return an interval of zero. + */ + if (energy_aware() && sd_overutilized(sd)) { + /* we know the root is overutilized, let's check for a misfit task */ + for_each_cpu(cpu, sched_domain_span(sd)) { + if (cpu_rq(cpu)->misfit_task_load) + return 1; + } + } return interval; } @@ -8408,7 +10772,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + !READ_ONCE(this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) @@ -8426,8 +10790,12 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) int continue_balancing = 1; u64 t0, domain_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) + if (!(sd->flags & SD_LOAD_BALANCE)) { + if (time_after_eq(jiffies, + sd->groups->sgc->next_update)) + update_group_capacity(sd, this_cpu); continue; + } if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); @@ -8558,7 +10926,7 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd->alb_count); update_rq_clock(busiest_rq); - p = detach_one_task(&env); + p = detach_one_task(&env, &rf); if (p) { schedstat_inc(sd->alb_pushed); /* Active balancing done, reset the failure counter. */ @@ -8592,11 +10960,6 @@ static inline int on_null_domain(struct rq *rq) * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ -static struct { - cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; - unsigned long next_balance; /* in jiffy units */ -} nohz ____cacheline_aligned; static inline int find_new_ilb(void) { @@ -8613,7 +10976,7 @@ static inline int find_new_ilb(void) * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). */ -static void nohz_balancer_kick(void) +static void nohz_balancer_kick(bool only_update) { int ilb_cpu; @@ -8626,6 +10989,10 @@ static void nohz_balancer_kick(void) if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) return; + + if (only_update) + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu)); + /* * Use smp_send_reschedule() instead of resched_cpu(). 
* This way we generate a sched IPI on the target cpu which @@ -8713,6 +11080,8 @@ void nohz_balance_enter_idle(int cpu) atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } +#else +static inline void nohz_balancer_kick(bool only_update) {} #endif static DEFINE_SPINLOCK(balancing); @@ -8744,8 +11113,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) int need_serialize, need_decay = 0; u64 max_cost = 0; - update_blocked_averages(cpu); - rcu_read_lock(); for_each_domain(cpu, sd) { /* @@ -8760,9 +11127,16 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) } max_cost += sd->max_newidle_lb_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) + if (energy_aware() && !sd_overutilized(sd)) continue; + if (!(sd->flags & SD_LOAD_BALANCE)) { + if (time_after_eq(jiffies, + sd->groups->sgc->next_update)) + update_group_capacity(sd, cpu); + continue; + } + /* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more @@ -8844,6 +11218,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { int this_cpu = this_rq->cpu; struct rq *rq; + struct sched_domain *sd; int balance_cpu; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; @@ -8853,6 +11228,23 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) goto end; + /* + * This cpu is going to update the blocked load of idle CPUs either + * before doing a rebalancing or just to keep metrics up to date. we + * can safely update the next update timestamp + */ + rcu_read_lock(); + sd = rcu_dereference(this_rq->sd); + /* + * Check whether there is a sched_domain available for this cpu. + * The last other cpu can have been unplugged since the ILB has been + * triggered and the sched_domain can now be null. The idle balance + * sequence will quickly be aborted as there is no more idle CPUs + */ + if (sd) + nohz.next_update = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD); + rcu_read_unlock(); + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; @@ -8879,7 +11271,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) cpu_load_update_idle(rq); rq_unlock_irq(rq, &rf); - rebalance_domains(rq, CPU_IDLE); + update_blocked_averages(balance_cpu); + /* + * This idle load balance softirq may have been + * triggered only to update the blocked load and shares + * of idle CPUs (which we have just done for + * balance_cpu). In that case skip the actual balance. + */ + if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_cpu))) + rebalance_domains(rq, idle); } if (time_after(next_balance, rq->next_balance)) { @@ -8910,7 +11310,7 @@ end: * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. 
*/ -static inline bool nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq, bool only_update) { unsigned long now = jiffies; struct sched_domain_shared *sds; @@ -8918,7 +11318,7 @@ static inline bool nohz_kick_needed(struct rq *rq) int nr_busy, i, cpu = rq->cpu; bool kick = false; - if (unlikely(rq->idle_balance)) + if (unlikely(rq->idle_balance) && !only_update) return false; /* @@ -8935,15 +11335,27 @@ static inline bool nohz_kick_needed(struct rq *rq) if (likely(!atomic_read(&nohz.nr_cpus))) return false; + if (only_update) { + if (time_before(now, nohz.next_update)) + return false; + else + return true; + } + if (time_before(now, nohz.next_balance)) return false; - if (rq->nr_running >= 2) + if (rq->nr_running >= 2 && + (!energy_aware() || cpu_overutilized(cpu))) return true; + /* Do idle load balance if there have misfit task */ + if (energy_aware()) + return rq->misfit_task_load; + rcu_read_lock(); sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - if (sds) { + if (sds && !energy_aware()) { /* * XXX: write a coherent comment on why we do this. * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com @@ -8984,6 +11396,7 @@ unlock: } #else static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } +static inline bool nohz_kick_needed(struct rq *rq, bool only_update) { return false; } #endif /* @@ -9005,7 +11418,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) * and abort nohz_idle_balance altogether if we pull some load. */ nohz_idle_balance(this_rq, idle); + update_blocked_averages(this_rq->cpu); +#ifdef CONFIG_NO_HZ_COMMON + if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu))) + rebalance_domains(this_rq, idle); + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu)); +#else rebalance_domains(this_rq, idle); +#endif } /* @@ -9020,8 +11440,8 @@ void trigger_load_balance(struct rq *rq) if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq)) - nohz_balancer_kick(); + if (nohz_kick_needed(rq, false)) + nohz_balancer_kick(false); #endif } @@ -9057,6 +11477,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + update_misfit_status(curr, rq); + + update_overutilized_status(rq); } /* @@ -9560,6 +11984,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_FAIR_GROUP_SCHED .task_change_group = task_change_group_fair, #endif +#ifdef CONFIG_SCHED_WALT + .fixup_cumulative_runnable_avg = + walt_fixup_cumulative_runnable_avg_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG @@ -9601,8 +12029,11 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; + nohz.next_update = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif + + alloc_eenv(); #endif /* SMP */ } |
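
The fit and overutilization tests introduced by this patch (task_fits_capacity(), cpu_overutilized(), the spare-capacity filter in find_energy_efficient_cpu() and the EAS-mode check in find_best_target()) all reduce to the same fixed-point comparison: a task or CPU "fits" as long as util * capacity_margin stays below capacity * SCHED_CAPACITY_SCALE. The stand-alone sketch below is not part of the patch and only illustrates that arithmetic; the value 1280 for capacity_margin (a 1.25x margin, i.e. roughly an 80% threshold), the helper name fits_capacity() and the example CPU capacities are assumptions made for the example, since capacity_margin is defined elsewhere in fair.c.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL
#define ASSUMED_CAPACITY_MARGIN	1280UL	/* assumed value of capacity_margin */

/* Same shape as the patch's task_fits_capacity()/cpu_overutilized() test. */
static int fits_capacity(unsigned long util, unsigned long capacity)
{
	return capacity * SCHED_CAPACITY_SCALE > util * ASSUMED_CAPACITY_MARGIN;
}

int main(void)
{
	/* Hypothetical LITTLE (capacity_orig 430) and big (capacity_orig 1024) CPUs. */
	unsigned long little = 430, big = 1024;
	unsigned long util;

	for (util = 100; util <= 900; util += 200)
		printf("util %4lu: fits LITTLE=%d fits big=%d\n",
		       util, fits_capacity(util, little), fits_capacity(util, big));

	/*
	 * With the assumed margin a task must stay below ~80% of a CPU's
	 * capacity: 430 * 1024 / 1280 = 344 and 1024 * 1024 / 1280 = 819.
	 */
	return 0;
}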