Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c      |  2
-rw-r--r--  kernel/sched/deadline.c  |  6
-rw-r--r--  kernel/sched/fair.c      | 16
-rw-r--r--  kernel/sched/pelt.c      | 88
-rw-r--r--  kernel/sched/pelt.h      | 27
-rw-r--r--  kernel/sched/rt.c        |  6
-rw-r--r--  kernel/sched/sched.h     |  2
7 files changed, 123 insertions, 24 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 625bc9897f62..84e5c4840a2b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -181,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 		update_irq_load_avg(rq, irq_delta + steal);
 #endif
+	update_rq_clock_pelt(rq, delta);
 }
 
 void update_rq_clock(struct rq *rq)
@@ -205,7 +206,6 @@ void update_rq_clock(struct rq *rq)
 	update_rq_clock_task(rq, delta);
 }
 
-
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Use HR-timers to deliver accurate preemption points.
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 997ea7b839fa..68cb4dc200fb 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1761,7 +1761,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	deadline_queue_push_tasks(rq);
 
 	if (rq->curr->sched_class != &dl_sched_class)
-		update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 
 	return p;
 }
@@ -1770,7 +1770,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
 {
 	update_curr_dl(rq);
 
-	update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
 	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
 }
@@ -1787,7 +1787,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
 {
 	update_curr_dl(rq);
 
-	update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
 	/*
 	 * Even when we have runtime, update_curr_dl() might have resulted in us
 	 * not being the leftmost task anymore. In that case NEED_RESCHED will
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0969ce333c8e..5677254d4abf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -764,7 +764,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
 			 * such that the next switched_to_fair() has the
 			 * expected state.
 			 */
-			se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+			se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
 			return;
 		}
 	}
@@ -3400,7 +3400,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 /* Update task and its cfs_rq load average */
 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	u64 now = cfs_rq_clock_task(cfs_rq);
+	u64 now = cfs_rq_clock_pelt(cfs_rq);
 	struct rq *rq = rq_of(cfs_rq);
 	int cpu = cpu_of(rq);
 	int decayed;
@@ -7285,7 +7285,7 @@ static void update_blocked_averages(int cpu)
 		if (throttled_hierarchy(cfs_rq))
 			continue;
 
-		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+		if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
 			update_tg_load_avg(cfs_rq, 0);
 
 		/* Propagate pending load changes to the parent, if any: */
@@ -7306,8 +7306,8 @@ static void update_blocked_averages(int cpu)
 	}
 
 	curr_class = rq->curr->sched_class;
-	update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
-	update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
 	update_irq_load_avg(rq, 0);
 	/* Don't need periodic decay once load/util_avg are null */
 	if (others_have_blocked(rq))
@@ -7377,11 +7377,11 @@ static inline void update_blocked_averages(int cpu)
 
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
-	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+	update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
 
 	curr_class = rq->curr->sched_class;
-	update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
-	update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
 	update_irq_load_avg(rq, 0);
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 35475c0c5419..48f4f07dcf8f 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -30,6 +30,72 @@
 #include "pelt.h"
 
 /*
+ * The clock_pelt scales the time to reflect the effective amount of
+ * computation done during the running delta time, but is then synced
+ * back to clock_task when the rq is idle.
+ *
+ *
+ * absolute time   | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
+ * @ max capacity  ------******---------------******---------------
+ * @ half capacity ------************---------************---------
+ * clock pelt      | 1| 2|    3|    4| 7| 8| 9|   10|   11|14|15|16
+ *
+ */
+void update_rq_clock_pelt(struct rq *rq, s64 delta)
+{
+	if (is_idle_task(rq->curr)) {
+		u32 divider = (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
+		u32 overload = rq->cfs.avg.util_sum + LOAD_AVG_MAX;
+		overload += rq->avg_rt.util_sum;
+		overload += rq->avg_dl.util_sum;
+
+		/*
+		 * Reflecting some stolen time makes sense only if the idle
+		 * phase would be present at max capacity. As soon as the
+		 * utilization of a rq has reached the maximum value, it is
+		 * considered as an always running rq without idle time to
+		 * steal. This potential idle time is considered as lost in
+		 * this case. We keep track of this lost idle time compared
+		 * to rq's clock_task.
+		 */
+		if (overload >= divider)
+			rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+
+		/* The rq is idle, we can sync to clock_task */
+		rq->clock_pelt = rq_clock_task(rq);
+	} else {
+		/*
+		 * When a rq runs at a lower compute capacity, it will need
+		 * more time to do the same amount of work than at max
+		 * capacity: either because it takes more time to compute the
+		 * same amount of work or because taking more time means
+		 * sharing the CPU more often between entities.
+		 * In order to be invariant, we scale the delta to reflect how
+		 * much work has really been done.
+		 * Running at lower capacity also means running longer to do
+		 * the same amount of work, and this results in stealing some
+		 * idle time that will disturb the load signal compared to
+		 * max capacity; this stolen idle time will be automatically
+		 * reflected when the rq becomes idle and the clock is synced
+		 * with rq_clock_task.
+		 */
+
+		/*
+		 * Scale the elapsed time to reflect the real amount of
+		 * computation.
+		 */
+		delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
+		delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+
+		rq->clock_pelt += delta;
+	}
+}
+
+/*
  * Approximate:
  *   val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
  */
@@ -106,16 +172,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
  *                     n=1
  */
 static __always_inline u32
-accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+accumulate_sum(u64 delta, struct sched_avg *sa,
 	       unsigned long load, unsigned long runnable, int running)
 {
-	unsigned long scale_freq, scale_cpu;
 	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
 	u64 periods;
 
-	scale_freq = arch_scale_freq_capacity(cpu);
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
 	delta += sa->period_contrib;
 	periods = delta / 1024; /* A period is 1024us (~1ms) */
 
@@ -137,13 +199,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
 	}
 	sa->period_contrib = delta;
 
-	contrib = cap_scale(contrib, scale_freq);
 	if (load)
 		sa->load_sum += load * contrib;
 	if (runnable)
 		sa->runnable_load_sum += runnable * contrib;
 	if (running)
-		sa->util_sum += contrib * scale_cpu;
+		sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
 
 	return periods;
 }
@@ -221,7 +282,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
 	 * Step 1: accumulate *_sum since last_update_time. If we haven't
 	 * crossed period boundaries, finish.
 	 */
-	if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
+	if (!accumulate_sum(delta, sa, load, runnable, running))
 		return 0;
 
 	return 1;
@@ -371,12 +432,21 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
 int update_irq_load_avg(struct rq *rq, u64 running)
 {
 	int ret = 0;
+
+	/*
+	 * We can't use clock_pelt because irq time is not accounted in
+	 * clock_task. Instead we directly scale the running time to
+	 * reflect the real amount of computation.
+	 */
+	running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
+	running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+
 	/*
 	 * We know the time that has been used by interrupt since last update
 	 * but we don't know when. Let's be pessimistic and assume that interrupt
 	 * has happened just before the update. This is not so far from reality
 	 * because interrupt will most probably wake up task and trigger an update
-	 * of rq clock during which the metric si updated.
+	 * of rq clock during which the metric is updated.
 	 * We start to decay with normal context time and then we add the
 	 * interrupt context time.
 	 * We can safely remove running from rq->clock because
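To make the scaling above concrete, here is a minimal user-space model of update_rq_clock_pelt() (a sketch, not kernel code: SCHED_CAPACITY_SHIFT and cap_scale() mirror the kernel's definitions, while struct toy_rq, the capacity arguments and the explicit idle flag are hypothetical stand-ins for real rq state):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10

/* Same shape as the kernel's cap_scale(): scale v by a capacity out of 1024. */
#define cap_scale(v, cap)	(((v) * (cap)) >> SCHED_CAPACITY_SHIFT)

struct toy_rq {
	uint64_t clock_task;	/* wall-clock task time, in us */
	uint64_t clock_pelt;	/* capacity-scaled time, in us */
};

static void toy_update_clock_pelt(struct toy_rq *rq, int64_t delta,
				  uint64_t freq_cap, uint64_t cpu_cap, bool idle)
{
	/* Simplification: clock_task is advanced here rather than separately. */
	rq->clock_task += delta;

	if (idle) {
		/* The rq is idle: sync clock_pelt back to clock_task. */
		rq->clock_pelt = rq->clock_task;
		return;
	}

	/* Scale the elapsed time to the amount of work actually done. */
	delta = cap_scale(delta, freq_cap);
	delta = cap_scale(delta, cpu_cap);
	rq->clock_pelt += delta;
}

int main(void)
{
	struct toy_rq rq = { 0, 0 };

	/* 6000us of runtime at half frequency on a full-capacity CPU. */
	toy_update_clock_pelt(&rq, 6000, 512, 1024, false);
	printf("running: task=%llu pelt=%llu\n",
	       (unsigned long long)rq.clock_task,
	       (unsigned long long)rq.clock_pelt);	/* 6000 vs 3000 */

	/* 2000us later the rq goes idle and clock_pelt catches up. */
	toy_update_clock_pelt(&rq, 2000, 512, 1024, true);
	printf("idle:    task=%llu pelt=%llu\n",
	       (unsigned long long)rq.clock_task,
	       (unsigned long long)rq.clock_pelt);	/* 8000 vs 8000 */
	return 0;
}

At half frequency, 6000us of wall-clock runtime contributes only 3000us of PELT time, matching the "@ half capacity" row of the diagram; the gap is closed rather than accumulated once the rq goes idle and the clocks are resynced.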
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index d2894db28955..b4ce173b2f9e 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -42,6 +42,29 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
 	WRITE_ONCE(avg->util_est.enqueued, enqueued);
 }
 
+void update_rq_clock_pelt(struct rq *rq, s64 delta);
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+	return rq->clock_pelt - rq->lost_idle_time;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+	if (unlikely(cfs_rq->throttle_count))
+		return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+
+	return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+}
+#else
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+	return rq_clock_pelt(rq_of(cfs_rq));
+}
+#endif
+
 #else
 
 static inline int
@@ -67,6 +90,10 @@ update_irq_load_avg(struct rq *rq, u64 running)
 {
 	return 0;
 }
+
+static inline void
+update_rq_clock_pelt(struct rq *rq, s64 delta) {}
+
 #endif
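The two helpers above can likewise be sketched as a self-contained model (all toy_* names are hypothetical; the logic mirrors rq_clock_pelt() and the CONFIG_CFS_BANDWIDTH variant of cfs_rq_clock_pelt()): the PELT clock a cfs_rq sees excludes both the idle time an overloaded rq is deemed to have lost and any time the cfs_rq spent throttled, and it stays frozen while the cfs_rq is currently throttled.

#include <stdint.h>
#include <stdio.h>

struct toy_rq {
	uint64_t clock_pelt;		/* capacity-scaled time */
	uint64_t lost_idle_time;	/* idle time of an overloaded rq */
};

struct toy_cfs_rq {
	struct toy_rq *rq;
	int throttle_count;			/* > 0 while throttled */
	uint64_t throttled_clock_task;		/* clock snapshot at throttle */
	uint64_t throttled_clock_task_time;	/* total time spent throttled */
};

/* Mirrors rq_clock_pelt(): lost idle time never feeds the PELT signals. */
static uint64_t toy_rq_clock_pelt(const struct toy_rq *rq)
{
	return rq->clock_pelt - rq->lost_idle_time;
}

/* Mirrors the CONFIG_CFS_BANDWIDTH variant of cfs_rq_clock_pelt(). */
static uint64_t toy_cfs_rq_clock_pelt(const struct toy_cfs_rq *cfs_rq)
{
	/* While throttled, the clock stays frozen at the throttle snapshot. */
	if (cfs_rq->throttle_count)
		return cfs_rq->throttled_clock_task -
		       cfs_rq->throttled_clock_task_time;

	/* Otherwise discount all time this cfs_rq has ever spent throttled. */
	return toy_rq_clock_pelt(cfs_rq->rq) -
	       cfs_rq->throttled_clock_task_time;
}

int main(void)
{
	struct toy_rq rq = { .clock_pelt = 10000, .lost_idle_time = 1000 };
	struct toy_cfs_rq cfs = {
		.rq = &rq,
		.throttle_count = 0,
		.throttled_clock_task = 7000,
		.throttled_clock_task_time = 2000,
	};

	/* (10000 - 1000) - 2000 = 7000 */
	printf("unthrottled: %llu\n",
	       (unsigned long long)toy_cfs_rq_clock_pelt(&cfs));

	cfs.throttle_count = 1;
	/* 7000 - 2000 = 5000, frozen until the cfs_rq is unthrottled. */
	printf("throttled:   %llu\n",
	       (unsigned long long)toy_cfs_rq_clock_pelt(&cfs));
	return 0;
}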
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2e2955a8cf8f..f62f2d537b5a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1584,7 +1584,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	 * rt task
 	 */
 	if (rq->curr->sched_class != &rt_sched_class)
-		update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 
 	return p;
 }
@@ -1593,7 +1593,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
 	update_curr_rt(rq);
 
-	update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
 
 	/*
 	 * The previous task needs to be made eligible for pushing
@@ -2324,7 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 	struct sched_rt_entity *rt_se = &p->rt;
 
 	update_curr_rt(rq);
-	update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
 
 	watchdog(rq, p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c45b5f26704b..77ec09aaf255 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -832,6 +832,8 @@ struct rq {
 	unsigned int		clock_update_flags;
 	u64			clock;
 	u64			clock_task;
+	u64			clock_pelt;
+	unsigned long		lost_idle_time;
 
 	atomic_t		nr_iowait;
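Closing with a worked example of the overload test in update_rq_clock_pelt() (the snapshot values are invented; LOAD_AVG_MAX = 47742 is assumed to be the kernel's constant for the maximum attainable PELT sum): the rq is treated as always running, with no idle time worth stealing, once the summed util_sum comes within LOAD_AVG_MAX of the largest value it can reach for the current period_contrib.

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define LOAD_AVG_MAX		47742	/* assumed: max PELT sum, as in the kernel */

int main(void)
{
	/* Hypothetical snapshot of a busy rq at the moment it goes idle. */
	uint64_t period_contrib = 600;		/* partial current period */
	uint64_t cfs_util_sum   = 48600000;	/* close to the ceiling */
	uint64_t rt_util_sum    = 0;
	uint64_t dl_util_sum    = 0;

	/* Largest util_sum reachable for this period_contrib. */
	uint64_t divider = (LOAD_AVG_MAX - 1024 + period_contrib)
				<< SCHED_CAPACITY_SHIFT;
	/* LOAD_AVG_MAX of slack, exactly as in the patch. */
	uint64_t overload = cfs_util_sum + LOAD_AVG_MAX
				+ rt_util_sum + dl_util_sum;

	if (overload >= divider)	/* 48647742 >= 48453632: overloaded */
		printf("always running: idle time is accounted as lost\n");
	else
		printf("genuine idle time: clock_pelt simply resyncs\n");
	return 0;
}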