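This patch reworks the scale invariance of PELT: instead of scaling every
accumulated contribution by the current frequency and CPU capacity inside
accumulate_sum(), the scaling is moved into the clock itself. A new
rq->clock_pelt advances more slowly than clock_task while the CPU runs below
max capacity, syncs back to clock_task when the rq is idle, and replaces
rq_clock_task() in the cfs, rt and dl load-average updates. irq time, which
is not accounted in clock_task, keeps a direct scaling of its running time.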
Diffstat (limited to 'kernel')

-rw-r--r--  kernel/sched/core.c      2
-rw-r--r--  kernel/sched/deadline.c  6
-rw-r--r--  kernel/sched/fair.c     16
-rw-r--r--  kernel/sched/pelt.c     88
-rw-r--r--  kernel/sched/pelt.h     27
-rw-r--r--  kernel/sched/rt.c        6
-rw-r--r--  kernel/sched/sched.h     2

7 files changed, 123 insertions, 24 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 625bc9897f62..84e5c4840a2b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -181,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
update_irq_load_avg(rq, irq_delta + steal);
#endif
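+ /* Advance the frequency- and capacity-scaled PELT clock */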
+ update_rq_clock_pelt(rq, delta);
}
void update_rq_clock(struct rq *rq)
@@ -205,7 +206,6 @@ void update_rq_clock(struct rq *rq)
update_rq_clock_task(rq, delta);
}
-
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 997ea7b839fa..68cb4dc200fb 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1761,7 +1761,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
deadline_queue_push_tasks(rq);
if (rq->curr->sched_class != &dl_sched_class)
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
return p;
}
@@ -1770,7 +1770,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1787,7 +1787,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
/*
* Even when we have runtime, update_curr_dl() might have resulted in us
* not being the leftmost task anymore. In that case NEED_RESCHED will
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0969ce333c8e..5677254d4abf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -764,7 +764,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
* such that the next switched_to_fair() has the
* expected state.
*/
- se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
return;
}
}
@@ -3400,7 +3400,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- u64 now = cfs_rq_clock_task(cfs_rq);
+ u64 now = cfs_rq_clock_pelt(cfs_rq);
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
int decayed;
@@ -7285,7 +7285,7 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */
@@ -7306,8 +7306,8 @@ static void update_blocked_averages(int cpu)
}
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
@@ -7377,11 +7377,11 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 35475c0c5419..48f4f07dcf8f 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -30,6 +30,72 @@
#include "pelt.h"
/*
+ * The clock_pelt scales the time to reflect the effective amount of
+ * computation done during the running delta time but then syncs back to
+ * clock_task when the rq is idle.
+ *
+ *
+ * absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
+ * @ max capacity ------******---------------******---------------
+ * @ half capacity ------************---------************---------
+ * clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16
+ *
+ */
+void update_rq_clock_pelt(struct rq *rq, s64 delta)
+{
+ if (is_idle_task(rq->curr)) {
+ u32 divider = (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
+ u32 overload = rq->cfs.avg.util_sum + LOAD_AVG_MAX;
+ overload += rq->avg_rt.util_sum;
+ overload += rq->avg_dl.util_sum;
+
+ /*
+ * Reflecting stolen time makes sense only if the idle
+ * phase would be present at max capacity. As soon as the
+ * utilization of a rq has reached the maximum value, it is
+ * considered as an always running rq without idle time to
+ * steal. This potential idle time is considered lost in
+ * that case. We keep track of this lost idle time relative
+ * to rq's clock_task.
+ */
+ if (overload >= divider)
+ rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+
+ /* The rq is idle, we can sync to clock_task */
+ rq->clock_pelt = rq_clock_task(rq);
+
+ } else {
+ /*
+ * When a rq runs at a lower compute capacity, it will need
+ * more time to do the same amount of work than it would at
+ * max capacity: either because it takes more time to compute
+ * the same amount of work or because taking more time means
+ * sharing the CPU more often between entities.
+ * In order to be invariant, we scale the delta to reflect how
+ * much work has really been done.
+ * Running at lower capacity also means running longer to do
+ * the same amount of work, and this results in stealing some
+ * idle time that will disturb the load signal compared to
+ * max capacity; this stolen idle time will be automatically
+ * reflected when the rq becomes idle and the clock is synced
+ * with rq_clock_task.
+ */
+
+ /*
+ * scale the elapsed time to reflect the real amount of
+ * computation
+ */
+ delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
+ delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+
+ rq->clock_pelt += delta;
+ }
+}
+
+/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
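For illustration, a standalone userspace sketch (not part of the patch) of the
two-step scaling performed by update_rq_clock_pelt() above, assuming
SCHED_CAPACITY_SHIFT == 10 and the cap_scale() definition from
kernel/sched/fair.c; the freq_cap and cpu_cap values are hypothetical:

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10
#define cap_scale(v, cap) ((v) * (cap) >> SCHED_CAPACITY_SHIFT)

int main(void)
{
	int64_t delta = 4000;     /* 4000us of wall-clock running time */
	uint64_t freq_cap = 1024; /* CPU at max frequency */
	uint64_t cpu_cap = 512;   /* CPU with half the max compute capacity */

	/* same two-step scaling as update_rq_clock_pelt() */
	delta = cap_scale(delta, freq_cap);
	delta = cap_scale(delta, cpu_cap);

	/* 4000us at half capacity counts as 2000us of PELT time */
	printf("clock_pelt advances by %lldus\n", (long long)delta);
	return 0;
}

This is the half-capacity behaviour shown in the diagram above: the rq runs
twice as long for the same work, so clock_pelt advances at half the rate.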
@@ -106,16 +172,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
* n=1
*/
static __always_inline u32
-accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+accumulate_sum(u64 delta, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
- unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
- scale_freq = arch_scale_freq_capacity(cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
delta += sa->period_contrib;
periods = delta / 1024; /* A period is 1024us (~1ms) */
@@ -137,13 +199,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
}
sa->period_contrib = delta;
- contrib = cap_scale(contrib, scale_freq);
if (load)
sa->load_sum += load * contrib;
if (runnable)
sa->runnable_load_sum += runnable * contrib;
if (running)
- sa->util_sum += contrib * scale_cpu;
+ sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
return periods;
}
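Restated for clarity (scale_freq and scale_cpu being the values this function
used to read from arch_scale_freq_capacity() and arch_scale_cpu_capacity()),
the net effect of the accumulate_sum() change is:

/* before: every contribution was scaled at accrual time */
contrib = cap_scale(contrib, scale_freq);
sa->util_sum += contrib * scale_cpu;

/* after: the clock is pre-scaled by update_rq_clock_pelt(), so running
 * time accrues at full scale; contrib << SCHED_CAPACITY_SHIFT == contrib * 1024 */
sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;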
@@ -221,7 +282,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
+ if (!accumulate_sum(delta, sa, load, runnable, running))
return 0;
return 1;
@@ -371,12 +432,21 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
int update_irq_load_avg(struct rq *rq, u64 running)
{
int ret = 0;
+
+ /*
+ * We can't use clock_pelt because irq time is not accounted in
+ * clock_task. Instead we directly scale the running time to
+ * reflect the real amount of computation.
+ */
+ running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
+ running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+
/*
* We know the time that has been used by interrupt since last update
* but we don't know when. Let's be pessimistic and assume that the
* interrupt has happened just before the update. This is not so far
* from reality because the interrupt will most probably wake up a
* task and trigger an update
- * of rq clock during which the metric si updated.
+ * of rq clock during which the metric is updated.
* We start to decay with normal context time and then we add the
* interrupt context time.
* We can safely remove running from rq->clock because
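The double scaling above, applied to a hypothetical 1000us slice of irq time
on a CPU at 75% of max frequency and half of max compute capacity (values
assumed for illustration):

running = cap_scale(1000, 768); /* 1000 * 768 >> 10 == 750us */
running = cap_scale(750, 512);  /*  750 * 512 >> 10 == 375us */

i.e. 1000us of interrupt time accrues as 375us of max-capacity computation.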
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index d2894db28955..b4ce173b2f9e 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -42,6 +42,29 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
+void update_rq_clock_pelt(struct rq *rq, s64 delta);
+
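+/* PELT time, net of idle time lost while the rq was fully utilized */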
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ return rq->clock_pelt - rq->lost_idle_time;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/* rq->clock_pelt normalized against any time this cfs_rq has spent throttled */
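+/*
+ * Example (hypothetical numbers): a cfs_rq throttled at
+ * throttled_clock_task == 1000us with 200us already accumulated in
+ * throttled_clock_task_time reads a frozen 1000 - 200 = 800us until it
+ * is unthrottled, so no PELT time elapses while the group cannot run.
+ */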
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+
+ return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+}
+#else
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ return rq_clock_pelt(rq_of(cfs_rq));
+}
+#endif
+
#else
static inline int
@@ -67,6 +90,10 @@ update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
}
+
+static inline void
+update_rq_clock_pelt(struct rq *rq, s64 delta) {}
+
#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2e2955a8cf8f..f62f2d537b5a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1584,7 +1584,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* rt task
*/
if (rq->curr->sched_class != &rt_sched_class)
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
return p;
}
@@ -1593,7 +1593,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
update_curr_rt(rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
/*
* The previous task needs to be made eligible for pushing
@@ -2324,7 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
watchdog(rq, p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c45b5f26704b..77ec09aaf255 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -832,6 +832,8 @@ struct rq {
unsigned int clock_update_flags;
u64 clock;
u64 clock_task;
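+ /* clock_task scaled by frequency and compute capacity (see update_rq_clock_pelt()) */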
+ u64 clock_pelt;
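+ /* clock_task - clock_pelt accumulated while a fully utilized rq was idle */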
+ unsigned long lost_idle_time;
atomic_t nr_iowait;