From 3d2e83a2a6a0657c1cf145fa6ba23620715d6c36 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:41 +0200 Subject: timers: Preserve higher bits of expiration on index calculation The higher bits of the timer expiration are cropped while calling calc_index() due to the implicit cast from unsigned long to unsigned int. This loss shouldn't have consequences on the current code since all the computation to calculate the index is done on the lower 32 bits. However to prepare for returning the actual bucket expiration from calc_index() in order to properly fix base->next_expiry updates, the higher bits need to be preserved. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200717140551.29076-3-frederic@kernel.org --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index df1ff803acc4..bcdc3045138d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -487,7 +487,7 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) * Helper function to calculate the array index for a given expiry * time. */ -static inline unsigned calc_index(unsigned expires, unsigned lvl) +static inline unsigned calc_index(unsigned long expires, unsigned lvl) { expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); -- cgit v1.2.3 From 1f32cab0db4bdf6491eb4a60838f278e01c31698 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 17 Jul 2020 16:05:42 +0200 Subject: timers: Use only bucket expiry for base->next_expiry value The bucket expiry time is the effective expriy time of timers and is greater than or equal to the requested timer expiry time. This is due to the guarantee that timers never expire early and the reduced expiry granularity in the secondary wheel levels. When a timer is enqueued, trigger_dyntick_cpu() checks whether the timer is the new first timer. This check compares next_expiry with the requested timer expiry value and not with the effective expiry value of the bucket into which the timer was queued. Storing the requested timer expiry value in base->next_expiry can lead to base->clk going backwards if the requested timer expiry value is smaller than base->clk. Commit 30c66fc30ee7 ("timer: Prevent base->clk from moving backward") worked around this by preventing the store when timer->expiry is before base->clk, but did not fix the underlying problem. Use the expiry value of the bucket into which the timer is queued to do the new first timer check. This fixes the base->clk going backward problem. The workaround of commit 30c66fc30ee7 ("timer: Prevent base->clk from moving backward") in trigger_dyntick_cpu() is not longer necessary as the timers bucket expiry is guaranteed to be greater than or equal base->clk. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200717140551.29076-4-frederic@kernel.org --- kernel/time/timer.c | 64 ++++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bcdc3045138d..a7a3cf737411 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -487,35 +487,39 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) * Helper function to calculate the array index for a given expiry * time. */ -static inline unsigned calc_index(unsigned long expires, unsigned lvl) +static inline unsigned calc_index(unsigned long expires, unsigned lvl, + unsigned long *bucket_expiry) { expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } -static int calc_wheel_index(unsigned long expires, unsigned long clk) +static int calc_wheel_index(unsigned long expires, unsigned long clk, + unsigned long *bucket_expiry) { unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { - idx = calc_index(expires, 0); + idx = calc_index(expires, 0, bucket_expiry); } else if (delta < LVL_START(2)) { - idx = calc_index(expires, 1); + idx = calc_index(expires, 1, bucket_expiry); } else if (delta < LVL_START(3)) { - idx = calc_index(expires, 2); + idx = calc_index(expires, 2, bucket_expiry); } else if (delta < LVL_START(4)) { - idx = calc_index(expires, 3); + idx = calc_index(expires, 3, bucket_expiry); } else if (delta < LVL_START(5)) { - idx = calc_index(expires, 4); + idx = calc_index(expires, 4, bucket_expiry); } else if (delta < LVL_START(6)) { - idx = calc_index(expires, 5); + idx = calc_index(expires, 5, bucket_expiry); } else if (delta < LVL_START(7)) { - idx = calc_index(expires, 6); + idx = calc_index(expires, 6, bucket_expiry); } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { - idx = calc_index(expires, 7); + idx = calc_index(expires, 7, bucket_expiry); } else if ((long) delta < 0) { idx = clk & LVL_MASK; + *bucket_expiry = clk; } else { /* * Force expire obscene large timeouts to expire at the @@ -524,7 +528,7 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk) if (delta >= WHEEL_TIMEOUT_CUTOFF) expires = clk + WHEEL_TIMEOUT_MAX; - idx = calc_index(expires, LVL_DEPTH - 1); + idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); } return idx; } @@ -544,16 +548,18 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, } static void -__internal_add_timer(struct timer_base *base, struct timer_list *timer) +__internal_add_timer(struct timer_base *base, struct timer_list *timer, + unsigned long *bucket_expiry) { unsigned int idx; - idx = calc_wheel_index(timer->expires, base->clk); + idx = calc_wheel_index(timer->expires, base->clk, bucket_expiry); enqueue_timer(base, timer, idx); } static void -trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, + unsigned long bucket_expiry) { if (!is_timers_nohz_active()) return; @@ -576,31 +582,29 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) if (!base->is_idle) return; - /* Check whether this is the new first expiring timer: */ - if (time_after_eq(timer->expires, base->next_expiry)) + /* + * Check whether this is the new first expiring timer. The + * effective expiry time of the timer is required here + * (bucket_expiry) instead of timer->expires. + */ + if (time_after_eq(bucket_expiry, base->next_expiry)) return; /* * Set the next expiry time and kick the CPU so it can reevaluate the * wheel: */ - if (time_before(timer->expires, base->clk)) { - /* - * Prevent from forward_timer_base() moving the base->clk - * backward - */ - base->next_expiry = base->clk; - } else { - base->next_expiry = timer->expires; - } + base->next_expiry = bucket_expiry; wake_up_nohz_cpu(base->cpu); } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { - __internal_add_timer(base, timer); - trigger_dyntick_cpu(base, timer); + unsigned long bucket_expiry; + + __internal_add_timer(base, timer, &bucket_expiry); + trigger_dyntick_cpu(base, timer, bucket_expiry); } #ifdef CONFIG_DEBUG_OBJECTS_TIMERS @@ -959,9 +963,9 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { + unsigned long clk = 0, flags, bucket_expiry; struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; - unsigned long clk = 0, flags; int ret = 0; BUG_ON(!timer->function); @@ -1000,7 +1004,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } clk = base->clk; - idx = calc_wheel_index(expires, clk); + idx = calc_wheel_index(expires, clk, &bucket_expiry); /* * Retrieve and compare the array index of the pending @@ -1059,7 +1063,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option */ if (idx != UINT_MAX && clk == base->clk) { enqueue_timer(base, timer, idx); - trigger_dyntick_cpu(base, timer); + trigger_dyntick_cpu(base, timer, bucket_expiry); } else { internal_add_timer(base, timer); } -- cgit v1.2.3 From 9a2b764b06c880678416d803d027f575ae40ec99 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:43 +0200 Subject: timers: Move trigger_dyntick_cpu() to enqueue_timer() Consolidate the code by calling trigger_dyntick_cpu() from enqueue_timer() instead of calling it from all its callers. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-5-frederic@kernel.org --- kernel/time/timer.c | 61 ++++++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a7a3cf737411..2af08a169564 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -533,30 +533,6 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk, return idx; } -/* - * Enqueue the timer into the hash bucket, mark it pending in - * the bitmap and store the index in the timer flags. - */ -static void enqueue_timer(struct timer_base *base, struct timer_list *timer, - unsigned int idx) -{ - hlist_add_head(&timer->entry, base->vectors + idx); - __set_bit(idx, base->pending_map); - timer_set_idx(timer, idx); - - trace_timer_start(timer, timer->expires, timer->flags); -} - -static void -__internal_add_timer(struct timer_base *base, struct timer_list *timer, - unsigned long *bucket_expiry) -{ - unsigned int idx; - - idx = calc_wheel_index(timer->expires, base->clk, bucket_expiry); - enqueue_timer(base, timer, idx); -} - static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, unsigned long bucket_expiry) @@ -598,15 +574,31 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, wake_up_nohz_cpu(base->cpu); } -static void -internal_add_timer(struct timer_base *base, struct timer_list *timer) +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap, store the index in the timer flags then wake up + * the target CPU if needed. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, + unsigned int idx, unsigned long bucket_expiry) { - unsigned long bucket_expiry; + hlist_add_head(&timer->entry, base->vectors + idx); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); - __internal_add_timer(base, timer, &bucket_expiry); + trace_timer_start(timer, timer->expires, timer->flags); trigger_dyntick_cpu(base, timer, bucket_expiry); } +static void internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ + unsigned long bucket_expiry; + unsigned int idx; + + idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); + enqueue_timer(base, timer, idx, bucket_expiry); +} + #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static struct debug_obj_descr timer_debug_descr; @@ -1057,16 +1049,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option /* * If 'idx' was calculated above and the base time did not advance * between calculating 'idx' and possibly switching the base, only - * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise - * we need to (re)calculate the wheel index via - * internal_add_timer(). + * enqueue_timer() is required. Otherwise we need to (re)calculate + * the wheel index via internal_add_timer(). */ - if (idx != UINT_MAX && clk == base->clk) { - enqueue_timer(base, timer, idx); - trigger_dyntick_cpu(base, timer, bucket_expiry); - } else { + if (idx != UINT_MAX && clk == base->clk) + enqueue_timer(base, timer, idx, bucket_expiry); + else internal_add_timer(base, timer); - } out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); -- cgit v1.2.3 From 4468897211628865ee2392acb5ad281f74176f63 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:44 +0200 Subject: timers: Add comments about calc_index() ceiling work calc_index() adds 1 unit of the level granularity to the expiry passed in parameter to ensure that the timer doesn't expire too early. Add a comment to explain that and the resulting layout in the wheel. Suggested-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-6-frederic@kernel.org --- kernel/time/timer.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2af08a169564..af1c08b0b168 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -156,7 +156,8 @@ EXPORT_SYMBOL(jiffies_64); /* * The time start value for each level to select the bucket at enqueue - * time. + * time. We start from the last possible delta of the previous level + * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). */ #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) @@ -490,6 +491,15 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) static inline unsigned calc_index(unsigned long expires, unsigned lvl, unsigned long *bucket_expiry) { + + /* + * The timer wheel has to guarantee that a timer does not fire + * early. Early expiry can happen due to: + * - Timer is armed at the edge of a tick + * - Truncation of the expiry time in the outer wheel levels + * + * Round up with level granularity to prevent this. + */ expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); -- cgit v1.2.3 From 001ec1b3925da0d51847c23fc0aa4129282db526 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:45 +0200 Subject: timers: Optimize _next_timer_interrupt() level iteration If a level has a timer that expires before reaching the next level, there is no need to iterate further. The next level is reached when the 3 lower bits of the current level are cleared. If the next event happens before/during that, the next levels won't provide an earlier expiration. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-7-frederic@kernel.org --- kernel/time/timer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index af1c08b0b168..9abc41715fd2 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1526,6 +1526,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + unsigned long lvl_clk = clk & LVL_CLK_MASK; if (pos >= 0) { unsigned long tmp = clk + (unsigned long) pos; @@ -1533,6 +1534,13 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) tmp <<= LVL_SHIFT(lvl); if (time_before(tmp, next)) next = tmp; + + /* + * If the next expiration happens before we reach + * the next level, no need to check further. + */ + if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) + break; } /* * Clock for the next level. If the current level clock lower @@ -1570,7 +1578,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) * So the simple check whether the lower bits of the current * level are 0 or not is sufficient for all cases. */ - adj = clk & LVL_CLK_MASK ? 1 : 0; + adj = lvl_clk ? 1 : 0; clk >>= LVL_CLK_SHIFT; clk += adj; } -- cgit v1.2.3 From dc2a0f1fb2a06df09f5094f29aea56b763aa7cca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:46 +0200 Subject: timers: Always keep track of next expiry So far next expiry was only tracked while the CPU was in nohz_idle mode in order to cope with missing ticks that can't increment the base->clk periodically anymore. This logic is going to be expanded beyond nohz in order to spare timer softirqs so do it unconditionally. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-8-frederic@kernel.org --- kernel/time/timer.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 9abc41715fd2..76fd9644638b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -544,8 +544,7 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk, } static void -trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, - unsigned long bucket_expiry) +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { if (!is_timers_nohz_active()) return; @@ -565,23 +564,8 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, * timer is not deferrable. If the other CPU is on the way to idle * then it can't set base->is_idle as we hold the base lock: */ - if (!base->is_idle) - return; - - /* - * Check whether this is the new first expiring timer. The - * effective expiry time of the timer is required here - * (bucket_expiry) instead of timer->expires. - */ - if (time_after_eq(bucket_expiry, base->next_expiry)) - return; - - /* - * Set the next expiry time and kick the CPU so it can reevaluate the - * wheel: - */ - base->next_expiry = bucket_expiry; - wake_up_nohz_cpu(base->cpu); + if (base->is_idle) + wake_up_nohz_cpu(base->cpu); } /* @@ -592,12 +576,26 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, static void enqueue_timer(struct timer_base *base, struct timer_list *timer, unsigned int idx, unsigned long bucket_expiry) { + hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); trace_timer_start(timer, timer->expires, timer->flags); - trigger_dyntick_cpu(base, timer, bucket_expiry); + + /* + * Check whether this is the new first expiring timer. The + * effective expiry time of the timer is required here + * (bucket_expiry) instead of timer->expires. + */ + if (time_before(bucket_expiry, base->next_expiry)) { + /* + * Set the next expiry time and kick the CPU so it + * can reevaluate the wheel: + */ + base->next_expiry = bucket_expiry; + trigger_dyntick_cpu(base, timer); + } } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) @@ -1493,7 +1491,6 @@ static int __collect_expired_timers(struct timer_base *base, return levels; } -#ifdef CONFIG_NO_HZ_COMMON /* * Find the next pending bucket of a level. Search from level start (@offset) * + @clk upwards and if nothing there, search from start of the level @@ -1585,6 +1582,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) return next; } +#ifdef CONFIG_NO_HZ_COMMON /* * Check, if the next hrtimer event is before the next timer wheel * event: @@ -1790,6 +1788,7 @@ static inline void __run_timers(struct timer_base *base) levels = collect_expired_timers(base, heads); base->clk++; + base->next_expiry = __next_timer_interrupt(base); while (levels--) expire_timers(base, heads + levels); @@ -2042,6 +2041,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; timer_base_init_expiry_lock(base); } } -- cgit v1.2.3 From 90d52f65f303091be17b5f4ffab7090b2064b4a1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:47 +0200 Subject: timers: Reuse next expiry cache after nohz exit Now that the next expiry it tracked unconditionally when a timer is added, this information can be reused on a tick firing after exiting nohz. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-9-frederic@kernel.org --- kernel/time/timer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 76fd9644638b..13f48ee708aa 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1706,13 +1706,11 @@ static int collect_expired_timers(struct timer_base *base, * the next expiring timer. */ if ((long)(now - base->clk) > 2) { - unsigned long next = __next_timer_interrupt(base); - /* * If the next timer is ahead of time forward to current * jiffies, otherwise forward to the next expiry time: */ - if (time_after(next, now)) { + if (time_after(base->next_expiry, now)) { /* * The call site will increment base->clk and then * terminate the expiry loop immediately. @@ -1720,7 +1718,7 @@ static int collect_expired_timers(struct timer_base *base, base->clk = now; return 0; } - base->clk = next; + base->clk = base->next_expiry; } return __collect_expired_timers(base, heads); } -- cgit v1.2.3 From 1f8a4212dc83f8353843fabf6465fd918372fbbf Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:48 +0200 Subject: timers: Expand clk forward logic beyond nohz As for next_expiry, the base->clk catch up logic will be expanded beyond NOHZ in order to avoid triggering useless softirqs. If softirqs should only fire to expire pending timers, periodic base->clk increments must be skippable for random amounts of time. Therefore prepare to catch-up with missing updates whenever an up-to-date base clock is needed. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-10-frederic@kernel.org --- kernel/time/timer.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 13f48ee708aa..1be92b53b75f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -888,19 +888,12 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { -#ifdef CONFIG_NO_HZ_COMMON unsigned long jnow; - /* - * We only forward the base when we are idle or have just come out of - * idle (must_forward_clk logic), and have a delta between base clock - * and jiffies. In the common case, run_timers will take care of it. - */ - if (likely(!base->must_forward_clk)) + if (!base->must_forward_clk) return; jnow = READ_ONCE(jiffies); - base->must_forward_clk = base->is_idle; if ((long)(jnow - base->clk) < 2) return; @@ -915,7 +908,6 @@ static inline void forward_timer_base(struct timer_base *base) return; base->clk = base->next_expiry; } -#endif } @@ -1667,10 +1659,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) * logic is only maintained for the BASE_STD base, deferrable * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) { - base->must_forward_clk = true; + if ((expires - basem) > TICK_NSEC) base->is_idle = true; - } } raw_spin_unlock(&base->lock); @@ -1769,16 +1759,7 @@ static inline void __run_timers(struct timer_base *base) /* * timer_base::must_forward_clk must be cleared before running * timers so that any timer functions that call mod_timer() will - * not try to forward the base. Idle tracking / clock forwarding - * logic is only used with BASE_STD timers. - * - * The must_forward_clk flag is cleared unconditionally also for - * the deferrable base. The deferrable base is not affected by idle - * tracking and never forwarded, so clearing the flag is a NOOP. - * - * The fact that the deferrable base is never forwarded can cause - * large variations in granularity for deferrable timers, but they - * can be deferred for long periods due to idle anyway. + * not try to forward the base. */ base->must_forward_clk = false; @@ -1791,6 +1772,7 @@ static inline void __run_timers(struct timer_base *base) while (levels--) expire_timers(base, heads + levels); } + base->must_forward_clk = true; raw_spin_unlock_irq(&base->lock); timer_base_unlock_expiry(base); } -- cgit v1.2.3 From d4f7dae87096dfe722bf32aa82076ece1063746c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:49 +0200 Subject: timers: Spare timer softirq until next expiry Now that the core timer infrastructure doesn't depend anymore on periodic base->clk increments, even when the CPU is not in NO_HZ mode, timer softirqs can be skipped until there are timers to expire. Some spurious softirqs can still remain since base->next_expiry doesn't keep track of canceled timers but this still reduces the number of softirqs significantly: ~15 times less for HZ=1000 and ~5 times less for HZ=100. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-11-frederic@kernel.org --- kernel/time/timer.c | 49 ++++++++----------------------------------------- 1 file changed, 8 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1be92b53b75f..4f78a7bff9e1 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1458,10 +1458,10 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) } } -static int __collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) { - unsigned long clk = base->clk; + unsigned long clk = base->clk = base->next_expiry; struct hlist_head *vec; int i, levels = 0; unsigned int idx; @@ -1684,40 +1684,6 @@ void timer_clear_idle(void) */ base->is_idle = false; } - -static int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) -{ - unsigned long now = READ_ONCE(jiffies); - - /* - * NOHZ optimization. After a long idle sleep we need to forward the - * base to current jiffies. Avoid a loop by searching the bitfield for - * the next expiring timer. - */ - if ((long)(now - base->clk) > 2) { - /* - * If the next timer is ahead of time forward to current - * jiffies, otherwise forward to the next expiry time: - */ - if (time_after(base->next_expiry, now)) { - /* - * The call site will increment base->clk and then - * terminate the expiry loop immediately. - */ - base->clk = now; - return 0; - } - base->clk = base->next_expiry; - } - return __collect_expired_timers(base, heads); -} -#else -static inline int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) -{ - return __collect_expired_timers(base, heads); -} #endif /* @@ -1750,7 +1716,7 @@ static inline void __run_timers(struct timer_base *base) struct hlist_head heads[LVL_DEPTH]; int levels; - if (!time_after_eq(jiffies, base->clk)) + if (time_before(jiffies, base->next_expiry)) return; timer_base_lock_expiry(base); @@ -1763,7 +1729,8 @@ static inline void __run_timers(struct timer_base *base) */ base->must_forward_clk = false; - while (time_after_eq(jiffies, base->clk)) { + while (time_after_eq(jiffies, base->clk) && + time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); base->clk++; @@ -1798,12 +1765,12 @@ void run_local_timers(void) hrtimer_run_queues(); /* Raise the softirq only if required. */ - if (time_before(jiffies, base->clk)) { + if (time_before(jiffies, base->next_expiry)) { if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) return; /* CPU is awake, so check the deferrable base. */ base++; - if (time_before(jiffies, base->clk)) + if (time_before(jiffies, base->next_expiry)) return; } raise_softirq(TIMER_SOFTIRQ); -- cgit v1.2.3 From 0975fb565b8b8f9e0c96d0de39fcb954833ea5e0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:50 +0200 Subject: timers: Remove must_forward_clk There is no reason to keep this guard around. The code makes sure that base->clk remains sane and won't be forwarded beyond jiffies nor set backward. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-12-frederic@kernel.org --- kernel/time/timer.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 4f78a7bff9e1..8b3fb52d8c47 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -205,7 +205,6 @@ struct timer_base { unsigned long next_expiry; unsigned int cpu; bool is_idle; - bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -888,12 +887,13 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { - unsigned long jnow; + unsigned long jnow = READ_ONCE(jiffies); - if (!base->must_forward_clk) - return; - - jnow = READ_ONCE(jiffies); + /* + * No need to forward if we are close enough below jiffies. + * Also while executing timers, base->clk is 1 offset ahead + * of jiffies to avoid endless requeuing to current jffies. + */ if ((long)(jnow - base->clk) < 2) return; @@ -1722,16 +1722,8 @@ static inline void __run_timers(struct timer_base *base) timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); - /* - * timer_base::must_forward_clk must be cleared before running - * timers so that any timer functions that call mod_timer() will - * not try to forward the base. - */ - base->must_forward_clk = false; - while (time_after_eq(jiffies, base->clk) && time_after_eq(jiffies, base->next_expiry)) { - levels = collect_expired_timers(base, heads); base->clk++; base->next_expiry = __next_timer_interrupt(base); @@ -1739,7 +1731,6 @@ static inline void __run_timers(struct timer_base *base) while (levels--) expire_timers(base, heads + levels); } - base->must_forward_clk = true; raw_spin_unlock_irq(&base->lock); timer_base_unlock_expiry(base); } @@ -1935,7 +1926,6 @@ int timers_prepare_cpu(unsigned int cpu) base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; base->is_idle = false; - base->must_forward_clk = true; } return 0; } -- cgit v1.2.3 From 36cd28a4cdd05d47ccb62a2d86e8f37839cc879a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:51 +0200 Subject: timers: Lower base clock forwarding threshold There is nothing that prevents from forwarding the base clock if it's one jiffy off. The reason for this arbitrary limit of two jiffies is historical and does not longer exist. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-13-frederic@kernel.org --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8b3fb52d8c47..77e21e98ec32 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -894,7 +894,7 @@ static inline void forward_timer_base(struct timer_base *base) * Also while executing timers, base->clk is 1 offset ahead * of jiffies to avoid endless requeuing to current jffies. */ - if ((long)(jnow - base->clk) < 2) + if ((long)(jnow - base->clk) < 1) return; /* -- cgit v1.2.3 From 31cd0e119d50cf27ebe214d1a8f7ca36692f13a5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 23 Jul 2020 17:16:41 +0200 Subject: timers: Recalculate next timer interrupt only when necessary The nohz tick code recalculates the timer wheel's next expiry on each idle loop iteration. On the other hand, the base next expiry is now always cached and updated upon timer enqueue and execution. Only timer dequeue may leave base->next_expiry out of date (but then its stale value won't ever go past the actual next expiry to be recalculated). Since recalculating the next_expiry isn't a free operation, especially when the last wheel level is reached to find out that no timer has been enqueued at all, reuse the next expiry cache when it is known to be reliable, which it is most of the time. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200723151641.12236-1-frederic@kernel.org --- kernel/time/timer.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 77e21e98ec32..96d802e9769e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -204,6 +204,7 @@ struct timer_base { unsigned long clk; unsigned long next_expiry; unsigned int cpu; + bool next_expiry_recalc; bool is_idle; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; @@ -593,6 +594,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, * can reevaluate the wheel: */ base->next_expiry = bucket_expiry; + base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); } } @@ -836,8 +838,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base, if (!timer_pending(timer)) return 0; - if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) + if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { __clear_bit(idx, base->pending_map); + base->next_expiry_recalc = true; + } detach_timer(timer, clear_pending); return 1; @@ -1571,6 +1575,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk >>= LVL_CLK_SHIFT; clk += adj; } + + base->next_expiry_recalc = false; + return next; } @@ -1631,9 +1638,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) return expires; raw_spin_lock(&base->lock); - nextevt = __next_timer_interrupt(base); + if (base->next_expiry_recalc) + base->next_expiry = __next_timer_interrupt(base); + nextevt = base->next_expiry; is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); - base->next_expiry = nextevt; + /* * We have a fresh next event. Check whether we can forward the * base. We can only do that when @basej is past base->clk @@ -1725,6 +1734,12 @@ static inline void __run_timers(struct timer_base *base) while (time_after_eq(jiffies, base->clk) && time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); + /* + * The only possible reason for not finding any expired + * timer at this clk is that all matching timers have been + * dequeued. + */ + WARN_ON_ONCE(!levels && !base->next_expiry_recalc); base->clk++; base->next_expiry = __next_timer_interrupt(base); -- cgit v1.2.3