diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 28 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 114 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 36 | ||||
-rw-r--r-- | kernel/sched/idle.c | 28 | ||||
-rw-r--r-- | kernel/sched/membarrier.c | 77 | ||||
-rw-r--r-- | kernel/sched/wait.c | 17 |
6 files changed, 225 insertions, 75 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0ca7d2dc16d5..15d2562118d1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4077,6 +4077,22 @@ static inline void finish_lock_switch(struct rq *rq) # define finish_arch_post_lock_switch() do { } while (0) #endif +static inline void kmap_local_sched_out(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_out(); +#endif +} + +static inline void kmap_local_sched_in(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_in(); +#endif +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -4099,6 +4115,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, perf_event_task_sched_out(prev, next); rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); + kmap_local_sched_out(); prepare_task(next); prepare_arch_switch(next); } @@ -4165,6 +4182,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) finish_lock_switch(rq); finish_arch_post_lock_switch(); kcov_finish_switch(current); + /* + * kmap_local_sched_out() is invoked with rq::lock held and + * interrupts disabled. There is no requirement for that, but the + * sched out code does not have an interrupt enabled section. + * Restoring the maps on sched in does not require interrupts being + * disabled either. + */ + kmap_local_sched_in(); fire_sched_in_preempt_notifiers(current); /* @@ -4805,6 +4830,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); + SCHED_WARN_ON(ct_state() == CONTEXT_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -5146,7 +5172,7 @@ void __sched schedule_idle(void) } while (need_resched()); } -#ifdef CONFIG_CONTEXT_TRACKING +#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) asmlinkage __visible void __sched schedule_user(void) { /* diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d90cad7a374f..6931f0cdeb80 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -53,6 +53,7 @@ struct sugov_cpu { unsigned int iowait_boost; u64 last_update; + unsigned long util; unsigned long bw_dl; unsigned long max; @@ -102,12 +103,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (!sg_policy->need_freq_update) { - if (sg_policy->next_freq == next_freq) - return false; - } else { + if (sg_policy->need_freq_update) sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); - } + else if (sg_policy->next_freq == next_freq) + return false; sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -278,16 +277,15 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, return min(max, util); } -static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util = cpu_util_cfs(rq); unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + sg_cpu->util = schedutil_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max, + FREQUENCY_UTIL, NULL); } /** @@ -364,8 +362,6 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * sugov_iowait_apply() - Apply the IO boost to a CPU. * @sg_cpu: the sugov data for the cpu to boost * @time: the update time from the caller - * @util: the utilization to (eventually) boost - * @max: the maximum value the utilization can be boosted to * * A CPU running a task which woken up after an IO operation can have its * utilization boosted to speed up the completion of those IO operations. @@ -379,18 +375,17 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - unsigned long util, unsigned long max) +static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) { unsigned long boost; /* No boost currently required */ if (!sg_cpu->iowait_boost) - return util; + return; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return util; + return; if (!sg_cpu->iowait_boost_pending) { /* @@ -399,18 +394,19 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, sg_cpu->iowait_boost >>= 1; if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; - return util; + return; } } sg_cpu->iowait_boost_pending = false; /* - * @util is already in capacity scale; convert iowait_boost + * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. */ - boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT; - return max(boost, util); + boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; + if (sg_cpu->util < boost) + sg_cpu->util = boost; } #ifdef CONFIG_NO_HZ_COMMON @@ -436,14 +432,10 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_p sg_policy->limits_changed = true; } -static void sugov_update_single(struct update_util_data *hook, u64 time, - unsigned int flags) +static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, + u64 time, unsigned int flags) { - struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned long util, max; - unsigned int next_f; - unsigned int cached_freq = sg_policy->cached_raw_freq; sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -451,12 +443,26 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, ignore_dl_rate_limit(sg_cpu, sg_policy); if (!sugov_should_update_freq(sg_policy, time)) + return false; + + sugov_get_util(sg_cpu); + sugov_iowait_apply(sg_cpu, time); + + return true; +} + +static void sugov_update_single_freq(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned int next_f; + + if (!sugov_update_single_common(sg_cpu, time, flags)) return; - util = sugov_get_util(sg_cpu); - max = sg_cpu->max; - util = sugov_iowait_apply(sg_cpu, time, util, max); - next_f = get_next_freq(sg_policy, util, max); + next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. @@ -482,6 +488,38 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, } } +static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + unsigned long prev_util = sg_cpu->util; + + /* + * Fall back to the "frequency" path if frequency invariance is not + * supported, because the direct mapping between the utilization and + * the performance levels depends on the frequency invariance. + */ + if (!arch_scale_freq_invariant()) { + sugov_update_single_freq(hook, time, flags); + return; + } + + if (!sugov_update_single_common(sg_cpu, time, flags)) + return; + + /* + * Do not reduce the target performance level if the CPU has not been + * idle recently, as the reduction is likely to be premature then. + */ + if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + sg_cpu->util = prev_util; + + cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), + map_util_perf(sg_cpu->util), sg_cpu->max); + + sg_cpu->sg_policy->last_freq_update_time = time; +} + static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; @@ -493,9 +531,10 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; - j_util = sugov_get_util(j_sg_cpu); + sugov_get_util(j_sg_cpu); + sugov_iowait_apply(j_sg_cpu, time); + j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; - j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max); if (j_util * max > j_max * util) { util = j_util; @@ -819,6 +858,7 @@ static void sugov_exit(struct cpufreq_policy *policy) static int sugov_start(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; + void (*uu)(struct update_util_data *data, u64 time, unsigned int flags); unsigned int cpu; sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; @@ -838,13 +878,17 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; } + if (policy_is_shared(policy)) + uu = sugov_update_shared; + else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf()) + uu = sugov_update_single_perf; + else + uu = sugov_update_single_freq; + for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - policy_is_shared(policy) ? - sugov_update_shared : - sugov_update_single); + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu); } return 0; } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5a55d2300452..5f611658eeab 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -44,12 +44,13 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, } /* - * Called before incrementing preempt_count on {soft,}irq_enter + * Called after incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void irqtime_account_irq(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr, unsigned int offset) { struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); + unsigned int pc; s64 delta; int cpu; @@ -59,6 +60,7 @@ void irqtime_account_irq(struct task_struct *curr) cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; + pc = preempt_count() - offset; /* * We do not account for softirq time from ksoftirqd here. @@ -66,12 +68,11 @@ void irqtime_account_irq(struct task_struct *curr) * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run. */ - if (hardirq_count()) + if (pc & HARDIRQ_MASK) irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); } -EXPORT_SYMBOL_GPL(irqtime_account_irq); static u64 irqtime_tick_accounted(u64 maxtime) { @@ -418,24 +419,21 @@ void vtime_task_switch(struct task_struct *prev) } # endif -/* - * Archs that account the whole time spent in the idle task - * (outside irq) as idle time can rely on this and just implement - * vtime_account_kernel() and vtime_account_idle(). Archs that - * have other meaning of the idle time (s390 only includes the - * time spent by the CPU when it's in low power mode) must override - * vtime_account(). - */ -#ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { - if (!in_interrupt() && is_idle_task(tsk)) + unsigned int pc = preempt_count() - offset; + + if (pc & HARDIRQ_OFFSET) { + vtime_account_hardirq(tsk); + } else if (pc & SOFTIRQ_OFFSET) { + vtime_account_softirq(tsk); + } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && + is_idle_task(tsk)) { vtime_account_idle(tsk); - else + } else { vtime_account_kernel(tsk); + } } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, u64 *ut, u64 *st) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index df91b198a74c..305727ea0677 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -78,7 +78,7 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; - local_irq_enable(); + raw_local_irq_enable(); } /** @@ -94,9 +94,35 @@ void __cpuidle default_idle_call(void) trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); + + /* + * arch_cpu_idle() is supposed to enable IRQs, however + * we can't do that because of RCU and tracing. + * + * Trace IRQs enable here, then switch off RCU, and have + * arch_cpu_idle() use raw_local_irq_enable(). Note that + * rcu_idle_enter() relies on lockdep IRQ state, so switch that + * last -- this is very similar to the entry code. + */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(_THIS_IP_); rcu_idle_enter(); + lockdep_hardirqs_on(_THIS_IP_); + arch_cpu_idle(); + + /* + * OK, so IRQs are enabled here, but RCU needs them disabled to + * turn itself back on.. funny thing is that disabling IRQs + * will cause tracing, which needs RCU. Jump through hoops to + * make it 'work'. + */ + raw_local_irq_disable(); + lockdep_hardirqs_off(_THIS_IP_); rcu_idle_exit(); + lockdep_hardirqs_on(_THIS_IP_); + raw_local_irq_enable(); + start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 5a40b3828ff2..08ae45ad9261 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -166,8 +166,33 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_core(void *info) +{ + /* + * The smp_mb() in membarrier after all the IPIs is supposed to + * ensure that memory on remote CPUs that occur before the IPI + * become visible to membarrier()'s caller -- see scenario B in + * the big comment at the top of this file. + * + * A sync_core() would provide this guarantee, but + * sync_core_before_usermode() might end up being deferred until + * after membarrier()'s smp_mb(). + */ + smp_mb(); /* IPIs should be serializing but paranoid. */ + + sync_core_before_usermode(); +} + static void ipi_rseq(void *info) { + /* + * Ensure that all stores done by the calling thread are visible + * to the current task before the current task resumes. We could + * probably optimize this away on most architectures, but by the + * time we've already sent an IPI, the cost of the extra smp_mb() + * is negligible. + */ + smp_mb(); rseq_preempt(current); } @@ -293,6 +318,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + ipi_func = ipi_sync_core; } else if (flags == MEMBARRIER_FLAG_RSEQ) { if (!IS_ENABLED(CONFIG_RSEQ)) return -EINVAL; @@ -307,7 +333,8 @@ static int membarrier_private_expedited(int flags, int cpu_id) return -EPERM; } - if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) + if (flags != MEMBARRIER_FLAG_SYNC_CORE && + (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)) return 0; /* @@ -326,8 +353,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) goto out; - if (cpu_id == raw_smp_processor_id()) - goto out; rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu_id)->curr); if (!p || p->mm != mm) { @@ -342,16 +367,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) for_each_online_cpu(cpu) { struct task_struct *p; - /* - * Skipping the current CPU is OK even through we can be - * migrated at any point. The current CPU, at the point - * where we read raw_smp_processor_id(), is ensured to - * be in program order with respect to the caller - * thread. Therefore, we can skip this CPU from the - * iteration. - */ - if (cpu == raw_smp_processor_id()) - continue; p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm == mm) __cpumask_set_cpu(cpu, tmpmask); @@ -359,12 +374,38 @@ static int membarrier_private_expedited(int flags, int cpu_id) rcu_read_unlock(); } - preempt_disable(); - if (cpu_id >= 0) + if (cpu_id >= 0) { + /* + * smp_call_function_single() will call ipi_func() if cpu_id + * is the calling CPU. + */ smp_call_function_single(cpu_id, ipi_func, NULL, 1); - else - smp_call_function_many(tmpmask, ipi_func, NULL, 1); - preempt_enable(); + } else { + /* + * For regular membarrier, we can save a few cycles by + * skipping the current cpu -- we're about to do smp_mb() + * below, and if we migrate to a different cpu, this cpu + * and the new cpu will execute a full barrier in the + * scheduler. + * + * For SYNC_CORE, we do need a barrier on the current cpu -- + * otherwise, if we are migrated and replaced by a different + * task in the same mm just before, during, or after + * membarrier, we will end up with some thread in the mm + * running without a core sync. + * + * For RSEQ, don't rseq_preempt() the caller. User code + * is not supposed to issue syscalls at all from inside an + * rseq critical section. + */ + if (flags != MEMBARRIER_FLAG_SYNC_CORE) { + preempt_disable(); + smp_call_function_many(tmpmask, ipi_func, NULL, true); + preempt_enable(); + } else { + on_each_cpu_mask(tmpmask, ipi_func, NULL, true); + } + } out: if (cpu_id < 0) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 01f5d3020589..183cc6ae68a6 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -37,6 +37,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue } EXPORT_SYMBOL(add_wait_queue_exclusive); +void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL_GPL(add_wait_queue_priority); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -57,7 +68,11 @@ EXPORT_SYMBOL(remove_wait_queue); /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. + * number) then we wake that number of exclusive tasks, and potentially all + * the non-exclusive tasks. Normally, exclusive tasks will be at the end of + * the list and any non-exclusive tasks will be woken first. A priority task + * may be at the head of the list, and can consume the event without any other + * tasks being woken. * * There are circumstances in which we can try to wake a task which has already * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns |