author		Vincent Guittot <vincent.guittot@linaro.org>	2015-08-10 11:59:38 +0200
committer	Vincent Guittot <vincent.guittot@linaro.org>	2015-08-10 11:59:38 +0200
commit		18c88327ccc0e2103b835dd5c56b07f76da37e62 (patch)
tree		6fe92a29313986661458e4f61d8577cffaa7cb47
parent		08428ee8045f579d96155e25c708cbc01dfff19d (diff)
parent		58ec4a717ee67243c4c2ac10ee2e5a777fb466d1 (diff)
Merge branch 'sched-dvfs-v3' into test-sched-dvfs-v3
Conflicts:
	kernel/sched/fair.c
-rw-r--r--	arch/arm/include/asm/topology.h	|   7 +
-rw-r--r--	arch/arm/kernel/smp.c		|  53 +-
-rw-r--r--	arch/arm/kernel/topology.c	|  17 +
-rw-r--r--	drivers/cpufreq/Kconfig		|  24 +
-rw-r--r--	drivers/cpufreq/cpufreq.c	|   6 +
-rw-r--r--	include/linux/cpufreq.h		|  12 +
-rw-r--r--	kernel/sched/Makefile		|   1 +
-rw-r--r--	kernel/sched/cpufreq_sched.c	| 308 +
-rw-r--r--	kernel/sched/fair.c		|  30 +
-rw-r--r--	kernel/sched/sched.h		|   8 +
10 files changed, 464 insertions(+), 2 deletions(-)
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 370f7a732900..c31096fb26db 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -24,6 +24,13 @@ void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
 
+#define arch_scale_freq_capacity arm_arch_scale_freq_capacity
+struct sched_domain;
+extern
+unsigned long arm_arch_scale_freq_capacity(struct sched_domain *sd, int cpu);
+
+DECLARE_PER_CPU(atomic_long_t, cpu_freq_capacity);
+
 #else
 
 static inline void init_cpu_topology(void) { }
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 90dfbedfbfb8..32c7dd9a41b8 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -679,12 +679,34 @@ static DEFINE_PER_CPU(unsigned long, l_p_j_ref);
 static DEFINE_PER_CPU(unsigned long, l_p_j_ref_freq);
 static unsigned long global_l_p_j_ref;
 static unsigned long global_l_p_j_ref_freq;
+static DEFINE_PER_CPU(atomic_long_t, cpu_max_freq);
+DEFINE_PER_CPU(atomic_long_t, cpu_freq_capacity);
+
+/*
+ * Scheduler load-tracking scale-invariance
+ *
+ * Provides the scheduler with a scale-invariance correction factor that
+ * compensates for frequency scaling through arch_scale_freq_capacity()
+ * (implemented in topology.c).
+ */
+static inline
+void scale_freq_capacity(int cpu, unsigned long curr, unsigned long max)
+{
+	unsigned long capacity;
+
+	if (!max)
+		return;
+
+	capacity = (curr << SCHED_CAPACITY_SHIFT) / max;
+	atomic_long_set(&per_cpu(cpu_freq_capacity, cpu), capacity);
+}
 
 static int cpufreq_callback(struct notifier_block *nb,
 					unsigned long val, void *data)
 {
 	struct cpufreq_freqs *freq = data;
 	int cpu = freq->cpu;
+	unsigned long max = atomic_long_read(&per_cpu(cpu_max_freq, cpu));
 
 	if (freq->flags & CPUFREQ_CONST_LOOPS)
 		return NOTIFY_OK;
@@ -709,6 +731,9 @@ static int cpufreq_callback(struct notifier_block *nb,
 			per_cpu(l_p_j_ref_freq, cpu),
 			freq->new);
 	}
+
+	scale_freq_capacity(cpu, freq->new, max);
+
 	return NOTIFY_OK;
 }
 
@@ -716,11 +741,35 @@ static struct notifier_block cpufreq_notifier = {
 	.notifier_call  = cpufreq_callback,
 };
 
+static int cpufreq_policy_callback(struct notifier_block *nb,
+				   unsigned long val, void *data)
+{
+	struct cpufreq_policy *policy = data;
+	int i;
+
+	for_each_cpu(i, policy->cpus) {
+		scale_freq_capacity(i, policy->cur, policy->max);
+		atomic_long_set(&per_cpu(cpu_max_freq, i), policy->max);
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_policy_notifier = {
+	.notifier_call = cpufreq_policy_callback,
+};
+
 static int __init register_cpufreq_notifier(void)
 {
-	return cpufreq_register_notifier(&cpufreq_notifier,
+	int ret;
+
+	ret = cpufreq_register_notifier(&cpufreq_notifier,
 					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret)
+		return ret;
+
+	return cpufreq_register_notifier(&cpufreq_policy_notifier,
+					 CPUFREQ_POLICY_NOTIFIER);
 }
 core_initcall(register_cpufreq_notifier);
-
 #endif
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 08b7847bf912..9c09e6ef8def 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -169,6 +169,23 @@ static void update_cpu_capacity(unsigned int cpu)
 		cpu, arch_scale_cpu_capacity(NULL, cpu));
 }
 
+/*
+ * Scheduler load-tracking scale-invariance
+ *
+ * Provides the scheduler with a scale-invariance correction factor that
+ * compensates for frequency scaling (arch_scale_freq_capacity()). The
+ * scaling factor is updated in smp.c.
+ */
+unsigned long arm_arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+	unsigned long curr = atomic_long_read(&per_cpu(cpu_freq_capacity, cpu));
+
+	if (!curr)
+		return SCHED_CAPACITY_SCALE;
+
+	return curr;
+}
+
 #else
 static inline void parse_dt_topology(void) {}
 static inline void update_cpu_capacity(unsigned int cpuid) {}
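
The correction factor above is plain fixed-point arithmetic: the current
frequency expressed as a fraction of the policy maximum, scaled to
SCHED_CAPACITY_SCALE (1024). A minimal user-space sketch of the same math,
assuming kHz inputs and made-up example frequencies:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)	/* 1024 */

/*
 * Mirrors scale_freq_capacity()/arm_arch_scale_freq_capacity() above:
 * capacity = curr * 1024 / max, with full scale as the fallback when the
 * maximum frequency is not yet known.
 */
static unsigned long freq_capacity(unsigned long curr_khz, unsigned long max_khz)
{
	if (!max_khz)
		return SCHED_CAPACITY_SCALE;

	return (curr_khz << SCHED_CAPACITY_SHIFT) / max_khz;
}

int main(void)
{
	printf("%lu\n", freq_capacity(500000, 1000000));	/* 512: half speed */
	printf("%lu\n", freq_capacity(1000000, 1000000));	/* 1024: full speed */
	return 0;
}
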
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 659879a56dba..9bbf44c4e574 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
 	  Be aware that not all cpufreq drivers support the conservative
 	  governor. If unsure have a look at the help section of the
 	  driver. Fallback governor will be the performance governor.
+
+config CPU_FREQ_DEFAULT_GOV_SCHED
+	bool "sched"
+	select CPU_FREQ_GOV_SCHED
+	select CPU_FREQ_GOV_PERFORMANCE
+	help
+	  Use the CPUfreq governor 'sched' as default. This scales
+	  cpu frequency from the scheduler as per-entity load tracking
+	  statistics are updated.
 endchoice
 
 config CPU_FREQ_GOV_PERFORMANCE
@@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
 
 	  If in doubt, say N.
 
+config CPU_FREQ_GOV_SCHED
+	tristate "'sched' cpufreq governor"
+	depends on CPU_FREQ
+	select CPU_FREQ_GOV_COMMON
+	help
+	  'sched' - this governor scales cpu frequency from the
+	  scheduler as a function of cpu capacity utilization. It does
+	  not evaluate utilization on a periodic basis (as ondemand
+	  does) but instead is invoked from the completely fair
+	  scheduler when updating per-entity load tracking statistics.
+	  Latency to respond to changes in load is improved over polling
+	  governors due to its event-driven design.
+
+	  If in doubt, say N.
+
 comment "CPU frequency scaling drivers"
 
 config CPUFREQ_DT
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index b612411655f9..376e7eafa8c5 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -160,6 +160,12 @@ bool have_governor_per_policy(void)
 }
 EXPORT_SYMBOL_GPL(have_governor_per_policy);
 
+bool cpufreq_driver_might_sleep(void)
+{
+	return !(cpufreq_driver->flags & CPUFREQ_DRIVER_WILL_NOT_SLEEP);
+}
+EXPORT_SYMBOL_GPL(cpufreq_driver_might_sleep);
+
 struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
 {
 	if (have_governor_per_policy())
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 29ad97c34fd5..a211340deae8 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -160,6 +160,7 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
 int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
 int cpufreq_update_policy(unsigned int cpu);
 bool have_governor_per_policy(void);
+bool cpufreq_driver_might_sleep(void);
 struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
 #else
 static inline unsigned int cpufreq_get(unsigned int cpu)
@@ -317,6 +318,14 @@ struct cpufreq_driver {
  */
 #define CPUFREQ_NEED_INITIAL_FREQ_CHECK (1 << 5)
 
+/*
+ * Set by drivers that will never block or sleep during their frequency
+ * transition. Used to indicate when it is safe to call cpufreq_driver_target
+ * from non-interruptible context. Drivers must opt in to this flag, as the
+ * safe default is that they might sleep.
+ */
+#define CPUFREQ_DRIVER_WILL_NOT_SLEEP	(1 << 6)
+
 int cpufreq_register_driver(struct cpufreq_driver *driver_data);
 int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
 
@@ -488,6 +497,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
 #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
 extern struct cpufreq_governor cpufreq_gov_conservative;
 #define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED)
+extern struct cpufreq_governor cpufreq_gov_sched;
+#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_sched)
 #endif
 
 /*********************************************************************
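
The new flag is only ever read through cpufreq_driver_might_sleep(), which
deliberately inverts it so that sleeping remains the assumed behaviour for
every driver that does not opt in. A user-space model of that predicate (the
flag value is copied from the hunk above; the example flag words are
invented):

#include <stdio.h>

#define CPUFREQ_DRIVER_WILL_NOT_SLEEP	(1 << 6)

/*
 * Mirrors cpufreq_driver_might_sleep(): absence of the opt-in flag means
 * the driver is assumed to sleep during a transition.
 */
static int driver_might_sleep(unsigned int driver_flags)
{
	return !(driver_flags & CPUFREQ_DRIVER_WILL_NOT_SLEEP);
}

int main(void)
{
	unsigned int legacy_driver = 0;		/* no flags: safe default */
	unsigned int fast_driver = CPUFREQ_DRIVER_WILL_NOT_SLEEP;

	printf("legacy might sleep: %d\n", driver_might_sleep(legacy_driver));	/* 1 */
	printf("fast might sleep:   %d\n", driver_might_sleep(fast_driver));	/* 0 */
	return 0;
}
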
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 67687973ce80..90ed83225668 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
new file mode 100644
index 000000000000..5020f2456685
--- /dev/null
+++ b/kernel/sched/cpufreq_sched.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (C) 2015 Michael Turquette <mturquette@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/percpu.h>
+#include <linux/irq_work.h>
+
+#include "sched.h"
+
+#define THROTTLE_NSEC		50000000 /* 50ms default */
+
+static DEFINE_PER_CPU(unsigned long, pcpu_capacity);
+static DEFINE_PER_CPU(struct cpufreq_policy *, pcpu_policy);
+
+/**
+ * gov_data - per-policy data internal to the governor
+ * @throttle: next throttling period expiry. Derived from throttle_nsec
+ * @throttle_nsec: throttle period length in nanoseconds
+ * @task: worker thread for dvfs transition that may block/sleep
+ * @irq_work: callback used to wake up worker thread
+ * @freq: new frequency stored in cpufreq_sched_set_cap and used in
+ *	cpufreq_sched_thread
+ *
+ * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
+ * per-policy instance of it is created when the cpufreq_sched governor
+ * receives the CPUFREQ_GOV_START condition and a pointer to it exists in the
+ * governor_data member of struct cpufreq_policy.
+ *
+ * Readers of this data must call down_read(policy->rwsem). Writers must
+ * call down_write(policy->rwsem).
+ */
+struct gov_data {
+	ktime_t throttle;
+	unsigned int throttle_nsec;
+	struct task_struct *task;
+	struct irq_work irq_work;
+	struct cpufreq_policy *policy;
+	unsigned int freq;
+};
+
+static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
+					    unsigned int freq)
+{
+	struct gov_data *gd = policy->governor_data;
+
+	/* avoid race with cpufreq_sched_stop() */
+	if (!down_write_trylock(&policy->rwsem))
+		return;
+
+	__cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
+
+	gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
+	up_write(&policy->rwsem);
+}
+
+/*
+ * Passing in struct cpufreq_policy is safe here: swapping out the policy
+ * requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), which
+ * tears down all of the data structures, and __cpufreq_governor(policy,
+ * CPUFREQ_GOV_START) does a full rebuild, including this kthread with the
+ * new policy pointer.
+ */
+static int cpufreq_sched_thread(void *data)
+{
+	struct sched_param param;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	int ret;
+
+	policy = (struct cpufreq_policy *) data;
+	if (!policy) {
+		pr_warn("%s: missing policy\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	gd = policy->governor_data;
+	if (!gd) {
+		pr_warn("%s: missing governor data\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	param.sched_priority = 50;
+	ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		do_exit(-EINVAL);
+	} else {
+		pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+				__func__, gd->task->pid);
+	}
+
+	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
+	if (ret) {
+		pr_warn("%s: failed to set allowed ptr\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	/* main loop of the per-policy kthread */
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+		if (kthread_should_stop())
+			break;
+
+		cpufreq_sched_try_driver_target(policy, gd->freq);
+	} while (!kthread_should_stop());
+
+	do_exit(0);
+}
+
+static void cpufreq_sched_irq_work(struct irq_work *irq_work)
+{
+	struct gov_data *gd;
+
+	gd = container_of(irq_work, struct gov_data, irq_work);
+	if (!gd)
+		return;
+
+	wake_up_process(gd->task);
+}
+
+/**
+ * cpufreq_sched_set_cap - interface to scheduler for changing capacity values
+ * @cpu: cpu whose capacity utilization has recently changed
+ * @capacity: the new capacity requested by cpu
+ *
+ * cpufreq_sched_set_cap is an interface exposed to the scheduler so that the
+ * scheduler may inform the governor of updates to capacity utilization and
+ * make changes to cpu frequency. Currently this interface is designed around
+ * PELT values in CFS. It can be expanded to other scheduling classes in the
+ * future if needed.
+ *
+ * cpufreq_sched_set_cap raises an IPI. The irq_work handler for that IPI
+ * wakes up the thread that does the actual work, cpufreq_sched_thread.
+ *
+ * This function bails out early if either condition is true:
+ * 1) this cpu did not request the new maximum capacity for its frequency
+ *    domain
+ * 2) no change in cpu frequency is necessary to meet the new capacity request
+ */
+void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
+{
+	unsigned int freq_new, cpu_tmp;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	unsigned long capacity_max = 0;
+
+	/* update per-cpu capacity request */
+	__this_cpu_write(pcpu_capacity, capacity);
+
+	policy = cpufreq_cpu_get(cpu);
+	if (IS_ERR_OR_NULL(policy))
+		return;
+
+	if (!policy->governor_data)
+		goto out;
+
+	gd = policy->governor_data;
+
+	/* bail early if we are throttled */
+	if (ktime_before(ktime_get(), gd->throttle))
+		goto out;
+
+	/* find max capacity requested by cpus in this policy */
+	for_each_cpu(cpu_tmp, policy->cpus)
+		capacity_max = max(capacity_max, per_cpu(pcpu_capacity, cpu_tmp));
+
+	/*
+	 * We only change frequency if this cpu's capacity request represents
+	 * a new max. If another cpu has requested a capacity greater than the
+	 * previous max then we rely on that cpu to hit this code path and
+	 * make the change. IOW, the cpu with the new max capacity is
+	 * responsible for setting the new capacity/frequency.
+	 *
+	 * If this cpu is not the new maximum then bail.
+	 */
+	if (capacity_max > capacity)
+		goto out;
+
+	/* Convert the new maximum capacity request into a cpu frequency */
+	freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
+
+	/* No change in frequency? Bail and return current capacity. */
+	if (freq_new == policy->cur)
+		goto out;
+
+	/* store the new frequency and perform the transition */
+	gd->freq = freq_new;
+
+	if (cpufreq_driver_might_sleep())
+		irq_work_queue_on(&gd->irq_work, cpu);
+	else
+		cpufreq_sched_try_driver_target(policy, freq_new);
+
+out:
+	cpufreq_cpu_put(policy);
+}
+
+static int cpufreq_sched_start(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+	int cpu;
+
+	/* prepare per-policy private data */
+	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+	if (!gd) {
+		pr_debug("%s: failed to allocate private data\n", __func__);
+		return -ENOMEM;
+	}
+
+	/* initialize per-cpu data */
+	for_each_cpu(cpu, policy->cpus) {
+		per_cpu(pcpu_capacity, cpu) = 0;
+		per_cpu(pcpu_policy, cpu) = policy;
+	}
+
+	/*
+	 * Don't ask for freq changes at a higher rate than what
+	 * the driver advertises as transition latency.
+	 */
+	gd->throttle_nsec = policy->cpuinfo.transition_latency ?
+			    policy->cpuinfo.transition_latency :
+			    THROTTLE_NSEC;
+	pr_debug("%s: throttle threshold = %u [ns]\n",
+		  __func__, gd->throttle_nsec);
+
+	if (cpufreq_driver_might_sleep()) {
+		/* init per-policy kthread */
+		gd->task = kthread_run(cpufreq_sched_thread, policy,
+				       "kcpufreq_sched_task");
+		if (IS_ERR_OR_NULL(gd->task)) {
+			pr_err("%s: failed to create kcpufreq_sched_task thread\n",
+			       __func__);
+			goto err;
+		}
+		init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
+	}
+
+	policy->governor_data = gd;
+	gd->policy = policy;
+	return 0;
+
+err:
+	kfree(gd);
+	return -ENOMEM;
+}
+
+static int cpufreq_sched_stop(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd = policy->governor_data;
+
+	if (cpufreq_driver_might_sleep())
+		kthread_stop(gd->task);
+
+	policy->governor_data = NULL;
+
+	/* FIXME replace with devm counterparts? */
+	kfree(gd);
+	return 0;
+}
+
+static int cpufreq_sched_setup(struct cpufreq_policy *policy,
+			       unsigned int event)
+{
+	switch (event) {
+	case CPUFREQ_GOV_START:
+		/* Start managing the frequency */
+		return cpufreq_sched_start(policy);
+
+	case CPUFREQ_GOV_STOP:
+		return cpufreq_sched_stop(policy);
+
+	case CPUFREQ_GOV_LIMITS:	/* unused */
+	case CPUFREQ_GOV_POLICY_INIT:	/* unused */
+	case CPUFREQ_GOV_POLICY_EXIT:	/* unused */
+		break;
+	}
+	return 0;
+}
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
+static
+#endif
+struct cpufreq_governor cpufreq_gov_sched = {
+	.name		= "sched",
+	.governor	= cpufreq_sched_setup,
+	.owner		= THIS_MODULE,
+};
+
+static int __init cpufreq_sched_init(void)
+{
+	return cpufreq_register_governor(&cpufreq_gov_sched);
+}
+
+static void __exit cpufreq_sched_exit(void)
+{
+	cpufreq_unregister_governor(&cpufreq_gov_sched);
+}
+
+/* Try to make this the default governor */
+fs_initcall(cpufreq_sched_init);
+
+MODULE_LICENSE("GPL v2");
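
Stripped of locking and per-cpu plumbing, the request path in
cpufreq_sched_set_cap() reduces to three gates: the throttle window, the
"does this cpu hold the domain maximum?" check, and the capacity-to-frequency
conversion. A condensed user-space sketch of that decision; the helper name
and all input values are invented for illustration:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10

/*
 * Condensed model of cpufreq_sched_set_cap(): returns the frequency the
 * governor would request, or the current one when it bails out. The
 * parameters stand in for state the kernel tracks per policy.
 */
static unsigned long pick_freq(unsigned long my_capacity,
			       unsigned long domain_max_capacity,
			       unsigned long policy_max_khz,
			       unsigned long policy_cur_khz,
			       int throttled)
{
	unsigned long freq_new;

	if (throttled)				/* inside gd->throttle window */
		return policy_cur_khz;

	if (domain_max_capacity > my_capacity)	/* another cpu owns the max */
		return policy_cur_khz;

	/* same conversion as the kernel code: freq = cap * max / 1024 */
	freq_new = my_capacity * policy_max_khz >> SCHED_CAPACITY_SHIFT;

	return freq_new;
}

int main(void)
{
	/* capacity 512/1024 on a 1 GHz policy -> request 500 MHz */
	printf("%lu kHz\n", pick_freq(512, 512, 1000000, 1000000, 0));
	return 0;
}

Whether that request is applied inline or handed to the kthread then depends
only on cpufreq_driver_might_sleep(), as shown at the end of the real
function.
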
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 858b94ab1bd2..d313c3536de0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4067,6 +4067,28 @@ static inline void hrtick_update(struct rq *rq)
 }
 #endif
 
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED
+static void dvfs_kick_needed(struct rq *rq)
+{
+	unsigned long utilization, capacity = 0;
+
+	if (rq->cfs.h_nr_running) {
+		/* add 25% margin to current utilization */
+		utilization = rq->cfs.avg.util_avg;
+		capacity = utilization + (utilization >> 2);
+
+		/* handle rounding errors */
+		capacity = (capacity > SCHED_LOAD_SCALE) ? SCHED_LOAD_SCALE :
+			   capacity;
+	}
+
+	cpufreq_sched_set_cap(cpu_of(rq), capacity);
+}
+#else
+static inline void dvfs_kick_needed(struct rq *rq) {}
+#endif
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -4111,6 +4133,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se)
 		add_nr_running(rq, 1);
 
+	if (!(flags & ENQUEUE_WAKEUP) || rq->cfs.h_nr_running == 1)
+		dvfs_kick_needed(rq);
+
 	hrtick_update(rq);
 }
 
@@ -4171,6 +4196,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se)
 		sub_nr_running(rq, 1);
 
+	if (!task_sleep || rq->cfs.h_nr_running == 0)
+		dvfs_kick_needed(rq);
+
 	hrtick_update(rq);
 }
 
@@ -7812,6 +7840,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 
 	if (numabalancing_enabled)
 		task_tick_numa(rq, curr);
+
+	dvfs_kick_needed(rq);
 }
 
 /*
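
dvfs_kick_needed() requests the runqueue's utilization plus 25% headroom
(util + util >> 2), clamped to full scale so rounding can never ask for more
than 100% of capacity. The same arithmetic, worked through with assumed
utilization values:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL

/* Mirrors the margin computation in dvfs_kick_needed() above. */
static unsigned long capacity_request(unsigned long util_avg)
{
	unsigned long capacity = util_avg + (util_avg >> 2);	/* +25% */

	return capacity > SCHED_LOAD_SCALE ? SCHED_LOAD_SCALE : capacity;
}

int main(void)
{
	printf("%lu\n", capacity_request(600));	/* 600 + 150 = 750 */
	printf("%lu\n", capacity_request(900));	/* 1125, clamped to 1024 */
	printf("%lu\n", capacity_request(0));	/* an idle rq requests 0 */
	return 0;
}
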
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ab0b05cc3f37..7776704b238c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1405,6 +1405,13 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 }
 #endif
 
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED
+void cpufreq_sched_set_cap(int cpu, unsigned long util);
+#else
+static inline void cpufreq_sched_set_cap(int cpu, unsigned long util)
+{ }
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
 	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
@@ -1413,6 +1420,7 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 #else
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
+static inline void gov_cfs_update_cpu(int cpu) {}
 #endif
 
 /*