diff options
-rw-r--r-- | drivers/thermal/Kconfig | 10 | ||||
-rw-r--r-- | drivers/thermal/cpu_cooling.c | 479 | ||||
-rw-r--r-- | include/linux/cpu_cooling.h | 6 |
3 files changed, 495 insertions, 0 deletions
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 5aaae1b2436d..6c341177bd7f 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -166,6 +166,16 @@ config CPU_FREQ_THERMAL This will be useful for platforms using the generic thermal interface and not the ACPI interface. +config CPU_IDLE_THERMAL + bool "CPU idle cooling strategy" + depends on CPU_IDLE + help + This implements the generic CPU cooling mechanism through + idle injection. This will throttle the CPU by injecting + fixed idle cycle. All CPUs belonging to the same cluster + will enter idle synchronously to reach the deepest idle + state. + endchoice config CLOCK_THERMAL diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index 5c219dc42e83..1eec8d6862d0 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -10,18 +10,33 @@ * Viresh Kumar <viresh.kumar@linaro.org> * */ +#define pr_fmt(fmt) "CPU cooling: " fmt + #include <linux/module.h> #include <linux/thermal.h> #include <linux/cpufreq.h> +#include <linux/cpuidle.h> #include <linux/err.h> +#include <linux/freezer.h> #include <linux/idr.h> +#include <linux/kthread.h> #include <linux/pm_opp.h> #include <linux/slab.h> +#include <linux/sched/prio.h> +#include <linux/sched/rt.h> +#include <linux/smpboot.h> #include <linux/cpu.h> #include <linux/cpu_cooling.h> +#include <linux/ratelimit.h> + +#include <linux/platform_device.h> +#include <linux/of_platform.h> + #include <trace/events/thermal.h> +#include <uapi/linux/sched/types.h> + #ifdef CONFIG_CPU_FREQ_THERMAL /* * Cooling state <-> CPUFreq frequency @@ -928,3 +943,467 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) } EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister); #endif /* CONFIG_CPU_FREQ_THERMAL */ + +#ifdef CONFIG_CPU_IDLE_THERMAL +/** + * struct cpuidle_cooling_device - data for the idle cooling device + * @cdev: a pointer to a struct thermal_cooling_device + * @cpumask: a cpumask containing the CPU managed by the cooling device + * @timer: a hrtimer giving the tempo for the idle injection cycles + * @kref: a kernel refcount on this structure + * @count: an atomic to keep track of the last task exiting the idle cycle + * @idle_cycle: an integer defining the duration of the idle injection + * @state: an normalized integer giving the state of the cooling device + */ +struct cpuidle_cooling_device { + struct thermal_cooling_device *cdev; + struct cpumask *cpumask; + struct hrtimer timer; + struct kref kref; + atomic_t count; + unsigned int idle_cycle; + unsigned long state; +}; + +struct cpuidle_cooling_thread { + struct task_struct *tsk; + int should_run; +}; + +static DEFINE_PER_CPU(struct cpuidle_cooling_thread, cpuidle_cooling_thread); +static DEFINE_PER_CPU(struct cpuidle_cooling_device *, cpuidle_cooling_device); + +/** + * cpuidle_cooling_wakeup - Wake up all idle injection threads + * @idle_cdev: the idle cooling device + * + * Every idle injection task belonging to the idle cooling device and + * running on an online cpu will be wake up by this call. + */ +static void cpuidle_cooling_wakeup(struct cpuidle_cooling_device *idle_cdev) +{ + struct cpuidle_cooling_thread *cct; + int cpu; + + for_each_cpu_and(cpu, idle_cdev->cpumask, cpu_online_mask) { + cct = per_cpu_ptr(&cpuidle_cooling_thread, cpu); + cct->should_run = 1; + wake_up_process(cct->tsk); + } +} + +/** + * cpuidle_cooling_wakeup_fn - Running cycle timer callback + * @timer: a hrtimer structure + * + * When the mitigation is acting, the CPU is allowed to run an amount + * of time, then the idle injection happens for the specified delay + * and the idle task injection schedules itself until the timer event + * wakes the idle injection tasks again for a new idle injection + * cycle. The time between the end of the idle injection and the timer + * expiration is the allocated running time for the CPU. + * + * Always returns HRTIMER_NORESTART + */ +static enum hrtimer_restart cpuidle_cooling_wakeup_fn(struct hrtimer *timer) +{ + struct cpuidle_cooling_device *idle_cdev = + container_of(timer, struct cpuidle_cooling_device, timer); + + cpuidle_cooling_wakeup(idle_cdev); + + return HRTIMER_NORESTART; +} + +/** + * cpuidle_cooling_runtime - Running time computation + * @idle_cdev: the idle cooling device + * + * The running duration is computed from the idle injection duration + * which is fixed. If we reach 100% of idle injection ratio, that + * means the running duration is zero. If we have a 50% ratio + * injection, that means we have equal duration for idle and for + * running duration. + * + * The formula is deduced as the following: + * + * running = idle x ((100 / ratio) - 1) + * + * For precision purpose for integer math, we use the following: + * + * running = (idle x 100) / ratio - idle + * + * For example, if we have an injected duration of 50%, then we end up + * with 10ms of idle injection and 10ms of running duration. + * + * Returns a s64 nanosecond based + */ +static s64 cpuidle_cooling_runtime(struct cpuidle_cooling_device *idle_cdev) +{ + s64 next_wakeup; + unsigned long state = idle_cdev->state; + + /* + * The function should not be called when there is no + * mitigation because: + * - that does not make sense + * - we end up with a division by zero + */ + if (!state) + return 0; + + next_wakeup = (s64)((idle_cdev->idle_cycle * 100) / state) - + idle_cdev->idle_cycle; + + return next_wakeup * NSEC_PER_USEC; +} + +/** + * cpuidle_cooling_injection - Idle injection mainloop thread function + * @cpu: an integer giving the cpu number the thread is pinned on + * + * This main function does basically two operations: + * + * - Goes idle for a specific amount of time + * + * - Sets a timer to wake up all the idle injection threads after a + * running period + * + * That happens only when the mitigation is enabled, otherwise the + * task is scheduled out. + * + * In order to keep the tasks synchronized together, it is the last + * task exiting the idle period which is in charge of setting the + * timer. + * + * This function never returns. + */ +static void cpuidle_cooling_injection(unsigned int cpu) +{ + s64 next_wakeup; + + struct cpuidle_cooling_device *idle_cdev = + per_cpu(cpuidle_cooling_device, cpu); + + struct cpuidle_cooling_thread *cct = + per_cpu_ptr(&cpuidle_cooling_thread, cpu); + + atomic_inc(&idle_cdev->count); + + cct->should_run = 0; + + play_idle(idle_cdev->idle_cycle / USEC_PER_MSEC); + + /* + * The last CPU waking up is in charge of setting the + * timer. If the CPU is hotplugged, the timer will + * move to another CPU (which may not belong to the + * same cluster) but that is not a problem as the + * timer will be set again by another CPU belonging to + * the cluster, so this mechanism is self adaptive and + * does not require any hotplugging dance. + */ + if (!atomic_dec_and_test(&idle_cdev->count)) + return; + + next_wakeup = cpuidle_cooling_runtime(idle_cdev); + if (next_wakeup) + hrtimer_start(&idle_cdev->timer, ns_to_ktime(next_wakeup), + HRTIMER_MODE_REL_PINNED); +} + +static void cpuidle_cooling_setup(unsigned int cpu) +{ + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 }; + + set_freezable(); + + sched_setscheduler(current, SCHED_FIFO, ¶m); +} + +static int cpuidle_cooling_should_run(unsigned int cpu) +{ + struct cpuidle_cooling_thread *cct = + per_cpu_ptr(&cpuidle_cooling_thread, cpu); + + return cct->should_run; +} + +static struct smp_hotplug_thread cpuidle_cooling_threads = { + .store = &cpuidle_cooling_thread.tsk, + .thread_fn = cpuidle_cooling_injection, + .thread_comm = "thermal-idle/%u", + .thread_should_run = cpuidle_cooling_should_run, + .setup = cpuidle_cooling_setup, +}; + +/** + * cpuidle_cooling_get_max_state - Get the maximum state + * @cdev : the thermal cooling device + * @state : a pointer to the state variable to be filled + * + * The function always gives 100 as the injection ratio is percentile + * based for consistency accros different platforms. + * + * The function can not fail, it always returns zero. + */ +static int cpuidle_cooling_get_max_state(struct thermal_cooling_device *cdev, + unsigned long *state) +{ + /* + * Depending on the configuration or the hardware, the running + * cycle and the idle cycle could be different. We want unify + * that to an 0..100 interval, so the set state interface will + * be the same whatever the platform is. + * + * The state 100% will make the cluster 100% ... idle. A 0% + * injection ratio means no idle injection at all and 50% + * means for 10ms of idle injection, we have 10ms of running + * time. + */ + *state = 100; + + return 0; +} + +/** + * cpuidle_cooling_get_cur_state - Get the current cooling state + * @cdev: the thermal cooling device + * @state: a pointer to the state + * + * The function just copy the state value from the private thermal + * cooling device structure, the mapping is 1 <-> 1. + * + * The function can not fail, it always returns zero. + */ +static int cpuidle_cooling_get_cur_state(struct thermal_cooling_device *cdev, + unsigned long *state) +{ + struct cpuidle_cooling_device *idle_cdev = cdev->devdata; + + *state = idle_cdev->state; + + return 0; +} + +/** + * cpuidle_cooling_set_cur_state - Set the current cooling state + * @cdev: the thermal cooling device + * @state: the target state + * + * The function checks first if we are initiating the mitigation which + * in turn wakes up all the idle injection tasks belonging to the idle + * cooling device. In any case, it updates the internal state for the + * cooling device. + * + * The function can not fail, it always returns zero. + */ +static int cpuidle_cooling_set_cur_state(struct thermal_cooling_device *cdev, + unsigned long state) +{ + struct cpuidle_cooling_device *idle_cdev = cdev->devdata; + unsigned long current_state = idle_cdev->state; + + idle_cdev->state = state; + + if (current_state == 0 && state > 0) { + pr_debug("Starting cooling cpus '%*pbl'\n", + cpumask_pr_args(idle_cdev->cpumask)); + cpuidle_cooling_wakeup(idle_cdev); + } else if (current_state > 0 && !state) { + pr_debug("Stopping cooling cpus '%*pbl'\n", + cpumask_pr_args(idle_cdev->cpumask)); + } + + return 0; +} + +/** + * cpuidle_cooling_ops - thermal cooling device ops + */ +static struct thermal_cooling_device_ops cpuidle_cooling_ops = { + .get_max_state = cpuidle_cooling_get_max_state, + .get_cur_state = cpuidle_cooling_get_cur_state, + .set_cur_state = cpuidle_cooling_set_cur_state, +}; + +/** + * cpuidle_cooling_release - Kref based release helper + * @kref: a pointer to the kref structure + * + * This function is automatically called by the kref_put function when + * the idle cooling device refcount reaches zero. At this point, we + * have the guarantee the structure is no longer in use and we can + * safely release all the ressources. + */ +static void __init cpuidle_cooling_release(struct kref *kref) +{ + struct cpuidle_cooling_device *idle_cdev = + container_of(kref, struct cpuidle_cooling_device, kref); + + if (idle_cdev->cdev) + thermal_cooling_device_unregister(idle_cdev->cdev); + + hrtimer_cancel(&idle_cdev->timer); + kfree(idle_cdev); +} + +/** + * cpuilde_cooling_unregister - Idle cooling device exit function + * + * This function unregisters the cpuidle cooling device and frees the + * ressources previously allocated by the init function. This function + * is called when the initialization fails. + */ +static void __init cpuidle_cooling_unregister(void) +{ + struct cpuidle_cooling_device *idle_cdev; + int cpu; + + for_each_possible_cpu(cpu) { + idle_cdev = per_cpu(cpuidle_cooling_device, cpu); + if (idle_cdev) + kref_put(&idle_cdev->kref, cpuidle_cooling_release); + } +} + + +/** + * cpuidle_cooling_alloc - Allocate and initialize an idle cooling device + * @cpumask: a cpumask containing all the cpus handled by the cooling device + * + * The function is called at init time only. It allocates and + * initializes the different fields of the cpuidle cooling device + * + * It returns a pointer to an cpuidle_cooling_device structure on + * success, NULL on error. + */ +static struct cpuidle_cooling_device * __init cpuidle_cooling_alloc( + cpumask_t *cpumask) +{ + struct cpuidle_cooling_device *idle_cdev; + int cpu; + + idle_cdev = kzalloc(sizeof(*idle_cdev), GFP_KERNEL); + if (!idle_cdev) + return NULL; + + /* + * The idle duration injection. As we don't have yet a way to + * specify from the DT configuration, let's default to a tick + * duration. + */ + idle_cdev->idle_cycle = TICK_USEC; + + /* + * Initialize the timer to wakeup all the idle injection tasks + */ + hrtimer_init(&idle_cdev->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + + /* + * The wakeup function callback which is in charge of waking + * up all CPUs belonging to the same cluster + */ + idle_cdev->timer.function = cpuidle_cooling_wakeup_fn; + + idle_cdev->cpumask = cpumask; + + /* + * Assign on a per cpu basis belonging to the cluster, the per + * cpu cpuidle_cooling_device pointer and increment its + * refcount on it + */ + for_each_cpu(cpu, cpumask) { + kref_get(&idle_cdev->kref); + per_cpu(cpuidle_cooling_device, cpu) = idle_cdev; + } + + return idle_cdev; +} + +/** + * cpuidle_cooling_register - Idle cooling device initialization function + * + * This function is in charge of creating a cooling device per cluster + * and register it to thermal framework. For this we rely on the + * topology as there is nothing yet describing better the idle state + * power domains. + * + * We create a cpuidle cooling device per cluster. For this reason we + * must, for each cluster, allocate and initialize the cooling device + * and for each cpu belonging to this cluster, do the initialization + * on a cpu basis. + * + * This approach for creating the cooling device is needed as we don't + * have the guarantee the CPU numbering is sequential. + * + * Unfortunately, there is no API to browse from top to bottom the + * topology, cluster->cpu, only the usual for_each_possible_cpu loop. + * In order to solve that, we use a cpumask to flag the cluster_id we + * already processed. The cpumask will always have enough room for all + * the cluster because it is based on NR_CPUS and it is not possible + * to have more clusters than cpus. + * + */ +void __init cpuidle_cooling_register(void) +{ + struct cpuidle_cooling_device *idle_cdev = NULL; + struct thermal_cooling_device *cdev; + struct device_node *np; + cpumask_var_t cpumask; + char dev_name[THERMAL_NAME_LENGTH]; + int ret = -ENOMEM, cpu; + int cluster_id; + + if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) + return; + + for_each_possible_cpu(cpu) { + + cluster_id = topology_physical_package_id(cpu); + if (cpumask_test_cpu(cluster_id, cpumask)) + continue; + + /* + * Allocate the cpuidle cooling device with the list + * of the cpus belonging to the cluster. + */ + idle_cdev = cpuidle_cooling_alloc(topology_core_cpumask(cpu)); + if (!idle_cdev) + goto out; + + /* + * The thermal cooling device name, we use the + * cluster_id as the numbering index for the idle + * cooling device. + */ + snprintf(dev_name, sizeof(dev_name), "thermal-idle-%d", + cluster_id); + + np = of_cpu_device_node_get(cpu); + cdev = thermal_of_cooling_device_register(np, dev_name, + idle_cdev, + &cpuidle_cooling_ops); + if (IS_ERR(cdev)) { + ret = PTR_ERR(cdev); + goto out; + } + + idle_cdev->cdev = cdev; + cpumask_set_cpu(cluster_id, cpumask); + } + + ret = smpboot_register_percpu_thread(&cpuidle_cooling_threads); + if (ret) + goto out; + + pr_info("Created cpuidle cooling device\n"); +out: + free_cpumask_var(cpumask); + + if (ret) { + cpuidle_cooling_unregister(); + pr_err("Failed to create idle cooling device (%d)\n", ret); + } +} +#endif /* CONFIG_CPU_IDLE_THERMAL */ diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h index c0accc7a9245..af5520dfd5ca 100644 --- a/include/linux/cpu_cooling.h +++ b/include/linux/cpu_cooling.h @@ -120,4 +120,10 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) } #endif /* CONFIG_CPU_FREQ_THERMAL */ +#ifdef CONFIG_CPU_IDLE_THERMAL +extern void __init cpuidle_cooling_register(void); +#else /* CONFIG_CPU_IDLE_THERMAL */ +static inline void __init cpuidle_cooling_register(void) { } +#endif /* CONFIG_CPU_IDLE_THERMAL */ + #endif /* __CPU_COOLING_H__ */ |