summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/thermal/Kconfig10
-rw-r--r--drivers/thermal/cpu_cooling.c479
-rw-r--r--include/linux/cpu_cooling.h6
3 files changed, 495 insertions, 0 deletions
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 5aaae1b2436d..6c341177bd7f 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -166,6 +166,16 @@ config CPU_FREQ_THERMAL
This will be useful for platforms using the generic thermal interface
and not the ACPI interface.
+config CPU_IDLE_THERMAL
+ bool "CPU idle cooling strategy"
+ depends on CPU_IDLE
+ help
+ This implements the generic CPU cooling mechanism through
+ idle injection. This will throttle the CPU by injecting
+ fixed idle cycle. All CPUs belonging to the same cluster
+ will enter idle synchronously to reach the deepest idle
+ state.
+
endchoice
config CLOCK_THERMAL
diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index 5c219dc42e83..1eec8d6862d0 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -10,18 +10,33 @@
* Viresh Kumar <viresh.kumar@linaro.org>
*
*/
+#define pr_fmt(fmt) "CPU cooling: " fmt
+
#include <linux/module.h>
#include <linux/thermal.h>
#include <linux/cpufreq.h>
+#include <linux/cpuidle.h>
#include <linux/err.h>
+#include <linux/freezer.h>
#include <linux/idr.h>
+#include <linux/kthread.h>
#include <linux/pm_opp.h>
#include <linux/slab.h>
+#include <linux/sched/prio.h>
+#include <linux/sched/rt.h>
+#include <linux/smpboot.h>
#include <linux/cpu.h>
#include <linux/cpu_cooling.h>
+#include <linux/ratelimit.h>
+
+#include <linux/platform_device.h>
+#include <linux/of_platform.h>
+
#include <trace/events/thermal.h>
+#include <uapi/linux/sched/types.h>
+
#ifdef CONFIG_CPU_FREQ_THERMAL
/*
* Cooling state <-> CPUFreq frequency
@@ -928,3 +943,467 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
}
EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);
#endif /* CONFIG_CPU_FREQ_THERMAL */
+
+#ifdef CONFIG_CPU_IDLE_THERMAL
+/**
+ * struct cpuidle_cooling_device - data for the idle cooling device
+ * @cdev: a pointer to a struct thermal_cooling_device
+ * @cpumask: a cpumask containing the CPU managed by the cooling device
+ * @timer: a hrtimer giving the tempo for the idle injection cycles
+ * @kref: a kernel refcount on this structure
+ * @count: an atomic to keep track of the last task exiting the idle cycle
+ * @idle_cycle: an integer defining the duration of the idle injection
+ * @state: an normalized integer giving the state of the cooling device
+ */
+struct cpuidle_cooling_device {
+ struct thermal_cooling_device *cdev;
+ struct cpumask *cpumask;
+ struct hrtimer timer;
+ struct kref kref;
+ atomic_t count;
+ unsigned int idle_cycle;
+ unsigned long state;
+};
+
+struct cpuidle_cooling_thread {
+ struct task_struct *tsk;
+ int should_run;
+};
+
+static DEFINE_PER_CPU(struct cpuidle_cooling_thread, cpuidle_cooling_thread);
+static DEFINE_PER_CPU(struct cpuidle_cooling_device *, cpuidle_cooling_device);
+
+/**
+ * cpuidle_cooling_wakeup - Wake up all idle injection threads
+ * @idle_cdev: the idle cooling device
+ *
+ * Every idle injection task belonging to the idle cooling device and
+ * running on an online cpu will be wake up by this call.
+ */
+static void cpuidle_cooling_wakeup(struct cpuidle_cooling_device *idle_cdev)
+{
+ struct cpuidle_cooling_thread *cct;
+ int cpu;
+
+ for_each_cpu_and(cpu, idle_cdev->cpumask, cpu_online_mask) {
+ cct = per_cpu_ptr(&cpuidle_cooling_thread, cpu);
+ cct->should_run = 1;
+ wake_up_process(cct->tsk);
+ }
+}
+
+/**
+ * cpuidle_cooling_wakeup_fn - Running cycle timer callback
+ * @timer: a hrtimer structure
+ *
+ * When the mitigation is acting, the CPU is allowed to run an amount
+ * of time, then the idle injection happens for the specified delay
+ * and the idle task injection schedules itself until the timer event
+ * wakes the idle injection tasks again for a new idle injection
+ * cycle. The time between the end of the idle injection and the timer
+ * expiration is the allocated running time for the CPU.
+ *
+ * Always returns HRTIMER_NORESTART
+ */
+static enum hrtimer_restart cpuidle_cooling_wakeup_fn(struct hrtimer *timer)
+{
+ struct cpuidle_cooling_device *idle_cdev =
+ container_of(timer, struct cpuidle_cooling_device, timer);
+
+ cpuidle_cooling_wakeup(idle_cdev);
+
+ return HRTIMER_NORESTART;
+}
+
+/**
+ * cpuidle_cooling_runtime - Running time computation
+ * @idle_cdev: the idle cooling device
+ *
+ * The running duration is computed from the idle injection duration
+ * which is fixed. If we reach 100% of idle injection ratio, that
+ * means the running duration is zero. If we have a 50% ratio
+ * injection, that means we have equal duration for idle and for
+ * running duration.
+ *
+ * The formula is deduced as the following:
+ *
+ * running = idle x ((100 / ratio) - 1)
+ *
+ * For precision purpose for integer math, we use the following:
+ *
+ * running = (idle x 100) / ratio - idle
+ *
+ * For example, if we have an injected duration of 50%, then we end up
+ * with 10ms of idle injection and 10ms of running duration.
+ *
+ * Returns a s64 nanosecond based
+ */
+static s64 cpuidle_cooling_runtime(struct cpuidle_cooling_device *idle_cdev)
+{
+ s64 next_wakeup;
+ unsigned long state = idle_cdev->state;
+
+ /*
+ * The function should not be called when there is no
+ * mitigation because:
+ * - that does not make sense
+ * - we end up with a division by zero
+ */
+ if (!state)
+ return 0;
+
+ next_wakeup = (s64)((idle_cdev->idle_cycle * 100) / state) -
+ idle_cdev->idle_cycle;
+
+ return next_wakeup * NSEC_PER_USEC;
+}
+
+/**
+ * cpuidle_cooling_injection - Idle injection mainloop thread function
+ * @cpu: an integer giving the cpu number the thread is pinned on
+ *
+ * This main function does basically two operations:
+ *
+ * - Goes idle for a specific amount of time
+ *
+ * - Sets a timer to wake up all the idle injection threads after a
+ * running period
+ *
+ * That happens only when the mitigation is enabled, otherwise the
+ * task is scheduled out.
+ *
+ * In order to keep the tasks synchronized together, it is the last
+ * task exiting the idle period which is in charge of setting the
+ * timer.
+ *
+ * This function never returns.
+ */
+static void cpuidle_cooling_injection(unsigned int cpu)
+{
+ s64 next_wakeup;
+
+ struct cpuidle_cooling_device *idle_cdev =
+ per_cpu(cpuidle_cooling_device, cpu);
+
+ struct cpuidle_cooling_thread *cct =
+ per_cpu_ptr(&cpuidle_cooling_thread, cpu);
+
+ atomic_inc(&idle_cdev->count);
+
+ cct->should_run = 0;
+
+ play_idle(idle_cdev->idle_cycle / USEC_PER_MSEC);
+
+ /*
+ * The last CPU waking up is in charge of setting the
+ * timer. If the CPU is hotplugged, the timer will
+ * move to another CPU (which may not belong to the
+ * same cluster) but that is not a problem as the
+ * timer will be set again by another CPU belonging to
+ * the cluster, so this mechanism is self adaptive and
+ * does not require any hotplugging dance.
+ */
+ if (!atomic_dec_and_test(&idle_cdev->count))
+ return;
+
+ next_wakeup = cpuidle_cooling_runtime(idle_cdev);
+ if (next_wakeup)
+ hrtimer_start(&idle_cdev->timer, ns_to_ktime(next_wakeup),
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static void cpuidle_cooling_setup(unsigned int cpu)
+{
+ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 };
+
+ set_freezable();
+
+ sched_setscheduler(current, SCHED_FIFO, &param);
+}
+
+static int cpuidle_cooling_should_run(unsigned int cpu)
+{
+ struct cpuidle_cooling_thread *cct =
+ per_cpu_ptr(&cpuidle_cooling_thread, cpu);
+
+ return cct->should_run;
+}
+
+static struct smp_hotplug_thread cpuidle_cooling_threads = {
+ .store = &cpuidle_cooling_thread.tsk,
+ .thread_fn = cpuidle_cooling_injection,
+ .thread_comm = "thermal-idle/%u",
+ .thread_should_run = cpuidle_cooling_should_run,
+ .setup = cpuidle_cooling_setup,
+};
+
+/**
+ * cpuidle_cooling_get_max_state - Get the maximum state
+ * @cdev : the thermal cooling device
+ * @state : a pointer to the state variable to be filled
+ *
+ * The function always gives 100 as the injection ratio is percentile
+ * based for consistency accros different platforms.
+ *
+ * The function can not fail, it always returns zero.
+ */
+static int cpuidle_cooling_get_max_state(struct thermal_cooling_device *cdev,
+ unsigned long *state)
+{
+ /*
+ * Depending on the configuration or the hardware, the running
+ * cycle and the idle cycle could be different. We want unify
+ * that to an 0..100 interval, so the set state interface will
+ * be the same whatever the platform is.
+ *
+ * The state 100% will make the cluster 100% ... idle. A 0%
+ * injection ratio means no idle injection at all and 50%
+ * means for 10ms of idle injection, we have 10ms of running
+ * time.
+ */
+ *state = 100;
+
+ return 0;
+}
+
+/**
+ * cpuidle_cooling_get_cur_state - Get the current cooling state
+ * @cdev: the thermal cooling device
+ * @state: a pointer to the state
+ *
+ * The function just copy the state value from the private thermal
+ * cooling device structure, the mapping is 1 <-> 1.
+ *
+ * The function can not fail, it always returns zero.
+ */
+static int cpuidle_cooling_get_cur_state(struct thermal_cooling_device *cdev,
+ unsigned long *state)
+{
+ struct cpuidle_cooling_device *idle_cdev = cdev->devdata;
+
+ *state = idle_cdev->state;
+
+ return 0;
+}
+
+/**
+ * cpuidle_cooling_set_cur_state - Set the current cooling state
+ * @cdev: the thermal cooling device
+ * @state: the target state
+ *
+ * The function checks first if we are initiating the mitigation which
+ * in turn wakes up all the idle injection tasks belonging to the idle
+ * cooling device. In any case, it updates the internal state for the
+ * cooling device.
+ *
+ * The function can not fail, it always returns zero.
+ */
+static int cpuidle_cooling_set_cur_state(struct thermal_cooling_device *cdev,
+ unsigned long state)
+{
+ struct cpuidle_cooling_device *idle_cdev = cdev->devdata;
+ unsigned long current_state = idle_cdev->state;
+
+ idle_cdev->state = state;
+
+ if (current_state == 0 && state > 0) {
+ pr_debug("Starting cooling cpus '%*pbl'\n",
+ cpumask_pr_args(idle_cdev->cpumask));
+ cpuidle_cooling_wakeup(idle_cdev);
+ } else if (current_state > 0 && !state) {
+ pr_debug("Stopping cooling cpus '%*pbl'\n",
+ cpumask_pr_args(idle_cdev->cpumask));
+ }
+
+ return 0;
+}
+
+/**
+ * cpuidle_cooling_ops - thermal cooling device ops
+ */
+static struct thermal_cooling_device_ops cpuidle_cooling_ops = {
+ .get_max_state = cpuidle_cooling_get_max_state,
+ .get_cur_state = cpuidle_cooling_get_cur_state,
+ .set_cur_state = cpuidle_cooling_set_cur_state,
+};
+
+/**
+ * cpuidle_cooling_release - Kref based release helper
+ * @kref: a pointer to the kref structure
+ *
+ * This function is automatically called by the kref_put function when
+ * the idle cooling device refcount reaches zero. At this point, we
+ * have the guarantee the structure is no longer in use and we can
+ * safely release all the ressources.
+ */
+static void __init cpuidle_cooling_release(struct kref *kref)
+{
+ struct cpuidle_cooling_device *idle_cdev =
+ container_of(kref, struct cpuidle_cooling_device, kref);
+
+ if (idle_cdev->cdev)
+ thermal_cooling_device_unregister(idle_cdev->cdev);
+
+ hrtimer_cancel(&idle_cdev->timer);
+ kfree(idle_cdev);
+}
+
+/**
+ * cpuilde_cooling_unregister - Idle cooling device exit function
+ *
+ * This function unregisters the cpuidle cooling device and frees the
+ * ressources previously allocated by the init function. This function
+ * is called when the initialization fails.
+ */
+static void __init cpuidle_cooling_unregister(void)
+{
+ struct cpuidle_cooling_device *idle_cdev;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ idle_cdev = per_cpu(cpuidle_cooling_device, cpu);
+ if (idle_cdev)
+ kref_put(&idle_cdev->kref, cpuidle_cooling_release);
+ }
+}
+
+
+/**
+ * cpuidle_cooling_alloc - Allocate and initialize an idle cooling device
+ * @cpumask: a cpumask containing all the cpus handled by the cooling device
+ *
+ * The function is called at init time only. It allocates and
+ * initializes the different fields of the cpuidle cooling device
+ *
+ * It returns a pointer to an cpuidle_cooling_device structure on
+ * success, NULL on error.
+ */
+static struct cpuidle_cooling_device * __init cpuidle_cooling_alloc(
+ cpumask_t *cpumask)
+{
+ struct cpuidle_cooling_device *idle_cdev;
+ int cpu;
+
+ idle_cdev = kzalloc(sizeof(*idle_cdev), GFP_KERNEL);
+ if (!idle_cdev)
+ return NULL;
+
+ /*
+ * The idle duration injection. As we don't have yet a way to
+ * specify from the DT configuration, let's default to a tick
+ * duration.
+ */
+ idle_cdev->idle_cycle = TICK_USEC;
+
+ /*
+ * Initialize the timer to wakeup all the idle injection tasks
+ */
+ hrtimer_init(&idle_cdev->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+
+ /*
+ * The wakeup function callback which is in charge of waking
+ * up all CPUs belonging to the same cluster
+ */
+ idle_cdev->timer.function = cpuidle_cooling_wakeup_fn;
+
+ idle_cdev->cpumask = cpumask;
+
+ /*
+ * Assign on a per cpu basis belonging to the cluster, the per
+ * cpu cpuidle_cooling_device pointer and increment its
+ * refcount on it
+ */
+ for_each_cpu(cpu, cpumask) {
+ kref_get(&idle_cdev->kref);
+ per_cpu(cpuidle_cooling_device, cpu) = idle_cdev;
+ }
+
+ return idle_cdev;
+}
+
+/**
+ * cpuidle_cooling_register - Idle cooling device initialization function
+ *
+ * This function is in charge of creating a cooling device per cluster
+ * and register it to thermal framework. For this we rely on the
+ * topology as there is nothing yet describing better the idle state
+ * power domains.
+ *
+ * We create a cpuidle cooling device per cluster. For this reason we
+ * must, for each cluster, allocate and initialize the cooling device
+ * and for each cpu belonging to this cluster, do the initialization
+ * on a cpu basis.
+ *
+ * This approach for creating the cooling device is needed as we don't
+ * have the guarantee the CPU numbering is sequential.
+ *
+ * Unfortunately, there is no API to browse from top to bottom the
+ * topology, cluster->cpu, only the usual for_each_possible_cpu loop.
+ * In order to solve that, we use a cpumask to flag the cluster_id we
+ * already processed. The cpumask will always have enough room for all
+ * the cluster because it is based on NR_CPUS and it is not possible
+ * to have more clusters than cpus.
+ *
+ */
+void __init cpuidle_cooling_register(void)
+{
+ struct cpuidle_cooling_device *idle_cdev = NULL;
+ struct thermal_cooling_device *cdev;
+ struct device_node *np;
+ cpumask_var_t cpumask;
+ char dev_name[THERMAL_NAME_LENGTH];
+ int ret = -ENOMEM, cpu;
+ int cluster_id;
+
+ if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return;
+
+ for_each_possible_cpu(cpu) {
+
+ cluster_id = topology_physical_package_id(cpu);
+ if (cpumask_test_cpu(cluster_id, cpumask))
+ continue;
+
+ /*
+ * Allocate the cpuidle cooling device with the list
+ * of the cpus belonging to the cluster.
+ */
+ idle_cdev = cpuidle_cooling_alloc(topology_core_cpumask(cpu));
+ if (!idle_cdev)
+ goto out;
+
+ /*
+ * The thermal cooling device name, we use the
+ * cluster_id as the numbering index for the idle
+ * cooling device.
+ */
+ snprintf(dev_name, sizeof(dev_name), "thermal-idle-%d",
+ cluster_id);
+
+ np = of_cpu_device_node_get(cpu);
+ cdev = thermal_of_cooling_device_register(np, dev_name,
+ idle_cdev,
+ &cpuidle_cooling_ops);
+ if (IS_ERR(cdev)) {
+ ret = PTR_ERR(cdev);
+ goto out;
+ }
+
+ idle_cdev->cdev = cdev;
+ cpumask_set_cpu(cluster_id, cpumask);
+ }
+
+ ret = smpboot_register_percpu_thread(&cpuidle_cooling_threads);
+ if (ret)
+ goto out;
+
+ pr_info("Created cpuidle cooling device\n");
+out:
+ free_cpumask_var(cpumask);
+
+ if (ret) {
+ cpuidle_cooling_unregister();
+ pr_err("Failed to create idle cooling device (%d)\n", ret);
+ }
+}
+#endif /* CONFIG_CPU_IDLE_THERMAL */
diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h
index c0accc7a9245..af5520dfd5ca 100644
--- a/include/linux/cpu_cooling.h
+++ b/include/linux/cpu_cooling.h
@@ -120,4 +120,10 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
}
#endif /* CONFIG_CPU_FREQ_THERMAL */
+#ifdef CONFIG_CPU_IDLE_THERMAL
+extern void __init cpuidle_cooling_register(void);
+#else /* CONFIG_CPU_IDLE_THERMAL */
+static inline void __init cpuidle_cooling_register(void) { }
+#endif /* CONFIG_CPU_IDLE_THERMAL */
+
#endif /* __CPU_COOLING_H__ */