From b9e7900a8ad0dc9ffe416567841cb606f1689133 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 14 Oct 2013 12:38:19 +0100 Subject: smp: Don't use typedef to work around compiler issue with tracepoints Having the typedef in place for the tracepoints causes compiler crashes in some situations. Just using void * directly avoids triggering the issue and should have no effect on the trace. Signed-off-by: Mark Brown Acked-by: Liviu Dudau --- include/trace/events/smp.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/trace/events/smp.h b/include/trace/events/smp.h index c8abfd74472..da0baf27a39 100644 --- a/include/trace/events/smp.h +++ b/include/trace/events/smp.h @@ -5,11 +5,10 @@ #define _TRACE_SMP_H #include -typedef void (*__smp_call_func_t)(void *info); DECLARE_EVENT_CLASS(smp_call_class, - TP_PROTO(__smp_call_func_t fnc), + TP_PROTO(void * fnc), TP_ARGS(fnc), @@ -35,7 +34,7 @@ DECLARE_EVENT_CLASS(smp_call_class, */ DEFINE_EVENT(smp_call_class, smp_call_func_entry, - TP_PROTO(__smp_call_func_t fnc), + TP_PROTO(void * fnc), TP_ARGS(fnc) ); @@ -51,7 +50,7 @@ DEFINE_EVENT(smp_call_class, smp_call_func_entry, */ DEFINE_EVENT(smp_call_class, smp_call_func_exit, - TP_PROTO(__smp_call_func_t fnc), + TP_PROTO(void * fnc), TP_ARGS(fnc) ); @@ -67,7 +66,7 @@ DEFINE_EVENT(smp_call_class, smp_call_func_exit, */ TRACE_EVENT(smp_call_func_send, - TP_PROTO(__smp_call_func_t func, int dest), + TP_PROTO(void * func, int dest), TP_ARGS(func, dest), -- cgit v1.2.3 From bd40e205ea48665ed60499001f8edf802e2901e2 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 14 Oct 2013 13:37:36 +0100 Subject: arm64: Fix build due to HMP tracepoints Commit 2353c1f800 (arm: ipi raise/start/end tracing) added tracepoints for IPIs in the generic GIC driver but only added definitions for them on ARM, causing build failures on ARM64. Fix this by adding equivalent definitions for arm64. Signed-off-by: Mark Brown Acked-by: Liviu Dudau --- arch/arm64/kernel/smp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 5d54e3717bf..a28d2859932 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -48,6 +48,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core -- cgit v1.2.3 From 2a68d1e9125582bedeac4ea34fb9901ab1f7de11 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Wed, 20 Nov 2013 14:20:42 +0000 Subject: HMP: Avoid using the cpu stopper to stop runnable tasks When migrating a runnable task, we use the CPU stopper on the source CPU to ensure that the task to be moved is not currently running. Before this patch, all forced migrations (up, offload, idle pull) use the stopper for every migration. Using the CPU stopper is mandatory only when a task is currently running on a CPU. Otherwise tasks can be moved by locking the source and destination run queues. This patch checks to see if the task to be moved are currently running. If not the task is moved directly without using the stopper thread. 
Signed-off-by: Mathieu Poirier Signed-off-by: Jon Medhurst --- include/trace/events/sched.h | 49 ++++++++++++++++++++ kernel/sched/fair.c | 107 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 147 insertions(+), 9 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 66dc53bca19..2afcb71857f 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -579,6 +579,55 @@ TRACE_EVENT(sched_task_usage_ratio, __entry->ratio) ); +/* + * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations, + * marking the forced transition of runnable or running tasks. + */ +TRACE_EVENT(sched_hmp_migrate_force_running, + + TP_PROTO(struct task_struct *tsk, int running), + + TP_ARGS(tsk, running), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(int, running) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->running = running; + ), + + TP_printk("running=%d comm=%s", + __entry->running, __entry->comm) +); + +/* + * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations, + * marking the forced transition of runnable or running + * tasks when a task is about to go idle. + */ +TRACE_EVENT(sched_hmp_migrate_idle_running, + + TP_PROTO(struct task_struct *tsk, int running), + + TP_ARGS(tsk, running), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(int, running) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->running = running; + ), + + TP_printk("running=%d comm=%s", + __entry->running, __entry->comm) +); + /* * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 66b5b30159f..cab90d6f32d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6914,6 +6914,69 @@ out_unlock: return 0; } +/* + * Move task in a runnable state to another CPU. + * + * Tailored on 'active_load_balance_stop_cpu' with slight + * modification to locking and pre-transfer checks. Note + * rq->lock must be held before calling. + */ +static void hmp_migrate_runnable_task(struct rq *rq) +{ + struct sched_domain *sd; + int src_cpu = cpu_of(rq); + struct rq *src_rq = rq; + int dst_cpu = rq->push_cpu; + struct rq *dst_rq = cpu_rq(dst_cpu); + struct task_struct *p = rq->migrate_task; + /* + * One last check to make sure nobody else is playing + * with the source rq. 
+ */ + if (src_rq->active_balance) + return; + + if (src_rq->nr_running <= 1) + return; + + if (task_rq(p) != src_rq) + return; + /* + * Not sure if this applies here but one can never + * be too cautious + */ + BUG_ON(src_rq == dst_rq); + + double_lock_balance(src_rq, dst_rq); + + rcu_read_lock(); + for_each_domain(dst_cpu, sd) { + if (cpumask_test_cpu(src_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .dst_cpu = dst_cpu, + .dst_rq = dst_rq, + .src_cpu = src_cpu, + .src_rq = src_rq, + .idle = CPU_IDLE, + }; + + schedstat_inc(sd, alb_count); + + if (move_specific_task(&env, p)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + + rcu_read_unlock(); + double_unlock_balance(src_rq, dst_rq); +} + static DEFINE_SPINLOCK(hmp_force_migration); /* @@ -6926,13 +6989,14 @@ static void hmp_force_up_migration(int this_cpu) struct sched_entity *curr, *orig; struct rq *target; unsigned long flags; - unsigned int force; + unsigned int force, got_target; struct task_struct *p; if (!spin_trylock(&hmp_force_migration)) return; for_each_online_cpu(cpu) { force = 0; + got_target = 0; target = cpu_rq(cpu); raw_spin_lock_irqsave(&target->lock, flags); curr = target->cfs.curr; @@ -6955,15 +7019,14 @@ static void hmp_force_up_migration(int this_cpu) if (hmp_up_migration(cpu, &target_cpu, curr)) { if (!target->active_balance) { get_task_struct(p); - target->active_balance = 1; target->push_cpu = target_cpu; target->migrate_task = p; - force = 1; + got_target = 1; trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE); hmp_next_up_delay(&p->se, target->push_cpu); } } - if (!force && !target->active_balance) { + if (!got_target && !target->active_balance) { /* * For now we just check the currently running task. * Selecting the lightest task for offloading will @@ -6974,14 +7037,29 @@ static void hmp_force_up_migration(int this_cpu) target->push_cpu = hmp_offload_down(cpu, curr); if (target->push_cpu < NR_CPUS) { get_task_struct(p); - target->active_balance = 1; target->migrate_task = p; - force = 1; + got_target = 1; trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_OFFLOAD); hmp_next_down_delay(&p->se, target->push_cpu); } } + /* + * We have a target with no active_balance. If the task + * is not currently running move it, otherwise let the + * CPU stopper take care of it. + */ + if (got_target && !target->active_balance) { + if (!task_running(target, p)) { + trace_sched_hmp_migrate_force_running(p, 0); + hmp_migrate_runnable_task(target); + } else { + target->active_balance = 1; + force = 1; + } + } + raw_spin_unlock_irqrestore(&target->lock, flags); + if (force) stop_one_cpu_nowait(cpu_of(target), hmp_active_task_migration_cpu_stop, @@ -7001,7 +7079,7 @@ static unsigned int hmp_idle_pull(int this_cpu) int cpu; struct sched_entity *curr, *orig; struct hmp_domain *hmp_domain = NULL; - struct rq *target, *rq; + struct rq *target = NULL, *rq; unsigned long flags, ratio = 0; unsigned int force = 0; struct task_struct *p = NULL; @@ -7053,14 +7131,25 @@ static unsigned int hmp_idle_pull(int this_cpu) raw_spin_lock_irqsave(&target->lock, flags); if (!target->active_balance && task_rq(p) == target) { get_task_struct(p); - target->active_balance = 1; target->push_cpu = this_cpu; target->migrate_task = p; - force = 1; trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_IDLE_PULL); hmp_next_up_delay(&p->se, target->push_cpu); + /* + * if the task isn't running move it right away. 
+ * Otherwise setup the active_balance mechanic and let + * the CPU stopper do its job. + */ + if (!task_running(target, p)) { + trace_sched_hmp_migrate_idle_running(p, 0); + hmp_migrate_runnable_task(target); + } else { + target->active_balance = 1; + force = 1; + } } raw_spin_unlock_irqrestore(&target->lock, flags); + if (force) { stop_one_cpu_nowait(cpu_of(target), hmp_idle_pull_cpu_stop, -- cgit v1.2.3 From 0b877c2baac65994016c6812804d1b30e89c18ed Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 11 Nov 2013 16:29:29 +0000 Subject: sched: hmp: add read-only hmp domain sysfs file In order to allow userspace to restrict known low-load tasks to little CPUs, we must export this knowledge from the kernel or expect userspace to make their own attempts at figuring it out. Since we now have a userspace requirement for an HMP implementation to always have at least some sysfs files, change the integration so that it only depends upon CONFIG_SCHED_HMP rather than CONFIG_HMP_VARIABLE_SCALE. Fix Kconfig text to match. Signed-off-by: Chris Redpath Signed-off-by: Jon Medhurst --- arch/arm/Kconfig | 31 +++++++++------ kernel/sched/fair.c | 111 +++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 98 insertions(+), 44 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e79dfda6644..6cbe972ead2 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1510,6 +1510,17 @@ config SCHED_HMP There is currently no support for migration of task groups, hence !SCHED_AUTOGROUP. Furthermore, normal load-balancing must be disabled between cpus of different type (DISABLE_CPU_SCHED_DOMAIN_BALANCE). + When turned on, this option adds sys/kernel/hmp directory which + contains the following files: + up_threshold - the load average threshold used for up migration + (0 - 1023) + down_threshold - the load average threshold used for down migration + (0 - 1023) + hmp_domains - a list of cpumasks for the present HMP domains, + starting with the 'biggest' and ending with the + 'smallest'. + Note that both the threshold files can be written at runtime to + control scheduler behaviour. config SCHED_HMP_PRIO_FILTER bool "(EXPERIMENTAL) Filter HMP migrations by task priority" @@ -1544,28 +1555,24 @@ config HMP_VARIABLE_SCALE bool "Allows changing the load tracking scale through sysfs" depends on SCHED_HMP help - When turned on, this option exports the thresholds and load average - period value for the load tracking patches through sysfs. + When turned on, this option exports the load average period value + for the load tracking patches through sysfs. The values can be modified to change the rate of load accumulation - and the thresholds used for HMP migration. - The load_avg_period_ms is the time in ms to reach a load average of - 0.5 for an idle task of 0 load average ratio that start a busy loop. - The up_threshold and down_threshold is the value to go to a faster - CPU or to go back to a slower cpu. - The {up,down}_threshold are devided by 1024 before being compared - to the load average. - For examples, with load_avg_period_ms = 128 and up_threshold = 512, + used for HMP migration. 'load_avg_period_ms' is the time in ms to + reach a load average of 0.5 for an idle task of 0 load average + ratio which becomes 100% busy. + For example, with load_avg_period_ms = 128 and up_threshold = 512, a running task with a load of 0 will be migrated to a bigger CPU after 128ms, because after 128ms its load_avg_ratio is 0.5 and the real up_threshold is 0.5. 
This patch has the same behavior as changing the Y of the load average computation to (1002/1024)^(LOAD_AVG_PERIOD/load_avg_period_ms) - but it remove intermadiate overflows in computation. + but removes intermediate overflows in computation. config HMP_FREQUENCY_INVARIANT_SCALE bool "(EXPERIMENTAL) Frequency-Invariant Tracked Load for HMP" - depends on HMP_VARIABLE_SCALE && CPU_FREQ + depends on SCHED_HMP && CPU_FREQ help Scales the current load contribution in line with the frequency of the CPU that the task was executed on. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cab90d6f32d..c7d808ee0a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -31,7 +31,6 @@ #include #include -#ifdef CONFIG_HMP_VARIABLE_SCALE #include #include #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE @@ -40,7 +39,6 @@ */ #include #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ -#endif /* CONFIG_HMP_VARIABLE_SCALE */ #include "sched.h" @@ -1212,8 +1210,6 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } -#ifdef CONFIG_HMP_VARIABLE_SCALE - #define HMP_VARIABLE_SCALE_SHIFT 16ULL struct hmp_global_attr { struct attribute attr; @@ -1224,6 +1220,7 @@ struct hmp_global_attr { int *value; int (*to_sysfs)(int); int (*from_sysfs)(int); + ssize_t (*to_sysfs_text)(char *buf, int buf_size); }; #define HMP_DATA_SYSFS_MAX 8 @@ -1294,7 +1291,6 @@ struct cpufreq_extents { static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS]; #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ -#endif /* CONFIG_HMP_VARIABLE_SCALE */ /* We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -1340,9 +1336,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ delta = now - sa->last_runnable_update; -#ifdef CONFIG_HMP_VARIABLE_SCALE + delta = hmp_variable_scale_convert(delta); -#endif /* * This should only happen when time goes backwards, which it * unfortunately does during sched clock init when we swap over to TSC. 
@@ -3842,7 +3837,6 @@ static inline void hmp_next_down_delay(struct sched_entity *se, int cpu) cpu_rq(cpu)->avg.hmp_last_up_migration = 0; } -#ifdef CONFIG_HMP_VARIABLE_SCALE /* * Heterogenous multiprocessor (HMP) optimizations * @@ -3875,27 +3869,35 @@ static inline void hmp_next_down_delay(struct sched_entity *se, int cpu) * The scale factor hmp_data.multiplier is a fixed point * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT */ -static u64 hmp_variable_scale_convert(u64 delta) +static inline u64 hmp_variable_scale_convert(u64 delta) { +#ifdef CONFIG_HMP_VARIABLE_SCALE u64 high = delta >> 32ULL; u64 low = delta & 0xffffffffULL; low *= hmp_data.multiplier; high *= hmp_data.multiplier; return (low >> HMP_VARIABLE_SCALE_SHIFT) + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT)); +#else + return delta; +#endif } static ssize_t hmp_show(struct kobject *kobj, struct attribute *attr, char *buf) { - ssize_t ret = 0; struct hmp_global_attr *hmp_attr = container_of(attr, struct hmp_global_attr, attr); - int temp = *(hmp_attr->value); + int temp; + + if (hmp_attr->to_sysfs_text != NULL) + return hmp_attr->to_sysfs_text(buf, PAGE_SIZE); + + temp = *(hmp_attr->value); if (hmp_attr->to_sysfs != NULL) temp = hmp_attr->to_sysfs(temp); - ret = sprintf(buf, "%d\n", temp); - return ret; + + return (ssize_t)sprintf(buf, "%d\n", temp); } static ssize_t hmp_store(struct kobject *a, struct attribute *attr, @@ -3924,11 +3926,31 @@ static ssize_t hmp_store(struct kobject *a, struct attribute *attr, return ret; } +static ssize_t hmp_print_domains(char *outbuf, int outbufsize) +{ + char buf[64]; + const char nospace[] = "%s", space[] = " %s"; + const char *fmt = nospace; + struct hmp_domain *domain; + struct list_head *pos; + int outpos = 0; + list_for_each(pos, &hmp_domains) { + domain = list_entry(pos, struct hmp_domain, hmp_domains); + if (cpumask_scnprintf(buf, 64, &domain->possible_cpus)) { + outpos += sprintf(outbuf+outpos, fmt, buf); + fmt = space; + } + } + strcat(outbuf, "\n"); + return outpos+1; +} + +#ifdef CONFIG_HMP_VARIABLE_SCALE static int hmp_period_tofrom_sysfs(int value) { return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value; } - +#endif /* max value for threshold is 1024 */ static int hmp_theshold_from_sysfs(int value) { @@ -3936,9 +3958,10 @@ static int hmp_theshold_from_sysfs(int value) return -1; return value; } -#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE -/* freqinvar control is only 0,1 off/on */ -static int hmp_freqinvar_from_sysfs(int value) +#if defined(CONFIG_SCHED_HMP_LITTLE_PACKING) || \ + defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE) +/* toggle control is only 0,1 off/on */ +static int hmp_toggle_from_sysfs(int value) { if (value < 0 || value > 1) return -1; @@ -3958,7 +3981,9 @@ static void hmp_attr_add( const char *name, int *value, int (*to_sysfs)(int), - int (*from_sysfs)(int)) + int (*from_sysfs)(int), + ssize_t (*to_sysfs_text)(char *, int), + umode_t mode) { int i = 0; while (hmp_data.attributes[i] != NULL) { @@ -3966,13 +3991,17 @@ static void hmp_attr_add( if (i >= HMP_DATA_SYSFS_MAX) return; } - hmp_data.attr[i].attr.mode = 0644; + if (mode) + hmp_data.attr[i].attr.mode = mode; + else + hmp_data.attr[i].attr.mode = 0644; hmp_data.attr[i].show = hmp_show; hmp_data.attr[i].store = hmp_store; hmp_data.attr[i].attr.name = name; hmp_data.attr[i].value = value; hmp_data.attr[i].to_sysfs = to_sysfs; hmp_data.attr[i].from_sysfs = from_sysfs; + hmp_data.attr[i].to_sysfs_text = to_sysfs_text; hmp_data.attributes[i] = &hmp_data.attr[i].attr; hmp_data.attributes[i + 
1] = NULL; } @@ -3981,40 +4010,59 @@ static int hmp_attr_init(void) { int ret; memset(&hmp_data, sizeof(hmp_data), 0); + hmp_attr_add("hmp_domains", + NULL, + NULL, + NULL, + hmp_print_domains, + 0444); + hmp_attr_add("up_threshold", + &hmp_up_threshold, + NULL, + hmp_theshold_from_sysfs, + NULL, + 0); + hmp_attr_add("down_threshold", + &hmp_down_threshold, + NULL, + hmp_theshold_from_sysfs, + NULL, + 0); +#ifdef CONFIG_HMP_VARIABLE_SCALE /* by default load_avg_period_ms == LOAD_AVG_PERIOD * meaning no change */ hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD); - hmp_attr_add("load_avg_period_ms", &hmp_data.multiplier, hmp_period_tofrom_sysfs, - hmp_period_tofrom_sysfs); - hmp_attr_add("up_threshold", - &hmp_up_threshold, - NULL, - hmp_theshold_from_sysfs); - hmp_attr_add("down_threshold", - &hmp_down_threshold, + hmp_period_tofrom_sysfs, NULL, - hmp_theshold_from_sysfs); + 0); +#endif #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE /* default frequency-invariant scaling ON */ hmp_data.freqinvar_load_scale_enabled = 1; hmp_attr_add("frequency_invariant_load_scale", &hmp_data.freqinvar_load_scale_enabled, NULL, - hmp_freqinvar_from_sysfs); + hmp_toggle_from_sysfs, + NULL, + 0); #endif #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING hmp_attr_add("packing_enable", &hmp_packing_enabled, NULL, - hmp_freqinvar_from_sysfs); + hmp_toggle_from_sysfs, + NULL, + 0); hmp_attr_add("packing_limit", &hmp_full_threshold, NULL, - hmp_packing_from_sysfs); + hmp_packing_from_sysfs, + NULL, + 0); #endif hmp_data.attr_group.name = "hmp"; hmp_data.attr_group.attrs = hmp_data.attributes; @@ -4023,7 +4071,6 @@ static int hmp_attr_init(void) return 0; } late_initcall(hmp_attr_init); -#endif /* CONFIG_HMP_VARIABLE_SCALE */ /* * return the load of the lowest-loaded CPU in a given HMP domain * min_cpu optionally points to an int to receive the CPU. -- cgit v1.2.3 From f5be72980bc321f3491377861835c343cc27af0d Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Wed, 20 Nov 2013 14:14:44 +0000 Subject: Documentation: HMP: Small Task Packing explanation Signed-off-by: Chris Redpath Signed-off-by: Jon Medhurst --- Documentation/arm/small_task_packing.txt | 136 +++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 Documentation/arm/small_task_packing.txt diff --git a/Documentation/arm/small_task_packing.txt b/Documentation/arm/small_task_packing.txt new file mode 100644 index 00000000000..43f0a8b8023 --- /dev/null +++ b/Documentation/arm/small_task_packing.txt @@ -0,0 +1,136 @@ +Small Task Packing in the big.LITTLE MP Reference Patch Set + +What is small task packing? +---- +Simply that the scheduler will fit as many small tasks on a single CPU +as possible before using other CPUs. A small task is defined as one +whose tracked load is less than 90% of a NICE_0 task. This is a change +from the usual behavior since the scheduler will normally use an idle +CPU for a waking task unless that task is considered cache hot. + + +How is it implemented? +---- +Since all small tasks must wake up relatively frequently, the main +requirement for packing small tasks is to select a partly-busy CPU when +waking rather than looking for an idle CPU. We use the tracked load of +the CPU runqueue to determine how heavily loaded each CPU is and the +tracked load of the task to determine if it will fit on the CPU. We +always start with the lowest-numbered CPU in a sched domain and stop +looking when we find a CPU with enough space for the task. 
+ +Some further tweaks are necessary to suppress load balancing when the +CPU is not fully loaded, otherwise the scheduler attempts to spread +tasks evenly across the domain. + + +How does it interact with the HMP patches? +---- +Firstly, we only enable packing on the little domain. The intent is that +the big domain is intended to spread tasks amongst the available CPUs +one-task-per-CPU. The little domain however is attempting to use as +little power as possible while servicing its tasks. + +Secondly, since we offload big tasks onto little CPUs in order to try +to devote one CPU to each task, we have a threshold above which we do +not try to pack a task and instead will select an idle CPU if possible. +This maintains maximum forward progress for busy tasks temporarily +demoted from big CPUs. + + +Can the behaviour be tuned? +---- +Yes, the load level of a 'full' CPU can be easily modified in the source +and is exposed through sysfs as /sys/kernel/hmp/packing_limit to be +changed at runtime. The presence of the packing behaviour is controlled +by CONFIG_SCHED_HMP_LITTLE_PACKING and can be disabled at run-time +using /sys/kernel/hmp/packing_enable. +The definition of a small task is hard coded as 90% of NICE_0_LOAD +and cannot be modified at run time. + + +Why do I need to tune it? +---- +The optimal configuration is likely to be different depending upon the +design and manufacturing of your SoC. + +In the main, there are two system effects from enabling small task +packing. + +1. CPU operating point may increase +2. wakeup latency of tasks may be increased + +There are also likely to be secondary effects from loading one CPU +rather than spreading tasks. + +Note that all of these system effects are dependent upon the workload +under consideration. + + +CPU Operating Point +---- +The primary impact of loading one CPU with a number of light tasks is to +increase the compute requirement of that CPU since it is no longer idle +as often. Increased compute requirement causes an increase in the +frequency of the CPU through CPUfreq. + +Consider this example: +We have a system with 3 CPUs which can operate at any frequency between +350MHz and 1GHz. The system has 6 tasks which would each produce 10% +load at 1GHz. The scheduler has frequency-invariant load scaling +enabled. Our DVFS governor aims for 80% utilization at the chosen +frequency. + +Without task packing, these tasks will be spread out amongst all CPUs +such that each has 2. This will produce roughly 20% system load, and +the frequency of the package will remain at 350MHz. + +With task packing set to the default packing_limit, all of these tasks +will sit on one CPU and require a package frequency of ~750MHz to reach +80% utilization. (0.75 = 0.6 * 0.8). + +When a package operates on a single frequency domain, all CPUs in that +package share frequency and voltage. + +Depending upon the SoC implementation there can be a significant amount +of energy lost to leakage from idle CPUs. The decision about how +loaded a CPU must be to be considered 'full' is therefore controllable +through sysfs (sys/kernel/hmp/packing_limit) and directly in the code. + +Continuing the example, lets set packing_limit to 450 which means we +will pack tasks until the total load of all running tasks >= 450. In +practise, this is very similar to a 55% idle 1Ghz CPU. + +Now we are only able to place 4 tasks on CPU0, and two will overflow +onto CPU1. CPU0 will have a load of 40% and CPU1 will have a load of +20%. 
In order to still hit 80% utilization, CPU0 now only needs to +operate at (0.4*0.8=0.32) 320MHz, which means that the lowest operating +point will be selected, the same as in the non-packing case, except that +now CPU2 is no longer needed and can be power-gated. + +In order to use less energy, the saving from power-gating CPU2 must be +more than the energy spent running CPU0 for the extra cycles. This +depends upon the SoC implementation. + +This is obviously a contrived example requiring all the tasks to +be runnable at the same time, but it illustrates the point. + + +Wakeup Latency +---- +This is an unavoidable consequence of trying to pack tasks together +rather than giving them a CPU each. If you cannot find an acceptable +level of wakeup latency, you should turn packing off. + +Cyclictest is a good test application for determining the added latency +when configuring packing. + + +Why is it turned off for the VersatileExpress V2P_CA15A7 CoreTile? +---- +Simply, this core tile only has power gating for the whole A7 package. +When small task packing is enabled, all our low-energy use cases +normally fit onto one A7 CPU. We therefore end up with 2 mostly-idle +CPUs and one mostly-busy CPU. This decreases the amount of time +available where the whole package is idle and can be turned off. + -- cgit v1.2.3
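
As a closing illustration of the packing_limit arithmetic in the small task packing documentation above, here is a minimal stand-alone sketch (ordinary user-space C, not kernel code) of how small tasks fill the lowest-numbered little CPU until its summed tracked load would pass packing_limit and then overflow to the next CPU. The per-task load of 102/1024 (~10% of NICE_0) and the limit of 450 are the illustrative numbers from the worked example; the overflow test is one plausible reading of "pack tasks until the total load reaches the limit" and is not the kernel's actual placement code, which lives in kernel/sched/fair.c.

/*
 * Illustrative sketch only -- not kernel code.  Models the packing_limit
 * example from Documentation/arm/small_task_packing.txt: small tasks are
 * placed on the lowest-numbered little CPU until that CPU's summed load
 * would exceed packing_limit, then overflow to the next CPU.  Loads use
 * the same 0..1023 fixed-point scale as the sysfs thresholds.
 */
#include <stdio.h>

#define NR_LITTLE_CPUS	3
#define PACKING_LIMIT	450	/* /sys/kernel/hmp/packing_limit in the example */
#define TASK_LOAD	102	/* ~10% of NICE_0 on the 0..1023 load scale */
#define NR_TASKS	6

int main(void)
{
	int cpu_load[NR_LITTLE_CPUS] = { 0 };
	int cpu = 0;
	int t, c;

	for (t = 0; t < NR_TASKS; t++) {
		/* overflow to the next CPU once this one would pass the limit */
		if (cpu_load[cpu] + TASK_LOAD > PACKING_LIMIT &&
		    cpu < NR_LITTLE_CPUS - 1)
			cpu++;
		cpu_load[cpu] += TASK_LOAD;
		printf("task %d -> CPU%d (load now %d/1024)\n",
		       t, cpu, cpu_load[cpu]);
	}

	for (c = 0; c < NR_LITTLE_CPUS; c++)
		printf("CPU%d total load %d/1024 (~%d%%)\n",
		       c, cpu_load[c], cpu_load[c] * 100 / 1024);

	return 0;
}

Run as-is, this places four tasks on CPU0 (~40% load) and two on CPU1 (~20% load), matching the distribution described in the documentation's packing_limit = 450 example, with CPU2 left idle and eligible for power-gating.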