diff options
-rw-r--r-- | Documentation/devicetree/bindings/arm/pmu.txt | 3 | ||||
-rw-r--r-- | Documentation/kernel-parameters.txt | 9 | ||||
-rw-r--r-- | arch/arm/Kconfig | 38 | ||||
-rw-r--r-- | arch/arm/include/asm/hw_breakpoint.h | 3 | ||||
-rw-r--r-- | arch/arm/include/asm/pmu.h | 12 | ||||
-rw-r--r-- | arch/arm/include/asm/topology.h | 3 | ||||
-rw-r--r-- | arch/arm/kernel/hw_breakpoint.c | 56 | ||||
-rw-r--r-- | arch/arm/kernel/perf_event.c | 19 | ||||
-rw-r--r-- | arch/arm/kernel/perf_event_cpu.c | 117 | ||||
-rw-r--r-- | arch/arm/kernel/perf_event_v7.c | 57 | ||||
-rw-r--r-- | arch/arm/kernel/topology.c | 27 | ||||
-rw-r--r-- | drivers/cpufreq/cpufreq.c | 25 | ||||
-rw-r--r-- | kernel/irq/irqdesc.c | 21 | ||||
-rw-r--r-- | kernel/sched/fair.c | 563 | ||||
-rw-r--r-- | linaro/configs/big-LITTLE-MP.conf | 13 |
15 files changed, 914 insertions, 52 deletions
diff --git a/Documentation/devicetree/bindings/arm/pmu.txt b/Documentation/devicetree/bindings/arm/pmu.txt index 343781b9f246..4ce82d045a6b 100644 --- a/Documentation/devicetree/bindings/arm/pmu.txt +++ b/Documentation/devicetree/bindings/arm/pmu.txt @@ -16,6 +16,9 @@ Required properties: "arm,arm1176-pmu" "arm,arm1136-pmu" - interrupts : 1 combined interrupt or 1 per core. +- cluster : a phandle to the cluster to which it belongs + If there are more than one cluster with same CPU type + then there should be separate PMU nodes per cluster. Example: diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6c723811c0a0..21516d189f18 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1177,6 +1177,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. See comment before ip2_setup() in drivers/char/ip2/ip2base.c. + irqaffinity= [SMP] Set the default irq affinity mask + Format: + <cpu number>,...,<cpu number> + or + <cpu number>-<cpu number> + (must be a positive range in ascending order) + or a mixture + <cpu number>,...,<cpu number>-<cpu number> + irqfixup [HW] When an interrupt is not handled search all handlers for it. Intended to get systems with badly broken diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 2fb12da9dafc..67ac812f7338 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1614,6 +1614,44 @@ config HMP_SLOW_CPU_MASK Specify the cpuids of the slow CPUs in the system as a list string, e.g. cpuid 0+1 should be specified as 0-1. +config HMP_VARIABLE_SCALE + bool "Allows changing the load tracking scale through sysfs" + depends on SCHED_HMP + help + When turned on, this option exports the thresholds and load average + period value for the load tracking patches through sysfs. + The values can be modified to change the rate of load accumulation + and the thresholds used for HMP migration. + The load_avg_period_ms is the time in ms to reach a load average of + 0.5 for an idle task of 0 load average ratio that start a busy loop. + The up_threshold and down_threshold is the value to go to a faster + CPU or to go back to a slower cpu. + The {up,down}_threshold are devided by 1024 before being compared + to the load average. + For examples, with load_avg_period_ms = 128 and up_threshold = 512, + a running task with a load of 0 will be migrated to a bigger CPU after + 128ms, because after 128ms its load_avg_ratio is 0.5 and the real + up_threshold is 0.5. + This patch has the same behavior as changing the Y of the load + average computation to + (1002/1024)^(LOAD_AVG_PERIOD/load_avg_period_ms) + but it remove intermadiate overflows in computation. + +config HMP_FREQUENCY_INVARIANT_SCALE + bool "(EXPERIMENTAL) Frequency-Invariant Tracked Load for HMP" + depends on HMP_VARIABLE_SCALE && CPU_FREQ + help + Scales the current load contribution in line with the frequency + of the CPU that the task was executed on. + In this version, we use a simple linear scale derived from the + maximum frequency reported by CPUFreq. + Restricting tracked load to be scaled by the CPU's frequency + represents the consumption of possible compute capacity + (rather than consumption of actual instantaneous capacity as + normal) and allows the HMP migration's simple threshold + migration strategy to interact more predictably with CPUFreq's + asynchronous compute capacity changes. + config HAVE_ARM_SCU bool help diff --git a/arch/arm/include/asm/hw_breakpoint.h b/arch/arm/include/asm/hw_breakpoint.h index 01169dd723f1..eef55ea9ef00 100644 --- a/arch/arm/include/asm/hw_breakpoint.h +++ b/arch/arm/include/asm/hw_breakpoint.h @@ -85,6 +85,9 @@ static inline void decode_ctrl_reg(u32 reg, #define ARM_DSCR_HDBGEN (1 << 14) #define ARM_DSCR_MDBGEN (1 << 15) +/* OSLSR os lock model bits */ +#define ARM_OSLSR_OSLM0 (1 << 0) + /* opcode2 numbers for the co-processor instructions. */ #define ARM_OP2_BVR 4 #define ARM_OP2_BCR 5 diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h index f24edad26c70..0cd7824ca762 100644 --- a/arch/arm/include/asm/pmu.h +++ b/arch/arm/include/asm/pmu.h @@ -62,9 +62,19 @@ struct pmu_hw_events { raw_spinlock_t pmu_lock; }; +struct cpupmu_regs { + u32 pmc; + u32 pmcntenset; + u32 pmuseren; + u32 pmintenset; + u32 pmxevttype[8]; + u32 pmxevtcnt[8]; +}; + struct arm_pmu { struct pmu pmu; cpumask_t active_irqs; + cpumask_t valid_cpus; char *name; irqreturn_t (*handle_irq)(int irq_num, void *dev); void (*enable)(struct perf_event *event); @@ -81,6 +91,8 @@ struct arm_pmu { int (*request_irq)(struct arm_pmu *, irq_handler_t handler); void (*free_irq)(struct arm_pmu *); int (*map_event)(struct perf_event *event); + void (*save_regs)(struct arm_pmu *, struct cpupmu_regs *); + void (*restore_regs)(struct arm_pmu *, struct cpupmu_regs *); int num_events; atomic_t active_events; struct mutex reserve_mutex; diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 5692ba11322d..983fa7c153a2 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -26,6 +26,7 @@ extern struct cputopo_arm cpu_topology[NR_CPUS]; void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +int cluster_to_logical_mask(unsigned int socket_id, cpumask_t *cluster_mask); #ifdef CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE /* Common values for CPUs */ @@ -62,6 +63,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu); static inline void init_cpu_topology(void) { } static inline void store_cpu_topology(unsigned int cpuid) { } +static inline int cluster_to_logical_mask(unsigned int socket_id, + cpumask_t *cluster_mask) { return -EINVAL; } #endif diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 5ff2e77782b1..f031a4f82934 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -28,6 +28,7 @@ #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/smp.h> +#include <linux/cpu_pm.h> #include <asm/cacheflush.h> #include <asm/cputype.h> @@ -49,6 +50,9 @@ static int core_num_wrps; /* Debug architecture version. */ static u8 debug_arch; +/* Does debug architecture support OS Save and Restore? */ +static bool has_ossr; + /* Maximum supported watchpoint length. */ static u8 max_watchpoint_len; @@ -903,6 +907,23 @@ static struct undef_hook debug_reg_hook = { .fn = debug_reg_trap, }; +/* Does this core support OS Save and Restore? */ +static bool core_has_os_save_restore(void) +{ + u32 oslsr; + + switch (get_debug_arch()) { + case ARM_DEBUG_ARCH_V7_1: + return true; + case ARM_DEBUG_ARCH_V7_ECP14: + ARM_DBG_READ(c1, c1, 4, oslsr); + if (oslsr & ARM_OSLSR_OSLM0) + return true; + default: + return false; + } +} + static void reset_ctrl_regs(void *unused) { int i, raw_num_brps, err = 0, cpu = smp_processor_id(); @@ -930,11 +951,7 @@ static void reset_ctrl_regs(void *unused) if ((val & 0x1) == 0) err = -EPERM; - /* - * Check whether we implement OS save and restore. - */ - ARM_DBG_READ(c1, c1, 4, val); - if ((val & 0x9) == 0) + if (!has_ossr) goto clear_vcr; break; case ARM_DEBUG_ARCH_V7_1: @@ -1015,6 +1032,31 @@ static struct notifier_block __cpuinitdata dbg_reset_nb = { .notifier_call = dbg_reset_notify, }; +#ifdef CONFIG_CPU_PM +static int dbg_cpu_pm_notify(struct notifier_block *self, unsigned long action, + void *v) +{ + if (action == CPU_PM_EXIT) + reset_ctrl_regs(NULL); + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata dbg_cpu_pm_nb = { + .notifier_call = dbg_cpu_pm_notify, +}; + +static void __init pm_init(void) +{ + if (has_ossr) + cpu_pm_register_notifier(&dbg_cpu_pm_nb); +} +#else +static inline void pm_init(void) +{ +} +#endif + static int __init arch_hw_breakpoint_init(void) { debug_arch = get_debug_arch(); @@ -1024,6 +1066,8 @@ static int __init arch_hw_breakpoint_init(void) return 0; } + has_ossr = core_has_os_save_restore(); + /* Determine how many BRPs/WRPs are available. */ core_num_brps = get_num_brps(); core_num_wrps = get_num_wrps(); @@ -1064,6 +1108,8 @@ static int __init arch_hw_breakpoint_init(void) /* Register hotplug notifier. */ register_cpu_notifier(&dbg_reset_nb); + + pm_init(); return 0; } arch_initcall(arch_hw_breakpoint_init); diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index f9e8657dd241..82dc1522da10 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -12,6 +12,7 @@ */ #define pr_fmt(fmt) "hw perfevents: " fmt +#include <linux/cpumask.h> #include <linux/kernel.h> #include <linux/platform_device.h> #include <linux/pm_runtime.h> @@ -81,6 +82,9 @@ armpmu_map_event(struct perf_event *event, return armpmu_map_cache_event(cache_map, config); case PERF_TYPE_RAW: return armpmu_map_raw_event(raw_event_mask, config); + default: + if (event->attr.type >= PERF_TYPE_MAX) + return armpmu_map_raw_event(raw_event_mask, config); } return -ENOENT; @@ -164,6 +168,8 @@ armpmu_stop(struct perf_event *event, int flags) struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; + if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus)) + return; /* * ARM pmu always has to update the counter, so ignore * PERF_EF_UPDATE, see comments in armpmu_start(). @@ -180,6 +186,8 @@ static void armpmu_start(struct perf_event *event, int flags) struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; + if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus)) + return; /* * ARM pmu always has to reprogram the period, so ignore * PERF_EF_RELOAD, see the comment below. @@ -207,6 +215,9 @@ armpmu_del(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; + if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus)) + return; + WARN_ON(idx < 0); armpmu_stop(event, PERF_EF_UPDATE); @@ -225,6 +236,10 @@ armpmu_add(struct perf_event *event, int flags) int idx; int err = 0; + /* An event following a process won't be stopped earlier */ + if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus)) + return 0; + perf_pmu_disable(event->pmu); /* If we don't have a space for the counter then finish early. */ @@ -423,6 +438,10 @@ static int armpmu_event_init(struct perf_event *event) int err = 0; atomic_t *active_events = &armpmu->active_events; + if (event->cpu != -1 && + !cpumask_test_cpu(event->cpu, &armpmu->valid_cpus)) + return -ENOENT; + /* does not support taken branch sampling */ if (has_branch_stack(event)) return -EOPNOTSUPP; diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c index 5f6620684e25..f495bbcaf1b2 100644 --- a/arch/arm/kernel/perf_event_cpu.c +++ b/arch/arm/kernel/perf_event_cpu.c @@ -19,6 +19,7 @@ #define pr_fmt(fmt) "CPU PMU: " fmt #include <linux/bitmap.h> +#include <linux/cpu_pm.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/of.h> @@ -31,33 +32,36 @@ #include <asm/pmu.h> /* Set at runtime when we know what CPU type we are. */ -static struct arm_pmu *cpu_pmu; +static DEFINE_PER_CPU(struct arm_pmu *, cpu_pmu); static DEFINE_PER_CPU(struct perf_event * [ARMPMU_MAX_HWEVENTS], hw_events); static DEFINE_PER_CPU(unsigned long [BITS_TO_LONGS(ARMPMU_MAX_HWEVENTS)], used_mask); static DEFINE_PER_CPU(struct pmu_hw_events, cpu_hw_events); +static DEFINE_PER_CPU(struct cpupmu_regs, cpu_pmu_regs); + /* * Despite the names, these two functions are CPU-specific and are used * by the OProfile/perf code. */ const char *perf_pmu_name(void) { - if (!cpu_pmu) + struct arm_pmu *pmu = per_cpu(cpu_pmu, 0); + if (!pmu) return NULL; - return cpu_pmu->name; + return pmu->name; } EXPORT_SYMBOL_GPL(perf_pmu_name); int perf_num_counters(void) { - int max_events = 0; + struct arm_pmu *pmu = per_cpu(cpu_pmu, 0); - if (cpu_pmu != NULL) - max_events = cpu_pmu->num_events; + if (!pmu) + return 0; - return max_events; + return pmu->num_events; } EXPORT_SYMBOL_GPL(perf_num_counters); @@ -75,11 +79,13 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu) { int i, irq, irqs; struct platform_device *pmu_device = cpu_pmu->plat_device; + int cpu = -1; irqs = min(pmu_device->num_resources, num_possible_cpus()); for (i = 0; i < irqs; ++i) { - if (!cpumask_test_and_clear_cpu(i, &cpu_pmu->active_irqs)) + cpu = cpumask_next(cpu, &cpu_pmu->valid_cpus); + if (!cpumask_test_and_clear_cpu(cpu, &cpu_pmu->active_irqs)) continue; irq = platform_get_irq(pmu_device, i); if (irq >= 0) @@ -91,6 +97,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) { int i, err, irq, irqs; struct platform_device *pmu_device = cpu_pmu->plat_device; + int cpu = -1; if (!pmu_device) return -ENODEV; @@ -103,6 +110,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) for (i = 0; i < irqs; ++i) { err = 0; + cpu = cpumask_next(cpu, &cpu_pmu->valid_cpus); irq = platform_get_irq(pmu_device, i); if (irq < 0) continue; @@ -112,7 +120,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) * assume that we're running on a uniprocessor machine and * continue. Otherwise, continue without this interrupt. */ - if (irq_set_affinity(irq, cpumask_of(i)) && irqs > 1) { + if (irq_set_affinity(irq, cpumask_of(cpu)) && irqs > 1) { pr_warning("unable to set irq affinity (irq=%d, cpu=%u)\n", irq, i); continue; @@ -126,7 +134,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) return err; } - cpumask_set_cpu(i, &cpu_pmu->active_irqs); + cpumask_set_cpu(cpu, &cpu_pmu->active_irqs); } return 0; @@ -135,7 +143,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) static void cpu_pmu_init(struct arm_pmu *cpu_pmu) { int cpu; - for_each_possible_cpu(cpu) { + for_each_cpu_mask(cpu, cpu_pmu->valid_cpus) { struct pmu_hw_events *events = &per_cpu(cpu_hw_events, cpu); events->events = per_cpu(hw_events, cpu); events->used_mask = per_cpu(used_mask, cpu); @@ -148,7 +156,7 @@ static void cpu_pmu_init(struct arm_pmu *cpu_pmu) /* Ensure the PMU has sane values out of reset. */ if (cpu_pmu && cpu_pmu->reset) - on_each_cpu(cpu_pmu->reset, cpu_pmu, 1); + on_each_cpu_mask(&cpu_pmu->valid_cpus, cpu_pmu->reset, cpu_pmu, 1); } /* @@ -160,21 +168,46 @@ static void cpu_pmu_init(struct arm_pmu *cpu_pmu) static int __cpuinit cpu_pmu_notify(struct notifier_block *b, unsigned long action, void *hcpu) { + struct arm_pmu *pmu = per_cpu(cpu_pmu, (long)hcpu); + if ((action & ~CPU_TASKS_FROZEN) != CPU_STARTING) return NOTIFY_DONE; - if (cpu_pmu && cpu_pmu->reset) - cpu_pmu->reset(cpu_pmu); + if (pmu && pmu->reset) + pmu->reset(pmu); else return NOTIFY_DONE; return NOTIFY_OK; } +static int cpu_pmu_pm_notify(struct notifier_block *b, + unsigned long action, void *hcpu) +{ + int cpu = smp_processor_id(); + struct arm_pmu *pmu = per_cpu(cpu_pmu, cpu); + struct cpupmu_regs *pmuregs = &per_cpu(cpu_pmu_regs, cpu); + + if (!pmu) + return NOTIFY_DONE; + + if (action == CPU_PM_ENTER && pmu->save_regs) { + pmu->save_regs(pmu, pmuregs); + } else if (action == CPU_PM_EXIT && pmu->restore_regs) { + pmu->restore_regs(pmu, pmuregs); + } + + return NOTIFY_OK; +} + static struct notifier_block __cpuinitdata cpu_pmu_hotplug_notifier = { .notifier_call = cpu_pmu_notify, }; +static struct notifier_block __cpuinitdata cpu_pmu_pm_notifier = { + .notifier_call = cpu_pmu_pm_notify, +}; + /* * PMU platform driver and devicetree bindings. */ @@ -248,6 +281,9 @@ static int probe_current_pmu(struct arm_pmu *pmu) } } + /* assume PMU support all the CPUs in this case */ + cpumask_setall(&pmu->valid_cpus); + put_cpu(); return ret; } @@ -255,15 +291,10 @@ static int probe_current_pmu(struct arm_pmu *pmu) static int cpu_pmu_device_probe(struct platform_device *pdev) { const struct of_device_id *of_id; - int (*init_fn)(struct arm_pmu *); struct device_node *node = pdev->dev.of_node; struct arm_pmu *pmu; - int ret = -ENODEV; - - if (cpu_pmu) { - pr_info("attempt to register multiple PMU devices!"); - return -ENOSPC; - } + int ret = 0; + int cpu; pmu = kzalloc(sizeof(struct arm_pmu), GFP_KERNEL); if (!pmu) { @@ -272,8 +303,28 @@ static int cpu_pmu_device_probe(struct platform_device *pdev) } if (node && (of_id = of_match_node(cpu_pmu_of_device_ids, pdev->dev.of_node))) { - init_fn = of_id->data; - ret = init_fn(pmu); + smp_call_func_t init_fn = (smp_call_func_t)of_id->data; + struct device_node *ncluster; + int cluster = -1; + cpumask_t sibling_mask; + + ncluster = of_parse_phandle(node, "cluster", 0); + if (ncluster) { + int len; + const u32 *hwid; + hwid = of_get_property(ncluster, "reg", &len); + if (hwid && len == 4) + cluster = be32_to_cpup(hwid); + } + /* set sibling mask to all cpu mask if socket is not specified */ + if (cluster == -1 || + cluster_to_logical_mask(cluster, &sibling_mask)) + cpumask_setall(&sibling_mask); + + smp_call_function_any(&sibling_mask, init_fn, pmu, 1); + + /* now set the valid_cpus after init */ + cpumask_copy(&pmu->valid_cpus, &sibling_mask); } else { ret = probe_current_pmu(pmu); } @@ -284,10 +335,12 @@ static int cpu_pmu_device_probe(struct platform_device *pdev) return ret; } - cpu_pmu = pmu; - cpu_pmu->plat_device = pdev; - cpu_pmu_init(cpu_pmu); - armpmu_register(cpu_pmu, PERF_TYPE_RAW); + for_each_cpu_mask(cpu, pmu->valid_cpus) + per_cpu(cpu_pmu, cpu) = pmu; + + pmu->plat_device = pdev; + cpu_pmu_init(pmu); + armpmu_register(pmu, -1); return 0; } @@ -310,9 +363,17 @@ static int __init register_pmu_driver(void) if (err) return err; + err = cpu_pm_register_notifier(&cpu_pmu_pm_notifier); + if (err) { + unregister_cpu_notifier(&cpu_pmu_hotplug_notifier); + return err; + } + err = platform_driver_register(&cpu_pmu_driver); - if (err) + if (err) { + cpu_pm_unregister_notifier(&cpu_pmu_pm_notifier); unregister_cpu_notifier(&cpu_pmu_hotplug_notifier); + } return err; } diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index 4fbc757d9cff..37c78c484576 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -950,6 +950,51 @@ static void armv7_pmnc_dump_regs(struct arm_pmu *cpu_pmu) } #endif +static void armv7pmu_save_regs(struct arm_pmu *cpu_pmu, + struct cpupmu_regs *regs) +{ + unsigned int cnt; + asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (regs->pmc)); + if (!(regs->pmc & ARMV7_PMNC_E)) + return; + + asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r" (regs->pmcntenset)); + asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r" (regs->pmuseren)); + asm volatile("mrc p15, 0, %0, c9, c14, 1" : "=r" (regs->pmintenset)); + asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (regs->pmxevtcnt[0])); + for (cnt = ARMV7_IDX_COUNTER0; + cnt <= ARMV7_IDX_COUNTER_LAST(cpu_pmu); cnt++) { + armv7_pmnc_select_counter(cnt); + asm volatile("mrc p15, 0, %0, c9, c13, 1" + : "=r"(regs->pmxevttype[cnt])); + asm volatile("mrc p15, 0, %0, c9, c13, 2" + : "=r"(regs->pmxevtcnt[cnt])); + } + return; +} + +static void armv7pmu_restore_regs(struct arm_pmu *cpu_pmu, + struct cpupmu_regs *regs) +{ + unsigned int cnt; + if (!(regs->pmc & ARMV7_PMNC_E)) + return; + + asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (regs->pmcntenset)); + asm volatile("mcr p15, 0, %0, c9, c14, 0" : : "r" (regs->pmuseren)); + asm volatile("mcr p15, 0, %0, c9, c14, 1" : : "r" (regs->pmintenset)); + asm volatile("mcr p15, 0, %0, c9, c13, 0" : : "r" (regs->pmxevtcnt[0])); + for (cnt = ARMV7_IDX_COUNTER0; + cnt <= ARMV7_IDX_COUNTER_LAST(cpu_pmu); cnt++) { + armv7_pmnc_select_counter(cnt); + asm volatile("mcr p15, 0, %0, c9, c13, 1" + : : "r"(regs->pmxevttype[cnt])); + asm volatile("mcr p15, 0, %0, c9, c13, 2" + : : "r"(regs->pmxevtcnt[cnt])); + } + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (regs->pmc)); +} + static void armv7pmu_enable_event(struct perf_event *event) { unsigned long flags; @@ -1223,6 +1268,8 @@ static void armv7pmu_init(struct arm_pmu *cpu_pmu) cpu_pmu->start = armv7pmu_start; cpu_pmu->stop = armv7pmu_stop; cpu_pmu->reset = armv7pmu_reset; + cpu_pmu->save_regs = armv7pmu_save_regs; + cpu_pmu->restore_regs = armv7pmu_restore_regs; cpu_pmu->max_period = (1LLU << 32) - 1; }; @@ -1240,7 +1287,7 @@ static u32 armv7_read_num_pmnc_events(void) static int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu) { armv7pmu_init(cpu_pmu); - cpu_pmu->name = "ARMv7 Cortex-A8"; + cpu_pmu->name = "ARMv7_Cortex_A8"; cpu_pmu->map_event = armv7_a8_map_event; cpu_pmu->num_events = armv7_read_num_pmnc_events(); return 0; @@ -1249,7 +1296,7 @@ static int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu) static int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu) { armv7pmu_init(cpu_pmu); - cpu_pmu->name = "ARMv7 Cortex-A9"; + cpu_pmu->name = "ARMv7_Cortex_A9"; cpu_pmu->map_event = armv7_a9_map_event; cpu_pmu->num_events = armv7_read_num_pmnc_events(); return 0; @@ -1258,7 +1305,7 @@ static int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu) static int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu) { armv7pmu_init(cpu_pmu); - cpu_pmu->name = "ARMv7 Cortex-A5"; + cpu_pmu->name = "ARMv7_Cortex_A5"; cpu_pmu->map_event = armv7_a5_map_event; cpu_pmu->num_events = armv7_read_num_pmnc_events(); return 0; @@ -1267,7 +1314,7 @@ static int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu) static int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu) { armv7pmu_init(cpu_pmu); - cpu_pmu->name = "ARMv7 Cortex-A15"; + cpu_pmu->name = "ARMv7_Cortex_A15"; cpu_pmu->map_event = armv7_a15_map_event; cpu_pmu->num_events = armv7_read_num_pmnc_events(); cpu_pmu->set_event_filter = armv7pmu_set_event_filter; @@ -1277,7 +1324,7 @@ static int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu) static int armv7_a7_pmu_init(struct arm_pmu *cpu_pmu) { armv7pmu_init(cpu_pmu); - cpu_pmu->name = "ARMv7 Cortex-A7"; + cpu_pmu->name = "ARMv7_Cortex_A7"; cpu_pmu->map_event = armv7_a7_map_event; cpu_pmu->num_events = armv7_read_num_pmnc_events(); cpu_pmu->set_event_filter = armv7pmu_set_event_filter; diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 677325f4355e..fa45fb43a627 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -393,6 +393,33 @@ void __init arch_get_hmp_domains(struct list_head *hmp_domains_list) /* + * cluster_to_logical_mask - return cpu logical mask of CPUs in a cluster + * @socket_id: cluster HW identifier + * @cluster_mask: the cpumask location to be initialized, modified by the + * function only if return value == 0 + * + * Return: + * + * 0 on success + * -EINVAL if cluster_mask is NULL or there is no record matching socket_id + */ +int cluster_to_logical_mask(unsigned int socket_id, cpumask_t *cluster_mask) +{ + int cpu; + + if (!cluster_mask) + return -EINVAL; + + for_each_online_cpu(cpu) + if (socket_id == topology_physical_package_id(cpu)) { + cpumask_copy(cluster_mask, topology_core_cpumask(cpu)); + return 0; + } + + return -EINVAL; +} + +/* * init_cpu_topology is called at boot when only one cpu is running * which prevent simultaneous write access to cpu_topology array */ diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 1f93dbd72355..034d1836884b 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -47,6 +47,9 @@ static DEFINE_PER_CPU(char[CPUFREQ_NAME_LEN], cpufreq_cpu_governor); #endif static DEFINE_SPINLOCK(cpufreq_driver_lock); +/* Used when we unregister cpufreq driver */ +static struct cpumask cpufreq_online_mask; + /* * cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure * all cpufreq/hotplug/workqueue/etc related lock issues. @@ -751,11 +754,16 @@ static int cpufreq_add_dev_policy(unsigned int cpu, return -EBUSY; } + __cpufreq_governor(managed_policy, CPUFREQ_GOV_STOP); + spin_lock_irqsave(&cpufreq_driver_lock, flags); cpumask_copy(managed_policy->cpus, policy->cpus); per_cpu(cpufreq_cpu_data, cpu) = managed_policy; spin_unlock_irqrestore(&cpufreq_driver_lock, flags); + __cpufreq_governor(managed_policy, CPUFREQ_GOV_START); + __cpufreq_governor(managed_policy, CPUFREQ_GOV_LIMITS); + pr_debug("CPU already managed, adding link\n"); ret = sysfs_create_link(&dev->kobj, &managed_policy->kobj, @@ -970,6 +978,14 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif) pr_debug("initialization failed\n"); goto err_unlock_policy; } + + /* + * affected cpus must always be the one, which are online. We aren't + * managing offline cpus here. + */ + cpumask_and(policy->cpus, policy->cpus, cpu_online_mask); + cpumask_and(policy->cpus, policy->cpus, &cpufreq_online_mask); + policy->user_policy.min = policy->min; policy->user_policy.max = policy->max; @@ -1052,15 +1068,19 @@ static int __cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif } per_cpu(cpufreq_cpu_data, cpu) = NULL; - #ifdef CONFIG_SMP /* if this isn't the CPU which is the parent of the kobj, we * only need to unlink, put and exit */ if (unlikely(cpu != data->cpu)) { pr_debug("removing link\n"); + __cpufreq_governor(data, CPUFREQ_GOV_STOP); cpumask_clear_cpu(cpu, data->cpus); spin_unlock_irqrestore(&cpufreq_driver_lock, flags); + + __cpufreq_governor(data, CPUFREQ_GOV_START); + __cpufreq_governor(data, CPUFREQ_GOV_LIMITS); + kobj = &dev->kobj; cpufreq_cpu_put(data); unlock_policy_rwsem_write(cpu); @@ -1168,6 +1188,7 @@ static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) if (unlikely(lock_policy_rwsem_write(cpu))) BUG(); + cpumask_clear_cpu(cpu, &cpufreq_online_mask); retval = __cpufreq_remove_dev(dev, sif); return retval; } @@ -1886,6 +1907,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) cpufreq_driver = driver_data; spin_unlock_irqrestore(&cpufreq_driver_lock, flags); + cpumask_setall(&cpufreq_online_mask); + ret = subsys_interface_register(&cpufreq_interface); if (ret) goto err_null_driver; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 192a302d6cfd..473b2b6eccb5 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -23,10 +23,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9d9bab5919b1..6baa8c84aa8e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -31,9 +31,20 @@ #include <linux/task_work.h> #include <trace/events/sched.h> +#ifdef CONFIG_HMP_VARIABLE_SCALE +#include <linux/sysfs.h> +#include <linux/vmalloc.h> +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* Include cpufreq header to add a notifier so that cpu frequency + * scaling can track the current CPU frequency + */ +#include <linux/cpufreq.h> +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#endif /* CONFIG_HMP_VARIABLE_SCALE */ #include "sched.h" + /* * Targeted preemption latency for CPU-bound tasks: * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) @@ -1200,8 +1211,95 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } -/* - * We can represent the historical contribution to runnable average as the +#ifdef CONFIG_HMP_VARIABLE_SCALE + +#define HMP_VARIABLE_SCALE_SHIFT 16ULL +struct hmp_global_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *a, struct attribute *b, + const char *c, size_t count); + int *value; + int (*to_sysfs)(int); + int (*from_sysfs)(int); +}; + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +#define HMP_DATA_SYSFS_MAX 4 +#else +#define HMP_DATA_SYSFS_MAX 3 +#endif + +struct hmp_data_struct { +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + int freqinvar_load_scale_enabled; +#endif + int multiplier; /* used to scale the time delta */ + struct attribute_group attr_group; + struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1]; + struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX]; +} hmp_data; + +static u64 hmp_variable_scale_convert(u64 delta); +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* Frequency-Invariant Load Modification: + * Loads are calculated as in PJT's patch however we also scale the current + * contribution in line with the frequency of the CPU that the task was + * executed on. + * In this version, we use a simple linear scale derived from the maximum + * frequency reported by CPUFreq. As an example: + * + * Consider that we ran a task for 100% of the previous interval. + * + * Our CPU was under asynchronous frequency control through one of the + * CPUFreq governors. + * + * The CPUFreq governor reports that it is able to scale the CPU between + * 500MHz and 1GHz. + * + * During the period, the CPU was running at 1GHz. + * + * In this case, our load contribution for that period is calculated as + * 1 * (number_of_active_microseconds) + * + * This results in our task being able to accumulate maximum load as normal. + * + * + * Consider now that our CPU was executing at 500MHz. + * + * We now scale the load contribution such that it is calculated as + * 0.5 * (number_of_active_microseconds) + * + * Our task can only record 50% maximum load during this period. + * + * This represents the task consuming 50% of the CPU's *possible* compute + * capacity. However the task did consume 100% of the CPU's *available* + * compute capacity which is the value seen by the CPUFreq governor and + * user-side CPU Utilization tools. + * + * Restricting tracked load to be scaled by the CPU's frequency accurately + * represents the consumption of possible compute capacity and allows the + * HMP migration's simple threshold migration strategy to interact more + * predictably with CPUFreq's asynchronous compute capacity changes. + */ +#define SCHED_FREQSCALE_SHIFT 10 +struct cpufreq_extents { + u32 curr_scale; + u32 min; + u32 max; + u32 flags; +}; +/* Flag set when the governor in use only allows one frequency. + * Disables scaling. + */ +#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01 + +static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS]; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#endif /* CONFIG_HMP_VARIABLE_SCALE */ + +/* We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable * history into segments of approximately 1ms (1024us); label the segment that * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. @@ -1231,13 +1329,23 @@ static u32 __compute_runnable_contrib(u64 n) static __always_inline int __update_entity_runnable_avg(u64 now, struct sched_avg *sa, int runnable, - int running) + int running, + int cpu) { u64 delta, periods; u32 runnable_contrib; int delta_w, decayed = 0; +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + u64 scaled_delta; + u32 scaled_runnable_contrib; + int scaled_delta_w; + u32 curr_scale = 1024; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ delta = now - sa->last_runnable_update; +#ifdef CONFIG_HMP_VARIABLE_SCALE + delta = hmp_variable_scale_convert(delta); +#endif /* * This should only happen when time goes backwards, which it * unfortunately does during sched clock init when we swap over to TSC. @@ -1256,6 +1364,12 @@ static __always_inline int __update_entity_runnable_avg(u64 now, return 0; sa->last_runnable_update = now; +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + /* retrieve scale factor for load */ + if (hmp_data.freqinvar_load_scale_enabled) + curr_scale = freq_scale[cpu].curr_scale; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ + /* delta_w is the amount already accumulated against our next period */ delta_w = sa->runnable_avg_period % 1024; if (delta + delta_w >= 1024) { @@ -1268,10 +1382,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now, * period and accrue it. */ delta_w = 1024 - delta_w; + /* scale runnable time if necessary */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + scaled_delta_w = (delta_w * curr_scale) + >> SCHED_FREQSCALE_SHIFT; + if (runnable) + sa->runnable_avg_sum += scaled_delta_w; + if (running) + sa->usage_avg_sum += scaled_delta_w; +#else if (runnable) sa->runnable_avg_sum += delta_w; if (running) sa->usage_avg_sum += delta_w; +#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ sa->runnable_avg_period += delta_w; delta -= delta_w; @@ -1279,27 +1403,49 @@ static __always_inline int __update_entity_runnable_avg(u64 now, /* Figure out how many additional periods this update spans */ periods = delta / 1024; delta %= 1024; - + /* decay the load we have accumulated so far */ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, periods + 1); sa->runnable_avg_period = decay_load(sa->runnable_avg_period, periods + 1); sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1); - + /* add the contribution from this period */ /* Efficiently calculate \sum (1..n_period) 1024*y^i */ runnable_contrib = __compute_runnable_contrib(periods); + /* Apply load scaling if necessary. + * Note that multiplying the whole series is same as + * multiplying all terms + */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + scaled_runnable_contrib = (runnable_contrib * curr_scale) + >> SCHED_FREQSCALE_SHIFT; + if (runnable) + sa->runnable_avg_sum += scaled_runnable_contrib; + if (running) + sa->usage_avg_sum += scaled_runnable_contrib; +#else if (runnable) sa->runnable_avg_sum += runnable_contrib; if (running) sa->usage_avg_sum += runnable_contrib; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ sa->runnable_avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ + /* scale if necessary */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT); + if (runnable) + sa->runnable_avg_sum += scaled_delta; + if (running) + sa->usage_avg_sum += scaled_delta; +#else if (runnable) sa->runnable_avg_sum += delta; if (running) sa->usage_avg_sum += delta; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ sa->runnable_avg_period += delta; return decayed; @@ -1467,7 +1613,11 @@ static inline void update_entity_load_avg(struct sched_entity *se, struct cfs_rq *cfs_rq = cfs_rq_of(se); long contrib_delta; u64 now; + int cpu = -1; /* not used in normal case */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + cpu = cfs_rq->rq->cpu; +#endif /* * For a group entity we need to use their owned cfs_rq_clock_task() in * case they are the parent of a throttled hierarchy. @@ -1478,7 +1628,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, now = cfs_rq_clock_task(group_cfs_rq(se)); if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, - cfs_rq->curr == se)) + cfs_rq->curr == se, cpu)) return; contrib_delta = __update_entity_load_avg_contrib(se); @@ -1523,8 +1673,13 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { u32 contrib; + int cpu = -1; /* not used in normal case */ + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + cpu = rq->cpu; +#endif __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable, - runnable); + runnable, cpu); __update_tg_runnable_avg(&rq->avg, &rq->cfs); contrib = rq->avg.runnable_avg_sum * scale_load_down(1024); contrib /= (rq->avg.runnable_avg_period + 1); @@ -3475,6 +3630,244 @@ static inline void hmp_next_down_delay(struct sched_entity *se, int cpu) se->avg.hmp_last_down_migration = cfs_rq_clock_task(cfs_rq); se->avg.hmp_last_up_migration = 0; } + +#ifdef CONFIG_HMP_VARIABLE_SCALE +/* + * Heterogenous multiprocessor (HMP) optimizations + * + * These functions allow to change the growing speed of the load_avg_ratio + * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms + * This can now be changed with /sys/kernel/hmp/load_avg_period_ms. + * + * These functions also allow to change the up and down threshold of HMP + * using /sys/kernel/hmp/{up,down}_threshold. + * Both must be between 0 and 1023. The threshold that is compared + * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024. + * + * For instance, if load_avg_period = 64 and up_threshold = 512, an idle + * task with a load of 0 will reach the threshold after 64ms of busy loop. + * + * Changing load_avg_periods_ms has the same effect than changing the + * default scaling factor Y=1002/1024 in the load_avg_ratio computation to + * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the last one + * could trigger overflows. + * For instance, with Y = 1023/1024 in __update_task_entity_contrib() + * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);" + * could be overflowed for a weight > 2^12 even is the load_avg_contrib + * should still be a 32bits result. This would not happen by multiplicating + * delta time by 1/22 and setting load_avg_period_ms = 706. + */ + +/* + * By scaling the delta time it end-up increasing or decrease the + * growing speed of the per entity load_avg_ratio + * The scale factor hmp_data.multiplier is a fixed point + * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT + */ +static u64 hmp_variable_scale_convert(u64 delta) +{ + u64 high = delta >> 32ULL; + u64 low = delta & 0xffffffffULL; + low *= hmp_data.multiplier; + high *= hmp_data.multiplier; + return (low >> HMP_VARIABLE_SCALE_SHIFT) + + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT)); +} + +static ssize_t hmp_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + ssize_t ret = 0; + struct hmp_global_attr *hmp_attr = + container_of(attr, struct hmp_global_attr, attr); + int temp = *(hmp_attr->value); + if (hmp_attr->to_sysfs != NULL) + temp = hmp_attr->to_sysfs(temp); + ret = sprintf(buf, "%d\n", temp); + return ret; +} + +static ssize_t hmp_store(struct kobject *a, struct attribute *attr, + const char *buf, size_t count) +{ + int temp; + ssize_t ret = count; + struct hmp_global_attr *hmp_attr = + container_of(attr, struct hmp_global_attr, attr); + char *str = vmalloc(count + 1); + if (str == NULL) + return -ENOMEM; + memcpy(str, buf, count); + str[count] = 0; + if (sscanf(str, "%d", &temp) < 1) + ret = -EINVAL; + else { + if (hmp_attr->from_sysfs != NULL) + temp = hmp_attr->from_sysfs(temp); + if (temp < 0) + ret = -EINVAL; + else + *(hmp_attr->value) = temp; + } + vfree(str); + return ret; +} + +static int hmp_period_tofrom_sysfs(int value) +{ + return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value; +} + +/* max value for threshold is 1024 */ +static int hmp_theshold_from_sysfs(int value) +{ + if (value > 1024) + return -1; + return value; +} +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* freqinvar control is only 0,1 off/on */ +static int hmp_freqinvar_from_sysfs(int value) +{ + if (value < 0 || value > 1) + return -1; + return value; +} +#endif +static void hmp_attr_add( + const char *name, + int *value, + int (*to_sysfs)(int), + int (*from_sysfs)(int)) +{ + int i = 0; + while (hmp_data.attributes[i] != NULL) { + i++; + if (i >= HMP_DATA_SYSFS_MAX) + return; + } + hmp_data.attr[i].attr.mode = 0644; + hmp_data.attr[i].show = hmp_show; + hmp_data.attr[i].store = hmp_store; + hmp_data.attr[i].attr.name = name; + hmp_data.attr[i].value = value; + hmp_data.attr[i].to_sysfs = to_sysfs; + hmp_data.attr[i].from_sysfs = from_sysfs; + hmp_data.attributes[i] = &hmp_data.attr[i].attr; + hmp_data.attributes[i + 1] = NULL; +} + +static int hmp_attr_init(void) +{ + int ret; + memset(&hmp_data, sizeof(hmp_data), 0); + /* by default load_avg_period_ms == LOAD_AVG_PERIOD + * meaning no change + */ + hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD); + + hmp_attr_add("load_avg_period_ms", + &hmp_data.multiplier, + hmp_period_tofrom_sysfs, + hmp_period_tofrom_sysfs); + hmp_attr_add("up_threshold", + &hmp_up_threshold, + NULL, + hmp_theshold_from_sysfs); + hmp_attr_add("down_threshold", + &hmp_down_threshold, + NULL, + hmp_theshold_from_sysfs); +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + /* default frequency-invariant scaling ON */ + hmp_data.freqinvar_load_scale_enabled = 1; + hmp_attr_add("frequency_invariant_load_scale", + &hmp_data.freqinvar_load_scale_enabled, + NULL, + hmp_freqinvar_from_sysfs); +#endif + hmp_data.attr_group.name = "hmp"; + hmp_data.attr_group.attrs = hmp_data.attributes; + ret = sysfs_create_group(kernel_kobj, + &hmp_data.attr_group); + return 0; +} +late_initcall(hmp_attr_init); +#endif /* CONFIG_HMP_VARIABLE_SCALE */ + +static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd, + int *min_cpu) +{ + int cpu; + int min_load = INT_MAX; + int min_cpu_temp = NR_CPUS; + + for_each_cpu_mask(cpu, hmpd->cpus) { + if (cpu_rq(cpu)->cfs.tg_load_contrib < min_load) { + min_load = cpu_rq(cpu)->cfs.tg_load_contrib; + min_cpu_temp = cpu; + } + } + + if (min_cpu) + *min_cpu = min_cpu_temp; + + return min_load; +} + +/* + * Calculate the task starvation + * This is the ratio of actually running time vs. runnable time. + * If the two are equal the task is getting the cpu time it needs or + * it is alone on the cpu and the cpu is fully utilized. + */ +static inline unsigned int hmp_task_starvation(struct sched_entity *se) +{ + u32 starvation; + + starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD); + starvation /= (se->avg.runnable_avg_sum + 1); + + return scale_load(starvation); +} + +static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se) +{ + int min_usage; + int dest_cpu = NR_CPUS; + + if (hmp_cpu_is_slowest(cpu)) + return NR_CPUS; + + /* Is the current domain fully loaded? */ + /* load < ~94% */ + min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL); + if (min_usage < NICE_0_LOAD-64) + return NR_CPUS; + + /* Is the cpu oversubscribed? */ + /* load < ~194% */ + if (cpu_rq(cpu)->cfs.tg_load_contrib < 2*NICE_0_LOAD-64) + return NR_CPUS; + + /* Is the task alone on the cpu? */ + if (cpu_rq(cpu)->cfs.nr_running < 2) + return NR_CPUS; + + /* Is the task actually starving? */ + if (hmp_task_starvation(se) > 768) /* <25% waiting */ + return NR_CPUS; + + /* Does the slower domain have spare cycles? */ + min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu); + /* load > 50% */ + if (min_usage > NICE_0_LOAD/2) + return NR_CPUS; + + if (cpumask_test_cpu(dest_cpu, &hmp_slower_domain(cpu)->cpus)) + return dest_cpu; + return NR_CPUS; +} #endif /* CONFIG_SCHED_HMP */ /* @@ -5875,10 +6268,14 @@ static unsigned int hmp_up_migration(int cpu, struct sched_entity *se) < hmp_next_up_threshold) return 0; - if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus, - tsk_cpus_allowed(p)) - && se->avg.load_avg_ratio > hmp_up_threshold) { - return 1; + if (se->avg.load_avg_ratio > hmp_up_threshold) { + /* Target domain load < ~94% */ + if (hmp_domain_min_load(hmp_faster_domain(cpu), NULL) + > NICE_0_LOAD-64) + return 0; + if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus, + tsk_cpus_allowed(p))) + return 1; } return 0; } @@ -6101,6 +6498,21 @@ static void hmp_force_up_migration(int this_cpu) hmp_next_up_delay(&p->se, target->push_cpu); } } + if (!force && !target->active_balance) { + /* + * For now we just check the currently running task. + * Selecting the lightest task for offloading will + * require extensive book keeping. + */ + target->push_cpu = hmp_offload_down(cpu, curr); + if (target->push_cpu < NR_CPUS) { + target->active_balance = 1; + target->migrate_task = p; + force = 1; + trace_sched_hmp_migrate(p, target->push_cpu, 2); + hmp_next_down_delay(&p->se, target->push_cpu); + } + } raw_spin_unlock_irqrestore(&target->lock, flags); if (force) stop_one_cpu_nowait(cpu_of(target), @@ -6632,3 +7044,132 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr) +{ + u32 result = curr / max; + return result; +} + +/* Called when the CPU Frequency is changed. + * Once for each CPU. + */ +static int cpufreq_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + int cpu = freq->cpu; + struct cpufreq_extents *extents; + + if (freq->flags & CPUFREQ_CONST_LOOPS) + return NOTIFY_OK; + + if (val != CPUFREQ_POSTCHANGE) + return NOTIFY_OK; + + /* if dynamic load scale is disabled, set the load scale to 1.0 */ + if (!hmp_data.freqinvar_load_scale_enabled) { + freq_scale[cpu].curr_scale = 1024; + return NOTIFY_OK; + } + + extents = &freq_scale[cpu]; + if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) { + /* If our governor was recognised as a single-freq governor, + * use 1.0 + */ + extents->curr_scale = 1024; + } else { + extents->curr_scale = cpufreq_calc_scale(extents->min, + extents->max, freq->new); + } + + return NOTIFY_OK; +} + +/* Called when the CPUFreq governor is changed. + * Only called for the CPUs which are actually changed by the + * userspace. + */ +static int cpufreq_policy_callback(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct cpufreq_policy *policy = data; + struct cpufreq_extents *extents; + int cpu, singleFreq = 0; + static const char performance_governor[] = "performance"; + static const char powersave_governor[] = "powersave"; + + if (event == CPUFREQ_START) + return 0; + + if (event != CPUFREQ_INCOMPATIBLE) + return 0; + + /* CPUFreq governors do not accurately report the range of + * CPU Frequencies they will choose from. + * We recognise performance and powersave governors as + * single-frequency only. + */ + if (!strncmp(policy->governor->name, performance_governor, + strlen(performance_governor)) || + !strncmp(policy->governor->name, powersave_governor, + strlen(powersave_governor))) + singleFreq = 1; + + /* Make sure that all CPUs impacted by this policy are + * updated since we will only get a notification when the + * user explicitly changes the policy on a CPU. + */ + for_each_cpu(cpu, policy->cpus) { + extents = &freq_scale[cpu]; + extents->max = policy->max >> SCHED_FREQSCALE_SHIFT; + extents->min = policy->min >> SCHED_FREQSCALE_SHIFT; + if (!hmp_data.freqinvar_load_scale_enabled) { + extents->curr_scale = 1024; + } else if (singleFreq) { + extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ; + extents->curr_scale = 1024; + } else { + extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ; + extents->curr_scale = cpufreq_calc_scale(extents->min, + extents->max, policy->cur); + } + } + + return 0; +} + +static struct notifier_block cpufreq_notifier = { + .notifier_call = cpufreq_callback, +}; +static struct notifier_block cpufreq_policy_notifier = { + .notifier_call = cpufreq_policy_callback, +}; + +static int __init register_sched_cpufreq_notifier(void) +{ + int ret = 0; + + /* init safe defaults since there are no policies at registration */ + for (ret = 0; ret < CONFIG_NR_CPUS; ret++) { + /* safe defaults */ + freq_scale[ret].max = 1024; + freq_scale[ret].min = 1024; + freq_scale[ret].curr_scale = 1024; + } + + pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n"); + ret = cpufreq_register_notifier(&cpufreq_policy_notifier, + CPUFREQ_POLICY_NOTIFIER); + + if (ret != -EINVAL) + ret = cpufreq_register_notifier(&cpufreq_notifier, + CPUFREQ_TRANSITION_NOTIFIER); + + return ret; +} + +core_initcall(register_sched_cpufreq_notifier); +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf new file mode 100644 index 000000000000..8cc2be049a41 --- /dev/null +++ b/linaro/configs/big-LITTLE-MP.conf @@ -0,0 +1,13 @@ +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_NO_HZ=y +CONFIG_SCHED_MC=y +CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y +CONFIG_SCHED_HMP=y +CONFIG_HMP_FAST_CPU_MASK="" +CONFIG_HMP_SLOW_CPU_MASK="" +CONFIG_HMP_VARIABLE_SCALE=y +CONFIG_HMP_FREQUENCY_INVARIANT_SCALE=y +CONFIG_SCHED_HMP_PRIO_FILTER=y +CONFIG_SCHED_HMP_PRIO_FILTER_VAL=5 |