diff options
author | Andrey Konovalov <andrey.konovalov@linaro.org> | 2012-07-16 16:16:46 +0400 |
---|---|---|
committer | Andrey Konovalov <andrey.konovalov@linaro.org> | 2012-07-16 16:16:46 +0400 |
commit | 5809a4ec27b44ec5c8019e1cc5ce5d1f5912e0d6 (patch) | |
tree | cc0f28997f383d830d9ffc10e5b76114ce46524a | |
parent | 8381163af689328b7d6ed5de3ef6204f5fc28353 (diff) | |
parent | ca789c61deb0219cd3fefdb26090bbc9701deada (diff) |
Merge branch 'tracking-big-LITTLE-MP-v2' into merge-linux-linaro-core-trackingtracking-llct-ll-20120716.0llct-20120716.0
32 files changed, 2903 insertions, 1032 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 1d117e639fd4..99d2f1f3e83a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1542,6 +1542,35 @@ config SCHED_SMT MultiThreading at a cost of slightly increased overhead in some places. If unsure say N here. +config DISABLE_CPU_SCHED_DOMAIN_BALANCE + bool "(EXPERIMENTAL) Disable CPU level scheduler load-balancing" + help + Disables scheduler load-balancing at CPU sched domain level. + +config SCHED_HMP + bool "(EXPERIMENTAL) Heterogenous multiprocessor scheduling" + depends on DISABLE_CPU_SCHED_DOMAIN_BALANCE && SCHED_MC && FAIR_GROUP_SCHED && !SCHED_AUTOGROUP + help + Experimental scheduler optimizations for heterogeneous platforms. + Attempts introspectively select task affinity to optimize power + and performance. Currently support two types of CPUs: fast + (high-performance) and slow (power-efficient). There is currently + no support for migration of task groups, hence !SCHED_AUTOGROUP. + +config HMP_FAST_CPU_MASK + string "HMP scheduler fast CPU mask" + depends on SCHED_HMP + help + Specifies the cpuids of the fast CPUs in the system as a list + string, e.g. cpuid 0+1 should be specified as 0-1. + +config HMP_SLOW_CPU_MASK + string "HMP scheduler slow CPU mask" + depends on SCHED_HMP + help + Specifies the cpuids of the slow CPUs in the system as a list + string, e.g. cpuid 0+1 should be specified as 0-1. + config HAVE_ARM_SCU bool help diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 8200deaa14f6..f59193cd9aaa 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -17,11 +17,160 @@ #include <linux/percpu.h> #include <linux/node.h> #include <linux/nodemask.h> +#include <linux/of.h> #include <linux/sched.h> +#include <linux/slab.h> #include <asm/cputype.h> #include <asm/topology.h> +/* + * cpu power scale management + */ + +/* + * cpu power table + * This per cpu data structure describes the relative capacity of each core. + * On a heteregenous system, cores don't have the same computation capacity + * and we reflect that difference in the cpu_power field so the scheduler can + * take this difference into account during load balance. A per cpu structure + * is preferred because each CPU updates its own cpu_power field during the + * load balance except for idle cores. One idle core is selected to run the + * rebalance_domains for all idle cores and the cpu_power can be updated + * during this sequence. + */ +static DEFINE_PER_CPU(unsigned long, cpu_scale); + +unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +static void set_power_scale(unsigned int cpu, unsigned long power) +{ + per_cpu(cpu_scale, cpu) = power; +} + +#ifdef CONFIG_OF +struct cpu_efficiency { + const char *compatible; + unsigned long efficiency; +}; + +/* + * Table of relative efficiency of each processors + * The efficiency value must fit in 20bit. The final + * cpu_scale value must be in the range + * 0 < cpu_scale < 2*SCHED_POWER_SCALE. + * Processors that are not defined in the table, + * use the default SCHED_POWER_SCALE value for cpu_scale. + */ +struct cpu_efficiency table_efficiency[] = { + {"arm,cortex-a15", 3891}, + {"arm,cortex-a7", 2048}, + {NULL, }, +}; + +struct cpu_capacity { + unsigned long hwid; + unsigned long capacity; +}; + +struct cpu_capacity *cpu_capacity; + +unsigned long middle_capacity = 1; + +static void __init parse_dt_topology(void) +{ + struct cpu_efficiency *cpu_eff; + struct device_node *cn = NULL; + unsigned long min_capacity = (unsigned long)(-1); + unsigned long max_capacity = 0; + unsigned long capacity = 0; + int alloc_size, cpu = 0; + + alloc_size = nr_cpu_ids * sizeof(struct cpu_capacity); + cpu_capacity = (struct cpu_capacity *)kzalloc(alloc_size, GFP_NOWAIT); + + while ((cn = of_find_node_by_type(cn, "cpu"))) { + const u32 *rate, *reg; + int len; + + if (cpu >= num_possible_cpus()) + break; + + for (cpu_eff = table_efficiency; cpu_eff->compatible; cpu_eff++) + if (of_device_is_compatible(cn, cpu_eff->compatible)) + break; + + if (cpu_eff->compatible == NULL) + continue; + + rate = of_get_property(cn, "clock-frequency", &len); + if (!rate || len != 4) { + pr_err("%s missing clock-frequency property\n", + cn->full_name); + continue; + } + + reg = of_get_property(cn, "reg", &len); + if (!reg || len != 4) { + pr_err("%s missing reg property\n", cn->full_name); + continue; + } + + capacity = ((be32_to_cpup(rate)) >> 20) * cpu_eff->efficiency; + + /* Save min capacity of the system */ + if (capacity < min_capacity) + min_capacity = capacity; + + /* Save max capacity of the system */ + if (capacity > max_capacity) + max_capacity = capacity; + + cpu_capacity[cpu].capacity = capacity; + cpu_capacity[cpu++].hwid = be32_to_cpup(reg); + } + + if (cpu < num_possible_cpus()) + cpu_capacity[cpu].hwid = (unsigned long)(-1); + + middle_capacity = (min_capacity + max_capacity) >> 11; +} + +void update_cpu_power(unsigned int cpu, unsigned long hwid) +{ + unsigned int idx = 0; + + /* look for the cpu's hwid in the cpu capacity table */ + for (idx = 0; idx < num_possible_cpus(); idx++) { + if (cpu_capacity[idx].hwid == hwid) + break; + + if (cpu_capacity[idx].hwid == -1) + return; + } + + if (idx == num_possible_cpus()) + return; + + set_power_scale(cpu, cpu_capacity[idx].capacity / middle_capacity); + + printk(KERN_INFO "CPU%u: update cpu_power %lu\n", + cpu, arch_scale_freq_power(NULL, cpu)); +} + +#else +static inline void parse_dt_topology(void) {} +static inline void update_cpu_power(unsigned int cpuid, unsigned int mpidr) {} +#endif + + +/* + * cpu topology management + */ + #define MPIDR_SMP_BITMASK (0x3 << 30) #define MPIDR_SMP_VALUE (0x2 << 30) @@ -31,6 +180,7 @@ * These masks reflect the current use of the affinity levels. * The affinity level can be up to 16 bits according to ARM ARM */ +#define MPIDR_HWID_BITMASK 0xFFFFFF #define MPIDR_LEVEL0_MASK 0x3 #define MPIDR_LEVEL0_SHIFT 0 @@ -41,6 +191,9 @@ #define MPIDR_LEVEL2_MASK 0xFF #define MPIDR_LEVEL2_SHIFT 16 +/* + * cpu topology table + */ struct cputopo_arm cpu_topology[NR_CPUS]; const struct cpumask *cpu_coregroup_mask(int cpu) @@ -48,6 +201,32 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return &cpu_topology[cpu].core_sibling; } +void update_siblings_masks(unsigned int cpuid) +{ + struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; + int cpu; + + /* update core and thread sibling masks */ + for_each_possible_cpu(cpu) { + cpu_topo = &cpu_topology[cpu]; + + if (cpuid_topo->socket_id != cpu_topo->socket_id) + continue; + + cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); + if (cpu != cpuid) + cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); + + if (cpuid_topo->core_id != cpu_topo->core_id) + continue; + + cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling); + if (cpu != cpuid) + cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling); + } + smp_wmb(); +} + /* * store_cpu_topology is called at boot when only one cpu is running * and with the mutex cpu_hotplug.lock locked, when several cpus have booted, @@ -57,7 +236,6 @@ void store_cpu_topology(unsigned int cpuid) { struct cputopo_arm *cpuid_topo = &cpu_topology[cpuid]; unsigned int mpidr; - unsigned int cpu; /* If the cpu topology has been already set, just return */ if (cpuid_topo->core_id != -1) @@ -99,26 +277,9 @@ void store_cpu_topology(unsigned int cpuid) cpuid_topo->socket_id = -1; } - /* update core and thread sibling masks */ - for_each_possible_cpu(cpu) { - struct cputopo_arm *cpu_topo = &cpu_topology[cpu]; - - if (cpuid_topo->socket_id == cpu_topo->socket_id) { - cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); - if (cpu != cpuid) - cpumask_set_cpu(cpu, - &cpuid_topo->core_sibling); - - if (cpuid_topo->core_id == cpu_topo->core_id) { - cpumask_set_cpu(cpuid, - &cpu_topo->thread_sibling); - if (cpu != cpuid) - cpumask_set_cpu(cpu, - &cpuid_topo->thread_sibling); - } - } - } - smp_wmb(); + update_siblings_masks(cpuid); + + update_cpu_power(cpuid, mpidr & MPIDR_HWID_BITMASK); printk(KERN_INFO "CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n", cpuid, cpu_topology[cpuid].thread_id, @@ -134,7 +295,7 @@ void init_cpu_topology(void) { unsigned int cpu; - /* init core mask */ + /* init core mask and power*/ for_each_possible_cpu(cpu) { struct cputopo_arm *cpu_topo = &(cpu_topology[cpu]); @@ -143,6 +304,10 @@ void init_cpu_topology(void) cpu_topo->socket_id = -1; cpumask_clear(&cpu_topo->core_sibling); cpumask_clear(&cpu_topo->thread_sibling); + + set_power_scale(cpu, SCHED_POWER_SCALE); } smp_wmb(); + + parse_dt_topology(); } diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6ab6aa2fdfdd..c5981267a60c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o sched.o mshyperv.o +obj-y += vmware.o hypervisor.o mshyperv.o obj-y += rdrand.o obj-y += match.o diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c deleted file mode 100644 index a640ae5ad201..000000000000 --- a/arch/x86/kernel/cpu/sched.c +++ /dev/null @@ -1,55 +0,0 @@ -#include <linux/sched.h> -#include <linux/math64.h> -#include <linux/percpu.h> -#include <linux/irqflags.h> - -#include <asm/cpufeature.h> -#include <asm/processor.h> - -#ifdef CONFIG_SMP - -static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched); - -static unsigned long scale_aperfmperf(void) -{ - struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched); - unsigned long ratio, flags; - - local_irq_save(flags); - get_aperfmperf(&val); - local_irq_restore(flags); - - ratio = calc_aperfmperf_ratio(old, &val); - *old = val; - - return ratio; -} - -unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) -{ - /* - * do aperf/mperf on the cpu level because it includes things - * like turbo mode, which are relevant to full cores. - */ - if (boot_cpu_has(X86_FEATURE_APERFMPERF)) - return scale_aperfmperf(); - - /* - * maybe have something cpufreq here - */ - - return default_scale_freq_power(sd, cpu); -} - -unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) -{ - /* - * aperf/mperf already includes the smt gain - */ - if (boot_cpu_has(X86_FEATURE_APERFMPERF)) - return SCHED_LOAD_SCALE; - - return default_scale_smt_power(sd, cpu); -} - -#endif diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 78a666d1e5f5..a76b689e553b 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -18,3 +18,6 @@ config CPU_IDLE_GOV_MENU bool depends on CPU_IDLE && NO_HZ default y + +config ARCH_NEEDS_CPU_IDLE_COUPLED + def_bool n diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile index 5634f88379df..38c8f69f30cf 100644 --- a/drivers/cpuidle/Makefile +++ b/drivers/cpuidle/Makefile @@ -3,3 +3,4 @@ # obj-y += cpuidle.o driver.o governor.o sysfs.o governors/ +obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c new file mode 100644 index 000000000000..fc427fa1fefb --- /dev/null +++ b/drivers/cpuidle/coupled.c @@ -0,0 +1,715 @@ +/* + * coupled.c - helper functions to enter the same idle state on multiple cpus + * + * Copyright (c) 2011 Google, Inc. + * + * Author: Colin Cross <ccross@android.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/kernel.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> + +#include "cpuidle.h" + +/** + * DOC: Coupled cpuidle states + * + * On some ARM SMP SoCs (OMAP4460, Tegra 2, and probably more), the + * cpus cannot be independently powered down, either due to + * sequencing restrictions (on Tegra 2, cpu 0 must be the last to + * power down), or due to HW bugs (on OMAP4460, a cpu powering up + * will corrupt the gic state unless the other cpu runs a work + * around). Each cpu has a power state that it can enter without + * coordinating with the other cpu (usually Wait For Interrupt, or + * WFI), and one or more "coupled" power states that affect blocks + * shared between the cpus (L2 cache, interrupt controller, and + * sometimes the whole SoC). Entering a coupled power state must + * be tightly controlled on both cpus. + * + * This file implements a solution, where each cpu will wait in the + * WFI state until all cpus are ready to enter a coupled state, at + * which point the coupled state function will be called on all + * cpus at approximately the same time. + * + * Once all cpus are ready to enter idle, they are woken by an smp + * cross call. At this point, there is a chance that one of the + * cpus will find work to do, and choose not to enter idle. A + * final pass is needed to guarantee that all cpus will call the + * power state enter function at the same time. During this pass, + * each cpu will increment the ready counter, and continue once the + * ready counter matches the number of online coupled cpus. If any + * cpu exits idle, the other cpus will decrement their counter and + * retry. + * + * requested_state stores the deepest coupled idle state each cpu + * is ready for. It is assumed that the states are indexed from + * shallowest (highest power, lowest exit latency) to deepest + * (lowest power, highest exit latency). The requested_state + * variable is not locked. It is only written from the cpu that + * it stores (or by the on/offlining cpu if that cpu is offline), + * and only read after all the cpus are ready for the coupled idle + * state are are no longer updating it. + * + * Three atomic counters are used. alive_count tracks the number + * of cpus in the coupled set that are currently or soon will be + * online. waiting_count tracks the number of cpus that are in + * the waiting loop, in the ready loop, or in the coupled idle state. + * ready_count tracks the number of cpus that are in the ready loop + * or in the coupled idle state. + * + * To use coupled cpuidle states, a cpuidle driver must: + * + * Set struct cpuidle_device.coupled_cpus to the mask of all + * coupled cpus, usually the same as cpu_possible_mask if all cpus + * are part of the same cluster. The coupled_cpus mask must be + * set in the struct cpuidle_device for each cpu. + * + * Set struct cpuidle_device.safe_state to a state that is not a + * coupled state. This is usually WFI. + * + * Set CPUIDLE_FLAG_COUPLED in struct cpuidle_state.flags for each + * state that affects multiple cpus. + * + * Provide a struct cpuidle_state.enter function for each state + * that affects multiple cpus. This function is guaranteed to be + * called on all cpus at approximately the same time. The driver + * should ensure that the cpus all abort together if any cpu tries + * to abort once the function is called. The function should return + * with interrupts still disabled. + */ + +/** + * struct cpuidle_coupled - data for set of cpus that share a coupled idle state + * @coupled_cpus: mask of cpus that are part of the coupled set + * @requested_state: array of requested states for cpus in the coupled set + * @ready_waiting_counts: combined count of cpus in ready or waiting loops + * @online_count: count of cpus that are online + * @refcnt: reference count of cpuidle devices that are using this struct + * @prevent: flag to prevent coupled idle while a cpu is hotplugging + */ +struct cpuidle_coupled { + cpumask_t coupled_cpus; + int requested_state[NR_CPUS]; + atomic_t ready_waiting_counts; + int online_count; + int refcnt; + int prevent; +}; + +#define WAITING_BITS 16 +#define MAX_WAITING_CPUS (1 << WAITING_BITS) +#define WAITING_MASK (MAX_WAITING_CPUS - 1) +#define READY_MASK (~WAITING_MASK) + +#define CPUIDLE_COUPLED_NOT_IDLE (-1) + +static DEFINE_MUTEX(cpuidle_coupled_lock); +static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb); + +/* + * The cpuidle_coupled_poked_mask mask is used to avoid calling + * __smp_call_function_single with the per cpu call_single_data struct already + * in use. This prevents a deadlock where two cpus are waiting for each others + * call_single_data struct to be available + */ +static cpumask_t cpuidle_coupled_poked_mask; + +/** + * cpuidle_coupled_parallel_barrier - synchronize all online coupled cpus + * @dev: cpuidle_device of the calling cpu + * @a: atomic variable to hold the barrier + * + * No caller to this function will return from this function until all online + * cpus in the same coupled group have called this function. Once any caller + * has returned from this function, the barrier is immediately available for + * reuse. + * + * The atomic variable a must be initialized to 0 before any cpu calls + * this function, will be reset to 0 before any cpu returns from this function. + * + * Must only be called from within a coupled idle state handler + * (state.enter when state.flags has CPUIDLE_FLAG_COUPLED set). + * + * Provides full smp barrier semantics before and after calling. + */ +void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a) +{ + int n = dev->coupled->online_count; + + smp_mb__before_atomic_inc(); + atomic_inc(a); + + while (atomic_read(a) < n) + cpu_relax(); + + if (atomic_inc_return(a) == n * 2) { + atomic_set(a, 0); + return; + } + + while (atomic_read(a) > n) + cpu_relax(); +} + +/** + * cpuidle_state_is_coupled - check if a state is part of a coupled set + * @dev: struct cpuidle_device for the current cpu + * @drv: struct cpuidle_driver for the platform + * @state: index of the target state in drv->states + * + * Returns true if the target state is coupled with cpus besides this one + */ +bool cpuidle_state_is_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int state) +{ + return drv->states[state].flags & CPUIDLE_FLAG_COUPLED; +} + +/** + * cpuidle_coupled_set_ready - mark a cpu as ready + * @coupled: the struct coupled that contains the current cpu + */ +static inline void cpuidle_coupled_set_ready(struct cpuidle_coupled *coupled) +{ + atomic_add(MAX_WAITING_CPUS, &coupled->ready_waiting_counts); +} + +/** + * cpuidle_coupled_set_not_ready - mark a cpu as not ready + * @coupled: the struct coupled that contains the current cpu + * + * Decrements the ready counter, unless the ready (and thus the waiting) counter + * is equal to the number of online cpus. Prevents a race where one cpu + * decrements the waiting counter and then re-increments it just before another + * cpu has decremented its ready counter, leading to the ready counter going + * down from the number of online cpus without going through the coupled idle + * state. + * + * Returns 0 if the counter was decremented successfully, -EINVAL if the ready + * counter was equal to the number of online cpus. + */ +static +inline int cpuidle_coupled_set_not_ready(struct cpuidle_coupled *coupled) +{ + int all; + int ret; + + all = coupled->online_count || (coupled->online_count << WAITING_BITS); + ret = atomic_add_unless(&coupled->ready_waiting_counts, + -MAX_WAITING_CPUS, all); + + return ret ? 0 : -EINVAL; +} + +/** + * cpuidle_coupled_no_cpus_ready - check if no cpus in a coupled set are ready + * @coupled: the struct coupled that contains the current cpu + * + * Returns true if all of the cpus in a coupled set are out of the ready loop. + */ +static inline int cpuidle_coupled_no_cpus_ready(struct cpuidle_coupled *coupled) +{ + int r = atomic_read(&coupled->ready_waiting_counts) >> WAITING_BITS; + return r == 0; +} + +/** + * cpuidle_coupled_cpus_ready - check if all cpus in a coupled set are ready + * @coupled: the struct coupled that contains the current cpu + * + * Returns true if all cpus coupled to this target state are in the ready loop + */ +static inline bool cpuidle_coupled_cpus_ready(struct cpuidle_coupled *coupled) +{ + int r = atomic_read(&coupled->ready_waiting_counts) >> WAITING_BITS; + return r == coupled->online_count; +} + +/** + * cpuidle_coupled_cpus_waiting - check if all cpus in a coupled set are waiting + * @coupled: the struct coupled that contains the current cpu + * + * Returns true if all cpus coupled to this target state are in the wait loop + */ +static inline bool cpuidle_coupled_cpus_waiting(struct cpuidle_coupled *coupled) +{ + int w = atomic_read(&coupled->ready_waiting_counts) & WAITING_MASK; + return w == coupled->online_count; +} + +/** + * cpuidle_coupled_no_cpus_waiting - check if no cpus in coupled set are waiting + * @coupled: the struct coupled that contains the current cpu + * + * Returns true if all of the cpus in a coupled set are out of the waiting loop. + */ +static inline int cpuidle_coupled_no_cpus_waiting(struct cpuidle_coupled *coupled) +{ + int w = atomic_read(&coupled->ready_waiting_counts) & WAITING_MASK; + return w == 0; +} + +/** + * cpuidle_coupled_get_state - determine the deepest idle state + * @dev: struct cpuidle_device for this cpu + * @coupled: the struct coupled that contains the current cpu + * + * Returns the deepest idle state that all coupled cpus can enter + */ +static inline int cpuidle_coupled_get_state(struct cpuidle_device *dev, + struct cpuidle_coupled *coupled) +{ + int i; + int state = INT_MAX; + + /* + * Read barrier ensures that read of requested_state is ordered after + * reads of ready_count. Matches the write barriers + * cpuidle_set_state_waiting. + */ + smp_rmb(); + + for_each_cpu_mask(i, coupled->coupled_cpus) + if (cpu_online(i) && coupled->requested_state[i] < state) + state = coupled->requested_state[i]; + + return state; +} + +static void cpuidle_coupled_poked(void *info) +{ + int cpu = (unsigned long)info; + cpumask_clear_cpu(cpu, &cpuidle_coupled_poked_mask); +} + +/** + * cpuidle_coupled_poke - wake up a cpu that may be waiting + * @cpu: target cpu + * + * Ensures that the target cpu exits it's waiting idle state (if it is in it) + * and will see updates to waiting_count before it re-enters it's waiting idle + * state. + * + * If cpuidle_coupled_poked_mask is already set for the target cpu, that cpu + * either has or will soon have a pending IPI that will wake it out of idle, + * or it is currently processing the IPI and is not in idle. + */ +static void cpuidle_coupled_poke(int cpu) +{ + struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu); + + if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poked_mask)) + __smp_call_function_single(cpu, csd, 0); +} + +/** + * cpuidle_coupled_poke_others - wake up all other cpus that may be waiting + * @dev: struct cpuidle_device for this cpu + * @coupled: the struct coupled that contains the current cpu + * + * Calls cpuidle_coupled_poke on all other online cpus. + */ +static void cpuidle_coupled_poke_others(int this_cpu, + struct cpuidle_coupled *coupled) +{ + int cpu; + + for_each_cpu_mask(cpu, coupled->coupled_cpus) + if (cpu != this_cpu && cpu_online(cpu)) + cpuidle_coupled_poke(cpu); +} + +/** + * cpuidle_coupled_set_waiting - mark this cpu as in the wait loop + * @dev: struct cpuidle_device for this cpu + * @coupled: the struct coupled that contains the current cpu + * @next_state: the index in drv->states of the requested state for this cpu + * + * Updates the requested idle state for the specified cpuidle device, + * poking all coupled cpus out of idle if necessary to let them see the new + * state. + */ +static void cpuidle_coupled_set_waiting(int cpu, + struct cpuidle_coupled *coupled, int next_state) +{ + int w; + + coupled->requested_state[cpu] = next_state; + + /* + * If this is the last cpu to enter the waiting state, poke + * all the other cpus out of their waiting state so they can + * enter a deeper state. This can race with one of the cpus + * exiting the waiting state due to an interrupt and + * decrementing waiting_count, see comment below. + * + * The atomic_inc_return provides a write barrier to order the write + * to requested_state with the later write that increments ready_count. + */ + w = atomic_inc_return(&coupled->ready_waiting_counts) & WAITING_MASK; + if (w == coupled->online_count) + cpuidle_coupled_poke_others(cpu, coupled); +} + +/** + * cpuidle_coupled_set_not_waiting - mark this cpu as leaving the wait loop + * @dev: struct cpuidle_device for this cpu + * @coupled: the struct coupled that contains the current cpu + * + * Removes the requested idle state for the specified cpuidle device. + */ +static void cpuidle_coupled_set_not_waiting(int cpu, + struct cpuidle_coupled *coupled) +{ + /* + * Decrementing waiting count can race with incrementing it in + * cpuidle_coupled_set_waiting, but that's OK. Worst case, some + * cpus will increment ready_count and then spin until they + * notice that this cpu has cleared it's requested_state. + */ + atomic_dec(&coupled->ready_waiting_counts); + + coupled->requested_state[cpu] = CPUIDLE_COUPLED_NOT_IDLE; +} + +/** + * cpuidle_coupled_set_done - mark this cpu as leaving the ready loop + * @cpu: the current cpu + * @coupled: the struct coupled that contains the current cpu + * + * Marks this cpu as no longer in the ready and waiting loops. Decrements + * the waiting count first to prevent another cpu looping back in and seeing + * this cpu as waiting just before it exits idle. + */ +static void cpuidle_coupled_set_done(int cpu, struct cpuidle_coupled *coupled) +{ + cpuidle_coupled_set_not_waiting(cpu, coupled); + atomic_sub(MAX_WAITING_CPUS, &coupled->ready_waiting_counts); +} + +/** + * cpuidle_coupled_clear_pokes - spin until the poke interrupt is processed + * @cpu - this cpu + * + * Turns on interrupts and spins until any outstanding poke interrupts have + * been processed and the poke bit has been cleared. + * + * Other interrupts may also be processed while interrupts are enabled, so + * need_resched() must be tested after turning interrupts off again to make sure + * the interrupt didn't schedule work that should take the cpu out of idle. + * + * Returns 0 if need_resched was false, -EINTR if need_resched was true. + */ +static int cpuidle_coupled_clear_pokes(int cpu) +{ + local_irq_enable(); + while (cpumask_test_cpu(cpu, &cpuidle_coupled_poked_mask)) + cpu_relax(); + local_irq_disable(); + + return need_resched() ? -EINTR : 0; +} + +/** + * cpuidle_enter_state_coupled - attempt to enter a state with coupled cpus + * @dev: struct cpuidle_device for the current cpu + * @drv: struct cpuidle_driver for the platform + * @next_state: index of the requested state in drv->states + * + * Coordinate with coupled cpus to enter the target state. This is a two + * stage process. In the first stage, the cpus are operating independently, + * and may call into cpuidle_enter_state_coupled at completely different times. + * To save as much power as possible, the first cpus to call this function will + * go to an intermediate state (the cpuidle_device's safe state), and wait for + * all the other cpus to call this function. Once all coupled cpus are idle, + * the second stage will start. Each coupled cpu will spin until all cpus have + * guaranteed that they will call the target_state. + * + * This function must be called with interrupts disabled. It may enable + * interrupts while preparing for idle, and it will always return with + * interrupts enabled. + */ +int cpuidle_enter_state_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int next_state) +{ + int entered_state = -1; + struct cpuidle_coupled *coupled = dev->coupled; + + if (!coupled) + return -EINVAL; + + while (coupled->prevent) { + if (cpuidle_coupled_clear_pokes(dev->cpu)) { + local_irq_enable(); + return entered_state; + } + entered_state = cpuidle_enter_state(dev, drv, + dev->safe_state_index); + } + + /* Read barrier ensures online_count is read after prevent is cleared */ + smp_rmb(); + + cpuidle_coupled_set_waiting(dev->cpu, coupled, next_state); + +retry: + /* + * Wait for all coupled cpus to be idle, using the deepest state + * allowed for a single cpu. + */ + while (!cpuidle_coupled_cpus_waiting(coupled)) { + if (cpuidle_coupled_clear_pokes(dev->cpu)) { + cpuidle_coupled_set_not_waiting(dev->cpu, coupled); + goto out; + } + + if (coupled->prevent) { + cpuidle_coupled_set_not_waiting(dev->cpu, coupled); + goto out; + } + + entered_state = cpuidle_enter_state(dev, drv, + dev->safe_state_index); + } + + if (cpuidle_coupled_clear_pokes(dev->cpu)) { + cpuidle_coupled_set_not_waiting(dev->cpu, coupled); + goto out; + } + + /* + * All coupled cpus are probably idle. There is a small chance that + * one of the other cpus just became active. Increment the ready count, + * and spin until all coupled cpus have incremented the counter. Once a + * cpu has incremented the ready counter, it cannot abort idle and must + * spin until either all cpus have incremented the ready counter, or + * another cpu leaves idle and decrements the waiting counter. + */ + + cpuidle_coupled_set_ready(coupled); + while (!cpuidle_coupled_cpus_ready(coupled)) { + /* Check if any other cpus bailed out of idle. */ + if (!cpuidle_coupled_cpus_waiting(coupled)) + if (!cpuidle_coupled_set_not_ready(coupled)) + goto retry; + + cpu_relax(); + } + + /* all cpus have acked the coupled state */ + next_state = cpuidle_coupled_get_state(dev, coupled); + + entered_state = cpuidle_enter_state(dev, drv, next_state); + + cpuidle_coupled_set_done(dev->cpu, coupled); + +out: + /* + * Normal cpuidle states are expected to return with irqs enabled. + * That leads to an inefficiency where a cpu receiving an interrupt + * that brings it out of idle will process that interrupt before + * exiting the idle enter function and decrementing ready_count. All + * other cpus will need to spin waiting for the cpu that is processing + * the interrupt. If the driver returns with interrupts disabled, + * all other cpus will loop back into the safe idle state instead of + * spinning, saving power. + * + * Calling local_irq_enable here allows coupled states to return with + * interrupts disabled, but won't cause problems for drivers that + * exit with interrupts enabled. + */ + local_irq_enable(); + + /* + * Wait until all coupled cpus have exited idle. There is no risk that + * a cpu exits and re-enters the ready state because this cpu has + * already decremented its waiting_count. + */ + while (!cpuidle_coupled_no_cpus_ready(coupled)) + cpu_relax(); + + return entered_state; +} + +static void cpuidle_coupled_update_online_cpus(struct cpuidle_coupled *coupled) +{ + cpumask_t cpus; + cpumask_and(&cpus, cpu_online_mask, &coupled->coupled_cpus); + coupled->online_count = cpumask_weight(&cpus); +} + +/** + * cpuidle_coupled_register_device - register a coupled cpuidle device + * @dev: struct cpuidle_device for the current cpu + * + * Called from cpuidle_register_device to handle coupled idle init. Finds the + * cpuidle_coupled struct for this set of coupled cpus, or creates one if none + * exists yet. + */ +int cpuidle_coupled_register_device(struct cpuidle_device *dev) +{ + int cpu; + struct cpuidle_device *other_dev; + struct call_single_data *csd; + struct cpuidle_coupled *coupled; + + if (cpumask_empty(&dev->coupled_cpus)) + return 0; + + for_each_cpu_mask(cpu, dev->coupled_cpus) { + other_dev = per_cpu(cpuidle_devices, cpu); + if (other_dev && other_dev->coupled) { + coupled = other_dev->coupled; + goto have_coupled; + } + } + + /* No existing coupled info found, create a new one */ + coupled = kzalloc(sizeof(struct cpuidle_coupled), GFP_KERNEL); + if (!coupled) + return -ENOMEM; + + coupled->coupled_cpus = dev->coupled_cpus; + +have_coupled: + dev->coupled = coupled; + if (WARN_ON(!cpumask_equal(&dev->coupled_cpus, &coupled->coupled_cpus))) + coupled->prevent++; + + cpuidle_coupled_update_online_cpus(coupled); + + coupled->refcnt++; + + csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu); + csd->func = cpuidle_coupled_poked; + csd->info = (void *)(unsigned long)dev->cpu; + + return 0; +} + +/** + * cpuidle_coupled_unregister_device - unregister a coupled cpuidle device + * @dev: struct cpuidle_device for the current cpu + * + * Called from cpuidle_unregister_device to tear down coupled idle. Removes the + * cpu from the coupled idle set, and frees the cpuidle_coupled_info struct if + * this was the last cpu in the set. + */ +void cpuidle_coupled_unregister_device(struct cpuidle_device *dev) +{ + struct cpuidle_coupled *coupled = dev->coupled; + + if (cpumask_empty(&dev->coupled_cpus)) + return; + + if (--coupled->refcnt) + kfree(coupled); + dev->coupled = NULL; +} + +/** + * cpuidle_coupled_prevent_idle - prevent cpus from entering a coupled state + * @coupled: the struct coupled that contains the cpu that is changing state + * + * Disables coupled cpuidle on a coupled set of cpus. Used to ensure that + * cpu_online_mask doesn't change while cpus are coordinating coupled idle. + */ +static void cpuidle_coupled_prevent_idle(struct cpuidle_coupled *coupled) +{ + int cpu = get_cpu(); + + /* Force all cpus out of the waiting loop. */ + coupled->prevent++; + cpuidle_coupled_poke_others(cpu, coupled); + put_cpu(); + while (!cpuidle_coupled_no_cpus_waiting(coupled)) + cpu_relax(); +} + +/** + * cpuidle_coupled_allow_idle - allows cpus to enter a coupled state + * @coupled: the struct coupled that contains the cpu that is changing state + * + * Enables coupled cpuidle on a coupled set of cpus. Used to ensure that + * cpu_online_mask doesn't change while cpus are coordinating coupled idle. + */ +static void cpuidle_coupled_allow_idle(struct cpuidle_coupled *coupled) +{ + int cpu = get_cpu(); + + /* + * Write barrier ensures readers see the new online_count when they + * see prevent == false. + */ + smp_wmb(); + coupled->prevent--; + /* Force cpus out of the prevent loop. */ + cpuidle_coupled_poke_others(cpu, coupled); + put_cpu(); +} + +/** + * cpuidle_coupled_cpu_notify - notifier called during hotplug transitions + * @nb: notifier block + * @action: hotplug transition + * @hcpu: target cpu number + * + * Called when a cpu is brought on or offline using hotplug. Updates the + * coupled cpu set appropriately + */ +static int cpuidle_coupled_cpu_notify(struct notifier_block *nb, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + struct cpuidle_device *dev; + + mutex_lock(&cpuidle_lock); + + dev = per_cpu(cpuidle_devices, cpu); + if (!dev->coupled) + goto out; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + case CPU_DOWN_PREPARE: + cpuidle_coupled_prevent_idle(dev->coupled); + break; + case CPU_ONLINE: + case CPU_DEAD: + cpuidle_coupled_update_online_cpus(dev->coupled); + /* Fall through */ + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + cpuidle_coupled_allow_idle(dev->coupled); + break; + } + +out: + mutex_unlock(&cpuidle_lock); + return NOTIFY_OK; +} + +static struct notifier_block cpuidle_coupled_cpu_notifier = { + .notifier_call = cpuidle_coupled_cpu_notify, +}; + +static int __init cpuidle_coupled_init(void) +{ + return register_cpu_notifier(&cpuidle_coupled_cpu_notifier); +} +core_initcall(cpuidle_coupled_init); diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index d90519cec880..ed1dd859299b 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -92,6 +92,34 @@ int cpuidle_play_dead(void) } /** + * cpuidle_enter_state - enter the state and update stats + * @dev: cpuidle device for this cpu + * @drv: cpuidle driver for this cpu + * @next_state: index into drv->states of the state to enter + */ +int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, + int next_state) +{ + int entered_state; + + entered_state = cpuidle_enter_ops(dev, drv, next_state); + + if (entered_state >= 0) { + /* Update cpuidle counters */ + /* This can be moved to within driver enter routine + * but that results in multiple copies of same code. + */ + dev->states_usage[entered_state].time += + (unsigned long long)dev->last_residency; + dev->states_usage[entered_state].usage++; + } else { + dev->last_residency = 0; + } + + return entered_state; +} + +/** * cpuidle_idle_call - the main idle loop * * NOTE: no locks or semaphores should be used here @@ -132,23 +160,15 @@ int cpuidle_idle_call(void) trace_power_start_rcuidle(POWER_CSTATE, next_state, dev->cpu); trace_cpu_idle_rcuidle(next_state, dev->cpu); - entered_state = cpuidle_enter_ops(dev, drv, next_state); + if (cpuidle_state_is_coupled(dev, drv, next_state)) + entered_state = cpuidle_enter_state_coupled(dev, drv, + next_state); + else + entered_state = cpuidle_enter_state(dev, drv, next_state); trace_power_end_rcuidle(dev->cpu); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); - if (entered_state >= 0) { - /* Update cpuidle counters */ - /* This can be moved to within driver enter routine - * but that results in multiple copies of same code. - */ - dev->states_usage[entered_state].time += - (unsigned long long)dev->last_residency; - dev->states_usage[entered_state].usage++; - } else { - dev->last_residency = 0; - } - /* give the governor an opportunity to reflect on the outcome */ if (cpuidle_curr_governor->reflect) cpuidle_curr_governor->reflect(dev, entered_state); @@ -376,13 +396,25 @@ static int __cpuidle_register_device(struct cpuidle_device *dev) per_cpu(cpuidle_devices, dev->cpu) = dev; list_add(&dev->device_list, &cpuidle_detected_devices); - if ((ret = cpuidle_add_sysfs(cpu_dev))) { - module_put(cpuidle_driver->owner); - return ret; - } + ret = cpuidle_add_sysfs(cpu_dev); + if (ret) + goto err_sysfs; + + ret = cpuidle_coupled_register_device(dev); + if (ret) + goto err_coupled; dev->registered = 1; return 0; + +err_coupled: + cpuidle_remove_sysfs(cpu_dev); + wait_for_completion(&dev->kobj_unregister); +err_sysfs: + list_del(&dev->device_list); + per_cpu(cpuidle_devices, dev->cpu) = NULL; + module_put(cpuidle_driver->owner); + return ret; } /** @@ -432,6 +464,8 @@ void cpuidle_unregister_device(struct cpuidle_device *dev) wait_for_completion(&dev->kobj_unregister); per_cpu(cpuidle_devices, dev->cpu) = NULL; + cpuidle_coupled_unregister_device(dev); + cpuidle_resume_and_unlock(); module_put(cpuidle_driver->owner); diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h index 7db186685c27..76e7f696ad8c 100644 --- a/drivers/cpuidle/cpuidle.h +++ b/drivers/cpuidle/cpuidle.h @@ -14,6 +14,8 @@ extern struct list_head cpuidle_detected_devices; extern struct mutex cpuidle_lock; extern spinlock_t cpuidle_driver_lock; extern int cpuidle_disabled(void); +extern int cpuidle_enter_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int next_state); /* idle loop */ extern void cpuidle_install_idle_handler(void); @@ -30,4 +32,34 @@ extern void cpuidle_remove_state_sysfs(struct cpuidle_device *device); extern int cpuidle_add_sysfs(struct device *dev); extern void cpuidle_remove_sysfs(struct device *dev); +#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED +bool cpuidle_state_is_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int state); +int cpuidle_enter_state_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int next_state); +int cpuidle_coupled_register_device(struct cpuidle_device *dev); +void cpuidle_coupled_unregister_device(struct cpuidle_device *dev); +#else +static inline bool cpuidle_state_is_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int state) +{ + return false; +} + +static inline int cpuidle_enter_state_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int next_state) +{ + return -1; +} + +static inline int cpuidle_coupled_register_device(struct cpuidle_device *dev) +{ + return 0; +} + +static inline void cpuidle_coupled_unregister_device(struct cpuidle_device *dev) +{ +} +#endif + #endif /* __DRIVER_CPUIDLE_H */ diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c index 53589000fd07..8c6327876d98 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.c +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -42,6 +42,7 @@ */ #include <linux/slab.h> +#include <linux/smpboot.h> #include "ehca_classes.h" #include "ehca_irq.h" @@ -652,7 +653,7 @@ void ehca_tasklet_eq(unsigned long data) ehca_process_eq((struct ehca_shca*)data, 1); } -static inline int find_next_online_cpu(struct ehca_comp_pool *pool) +static int find_next_online_cpu(struct ehca_comp_pool *pool) { int cpu; unsigned long flags; @@ -662,10 +663,15 @@ static inline int find_next_online_cpu(struct ehca_comp_pool *pool) ehca_dmp(cpu_online_mask, cpumask_size(), ""); spin_lock_irqsave(&pool->last_cpu_lock, flags); - cpu = cpumask_next(pool->last_cpu, cpu_online_mask); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(cpu_online_mask); - pool->last_cpu = cpu; + while (1) { + cpu = cpumask_next(pool->last_cpu, cpu_online_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(cpu_online_mask); + pool->last_cpu = cpu; + /* Might be on the way out */ + if (per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active) + break; + } spin_unlock_irqrestore(&pool->last_cpu_lock, flags); return cpu; @@ -719,19 +725,18 @@ static void queue_comp_task(struct ehca_cq *__cq) static void run_comp_task(struct ehca_cpu_comp_task *cct) { struct ehca_cq *cq; - unsigned long flags; - spin_lock_irqsave(&cct->task_lock, flags); + spin_lock_irq(&cct->task_lock); while (!list_empty(&cct->cq_list)) { cq = list_entry(cct->cq_list.next, struct ehca_cq, entry); - spin_unlock_irqrestore(&cct->task_lock, flags); + spin_unlock_irq(&cct->task_lock); comp_event_callback(cq); if (atomic_dec_and_test(&cq->nr_events)) wake_up(&cq->wait_completion); - spin_lock_irqsave(&cct->task_lock, flags); + spin_lock_irq(&cct->task_lock); spin_lock(&cq->task_lock); cq->nr_callbacks--; if (!cq->nr_callbacks) { @@ -741,17 +746,51 @@ static void run_comp_task(struct ehca_cpu_comp_task *cct) spin_unlock(&cq->task_lock); } - spin_unlock_irqrestore(&cct->task_lock, flags); + spin_unlock_irq(&cct->task_lock); } -static int comp_task(void *__cct) +static void comp_task_park(unsigned int cpu) { - struct ehca_cpu_comp_task *cct = __cct; - int cql_empty; + struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + struct ehca_cpu_comp_task *target; + struct ehca_cq *cq, *tmp; + LIST_HEAD(list); + + spin_lock_irq(&cct->task_lock); + cct->cq_jobs = 0; + cct->active = 0; + list_splice_init(&cct->cq_list, &list); + spin_unlock_irq(&cct->task_lock); + + cpu = find_next_online_cpu(pool); + target = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + spin_lock_irq(&target->task_lock); + list_for_each_entry_safe(cq, tmp, &list, entry) { + list_del(&cq->entry); + __queue_comp_task(cq, target); + } + spin_unlock_irq(&target->task_lock); +} + +static void comp_task_stop(unsigned int cpu, bool online) +{ + struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + + spin_lock_irq(&cct->task_lock); + cct->cq_jobs = 0; + cct->active = 0; + WARN_ON(!list_empty(&cct->cq_list)); + spin_unlock_irq(&cct->task_lock); +} + +static int comp_task(void *td) +{ + struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks); DECLARE_WAITQUEUE(wait, current); + int cql_empty; - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { + while (!smpboot_thread_check_parking(td)) { + set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&cct->wait_queue, &wait); spin_lock_irq(&cct->task_lock); @@ -768,131 +807,21 @@ static int comp_task(void *__cct) cql_empty = list_empty(&cct->cq_list); spin_unlock_irq(&cct->task_lock); if (!cql_empty) - run_comp_task(__cct); - - set_current_state(TASK_INTERRUPTIBLE); + run_comp_task(cct); } - __set_current_state(TASK_RUNNING); - return 0; } -static struct task_struct *create_comp_task(struct ehca_comp_pool *pool, - int cpu) -{ - struct ehca_cpu_comp_task *cct; - - cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); - spin_lock_init(&cct->task_lock); - INIT_LIST_HEAD(&cct->cq_list); - init_waitqueue_head(&cct->wait_queue); - cct->task = kthread_create_on_node(comp_task, cct, cpu_to_node(cpu), - "ehca_comp/%d", cpu); - - return cct->task; -} - -static void destroy_comp_task(struct ehca_comp_pool *pool, - int cpu) -{ - struct ehca_cpu_comp_task *cct; - struct task_struct *task; - unsigned long flags_cct; - - cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); - - spin_lock_irqsave(&cct->task_lock, flags_cct); - - task = cct->task; - cct->task = NULL; - cct->cq_jobs = 0; - - spin_unlock_irqrestore(&cct->task_lock, flags_cct); - - if (task) - kthread_stop(task); -} - -static void __cpuinit take_over_work(struct ehca_comp_pool *pool, int cpu) -{ - struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); - LIST_HEAD(list); - struct ehca_cq *cq; - unsigned long flags_cct; - - spin_lock_irqsave(&cct->task_lock, flags_cct); - - list_splice_init(&cct->cq_list, &list); - - while (!list_empty(&list)) { - cq = list_entry(cct->cq_list.next, struct ehca_cq, entry); - - list_del(&cq->entry); - __queue_comp_task(cq, this_cpu_ptr(pool->cpu_comp_tasks)); - } - - spin_unlock_irqrestore(&cct->task_lock, flags_cct); - -} - -static int __cpuinit comp_pool_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct ehca_cpu_comp_task *cct; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - ehca_gen_dbg("CPU: %x (CPU_PREPARE)", cpu); - if (!create_comp_task(pool, cpu)) { - ehca_gen_err("Can't create comp_task for cpu: %x", cpu); - return notifier_from_errno(-ENOMEM); - } - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - ehca_gen_dbg("CPU: %x (CPU_CANCELED)", cpu); - cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); - kthread_bind(cct->task, cpumask_any(cpu_online_mask)); - destroy_comp_task(pool, cpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - ehca_gen_dbg("CPU: %x (CPU_ONLINE)", cpu); - cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); - kthread_bind(cct->task, cpu); - wake_up_process(cct->task); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - ehca_gen_dbg("CPU: %x (CPU_DOWN_PREPARE)", cpu); - break; - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - ehca_gen_dbg("CPU: %x (CPU_DOWN_FAILED)", cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - ehca_gen_dbg("CPU: %x (CPU_DEAD)", cpu); - destroy_comp_task(pool, cpu); - take_over_work(pool, cpu); - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block comp_pool_callback_nb __cpuinitdata = { - .notifier_call = comp_pool_callback, - .priority = 0, +static struct smp_hotplug_thread comp_pool_threads = { + .thread_fn = comp_task, + .thread_comm = "ehca_comp/%u", + .cleanup = comp_task_stop, + .park = comp_task_park, }; int ehca_create_comp_pool(void) { - int cpu; - struct task_struct *task; + int cpu, ret = -ENOMEM; if (!ehca_scaling_code) return 0; @@ -905,38 +834,47 @@ int ehca_create_comp_pool(void) pool->last_cpu = cpumask_any(cpu_online_mask); pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task); - if (pool->cpu_comp_tasks == NULL) { - kfree(pool); - return -EINVAL; - } + if (!pool->cpu_comp_tasks) + goto out_pool; - for_each_online_cpu(cpu) { - task = create_comp_task(pool, cpu); - if (task) { - kthread_bind(task, cpu); - wake_up_process(task); - } + pool->cpu_comp_threads = alloc_percpu(struct task_struct *); + if (!pool->cpu_comp_threads) + goto out_tasks; + + for_each_present_cpu(cpu) { + struct ehca_cpu_comp_task *cct; + + cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + spin_lock_init(&cct->task_lock); + INIT_LIST_HEAD(&cct->cq_list); + init_waitqueue_head(&cct->wait_queue); } - register_hotcpu_notifier(&comp_pool_callback_nb); + comp_pool_threads.store = pool->cpu_comp_threads; + ret = smpboot_register_percpu_thread(&comp_pool_threads); + if (ret) + goto out_threads; - printk(KERN_INFO "eHCA scaling code enabled\n"); + pr_info("eHCA scaling code enabled\n"); + return ret; - return 0; +out_threads: + free_percpu(pool->cpu_comp_threads); +out_tasks: + free_percpu(pool->cpu_comp_tasks); +out_pool: + kfree(pool); + return ret; } void ehca_destroy_comp_pool(void) { - int i; - if (!ehca_scaling_code) return; - unregister_hotcpu_notifier(&comp_pool_callback_nb); - - for_each_online_cpu(i) - destroy_comp_task(pool, i); + smpboot_unregister_percpu_thread(&comp_pool_threads); + free_percpu(pool->cpu_comp_threads); free_percpu(pool->cpu_comp_tasks); kfree(pool); } diff --git a/drivers/infiniband/hw/ehca/ehca_irq.h b/drivers/infiniband/hw/ehca/ehca_irq.h index 3346cb06cea6..43c2a0c772fa 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.h +++ b/drivers/infiniband/hw/ehca/ehca_irq.h @@ -60,13 +60,14 @@ void ehca_process_eq(struct ehca_shca *shca, int is_irq); struct ehca_cpu_comp_task { wait_queue_head_t wait_queue; struct list_head cq_list; - struct task_struct *task; spinlock_t task_lock; int cq_jobs; + int active; }; struct ehca_comp_pool { - struct ehca_cpu_comp_task *cpu_comp_tasks; + struct ehca_cpu_comp_task __percpu *cpu_comp_tasks; + struct task_struct * __percpu *cpu_comp_threads; int last_cpu; spinlock_t last_cpu_lock; }; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 6c26a3da0e03..5ab7183313ce 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -57,6 +57,7 @@ struct cpuidle_state { /* Idle State Flags */ #define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? */ +#define CPUIDLE_FLAG_COUPLED (0x02) /* state applies to multiple cpus */ #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) @@ -100,6 +101,12 @@ struct cpuidle_device { struct list_head device_list; struct kobject kobj; struct completion kobj_unregister; + +#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED + int safe_state_index; + cpumask_t coupled_cpus; + struct cpuidle_coupled *coupled; +#endif }; DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices); @@ -176,6 +183,10 @@ static inline int cpuidle_play_dead(void) {return -ENODEV; } #endif +#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED +void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a); +#endif + /****************************** * CPUIDLE GOVERNOR INTERFACE * ******************************/ diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 0714b24c0e45..5365347a0dc9 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -14,6 +14,11 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), kthread_create_on_node(threadfn, data, -1, namefmt, ##arg) +struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + void *data, + unsigned int cpu, + const char *namefmt); + /** * kthread_run - create and wake a thread. * @threadfn: the function to run until signal_pending(current). @@ -34,9 +39,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void kthread_bind(struct task_struct *k, unsigned int cpu); int kthread_stop(struct task_struct *k); -int kthread_should_stop(void); +bool kthread_should_stop(void); +bool kthread_should_park(void); bool kthread_freezable_should_stop(bool *was_frozen); void *kthread_data(struct task_struct *k); +int kthread_park(struct task_struct *k); +void kthread_unpark(struct task_struct *k); +void kthread_parkme(void); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; diff --git a/include/linux/sched.h b/include/linux/sched.h index 71d51c1c3f59..2b5fba1e9f6a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1100,6 +1100,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); @@ -1134,6 +1135,15 @@ struct load_weight { unsigned long weight, inv_weight; }; +struct sched_avg { + u32 runnable_avg_sum, runnable_avg_period; + u64 last_runnable_update; + s64 decay_count; + unsigned long load_avg_contrib; + unsigned long load_avg_ratio; + u32 usage_avg_sum; +}; + #ifdef CONFIG_SCHEDSTATS struct sched_statistics { u64 wait_start; @@ -1194,6 +1204,15 @@ struct sched_entity { /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; #endif +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + /* Per-entity load-tracking */ + struct sched_avg avg; +#endif }; struct sched_rt_entity { diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h new file mode 100644 index 000000000000..3279bee98ef5 --- /dev/null +++ b/include/linux/smpboot.h @@ -0,0 +1,40 @@ +#ifndef _LINUX_SMPBOOT_H +#define _LINUX_SMPBOOT_H + +#include <linux/types.h> + +struct task_struct; +/* Cookie handed to the thread_fn*/ +struct smpboot_thread_data; + +/** + * struct smp_hotplug_thread - CPU hotplug related thread descriptor + * @store: Pointer to per cpu storage for the task pointers + * @list: List head for core management + * @thread_fn: The associated thread function + * @setup: Optional setup function, called when the thread gets + * operational the first time + * @cleanup: Optional cleanup function, called when the thread + * should stop (module exit) + * @park: Optional park function, called when the thread is + * parked (cpu offline) + * @unpark: Optional unpark function, called when the thread is + * unparked (cpu online) + * @thread_comm: The base name of the thread + */ +struct smp_hotplug_thread { + struct task_struct __percpu **store; + struct list_head list; + int (*thread_fn)(void *data); + void (*setup)(unsigned int cpu); + void (*cleanup)(unsigned int cpu, bool online); + void (*park)(unsigned int cpu); + void (*unpark)(unsigned int cpu); + const char *thread_comm; +}; + +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); +void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); +int smpboot_thread_check_parking(struct smpboot_thread_data *td); + +#endif diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ea7a2035456d..2c50a06fbedd 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -426,6 +426,157 @@ TRACE_EVENT(sched_pi_setprio, __entry->oldprio, __entry->newprio) ); +/* + * Tracepoint for showing tracked load contribution. + */ +TRACE_EVENT(sched_task_load_contrib, + + TP_PROTO(struct task_struct *tsk, unsigned long load_contrib), + + TP_ARGS(tsk, load_contrib), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned long, load_contrib) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->load_contrib = load_contrib; + ), + + TP_printk("comm=%s pid=%d load_contrib=%lu", + __entry->comm, __entry->pid, + __entry->load_contrib) +); + +/* + * Tracepoint for showing tracked task runnable ratio [0..1023]. + */ +TRACE_EVENT(sched_task_runnable_ratio, + + TP_PROTO(struct task_struct *tsk, unsigned long ratio), + + TP_ARGS(tsk, ratio), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned long, ratio) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->ratio = ratio; + ), + + TP_printk("comm=%s pid=%d ratio=%lu", + __entry->comm, __entry->pid, + __entry->ratio) +); + +/* + * Tracepoint for showing tracked rq runnable ratio [0..1023]. + */ +TRACE_EVENT(sched_rq_runnable_ratio, + + TP_PROTO(int cpu, unsigned long ratio), + + TP_ARGS(cpu, ratio), + + TP_STRUCT__entry( + __field(int, cpu) + __field(unsigned long, ratio) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->ratio = ratio; + ), + + TP_printk("cpu=%d ratio=%lu", + __entry->cpu, + __entry->ratio) +); + +/* + * Tracepoint for showing tracked rq runnable load. + */ +TRACE_EVENT(sched_rq_runnable_load, + + TP_PROTO(int cpu, u64 load), + + TP_ARGS(cpu, load), + + TP_STRUCT__entry( + __field(int, cpu) + __field(u64, load) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->load = load; + ), + + TP_printk("cpu=%d load=%llu", + __entry->cpu, + __entry->load) +); + +/* + * Tracepoint for showing tracked task cpu usage ratio [0..1023]. + */ +TRACE_EVENT(sched_task_usage_ratio, + + TP_PROTO(struct task_struct *tsk, unsigned long ratio), + + TP_ARGS(tsk, ratio), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned long, ratio) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->ratio = ratio; + ), + + TP_printk("comm=%s pid=%d ratio=%lu", + __entry->comm, __entry->pid, + __entry->ratio) +); + +/* + * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations. + */ +TRACE_EVENT(sched_hmp_migrate, + + TP_PROTO(struct task_struct *tsk, int val), + + TP_ARGS(tsk, val), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(int, val) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->val = val; + ), + + TP_printk("comm=%s pid=%d val=%d", + __entry->comm, __entry->pid, + __entry->val) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 8e0da8c0ab5a..a792c9bf345a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -280,12 +280,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __func__, cpu); goto out_release; } + smpboot_park_threads(cpu); err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ + smpboot_unpark_threads(cpu); cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); - goto out_release; } BUG_ON(cpu_online(cpu)); @@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out; } + ret = smpboot_create_threads(cpu); + if (ret) + goto out; + ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; @@ -368,6 +373,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); + /* Wake the per cpu threads */ + smpboot_unpark_threads(cpu); + /* Now call notifier in preparation. */ cpu_notify(CPU_ONLINE | mod, hcpu); diff --git a/kernel/kthread.c b/kernel/kthread.c index 3d3de633702e..c887dd109bfe 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -37,11 +37,20 @@ struct kthread_create_info }; struct kthread { - int should_stop; + unsigned long flags; + unsigned int cpu; void *data; + struct completion parked; struct completion exited; }; +enum KTHREAD_BITS { + KTHREAD_IS_PER_CPU = 0, + KTHREAD_SHOULD_STOP, + KTHREAD_SHOULD_PARK, + KTHREAD_IS_PARKED, +}; + #define to_kthread(tsk) \ container_of((tsk)->vfork_done, struct kthread, exited) @@ -52,13 +61,29 @@ struct kthread { * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ -int kthread_should_stop(void) +bool kthread_should_stop(void) { - return to_kthread(current)->should_stop; + return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); /** + * kthread_should_park - should this kthread park now? + * + * When someone calls kthread_park() on your kthread, it will be woken + * and this will return true. You should then do the necessary + * cleanup and call kthread_parkme() + * + * Similar to kthread_should_stop(), but this keeps the thread alive + * and in a park position. kthread_unpark() "restarts" the thread and + * calls the thread function again. + */ +bool kthread_should_park(void) +{ + return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); +} + +/** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * @@ -96,6 +121,24 @@ void *kthread_data(struct task_struct *task) return to_kthread(task)->data; } +static void __kthread_parkme(struct kthread *self) +{ + __set_current_state(TASK_INTERRUPTIBLE); + while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { + if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) + complete(&self->parked); + schedule(); + __set_current_state(TASK_INTERRUPTIBLE); + } + clear_bit(KTHREAD_IS_PARKED, &self->flags); + __set_current_state(TASK_RUNNING); +} + +void kthread_parkme(void) +{ + __kthread_parkme(to_kthread(current)); +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -105,9 +148,10 @@ static int kthread(void *_create) struct kthread self; int ret; - self.should_stop = 0; + self.flags = 0; self.data = data; init_completion(&self.exited); + init_completion(&self.parked); current->vfork_done = &self.exited; /* OK, tell user we're spawned, wait for stop or wakeup */ @@ -117,9 +161,11 @@ static int kthread(void *_create) schedule(); ret = -EINTR; - if (!self.should_stop) - ret = threadfn(data); + if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { + __kthread_parkme(&self); + ret = threadfn(data); + } /* we can't just return, we must preserve "self" on stack */ do_exit(ret); } @@ -172,8 +218,7 @@ static void create_kthread(struct kthread_create_info *create) * Returns a task_struct or ERR_PTR(-ENOMEM). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), - void *data, - int node, + void *data, int node, const char namefmt[], ...) { @@ -210,6 +255,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); +static void __kthread_bind(struct task_struct *p, unsigned int cpu) +{ + /* It's safe because the task is inactive. */ + do_set_cpus_allowed(p, cpumask_of(cpu)); + p->flags |= PF_THREAD_BOUND; +} + /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). @@ -226,14 +278,111 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) WARN_ON(1); return; } - - /* It's safe because the task is inactive. */ - do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; + __kthread_bind(p, cpu); } EXPORT_SYMBOL(kthread_bind); /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @cpu: The cpu on which the thread should be bound, + * @namefmt: printf-style name for the thread. + * + * Description: This helper function creates and names a kernel thread + * The thread will be woken and put into park mode. + */ +struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + void *data, unsigned int cpu, + const char *namefmt) +{ + struct task_struct *p; + + p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, + cpu); + if (IS_ERR(p)) + return p; + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; + /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ + kthread_park(p); + return p; +} + +static struct kthread *task_get_live_kthread(struct task_struct *k) +{ + struct kthread *kthread; + + get_task_struct(k); + kthread = to_kthread(k); + /* It might have exited */ + barrier(); + if (k->vfork_done != NULL) + return kthread; + return NULL; +} + +/** + * kthread_unpark - unpark a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return false, wakes it, and + * waits for it to return. If the thread is marked percpu then its + * bound to the cpu again. + */ +void kthread_unpark(struct task_struct *k) +{ + struct kthread *kthread = task_get_live_kthread(k); + + if (kthread) { + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * We clear the IS_PARKED bit here as we don't wait + * until the task has left the park code. So if we'd + * park before that happens we'd see the IS_PARKED bit + * which might be about to be cleared. + */ + if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu); + wake_up_process(k); + } + } + put_task_struct(k); +} + +/** + * kthread_park - park a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return true, wakes it, and + * waits for it to return. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will park without + * calling threadfn(). + * + * Returns 0 if the thread is parked, -ENOSYS if the thread exited. + * If called by the kthread itself just the park bit is set. + */ +int kthread_park(struct task_struct *k) +{ + struct kthread *kthread = task_get_live_kthread(k); + int ret = -ENOSYS; + + if (kthread) { + if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + if (k != current) { + wake_up_process(k); + wait_for_completion(&kthread->parked); + } + } + ret = 0; + } + put_task_struct(k); + return ret; +} + +/** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * @@ -250,16 +399,13 @@ EXPORT_SYMBOL(kthread_bind); */ int kthread_stop(struct task_struct *k) { - struct kthread *kthread; + struct kthread *kthread = task_get_live_kthread(k); int ret; trace_sched_kthread_stop(k); - get_task_struct(k); - - kthread = to_kthread(k); - barrier(); /* it might have exited */ - if (k->vfork_done != NULL) { - kthread->should_stop = 1; + if (kthread) { + set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); wake_up_process(k); wait_for_completion(&kthread->exited); } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4b97bba7396e..a8ba38369efa 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -125,13 +125,12 @@ static int rcu_scheduler_fully_active __read_mostly; */ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DEFINE_PER_CPU(char, rcu_cpu_has_work); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -1461,8 +1460,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ /* Adjust any no-longer-needed kthreads. */ - rcu_stop_cpu_kthread(cpu); - rcu_node_kthread_setaffinity(rnp, -1); + rcu_boost_kthread_setaffinity(rnp, -1); /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ @@ -2516,12 +2514,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; case CPU_ONLINE: case CPU_DOWN_FAILED: - rcu_node_kthread_setaffinity(rnp, -1); - rcu_cpu_kthread_setrt(cpu, 1); + rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: - rcu_node_kthread_setaffinity(rnp, cpu); - rcu_cpu_kthread_setrt(cpu, 0); + rcu_boost_kthread_setaffinity(rnp, cpu); break; case CPU_DYING: case CPU_DYING_FROZEN: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 19b61ac1079f..1989438c68fb 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -192,12 +192,6 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ - struct task_struct *node_kthread_task; - /* kthread that takes care of this rcu_node */ - /* structure, for example, awakening the */ - /* per-CPU kthreads as needed. */ - unsigned int node_kthread_status; - /* State of node_kthread_task for tracing. */ } ____cacheline_internodealigned_in_smp; /* @@ -449,7 +443,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); -static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); @@ -479,15 +472,9 @@ static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void); -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, - cpumask_var_t cm); static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp, - int rnp_index); -static void invoke_rcu_node_kthread(struct rcu_node *rnp); -static void rcu_yield(void (*f)(unsigned long), unsigned long arg); + struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3e4899459f3d..cfde23b6c767 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,6 +25,7 @@ */ #include <linux/delay.h> +#include <linux/smpboot.h> #define RCU_KTHREAD_PRIO 1 @@ -1225,6 +1226,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) #endif /* #else #ifdef CONFIG_RCU_TRACE */ +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) + wake_up_process(t); +} + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1297,17 +1308,6 @@ static int rcu_boost(struct rcu_node *rnp) } /* - * Timer handler to initiate waking up of boost kthreads that - * have yielded the CPU due to excessive numbers of tasks to - * boost. We wake up the per-rcu_node kthread, which in turn - * will wake up the booster kthread. - */ -static void rcu_boost_kthread_timer(unsigned long arg) -{ - invoke_rcu_node_kthread((struct rcu_node *)arg); -} - -/* * Priority-boosting kthread. One per leaf rcu_node and one for the * root rcu_node. */ @@ -1330,8 +1330,9 @@ static int rcu_boost_kthread(void *arg) else spincnt = 0; if (spincnt > 10) { + rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; trace_rcu_utilization("End boost kthread@rcu_yield"); - rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + schedule_timeout_interruptible(2); trace_rcu_utilization("Start boost kthread@rcu_yield"); spincnt = 0; } @@ -1369,8 +1370,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore(&rnp->lock, flags); t = rnp->boost_kthread_task; - if (t != NULL) - wake_up_process(t); + if (t) + rcu_wake_cond(t, rnp->boost_kthread_status); } else { rcu_initiate_boost_trace(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1387,8 +1388,10 @@ static void invoke_rcu_callbacks_kthread(void) local_irq_save(flags); __this_cpu_write(rcu_cpu_has_work, 1); if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_cpu_kthread_task)) - wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); + current != __this_cpu_read(rcu_cpu_kthread_task)) { + rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), + __this_cpu_read(rcu_cpu_kthread_status)); + } local_irq_restore(flags); } @@ -1401,21 +1404,6 @@ static bool rcu_is_callbacks_kthread(void) return __get_cpu_var(rcu_cpu_kthread_task) == current; } -/* - * Set the affinity of the boost kthread. The CPU-hotplug locks are - * held, so no one should be messing with the existence of the boost - * kthread. - */ -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, - cpumask_var_t cm) -{ - struct task_struct *t; - - t = rnp->boost_kthread_task; - if (t != NULL) - set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); -} - #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* @@ -1432,15 +1420,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * Returns zero if all is well, a negated errno otherwise. */ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp, - int rnp_index) + struct rcu_node *rnp) { + int rnp_index = rnp - &rsp->node[0]; unsigned long flags; struct sched_param sp; struct task_struct *t; if (&rcu_preempt_state != rsp) return 0; + + if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) + return 0; + rsp->boost = 1; if (rnp->boost_kthread_task != NULL) return 0; @@ -1457,25 +1449,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Stop the RCU's per-CPU kthread when its CPU goes offline,. - */ -static void rcu_stop_cpu_kthread(int cpu) -{ - struct task_struct *t; - - /* Stop the CPU's kthread. */ - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t != NULL) { - per_cpu(rcu_cpu_kthread_task, cpu) = NULL; - kthread_stop(t); - } -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - static void rcu_kthread_do_work(void) { rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); @@ -1483,112 +1456,17 @@ static void rcu_kthread_do_work(void) rcu_preempt_do_callbacks(); } -/* - * Wake up the specified per-rcu_node-structure kthread. - * Because the per-rcu_node kthreads are immortal, we don't need - * to do anything to keep them alive. - */ -static void invoke_rcu_node_kthread(struct rcu_node *rnp) +static void rcu_cpu_kthread_setup(unsigned int cpu) { - struct task_struct *t; - - t = rnp->node_kthread_task; - if (t != NULL) - wake_up_process(t); -} - -/* - * Set the specified CPU's kthread to run RT or not, as specified by - * the to_rt argument. The CPU-hotplug locks are held, so the task - * is not going away. - */ -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) -{ - int policy; struct sched_param sp; - struct task_struct *t; - - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t == NULL) - return; - if (to_rt) { - policy = SCHED_FIFO; - sp.sched_priority = RCU_KTHREAD_PRIO; - } else { - policy = SCHED_NORMAL; - sp.sched_priority = 0; - } - sched_setscheduler_nocheck(t, policy, &sp); -} -/* - * Timer handler to initiate the waking up of per-CPU kthreads that - * have yielded the CPU due to excess numbers of RCU callbacks. - * We wake up the per-rcu_node kthread, which in turn will wake up - * the booster kthread. - */ -static void rcu_cpu_kthread_timer(unsigned long arg) -{ - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); - struct rcu_node *rnp = rdp->mynode; - - atomic_or(rdp->grpmask, &rnp->wakemask); - invoke_rcu_node_kthread(rnp); -} - -/* - * Drop to non-real-time priority and yield, but only after posting a - * timer that will cause us to regain our real-time priority if we - * remain preempted. Either way, we restore our real-time priority - * before returning. - */ -static void rcu_yield(void (*f)(unsigned long), unsigned long arg) -{ - struct sched_param sp; - struct timer_list yield_timer; - int prio = current->rt_priority; - - setup_timer_on_stack(&yield_timer, f, arg); - mod_timer(&yield_timer, jiffies + 2); - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); - set_user_nice(current, 19); - schedule(); - set_user_nice(current, 0); - sp.sched_priority = prio; + sp.sched_priority = RCU_KTHREAD_PRIO; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); - del_timer(&yield_timer); } -/* - * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. - * This can happen while the corresponding CPU is either coming online - * or going offline. We cannot wait until the CPU is fully online - * before starting the kthread, because the various notifier functions - * can wait for RCU grace periods. So we park rcu_cpu_kthread() until - * the corresponding CPU is online. - * - * Return 1 if the kthread needs to stop, 0 otherwise. - * - * Caller must disable bh. This function can momentarily enable it. - */ -static int rcu_cpu_kthread_should_stop(int cpu) +static void rcu_cpu_kthread_park(unsigned int cpu) { - while (cpu_is_offline(cpu) || - !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || - smp_processor_id() != cpu) { - if (kthread_should_stop()) - return 1; - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; - per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); - local_bh_enable(); - schedule_timeout_uninterruptible(1); - if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - local_bh_disable(); - } - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - return 0; + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; } /* @@ -1596,28 +1474,23 @@ static int rcu_cpu_kthread_should_stop(int cpu) * RCU softirq used in flavors and configurations of RCU that do not * support RCU priority boosting. */ -static int rcu_cpu_kthread(void *arg) +static int rcu_cpu_kthread(void *cookie) { - int cpu = (int)(long)arg; + unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); + char work, *workp = &__get_cpu_var(rcu_cpu_has_work); unsigned long flags; int spincnt = 0; - unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); - char work; - char *workp = &per_cpu(rcu_cpu_has_work, cpu); - trace_rcu_utilization("Start CPU kthread@init"); + trace_rcu_utilization("Start CPU kthread@unpark"); for (;;) { *statusp = RCU_KTHREAD_WAITING; trace_rcu_utilization("End CPU kthread@rcu_wait"); - rcu_wait(*workp != 0 || kthread_should_stop()); + rcu_wait(*workp != 0 || + smpboot_thread_check_parking(cookie)); trace_rcu_utilization("Start CPU kthread@rcu_wait"); local_bh_disable(); - if (rcu_cpu_kthread_should_stop(cpu)) { - local_bh_enable(); - break; - } *statusp = RCU_KTHREAD_RUNNING; - per_cpu(rcu_cpu_kthread_loops, cpu)++; + this_cpu_inc(rcu_cpu_kthread_loops); local_irq_save(flags); work = *workp; *workp = 0; @@ -1632,101 +1505,13 @@ static int rcu_cpu_kthread(void *arg) if (spincnt > 10) { *statusp = RCU_KTHREAD_YIELDING; trace_rcu_utilization("End CPU kthread@rcu_yield"); - rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); + schedule_timeout_interruptible(2); trace_rcu_utilization("Start CPU kthread@rcu_yield"); spincnt = 0; } } + /* Not reached */ *statusp = RCU_KTHREAD_STOPPED; - trace_rcu_utilization("End CPU kthread@term"); - return 0; -} - -/* - * Spawn a per-CPU kthread, setting up affinity and priority. - * Because the CPU hotplug lock is held, no other CPU will be attempting - * to manipulate rcu_cpu_kthread_task. There might be another CPU - * attempting to access it during boot, but the locking in kthread_bind() - * will enforce sufficient ordering. - * - * Please note that we cannot simply refuse to wake up the per-CPU - * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, - * which can result in softlockup complaints if the task ends up being - * idle for more than a couple of minutes. - * - * However, please note also that we cannot bind the per-CPU kthread to its - * CPU until that CPU is fully online. We also cannot wait until the - * CPU is fully online before we create its per-CPU kthread, as this would - * deadlock the system when CPU notifiers tried waiting for grace - * periods. So we bind the per-CPU kthread to its CPU only if the CPU - * is online. If its CPU is not yet fully online, then the code in - * rcu_cpu_kthread() will wait until it is fully online, and then do - * the binding. - */ -static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) -{ - struct sched_param sp; - struct task_struct *t; - - if (!rcu_scheduler_fully_active || - per_cpu(rcu_cpu_kthread_task, cpu) != NULL) - return 0; - t = kthread_create_on_node(rcu_cpu_kthread, - (void *)(long)cpu, - cpu_to_node(cpu), - "rcuc/%d", cpu); - if (IS_ERR(t)) - return PTR_ERR(t); - if (cpu_online(cpu)) - kthread_bind(t, cpu); - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - per_cpu(rcu_cpu_kthread_task, cpu) = t; - wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ - return 0; -} - -/* - * Per-rcu_node kthread, which is in charge of waking up the per-CPU - * kthreads when needed. We ignore requests to wake up kthreads - * for offline CPUs, which is OK because force_quiescent_state() - * takes care of this case. - */ -static int rcu_node_kthread(void *arg) -{ - int cpu; - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp = (struct rcu_node *)arg; - struct sched_param sp; - struct task_struct *t; - - for (;;) { - rnp->node_kthread_status = RCU_KTHREAD_WAITING; - rcu_wait(atomic_read(&rnp->wakemask) != 0); - rnp->node_kthread_status = RCU_KTHREAD_RUNNING; - raw_spin_lock_irqsave(&rnp->lock, flags); - mask = atomic_xchg(&rnp->wakemask, 0); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { - if ((mask & 0x1) == 0) - continue; - preempt_disable(); - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (!cpu_online(cpu) || t == NULL) { - preempt_enable(); - continue; - } - per_cpu(rcu_cpu_has_work, cpu) = 1; - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - preempt_enable(); - } - } - /* NOTREACHED */ - rnp->node_kthread_status = RCU_KTHREAD_STOPPED; return 0; } @@ -1739,17 +1524,17 @@ static int rcu_node_kthread(void *arg) * no outgoing CPU. If there are no CPUs left in the affinity set, * this function allows the kthread to execute on any CPU. */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { + struct task_struct *t = rnp->boost_kthread_task; + unsigned long mask = rnp->qsmaskinit; cpumask_var_t cm; int cpu; - unsigned long mask = rnp->qsmaskinit; - if (rnp->node_kthread_task == NULL) + if (!t) return; - if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - cpumask_clear(cm); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); @@ -1759,62 +1544,35 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) cpumask_clear_cpu(cpu, cm); WARN_ON_ONCE(cpumask_weight(cm) == 0); } - set_cpus_allowed_ptr(rnp->node_kthread_task, cm); - rcu_boost_kthread_setaffinity(rnp, cm); + set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } -/* - * Spawn a per-rcu_node kthread, setting priority and affinity. - * Called during boot before online/offline can happen, or, if - * during runtime, with the main CPU-hotplug locks held. So only - * one of these can be executing at a time. - */ -static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) -{ - unsigned long flags; - int rnp_index = rnp - &rsp->node[0]; - struct sched_param sp; - struct task_struct *t; - - if (!rcu_scheduler_fully_active || - rnp->qsmaskinit == 0) - return 0; - if (rnp->node_kthread_task == NULL) { - t = kthread_create(rcu_node_kthread, (void *)rnp, - "rcun/%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->node_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = 99; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - } - return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); -} +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_cpu_kthread_task, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; /* * Spawn all kthreads -- called as soon as the scheduler is running. */ static int __init rcu_spawn_kthreads(void) { - int cpu; struct rcu_node *rnp; + int cpu; rcu_scheduler_fully_active = 1; - for_each_possible_cpu(cpu) { + for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; - if (cpu_online(cpu)) - (void)rcu_spawn_one_cpu_kthread(cpu); - } + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); rnp = rcu_get_root(rcu_state); - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); if (NUM_RCU_NODES > 1) { rcu_for_each_leaf_node(rcu_state, rnp) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); } return 0; } @@ -1826,11 +1584,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) struct rcu_node *rnp = rdp->mynode; /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ - if (rcu_scheduler_fully_active) { - (void)rcu_spawn_one_cpu_kthread(cpu); - if (rnp->node_kthread_task == NULL) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - } + if (rcu_scheduler_fully_active) + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1854,19 +1609,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } -#ifdef CONFIG_HOTPLUG_CPU - -static void rcu_stop_cpu_kthread(int cpu) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ -} - -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d4bc16ddd1d4..6b4c76ba3529 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -83,11 +83,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->nxttail[RCU_WAIT_TAIL]], ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); #ifdef CONFIG_RCU_BOOST - seq_printf(m, " kt=%d/%c/%d ktl=%x", + seq_printf(m, " kt=%d/%c ktl=%x", per_cpu(rcu_cpu_has_work, rdp->cpu), convert_kthread_status(per_cpu(rcu_cpu_kthread_status, rdp->cpu)), - per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); #endif /* #ifdef CONFIG_RCU_BOOST */ seq_printf(m, " b=%ld", rdp->blimit); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c812a63aa783..ad9c3c3fb04d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1109,6 +1109,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); } @@ -1713,6 +1715,9 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + p->se.avg.decay_count = 0; +#endif #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6f79596e0ea9..b9d54d0d7bb0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec) static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { struct sched_entity *se = tg->se[cpu]; - if (!se) - return; #define P(F) \ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + if (!se) { + struct sched_avg *avg = &cpu_rq(cpu)->avg; + P(avg->runnable_avg_sum); + P(avg->runnable_avg_period); + return; + } + + PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); @@ -85,6 +91,13 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->statistics.wait_count); #endif P(se->load.weight); +#ifdef CONFIG_SMP + P(se->avg.runnable_avg_sum); + P(se->avg.runnable_avg_period); + P(se->avg.usage_avg_sum); + P(se->avg.load_avg_contrib); + P(se->avg.decay_count); +#endif #undef PN #undef P } @@ -206,14 +219,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", - SPLIT_NS(cfs_rq->load_avg)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", - SPLIT_NS(cfs_rq->load_period)); - SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", - cfs_rq->load_contribution); - SEQ_printf(m, " .%-30s: %d\n", "load_tg", - atomic_read(&cfs_rq->tg->load_weight)); + SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", + cfs_rq->runnable_load_avg); + SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", + cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", + atomic64_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", + cfs_rq->tg_load_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", + cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", + atomic_read(&cfs_rq->tg->runnable_avg)); + SEQ_printf(m, " .%-30s: %d\n", "tg->usage_avg", + atomic_read(&cfs_rq->tg->usage_avg)); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c099cc6eebe3..f705a87ac7b5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -653,9 +653,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq); - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -675,10 +672,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED - cfs_rq->load_unacc_exec_time += delta_exec; -#endif } static void update_curr(struct cfs_rq *cfs_rq) @@ -801,72 +794,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* we need this in update_cfs_load and load-balance functions below */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); # ifdef CONFIG_SMP -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, - int global_update) -{ - struct task_group *tg = cfs_rq->tg; - long load_avg; - - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; - - if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; - } -} - -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ - u64 period = sysctl_sched_shares_window; - u64 now, delta; - unsigned long load = cfs_rq->load.weight; - - if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) - return; - - now = rq_of(cfs_rq)->clock_task; - delta = now - cfs_rq->load_stamp; - - /* truncate load history at 4 idle periods */ - if (cfs_rq->load_stamp > cfs_rq->load_last && - now - cfs_rq->load_last > 4 * period) { - cfs_rq->load_period = 0; - cfs_rq->load_avg = 0; - delta = period - 1; - } - - cfs_rq->load_stamp = now; - cfs_rq->load_unacc_exec_time = 0; - cfs_rq->load_period += delta; - if (load) { - cfs_rq->load_last = now; - cfs_rq->load_avg += delta * load; - } - - /* consider updating load contribution on each fold or truncate */ - if (global_update || cfs_rq->load_period > period - || !cfs_rq->load_period) - update_cfs_rq_load_contribution(cfs_rq, global_update); - - while (cfs_rq->load_period > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (cfs_rq->load_period)); - cfs_rq->load_period /= 2; - cfs_rq->load_avg /= 2; - } - - if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) - list_del_leaf_cfs_rq(cfs_rq); -} - static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) { long tg_weight; @@ -876,8 +804,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) * to gain a more accurate current total weight. See * update_cfs_rq_load_contribution(). */ - tg_weight = atomic_read(&tg->load_weight); - tg_weight -= cfs_rq->load_contribution; + tg_weight = atomic64_read(&tg->load_avg); + tg_weight -= cfs_rq->tg_load_contrib; tg_weight += cfs_rq->load.weight; return tg_weight; @@ -901,27 +829,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return shares; } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); - } -} # else /* CONFIG_SMP */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} # endif /* CONFIG_SMP */ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) @@ -939,6 +851,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; @@ -958,18 +872,491 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } +#endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 46742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 516 /* number of full periods it takes to produce max */ + +/* Precomputed fixed inverse multiplies for multiplication by y^n */ +static const u32 runnable_avg_yN_inv[] = { + 0xffffffff, 0xfa83b2db, 0xf5257d15, 0xefe4b99b, 0xeac0c6e7, 0xe5b906e7, + 0xe0ccdeec, 0xdbfbb797, 0xd744fcca, 0xd2a81d91, 0xce248c15, 0xc9b9bd86, + 0xc5672a11, 0xc12c4cca, 0xbd08a39f, 0xb8fbaf47, 0xb504f333, 0xb123f581, + 0xad583eea, 0xa9a15ab4, 0xa5fed6a9, 0xa2704303, 0x9ef53260, 0x9b8d39b9, + 0x9837f051, 0x94f4efa8, 0x91c3d373, 0x8ea4398b, 0x8b95c1e3, 0x88980e80, + 0x85aac367, 0x82cd8698, +}; + +/* Precomputed \Sum y^k { 1<=k<=n } */ +static const u32 runnable_avg_yN_sum[] = { + 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, + 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, + 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, +}; + +/* + * Approximate: + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) + */ +static __always_inline u64 decay_load(u64 val, u64 n) +{ + int local_n; + if (!n) + return val; + else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + return 0; + + /* will be 32 bits if that's desirable */ + local_n = n; + + /* + * As y^PERIOD = 1/2, we can combine + * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) + * With a look-up table which covers k^n (n<PERIOD) + * + * To achieve constant time decay_load. + */ + if (unlikely(local_n >= LOAD_AVG_PERIOD)) { + val >>= local_n / LOAD_AVG_PERIOD; + n %= LOAD_AVG_PERIOD; + } + + val *= runnable_avg_yN_inv[local_n]; + return SRR(val, 32); +} + +/* + * For updates fully spanning n periods, the contribution to runnable + * average will be: \Sum 1024*y^n + * + * We can compute this reasonably efficiently by combining: + * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} + */ +static u32 __compute_runnable_contrib(int n) { + u32 contrib = 0; + + if (likely(n <= LOAD_AVG_PERIOD)) + return runnable_avg_yN_sum[n]; + else if (unlikely(n >= LOAD_AVG_MAX_N)) + return LOAD_AVG_MAX; + + /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ + do { + contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ + contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; + + n -= LOAD_AVG_PERIOD; + } while (n > LOAD_AVG_PERIOD); + + contrib = decay_load(contrib, n); + return contrib + runnable_avg_yN_sum[n]; } -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +/* We can represent the historical contribution to runnable average as the + * coefficients of a geometric series. To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + * p0 p1 p1 + * (now) (~1ms ago) (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + * y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1] + */ +static __always_inline int __update_entity_runnable_avg(u64 now, + struct sched_avg *sa, + int runnable, + int running) { + u64 delta, periods; + u32 runnable_contrib; + int delta_w, decayed = 0; + + delta = now - sa->last_runnable_update; + /* + * This should only happen when time goes backwards, which it + * unfortunately does during sched clock init when we swap over to TSC. + */ + if ((s64)delta < 0) { + sa->last_runnable_update = now; + return 0; + } + + /* + * Use 1024ns as the unit of measurement since it's a reasonable + * approximation of 1us and fast to compute. + */ + delta >>= 10; + if (!delta) + return 0; + sa->last_runnable_update = now; + + /* delta_w is the amount already accumulated against our next period */ + delta_w = sa->runnable_avg_period % 1024; + if (delta + delta_w >= 1024) { + /* period roll-over */ + decayed = 1; + + /* + * Now that we know we're crossing a period boundary, figure + * out how much from delta we need to complete the current + * period and accrue it. + */ + delta_w = 1024 - delta_w; + if (runnable) + sa->runnable_avg_sum += delta_w; + if (running) + sa->usage_avg_sum += delta_w; + sa->runnable_avg_period += delta_w; + + delta -= delta_w; + + /* Figure out how many additional periods this update spans */ + periods = delta / 1024; + delta %= 1024; + + sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, + periods + 1); + sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + periods + 1); + sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1); + + /* Efficiently calculate \sum (1..n_period) 1024*y^i */ + runnable_contrib = __compute_runnable_contrib(periods); + if (runnable) + sa->runnable_avg_sum += runnable_contrib; + if (running) + sa->usage_avg_sum += runnable_contrib; + sa->runnable_avg_period += runnable_contrib; + } + + /* Remainder of delta accrued against u_0` */ + if (runnable) + sa->runnable_avg_sum += delta; + if (running) + sa->usage_avg_sum += delta; + sa->runnable_avg_period += delta; + + return decayed; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ + +/* Synchronize an entity's decay with its parentin cfs_rq.*/ +static inline u64 __synchronize_entity_decay(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 decays = atomic64_read(&cfs_rq->decay_counter); + + decays -= se->avg.decay_count; + if (!decays) + return 0; + + se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.decay_count += decays; + + return decays; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) +{ + struct task_group *tg = cfs_rq->tg; + s64 tg_contrib; + + tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; + tg_contrib -= cfs_rq->tg_load_contrib; + + if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { + atomic64_add(tg_contrib, &tg->load_avg); + cfs_rq->tg_load_contrib += tg_contrib; + } +} + +/* + * Aggregate cfs_rq runnable averages into an equivalent task_group + * representation for computing load contributions. + */ +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) +{ + struct task_group *tg = cfs_rq->tg; + long contrib, usage_contrib; + + contrib = div_u64(sa->runnable_avg_sum << 12, + sa->runnable_avg_period + 1); + contrib -= cfs_rq->tg_runnable_contrib; + + usage_contrib = div_u64(sa->usage_avg_sum << 12, + sa->runnable_avg_period + 1); + usage_contrib -= cfs_rq->tg_usage_contrib; + + /* + * contrib/usage at this point represent deltas, only update if they + * are substantive. + */ + if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) || + (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) { + atomic_add(contrib, &tg->runnable_avg); + cfs_rq->tg_runnable_contrib += contrib; + + atomic_add(usage_contrib, &tg->usage_avg); + cfs_rq->tg_usage_contrib += usage_contrib; + } +} + +static inline void __update_group_entity_contrib(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + struct task_group *tg = cfs_rq->tg; + int runnable_avg; + + u64 contrib; + + contrib = cfs_rq->tg_load_contrib * tg->shares; + se->avg.load_avg_contrib = div64_u64(contrib, + atomic64_read(&tg->load_avg) + 1); + + /* + * Unlike a task-entity, a group entity may be using >=1 cpu globally. + * However, in the case that it's using <1 cpu we need to form a + * correction term so that we contribute the same load as a task of + * equal weight. (Global runnable time is taken as a fraction over + * 2^12.) + */ + runnable_avg = atomic_read(&tg->runnable_avg); + if (runnable_avg < (1<<12)) { + se->avg.load_avg_contrib *= runnable_avg; + se->avg.load_avg_contrib /= (1<<12); + } +} +#else +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) {} +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) {} +static inline void __update_group_entity_contrib(struct sched_entity *se) {} +#endif + +static inline void __update_task_entity_contrib(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_contrib = scale_load(contrib); + trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib); + contrib = se->avg.runnable_avg_sum * scale_load_down(1024); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_ratio = scale_load(contrib); + trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio); + contrib = se->avg.usage_avg_sum * scale_load_down(1024); + contrib /= (se->avg.runnable_avg_period + 1); + trace_sched_task_usage_ratio(task_of(se), scale_load(contrib)); +} + +/* Compute the current contribution to load_avg by se, return any delta */ +static long __update_entity_load_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.load_avg_contrib; + + if (entity_is_task(se)) { + __update_task_entity_contrib(se); + } else { + __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); + __update_group_entity_contrib(se); + } + + return se->avg.load_avg_contrib - old_contrib; +} + +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, + long load_contrib) +{ + if (likely(load_contrib < cfs_rq->blocked_load_avg)) + cfs_rq->blocked_load_avg -= load_contrib; + else + cfs_rq->blocked_load_avg = 0; +} + +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); + +/* Update a sched_entity's runnable average */ +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + long contrib_delta; + u64 now; + + /* + * For a group entity we need to use their owned cfs_rq_clock_task() in + * case they are the parent of a throttled hierarchy. + */ + if (entity_is_task(se)) + now = cfs_rq_clock_task(cfs_rq); + else + now = cfs_rq_clock_task(group_cfs_rq(se)); + + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, + cfs_rq->curr == se)) + return; + + contrib_delta = __update_entity_load_avg_contrib(se); + + if (!update_cfs_rq) + return; + + if (se->on_rq) + cfs_rq->runnable_load_avg += contrib_delta; + else + subtract_blocked_load_contrib(cfs_rq, -contrib_delta); +} + +/* + * Decay the load contributed by all blocked children and account this so that + * they their contribution may appropriately discounted when they wake up. + */ +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +{ + u64 now = cfs_rq_clock_task(cfs_rq) >> 20; + u64 decays; + + decays = now - cfs_rq->last_decay; + if (!decays && !force_update) + return; + + if (atomic64_read(&cfs_rq->removed_load)) { + u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); + subtract_blocked_load_contrib(cfs_rq, removed_load); + } + + if (decays) { + cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, + decays); + atomic64_add(decays, &cfs_rq->decay_counter); + cfs_rq->last_decay = now; + } + + __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); + update_cfs_shares(cfs_rq); +} + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + u32 contrib; + __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable, + runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); + contrib = rq->avg.runnable_avg_sum * scale_load_down(1024); + contrib /= (rq->avg.runnable_avg_period + 1); + trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib)); + trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg); +} + +/* Add the load generated by se into cfs_rq's child load-average */ +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int wakeup) +{ + /* + * We track migrations using entity decay_count <= 0, on a wake-up + * migration we use a negative decay count to track the remote decays + * accumulated while sleeping. + */ + if (unlikely(se->avg.decay_count <= 0)) { + se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + if (se->avg.decay_count) { + /* + * In a wake-up migration we have to approximate the + * time sleeping. This is because we can't synchronize + * clock_task between the two cpus, and it is not + * guaranteed to be read-safe. Instead, we can + * approximate this using our carried decays, which are + * explicitly atomically readable. + */ + se->avg.last_runnable_update -= (-se->avg.decay_count) + << 20; + update_entity_load_avg(se, 0); + } + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + wakeup = 0; + } else { + __synchronize_entity_decay(se); + } + + /* migrated tasks did not contribute to our blocked load */ + if (wakeup) { + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); + update_entity_load_avg(se, 0); + } + + cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !wakeup); +} + +/* + * Remove se's load from this cfs_rq child load-average, if the entity is + * transitioning to a blocked state we track its projected decay using + * blocked_load_avg. + */ +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int sleep) +{ + update_entity_load_avg(se, 1); + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !sleep); + + cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + if (sleep) { + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + } else { + se->avg.decay_count = 0; + } +} +#else +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int wakeup) {} +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int sleep) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, + int force_update) {} +#endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -1096,9 +1483,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq, 0); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -1190,9 +1576,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); - se->on_rq = 0; - update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); /* * Normalize the entity after updating the min_vruntime because the @@ -1206,7 +1591,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + se->on_rq = 0; } /* @@ -1261,6 +1646,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -1340,6 +1726,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); + /* in !on_rq case, update occurred at dequeue */ + update_entity_load_avg(prev, 1); } cfs_rq->curr = NULL; } @@ -1353,9 +1741,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) update_curr(cfs_rq); /* - * Update share accounting for long-running entities. + * Ensure that runnable average is periodically updated. */ - update_entity_shares_tick(cfs_rq); + update_entity_load_avg(curr, 1); + update_cfs_rq_blocked_load(cfs_rq, 1); #ifdef CONFIG_SCHED_HRTICK /* @@ -1448,6 +1837,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq->throttle_count)) + return cfs_rq->throttled_clock_task; + + return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; +} + /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -1592,14 +1990,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; #ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { - u64 delta = rq->clock_task - cfs_rq->load_stamp; - - /* leaving throttled state, advance shares averaging windows */ - cfs_rq->load_stamp += delta; - cfs_rq->load_last += delta; - - /* update entity weight now that we are on_rq again */ - update_cfs_shares(cfs_rq); + /* adjust cfs_rq_clock_task() */ + cfs_rq->throttled_clock_task_time += rq->clock_task - + cfs_rq->throttled_clock_task; } #endif @@ -1611,9 +2004,9 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - /* group is entering throttled state, record last load */ + /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) - update_cfs_load(cfs_rq, 0); + cfs_rq->throttled_clock_task = rq->clock_task; cfs_rq->throttle_count++; return 0; @@ -1628,7 +2021,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* account load preceding throttle */ + /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); @@ -1652,7 +2045,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rq->nr_running -= task_delta; cfs_rq->throttled = 1; - cfs_rq->throttled_timestamp = rq->clock; + cfs_rq->throttled_clock = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); raw_spin_unlock(&cfs_b->lock); @@ -1670,10 +2063,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 0; raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; + cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); - cfs_rq->throttled_timestamp = 0; update_rq_clock(rq); /* update hierarchical throttle state */ @@ -2073,8 +2465,13 @@ void unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + return rq_of(cfs_rq)->clock_task; +} + +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -2207,12 +2604,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); + update_cfs_rq_blocked_load(cfs_rq, 0); } - if (!se) + if (!se) { + update_rq_runnable_avg(rq, rq->nr_running); inc_nr_running(rq); + } hrtick_update(rq); } @@ -2266,12 +2665,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); + update_cfs_rq_blocked_load(cfs_rq, 0); } - if (!se) + if (!se) { dec_nr_running(rq); + update_rq_runnable_avg(rq, 1); + } hrtick_update(rq); } @@ -2681,6 +3082,73 @@ done: return target; } +#ifdef CONFIG_SCHED_HMP +/* Heterogenous multiprocessor (HMP) optimizations + * We need to know which cpus that are fast and slow. Ideally, this + * information would be provided by the platform in some way. For now it is + * set in the kernel config. */ +static struct cpumask hmp_fast_cpu_mask; +static struct cpumask hmp_slow_cpu_mask; + +/* Setup fast and slow cpumasks. + * This should be setup based on device tree somehow. */ +static int __init hmp_cpu_mask_setup(void) +{ + char buf[64]; + + cpumask_clear(&hmp_fast_cpu_mask); + cpumask_clear(&hmp_slow_cpu_mask); + + if (cpulist_parse(CONFIG_HMP_FAST_CPU_MASK, &hmp_fast_cpu_mask)) + WARN(1, "Failed to parse HMP fast cpu mask!\n"); + if (cpulist_parse(CONFIG_HMP_SLOW_CPU_MASK, &hmp_slow_cpu_mask)) + WARN(1, "Failed to parse HMP slow cpu mask!\n"); + + printk(KERN_DEBUG "Initializing HMP scheduler:\n"); + cpulist_scnprintf(buf, 64, &hmp_fast_cpu_mask); + printk(KERN_DEBUG " fast cpus: %s\n", buf); + cpulist_scnprintf(buf, 64, &hmp_slow_cpu_mask); + printk(KERN_DEBUG " slow cpus: %s\n", buf); + + return 1; +} +early_initcall(hmp_cpu_mask_setup); + +/* Migration thresholds should be in the range [0..1023] + * hmp_up_threshold: min. load required for migrating tasks to a fast cpu + * hmp_down_threshold: max. load allowed for tasks migrating to a slow cpu + * hmp_up_prio: min. task prio for tasks migrating to faster cpus */ +unsigned int hmp_up_threshold = 512; +unsigned int hmp_down_threshold = 256; +unsigned int hmp_up_prio = 125; +static unsigned int hmp_up_migration(int cpu, struct sched_entity *se); +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se); + +static unsigned int hmp_cpu_is_fast(int cpu) +{ + return cpumask_test_cpu(cpu, &hmp_fast_cpu_mask); +} + +static unsigned int hmp_cpu_is_slow(int cpu) +{ + return cpumask_test_cpu(cpu, &hmp_slow_cpu_mask); +} + +/* Select target cpu for HMP migration to fast cpu + * returns target >= nr_cpu_ids if no fast cpus in affinity mask */ +static inline unsigned int hmp_select_fast_cpu(struct task_struct *tsk) +{ + return cpumask_any_and(&hmp_fast_cpu_mask, tsk_cpus_allowed(tsk)); +} + +/* Select target cpu for HMP migration to slow cpu + * returns target >= nr_cpu_ids if no slow cpus in affinity mask */ +static inline unsigned int hmp_select_slow_cpu(struct task_struct *tsk) +{ + return cpumask_any_and(&hmp_slow_cpu_mask, tsk_cpus_allowed(tsk)); +} +#endif /* CONFIG_SCHED_HMP */ + /* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and @@ -2807,8 +3275,52 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) unlock: rcu_read_unlock(); +#ifdef CONFIG_SCHED_HMP + if (hmp_up_migration(new_cpu, &p->se)) { + cpu = hmp_select_fast_cpu(p); + if (cpu < nr_cpu_ids) + return cpu; + } + if (hmp_down_migration(new_cpu, &p->se)) { + cpu = hmp_select_slow_cpu(p); + if (cpu < nr_cpu_ids) + return cpu; + } +#endif + return new_cpu; } + +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * cfs_rq_of(p) references at time of call are still valid and identify the + * previous cpu. However, the caller only guarantees p->pi_lock is held; no + * other assumptions, including rq->lock state, should be made. + * Caller guarantees p->pi_lock held, but nothing else. + */ +static void +migrate_task_rq_fair(struct task_struct *p, int next_cpu) { + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Load tracking: accumulate removed load so that it can be processed + * when we next update owning cfs_rq under rq->lock. Tasks contribute + * to blocked load iff they have a non-zero decay-count. + */ + if (se->avg.decay_count) { + se->avg.decay_count = -__synchronize_entity_decay(se); + atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); + } +} +#endif + #endif /* CONFIG_SMP */ static unsigned long @@ -3300,51 +3812,65 @@ next: /* * update tg->load_weight by folding this cpu's load_avg */ -static int update_shares_cpu(struct task_group *tg, int cpu) +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) { - struct cfs_rq *cfs_rq; - unsigned long flags; - struct rq *rq; - - if (!tg->se[cpu]) - return 0; - - rq = cpu_rq(cpu); - cfs_rq = tg->cfs_rq[cpu]; - - raw_spin_lock_irqsave(&rq->lock, flags); - - update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); + struct sched_entity *se = tg->se[cpu]; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - /* - * We need to update shares after updating tg->load_weight in - * order to adjust the weight of groups with long running tasks. - */ - update_cfs_shares(cfs_rq); + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + return; - raw_spin_unlock_irqrestore(&rq->lock, flags); + update_cfs_rq_blocked_load(cfs_rq, 1); + if (se) + update_entity_load_avg(se, 1); + else + update_rq_runnable_avg(rq_of(cfs_rq), 1); - return 0; + if (se) { + /* + * We can pivot on the runnable average decaying to zero for + * list removal since the parent average will always be >= + * child. + */ + if (se->avg.runnable_avg_sum) + update_cfs_shares(cfs_rq); + else + list_del_leaf_cfs_rq(cfs_rq); + } } -static void update_shares(int cpu) +static void update_blocked_averages(int cpu) { - struct cfs_rq *cfs_rq; struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq; + + unsigned long flags; + int num_updates = 0; rcu_read_lock(); + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - continue; + __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); - update_shares_cpu(cfs_rq->tg, cpu); + /* + * Periodically release the lock so that a cfs_rq with many + * children cannot hold it for an arbitrary period of time. + */ + if (num_updates++ % 20 == 0) { + raw_spin_unlock_irqrestore(&rq->lock, flags); + cpu_relax(); + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + } } + + raw_spin_unlock_irqrestore(&rq->lock, flags); rcu_read_unlock(); } @@ -3389,7 +3915,7 @@ static unsigned long task_h_load(struct task_struct *p) return load; } #else -static inline void update_shares(int cpu) +static inline void update_blocked_averages(int cpu) { } @@ -4409,12 +4935,14 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; + update_rq_runnable_avg(this_rq, 1); + /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ raw_spin_unlock(&this_rq->lock); - update_shares(this_cpu); + update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4674,7 +5202,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; - update_shares(cpu); + update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { @@ -4842,6 +5370,223 @@ need_kick: static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } #endif +#ifdef CONFIG_SCHED_HMP +/* Check if task should migrate to a faster core */ +static unsigned int hmp_up_migration(int cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + if (p->prio < hmp_up_prio && p->prio > 100 + && hmp_cpu_is_slow(cpu) + && se->avg.load_avg_ratio > hmp_up_threshold) { + return 1; + } + return 0; +} + +/* Check if task should migrate to a slower core */ +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + if (p->prio >= hmp_up_prio || (hmp_cpu_is_fast(cpu) + && se->avg.load_avg_ratio < hmp_down_threshold)) { + return 1; + } + return 0; +} + +/* + * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + * Ideally this function should be merged with can_migrate_task() to avoid + * redundant code. + */ +static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env) +{ + int tsk_cache_hot = 0; + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed + */ + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + return 0; + } + env->flags &= ~LBF_ALL_PINNED; + + if (task_running(env->src_rq, p)) { + schedstat_inc(p, se.statistics.nr_failed_migrations_running); + return 0; + } + + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. + */ + + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); + if (!tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } + + return 1; +} + +/* + * move_specific_task tries to move a specific task. + * Returns 1 if successful and 0 otherwise. + * Called with both runqueues locked. + */ +static int move_specific_task(struct lb_env *env, struct task_struct *pm) +{ + struct task_struct *p, *n; + + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, + env->dst_cpu)) + continue; + + if (!hmp_can_migrate_task(p, env)) + continue; + /* Check if we found the right task */ + if (p != pm) + continue; + + move_task(p, env); + /* + * Right now, this is only the third place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; + } + return 0; +} + +/* + * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to + * migrate a specific task from one runqueue to another. + * hmp_force_up_migration uses this to push a currently running task + * off a runqueue. + * Based on active_load_balance_stop_cpu and can potentially be merged. + */ +static int hmp_active_task_migration_cpu_stop(void *data) +{ + struct rq *busiest_rq = data; + struct task_struct *p = busiest_rq->migrate_task; + int busiest_cpu = cpu_of(busiest_rq); + int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); + struct sched_domain *sd; + + raw_spin_lock_irq(&busiest_rq->lock); + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) { + goto out_unlock; + } + /* Is there any task to move? */ + if (busiest_rq->nr_running <= 1) + goto out_unlock; + /* Task has migrated meanwhile, abort forced migration */ + if (task_rq(p) != busiest_rq) + goto out_unlock; + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + + /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); + for_each_domain(target_cpu, sd) { + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + }; + + schedstat_inc(sd, alb_count); + + if (move_specific_task(&env, p)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + rcu_read_unlock(); + double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; +} + +static DEFINE_SPINLOCK(hmp_force_migration); + +/* hmp_force_up_migration checks runqueues for tasks that need to + * be actively migrated to a faster cpu. */ +static void hmp_force_up_migration(int this_cpu) +{ + int i; + struct sched_entity *curr; + struct rq *target; + unsigned long flags; + unsigned int force; + struct task_struct *p; + + if (!spin_trylock(&hmp_force_migration)) + return; + for_each_cpu(i, &hmp_slow_cpu_mask) { + force = 0; + target = cpu_rq(i); + raw_spin_lock_irqsave(&target->lock, flags); + curr = target->cfs.curr; + if (!curr || !entity_is_task(curr)) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + p = task_of(curr); + if (hmp_up_migration(i, curr)) { + if (!target->active_balance) { + target->active_balance = 1; + target->push_cpu = hmp_select_fast_cpu(p); + target->migrate_task = p; + force = 1; + trace_sched_hmp_migrate(p, 1); + } + } + raw_spin_unlock_irqrestore(&target->lock, flags); + if (force) + stop_one_cpu_nowait(cpu_of(target), + hmp_active_task_migration_cpu_stop, + target, &target->active_balance_work); + } + spin_unlock(&hmp_force_migration); +} +#else +static void hmp_force_up_migration(int this_cpu) { } +#endif /* CONFIG_SCHED_HMP */ + /* * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). @@ -4853,6 +5598,8 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; + hmp_force_up_migration(this_cpu); + rebalance_domains(this_cpu, idle); /* @@ -4907,6 +5654,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } + + update_rq_runnable_avg(rq, 1); } /* @@ -4999,6 +5748,21 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) place_entity(cfs_rq, se, 0); se->vruntime -= cfs_rq->min_vruntime; } + +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + /* + * Remove our load from contribution when we leave sched_fair + * and ensure we don't carry in an old decay_count if we + * switch back. + */ + if (p->se.avg.decay_count) { + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + __synchronize_entity_decay(&p->se); + subtract_blocked_load_contrib(cfs_rq, + p->se.avg.load_avg_contrib); + p->se.avg.decay_count = 0; + } +#endif } /* @@ -5045,11 +5809,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + atomic64_set(&cfs_rq->decay_counter, 1); + atomic64_set(&cfs_rq->removed_load, 0); +#endif } #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct cfs_rq *cfs_rq; /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -5081,8 +5850,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) if (!on_rq) p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; set_task_rq(p, task_cpu(p)); - if (!on_rq) - p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; + if (!on_rq) { + cfs_rq = cfs_rq_of(&p->se); + p->se.vruntime += cfs_rq->min_vruntime; +#ifdef CONFIG_SMP + /* + * set_task_rq will() have removed our previous contribution, + * but we must synchronize explicitly against further decay + * here. + */ + p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; +#endif + } } void free_fair_sched_group(struct task_group *tg) @@ -5167,10 +5947,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, cfs_rq->tg = tg; cfs_rq->rq = rq; -#ifdef CONFIG_SMP - /* allow initial update_cfs_load() to truncate */ - cfs_rq->load_stamp = 1; -#endif init_cfs_rq_runtime(cfs_rq); tg->cfs_rq[cpu] = cfs_rq; @@ -5217,8 +5993,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); - for_each_sched_entity(se) + for_each_sched_entity(se) { update_cfs_shares(group_cfs_rq(se)); + /* update contribution to parent */ + update_entity_load_avg(se, 1); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -5272,7 +6051,9 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, - +#ifdef CONFIG_FAIR_GROUP_SCHED + .migrate_task_rq = migrate_task_rq_fair, +#endif .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index de00a486c5c6..d98ae909e321 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -42,7 +42,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) /* * Use arch dependent cpu power functions */ -SCHED_FEAT(ARCH_POWER, false) +SCHED_FEAT(ARCH_POWER, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55844f24435a..0dc9bd108f57 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -112,6 +112,8 @@ struct task_group { unsigned long shares; atomic_t load_weight; + atomic64_t load_avg; + atomic_t runnable_avg, usage_avg; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -222,22 +224,29 @@ struct cfs_rq { unsigned int nr_spread_over; #endif +#ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ #ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - /* - * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. + * CFS Load tracking + * Under CFS, load is tracked on a per-entity basis and aggregated up. + * This allows for the description of both thread and group usage (in + * the FAIR_GROUP_SCHED case). */ - int on_list; - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ + u64 runnable_load_avg, blocked_load_avg; + atomic64_t decay_counter, removed_load; + u64 last_decay; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +/* These always depend on CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED + u32 tg_runnable_contrib, tg_usage_contrib; + u64 tg_load_contrib; +#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP /* * h_load = weight * f(tg) * @@ -245,26 +254,30 @@ struct cfs_rq { * this group. */ unsigned long h_load; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ /* - * Maintaining per-cpu shares distribution for group scheduling + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) * - * load_stamp is the last time we updated the load average - * load_last is the last time we updated the load average and saw load - * load_unacc_exec_time is currently unaccounted execution time + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. */ - u64 load_avg; - u64 load_period; - u64 load_stamp, load_last, load_unacc_exec_time; + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ - unsigned long load_contribution; -#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; s64 runtime_remaining; - u64 throttled_timestamp; + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ @@ -408,6 +421,7 @@ struct rq { int active_balance; int push_cpu; struct cpu_stop_work active_balance_work; + struct task_struct *migrate_task; /* cpu of this runqueue: */ int cpu; int online; @@ -463,6 +477,8 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif + + struct sched_avg avg; }; static inline int cpu_of(struct rq *rq) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 98f60c5caa1b..d542573f55cc 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -1,11 +1,17 @@ /* * Common SMP CPU bringup/teardown functions */ +#include <linux/cpu.h> #include <linux/err.h> #include <linux/smp.h> #include <linux/init.h> +#include <linux/list.h> +#include <linux/slab.h> #include <linux/sched.h> +#include <linux/export.h> #include <linux/percpu.h> +#include <linux/kthread.h> +#include <linux/smpboot.h> #include "smpboot.h" @@ -65,3 +71,203 @@ void __init idle_threads_init(void) } } #endif + +static LIST_HEAD(hotplug_threads); +static DEFINE_MUTEX(smpboot_threads_lock); + +struct smpboot_thread_data { + unsigned int cpu; + unsigned int status; + struct smp_hotplug_thread *ht; +}; + +enum { + HP_THREAD_NONE = 0, + HP_THREAD_ACTIVE, + HP_THREAD_PARKED, +}; + +/** + * smpboot_thread_check_parking - percpu hotplug thread loop function + * @td: thread data pointer + * + * Checks for thread stop and park conditions. Calls the necessary + * setup, cleanup, park and unpark functions for the registered + * thread. + * + * Returns 1 when the thread should exit, 0 otherwise. + */ +int smpboot_thread_check_parking(struct smpboot_thread_data *td) +{ + struct smp_hotplug_thread *ht = td->ht; + +again: + if (kthread_should_stop()) { + if (ht->cleanup) + ht->cleanup(td->cpu, cpu_online(td->cpu)); + kfree(td); + return 1; + } + + if (kthread_should_park()) { + if (ht->park && td->status == HP_THREAD_ACTIVE) { + BUG_ON(td->cpu != smp_processor_id()); + ht->park(td->cpu); + td->status = HP_THREAD_PARKED; + } + kthread_parkme(); + /* We might have been woken for stop */ + goto again; + } + + BUG_ON(td->cpu != smp_processor_id()); + + switch (td->status) { + case HP_THREAD_NONE: + if (ht->setup) + ht->setup(td->cpu); + td->status = HP_THREAD_ACTIVE; + break; + case HP_THREAD_PARKED: + if (ht->unpark) + ht->unpark(td->cpu); + td->status = HP_THREAD_ACTIVE; + break; + } + return 0; +} +EXPORT_SYMBOL_GPL(smpboot_thread_check_parking); + +static int +__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + struct smpboot_thread_data *td; + + if (tsk) + return 0; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu)); + if (!td) + return -ENOMEM; + td->cpu = cpu; + td->ht = ht; + + tsk = kthread_create_on_cpu(ht->thread_fn, td, cpu, ht->thread_comm); + if (IS_ERR(tsk)) + return PTR_ERR(tsk); + + get_task_struct(tsk); + *per_cpu_ptr(ht->store, cpu) = tsk; + return 0; +} + +int smpboot_create_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + int ret = 0; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry(cur, &hotplug_threads, list) { + ret = __smpboot_create_thread(cur, cpu); + if (ret) + break; + } + mutex_unlock(&smpboot_threads_lock); + return ret; +} + +static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + kthread_unpark(tsk); +} + +void smpboot_unpark_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry(cur, &hotplug_threads, list) + smpboot_unpark_thread(cur, cpu); + mutex_unlock(&smpboot_threads_lock); +} + +static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + if (tsk) + kthread_park(tsk); +} + +void smpboot_park_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry_reverse(cur, &hotplug_threads, list) + smpboot_park_thread(cur, cpu); + mutex_unlock(&smpboot_threads_lock); +} + +static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) +{ + unsigned int cpu; + + /* We need to destroy also the parked threads of offline cpus */ + for_each_possible_cpu(cpu) { + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + if (tsk) { + kthread_stop(tsk); + put_task_struct(tsk); + *per_cpu_ptr(ht->store, cpu) = NULL; + } + } +} + +/** + * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug + * @plug_thread: Hotplug thread descriptor + * + * Creates and starts the threads on all online cpus. + */ +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) +{ + unsigned int cpu; + int ret = 0; + + mutex_lock(&smpboot_threads_lock); + for_each_online_cpu(cpu) { + ret = __smpboot_create_thread(plug_thread, cpu); + if (ret) { + smpboot_destroy_threads(plug_thread); + goto out; + } + smpboot_unpark_thread(plug_thread, cpu); + } + list_add(&plug_thread->list, &hotplug_threads); +out: + mutex_unlock(&smpboot_threads_lock); + return ret; +} +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); + +/** + * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug + * @plug_thread: Hotplug thread descriptor + * + * Stops all threads on all possible cpus. + */ +void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) +{ + get_online_cpus(); + mutex_lock(&smpboot_threads_lock); + list_del(&plug_thread->list); + smpboot_destroy_threads(plug_thread); + mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 80c0acfb8472..9e26228edf66 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -15,4 +15,8 @@ static inline void idle_thread_set_boot_cpu(void) { } static inline void idle_threads_init(void) { } #endif +int smpboot_create_threads(unsigned int cpu); +void smpboot_park_threads(unsigned int cpu); +void smpboot_unpark_threads(unsigned int cpu); + #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 671f9594e368..501e724c0081 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -23,6 +23,7 @@ #include <linux/rcupdate.h> #include <linux/ftrace.h> #include <linux/smp.h> +#include <linux/smpboot.h> #include <linux/tick.h> #define CREATE_TRACE_POINTS @@ -733,24 +734,17 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -static int run_ksoftirqd(void * __bind_cpu) +static int run_ksoftirqd(void *td) { - set_current_state(TASK_INTERRUPTIBLE); - - while (!kthread_should_stop()) { + while (!smpboot_thread_check_parking(td)) { + set_current_state(TASK_INTERRUPTIBLE); preempt_disable(); - if (!local_softirq_pending()) { + if (!local_softirq_pending()) schedule_preempt_disabled(); - } __set_current_state(TASK_RUNNING); while (local_softirq_pending()) { - /* Preempt disable stops cpu going offline. - If already offline, we'll be on wrong CPU: - don't process */ - if (cpu_is_offline((long)__bind_cpu)) - goto wait_to_die; local_irq_disable(); if (local_softirq_pending()) __do_softirq(); @@ -758,23 +752,10 @@ static int run_ksoftirqd(void * __bind_cpu) sched_preempt_enable_no_resched(); cond_resched(); preempt_disable(); - rcu_note_context_switch((long)__bind_cpu); + rcu_note_context_switch(smp_processor_id()); } preempt_enable(); - set_current_state(TASK_INTERRUPTIBLE); } - __set_current_state(TASK_RUNNING); - return 0; - -wait_to_die: - preempt_enable(); - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); return 0; } @@ -841,50 +822,17 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - p = kthread_create_on_node(run_ksoftirqd, - hcpu, - cpu_to_node(hotcpu), - "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return notifier_from_errno(PTR_ERR(p)); - } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); - break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { - static const struct sched_param param = { - .sched_priority = MAX_RT_PRIO-1 - }; - - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_stop(p); + int hotcpu = (unsigned long)hcpu; + takeover_tasklets(hotcpu); break; } #endif /* CONFIG_HOTPLUG_CPU */ - } + } return NOTIFY_OK; } @@ -892,14 +840,18 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; +static struct smp_hotplug_thread softirq_threads = { + .store = &ksoftirqd, + .thread_fn = run_ksoftirqd, + .thread_comm = "ksoftirqd/%u", +}; + static __init int spawn_ksoftirqd(void) { - void *cpu = (void *)(long)smp_processor_id(); - int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - - BUG_ON(err != NOTIFY_OK); - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); + + BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); + return 0; } early_initcall(spawn_ksoftirqd); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4b1dfba70f7c..3ce8ea77c5e1 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -22,6 +22,7 @@ #include <linux/notifier.h> #include <linux/module.h> #include <linux/sysctl.h> +#include <linux/smpboot.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -29,6 +30,7 @@ int watchdog_enabled = 1; int __read_mostly watchdog_thresh = 10; +static int __read_mostly watchdog_disabled; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); @@ -256,6 +258,9 @@ static void watchdog_interrupt_count(void) static inline void watchdog_interrupt_count(void) { return; } #endif /* CONFIG_HARDLOCKUP_DETECTOR */ +static int watchdog_nmi_enable(unsigned int cpu); +static void watchdog_nmi_disable(unsigned int cpu); + /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -327,23 +332,46 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } - -/* - * The watchdog thread - touches the timestamp. - */ -static int watchdog(void *unused) +static void watchdog_enable(unsigned int cpu) { - struct sched_param param = { .sched_priority = 0 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - /* initialize timestamp */ - __touch_watchdog(); + /* Enable the perf event */ + watchdog_nmi_enable(cpu); /* kick off the timer for the hardlockup detector */ + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + /* done here because hrtimer_start can only pin to smp_processor_id() */ hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), HRTIMER_MODE_REL_PINNED); + /* initialize timestamp */ + __touch_watchdog(); +} + +static void watchdog_disable(unsigned int cpu) +{ + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + + hrtimer_cancel(hrtimer); + /* disable the perf event */ + watchdog_nmi_disable(cpu); +} + +/* + * The watchdog thread - touches the timestamp. + */ +static int watchdog(void *td) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + + if (!watchdog_enabled) + kthread_park(current); + + sched_setscheduler(current, SCHED_FIFO, ¶m); + set_current_state(TASK_INTERRUPTIBLE); /* * Run briefly (kicked by the hrtimer callback function) once every @@ -352,25 +380,14 @@ static int watchdog(void *unused) * 2*watchdog_thresh seconds then the debug-printout triggers in * watchdog_timer_fn(). */ - while (!kthread_should_stop()) { + while (!smpboot_thread_check_parking(td)) { + set_current_state(TASK_INTERRUPTIBLE); __touch_watchdog(); schedule(); - - if (kthread_should_stop()) - break; - - set_current_state(TASK_INTERRUPTIBLE); } - /* - * Drop the policy/priority elevation during thread exit to avoid a - * scheduling latency spike. - */ - __set_current_state(TASK_RUNNING); - sched_setscheduler(current, SCHED_NORMAL, ¶m); return 0; } - #ifdef CONFIG_HARDLOCKUP_DETECTOR /* * People like the simple clean cpu node info on boot. @@ -379,7 +396,7 @@ static int watchdog(void *unused) */ static unsigned long cpu0_err; -static int watchdog_nmi_enable(int cpu) +static int watchdog_nmi_enable(unsigned int cpu) { struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); @@ -433,7 +450,7 @@ out: return 0; } -static void watchdog_nmi_disable(int cpu) +static void watchdog_nmi_disable(unsigned int cpu) { struct perf_event *event = per_cpu(watchdog_ev, cpu); @@ -447,107 +464,35 @@ static void watchdog_nmi_disable(int cpu) return; } #else -static int watchdog_nmi_enable(int cpu) { return 0; } -static void watchdog_nmi_disable(int cpu) { return; } +static int watchdog_nmi_enable(unsigned int cpu) { return 0; } +static void watchdog_nmi_disable(unsigned int cpu) { return; } #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* prepare/enable/disable routines */ -static void watchdog_prepare_cpu(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); - - WARN_ON(per_cpu(softlockup_watchdog, cpu)); - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = watchdog_timer_fn; -} - -static int watchdog_enable(int cpu) -{ - struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err = 0; - - /* enable the perf event */ - err = watchdog_nmi_enable(cpu); - - /* Regardless of err above, fall through and start softlockup */ - - /* create the watchdog thread */ - if (!p) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); - if (IS_ERR(p)) { - pr_err("softlockup watchdog for %i failed\n", cpu); - if (!err) { - /* if hardlockup hasn't already set this */ - err = PTR_ERR(p); - /* and disable the perf event */ - watchdog_nmi_disable(cpu); - } - goto out; - } - sched_setscheduler(p, SCHED_FIFO, ¶m); - kthread_bind(p, cpu); - per_cpu(watchdog_touch_ts, cpu) = 0; - per_cpu(softlockup_watchdog, cpu) = p; - wake_up_process(p); - } - -out: - return err; -} - -static void watchdog_disable(int cpu) -{ - struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); - - /* - * cancel the timer first to stop incrementing the stats - * and waking up the kthread - */ - hrtimer_cancel(hrtimer); - - /* disable the perf event */ - watchdog_nmi_disable(cpu); - - /* stop the watchdog thread */ - if (p) { - per_cpu(softlockup_watchdog, cpu) = NULL; - kthread_stop(p); - } -} - /* sysctl functions */ #ifdef CONFIG_SYSCTL static void watchdog_enable_all_cpus(void) { - int cpu; - - watchdog_enabled = 0; - - for_each_online_cpu(cpu) - if (!watchdog_enable(cpu)) - /* if any cpu succeeds, watchdog is considered - enabled for the system */ - watchdog_enabled = 1; - - if (!watchdog_enabled) - pr_err("failed to be enabled on some cpus\n"); + unsigned int cpu; + if (watchdog_disabled) { + watchdog_disabled = 0; + for_each_online_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + } } static void watchdog_disable_all_cpus(void) { - int cpu; + unsigned int cpu; - for_each_online_cpu(cpu) - watchdog_disable(cpu); - - /* if all watchdogs are disabled, then they are disabled for the system */ - watchdog_enabled = 0; + if (!watchdog_disabled) { + watchdog_disabled = 1; + for_each_online_cpu(cpu) + kthread_park(per_cpu(softlockup_watchdog, cpu)); + } } - /* * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ @@ -557,73 +502,35 @@ int proc_dowatchdog(struct ctl_table *table, int write, { int ret; + if (watchdog_disabled < 0) + return -ENODEV; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) - goto out; + return ret; if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else watchdog_disable_all_cpus(); -out: return ret; } #endif /* CONFIG_SYSCTL */ - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int __cpuinit -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - watchdog_prepare_cpu(hotcpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - if (watchdog_enabled) - watchdog_enable(hotcpu); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - watchdog_disable(hotcpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - watchdog_disable(hotcpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - - /* - * hardlockup and softlockup are not important enough - * to block cpu bring up. Just always succeed and - * rely on printk output to flag problems. - */ - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback +static struct smp_hotplug_thread watchdog_threads = { + .store = &softlockup_watchdog, + .thread_fn = watchdog, + .thread_comm = "watchdog/%u", + .setup = watchdog_enable, + .park = watchdog_disable, + .unpark = watchdog_enable, }; void __init lockup_detector_init(void) { - void *cpu = (void *)(long)smp_processor_id(); - int err; - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - WARN_ON(notifier_to_errno(err)); - - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - return; + if (smpboot_register_percpu_thread(&watchdog_threads)) { + pr_err("Failed to create watchdog threads, disabled\n"); + watchdog_disabled = -ENODEV; + } } diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf new file mode 100644 index 000000000000..df35474eff10 --- /dev/null +++ b/linaro/configs/big-LITTLE-MP.conf @@ -0,0 +1,9 @@ +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_NO_HZ=y +CONFIG_SCHED_MC=y +CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y +CONFIG_SCHED_HMP=y +CONFIG_HMP_FAST_CPU_MASK="0-1" +CONFIG_HMP_SLOW_CPU_MASK="2-3" |