From d2641a5c3d5ecaa1078225e493c7fed821715a04 Mon Sep 17 00:00:00 2001
From: Ionela Voinescu
Date: Mon, 14 Dec 2020 12:38:20 +0000
Subject: cppc_cpufreq: use policy->cpu as driver of frequency setting

Considering only the currently supported coordination types (ANY, HW,
NONE), this change only makes a difference for the ANY type, when
policy->cpu is hotplugged out. In that case the new policy->cpu will be
different from ((struct cppc_cpudata *)policy->driver_data)->cpu.

While in this case the controls of *ANY* CPU could be used to drive
frequency changes, it's more consistent to use policy->cpu as the
leading CPU, as used in all other cppc_cpufreq functions.

Additionally, the debug prints in cppc_set_perf() would no longer
create confusion when referring to a CPU that is hotplugged out.

Signed-off-by: Ionela Voinescu
Acked-by: Viresh Kumar
Tested-by: Mian Yousaf Kaukab
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/cppc_cpufreq.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 7cc9bd8568de..2700fc71d4e8 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -150,6 +150,7 @@ static int cppc_cpufreq_set_target(struct cpufreq_policy *policy,
 				   unsigned int relation)
 {
 	struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu];
+	unsigned int cpu = policy->cpu;
 	struct cpufreq_freqs freqs;
 	u32 desired_perf;
 	int ret = 0;
@@ -164,12 +165,12 @@ static int cppc_cpufreq_set_target(struct cpufreq_policy *policy,
 	freqs.new = target_freq;
 
 	cpufreq_freq_transition_begin(policy, &freqs);
-	ret = cppc_set_perf(cpu_data->cpu, &cpu_data->perf_ctrls);
+	ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
 	cpufreq_freq_transition_end(policy, &freqs, ret != 0);
 
 	if (ret)
 		pr_debug("Failed to set target on CPU:%d. ret:%d\n",
-			 cpu_data->cpu, ret);
+			 cpu, ret);
 
 	return ret;
 }
--
cgit v1.2.3

From bf76bb208f2b653306f2fc8f9c2a22f9890702bd Mon Sep 17 00:00:00 2001
From: Ionela Voinescu
Date: Mon, 14 Dec 2020 12:38:21 +0000
Subject: cppc_cpufreq: clarify support for coordination types

The previous coordination type handling in the cppc_cpufreq init code
created some confusion: the comment mentioned "Support only SW_ANY for
now" while only the SW_ALL/ALL case resulted in a failure. The other
coordination types (HW_ALL/HW, NONE) were silently supported.

Clarify support for coordination types while describing in comments
the intended behavior.

Signed-off-by: Ionela Voinescu
Acked-by: Viresh Kumar
Tested-by: Mian Yousaf Kaukab
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/cppc_cpufreq.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 2700fc71d4e8..f15a44c8b6b7 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -244,7 +244,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu];
 	struct cppc_perf_caps *caps = &cpu_data->perf_caps;
 	unsigned int cpu = policy->cpu;
-	int ret = 0;
+	int i, ret = 0;
 
 	cpu_data->cpu = cpu;
 	ret = cppc_get_perf_caps(cpu, caps);
@@ -281,9 +281,13 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu);
 
 	policy->shared_type = cpu_data->shared_type;
 
-	if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
-		int i;
-
+	switch (policy->shared_type) {
+	case CPUFREQ_SHARED_TYPE_HW:
+	case CPUFREQ_SHARED_TYPE_NONE:
+		/* Nothing to be done - we'll have a policy for each CPU */
+		break;
+	case CPUFREQ_SHARED_TYPE_ANY:
+		/* All CPUs in the domain will share a policy */
 		cpumask_copy(policy->cpus, cpu_data->shared_cpu_map);
 
 		for_each_cpu(i, policy->cpus) {
@@ -293,9 +297,10 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 			memcpy(&all_cpu_data[i]->perf_caps, caps,
 			       sizeof(cpu_data->perf_caps));
 		}
-	} else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
-		/* Support only SW_ANY for now. */
-		pr_debug("Unsupported CPU co-ord type\n");
+		break;
+	default:
+		pr_debug("Unsupported CPU co-ord type: %d\n",
+			 policy->shared_type);
 		return -EFAULT;
 	}
--
cgit v1.2.3
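For reference, the shared_type values handled in the switch statement above
map the ACPI _PSD coordination types onto constants from
include/linux/cpufreq.h. A sketch of those definitions as of this kernel era
(the exact comments are paraphrased; check the header for the authoritative
text):

/* include/linux/cpufreq.h (sketch, assumed values) */
#define CPUFREQ_SHARED_TYPE_NONE (0) /* no coordination */
#define CPUFREQ_SHARED_TYPE_HW   (1) /* HW does the needed coordination */
#define CPUFREQ_SHARED_TYPE_ALL  (2) /* all dependent CPUs should set freq */
#define CPUFREQ_SHARED_TYPE_ANY  (3) /* freq can be set from any dependent CPU */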
From cfdc589f4b5f94bf1a975b4a67d8163d533f6e9b Mon Sep 17 00:00:00 2001
From: Ionela Voinescu
Date: Mon, 14 Dec 2020 12:38:22 +0000
Subject: cppc_cpufreq: expose information on frequency domains

Use the existing sysfs attribute "freqdomain_cpus" to expose
information to userspace about CPUs in the same frequency domain.

Signed-off-by: Ionela Voinescu
Acked-by: Viresh Kumar
Tested-by: Mian Yousaf Kaukab
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/cppc_cpufreq.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index f15a44c8b6b7..40b58d2dbbc6 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -402,6 +402,19 @@ static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state)
 	return 0;
 }
 
+static ssize_t show_freqdomain_cpus(struct cpufreq_policy *policy, char *buf)
+{
+	unsigned int cpu = policy->cpu;
+
+	return cpufreq_show_cpus(all_cpu_data[cpu]->shared_cpu_map, buf);
+}
+cpufreq_freq_attr_ro(freqdomain_cpus);
+
+static struct freq_attr *cppc_cpufreq_attr[] = {
+	&freqdomain_cpus,
+	NULL,
+};
+
 static struct cpufreq_driver cppc_cpufreq_driver = {
 	.flags = CPUFREQ_CONST_LOOPS,
 	.verify = cppc_verify_policy,
@@ -410,6 +423,7 @@ static struct cpufreq_driver cppc_cpufreq_driver = {
 	.init = cppc_cpufreq_cpu_init,
 	.stop_cpu = cppc_cpufreq_stop_cpu,
 	.set_boost = cppc_cpufreq_set_boost,
+	.attr = cppc_cpufreq_attr,
 	.name = "cppc_cpufreq",
 };
--
cgit v1.2.3
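Because the attribute is registered through the driver's .attr table, it
appears in each policy's cpufreq directory. A minimal userspace sketch of
consuming it (the sysfs path is an assumption based on the standard
per-policy layout; verify it on the target system):

#include <stdio.h>

int main(void)
{
	char buf[256];
	/* Assumed path: standard per-policy cpufreq attribute location. */
	FILE *f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/freqdomain_cpus", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("CPUs sharing CPU0's frequency domain: %s", buf);
	fclose(f);
	return 0;
}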
From a28b2bfc099c6b9caa6ef697660408e076a32019 Mon Sep 17 00:00:00 2001
From: Ionela Voinescu
Date: Mon, 14 Dec 2020 12:38:23 +0000
Subject: cppc_cpufreq: replace per-cpu data array with a list

The cppc_cpudata per-cpu storage was inefficient (1), in addition to
causing functional issues (2) when CPUs are hotplugged out, due to
per-cpu data being improperly initialised.

(1) The amount of information needed for CPPC performance control in its
    cpufreq driver depends on the domain (PSD) coordination type:

    ANY:  One set of CPPC control and capability data (e.g. desired
          performance, highest/lowest performance, etc) applies to all
          CPUs in the domain.

    ALL:  Same as ANY. Note that this type is not currently supported.
          When supported, information about which CPUs belong to a
          domain is needed in order for frequency change requests to be
          sent to each of them.

    HW:   It's necessary to store CPPC control and capability
          information for all the CPUs. HW will then coordinate the
          performance state based on their limitations and requests.

    NONE: Same as HW. No HW coordination is expected.

    Despite this, the previous initialisation code would indiscriminately
    allocate memory for all CPUs (all_cpu_data) and unnecessarily
    duplicate performance capabilities and the domain sharing mask and
    type for each possible CPU.

(2) With the per-cpu structure, when using ANY coordination, the
    cppc_cpudata cpu information is not initialised (will remain 0) for
    all CPUs in a policy, other than policy->cpu. When policy->cpu is
    hotplugged out, the driver will incorrectly use the uninitialised
    (0) value of the other CPUs when making frequency changes.
    Additionally, the values previously stored in
    perf_ctrls.desired_perf will be lost when policy->cpu changes.

Therefore replace the array of per-cpu data with a list. The memory for
each structure is allocated at policy init, where a single structure
can be allocated per policy, not per CPU. In order to accommodate the
struct list_head node in the cppc_cpudata structure, the now unused cpu
and cur_policy variables are removed.

For example, on an arm64 Juno platform with 6 CPUs: (0, 1, 2, 3) in
PSD1, (4, 5) in PSD2 - ANY coordination, the memory allocation
comparison shows:

Before patch:
- ANY coordination:
  total  slack  req  alloc/free  caller
      0      0    0     0/1      _kernel_size_le_hi32+0x0xffff800008ff7810
      0      0    0     0/6      _kernel_size_le_hi32+0x0xffff800008ff7808
    128     80   48     1/0      _kernel_size_le_hi32+0x0xffff800008ffc070
    768      0  768     6/0      _kernel_size_le_hi32+0x0xffff800008ffc0e4

After patch:
- ANY coordination:
  total  slack  req  alloc/free  caller
    256      0  256     2/0      _kernel_size_le_hi32+0x0xffff800008fed410
      0      0    0     0/2      _kernel_size_le_hi32+0x0xffff800008fed274

Additional notes:
- A pointer to the policy's cppc_cpudata is stored in
  policy->driver_data
- Driver registration is skipped if _CPC entries are not present.

Signed-off-by: Ionela Voinescu
Tested-by: Mian Yousaf Kaukab
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/cppc_cpufreq.c | 174 +++++++++++++++++++++--------------------
 1 file changed, 91 insertions(+), 83 deletions(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 40b58d2dbbc6..8a482c434ea6 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -30,13 +30,13 @@
 #define DMI_PROCESSOR_MAX_SPEED  0x14
 
 /*
- * These structs contain information parsed from per CPU
- * ACPI _CPC structures.
- * e.g. For each CPU the highest, lowest supported
- * performance capabilities, desired performance level
- * requested etc.
+ * This list contains information parsed from per CPU ACPI _CPC and _PSD
+ * structures: e.g. the highest and lowest supported performance, capabilities,
+ * desired performance level requested etc. Depending on the share_type, not
+ * all CPUs will have an entry in the list.
  */
-static struct cppc_cpudata **all_cpu_data;
+static LIST_HEAD(cpu_data_list);
+
 static bool boost_supported;
 
 struct cppc_workaround_oem_info {
@@ -148,8 +148,9 @@ static unsigned int cppc_cpufreq_khz_to_perf(struct cppc_cpudata *cpu_data,
 static int cppc_cpufreq_set_target(struct cpufreq_policy *policy,
 				   unsigned int target_freq,
 				   unsigned int relation)
+
 {
-	struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu];
+	struct cppc_cpudata *cpu_data = policy->driver_data;
 	unsigned int cpu = policy->cpu;
 	struct cpufreq_freqs freqs;
 	u32 desired_perf;
@@ -183,7 +184,7 @@ static int cppc_verify_policy(struct cpufreq_policy_data *policy)
 
 static void cppc_cpufreq_stop_cpu(struct cpufreq_policy *policy)
 {
-	struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu];
+	struct cppc_cpudata *cpu_data = policy->driver_data;
 	struct cppc_perf_caps *caps = &cpu_data->perf_caps;
 	unsigned int cpu = policy->cpu;
 	int ret;
@@ -194,6 +195,12 @@ static void cppc_cpufreq_stop_cpu(struct cpufreq_policy *policy)
 	if (ret)
 		pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n",
 			 caps->lowest_perf, cpu, ret);
+
+	/* Remove CPU node from list and free driver data for policy */
+	free_cpumask_var(cpu_data->shared_cpu_map);
+	list_del(&cpu_data->node);
+	kfree(policy->driver_data);
+	policy->driver_data = NULL;
 }
 
 /*
@@ -239,25 +246,61 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu)
 }
 #endif
 
-static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
+
+static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu)
 {
-	struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu];
-	struct cppc_perf_caps *caps = &cpu_data->perf_caps;
-	unsigned int cpu = policy->cpu;
-	int i, ret = 0;
+	struct cppc_cpudata *cpu_data;
+	int ret;
+
+	cpu_data = kzalloc(sizeof(struct cppc_cpudata), GFP_KERNEL);
+	if (!cpu_data)
+		goto out;
 
-	cpu_data->cpu = cpu;
-	ret = cppc_get_perf_caps(cpu, caps);
+	if (!zalloc_cpumask_var(&cpu_data->shared_cpu_map, GFP_KERNEL))
+		goto free_cpu;
 
+	ret = acpi_get_psd_map(cpu, cpu_data);
 	if (ret) {
-		pr_debug("Err reading CPU%d perf capabilities. ret:%d\n",
-			 cpu, ret);
-		return ret;
+		pr_debug("Err parsing CPU%d PSD data: ret:%d\n", cpu, ret);
+		goto free_mask;
+	}
+
+	ret = cppc_get_perf_caps(cpu, &cpu_data->perf_caps);
+	if (ret) {
+		pr_debug("Err reading CPU%d perf caps: ret:%d\n", cpu, ret);
+		goto free_mask;
 	}
 
 	/* Convert the lowest and nominal freq from MHz to KHz */
-	caps->lowest_freq *= 1000;
-	caps->nominal_freq *= 1000;
+	cpu_data->perf_caps.lowest_freq *= 1000;
+	cpu_data->perf_caps.nominal_freq *= 1000;
+
+	list_add(&cpu_data->node, &cpu_data_list);
+
+	return cpu_data;
+
+free_mask:
+	free_cpumask_var(cpu_data->shared_cpu_map);
+free_cpu:
+	kfree(cpu_data);
+out:
+	return NULL;
+}
+
+static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+	unsigned int cpu = policy->cpu;
+	struct cppc_cpudata *cpu_data;
+	struct cppc_perf_caps *caps;
+	int ret;
+
+	cpu_data = cppc_cpufreq_get_cpu_data(cpu);
+	if (!cpu_data) {
+		pr_err("Error in acquiring _CPC/_PSD data for CPU%d.\n", cpu);
+		return -ENODEV;
+	}
+
+	caps = &cpu_data->perf_caps;
+	policy->driver_data = cpu_data;
 
 	/*
 	 * Set min to lowest nonlinear perf to avoid any efficiency penalty (see
@@ -287,16 +330,12 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 		/* Nothing to be done - we'll have a policy for each CPU */
 		break;
 	case CPUFREQ_SHARED_TYPE_ANY:
-		/* All CPUs in the domain will share a policy */
+		/*
+		 * All CPUs in the domain will share a policy and all cpufreq
+		 * operations will use a single cppc_cpudata structure stored
+		 * in policy->driver_data.
+		 */
 		cpumask_copy(policy->cpus, cpu_data->shared_cpu_map);
-
-		for_each_cpu(i, policy->cpus) {
-			if (unlikely(i == cpu))
-				continue;
-
-			memcpy(&all_cpu_data[i]->perf_caps, caps,
-			       sizeof(cpu_data->perf_caps));
-		}
 		break;
 	default:
 		pr_debug("Unsupported CPU co-ord type: %d\n",
@@ -304,8 +343,6 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 		return -EFAULT;
 	}
 
-	cpu_data->cur_policy = policy;
-
 	/*
 	 * If 'highest_perf' is greater than 'nominal_perf', we assume CPU Boost
 	 * is supported.
@@ -360,9 +397,12 @@ static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
 static unsigned int cppc_cpufreq_get_rate(unsigned int cpu)
 {
 	struct cppc_perf_fb_ctrs fb_ctrs_t0 = {0}, fb_ctrs_t1 = {0};
-	struct cppc_cpudata *cpu_data = all_cpu_data[cpu];
+	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+	struct cppc_cpudata *cpu_data = policy->driver_data;
 	int ret;
 
+	cpufreq_cpu_put(policy);
+
 	ret = cppc_get_perf_ctrs(cpu, &fb_ctrs_t0);
 	if (ret)
 		return ret;
@@ -378,7 +418,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu)
 
 static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state)
 {
-	struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu];
+	struct cppc_cpudata *cpu_data = policy->driver_data;
 	struct cppc_perf_caps *caps = &cpu_data->perf_caps;
 	int ret;
 
@@ -404,9 +444,9 @@ static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state)
 
 static ssize_t show_freqdomain_cpus(struct cpufreq_policy *policy, char *buf)
 {
-	unsigned int cpu = policy->cpu;
+	struct cppc_cpudata *cpu_data = policy->driver_data;
 
-	return cpufreq_show_cpus(all_cpu_data[cpu]->shared_cpu_map, buf);
+	return cpufreq_show_cpus(cpu_data->shared_cpu_map, buf);
 }
 cpufreq_freq_attr_ro(freqdomain_cpus);
 
@@ -435,10 +475,13 @@ static struct cpufreq_driver cppc_cpufreq_driver = {
  */
 static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu)
 {
-	struct cppc_cpudata *cpu_data = all_cpu_data[cpu];
+	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+	struct cppc_cpudata *cpu_data = policy->driver_data;
 	u64 desired_perf;
 	int ret;
 
+	cpufreq_cpu_put(policy);
+
 	ret = cppc_get_desired_perf(cpu, &desired_perf);
 	if (ret < 0)
 		return -EIO;
@@ -471,68 +514,33 @@ static void cppc_check_hisi_workaround(void)
 
 static int __init cppc_cpufreq_init(void)
 {
-	struct cppc_cpudata *cpu_data;
-	int i, ret = 0;
-
-	if (acpi_disabled)
+	if ((acpi_disabled) || !acpi_cpc_valid())
 		return -ENODEV;
 
-	all_cpu_data = kcalloc(num_possible_cpus(), sizeof(void *),
-			       GFP_KERNEL);
-	if (!all_cpu_data)
-		return -ENOMEM;
-
-	for_each_possible_cpu(i) {
-		all_cpu_data[i] = kzalloc(sizeof(struct cppc_cpudata), GFP_KERNEL);
-		if (!all_cpu_data[i])
-			goto out;
-
-		cpu_data = all_cpu_data[i];
-		if (!zalloc_cpumask_var(&cpu_data->shared_cpu_map, GFP_KERNEL))
-			goto out;
-	}
-
-	ret = acpi_get_psd_map(all_cpu_data);
-	if (ret) {
-		pr_debug("Error parsing PSD data. Aborting cpufreq registration.\n");
-		goto out;
-	}
+	INIT_LIST_HEAD(&cpu_data_list);
 
 	cppc_check_hisi_workaround();
 
-	ret = cpufreq_register_driver(&cppc_cpufreq_driver);
-	if (ret)
-		goto out;
+	return cpufreq_register_driver(&cppc_cpufreq_driver);
+}
 
-	return ret;
+static inline void free_cpu_data(void)
+{
+	struct cppc_cpudata *iter, *tmp;
 
-out:
-	for_each_possible_cpu(i) {
-		cpu_data = all_cpu_data[i];
-		if (!cpu_data)
-			break;
-		free_cpumask_var(cpu_data->shared_cpu_map);
-		kfree(cpu_data);
+	list_for_each_entry_safe(iter, tmp, &cpu_data_list, node) {
+		free_cpumask_var(iter->shared_cpu_map);
+		list_del(&iter->node);
+		kfree(iter);
 	}
-	kfree(all_cpu_data);
-	return -ENODEV;
 }
 
 static void __exit cppc_cpufreq_exit(void)
 {
-	struct cppc_cpudata *cpu_data;
-	int i;
-
 	cpufreq_unregister_driver(&cppc_cpufreq_driver);
 
-	for_each_possible_cpu(i) {
-		cpu_data = all_cpu_data[i];
-		free_cpumask_var(cpu_data->shared_cpu_map);
-		kfree(cpu_data);
-	}
-
-	kfree(all_cpu_data);
+	free_cpu_data();
 }
 
 module_exit(cppc_cpufreq_exit);
--
cgit v1.2.3
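One design note on the teardown path above: free_cpu_data() must iterate with
list_for_each_entry_safe() because each pass deletes the node the cursor
points at; the _safe variant caches the next node before the loop body runs,
so the walk survives the list_del(). A minimal standalone sketch of the
pattern (illustrative, mirroring the driver's exit path rather than copying
it):

#include <linux/list.h>
#include <linux/slab.h>

struct item {
	struct list_head node;
};

static LIST_HEAD(item_list);

static void free_all_items(void)
{
	struct item *iter, *tmp;

	/* 'tmp' already holds the next node, so deleting 'iter' is safe. */
	list_for_each_entry_safe(iter, tmp, &item_list, node) {
		list_del(&iter->node);
		kfree(iter);
	}
}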
From ee2cc4276ba4909438f5894a218877660e1536d9 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Mon, 14 Dec 2020 21:08:00 +0100
Subject: cpufreq: Add special-purpose fast-switching callback for drivers

First off, some cpufreq drivers (e.g. intel_pstate) can pass hints
beyond the current target frequency to the hardware and there are no
provisions for doing that in the cpufreq framework. In particular,
today the driver has to assume that it should not allow the frequency
to fall below the one requested by the governor (or the required
capacity may not be provided) which may not be the case and which may
lead to excessive energy usage in some scenarios.

Second, the hints passed by these drivers to the hardware need not be
in terms of the frequency, so representing the utilization numbers
coming from the scheduler as frequency before passing them to those
drivers is not really useful.

Address the two points above by adding a special-purpose replacement
for the ->fast_switch callback, called ->adjust_perf, allowing the
governor to pass abstract performance level (rather than frequency)
values for the minimum (required) and target (desired) performance
along with the CPU capacity to compare them to.

Also update the schedutil governor to use the new callback instead of
->fast_switch if present and if the utilization metrics are
frequency-invariant (which is a prerequisite for the direct mapping
between the utilization and the CPU performance levels to be a
reasonable approximation).

Signed-off-by: Rafael J. Wysocki
Acked-by: Viresh Kumar
---
 drivers/cpufreq/cpufreq.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index c17aa2973c44..d0a3525ce27f 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2097,6 +2097,46 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
 }
 EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
 
+/**
+ * cpufreq_driver_adjust_perf - Adjust CPU performance level in one go.
+ * @cpu: Target CPU.
+ * @min_perf: Minimum (required) performance level (units of @capacity).
+ * @target_perf: Target (desired) performance level (units of @capacity).
+ * @capacity: Capacity of the target CPU.
+ *
+ * Carry out a fast performance level switch of @cpu without sleeping.
+ *
+ * The driver's ->adjust_perf() callback invoked by this function must be
+ * suitable for being called from within RCU-sched read-side critical sections
+ * and it is expected to select a suitable performance level equal to or above
+ * @min_perf and preferably equal to or below @target_perf.
+ *
+ * This function must not be called if policy->fast_switch_enabled is unset.
+ *
+ * Governors calling this function must guarantee that it will never be invoked
+ * twice in parallel for the same CPU and that it will never be called in
+ * parallel with either ->target() or ->target_index() or ->fast_switch() for
+ * the same CPU.
+ */
+void cpufreq_driver_adjust_perf(unsigned int cpu,
+				unsigned long min_perf,
+				unsigned long target_perf,
+				unsigned long capacity)
+{
+	cpufreq_driver->adjust_perf(cpu, min_perf, target_perf, capacity);
+}
+
+/**
+ * cpufreq_driver_has_adjust_perf - Check "direct fast switch" callback.
+ *
+ * Return 'true' if the ->adjust_perf callback is present for the
+ * current driver or 'false' otherwise.
+ */
+bool cpufreq_driver_has_adjust_perf(void)
+{
+	return !!cpufreq_driver->adjust_perf;
+}
+
 /* Must set freqs->new to intermediate frequency */
 static int __target_intermediate(struct cpufreq_policy *policy,
 				 struct cpufreq_freqs *freqs, int index)
--
cgit v1.2.3
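To make the calling convention concrete, here is a hedged sketch of how a
schedutil-style governor might dispatch between the two fast paths.
sugov_dispatch() and its parameters are illustrative assumptions, not the
actual scheduler code; cpufreq_driver_has_adjust_perf(),
cpufreq_driver_adjust_perf(), map_util_freq() and
cpufreq_driver_fast_switch() are the real kernel interfaces:

static void sugov_dispatch(struct cpufreq_policy *policy, unsigned int cpu,
			   unsigned long min_util, unsigned long util,
			   unsigned long capacity)
{
	if (cpufreq_driver_has_adjust_perf()) {
		/* Pass abstract performance levels straight through;
		 * no conversion of utilization to a frequency. */
		cpufreq_driver_adjust_perf(cpu, min_util, util, capacity);
	} else {
		/* Legacy path: map utilization to a frequency first. */
		unsigned int freq = map_util_freq(util,
						  policy->cpuinfo.max_freq,
						  capacity);

		cpufreq_driver_fast_switch(policy, freq);
	}
}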
From a365ab6b9dfbaf8fb4fb4cd5d8a4c55dc4fb8b1c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Mon, 14 Dec 2020 21:09:26 +0100
Subject: cpufreq: intel_pstate: Implement the ->adjust_perf() callback

Make intel_pstate expose the ->adjust_perf() callback when it operates
in the passive mode with HWP enabled, which causes the schedutil
governor to use that callback instead of ->fast_switch().

The minimum and target performance-level values passed by the governor
to ->adjust_perf() are converted to HWP.REQ.MIN and HWP.REQ.DESIRED,
respectively, which allows the processor to adjust its configuration
to maximize energy-efficiency while providing sufficient capacity.

Signed-off-by: Rafael J. Wysocki
Acked-by: Srinivas Pandruvada
Acked-by: Viresh Kumar
---
 drivers/cpufreq/intel_pstate.c | 70 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 12 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 2a4db856222f..d5ec0c962ec5 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2526,20 +2526,19 @@ static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, in
 		fp_toint(cpu->iowait_boost * 100));
 }
 
-static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 target_pstate,
-				     bool strict, bool fast_switch)
+static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 min, u32 max,
+				     u32 desired, bool fast_switch)
 {
 	u64 prev = READ_ONCE(cpu->hwp_req_cached), value = prev;
 
 	value &= ~HWP_MIN_PERF(~0L);
-	value |= HWP_MIN_PERF(target_pstate);
+	value |= HWP_MIN_PERF(min);
 
-	/*
-	 * The entire MSR needs to be updated in order to update the HWP min
-	 * field in it, so opportunistically update the max too if needed.
-	 */
 	value &= ~HWP_MAX_PERF(~0L);
-	value |= HWP_MAX_PERF(strict ? target_pstate : cpu->max_perf_ratio);
+	value |= HWP_MAX_PERF(max);
+
+	value &= ~HWP_DESIRED_PERF(~0L);
+	value |= HWP_DESIRED_PERF(desired);
 
 	if (value == prev)
 		return;
@@ -2569,11 +2568,15 @@ static int intel_cpufreq_update_pstate(struct cpufreq_policy *policy,
 	int old_pstate = cpu->pstate.current_pstate;
 
 	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
-	if (hwp_active)
-		intel_cpufreq_adjust_hwp(cpu, target_pstate,
-					 policy->strict_target, fast_switch);
-	else if (target_pstate != old_pstate)
+	if (hwp_active) {
+		int max_pstate = policy->strict_target ?
+					target_pstate : cpu->max_perf_ratio;
+
+		intel_cpufreq_adjust_hwp(cpu, target_pstate, max_pstate, 0,
+					 fast_switch);
+	} else if (target_pstate != old_pstate) {
 		intel_cpufreq_adjust_perf_ctl(cpu, target_pstate, fast_switch);
+	}
 
 	cpu->pstate.current_pstate = target_pstate;
 
@@ -2634,6 +2637,47 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
 	return target_pstate * cpu->pstate.scaling;
 }
 
+static void intel_cpufreq_adjust_perf(unsigned int cpunum,
+				      unsigned long min_perf,
+				      unsigned long target_perf,
+				      unsigned long capacity)
+{
+	struct cpudata *cpu = all_cpu_data[cpunum];
+	int old_pstate = cpu->pstate.current_pstate;
+	int cap_pstate, min_pstate, max_pstate, target_pstate;
+
+	update_turbo_state();
+	cap_pstate = global.turbo_disabled ? cpu->pstate.max_pstate :
+					     cpu->pstate.turbo_pstate;
+
+	/* Optimization: Avoid unnecessary divisions. */
+
+	target_pstate = cap_pstate;
+	if (target_perf < capacity)
+		target_pstate = DIV_ROUND_UP(cap_pstate * target_perf, capacity);
+
+	min_pstate = cap_pstate;
+	if (min_perf < capacity)
+		min_pstate = DIV_ROUND_UP(cap_pstate * min_perf, capacity);
+
+	if (min_pstate < cpu->pstate.min_pstate)
+		min_pstate = cpu->pstate.min_pstate;
+
+	if (min_pstate < cpu->min_perf_ratio)
+		min_pstate = cpu->min_perf_ratio;
+
+	max_pstate = min(cap_pstate, cpu->max_perf_ratio);
+	if (max_pstate < min_pstate)
+		max_pstate = min_pstate;
+
+	target_pstate = clamp_t(int, target_pstate, min_pstate, max_pstate);
+
+	intel_cpufreq_adjust_hwp(cpu, min_pstate, max_pstate, target_pstate, true);
+
+	cpu->pstate.current_pstate = target_pstate;
+	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);
+}
+
 static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
 	int max_state, turbo_max, min_freq, max_freq, ret;
@@ -3032,6 +3076,8 @@ static int __init intel_pstate_init(void)
 			intel_pstate.attr = hwp_cpufreq_attrs;
 			intel_cpufreq.attr = hwp_cpufreq_attrs;
 			intel_cpufreq.flags |= CPUFREQ_NEED_UPDATE_LIMITS;
+			intel_cpufreq.fast_switch = NULL;
+			intel_cpufreq.adjust_perf = intel_cpufreq_adjust_perf;
 			if (!default_driver)
 				default_driver = &intel_pstate;
--
cgit v1.2.3
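The perf-to-P-state mapping above is plain proportional scaling with a
round-up. A worked example with illustrative numbers (assumed, not taken from
the patch): with cap_pstate = 40 and capacity = 1024, a target_perf of 512
maps to DIV_ROUND_UP(40 * 512, 1024) = 20, and rounding up guarantees the
request is never silently under-provisioned. A standalone sketch of the same
mapping:

#include <linux/kernel.h>

/* Same mapping as intel_cpufreq_adjust_perf() uses; DIV_ROUND_UP(a, b)
 * expands to ((a) + (b) - 1) / (b) in the kernel. */
static int perf_to_pstate(int cap_pstate, unsigned long perf,
			  unsigned long capacity)
{
	/* Skip the division when perf saturates capacity. */
	if (perf >= capacity)
		return cap_pstate;

	return (int)DIV_ROUND_UP(cap_pstate * perf, capacity);
}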
From e40ad84c26b4deeee46666492ec66b9a534b8e59 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Thu, 17 Dec 2020 20:17:49 +0100
Subject: cpufreq: intel_pstate: Use most recent guaranteed performance values

When turbo has been disabled by the BIOS, but HWP_CAP.GUARANTEED is
changed later, user space may want to take advantage of this increased
guaranteed performance.

HWP_CAP.GUARANTEED is not a static value. It can be adjusted by an
out-of-band agent or during an Intel Speed Select performance level
change. The HWP_CAP.MAX is still the maximum achievable performance
with turbo disabled by the BIOS, so HWP_CAP.GUARANTEED can still change
as long as it remains less than or equal to HWP_CAP.MAX.

When HWP_CAP.GUARANTEED is changed, the sysfs base_frequency attribute
shows the most recent guaranteed frequency value. This attribute can be
used by user space software to update the scaling min/max limits of the
CPU.

Currently, the ->setpolicy() callback already uses the latest HWP_CAP
values when setting HWP_REQ, but the ->verify() callback will restrict
the user settings to the old guaranteed performance value, which
prevents user space from making use of the extra CPU capacity
theoretically available to it after increasing HWP_CAP.GUARANTEED.

To address this, read HWP_CAP in intel_pstate_verify_cpu_policy() to
obtain the maximum P-state that can be used and use that to confine the
policy max limit instead of using the cached and possibly stale
pstate.max_freq value for this purpose.

For consistency, update intel_pstate_update_perf_limits() to use the
maximum available P-state returned by intel_pstate_get_hwp_max() to
compute the maximum frequency instead of using the return value of
intel_pstate_get_max_freq() which, again, may be stale.

This issue is a side-effect of fixing the scaling frequency limits in
commit eacc9c5a927e ("cpufreq: intel_pstate: Fix
intel_pstate_get_hwp_max() for turbo disabled") which corrected the
setting of the reduced scaling frequency values, but caused stale
HWP_CAP.GUARANTEED to be used in the case at hand.

Fixes: eacc9c5a927e ("cpufreq: intel_pstate: Fix intel_pstate_get_hwp_max() for turbo disabled")
Reported-by: Srinivas Pandruvada
Tested-by: Srinivas Pandruvada
Cc: 5.8+ # 5.8+
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpufreq/intel_pstate.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index d5ec0c962ec5..6e23376548ce 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2207,9 +2207,9 @@ static void intel_pstate_update_perf_limits(struct cpudata *cpu,
 					    unsigned int policy_min,
 					    unsigned int policy_max)
 {
-	int max_freq = intel_pstate_get_max_freq(cpu);
 	int32_t max_policy_perf, min_policy_perf;
 	int max_state, turbo_max;
+	int max_freq;
 
 	/*
 	 * HWP needs some special consideration, because on BDX the
@@ -2223,6 +2223,7 @@ static void intel_pstate_update_perf_limits(struct cpudata *cpu,
 			cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
 		turbo_max = cpu->pstate.turbo_pstate;
 	}
+	max_freq = max_state * cpu->pstate.scaling;
 
 	max_policy_perf = max_state * policy_max / max_freq;
 	if (policy_max == policy_min) {
@@ -2325,9 +2326,18 @@ static void intel_pstate_adjust_policy_max(struct cpudata *cpu,
 static void intel_pstate_verify_cpu_policy(struct cpudata *cpu,
 					   struct cpufreq_policy_data *policy)
 {
+	int max_freq;
+
 	update_turbo_state();
-	cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
-				     intel_pstate_get_max_freq(cpu));
+	if (hwp_active) {
+		int max_state, turbo_max;
+
+		intel_pstate_get_hwp_max(cpu->cpu, &turbo_max, &max_state);
+		max_freq = max_state * cpu->pstate.scaling;
+	} else {
+		max_freq = intel_pstate_get_max_freq(cpu);
+	}
+	cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, max_freq);
 
 	intel_pstate_adjust_policy_max(cpu, policy);
 }
--
cgit v1.2.3