author     Vincent Guittot <vincent.guittot@linaro.org>   2015-01-09 14:05:39 +0100
committer  Vincent Guittot <vincent.guittot@linaro.org>   2015-01-09 14:05:39 +0100
commit     6e825db0ab391a047c0d134277c6b3c7a02084e3 (patch)
tree       35f8746b0e2855aecb2cc70080f6222a75abc7a5 /kernel/sched/fair.c
parent     4de4b2f03d6386b26a06946023764b7257fb78c5 (diff)
parent     8fab92fe85001c7684810419af1832a7d8d327b6 (diff)
Merge branch 'sched-tasks-packing' into test-sched-packing-tasks
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--   kernel/sched/fair.c   402
1 file changed, 385 insertions, 17 deletions
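A note on the arithmetic before the diff: update_packing_domain() below selects the sched_group with the lowest original capacity per core, and it compares the capacity/weight ratios by cross-multiplication so no integer division (and no truncation) is involved. The standalone sketch here illustrates that comparison only; the struct and the capacity numbers are simplified stand-ins, not the kernel's sched_group_capacity.

/*
 * Division-free "lowest capacity per core" comparison, as used by
 * update_packing_domain() in the diff below. Values are made up.
 */
#include <stdio.h>

struct group {
	unsigned long capacity_orig; /* raw capacity, SCHED_CAPACITY_SCALE units */
	unsigned int group_weight;   /* number of CPUs in the group */
};

/* Nonzero when 'a' has strictly lower capacity per core than 'b'. */
static int lower_capacity_per_core(const struct group *a, const struct group *b)
{
	/* a/wa < b/wb  <=>  a*wb < b*wa, avoiding integer-division truncation */
	return a->capacity_orig * b->group_weight <
	       b->capacity_orig * a->group_weight;
}

int main(void)
{
	/* hypothetical little/big clusters, two CPUs each */
	struct group little = { .capacity_orig = 2 * 446,  .group_weight = 2 };
	struct group big    = { .capacity_orig = 2 * 1024, .group_weight = 2 };

	/* 446 per core vs 1024 per core: the little group wins for packing */
	printf("pack on little group: %d\n",
	       lower_capacity_per_core(&little, &big));
	return 0;
}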
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9d79a8b7d00..af0b0dbdb860 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -179,6 +179,243 @@ void sched_init_granularity(void)
 	update_sysctl();
 }
 
+static unsigned long capacity_orig_of(int cpu);
+static unsigned long capacity_of(int cpu);
+static int get_cpu_usage(int cpu);
+
+#ifdef CONFIG_SCHED_PACKING_TASKS
+struct sd_pack {
+	int my_buddy;			/* cpu on which tasks should be packed */
+	int my_leader;			/* cpu which leads the packing state of a group */
+	int packing;			/* boolean to activate packing on buddy */
+	struct sched_domain *domain;	/* domain at which the check is done */
+	struct sched_group *group;	/* starting group for checking */
+};
+
+/*
+ * Save per_cpu information about the optimal CPUs that should be used to pack
+ * tasks.
+ */
+DEFINE_PER_CPU(struct sd_pack, sd_pack_buddy) = {
+	.packing = true,
+};
+
+static inline int get_buddy(int cpu)
+{
+	if (per_cpu(sd_pack_buddy, cpu).packing)
+		return cpu;
+	return per_cpu(sd_pack_buddy, cpu).my_buddy;
+}
+
+static inline int get_leader(int cpu)
+{
+	return per_cpu(sd_pack_buddy, cpu).my_leader;
+}
+
+static inline bool is_packing_cpu(int cpu)
+{
+	return per_cpu(sd_pack_buddy, cpu).packing;
+}
+
+static inline bool is_leader_cpu(int cpu, struct sched_domain *sd)
+{
+	if (sd != per_cpu(sd_pack_buddy, cpu).domain)
+		return 0;
+
+	if (cpu != get_leader(cpu))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Look for the best buddy CPU that can be used to pack small tasks.
+ * We assume that it is not worth packing on CPUs that share the same
+ * power line, so we look for the first sched_domain without the
+ * SD_SHARE_POWERDOMAIN flag. Then we look for the sched_group with the
+ * lowest power per core, on the assumption that its power efficiency is
+ * better.
+ */
+void update_packing_domain(int cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *target = NULL;
+	struct sd_pack *sdpack = &per_cpu(sd_pack_buddy, cpu);
+	int id = cpu, pcpu = cpu;
+	char buf[40] = "-";
+
+	sd = highest_flag_domain(cpu, SD_SHARE_POWERDOMAIN);
+	if (!sd)
+		sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+	else
+		sd = sd->parent;
+
+	if (sd) {
+		pcpu = cpumask_first(sched_group_cpus(sd->groups));
+		if (pcpu != cpu) {
+			id = per_cpu(sd_pack_buddy, pcpu).my_buddy;
+			goto end;
+		}
+	}
+
+	while (sd && (sd->flags & SD_LOAD_BALANCE)
+		&& !(sd->flags & SD_SHARE_POWERDOMAIN)) {
+		struct sched_group *tmp;
+		struct sched_group *sg = sd->groups;
+
+		/*
+		 * The sched_domain of a CPU points to the local sched_group,
+		 * and this CPU of this local group is a good candidate.
+		 */
+		id = cpu;
+		target = sg;
+
+		/* loop over the sched groups to find the best one */
+		for (tmp = sg->next; tmp != sg; tmp = tmp->next) {
+			if (tmp->sgc->capacity_orig * target->group_weight >
+				target->sgc->capacity_orig * tmp->group_weight)
+				continue;
+
+			if ((tmp->sgc->capacity_orig * target->group_weight ==
+				target->sgc->capacity_orig * tmp->group_weight)
+				&& (cpumask_first(sched_group_cpus(tmp)) >= id))
+				continue;
+
+			/*
+			 * We have found a better group.
+			 * Take the first CPU of the new group.
+			 */
+			id = cpumask_first(sched_group_cpus(tmp));
+			target = tmp;
+		}
+
+		/* Look for a CPU other than itself */
+		if (id != cpu)
+			break;
+
+		sd = sd->parent;
+	}
+
+end:
+	if (target)
+		cpumask_scnprintf(buf, 40, sched_group_cpus(target));
+	pr_info("CPU%d packing on CPU%d of group 0x%s\n", cpu, id, buf);
+	if (sd)
+		cpumask_scnprintf(buf, 40, sched_group_cpus(sd->groups));
+	pr_info("CPU%d led by CPU%d for group 0x%s\n", cpu, pcpu, buf);
+	pr_info("CPU%d packing @ level %d\n", cpu, sd ? sd->level : -1);
+
+	sdpack->my_leader = pcpu;
+	sdpack->my_buddy = id;
+	sdpack->packing = true;
+	sdpack->domain = sd;
+	sdpack->group = target;
+}
+
+
+bool update_packing_buddy(int cpu, struct sched_domain *sd, long activity, long perf)
+{
+	struct sched_group *tmp, *target = per_cpu(sd_pack_buddy, cpu).group;
+	long capacity = 0, vactivity = activity, vperf = perf;
+	int packing = per_cpu(sd_pack_buddy, cpu).packing;
+
+	/* Get the state of the first CPU of the power group */
+	if (!is_leader_cpu(cpu, sd)) {
+		per_cpu(sd_pack_buddy, cpu).packing = is_packing_cpu(get_leader(cpu));
+		return (per_cpu(sd_pack_buddy, cpu).packing != packing);
+	}
+
+	/* No group on which we can compute the needed number of CPUs */
+	if (!target)
+		return false;
+
+	tmp = target;
+
+	/* Take the best groups at this sd level to pack activity */
+	while (vactivity > 0) {
+		int vcapacity = (tmp->sgc->capacity_orig * tmp->sgc->pack_thres) >> 10;
+
+		vactivity -= vcapacity;
+
+		if (cpumask_test_cpu(cpu, sched_group_cpus(tmp))) {
+			per_cpu(sd_pack_buddy, cpu).packing = true;
+			vactivity = 0;
+		} else if (vactivity < 0)
+			per_cpu(sd_pack_buddy, cpu).packing = false;
+
+		capacity += tmp->sgc->capacity_orig;
+
+		tmp = tmp->next;
+		if (tmp == target)
+			vactivity = 0;
+	}
+
+	tmp = target;
+
+	/* Take the best group at this sd level to handle max perf activity */
+	while (vperf > 0) {
+		int vcapacity = (tmp->sgc->capacity_orig * tmp->sgc->perf_thres) >> 10;
+
+		if (vcapacity >= vperf) {
+			if (cpumask_test_cpu(cpu, sched_group_cpus(tmp)))
+				per_cpu(sd_pack_buddy, cpu).packing = true;
+
+			vperf = 0;
+		}
+
+		tmp = tmp->next;
+		if (tmp == target)
+			vperf = 0;
+	}
+
+	return (per_cpu(sd_pack_buddy, cpu).packing != packing);
+}
+
+static int check_nohz_packing(int cpu)
+{
+	if (!is_packing_cpu(cpu))
+		return true;
+
+	return false;
+}
+
+static inline int
+check_cpu_perf(struct rq *rq, struct sched_group *group)
+{
+	return ((get_cpu_usage(cpu_of(rq)) * 1024) >
+		(group->sgc->capacity_orig * group->sgc->perf_thres));
+}
+
+#else /* CONFIG_SCHED_PACKING_TASKS */
+
+static inline bool is_packing_cpu(int cpu)
+{
+	return 1;
+}
+
+static inline bool is_leader_cpu(int cpu, struct sched_domain *sd)
+{
+	return 0;
+}
+
+static inline int get_buddy(int cpu)
+{
+	return cpu;
+}
+
+static inline int check_nohz_packing(int cpu)
+{
+	return false;
+}
+
+static inline int check_cpu_perf(struct rq *rq, struct sched_group *group)
+{
+	return 0;
+}
+
+#endif /* CONFIG_SCHED_PACKING_TASKS */
+
 #define WMULT_CONST	(~0U)
 #define WMULT_SHIFT	32
 
@@ -1112,7 +1349,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 static unsigned long weighted_cpuload(const int cpu);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
-static unsigned long capacity_of(int cpu);
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
 /* Cached statistics for all CPUs within a node */
@@ -4650,7 +4886,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		  int this_cpu, int sd_flag)
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
-	unsigned long min_load = ULONG_MAX, this_load = 0;
+	unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX, capacity;
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
@@ -4659,7 +4895,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
 	do {
 		unsigned long load, avg_load;
-		int local_group;
+		int local_group, packing_cpus = 0;
 		int i;
 
 		/* Skip over this group if it has no CPUs allowed */
@@ -4681,10 +4917,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 				load = target_load(i, load_idx);
 
 			avg_load += load;
+
+			if (is_packing_cpu(i))
+				packing_cpus = 1;
 		}
 
-		/* Adjust by relative CPU capacity of the group */
-		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
+		if (!packing_cpus)
+			continue;
+
+		if (sd_flag & SD_BALANCE_WAKE)
+			capacity = group->sgc->capacity;
+		else
+			capacity = group->sgc->capacity_orig;
+
+		/* Adjust by relative CPU capacity of the group */
+		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / capacity;
 
 		if (local_group) {
 			this_load = avg_load;
@@ -4763,7 +5010,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	/*
 	 * If the previous cpu is cache affine and idle, don't be stupid.
 	 */
-	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+	if (i != target && cpus_share_cache(i, target) && idle_cpu(i)
+			&& is_packing_cpu(i))
 		return i;
 
 	/*
@@ -4778,7 +5026,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
 			goto next;
 
 		for_each_cpu(i, sched_group_cpus(sg)) {
-			if (i == target || !idle_cpu(i))
+			if (i == target || !idle_cpu(i)
+					|| !is_packing_cpu(i))
 				goto next;
 		}
 
@@ -4865,9 +5114,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 			sd = tmp;
 	}
 
-	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+	if (affine_sd && cpu != prev_cpu && (wake_affine(affine_sd, p, sync) ||
+			!is_packing_cpu(prev_cpu)))
 		prev_cpu = cpu;
 
+	if (!is_packing_cpu(prev_cpu))
+		prev_cpu = get_buddy(prev_cpu);
+
 	if (sd_flag & SD_BALANCE_WAKE) {
 		new_cpu = select_idle_sibling(p, prev_cpu);
 		goto unlock;
@@ -5958,11 +6211,13 @@ struct sg_lb_stats {
 	unsigned long load_per_task;
 	unsigned long group_capacity;
 	unsigned long group_usage;	/* Total usage of the group */
+	unsigned long max_usage;
 	unsigned int sum_nr_running;	/* Nr tasks running in the group */
 	unsigned int idle_cpus;
 	unsigned int group_weight;
 	enum group_type group_type;
 	int group_no_capacity;
+	int cpu_out_perf;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
@@ -5977,6 +6232,7 @@ struct sd_lb_stats {
 	struct sched_group *busiest;	/* Busiest group in this sd */
 	struct sched_group *local;	/* Local group in this sd */
 	unsigned long total_load;	/* Total load of all groups in sd */
+	unsigned long total_usage;	/* Total usage of all groups in sd */
 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
 	unsigned long avg_load;	/* Average load across all groups in sd */
 
@@ -6096,6 +6352,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 	capacity >>= SCHED_CAPACITY_SHIFT;
 
 	cpu_rq(cpu)->cpu_capacity_orig = capacity;
+	sdg->sgc->capacity_orig = capacity;
 
 	capacity *= scale_rt_capacity(cpu);
 	capacity >>= SCHED_CAPACITY_SHIFT;
@@ -6103,6 +6360,9 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 	if (!capacity)
 		capacity = 1;
 
+	if (!is_packing_cpu(cpu))
+		capacity = 1;
+
 	cpu_rq(cpu)->cpu_capacity = capacity;
 	sdg->sgc->capacity = capacity;
 }
@@ -6111,7 +6371,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
-	unsigned long capacity;
+	unsigned long capacity, capacity_orig;
 	unsigned long interval;
 
 	interval = msecs_to_jiffies(sd->balance_interval);
@@ -6123,7 +6383,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 		return;
 	}
 
-	capacity = 0;
+	capacity_orig = capacity = 0;
 
 	if (child->flags & SD_OVERLAP) {
 		/*
@@ -6147,6 +6407,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 			 * causing divide-by-zero issues on boot.
 			 */
 			if (unlikely(!rq->sd)) {
+				capacity_orig += capacity_orig_of(cpu);
 				capacity += capacity_of(cpu);
 				continue;
 			}
@@ -6162,11 +6423,13 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
 		group = child->groups;
 		do {
+			capacity_orig += group->sgc->capacity_orig;
 			capacity += group->sgc->capacity;
 			group = group->next;
 		} while (group != child->groups);
 	}
 
+	sdg->sgc->capacity_orig = capacity_orig;
 	sdg->sgc->capacity = capacity;
 }
 
@@ -6267,6 +6530,9 @@ static enum group_type group_classify(struct lb_env *env,
 	if (sgs->group_no_capacity)
 		return group_overloaded;
 
+	if (sgs->cpu_out_perf)
+		return group_overloaded;
+
 	if (sg_imbalanced(group))
 		return group_imbalanced;
 
@@ -6294,6 +6560,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
+		unsigned long usage = get_cpu_usage(i);
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
@@ -6302,9 +6569,15 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			load = source_load(i, load_idx);
 
 		sgs->group_load += load;
-		sgs->group_usage += get_cpu_usage(i);
+		sgs->group_usage += usage;
 		sgs->sum_nr_running += rq->cfs.h_nr_running;
 
+		if ((local_group) || (check_cpu_perf(rq, group))) {
+			if (usage > sgs->max_usage)
+				sgs->max_usage = usage;
+			sgs->cpu_out_perf = true;
+		}
+
		if (rq->nr_running > 1)
 			*overload = true;
 
@@ -6409,7 +6682,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-/**
+/*
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
 * @sds: variable to hold the statistics for this sched_domain.
@@ -6460,7 +6733,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 		if (prefer_sibling && sds->local &&
 		    group_has_capacity(env, &sds->local_stat) &&
 		    (sgs->sum_nr_running > 1)) {
-			sgs->group_no_capacity = 1;
+			sgs->group_no_capacity = true;
 			sgs->group_type = group_overloaded;
 		}
 
@@ -6473,7 +6746,7 @@ next_group:
 		/* Now, start updating sd_lb_stats */
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
-
+		sds->total_usage += sgs->group_usage;
 		sg = sg->next;
 	} while (sg != env->sd->groups);
 
@@ -6681,6 +6954,39 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 		return fix_small_imbalance(env, sds);
 }
 
+#ifdef CONFIG_SCHED_PACKING_TASKS
+static bool update_sd_lb_packing(struct lb_env *env, struct sd_lb_stats *sds)
+{
+	bool update;
+	int old, new;
+	struct sched_domain *sd = env->sd;
+
+	/* Update the list of packing CPUs */
+	update = update_packing_buddy(env->dst_cpu, env->sd, sds->total_usage,
+			max(sds->busiest_stat.max_usage,
+			    sds->local_stat.max_usage));
+
+	/* This CPU doesn't take part in aggressive packing */
+	if (!is_packing_cpu(env->dst_cpu))
+		sds->busiest = NULL;
+
+	/*
+	 * Update the cpu capacity and the child group capacity so the
+	 * next load-balance round will get an updated capacity.
+	 */
+	if (update && sd->child) {
+		old = sd->child->groups->sgc->capacity - capacity_of(env->dst_cpu);
+		update_group_capacity(env->dst_rq->sd, env->dst_cpu);
+		new = capacity_of(env->dst_cpu);
+		sd->child->groups->sgc->capacity = old + new;
+	}
+
+	return update;
+}
+
+#else
+static bool update_sd_lb_packing(struct lb_env *env, struct sd_lb_stats *sds)
+{
+	return false;
+}
+#endif
+
 /******* find_busiest_group() helpers end here *********************/
 
 /**
@@ -6705,6 +7011,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	struct sg_lb_stats *local, *busiest;
 	struct sd_lb_stats sds;
 
+update:
 	init_sd_lb_stats(&sds);
 
 	/*
@@ -6712,6 +7019,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	 * this level.
 	 */
 	update_sd_lb_stats(env, &sds);
+
+	/*
+	 * Update the involvement of the CPU in the packing effort
+	 */
+	if (update_sd_lb_packing(env, &sds))
+		goto update;
+
 	local = &sds.local_stat;
 	busiest = &sds.busiest_stat;
 
@@ -6843,7 +7156,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		 */
 		if (rq->nr_running == 1 && wl > env->imbalance &&
-		    !check_cpu_capacity(rq, env->sd))
+		    !check_cpu_capacity(rq, env->sd) && !check_cpu_perf(rq, group))
 			continue;
 
 		/*
@@ -6857,6 +7170,24 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		 * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
 		 * our previous maximum.
 		 */
+		if (!busiest) {
+			busiest_load = wl;
+			busiest_capacity = capacity;
+			busiest = rq;
+			continue;
+		}
+
+		if (check_cpu_perf(rq, group) && !check_cpu_perf(busiest, group)) {
+			busiest_load = wl;
+			busiest_capacity = capacity;
+			busiest = rq;
+			continue;
+		}
+
+		if (!check_cpu_perf(rq, group) && check_cpu_perf(busiest, group)) {
+			continue;
+		}
+
 		if (wl * busiest_capacity > busiest_load * capacity) {
 			busiest_load = wl;
 			busiest_capacity = capacity;
@@ -6876,7 +7207,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 /* Working cpumask for load_balance and load_balance_newidle. */
 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 
-static int need_active_balance(struct lb_env *env)
+static int need_active_balance(struct lb_env *env, struct sched_group *group)
 {
 	struct sched_domain *sd = env->sd;
 
@@ -6913,6 +7244,9 @@ static int need_active_balance(struct lb_env *env)
 		return 1;
 	}
 
+	if (check_cpu_perf(env->src_rq, group))
+		return 1;
+
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -6931,6 +7265,10 @@ static int should_we_balance(struct lb_env *env)
 	if (env->idle == CPU_NEWLY_IDLE)
 		return 1;
 
+	/* The leader CPU must be used to update the packing CPUs list */
+	if (is_leader_cpu(env->dst_cpu, env->sd))
+		return 1;
+
 	sg_cpus = sched_group_cpus(sg);
 	sg_mask = sched_group_mask(sg);
 	/* Try to find first idle cpu */
@@ -7127,7 +7465,7 @@ more_balance:
 		if (idle != CPU_NEWLY_IDLE)
 			sd->nr_balance_failed++;
 
-		if (need_active_balance(&env)) {
+		if (need_active_balance(&env, group)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,
@@ -7186,6 +7524,9 @@ more_balance:
 	goto out;
 
 out_balanced:
+	/* We were balanced, so reset the balancing interval */
+	sd->balance_interval = sd->min_interval;
+
 	/*
 	 * We reach balance although we may have faced some affinity
 	 * constraints. Clear the imbalance flag if it was set.
@@ -7443,7 +7784,28 @@ static struct {
 
 static inline int find_new_ilb(void)
 {
+	struct sched_domain *sd;
 	int ilb = cpumask_first(nohz.idle_cpus_mask);
+	int buddy = get_buddy(smp_processor_id());
+
+	/*
+	 * If we have a pack buddy CPU, we try to run load balance on a CPU
+	 * that is close to the buddy.
+	 */
+	if (buddy < nr_cpu_ids) {
+		rcu_read_lock();
+		for_each_domain(buddy, sd) {
+			if (sd->flags & SD_SHARE_CPUCAPACITY)
+				continue;
+
+			ilb = cpumask_first_and(sched_domain_span(sd),
+					nohz.idle_cpus_mask);
+
+			if (ilb < nr_cpu_ids)
+				break;
+		}
+		rcu_read_unlock();
+	}
 
 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
 		return ilb;
@@ -7760,9 +8122,15 @@ static inline bool nohz_kick_needed(struct rq *rq)
 	if (time_before(now, nohz.next_balance))
 		return false;
 
+	return true;
+
 	if (rq->nr_running >= 2)
 		return true;
 
+	/* This cpu doesn't contribute to the packing effort */
+	if (check_nohz_packing(cpu))
+		return true;
+
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
 	if (sd) {
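The pack_thres and perf_thres fields read from sgc in update_packing_buddy() and check_cpu_perf() above behave as fixed-point fractions with a denominator of 1024 (Q10): (capacity_orig * thres) >> 10 yields the share of a group's original capacity that packing may consume, and usage * 1024 > capacity_orig * perf_thres is the same ratio test rearranged to stay division-free. A minimal sketch of that arithmetic follows; the threshold and usage values are assumptions for the demo, since the diff does not show how the thresholds are initialized.

/*
 * Q10 fixed-point threshold arithmetic, as in the patch above:
 * a threshold of N means N/1024 of the group's original capacity.
 * Threshold and usage values below are invented for the demo.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

int main(void)
{
	unsigned long capacity_orig = 2 * SCHED_CAPACITY_SCALE; /* two full cores */
	unsigned long pack_thres = 819;	/* ~80% in Q10: 819/1024 */
	unsigned long perf_thres = 922;	/* ~90% in Q10: 922/1024 */
	unsigned long usage = 1900;	/* current usage, same scale */

	/* share of the group usable for packing: (cap * thres) >> 10 */
	unsigned long vcapacity = (capacity_orig * pack_thres) >> 10;
	printf("packing capacity: %lu of %lu\n", vcapacity, capacity_orig);

	/* check_cpu_perf()-style test, division-free:
	 * usage/cap > perf_thres/1024  <=>  usage*1024 > cap*perf_thres */
	printf("out of perf range: %d\n",
	       usage * 1024 > capacity_orig * perf_thres);
	return 0;
}

With these numbers, 1638 of 2048 capacity units are available for packing and the perf test fires, which is the condition group_classify() above maps to group_overloaded.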
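Note also the build pattern that keeps the feature optional: every packing helper called from generic scheduler paths has a static inline stub in the #else branch of CONFIG_SCHED_PACKING_TASKS, so call sites compile unchanged and the compiler folds the dead packing branches away when the option is off. A generic illustration of the idiom, with invented names and a hypothetical FEATURE_PACKING macro standing in for the real config option:

/*
 * Config-gated stub idiom: build with -DFEATURE_PACKING to enable the
 * "real" helpers; without it, the inline stubs keep every call site
 * unconditional. All names here are invented for the demo.
 */
#include <stdbool.h>
#include <stdio.h>

#ifdef FEATURE_PACKING
static inline bool is_packing_cpu(int cpu)
{
	return cpu == 0;	/* stand-in for the real per-CPU lookup */
}
static inline int get_buddy(int cpu)
{
	(void)cpu;
	return 0;		/* stand-in: pack everything on CPU0 */
}
#else
/* Feature disabled: stubs make the packing branches dead code. */
static inline bool is_packing_cpu(int cpu) { (void)cpu; return true; }
static inline int get_buddy(int cpu) { return cpu; }
#endif

int main(void)
{
	int cpu = 3;

	/* The call site never mentions the config option. */
	if (!is_packing_cpu(cpu))
		cpu = get_buddy(cpu);

	printf("run on CPU%d\n", cpu);
	return 0;
}

Compiled with -DFEATURE_PACKING the wakeup is redirected to the buddy CPU; without it, the same call site reduces to a no-op, which is why the patch can sprinkle is_packing_cpu() checks through hot paths like select_task_rq_fair() without a cost when the feature is disabled.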