diff options
author | Daniel Lezcano <daniel.lezcano@linaro.org> | 2014-11-20 15:07:41 +0100 |
---|---|---|
committer | Daniel Lezcano <daniel.lezcano@linaro.org> | 2014-11-20 15:07:41 +0100 |
commit | b9ed965defe8f8ee2e57e0a9372c70b883a4ea3f (patch) | |
tree | 4c72d3a7bc6bd00229bce96b932626f5bb687434 | |
parent | 5f0dea804d380d0fa57c0f69a8e961b058be998f (diff) | |
parent | 7c2684baf9042237fa3f72c9f230637911fe9b53 (diff) |
Merge remote-tracking branch 'eas-next/eas-next' into sched/cpuidle/eas-nexteas-next-20141120sched/cpuidle/eas-next
Conflicts:
include/linux/sched.h
kernel/sched/Makefile
35 files changed, 1705 insertions, 522 deletions
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 7024c12f7bfe..8f3271842533 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -30,9 +30,6 @@ static __always_inline void preempt_count_set(int pc) /* * must be macros to avoid header recursion hell */ -#define task_preempt_count(p) \ - (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED) - #define init_task_preempt_count(p) do { \ task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ } while (0) @@ -105,6 +102,7 @@ static __always_inline bool should_resched(void) # ifdef CONFIG_CONTEXT_TRACKING extern asmlinkage void ___preempt_schedule_context(void); # define __preempt_schedule_context() asm ("call ___preempt_schedule_context") + extern asmlinkage void preempt_schedule_context(void); # endif #endif diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 3489f8f5fada..6520ff6461e1 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE Be aware that not all cpufreq drivers support the conservative governor. If unsure have a look at the help section of the driver. Fallback governor will be the performance governor. + +config CPU_FREQ_DEFAULT_GOV_ENERGY_MODEL + bool "energy_model" + select CPU_FREQ_GOV_ENERGY_MODEL + select CPU_FREQ_GOV_PERFORMANCE + help + Use the CPUfreq governor 'energy_model' as default. This + scales cpu frequency from the scheduler as per-task statistics + are updated. endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +192,18 @@ config CPU_FREQ_GOV_CONSERVATIVE If in doubt, say N. +config CPU_FREQ_GOV_ENERGY_MODEL + tristate "'energy model' cpufreq governor" + depends on CPU_FREQ + select CPU_FREQ_GOV_COMMON + help + 'energy_model' - this governor scales cpu frequency from the + scheduler as a function of cpu utilization. It does not + evaluate utilization on a periodic basis (unlike ondemand) but + instead is invoked from CFS when updating per-task statistics. + + If in doubt, say N. + config CPUFREQ_DT tristate "Generic DT based cpufreq driver" depends on HAVE_CLK && OF diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 89c4cee253e3..fb3f519b803f 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -2123,7 +2123,7 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, { struct n_tty_data *ldata = tty->disc_data; unsigned char __user *b = buf; - DECLARE_WAITQUEUE(wait, current); + DEFINE_WAIT_FUNC(wait, woken_wake_function); int c; int minimum, time; ssize_t retval = 0; @@ -2186,10 +2186,6 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, nr--; break; } - /* This statement must be first before checking for input - so that any interrupt will set the state back to - TASK_RUNNING. */ - set_current_state(TASK_INTERRUPTIBLE); if (((minimum - (b - buf)) < ldata->minimum_to_wake) && ((minimum - (b - buf)) >= 1)) @@ -2220,13 +2216,13 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, n_tty_set_room(tty); up_read(&tty->termios_rwsem); - timeout = schedule_timeout(timeout); + timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, + timeout); down_read(&tty->termios_rwsem); continue; } } - __set_current_state(TASK_RUNNING); /* Deal with packet mode. */ if (packet && b == buf) { @@ -2273,7 +2269,6 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, mutex_unlock(&ldata->atomic_read_lock); - __set_current_state(TASK_RUNNING); if (b - buf) retval = b - buf; @@ -2306,7 +2301,7 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file, const unsigned char *buf, size_t nr) { const unsigned char *b = buf; - DECLARE_WAITQUEUE(wait, current); + DEFINE_WAIT_FUNC(wait, woken_wake_function); int c; ssize_t retval = 0; @@ -2324,7 +2319,6 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file, add_wait_queue(&tty->write_wait, &wait); while (1) { - set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) { retval = -ERESTARTSYS; break; @@ -2378,12 +2372,11 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file, } up_read(&tty->termios_rwsem); - schedule(); + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); down_read(&tty->termios_rwsem); } break_out: - __set_current_state(TASK_RUNNING); remove_wait_queue(&tty->write_wait, &wait); if (b - buf != nr && tty->fasync) set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index daf76652fe58..283aa312d745 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -227,14 +227,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf, struct fsnotify_event *kevent; char __user *start; int ret; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); start = buf; group = file->private_data; + add_wait_queue(&group->notification_waitq, &wait); while (1) { - prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); - mutex_lock(&group->notification_mutex); kevent = get_one_event(group, count); mutex_unlock(&group->notification_mutex); @@ -264,10 +263,10 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (start != buf) break; - schedule(); + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } + remove_wait_queue(&group->notification_waitq, &wait); - finish_wait(&group->notification_waitq, &wait); if (start != buf && ret != -EFAULT) ret = buf - start; return ret; diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index 1cd3f5d767a8..eb6f9e6c3075 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -23,9 +23,6 @@ static __always_inline void preempt_count_set(int pc) /* * must be macros to avoid header recursion hell */ -#define task_preempt_count(p) \ - (task_thread_info(p)->preempt_count & ~PREEMPT_NEED_RESCHED) - #define init_task_preempt_count(p) do { \ task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ } while (0) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 503b085b7832..9ae4b27c2b4b 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -115,6 +115,9 @@ struct cpufreq_policy { /* For cpufreq driver's internal use */ void *driver_data; + + /* For cpufreq governor's internal use */ + void *gov_data; }; /* Only for ACPI */ @@ -481,6 +484,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_ENERGY_MODEL) +extern struct cpufreq_governor cpufreq_gov_energy_model; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_energy_model) #endif /********************************************************************* diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 7fd81b8c4897..6b7fd9cf5ea2 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -246,15 +246,6 @@ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, * defined in <linux/wait.h> */ -#define wait_event_freezekillable(wq, condition) \ -({ \ - int __retval; \ - freezer_do_not_count(); \ - __retval = wait_event_killable(wq, (condition)); \ - freezer_count(); \ - __retval; \ -}) - /* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ #define wait_event_freezekillable_unsafe(wq, condition) \ ({ \ @@ -265,35 +256,6 @@ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, __retval; \ }) -#define wait_event_freezable(wq, condition) \ -({ \ - int __retval; \ - freezer_do_not_count(); \ - __retval = wait_event_interruptible(wq, (condition)); \ - freezer_count(); \ - __retval; \ -}) - -#define wait_event_freezable_timeout(wq, condition, timeout) \ -({ \ - long __retval = timeout; \ - freezer_do_not_count(); \ - __retval = wait_event_interruptible_timeout(wq, (condition), \ - __retval); \ - freezer_count(); \ - __retval; \ -}) - -#define wait_event_freezable_exclusive(wq, condition) \ -({ \ - int __retval; \ - freezer_do_not_count(); \ - __retval = wait_event_interruptible_exclusive(wq, condition); \ - freezer_count(); \ - __retval; \ -}) - - #else /* !CONFIG_FREEZER */ static inline bool frozen(struct task_struct *p) { return false; } static inline bool freezing(struct task_struct *p) { return false; } @@ -331,18 +293,6 @@ static inline void set_freezable(void) {} #define freezable_schedule_hrtimeout_range(expires, delta, mode) \ schedule_hrtimeout_range(expires, delta, mode) -#define wait_event_freezable(wq, condition) \ - wait_event_interruptible(wq, condition) - -#define wait_event_freezable_timeout(wq, condition, timeout) \ - wait_event_interruptible_timeout(wq, condition, timeout) - -#define wait_event_freezable_exclusive(wq, condition) \ - wait_event_interruptible_exclusive(wq, condition) - -#define wait_event_freezekillable(wq, condition) \ - wait_event_killable(wq, condition) - #define wait_event_freezekillable_unsafe(wq, condition) \ wait_event_killable(wq, condition) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3d770f5564b8..446d76a87ba1 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -162,6 +162,7 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + void ___might_sleep(const char *file, int line, int preempt_offset); void __might_sleep(const char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep @@ -175,10 +176,14 @@ extern int _cond_resched(void); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) +# define sched_annotate_sleep() __set_current_state(TASK_RUNNING) #else + static inline void ___might_sleep(const char *file, int line, + int preempt_offset) { } static inline void __might_sleep(const char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) +# define sched_annotate_sleep() do { } while (0) #endif #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 50f8c6343f17..fc3a7cf107ec 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -243,6 +243,43 @@ extern char ___assert_task_state[1 - 2*!!( ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ (task->flags & PF_FROZEN) == 0) +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + +#define __set_task_state(tsk, state_value) \ + do { \ + (tsk)->task_state_change = _THIS_IP_; \ + (tsk)->state = (state_value); \ + } while (0) +#define set_task_state(tsk, state_value) \ + do { \ + (tsk)->task_state_change = _THIS_IP_; \ + set_mb((tsk)->state, (state_value)); \ + } while (0) + +/* + * set_current_state() includes a barrier so that the write of current->state + * is correctly serialised wrt the caller's subsequent test of whether to + * actually sleep: + * + * set_current_state(TASK_UNINTERRUPTIBLE); + * if (do_i_need_to_sleep()) + * schedule(); + * + * If the caller does not need such serialisation then use __set_current_state() + */ +#define __set_current_state(state_value) \ + do { \ + current->task_state_change = _THIS_IP_; \ + current->state = (state_value); \ + } while (0) +#define set_current_state(state_value) \ + do { \ + current->task_state_change = _THIS_IP_; \ + set_mb(current->state, (state_value)); \ + } while (0) + +#else + #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) #define set_task_state(tsk, state_value) \ @@ -259,11 +296,13 @@ extern char ___assert_task_state[1 - 2*!!( * * If the caller does not need such serialisation then use __set_current_state() */ -#define __set_current_state(state_value) \ +#define __set_current_state(state_value) \ do { current->state = (state_value); } while (0) -#define set_current_state(state_value) \ +#define set_current_state(state_value) \ set_mb(current->state, (state_value)) +#endif + /* Task command name length */ #define TASK_COMM_LEN 16 @@ -1072,15 +1111,28 @@ struct load_weight { }; struct sched_avg { + u64 last_runnable_update; + s64 decay_count; + /* + * utilization_avg_contrib describes the amount of time that a + * sched_entity is running on a CPU. It is based on running_avg_sum + * and is scaled in the range [0..SCHED_LOAD_SCALE]. + * load_avg_contrib described the amount of time that a sched_entity + * is runnable on a rq. It is based on both runnable_avg_sum and the + * weight of the task. + */ + unsigned long load_avg_contrib, utilization_avg_contrib; /* * These sums represent an infinite geometric series and so are bound * above by 1024/(1-y). Thus we only need a u32 to store them for all * choices of y < 1-2^(-32)*1024. + * running_avg_sum reflects the time that the sched_entity is + * effectively running on the CPU. + * runnable_avg_sum represents the amount of time a sched_entity is on + * a runqueue which includes the running time that is monitored by + * running_avg_sum. */ - u32 runnable_avg_sum, runnable_avg_period; - u64 last_runnable_update; - s64 decay_count; - unsigned long load_avg_contrib; + u32 runnable_avg_sum, avg_period, running_avg_sum; }; #ifdef CONFIG_SCHEDSTATS @@ -1576,28 +1628,23 @@ struct task_struct { struct numa_group *numa_group; /* - * Exponential decaying average of faults on a per-node basis. - * Scheduling placement decisions are made based on the these counts. - * The values remain static for the duration of a PTE scan + * numa_faults is an array split into four regions: + * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer + * in this precise order. + * + * faults_memory: Exponential decaying average of faults on a per-node + * basis. Scheduling placement decisions are made based on these + * counts. The values remain static for the duration of a PTE scan. + * faults_cpu: Track the nodes the process was running on when a NUMA + * hinting fault was incurred. + * faults_memory_buffer and faults_cpu_buffer: Record faults per node + * during the current scan window. When the scan completes, the counts + * in faults_memory and faults_cpu decay and these values are copied. */ - unsigned long *numa_faults_memory; + unsigned long *numa_faults; unsigned long total_numa_faults; /* - * numa_faults_buffer records faults per node during the current - * scan window. When the scan completes, the counts in - * numa_faults_memory decay and these values are copied. - */ - unsigned long *numa_faults_buffer_memory; - - /* - * Track the nodes the process was running on when a NUMA hinting - * fault was incurred. - */ - unsigned long *numa_faults_cpu; - unsigned long *numa_faults_buffer_cpu; - - /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local. The task scan period is adapted * based on the locality of the faults with different weights @@ -1682,6 +1729,9 @@ struct task_struct { #ifdef CONFIG_SCHED_IO_LATENCY struct io_latency_node io_latency; #endif +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + unsigned long task_state_change; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ @@ -2073,6 +2123,10 @@ static inline void tsk_restore_flags(struct task_struct *task, task->flags |= orig_flags & flags; } +extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial); +extern int task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed); #ifdef CONFIG_SMP extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); @@ -2781,7 +2835,7 @@ static inline int signal_pending_state(long state, struct task_struct *p) extern int _cond_resched(void); #define cond_resched() ({ \ - __might_sleep(__FILE__, __LINE__, 0); \ + ___might_sleep(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) @@ -2794,14 +2848,14 @@ extern int __cond_resched_lock(spinlock_t *lock); #endif #define cond_resched_lock(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ __cond_resched_lock(lock); \ }) extern int __cond_resched_softirq(void); #define cond_resched_softirq() ({ \ - __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ + ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ __cond_resched_softirq(); \ }) diff --git a/include/linux/wait.h b/include/linux/wait.h index e4a8eb9312ea..2232ed16635a 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -13,9 +13,12 @@ typedef struct __wait_queue wait_queue_t; typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key); int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); +/* __wait_queue::flags */ +#define WQ_FLAG_EXCLUSIVE 0x01 +#define WQ_FLAG_WOKEN 0x02 + struct __wait_queue { unsigned int flags; -#define WQ_FLAG_EXCLUSIVE 0x01 void *private; wait_queue_func_t func; struct list_head task_list; @@ -258,11 +261,37 @@ __out: __ret; \ */ #define wait_event(wq, condition) \ do { \ + might_sleep(); \ if (condition) \ break; \ __wait_event(wq, condition); \ } while (0) +#define __wait_event_freezable(wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ + schedule(); try_to_freeze()) + +/** + * wait_event - sleep (or freeze) until a condition gets true + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute + * to system load) until the @condition evaluates to true. The + * @condition is checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + */ +#define wait_event_freezable(wq, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_freezable(wq, condition); \ + __ret; \ +}) + #define __wait_event_timeout(wq, condition, timeout) \ ___wait_event(wq, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ @@ -290,11 +319,30 @@ do { \ #define wait_event_timeout(wq, condition, timeout) \ ({ \ long __ret = timeout; \ + might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_timeout(wq, condition, timeout); \ __ret; \ }) +#define __wait_event_freezable_timeout(wq, condition, timeout) \ + ___wait_event(wq, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret); try_to_freeze()) + +/* + * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid + * increasing load and is freezable. + */ +#define wait_event_freezable_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_freezable_timeout(wq, condition, timeout); \ + __ret; \ +}) + #define __wait_event_cmd(wq, condition, cmd1, cmd2) \ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) @@ -315,6 +363,7 @@ do { \ */ #define wait_event_cmd(wq, condition, cmd1, cmd2) \ do { \ + might_sleep(); \ if (condition) \ break; \ __wait_event_cmd(wq, condition, cmd1, cmd2); \ @@ -342,6 +391,7 @@ do { \ #define wait_event_interruptible(wq, condition) \ ({ \ int __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible(wq, condition); \ __ret; \ @@ -375,6 +425,7 @@ do { \ #define wait_event_interruptible_timeout(wq, condition, timeout) \ ({ \ long __ret = timeout; \ + might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_interruptible_timeout(wq, \ condition, timeout); \ @@ -425,6 +476,7 @@ do { \ #define wait_event_hrtimeout(wq, condition, timeout) \ ({ \ int __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_UNINTERRUPTIBLE); \ @@ -450,6 +502,7 @@ do { \ #define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ ({ \ long __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_INTERRUPTIBLE); \ @@ -463,12 +516,27 @@ do { \ #define wait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible_exclusive(wq, condition);\ __ret; \ }) +#define __wait_event_freezable_exclusive(wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ + schedule(); try_to_freeze()) + +#define wait_event_freezable_exclusive(wq, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_freezable_exclusive(wq, condition);\ + __ret; \ +}) + + #define __wait_event_interruptible_locked(wq, condition, exclusive, irq) \ ({ \ int __ret = 0; \ @@ -637,6 +705,7 @@ do { \ #define wait_event_killable(wq, condition) \ ({ \ int __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable(wq, condition); \ __ret; \ @@ -830,6 +899,8 @@ void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int sta long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key); +long wait_woken(wait_queue_t *wait, unsigned mode, long timeout); +int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); @@ -886,6 +957,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *); static inline int wait_on_bit(void *word, int bit, unsigned mode) { + might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, @@ -910,6 +982,7 @@ wait_on_bit(void *word, int bit, unsigned mode) static inline int wait_on_bit_io(void *word, int bit, unsigned mode) { + might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, @@ -936,6 +1009,7 @@ wait_on_bit_io(void *word, int bit, unsigned mode) static inline int wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) { + might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, action, mode); @@ -963,6 +1037,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode static inline int wait_on_bit_lock(void *word, int bit, unsigned mode) { + might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); @@ -986,6 +1061,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode) static inline int wait_on_bit_lock_io(void *word, int bit, unsigned mode) { + might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); @@ -1011,6 +1087,7 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode) static inline int wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) { + might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, action, mode); @@ -1029,6 +1106,7 @@ wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned static inline int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) { + might_sleep(); if (atomic_read(val) == 0) return 0; return out_of_line_wait_on_atomic_t(val, action, mode); diff --git a/include/net/sock.h b/include/net/sock.h index 7db3db112baa..e6f235ebf6c9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -897,6 +897,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) if (!__rc) { \ *(__timeo) = schedule_timeout(*(__timeo)); \ } \ + sched_annotate_sleep(); \ lock_sock(__sk); \ __rc = __condition; \ __rc; \ diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 0a68d5ae584e..30fedaf3e56a 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -97,16 +97,19 @@ static inline long __trace_sched_switch_state(struct task_struct *p) long state = p->state; #ifdef CONFIG_PREEMPT +#ifdef CONFIG_SCHED_DEBUG + BUG_ON(p != current); +#endif /* CONFIG_SCHED_DEBUG */ /* * For all intents and purposes a preempted task is a running task. */ - if (task_preempt_count(p) & PREEMPT_ACTIVE) + if (preempt_count() & PREEMPT_ACTIVE) state = TASK_RUNNING | TASK_STATE_MAX; -#endif +#endif /* CONFIG_PREEMPT */ return state; } -#endif +#endif /* CREATE_TRACE_POINTS */ /* * Tracepoint for task switches, performed by the scheduler: diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 34f9d7387d13..cc89ddefa926 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -13,7 +13,7 @@ #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ #define CLONE_THREAD 0x00010000 /* Same thread group? */ -#define CLONE_NEWNS 0x00020000 /* New namespace group? */ +#define CLONE_NEWNS 0x00020000 /* New mount namespace group */ #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ @@ -23,8 +23,8 @@ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) and is now available for re-use. */ -#define CLONE_NEWUTS 0x04000000 /* New utsname group? */ -#define CLONE_NEWIPC 0x08000000 /* New ipcs */ +#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ #define CLONE_NEWPID 0x20000000 /* New pid namespace */ #define CLONE_NEWNET 0x40000000 /* New network namespace */ diff --git a/kernel/audit.c b/kernel/audit.c index 80983df92cd4..32bfc43ffb9a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy) set_freezable(); while (!kthread_should_stop()) { struct sk_buff *skb; - DECLARE_WAITQUEUE(wait, current); flush_hold_queue(); @@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy) audit_printk_skb(skb); continue; } - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kauditd_wait, &wait); - if (!skb_queue_len(&audit_skb_queue)) { - try_to_freeze(); - schedule(); - } - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kauditd_wait, &wait); + wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue)); } return 0; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5664985c46a0..937ecdfdf258 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -107,46 +107,6 @@ void context_tracking_user_enter(void) } NOKPROBE_SYMBOL(context_tracking_user_enter); -#ifdef CONFIG_PREEMPT -/** - * preempt_schedule_context - preempt_schedule called by tracing - * - * The tracing infrastructure uses preempt_enable_notrace to prevent - * recursion and tracing preempt enabling caused by the tracing - * infrastructure itself. But as tracing can happen in areas coming - * from userspace or just about to enter userspace, a preempt enable - * can occur before user_exit() is called. This will cause the scheduler - * to be called when the system is still in usermode. - * - * To prevent this, the preempt_enable_notrace will use this function - * instead of preempt_schedule() to exit user context if needed before - * calling the scheduler. - */ -asmlinkage __visible void __sched notrace preempt_schedule_context(void) -{ - enum ctx_state prev_ctx; - - if (likely(!preemptible())) - return; - - /* - * Need to disable preemption in case user_exit() is traced - * and the tracer calls preempt_enable_notrace() causing - * an infinite recursion. - */ - preempt_disable_notrace(); - prev_ctx = exception_enter(); - preempt_enable_no_resched_notrace(); - - preempt_schedule(); - - preempt_disable_notrace(); - exception_exit(prev_ctx); - preempt_enable_notrace(); -} -EXPORT_SYMBOL_GPL(preempt_schedule_context); -#endif /* CONFIG_PREEMPT */ - /** * context_tracking_user_exit - Inform the context tracking that the CPU is * exiting userspace mode and entering the kernel. diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1f107c74087b..723cfc9d0ad7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) goto out; } + /* + * We can't shrink if we won't have enough room for SCHED_DEADLINE + * tasks. + */ + ret = -EBUSY; + if (is_cpu_exclusive(cur) && + !cpuset_cpumask_can_shrink(cur->cpus_allowed, + trial->cpus_allowed)) + goto out; + ret = 0; out: rcu_read_unlock(); @@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, goto out_unlock; cgroup_taskset_for_each(task, tset) { - /* - * Kthreads which disallow setaffinity shouldn't be moved - * to a new cpuset; we don't want to change their cpu - * affinity and isolating such threads by their set of - * allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads. This prevents checking for - * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. - */ - ret = -EINVAL; - if (task->flags & PF_NO_SETAFFINITY) + ret = task_can_attach(task, cs->cpus_allowed); + if (ret) goto out_unlock; ret = security_task_setscheduler(task); if (ret) diff --git a/kernel/exit.c b/kernel/exit.c index 123419be6f84..8e4e75d5efaa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -998,6 +998,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); + if ((exit_code & 0x7f) == 0) { why = CLD_EXITED; status = exit_code >> 8; @@ -1080,6 +1082,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * thread can reap it because we its state == DEAD/TRACE. */ read_unlock(&tasklist_lock); + sched_annotate_sleep(); retval = wo->wo_rusage ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; @@ -1211,6 +1214,7 @@ unlock_sig: pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); @@ -1273,6 +1277,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (!wo->wo_info) { retval = wo->wo_rusage diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index dadbf88c22c4..454195194d4a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -378,8 +378,14 @@ done: * reschedule now, before we try-lock the mutex. This avoids getting * scheduled out right after we obtained the mutex. */ - if (need_resched()) + if (need_resched()) { + /* + * We _should_ have TASK_RUNNING here, but just in case + * we do not, make it so, otherwise we might get stuck. + */ + __set_current_state(TASK_RUNNING); schedule_preempt_disabled(); + } return false; } diff --git a/kernel/module.c b/kernel/module.c index 88cec1ddb1e3..e52a8739361a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3097,6 +3097,32 @@ static int may_init_module(void) } /* + * Can't use wait_event_interruptible() because our condition + * 'finished_loading()' contains a blocking primitive itself (mutex_lock). + */ +static int wait_finished_loading(struct module *mod) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + add_wait_queue(&module_wq, &wait); + for (;;) { + if (finished_loading(mod->name)) + break; + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } + remove_wait_queue(&module_wq, &wait); + + return ret; +} + +/* * We try to place it in the list now to make sure it's unique before * we dedicate too many resources. In particular, temporary percpu * memory exhaustion. @@ -3116,8 +3142,8 @@ again: || old->state == MODULE_STATE_UNFORMED) { /* Wait in case it fails to load. */ mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); + + err = wait_finished_loading(mod); if (err) goto out_unlocked; goto again; diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 622511a4e49e..5d5380090741 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -20,4 +20,5 @@ obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_SCHED_IO_LATENCY) += io_latency.o -obj-$(CONFIG_SCHED_IDLE_DEBUG) += idle_debug.o
\ No newline at end of file +obj-$(CONFIG_SCHED_IDLE_DEBUG) += idle_debug.o +obj-$(CONFIG_CPU_FREQ_GOV_ENERGY_MODEL) += energy_model.o diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a63f4dc27909..607f852b4d04 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout); * * This waits to be signaled for completion of a specific task. It is NOT * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. + * for IO (which traditionally means blkio only). */ void __sched wait_for_completion_io(struct completion *x) { @@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io); * * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. The caller is accounted as waiting for IO. + * interruptible. The caller is accounted as waiting for IO (which traditionally + * means blkio only). * * Return: 0 if timed out, and positive (at least 1, or number of jiffies left * till timeout) if completed. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 827ddf978f38..4db493445cd6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1009,6 +1009,9 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * Can drop rq->lock because from sched_class::switched_from() methods drop it. + */ static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio) @@ -1016,6 +1019,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class != p->sched_class) { if (prev_class->switched_from) prev_class->switched_from(rq, p); + /* Possble rq->lock 'hole'. */ p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); @@ -1055,7 +1059,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_preempt_count(p) & PREEMPT_ACTIVE)); + !p->on_rq); #ifdef CONFIG_LOCKDEP /* @@ -1854,8 +1858,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults_memory = NULL; - p->numa_faults_buffer_memory = NULL; + p->numa_faults = NULL; p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; @@ -2035,25 +2038,6 @@ static inline int dl_bw_cpus(int i) } #endif -static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw -= tsk_bw; -} - -static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw += tsk_bw; -} - -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) -{ - return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; -} - /* * We must be sure that accepting a new task (or allowing changing the * parameters of an existing one) is consistent with the bandwidth @@ -2221,7 +2205,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, /** * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch * @prev: the thread we just switched away from. * * finish_task_switch must be called after the context switch, paired @@ -2233,10 +2216,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) + * + * The context switch have flipped the stack from under us and restored the + * local variables which were saved when this task called schedule() in the + * past. prev == current is still correct but we need to recalculate this_rq + * because prev may have moved to another CPU. */ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) +static struct rq *finish_task_switch(struct task_struct *prev) __releases(rq->lock) { + struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; long prev_state; @@ -2276,6 +2265,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) } tick_nohz_task_switch(current); + return rq; } #ifdef CONFIG_SMP @@ -2310,25 +2300,22 @@ static inline void post_schedule(struct rq *rq) asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); + struct rq *rq; - /* - * FIXME: do we need to worry about rq being invalidated by the - * task_switch? - */ + /* finish_task_switch() drops rq->lock and enables preemtion */ + preempt_disable(); + rq = finish_task_switch(prev); post_schedule(rq); + preempt_enable(); if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } /* - * context_switch - switch to the new MM and the new - * thread's register state. + * context_switch - switch to the new MM and the new thread's register state. */ -static inline void +static inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2367,14 +2354,9 @@ context_switch(struct rq *rq, struct task_struct *prev, context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. - */ - finish_task_switch(this_rq(), prev); + + return finish_task_switch(prev); } /* @@ -2856,15 +2838,8 @@ need_resched: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ - /* - * The context switch have flipped the stack from under us - * and restored the local variables which were saved when - * this task called schedule() in the past. prev == current - * is still correct, but it can be moved to another cpu/rq. - */ - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + rq = context_switch(rq, prev, next); /* unlocks the rq */ + cpu = cpu_of(rq); } else raw_spin_unlock_irq(&rq->lock); @@ -2952,6 +2927,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); + +#ifdef CONFIG_CONTEXT_TRACKING +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +asmlinkage __visible void __sched notrace preempt_schedule_context(void) +{ + enum ctx_state prev_ctx; + + if (likely(!preemptible())) + return; + + do { + __preempt_count_add(PREEMPT_ACTIVE); + /* + * Needs preempt disabled in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + prev_ctx = exception_enter(); + __schedule(); + exception_exit(prev_ctx); + + __preempt_count_sub(PREEMPT_ACTIVE); + barrier(); + } while (need_resched()); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_CONTEXT_TRACKING */ + #endif /* CONFIG_PREEMPT */ /* @@ -4642,6 +4658,81 @@ void init_idle(struct task_struct *idle, int cpu) #endif } +int cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial) +{ + int ret = 1, trial_cpus; + struct dl_bw *cur_dl_b; + unsigned long flags; + + rcu_read_lock_sched(); + cur_dl_b = dl_bw_of(cpumask_any(cur)); + trial_cpus = cpumask_weight(trial); + + raw_spin_lock_irqsave(&cur_dl_b->lock, flags); + if (cur_dl_b->bw != -1 && + cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + ret = 0; + raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); + rcu_read_unlock_sched(); + + return ret; +} + +int task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed) +{ + int ret = 0; + + /* + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. + */ + if (p->flags & PF_NO_SETAFFINITY) { + ret = -EINVAL; + goto out; + } + +#ifdef CONFIG_SMP + if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, + cs_cpus_allowed)) { + unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, + cs_cpus_allowed); + struct dl_bw *dl_b; + bool overflow; + int cpus; + unsigned long flags; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(dest_cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(dest_cpu); + overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + if (overflow) + ret = -EBUSY; + else { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw); + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + + } +#endif +out: + return ret; +} + #ifdef CONFIG_SMP /* * move_queued_task - move a queued task to new rq. @@ -5374,17 +5465,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - /* - * Even though we initialize ->capacity to something semi-sane, - * we leave capacity_orig unset. This allows us to detect if - * domain iteration is still funny without causing /0 traps. - */ - if (!group->sgc->capacity_orig) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); - break; - } - if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); @@ -5869,7 +5949,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the @@ -6092,7 +6171,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA static int sched_domains_numa_levels; +enum numa_topology_type sched_numa_topology_type; static int *sched_domains_numa_distance; +int sched_max_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; #endif @@ -6178,6 +6259,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) */ if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ @@ -6264,7 +6346,7 @@ static void sched_numa_warn(const char *str) printk(KERN_WARNING "\n"); } -static bool find_numa_distance(int distance) +bool find_numa_distance(int distance) { int i; @@ -6279,6 +6361,56 @@ static bool find_numa_distance(int distance) return false; } +/* + * A system can have three types of NUMA topology: + * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system + * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes + * NUMA_BACKPLANE: nodes can reach other nodes through a backplane + * + * The difference between a glueless mesh topology and a backplane + * topology lies in whether communication between not directly + * connected nodes goes through intermediary nodes (where programs + * could run), or through backplane controllers. This affects + * placement of programs. + * + * The type of topology can be discerned with the following tests: + * - If the maximum distance between any nodes is 1 hop, the system + * is directly connected. + * - If for two nodes A and B, located N > 1 hops away from each other, + * there is an intermediary node C, which is < N hops away from both + * nodes A and B, the system is a glueless mesh. + */ +static void init_numa_topology_type(void) +{ + int a, b, c, n; + + n = sched_max_numa_distance; + + if (n <= 1) + sched_numa_topology_type = NUMA_DIRECT; + + for_each_online_node(a) { + for_each_online_node(b) { + /* Find two nodes furthest removed from each other. */ + if (node_distance(a, b) < n) + continue; + + /* Is there an intermediary node between a and b? */ + for_each_online_node(c) { + if (node_distance(a, c) < n && + node_distance(b, c) < n) { + sched_numa_topology_type = + NUMA_GLUELESS_MESH; + return; + } + } + + sched_numa_topology_type = NUMA_BACKPLANE; + return; + } + } +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6411,6 +6543,9 @@ static void sched_init_numa(void) sched_domain_topology = tl; sched_domains_numa_levels = level; + sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + + init_numa_topology_type(); } static void sched_domains_numa_masks_set(int cpu) @@ -7090,7 +7225,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = SCHED_CAPACITY_SCALE; + rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; @@ -7165,6 +7300,25 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { + /* + * Blocking primitives will set (and therefore destroy) current->state, + * since we will exit with TASK_RUNNING make sure we enter with it, + * otherwise we will destroy state. + */ + if (WARN_ONCE(current->state != TASK_RUNNING, + "do not call blocking ops when !TASK_RUNNING; " + "state=%lx set at [<%p>] %pS\n", + current->state, + (void *)current->task_state_change, + (void *)current->task_state_change)) + __set_current_state(TASK_RUNNING); + + ___might_sleep(file, line, preempt_offset); +} +EXPORT_SYMBOL(__might_sleep); + +void ___might_sleep(const char *file, int line, int preempt_offset) +{ static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ @@ -7196,7 +7350,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) #endif dump_stack(); } -EXPORT_SYMBOL(__might_sleep); +EXPORT_SYMBOL(___might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -7840,6 +7994,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } +static void cpu_cgroup_fork(struct task_struct *task) +{ + sched_move_task(task); +} + static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { @@ -8212,6 +8371,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, .css_offline = cpu_cgroup_css_offline, + .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 256e577faf1b..f3d7776656ee 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -518,12 +518,20 @@ again: } /* - * We need to take care of a possible races here. In fact, the - * task might have changed its scheduling policy to something - * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setattr()). + * We need to take care of several possible races here: + * + * - the task might have changed its scheduling policy + * to something different than SCHED_DEADLINE + * - the task might have changed its reservation parameters + * (through sched_setattr()) + * - the task might have been boosted by someone else and + * might be in the boosting/deboosting path + * + * In all this cases we bail out, as the task is already + * in the runqueue or is going to be enqueued back anyway. */ - if (!dl_task(p) || dl_se->dl_new) + if (!dl_task(p) || dl_se->dl_new || + dl_se->dl_boosted || !dl_se->dl_throttled) goto unlock; sched_clock_tick(); @@ -532,7 +540,7 @@ again: dl_se->dl_yielded = 0; if (task_on_rq_queued(p)) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (task_has_dl_policy(rq->curr)) + if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); @@ -555,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - if (hrtimer_active(timer)) { - hrtimer_try_to_cancel(timer); - return; - } - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); timer->function = dl_task_timer; } @@ -625,7 +628,7 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= delta_exec; + dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { __dequeue_task_dl(rq, curr, 0); if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) @@ -847,8 +850,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * smaller than our one... OTW we keep our runtime and * deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { pi_se = &pi_task->dl; + } else if (!dl_prio(p->normal_prio)) { + /* + * Special case in which we have a !SCHED_DEADLINE task + * that is going to be deboosted, but exceedes its + * runtime while doing so. No point in replenishing + * it, as it's going to return back to its original + * scheduling class after this. + */ + BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + return; + } /* * If p is throttled, we do nothing. In fact, if it exhausted @@ -914,7 +928,10 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + if (p->nr_cpus_allowed == 1) + goto out; + + if (sd_flag != SD_BALANCE_WAKE) goto out; rq = cpu_rq(cpu); @@ -1489,7 +1506,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) p->nr_cpus_allowed > 1 && dl_task(rq->curr) && (rq->curr->nr_cpus_allowed < 2 || - dl_entity_preempt(&rq->curr->dl, &p->dl))) { + !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } } @@ -1498,10 +1515,33 @@ static void set_cpus_allowed_dl(struct task_struct *p, const struct cpumask *new_mask) { struct rq *rq; + struct root_domain *src_rd; int weight; BUG_ON(!dl_task(p)); + rq = task_rq(p); + src_rd = rq->rd; + /* + * Migrating a SCHED_DEADLINE task between exclusive + * cpusets (different root_domains) entails a bandwidth + * update. We already made space for us in the destination + * domain (see cpuset_can_attach()). + */ + if (!cpumask_intersects(src_rd->span, new_mask)) { + struct dl_bw *src_dl_b; + + src_dl_b = dl_bw_of(cpu_of(rq)); + /* + * We now free resources of the root_domain we are migrating + * off. In the worst case, sched_setattr() may temporary fail + * until we complete the update. + */ + raw_spin_lock(&src_dl_b->lock); + __dl_clear(src_dl_b, p->dl.dl_bw); + raw_spin_unlock(&src_dl_b->lock); + } + /* * Update only if the task is actually running (i.e., * it is on the rq AND it is not throttled). @@ -1518,8 +1558,6 @@ static void set_cpus_allowed_dl(struct task_struct *p, if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; - rq = task_rq(p); - /* * The process used to be able to migrate OR it can now migrate */ @@ -1567,22 +1605,48 @@ void init_sched_dl_class(void) #endif /* CONFIG_SMP */ +/* + * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. + */ +static void cancel_dl_timer(struct rq *rq, struct task_struct *p) +{ + struct hrtimer *dl_timer = &p->dl.dl_timer; + + /* Nobody will change task's class if pi_lock is held */ + lockdep_assert_held(&p->pi_lock); + + if (hrtimer_active(dl_timer)) { + int ret = hrtimer_try_to_cancel(dl_timer); + + if (unlikely(ret == -1)) { + /* + * Note, p may migrate OR new deadline tasks + * may appear in rq when we are unlocking it. + * A caller of us must be fine with that. + */ + raw_spin_unlock(&rq->lock); + hrtimer_cancel(dl_timer); + raw_spin_lock(&rq->lock); + } + } +} + static void switched_from_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) - hrtimer_try_to_cancel(&p->dl.dl_timer); + cancel_dl_timer(rq, p); __dl_clear_params(p); -#ifdef CONFIG_SMP /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one * from an overloaded cpu, if any. */ - if (!rq->dl.dl_nr_running) - pull_dl_task(rq); -#endif + if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) + return; + + if (pull_dl_task(rq)) + resched_curr(rq); } /* @@ -1603,12 +1667,17 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) + if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && + push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ check_resched = 0; #endif /* CONFIG_SMP */ - if (check_resched && task_has_dl_policy(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (check_resched) { + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); + } } } @@ -1679,3 +1748,12 @@ const struct sched_class dl_sched_class = { .switched_from = switched_from_dl, .switched_to = switched_to_dl, }; + +#ifdef CONFIG_SCHED_DEBUG +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); + +void print_dl_stats(struct seq_file *m, int cpu) +{ + print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); +} +#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ce33780d8f20..9dce8b5abeea 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group if (!se) { struct sched_avg *avg = &cpu_rq(cpu)->avg; P(avg->runnable_avg_sum); - P(avg->runnable_avg_period); + P(avg->avg_period); return; } @@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); - P(se->avg.runnable_avg_period); + P(se->avg.running_avg_sum); + P(se->avg.avg_period); P(se->avg.load_avg_contrib); + P(se->avg.utilization_avg_contrib); P(se->avg.decay_count); #endif #undef PN @@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", + cfs_rq->utilization_load_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); @@ -261,6 +265,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) #undef P } +void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) +{ + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); + SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +} + extern __read_mostly int sched_clock_running; static void print_cpu(struct seq_file *m, int cpu) @@ -329,6 +339,7 @@ do { \ spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); + print_dl_stats(m, cpu); print_rq(m, rq, cpu); spin_unlock_irqrestore(&sched_debug_lock, flags); @@ -528,8 +539,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults_memory) - nr_faults = p->numa_faults_memory[2*node + i]; + if (p->numa_faults) + nr_faults = p->numa_faults[2*node + i]; cpu_current = !i ? (task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); @@ -628,8 +639,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP P(se.avg.runnable_avg_sum); - P(se.avg.runnable_avg_period); + P(se.avg.running_avg_sum); + P(se.avg.avg_period); P(se.avg.load_avg_contrib); + P(se.avg.utilization_avg_contrib); P(se.avg.decay_count); #endif P(policy); diff --git a/kernel/sched/energy_model.c b/kernel/sched/energy_model.c new file mode 100644 index 000000000000..1599fe5dbbb1 --- /dev/null +++ b/kernel/sched/energy_model.c @@ -0,0 +1,341 @@ +/* + * Copyright (C) 2014 Michael Turquette <mturquette@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/kthread.h> + +#include "sched.h" + +#define THROTTLE_MSEC 50 +#define UP_THRESHOLD 80 +#define DOWN_THRESHOLD 20 + +/** + * em_data - per-policy data used by energy_mode + * @throttle: bail if current time is less than than ktime_throttle. + * Derived from THROTTLE_MSEC + * @up_threshold: table of normalized capacity states to determine if cpu + * should run faster. Derived from UP_THRESHOLD + * @down_threshold: table of normalized capacity states to determine if cpu + * should run slower. Derived from DOWN_THRESHOLD + * + * struct em_data is the per-policy energy_model-specific data structure. A + * per-policy instance of it is created when the energy_model governor receives + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data + * member of struct cpufreq_policy. + * + * Readers of this data must call down_read(policy->rwsem). Writers must + * call down_write(policy->rwsem). + */ +struct em_data { + /* per-policy throttling */ + ktime_t throttle; + unsigned int *up_threshold; + unsigned int *down_threshold; + struct task_struct *task; + atomic_long_t target_freq; + atomic_t need_wake_task; +}; + +/* + * we pass in struct cpufreq_policy. This is safe because changing out the + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), + * which tears all of the data structures down and __cpufreq_governor(policy, + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the + * new policy pointer + */ +static int energy_model_thread(void *data) +{ + struct sched_param param; + struct cpufreq_policy *policy; + struct em_data *em; + int ret; + + policy = (struct cpufreq_policy *) data; + if (!policy) { + pr_warn("%s: missing policy\n", __func__); + do_exit(-EINVAL); + } + + em = policy->gov_data; + if (!em) { + pr_warn("%s: missing governor data\n", __func__); + do_exit(-EINVAL); + } + + param.sched_priority = 0; + sched_setscheduler(current, SCHED_FIFO, ¶m); + + + do { + down_write(&policy->rwsem); + if (!atomic_read(&em->need_wake_task)) { + up_write(&policy->rwsem); + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + continue; + } + + ret = __cpufreq_driver_target(policy, atomic_read(&em->target_freq), + CPUFREQ_RELATION_H); + if (ret) + pr_debug("%s: __cpufreq_driver_target returned %d\n", + __func__, ret); + + em->throttle = ktime_get(); + atomic_set(&em->need_wake_task, 0); + up_write(&policy->rwsem); + } while (!kthread_should_stop()); + + do_exit(0); +} + +static void em_wake_up_process(struct task_struct *task) +{ + /* this is null during early boot */ + if (IS_ERR_OR_NULL(task)) { + return; + } + + wake_up_process(task); +} + +void arch_scale_cpu_freq(void) +{ + struct cpufreq_policy *policy; + struct em_data *em; + int cpu; + + for_each_online_cpu(cpu) { + policy = cpufreq_cpu_get(cpu); + if (IS_ERR_OR_NULL(policy)) + continue; + + em = policy->gov_data; + if (!em) + continue; + + /* + * FIXME replace the atomic stuff by holding write-locks + * in arch_eval_cpu_freq? + */ + if (atomic_read(&em->need_wake_task)) { + em_wake_up_process(em->task); + } + + cpufreq_cpu_put(policy); + } +} + +/** + * arch_eval_cpu_freq - scale cpu frequency based on CFS utilization + * @update_cpus: mask of CPUs with updated utilization and capacity + * + * Declared and weakly defined in kernel/sched/fair.c This definition overrides + * the default. In the case of CONFIG_FAIR_GROUP_SCHED, update_cpus may + * contains cpus that are not in the same policy. Otherwise update_cpus will be + * a single cpu. + * + * Holds read lock for policy->rw_sem. + * + * FIXME weak arch function means that only one definition of this function can + * be linked. How to support multiple energy model policies? + */ +void arch_eval_cpu_freq(struct cpumask *update_cpus) +{ + struct cpufreq_policy *policy; + struct em_data *em; + int index; + unsigned int cpu, tmp; + unsigned long percent_util = 0, max_util = 0, cap = 0, util = 0; + + /* + * In the case of CONFIG_FAIR_GROUP_SCHED, policy->cpus may be a subset + * of update_cpus. In such case take the first cpu in update_cpus, get + * its policy and try to scale the affects cpus. Then we clear the + * corresponding bits from update_cpus and try again. If a policy does + * not exist for a cpu then we remove that bit as well, preventing an + * infinite loop. + */ + while (!cpumask_empty(update_cpus)) { + percent_util = 0; + max_util = 0; + cap = 0; + util = 0; + + cpu = cpumask_first(update_cpus); + policy = cpufreq_cpu_get(cpu); + if (IS_ERR_OR_NULL(policy)) { + cpumask_clear_cpu(cpu, update_cpus); + continue; + } + + if (!policy->gov_data) + return; + + em = policy->gov_data; + + if (ktime_before(ktime_get(), em->throttle)) { + trace_printk("THROTTLED"); + goto bail; + } + + /* + * try scaling cpus + * + * algorithm assumptions & description: + * all cpus in a policy run at the same rate/capacity. + * choose frequency target based on most utilized cpu. + * do not care about aggregating cpu utilization. + * do not track any historical trends beyond utilization + * if max_util > 80% of current capacity, + * go to max capacity + * if max_util < 20% of current capacity, + * go to the next lowest capacity + * otherwise, stay at the same capacity state + */ + for_each_cpu(tmp, policy->cpus) { + util = get_cpu_usage(cpu); + if (util > max_util) + max_util = util; + } + + cap = capacity_of(cpu); + if (!cap) { + goto bail; + } + + index = cpufreq_frequency_table_get_index(policy, policy->cur); + if (max_util > em->up_threshold[index]) { + /* write em->target_freq with read lock held */ + atomic_long_set(&em->target_freq, policy->max); + /* + * FIXME this is gross. convert arch_eval_cpu_freq to + * hold the write lock? + */ + atomic_set(&em->need_wake_task, 1); + } else if (max_util < em->down_threshold[index]) { + /* write em->target_freq with read lock held */ + atomic_long_set(&em->target_freq, policy->cur - 1); + /* + * FIXME this is gross. convert arch_eval_cpu_freq to + * hold the write lock? + */ + atomic_set(&em->need_wake_task, 1); + } + +bail: + /* remove policy->cpus fromm update_cpus */ + cpumask_andnot(update_cpus, update_cpus, policy->cpus); + cpufreq_cpu_put(policy); + } + + return; +} + +static void em_start(struct cpufreq_policy *policy) +{ + int index = 0, count = 0; + unsigned int capacity; + struct em_data *em; + struct cpufreq_frequency_table *pos; + + /* prepare per-policy private data */ + em = kzalloc(sizeof(*em), GFP_KERNEL); + if (!em) { + pr_debug("%s: failed to allocate private data\n", __func__); + return; + } + + policy->gov_data = em; + + /* how many entries in the frequency table? */ + cpufreq_for_each_entry(pos, policy->freq_table) + count++; + + /* pre-compute thresholds */ + em->up_threshold = kcalloc(count, sizeof(unsigned int), GFP_KERNEL); + em->down_threshold = kcalloc(count, sizeof(unsigned int), GFP_KERNEL); + + cpufreq_for_each_entry(pos, policy->freq_table) { + /* FIXME capacity below is not scaled for uarch */ + capacity = pos->frequency * SCHED_CAPACITY_SCALE / policy->max; + em->up_threshold[index] = capacity * UP_THRESHOLD / 100; + em->down_threshold[index] = capacity * DOWN_THRESHOLD / 100; + pr_debug("%s: cpu = %u index = %d capacity = %u up = %u down = %u\n", + __func__, cpumask_first(policy->cpus), index, + capacity, em->up_threshold[index], + em->down_threshold[index]); + index++; + } + + /* init per-policy kthread */ + em->task = kthread_create(energy_model_thread, policy, "kenergy_model_task"); + if (IS_ERR_OR_NULL(em->task)) + pr_err("%s: failed to create kenergy_model_task thread\n", __func__); +} + + +static void em_stop(struct cpufreq_policy *policy) +{ + struct em_data *em; + + em = policy->gov_data; + + kthread_stop(em->task); + + /* replace with devm counterparts */ + kfree(em->up_threshold); + kfree(em->down_threshold); + kfree(em); +} + +static int energy_model_setup(struct cpufreq_policy *policy, unsigned int event) +{ + switch (event) { + case CPUFREQ_GOV_START: + /* Start managing the frequency */ + em_start(policy); + return 0; + + case CPUFREQ_GOV_STOP: + em_stop(policy); + return 0; + + case CPUFREQ_GOV_LIMITS: /* unused */ + case CPUFREQ_GOV_POLICY_INIT: /* unused */ + case CPUFREQ_GOV_POLICY_EXIT: /* unused */ + break; + } + return 0; +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ENERGY_MODEL +static +#endif +struct cpufreq_governor cpufreq_gov_energy_model = { + .name = "energy_model", + .governor = energy_model_setup, + .owner = THIS_MODULE, +}; + +static int __init energy_model_init(void) +{ + return cpufreq_register_governor(&cpufreq_gov_energy_model); +} + +static void __exit energy_model_exit(void) +{ + cpufreq_unregister_governor(&cpufreq_gov_energy_model); +} + +/* Try to make this the default governor */ +fs_initcall(energy_model_init); + +MODULE_LICENSE("GPL"); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5cc44f21fe5a..5f3056215e41 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -674,6 +674,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); static inline void __update_task_entity_contrib(struct sched_entity *se); +static inline void __update_task_entity_utilization(struct sched_entity *se); /* Give new task start runnable values to heavy its load in infant time */ void init_task_runnable_average(struct task_struct *p) @@ -682,9 +683,10 @@ void init_task_runnable_average(struct task_struct *p) p->se.avg.decay_count = 0; slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = slice; - p->se.avg.runnable_avg_period = slice; + p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; + p->se.avg.avg_period = slice; __update_task_entity_contrib(&p->se); + __update_task_entity_utilization(&p->se); } #else void init_task_runnable_average(struct task_struct *p) @@ -832,11 +834,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { + unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; - if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) - windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + if (scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / scan_size; floor = 1000 / windows; scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); @@ -899,18 +902,24 @@ pid_t task_numa_group_id(struct task_struct *p) return p->numa_group ? p->numa_group->gid : 0; } -static inline int task_faults_idx(int nid, int priv) +/* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. + */ +static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) { - return NR_NUMA_HINT_FAULT_TYPES * nid + priv; + return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; } static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults_memory) + if (!p->numa_faults) return 0; - return p->numa_faults_memory[task_faults_idx(nid, 0)] + - p->numa_faults_memory[task_faults_idx(nid, 1)]; + return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + + p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -918,14 +927,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) if (!p->numa_group) return 0; - return p->numa_group->faults[task_faults_idx(nid, 0)] + - p->numa_group->faults[task_faults_idx(nid, 1)]; + return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + + p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) { - return group->faults_cpu[task_faults_idx(nid, 0)] + - group->faults_cpu[task_faults_idx(nid, 1)]; + return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + + group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; +} + +/* Handle placement on systems where not all nodes are directly connected. */ +static unsigned long score_nearby_nodes(struct task_struct *p, int nid, + int maxdist, bool task) +{ + unsigned long score = 0; + int node; + + /* + * All nodes are directly connected, and the same distance + * from each other. No need for fancy placement algorithms. + */ + if (sched_numa_topology_type == NUMA_DIRECT) + return 0; + + /* + * This code is called for each node, introducing N^2 complexity, + * which should be ok given the number of nodes rarely exceeds 8. + */ + for_each_online_node(node) { + unsigned long faults; + int dist = node_distance(nid, node); + + /* + * The furthest away nodes in the system are not interesting + * for placement; nid was already counted. + */ + if (dist == sched_max_numa_distance || node == nid) + continue; + + /* + * On systems with a backplane NUMA topology, compare groups + * of nodes, and move tasks towards the group with the most + * memory accesses. When comparing two nodes at distance + * "hoplimit", only nodes closer by than "hoplimit" are part + * of each group. Skip other nodes. + */ + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist > maxdist) + continue; + + /* Add up the faults from nearby nodes. */ + if (task) + faults = task_faults(p, node); + else + faults = group_faults(p, node); + + /* + * On systems with a glueless mesh NUMA topology, there are + * no fixed "groups of nodes". Instead, nodes that are not + * directly connected bounce traffic through intermediate + * nodes; a numa_group can occupy any set of nodes. + * The further away a node is, the less the faults count. + * This seems to result in good task placement. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + faults *= (sched_max_numa_distance - dist); + faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + } + + score += faults; + } + + return score; } /* @@ -934,11 +1008,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) * larger multiplier, in order to group tasks together that are almost * evenly spread out between numa nodes. */ -static inline unsigned long task_weight(struct task_struct *p, int nid) +static inline unsigned long task_weight(struct task_struct *p, int nid, + int dist) { - unsigned long total_faults; + unsigned long faults, total_faults; - if (!p->numa_faults_memory) + if (!p->numa_faults) return 0; total_faults = p->total_numa_faults; @@ -946,15 +1021,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) if (!total_faults) return 0; - return 1000 * task_faults(p, nid) / total_faults; + faults = task_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, true); + + return 1000 * faults / total_faults; } -static inline unsigned long group_weight(struct task_struct *p, int nid) +static inline unsigned long group_weight(struct task_struct *p, int nid, + int dist) { - if (!p->numa_group || !p->numa_group->total_faults) + unsigned long faults, total_faults; + + if (!p->numa_group) return 0; - return 1000 * group_faults(p, nid) / p->numa_group->total_faults; + total_faults = p->numa_group->total_faults; + + if (!total_faults) + return 0; + + faults = group_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, false); + + return 1000 * faults / total_faults; } bool should_numa_migrate_memory(struct task_struct *p, struct page * page, @@ -1023,7 +1112,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long capacity_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg); /* Cached statistics for all CPUs within a node */ @@ -1087,6 +1175,7 @@ struct task_numa_env { struct numa_stats src_stats, dst_stats; int imbalance_pct; + int dist; struct task_struct *best_task; long best_imp; @@ -1166,11 +1255,22 @@ static void task_numa_compare(struct task_numa_env *env, long load; long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; + int dist = env->dist; rcu_read_lock(); - cur = ACCESS_ONCE(dst_rq->curr); - if (cur->pid == 0) /* idle */ + + raw_spin_lock_irq(&dst_rq->lock); + cur = dst_rq->curr; + /* + * No need to move the exiting task, and this ensures that ->curr + * wasn't reaped and thus get_task_struct() in task_numa_assign() + * is safe under RCU read lock. + * Note that rcu_read_lock() itself can't protect from the final + * put_task_struct() after the last schedule(). + */ + if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + raw_spin_unlock_irq(&dst_rq->lock); /* * "imp" is the fault differential for the source task between the @@ -1189,8 +1289,8 @@ static void task_numa_compare(struct task_numa_env *env, * in any group then look only at task weights. */ if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences. @@ -1204,11 +1304,11 @@ static void task_numa_compare(struct task_numa_env *env, * instead. */ if (cur->numa_group) - imp += group_weight(cur, env->src_nid) - - group_weight(cur, env->dst_nid); + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); else - imp += task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp += task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); } } @@ -1307,7 +1407,7 @@ static int task_numa_migrate(struct task_struct *p) }; struct sched_domain *sd; unsigned long taskweight, groupweight; - int nid, ret; + int nid, ret, dist; long taskimp, groupimp; /* @@ -1335,29 +1435,45 @@ static int task_numa_migrate(struct task_struct *p) return -EINVAL; } - taskweight = task_weight(p, env.src_nid); - groupweight = group_weight(p, env.src_nid); - update_numa_stats(&env.src_stats, env.src_nid); env.dst_nid = p->numa_preferred_nid; - taskimp = task_weight(p, env.dst_nid) - taskweight; - groupimp = group_weight(p, env.dst_nid) - groupweight; + dist = env.dist = node_distance(env.src_nid, env.dst_nid); + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + update_numa_stats(&env.src_stats, env.src_nid); + taskimp = task_weight(p, env.dst_nid, dist) - taskweight; + groupimp = group_weight(p, env.dst_nid, dist) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ task_numa_find_cpu(&env, taskimp, groupimp); - /* No space available on the preferred nid. Look elsewhere. */ - if (env.best_cpu == -1) { + /* + * Look at other nodes in these cases: + * - there is no space available on the preferred_nid + * - the task is part of a numa_group that is interleaved across + * multiple NUMA nodes; in order to better consolidate the group, + * we need to check other locations. + */ + if (env.best_cpu == -1 || (p->numa_group && + nodes_weight(p->numa_group->active_nodes) > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; + dist = node_distance(env.src_nid, env.dst_nid); + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist != env.dist) { + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + } + /* Only consider nodes where both task and groups benefit */ - taskimp = task_weight(p, nid) - taskweight; - groupimp = group_weight(p, nid) - groupweight; + taskimp = task_weight(p, nid, dist) - taskweight; + groupimp = group_weight(p, nid, dist) - groupweight; if (taskimp < 0 && groupimp < 0) continue; + env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); task_numa_find_cpu(&env, taskimp, groupimp); @@ -1412,7 +1528,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -1524,7 +1640,7 @@ static void update_task_scan_period(struct task_struct *p, * scanning faster if shared accesses dominate as it may * simply bounce migrations uselessly */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); diff = (diff * ratio) / NUMA_PERIOD_SLOTS; } @@ -1552,7 +1668,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) *period = now - p->last_task_numa_placement; } else { delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.runnable_avg_period; + *period = p->se.avg.avg_period; } p->last_sum_exec_runtime = runtime; @@ -1561,6 +1677,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) return delta; } +/* + * Determine the preferred nid for a task in a numa_group. This needs to + * be done in a way that produces consistent results with group_weight, + * otherwise workloads might not converge. + */ +static int preferred_group_nid(struct task_struct *p, int nid) +{ + nodemask_t nodes; + int dist; + + /* Direct connections between all NUMA nodes. */ + if (sched_numa_topology_type == NUMA_DIRECT) + return nid; + + /* + * On a system with glueless mesh NUMA topology, group_weight + * scores nodes according to the number of NUMA hinting faults on + * both the node itself, and on nearby nodes. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + unsigned long score, max_score = 0; + int node, max_node = nid; + + dist = sched_max_numa_distance; + + for_each_online_node(node) { + score = group_weight(p, node, dist); + if (score > max_score) { + max_score = score; + max_node = node; + } + } + return max_node; + } + + /* + * Finding the preferred nid in a system with NUMA backplane + * interconnect topology is more involved. The goal is to locate + * tasks from numa_groups near each other in the system, and + * untangle workloads from different sides of the system. This requires + * searching down the hierarchy of node groups, recursively searching + * inside the highest scoring group of nodes. The nodemask tricks + * keep the complexity of the search down. + */ + nodes = node_online_map; + for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { + unsigned long max_faults = 0; + nodemask_t max_group; + int a, b; + + /* Are there nodes at this distance from each other? */ + if (!find_numa_distance(dist)) + continue; + + for_each_node_mask(a, nodes) { + unsigned long faults = 0; + nodemask_t this_group; + nodes_clear(this_group); + + /* Sum group's NUMA faults; includes a==b case. */ + for_each_node_mask(b, nodes) { + if (node_distance(a, b) < dist) { + faults += group_faults(p, b); + node_set(b, this_group); + node_clear(b, nodes); + } + } + + /* Remember the top group. */ + if (faults > max_faults) { + max_faults = faults; + max_group = this_group; + /* + * subtle: at the smallest distance there is + * just one node left in each "group", the + * winner is the preferred nid. + */ + nid = a; + } + } + /* Next round, evaluate the nodes within max_group. */ + nodes = max_group; + } + return nid; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; @@ -1588,18 +1790,23 @@ static void task_numa_placement(struct task_struct *p) /* Find the node with the highest number of faults */ for_each_online_node(nid) { + /* Keep track of the offsets in numa_faults array */ + int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; unsigned long faults = 0, group_faults = 0; - int priv, i; + int priv; for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { long diff, f_diff, f_weight; - i = task_faults_idx(nid, priv); + mem_idx = task_faults_idx(NUMA_MEM, nid, priv); + membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); + cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); + cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); /* Decay existing window, copy faults since last scan */ - diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; - fault_types[priv] += p->numa_faults_buffer_memory[i]; - p->numa_faults_buffer_memory[i] = 0; + diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; + fault_types[priv] += p->numa_faults[membuf_idx]; + p->numa_faults[membuf_idx] = 0; /* * Normalize the faults_from, so all tasks in a group @@ -1609,21 +1816,27 @@ static void task_numa_placement(struct task_struct *p) * faults are less important. */ f_weight = div64_u64(runtime << 16, period + 1); - f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / (total_faults + 1); - f_diff = f_weight - p->numa_faults_cpu[i] / 2; - p->numa_faults_buffer_cpu[i] = 0; + f_diff = f_weight - p->numa_faults[cpu_idx] / 2; + p->numa_faults[cpubuf_idx] = 0; - p->numa_faults_memory[i] += diff; - p->numa_faults_cpu[i] += f_diff; - faults += p->numa_faults_memory[i]; + p->numa_faults[mem_idx] += diff; + p->numa_faults[cpu_idx] += f_diff; + faults += p->numa_faults[mem_idx]; p->total_numa_faults += diff; if (p->numa_group) { - /* safe because we can only change our own group */ - p->numa_group->faults[i] += diff; - p->numa_group->faults_cpu[i] += f_diff; + /* + * safe because we can only change our own group + * + * mem_idx represents the offset for a given + * nid and priv in a specific region because it + * is at the beginning of the numa_faults array. + */ + p->numa_group->faults[mem_idx] += diff; + p->numa_group->faults_cpu[mem_idx] += f_diff; p->numa_group->total_faults += diff; - group_faults += p->numa_group->faults[i]; + group_faults += p->numa_group->faults[mem_idx]; } } @@ -1643,7 +1856,7 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { update_numa_active_node_mask(p->numa_group); spin_unlock_irq(group_lock); - max_nid = max_group_nid; + max_nid = preferred_group_nid(p, max_group_nid); } if (max_faults) { @@ -1695,7 +1908,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, node_set(task_node(current), grp->active_nodes); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) - grp->faults[i] = p->numa_faults_memory[i]; + grp->faults[i] = p->numa_faults[i]; grp->total_faults = p->total_numa_faults; @@ -1754,8 +1967,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults_memory[i]; - grp->faults[i] += p->numa_faults_memory[i]; + my_grp->faults[i] -= p->numa_faults[i]; + grp->faults[i] += p->numa_faults[i]; } my_grp->total_faults -= p->total_numa_faults; grp->total_faults += p->total_numa_faults; @@ -1780,14 +1993,14 @@ no_join: void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; - void *numa_faults = p->numa_faults_memory; + void *numa_faults = p->numa_faults; unsigned long flags; int i; if (grp) { spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) - grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] -= p->numa_faults[i]; grp->total_faults -= p->total_numa_faults; list_del(&p->numa_entry); @@ -1797,10 +2010,7 @@ void task_numa_free(struct task_struct *p) put_numa_group(grp); } - p->numa_faults_memory = NULL; - p->numa_faults_buffer_memory = NULL; - p->numa_faults_cpu= NULL; - p->numa_faults_buffer_cpu = NULL; + p->numa_faults = NULL; kfree(numa_faults); } @@ -1823,24 +2033,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) return; /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults_memory)) { - int size = sizeof(*p->numa_faults_memory) * + if (unlikely(!p->numa_faults)) { + int size = sizeof(*p->numa_faults) * NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; - p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults_memory) + p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults) return; - BUG_ON(p->numa_faults_buffer_memory); - /* - * The averaged statistics, shared & private, memory & cpu, - * occupy the first half of the array. The second half of the - * array is for current counters, which are averaged into the - * first set by task_numa_placement. - */ - p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); - p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); - p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1880,8 +2080,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; - p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; p->numa_faults_locality[local] += pages; } @@ -2064,6 +2264,11 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) } #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_SMP +unsigned long capacity_of(int cpu); +//unsigned long usage_util_of(int cpu); +#endif /* CONFIG_SMP */ + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -2268,6 +2473,10 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu); +void arch_eval_cpu_freq(struct cpumask *cpus); +void arch_scale_cpu_freq(void); + /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -2296,13 +2505,15 @@ static u32 __compute_runnable_contrib(u64 n) * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, +static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, struct sched_avg *sa, - int runnable) + int runnable, + int running) { u64 delta, periods; u32 runnable_contrib; int delta_w, decayed = 0; + unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); delta = now - sa->last_runnable_update; /* @@ -2324,7 +2535,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->last_runnable_update = now; /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->runnable_avg_period % 1024; + delta_w = sa->avg_period % 1024; if (delta + delta_w >= 1024) { /* period roll-over */ decayed = 1; @@ -2337,7 +2548,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, delta_w = 1024 - delta_w; if (runnable) sa->runnable_avg_sum += delta_w; - sa->runnable_avg_period += delta_w; + if (running) + sa->running_avg_sum += delta_w * scale_freq + >> SCHED_CAPACITY_SHIFT; + sa->avg_period += delta_w; delta -= delta_w; @@ -2347,20 +2561,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, periods + 1); - sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + sa->running_avg_sum = decay_load(sa->running_avg_sum, + periods + 1); + sa->avg_period = decay_load(sa->avg_period, periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ runnable_contrib = __compute_runnable_contrib(periods); if (runnable) sa->runnable_avg_sum += runnable_contrib; - sa->runnable_avg_period += runnable_contrib; + if (running) + sa->running_avg_sum += runnable_contrib * scale_freq + >> SCHED_CAPACITY_SHIFT; + sa->avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ if (runnable) sa->runnable_avg_sum += delta; - sa->runnable_avg_period += delta; + if (running) + sa->running_avg_sum += delta * scale_freq + >> SCHED_CAPACITY_SHIFT; + sa->avg_period += delta; return decayed; } @@ -2376,6 +2598,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) return 0; se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.utilization_avg_contrib = + decay_load(se->avg.utilization_avg_contrib, decays); se->avg.decay_count = 0; return decays; @@ -2412,7 +2636,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, /* The fraction of a cpu used by this cfs_rq */ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->runnable_avg_period + 1); + sa->avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { @@ -2465,7 +2689,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, + runnable, runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -2483,7 +2708,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.runnable_avg_period + 1); + contrib /= (se->avg.avg_period + 1); se->avg.load_avg_contrib = scale_load(contrib); } @@ -2502,6 +2727,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) return se->avg.load_avg_contrib - old_contrib; } + +static inline void __update_task_entity_utilization(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); + contrib /= (se->avg.avg_period + 1); + se->avg.utilization_avg_contrib = scale_load(contrib); +} + +static long __update_entity_utilization_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.utilization_avg_contrib; + + if (entity_is_task(se)) + __update_task_entity_utilization(se); + else + se->avg.utilization_avg_contrib = + group_cfs_rq(se)->utilization_load_avg; + + return se->avg.utilization_avg_contrib - old_contrib; +} + static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, long load_contrib) { @@ -2518,7 +2767,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta; + long contrib_delta, utilization_delta; + int cpu = cpu_of(rq_of(cfs_rq)); u64 now; /* @@ -2530,18 +2780,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, else now = cfs_rq_clock_task(group_cfs_rq(se)); - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) + if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, + cfs_rq->curr == se)) return; contrib_delta = __update_entity_load_avg_contrib(se); + utilization_delta = __update_entity_utilization_avg_contrib(se); if (!update_cfs_rq) return; - if (se->on_rq) + if (se->on_rq) { cfs_rq->runnable_load_avg += contrib_delta; - else + cfs_rq->utilization_load_avg += utilization_delta; + } else { subtract_blocked_load_contrib(cfs_rq, -contrib_delta); + } } /* @@ -2616,6 +2870,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; /* we force update consideration on load-balancer moves */ update_cfs_rq_blocked_load(cfs_rq, !wakeup); } @@ -2634,6 +2889,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, update_cfs_rq_blocked_load(cfs_rq, !sleep); cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; if (sleep) { cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); @@ -2971,6 +3227,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -3939,6 +4196,11 @@ static inline void hrtick_update(struct rq *rq) } #endif +static inline bool energy_aware(void) +{ + return sched_feat(ENERGY_AWARE); +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -3949,6 +4211,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + struct cpumask update_cpus; + + cpumask_clear(&update_cpus); for_each_sched_entity(se) { if (se->on_rq) @@ -3978,12 +4243,27 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); + /* track cpus that need to be re-evaluated */ + cpumask_set_cpu(cpu_of(rq_of(cfs_rq)), &update_cpus); } + /* !CONFIG_FAIR_GROUP_SCHED */ if (!se) { update_rq_runnable_avg(rq, rq->nr_running); add_nr_running(rq, 1); + + /* + * FIXME for !CONFIG_FAIR_GROUP_SCHED it might be nice to + * typedef update_cpus into an int and skip all of the cpumask + * stuff + */ + cpumask_set_cpu(cpu_of(rq), &update_cpus); } + + if (energy_aware()) + if (!cpumask_empty(&update_cpus)) + arch_eval_cpu_freq(&update_cpus); + hrtick_update(rq); } @@ -3999,6 +4279,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + struct cpumask update_cpus; + + cpumask_clear(&update_cpus); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -4039,12 +4322,27 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); + /* track runqueues/cpus that need to be re-evaluated */ + cpumask_set_cpu(cpu_of(rq_of(cfs_rq)), &update_cpus); } + /* !CONFIG_FAIR_GROUP_SCHED */ if (!se) { sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); + + /* + * FIXME for !CONFIG_FAIR_GROUP_SCHED it might be nice to + * typedef update_cpus into an int and skip all of the cpumask + * stuff + */ + cpumask_set_cpu(cpu_of(rq), &update_cpus); } + + if (energy_aware()) + if (!cpumask_empty(&update_cpus)) + arch_eval_cpu_freq(&update_cpus); + hrtick_update(rq); } @@ -4088,11 +4386,16 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long capacity_of(int cpu) +unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; } +static unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -4450,7 +4753,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } - } else { + } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; @@ -4506,6 +4809,33 @@ next: done: return target; } +/* + * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must capacity so we can compare the + * usage with the capacity of the CPU that is available for CFS task (ie + * cpu_capacity). + * cfs.utilization_load_avg is the sum of running time of runnable tasks on a + * CPU. It represents the amount of utilization of a CPU in the range + * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full + * capacity of the CPU because it's about the running time on this CPU. + * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE + * because of unfortunate rounding in avg_period and running_load_avg or just + * after migrating tasks until the average stabilizes with the new running + * time. So we need to check that the usage stays into the range + * [0..cpu_capacity_orig] and cap if necessary. + * Without capping the usage, a group could be seen as overloaded (CPU0 usage + * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity/ + */ +int get_cpu_usage(int cpu) +{ + unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long capacity = capacity_orig_of(cpu); + + if (usage >= SCHED_LOAD_SCALE) + return capacity; + + return (usage * capacity) >> SCHED_LOAD_SHIFT; +} /* * select_task_rq_fair: Select target runqueue for the waking task in domains @@ -5170,7 +5500,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || !(env->sd->flags & SD_NUMA)) { return false; } @@ -5209,7 +5539,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) return false; - if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) return false; src_nid = cpu_to_node(env->src_cpu); @@ -5635,12 +5965,12 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; + unsigned long group_usage; /* Total usage of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - int group_has_free_capacity; + int group_no_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -5734,10 +6064,20 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) return default_scale_cpu_capacity(sd, cpu); } +void __weak arch_eval_cpu_freq(struct cpumask *cpus) +{ + return; +} + +void __weak arch_scale_cpu_freq(void) +{ + return; +} + static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available, age_stamp, avg; + u64 total, used, age_stamp, avg; s64 delta; /* @@ -5753,19 +6093,12 @@ static unsigned long scale_rt_capacity(int cpu) total = sched_avg_period() + delta; - if (unlikely(total < avg)) { - /* Ensures that capacity won't end up being negative */ - available = 0; - } else { - available = total - avg; - } - - if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) - total = SCHED_CAPACITY_SCALE; + used = div_u64(avg, total); - total >>= SCHED_CAPACITY_SHIFT; + if (likely(used < SCHED_CAPACITY_SCALE)) + return SCHED_CAPACITY_SCALE - used; - return div_u64(available, total); + return 1; } static void update_cpu_capacity(struct sched_domain *sd, int cpu) @@ -5780,14 +6113,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) capacity >>= SCHED_CAPACITY_SHIFT; - sdg->sgc->capacity_orig = capacity; - - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_freq_capacity(sd, cpu); - else - capacity *= default_scale_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; + cpu_rq(cpu)->cpu_capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; @@ -5803,7 +6129,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, capacity_orig; + unsigned long capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -5815,7 +6141,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) return; } - capacity_orig = capacity = 0; + capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -5835,19 +6161,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Use capacity_of(), which is set irrespective of domains * in update_cpu_capacity(). * - * This avoids capacity/capacity_orig from being 0 and + * This avoids capacity from being 0 and * causing divide-by-zero issues on boot. - * - * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - capacity_orig += capacity_of(cpu); capacity += capacity_of(cpu); continue; } sgc = rq->sd->groups->sgc; - capacity_orig += sgc->capacity_orig; capacity += sgc->capacity; } } else { @@ -5858,39 +6180,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity_orig += group->sgc->capacity_orig; capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgc->capacity_orig = capacity_orig; sdg->sgc->capacity = capacity; } /* - * Try and fix up capacity for tiny siblings, this is needed when - * things like SD_ASYM_PACKING need f_b_g to select another sibling - * which on its own isn't powerful enough. - * - * See update_sd_pick_busiest() and check_asym_packing(). + * Check whether the capacity of the rq has been noticeably reduced by side + * activity. The imbalance_pct is used for the threshold. + * Return true is the capacity is reduced */ static inline int -fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { - /* - * Only siblings can have significantly less than SCHED_CAPACITY_SCALE - */ - if (!(sd->flags & SD_SHARE_CPUCAPACITY)) - return 0; - - /* - * If ~90% of the cpu_capacity is still there, we're good. - */ - if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) - return 1; - - return 0; + return ((rq->cpu_capacity * sd->imbalance_pct) < + (rq->cpu_capacity_orig * 100)); } /* @@ -5928,37 +6235,54 @@ static inline int sg_imbalanced(struct sched_group *group) } /* - * Compute the group capacity factor. - * - * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by - * first dividing out the smt factor and computing the actual number of cores - * and limit unit capacity with that. + * group_has_capacity returns true if the group has spare capacity that could + * be used by some tasks. We consider that a group has spare capacity if the + * number of task is smaller than the number of CPUs or if the usage is lower + * than the available capacity for CFS tasks. For the latter, we use a + * threshold to stabilize the state, to take into account the variance of the + * tasks' load and to return true if the available capacity in meaningful for + * the load balancer. As an example, an available capacity of 1% can appear + * but it doesn't make any benefit for the load balance. */ -static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) +static inline bool +group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) { - unsigned int capacity_factor, smt, cpus; - unsigned int capacity, capacity_orig; + if ((sgs->group_capacity * 100) > + (sgs->group_usage * env->sd->imbalance_pct)) + return true; - capacity = group->sgc->capacity; - capacity_orig = group->sgc->capacity_orig; - cpus = group->group_weight; + if (sgs->sum_nr_running < sgs->group_weight) + return true; - /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); - capacity_factor = cpus / smt; /* cores */ + return false; +} + +/* + * group_is_overloaded returns true if the group has more tasks than it can + * handle. We consider that a group is overloaded if the number of tasks is + * greater than the number of CPUs and the tasks already use all available + * capacity for CFS tasks. For the latter, we use a threshold to stabilize + * the state, to take into account the variance of tasks' load and to return + * true if available capacity is no more meaningful for load balancer + */ +static inline bool +group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running <= sgs->group_weight) + return false; - capacity_factor = min_t(unsigned, - capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); + if ((sgs->group_capacity * 100) < + (sgs->group_usage * env->sd->imbalance_pct)) + return true; - return capacity_factor; + return false; } -static enum group_type -group_classify(struct sched_group *group, struct sg_lb_stats *sgs) +static enum group_type group_classify(struct lb_env *env, + struct sched_group *group, + struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->group_no_capacity) return group_overloaded; if (sg_imbalanced(group)) @@ -5996,6 +6320,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; + sgs->group_usage += get_cpu_usage(i); sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) @@ -6018,11 +6343,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; - sgs->group_capacity_factor = sg_capacity_factor(env, group); - sgs->group_type = group_classify(group, sgs); - if (sgs->group_capacity_factor > sgs->sum_nr_running) - sgs->group_has_free_capacity = 1; + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(env, group, sgs); } /** @@ -6144,17 +6467,20 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity factor to one so that we'll try + * first, lower the sg capacity so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity_factor. The - * extra check prevents the case where you always pull from the - * heaviest group when it is already under-utilized (possible - * with a large weight task outweighs the tasks on the system). + * these excess tasks. The extra check prevents the case where + * you always pull from the heaviest group when it is already + * under-utilized (possible with a large weight task outweighs + * the tasks on the system). */ if (prefer_sibling && sds->local && - sds->local_stat.group_has_free_capacity) - sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); + group_has_capacity(env, &sds->local_stat) && + (sgs->sum_nr_running > 1)) { + sgs->group_no_capacity = 1; + sgs->group_type = group_overloaded; + } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -6333,11 +6659,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity_factor); - - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); - load_above_capacity /= busiest->group_capacity; + load_above_capacity = busiest->sum_nr_running * + SCHED_LOAD_SCALE; + if (load_above_capacity > busiest->group_capacity) + load_above_capacity -= busiest->group_capacity; + else + load_above_capacity = ~0UL; } /* @@ -6400,6 +6727,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; + /* ASYM feature bypasses nice load balance check */ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; @@ -6420,8 +6748,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && - !busiest->group_has_free_capacity) + if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + busiest->group_no_capacity) goto force_balance; /* @@ -6480,7 +6808,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long capacity, capacity_factor, wl; + unsigned long capacity, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6509,9 +6837,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; capacity = capacity_of(i); - capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); @@ -6519,7 +6844,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu capacity. */ - if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) + + if (rq->nr_running == 1 && wl > env->imbalance && + !check_cpu_capacity(rq, env->sd)) continue; /* @@ -6567,6 +6894,28 @@ static int need_active_balance(struct lb_env *env) return 1; } + /* + * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. + * It's worth migrating the task if the src_cpu's capacity is reduced + * because of other sched_class or IRQs whereas capacity stays + * available on dst_cpu. + */ + if ((env->idle != CPU_NOT_IDLE) && + (env->src_rq->cfs.h_nr_running == 1)) { + unsigned long src_eff_capacity, dst_eff_capacity; + + dst_eff_capacity = 100; + dst_eff_capacity *= capacity_of(env->dst_cpu); + dst_eff_capacity *= capacity_orig_of(env->src_cpu); + + src_eff_capacity = sd->imbalance_pct; + src_eff_capacity *= capacity_of(env->src_cpu); + src_eff_capacity *= capacity_orig_of(env->dst_cpu); + + if (src_eff_capacity < dst_eff_capacity) + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -6666,6 +7015,9 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + ld_moved = 0; if (busiest->nr_running > 1) { /* @@ -6675,8 +7027,6 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -7376,22 +7726,25 @@ end: /* * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu is the system. + * of an idle cpu in the system. * - This rq has more than one task. - * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's capacity. + * - This rq has at least one CFS task and the capacity of the CPU is + * significantly reduced because of RT tasks or IRQs. + * - At parent of LLC scheduler domain level, this cpu's scheduler group has + * multiple busy cpu. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; + bool kick = false; if (unlikely(rq->idle_balance)) - return 0; + return false; /* * We may be recently in ticked or tickless idle mode. At the first @@ -7405,38 +7758,44 @@ static inline int nohz_kick_needed(struct rq *rq) * balancing. */ if (likely(!atomic_read(&nohz.nr_cpus))) - return 0; + return false; if (time_before(now, nohz.next_balance)) - return 0; + return false; if (rq->nr_running >= 2) - goto need_kick; + return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); - if (nr_busy > 1) - goto need_kick_unlock; + if (nr_busy > 1) { + kick = true; + goto unlock; + } + } - sd = rcu_dereference(per_cpu(sd_asym, cpu)); + sd = rcu_dereference(rq->sd); + if (sd) { + if ((rq->cfs.h_nr_running >= 1) && + check_cpu_capacity(rq, sd)) { + kick = true; + goto unlock; + } + } + sd = rcu_dereference(per_cpu(sd_asym, cpu)); if (sd && (cpumask_first_and(nohz.idle_cpus_mask, sched_domain_span(sd)) < cpu)) - goto need_kick_unlock; + kick = true; +unlock: rcu_read_unlock(); - return 0; - -need_kick_unlock: - rcu_read_unlock(); -need_kick: - return 1; + return kick; } #else static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } @@ -7460,6 +7819,9 @@ static void run_rebalance_domains(struct softirq_action *h) * stopped. */ nohz_idle_balance(this_rq, idle); + + if (energy_aware()) + arch_scale_cpu_freq(); } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..199ee3a62fbc 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -83,3 +83,9 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) */ SCHED_FEAT(NUMA_RESIST_LOWER, false) #endif + +/* + * Energy aware scheduling. Use platform energy model to guide scheduling + * decisions optimizing for energy efficiency. + */ +SCHED_FEAT(ENERGY_AWARE, false) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d024e6ce30ba..3d14312db7ea 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1351,16 +1351,22 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->nr_cpus_allowed == 1) + /* + * Current can't be migrated, useless to reschedule, + * let's hope p can move out. + */ + if (rq->curr->nr_cpus_allowed == 1 || + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; + /* + * p is migratable, so let's not schedule it and + * see if it is pushed or pulled somewhere else. + */ if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) - return; - /* * There appears to be other cpus that can accept * current and none to run 'p', so lets reschedule diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 24156c8434d1..770c18cd6bee 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -176,6 +176,25 @@ struct dl_bw { u64 bw, total_bw; }; +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + extern struct mutex sched_domains_mutex; #ifdef CONFIG_CGROUP_SCHED @@ -313,6 +332,9 @@ struct cfs_bandwidth { }; #endif /* CONFIG_CGROUP_SCHED */ +extern unsigned long capacity_of(int cpu); +extern int get_cpu_usage(int cpu); + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -343,8 +365,14 @@ struct cfs_rq { * Under CFS, load is tracked on a per-entity basis and aggregated up. * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). + * runnable_load_avg is the sum of the load_avg_contrib of the + * sched_entities on the rq. + * blocked_load_avg is similar to runnable_load_avg except that its + * the blocked sched_entities on the rq. + * utilization_load_avg is the sum of the average running time of the + * sched_entities on the rq. */ - unsigned long runnable_load_avg, blocked_load_avg; + unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; atomic64_t decay_counter; u64 last_decay; atomic_long_t removed_load; @@ -579,6 +607,7 @@ struct rq { struct sched_domain *sd; unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; unsigned char idle_balance; /* For active balancing */ @@ -678,7 +707,25 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } +#ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type sched_numa_topology_type; +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); +#endif + #ifdef CONFIG_NUMA_BALANCING +/* The regions in numa_faults array from task_struct */ +enum numa_faults_stats { + NUMA_MEM = 0, + NUMA_CPU, + NUMA_MEMBUF, + NUMA_CPUBUF +}; extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); @@ -752,7 +799,7 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity, capacity_orig; + unsigned int capacity; unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* @@ -1127,6 +1174,11 @@ struct sched_class { void (*task_fork) (struct task_struct *p); void (*task_dead) (struct task_struct *p); + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serliazed by + * rq->lock. They are however serialized by p->pi_lock. + */ void (*switched_from) (struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, @@ -1306,9 +1358,11 @@ static inline int hrtick_enabled(struct rq *rq) #ifdef CONFIG_SMP extern void sched_avg_update(struct rq *rq); +extern unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu); + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { - rq->rt_avg += rt_delta; + rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); sched_avg_update(rq); } #else @@ -1502,6 +1556,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); +extern void print_dl_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 5a62915f47a8..852143a79f36 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/wait.h> #include <linux/hash.h> +#include <linux/kthread.h> void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) { @@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void * } EXPORT_SYMBOL(autoremove_wake_function); +static inline bool is_kthread_should_stop(void) +{ + return (current->flags & PF_KTHREAD) && kthread_should_stop(); +} + +/* + * DEFINE_WAIT_FUNC(wait, woken_wake_func); + * + * add_wait_queue(&wq, &wait); + * for (;;) { + * if (condition) + * break; + * + * p->state = mode; condition = true; + * smp_mb(); // A smp_wmb(); // C + * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; + * schedule() try_to_wake_up(); + * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ + * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; + * smp_mb() // B smp_wmb(); // C + * wait->flags |= WQ_FLAG_WOKEN; + * } + * remove_wait_queue(&wq, &wait); + * + */ +long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +{ + set_current_state(mode); /* A */ + /* + * The above implies an smp_mb(), which matches with the smp_wmb() from + * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must + * also observe all state before the wakeup. + */ + if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + timeout = schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + + /* + * The below implies an smp_mb(), it too pairs with the smp_wmb() from + * woken_wake_function() such that we must either observe the wait + * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss + * an event. + */ + set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + + return timeout; +} +EXPORT_SYMBOL(wait_woken); + +int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expects write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in wait_woken(). + */ + smp_wmb(); /* C */ + wait->flags |= WQ_FLAG_WOKEN; + + return default_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(woken_wake_function); + int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index eb89e1807408..f032fb5284e3 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data) set_current_state(TASK_INTERRUPTIBLE); preempt_disable(); if (kthread_should_stop()) { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->cleanup) ht->cleanup(td->cpu, cpu_online(td->cpu)); @@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data) /* Check for state change setup */ switch (td->status) { case HP_THREAD_NONE: + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->setup) ht->setup(td->cpu); td->status = HP_THREAD_ACTIVE; - preempt_disable(); - break; + continue; + case HP_THREAD_PARKED: + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->unpark) ht->unpark(td->cpu); td->status = HP_THREAD_ACTIVE; - preempt_disable(); - break; + continue; } if (!ht->thread_should_run(td->cpu)) { - preempt_enable(); + preempt_enable_no_resched(); schedule(); } else { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); preempt_enable(); ht->thread_fn(td->cpu); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f33527ba62ca..947663d2935a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -398,7 +398,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_numa_balancing_scan_size, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "numa_balancing", diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index af73bc3acb40..410dd5e76c41 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -101,11 +101,11 @@ static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s); #define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1) #define __get_rpn_parity(line) (((line) >> 3) & 0x7) +static DECLARE_WAIT_QUEUE_HEAD(rfcomm_wq); + static void rfcomm_schedule(void) { - if (!rfcomm_thread) - return; - wake_up_process(rfcomm_thread); + wake_up_all(&rfcomm_wq); } /* ---- RFCOMM FCS computation ---- */ @@ -2086,24 +2086,22 @@ static void rfcomm_kill_listener(void) static int rfcomm_run(void *unused) { + DEFINE_WAIT_FUNC(wait, woken_wake_function); BT_DBG(""); set_user_nice(current, -10); rfcomm_add_listener(BDADDR_ANY); - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - - if (kthread_should_stop()) - break; + add_wait_queue(&rfcomm_wq, &wait); + while (!kthread_should_stop()) { /* Process stuff */ rfcomm_process_sessions(); - schedule(); + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } - __set_current_state(TASK_RUNNING); + remove_wait_queue(&rfcomm_wq, &wait); rfcomm_kill_listener(); diff --git a/net/core/dev.c b/net/core/dev.c index b793e3521a36..c5a9d73147a6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7196,11 +7196,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) */ struct net *net; bool unregistering; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); + add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { - prepare_to_wait(&netdev_unregistering_wq, &wait, - TASK_UNINTERRUPTIBLE); unregistering = false; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { @@ -7212,9 +7211,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) if (!unregistering) break; __rtnl_unlock(); - schedule(); + + wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } - finish_wait(&netdev_unregistering_wq, &wait); + remove_wait_queue(&netdev_unregistering_wq, &wait); } static void __net_exit default_device_exit_batch(struct list_head *net_list) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a6882686ca3a..b095296f711f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -365,11 +365,10 @@ static void rtnl_lock_unregistering_all(void) { struct net *net; bool unregistering; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); + add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { - prepare_to_wait(&netdev_unregistering_wq, &wait, - TASK_UNINTERRUPTIBLE); unregistering = false; rtnl_lock(); for_each_net(net) { @@ -381,9 +380,10 @@ static void rtnl_lock_unregistering_all(void) if (!unregistering) break; __rtnl_unlock(); - schedule(); + + wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } - finish_wait(&netdev_unregistering_wq, &wait); + remove_wait_queue(&netdev_unregistering_wq, &wait); } /** |