From c0deae8c9587419ab13874b74425ce2eb2e18508 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 3 Nov 2010 18:52:56 +0200 Subject: posix-cpu-timers: Rcu_read_lock/unlock protect find_task_by_vpid call Commit 4221a9918e38b7494cee341dda7b7b4bb8c04bde "Add RCU check for find_task_by_vpid()" introduced rcu_lockdep_assert to find_task_by_pid_ns. Add rcu_read_lock/rcu_read_unlock to call find_task_by_vpid. Tetsuo Handa wrote: | Quoting from one of posts in that thead | http://kerneltrap.org/mailarchive/linux-kernel/2010/2/8/4536388 | || Usually tasklist gives enough protection, but if copy_process() fails || it calls free_pid() lockless and does call_rcu(delayed_put_pid(). || This means, without rcu lock find_pid_ns() can't scan the hash table || safely. Thomas Gleixner wrote: | We can remove the tasklist_lock while at it. rcu_read_lock is enough. Patch also replaces thread_group_leader with has_group_leader_pid in accordance to comment by Oleg Nesterov: | ... thread_group_leader() check is not relaible without | tasklist. If we race with de_thread() find_task_by_vpid() can find | the new leader before it updates its ->group_leader. | | perhaps it makes sense to change posix_cpu_timer_create() to use | has_group_leader_pid() instead, just to make this code not look racy | and avoid adding new problems. Signed-off-by: Sergey Senozhatsky Cc: Peter Zijlstra Cc: Stanislaw Gruszka Reviewed-by: Oleg Nesterov LKML-Reference: <20101103165256.GD30053@swordfish.minsk.epam.com> Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 6842eeba587..05bb7173850 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock) if (pid == 0) return 0; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_vpid(pid); if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? - same_thread_group(p, current) : thread_group_leader(p))) { + same_thread_group(p, current) : has_group_leader_pid(p))) { error = -EINVAL; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return error; } @@ -390,7 +390,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) INIT_LIST_HEAD(&new_timer->it.cpu.entry); - read_lock(&tasklist_lock); + rcu_read_lock(); if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { if (pid == 0) { p = current; @@ -404,7 +404,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) p = current->group_leader; } else { p = find_task_by_vpid(pid); - if (p && !thread_group_leader(p)) + if (p && !has_group_leader_pid(p)) p = NULL; } } @@ -414,7 +414,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) } else { ret = -EINVAL; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } -- cgit v1.2.3 From 13b9b6e746d753d43270a78dd39694912646b5d9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 10 Nov 2010 22:19:24 -0500 Subject: tracing: Fix module use of trace_bprintk() On use of trace_printk() there's a macro that determines if the format is static or a variable. If it is static, it defaults to __trace_bprintk() otherwise it uses __trace_printk(). A while ago, Lai Jiangshan added __trace_bprintk(). In that patch, we discussed a way to allow modules to use it. The difference between __trace_bprintk() and __trace_printk() is that for faster processing, just the format and args are stored in the trace instead of running it through a sprintf function. In order to do this, the format used by the __trace_bprintk() had to be persistent. See commit 1ba28e02a18cbdbea123836f6c98efb09cbf59ec The problem comes with trace_bprintk() where the module is unloaded. The pointer left in the buffer is still pointing to the format. To solve this issue, the formats in the module were copied into kernel core. If the same format was used, they would use the same copy (to prevent memory leak). This all worked well until we tried to merge everything. At the time this was written, Lai Jiangshan, Frederic Weisbecker, Ingo Molnar and myself were all touching the same code. When this was merged, we lost the part of it that was in module.c. This kept out the copying of the formats and unloading the module could cause bad pointers left in the ring buffer. This patch adds back (with updates required for current kernel) the module code that sets up the necessary pointers. Cc: Lai Jiangshan Cc: Rusty Russell Signed-off-by: Steven Rostedt --- kernel/module.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 437a74a7524..d190664f25f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2326,6 +2326,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * mod->num_trace_events, GFP_KERNEL); #endif +#ifdef CONFIG_TRACING + mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", + sizeof(*mod->trace_bprintk_fmt_start), + &mod->num_trace_bprintk_fmt); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. + */ + kmemleak_scan_area(mod->trace_bprintk_fmt_start, + sizeof(*mod->trace_bprintk_fmt_start) * + mod->num_trace_bprintk_fmt, GFP_KERNEL); +#endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ mod->ftrace_callsites = section_objs(info, "__mcount_loc", -- cgit v1.2.3 From 3c502e7a0255d82621ff25d60cc816624830497e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 4 Nov 2010 17:33:01 -0500 Subject: perf,hw_breakpoint: Initialize hardware api earlier When using early debugging, the kernel does not initialize the hw_breakpoint API early enough and causes the late initialization of the kernel debugger to fail. The boot arguments are: earlyprintk=vga ekgdboc=kbd kgdbwait Then simply type "go" at the kdb prompt and boot. The kernel will later emit the message: kgdb: Could not allocate hwbreakpoints And at that point the kernel debugger will cease to work correctly. The solution is to initialize the hw_breakpoint at the same time that all the other perf call backs are initialized instead of using a core_initcall() initialization which happens well after the kernel debugger can make use of hardware breakpoints. Signed-off-by: Jason Wessel CC: Frederic Weisbecker CC: Ingo Molnar CC: Peter Zijlstra LKML-Reference: <4CD3396D.1090308@windriver.com> Signed-off-by: Frederic Weisbecker --- kernel/hw_breakpoint.c | 3 +-- kernel/perf_event.c | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 2c9120f0afc..e5325825aeb 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = { .read = hw_breakpoint_pmu_read, }; -static int __init init_hw_breakpoint(void) +int __init init_hw_breakpoint(void) { unsigned int **task_bp_pinned; int cpu, err_cpu; @@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void) return -ENOMEM; } -core_initcall(init_hw_breakpoint); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 517d827f498..05b7d8c72c6 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -6295,6 +6296,8 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) void __init perf_event_init(void) { + int ret; + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent); @@ -6302,4 +6305,7 @@ void __init perf_event_init(void) perf_pmu_register(&perf_task_clock); perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); + + ret = init_hw_breakpoint(); + WARN(ret, "hw_breakpoint initialization failed with: %d", ret); } -- cgit v1.2.3 From 91e86e560d0b3ce4c5fc64fd2bbb99f856a30a4e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 10 Nov 2010 12:56:12 +0100 Subject: tracing: Fix recursive user stack trace The user stack trace can fault when examining the trace. Which would call the do_page_fault handler, which would trace again, which would do the user stack trace, which would fault and call do_page_fault again ... Thus this is causing a recursive bug. We need to have a recursion detector here. [ Resubmitted by Jiri Olsa ] [ Eric Dumazet recommended using __this_cpu_* instead of __get_cpu_* ] Cc: Eric Dumazet Signed-off-by: Jiri Olsa LKML-Reference: <1289390172-9730-3-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82d9b8106cd..ee6a7339cf0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1284,6 +1284,8 @@ void trace_dump_stack(void) __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); } +static DEFINE_PER_CPU(int, user_stack_count); + void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { @@ -1302,6 +1304,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (unlikely(in_nmi())) return; + /* + * prevent recursion, since the user stack tracing may + * trigger other kernel events. + */ + preempt_disable(); + if (__this_cpu_read(user_stack_count)) + goto out; + + __this_cpu_inc(user_stack_count); + + + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) @@ -1319,6 +1333,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) save_stack_trace_user(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + + __this_cpu_dec(user_stack_count); + + out: + preempt_enable(); } #ifdef UNUSED -- cgit v1.2.3 From b5482cfa1c95a188b3054fa33274806add91bbe5 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 16 Nov 2010 17:34:02 +0800 Subject: sched: Fix volanomark performance regression Commit fab4762 triggers excessive idle balancing, causing a ~30% loss in volanomark throughput. Remove idle balancing throttle reset. Originally-by: Alex Shi Signed-off-by: Mike Galbraith Acked-by: Nikhil Rao Signed-off-by: Peter Zijlstra LKML-Reference: <1289928732.5169.211.camel@maggy.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 52ab113d8bb..ba0556dc7c0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1758,10 +1758,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); check_preempt_curr(this_rq, p, 0); - - /* re-arm NEWIDLE balancing when moving tasks */ - src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; - this_rq->idle_stamp = 0; } /* -- cgit v1.2.3 From d5ad140bc1505a98c0f040937125bfcbb508078f Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 17 Nov 2010 11:42:04 -0800 Subject: sched: Fix idle balancing An earlier commit reverts idle balancing throttling reset to fix a 30% regression in volanomark throughput. We still need to reset idle_stamp when we pull a task in newidle balance. Reported-by: Alex Shi Signed-off-by: Nikhil Rao Signed-off-by: Peter Zijlstra LKML-Reference: <1290022924-3548-1-git-send-email-ncrao@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ba0556dc7c0..00ebd768667 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3215,8 +3215,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + if (pulled_task) { + this_rq->idle_stamp = 0; break; + } } raw_spin_lock(&this_rq->lock); -- cgit v1.2.3 From 8882135bcd332f294df5455747ea43ba9e6f77ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Nov 2010 19:01:43 +0100 Subject: perf: Fix owner-list vs exit Oleg noticed that a perf-fd keeping a reference on the creating task leads to a few funny side effects. There's two different aspects to this: - kernel based perf-events, these should not take out a reference on the creating task and appear on the task's event list since they're not bound to fds nor visible to userspace. - fork() and pthread_create(), these can lead to the creating task dying (and thus the task's event-list becomming useless) but keeping the list and ref alive until the event is closed. Combined they lead to malfunction of the ptrace hw_tracepoints. Cure this by not considering kernel based perf_events for the owner-list and destroying the owner-list when the owner dies. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Acked-by: Oleg Nesterov LKML-Reference: <1289576883.2084.286.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 63 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f818d9d2dc9..671f6c8c8a3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2235,11 +2235,6 @@ int perf_event_release_kernel(struct perf_event *event) raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); - mutex_lock(&event->owner->perf_event_mutex); - list_del_init(&event->owner_entry); - mutex_unlock(&event->owner->perf_event_mutex); - put_task_struct(event->owner); - free_event(event); return 0; @@ -2252,9 +2247,43 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); static int perf_release(struct inode *inode, struct file *file) { struct perf_event *event = file->private_data; + struct task_struct *owner; file->private_data = NULL; + rcu_read_lock(); + owner = ACCESS_ONCE(event->owner); + /* + * Matches the smp_wmb() in perf_event_exit_task(). If we observe + * !owner it means the list deletion is complete and we can indeed + * free this event, otherwise we need to serialize on + * owner->perf_event_mutex. + */ + smp_read_barrier_depends(); + if (owner) { + /* + * Since delayed_put_task_struct() also drops the last + * task reference we can safely take a new reference + * while holding the rcu_read_lock(). + */ + get_task_struct(owner); + } + rcu_read_unlock(); + + if (owner) { + mutex_lock(&owner->perf_event_mutex); + /* + * We have to re-check the event->owner field, if it is cleared + * we raced with perf_event_exit_task(), acquiring the mutex + * ensured they're done, and we can proceed with freeing the + * event. + */ + if (event->owner) + list_del_init(&event->owner_entry); + mutex_unlock(&owner->perf_event_mutex); + put_task_struct(owner); + } + return perf_event_release_kernel(event); } @@ -5678,7 +5707,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&ctx->mutex); event->owner = current; - get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -5746,12 +5775,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, ++ctx->generation; mutex_unlock(&ctx->mutex); - event->owner = current; - get_task_struct(current); - mutex_lock(¤t->perf_event_mutex); - list_add_tail(&event->owner_entry, ¤t->perf_event_list); - mutex_unlock(¤t->perf_event_mutex); - return event; err_free: @@ -5902,8 +5925,24 @@ again: */ void perf_event_exit_task(struct task_struct *child) { + struct perf_event *event, *tmp; int ctxn; + mutex_lock(&child->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &child->perf_event_list, + owner_entry) { + list_del_init(&event->owner_entry); + + /* + * Ensure the list deletion is visible before we clear + * the owner, closes a race against perf_release() where + * we need to serialize on the owner->perf_event_mutex. + */ + smp_wmb(); + event->owner = NULL; + } + mutex_unlock(&child->perf_event_mutex); + for_each_task_context_nr(ctxn) perf_event_exit_task_context(child, ctxn); } -- cgit v1.2.3 From 94e8ba728640dc01375a14e337f3b892bfacbeeb Mon Sep 17 00:00:00 2001 From: Sergio Aguirre Date: Tue, 16 Nov 2010 12:02:47 -0600 Subject: irq_work: Drop cmpxchg() result The compiler warned us about: kernel/irq_work.c: In function 'irq_work_run': kernel/irq_work.c:148: warning: value computed is not used Dropping the cmpxchg() result is indeed weird, but correct - so annotate away the warning. Signed-off-by: Sergio Aguirre Cc: Huang Ying Cc: Martin Schwidefsky Cc: Kyle McMartin Signed-off-by: Peter Zijlstra LKML-Reference: <1289930567-17828-1-git-send-email-saaguirre@ti.com> Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index f16763ff848..90f881904bb 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -145,7 +145,9 @@ void irq_work_run(void) * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. */ - cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); + (void)cmpxchg(&entry->next, + next_flags(NULL, IRQ_WORK_BUSY), + NULL); } } EXPORT_SYMBOL_GPL(irq_work_run); -- cgit v1.2.3 From dddd3379a619a4cb8247bfd3c94ca9ae3797aa2e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 Nov 2010 10:05:55 +0100 Subject: perf: Fix inherit vs. context rotation bug It was found that sometimes children of tasks with inherited events had one extra event. Eventually it turned out to be due to the list rotation no being exclusive with the list iteration in the inheritance code. Cure this by temporarily disabling the rotation while we inherit the events. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra LKML-Reference: Cc: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 671f6c8c8a3..f365dd8ef8b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1622,8 +1622,12 @@ static void rotate_ctx(struct perf_event_context *ctx) { raw_spin_lock(&ctx->lock); - /* Rotate the first entry last of non-pinned groups */ - list_rotate_left(&ctx->flexible_groups); + /* + * Rotate the first entry last of non-pinned groups. Rotation might be + * disabled by the inheritance code. + */ + if (!ctx->rotate_disable) + list_rotate_left(&ctx->flexible_groups); raw_spin_unlock(&ctx->lock); } @@ -6162,6 +6166,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; + unsigned long flags; int ret = 0; child->perf_event_ctxp[ctxn] = NULL; @@ -6202,6 +6207,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn) break; } + /* + * We can't hold ctx->lock when iterating the ->flexible_group list due + * to allocations, but we need to prevent rotation because + * rotate_ctx() will change the list from interrupt context. + */ + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 1; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); @@ -6209,6 +6223,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn) break; } + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 0; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + child_ctx = child->perf_event_ctxp[ctxn]; if (child_ctx && inherited_all) { -- cgit v1.2.3 From ee6dcfa40a50fe12a3ae0fb4d2653c66c3ed6556 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Nov 2010 13:49:04 +0100 Subject: perf: Fix the software context switch counter Stephane noticed that because the perf_sw_event() call is inside the perf_event_task_sched_out() call it won't get called unless we have a per-task counter. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f365dd8ef8b..eac7e336433 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1287,8 +1287,6 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); } -- cgit v1.2.3 From 49f4138346b3cec2706adff02658fe27ceb1e46f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 26 Nov 2010 13:42:47 +0100 Subject: printk: Fix wake_up_klogd() vs cpu hotplug wake_up_klogd() may get called from preemptible context but uses __raw_get_cpu_var() to write to a per cpu variable. If it gets preempted between getting the address and writing to it, the cpu in question could be offline if the process gets scheduled back and hence writes to the per cpu data of an offline cpu. This buggy behaviour was introduced with fa33507a "printk: robustify printk, fix #2" which was supposed to fix a "using smp_processor_id() in preemptible" warning. Let's use this_cpu_write() instead which disables preemption and makes sure that the outlined scenario cannot happen. Signed-off-by: Heiko Carstens Acked-by: Eric Dumazet Signed-off-by: Peter Zijlstra LKML-Reference: <20101126124247.GC7023@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 9a2264fc42c..cf7588e93f6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1088,7 +1088,7 @@ int printk_needs_cpu(int cpu) void wake_up_klogd(void) { if (waitqueue_active(&log_wait)) - __raw_get_cpu_var(printk_pending) = 1; + this_cpu_write(printk_pending, 1); } /** -- cgit v1.2.3 From 61ab25447ad6334a74e32f60efb135a3467223f8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 26 Nov 2010 13:00:59 +0100 Subject: nohz: Fix printk_needs_cpu() return value on offline cpus This patch fixes a hang observed with 2.6.32 kernels where timers got enqueued on offline cpus. printk_needs_cpu() may return 1 if called on offline cpus. When a cpu gets offlined it schedules the idle process which, before killing its own cpu, will call tick_nohz_stop_sched_tick(). That function in turn will call printk_needs_cpu() in order to check if the local tick can be disabled. On offline cpus this function should naturally return 0 since regardless if the tick gets disabled or not the cpu will be dead short after. That is besides the fact that __cpu_disable() should already have made sure that no interrupts on the offlined cpu will be delivered anyway. In this case it prevents tick_nohz_stop_sched_tick() to call select_nohz_load_balancer(). No idea if that really is a problem. However what made me debug this is that on 2.6.32 the function get_nohz_load_balancer() is used within __mod_timer() to select a cpu on which a timer gets enqueued. If printk_needs_cpu() returns 1 then the nohz_load_balancer cpu doesn't get updated when a cpu gets offlined. It may contain the cpu number of an offline cpu. In turn timers get enqueued on an offline cpu and not very surprisingly they never expire and cause system hangs. This has been observed 2.6.32 kernels. On current kernels __mod_timer() uses get_nohz_timer_target() which doesn't have that problem. However there might be other problems because of the too early exit tick_nohz_stop_sched_tick() in case a cpu goes offline. Easiest way to fix this is just to test if the current cpu is offline and call printk_tick() directly which clears the condition. Alternatively I tried a cpu hotplug notifier which would clear the condition, however between calling the notifier function and printk_needs_cpu() something could have called printk() again and the problem is back again. This seems to be the safest fix. Signed-off-by: Heiko Carstens Signed-off-by: Peter Zijlstra Cc: stable@kernel.org LKML-Reference: <20101126120235.406766476@de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index cf7588e93f6..a23315dc449 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1082,6 +1082,8 @@ void printk_tick(void) int printk_needs_cpu(int cpu) { + if (unlikely(cpu_is_offline(cpu))) + printk_tick(); return per_cpu(printk_pending, cpu); } -- cgit v1.2.3 From 25c9170ed64a6551beefe9315882f754e14486f4 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Tue, 30 Nov 2010 17:36:08 +0900 Subject: genirq: Fix incorrect proc spurious output Since commit a1afb637(switch /proc/irq/*/spurious to seq_file) all /proc/irq/XX/spurious files show the information of irq 0. Current irq_spurious_proc_open() passes on NULL as the 3rd argument, which is used as an IRQ number in irq_spurious_proc_show(), to the single_open(). Because of this, all the /proc/irq/XX/spurious file shows IRQ 0 information regardless of the IRQ number. To fix the problem, irq_spurious_proc_open() must pass on the appropreate data (IRQ number) to single_open(). Signed-off-by: Kenji Kaneshige Reviewed-by: Yong Zhang LKML-Reference: <4CF4B778.90604@jp.fujitsu.com> Cc: stable@kernel.org [2.6.33+] Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 01b1d3a8898..6c8a2a9f8a7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) static int irq_spurious_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_spurious_proc_show, NULL); + return single_open(file, irq_spurious_proc_show, PDE(inode)->data); } static const struct file_operations irq_spurious_proc_fops = { -- cgit v1.2.3 From 33dd94ae1ccbfb7bf0fb6c692bc3d1c4269e6177 Mon Sep 17 00:00:00 2001 From: Nelson Elhage Date: Thu, 2 Dec 2010 14:31:21 -0800 Subject: do_exit(): make sure that we run with get_fs() == USER_DS If a user manages to trigger an oops with fs set to KERNEL_DS, fs is not otherwise reset before do_exit(). do_exit may later (via mm_release in fork.c) do a put_user to a user-controlled address, potentially allowing a user to leverage an oops into a controlled write into kernel memory. This is only triggerable in the presence of another bug, but this potentially turns a lot of DoS bugs into privilege escalations, so it's worth fixing. I have proof-of-concept code which uses this bug along with CVE-2010-3849 to write a zero to an arbitrary kernel address, so I've tested that this is not theoretical. A more logical place to put this fix might be when we know an oops has occurred, before we call do_exit(), but that would involve changing every architecture, in multiple places. Let's just stick it in do_exit instead. [akpm@linux-foundation.org: update code comment] Signed-off-by: Nelson Elhage Cc: KOSAKI Motohiro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 21aa7b3001f..676149a4ac5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -914,6 +914,15 @@ NORET_TYPE void do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); + /* + * If do_exit is called because this processes oopsed, it's possible + * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before + * continuing. Amongst other possible reasons, this is to prevent + * mm_release()->clear_child_tid() from writing to a user-controlled + * kernel address. + */ + set_fs(USER_DS); + tracehook_report_exit(&code); validate_creds_for_do_exit(tsk); -- cgit v1.2.3 From 9f339caf8454f0c21983111350ede93983db4340 Mon Sep 17 00:00:00 2001 From: Bojan Smojver Date: Thu, 25 Nov 2010 23:41:39 +0100 Subject: PM / Hibernate: Use async I/O when reading compressed hibernation image This is a fix for reading LZO compressed image using async I/O. Essentially, instead of having just one page into which we keep reading blocks from swap, we allocate enough of them to cover the largest compressed size and then let block I/O pick them all up. Once we have them all (and here we wait), we decompress them, as usual. Obviously, the very first block we still pick up synchronously, because we need to know the size of the lot before we pick up the rest. Also fixed the copyright line, which I've forgotten before. Signed-off-by: Bojan Smojver Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0e4a86ccf9..baf667bb279 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -6,6 +6,7 @@ * * Copyright (C) 1998,2001-2005 Pavel Machek * Copyright (C) 2006 Rafael J. Wysocki + * Copyright (C) 2010 Bojan Smojver * * This file is released under the GPLv2. * @@ -753,30 +754,43 @@ static int load_image_lzo(struct swap_map_handle *handle, { unsigned int m; int error = 0; + struct bio *bio; struct timeval start; struct timeval stop; unsigned nr_pages; - size_t off, unc_len, cmp_len; - unsigned char *unc, *cmp, *page; + size_t i, off, unc_len, cmp_len; + unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; - page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); - if (!page) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); - return -ENOMEM; + for (i = 0; i < LZO_CMP_PAGES; i++) { + page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!page[i]) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + + while (i) + free_page((unsigned long)page[--i]); + + return -ENOMEM; + } } unc = vmalloc(LZO_UNC_SIZE); if (!unc) { printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); - free_page((unsigned long)page); + + for (i = 0; i < LZO_CMP_PAGES; i++) + free_page((unsigned long)page[i]); + return -ENOMEM; } cmp = vmalloc(LZO_CMP_SIZE); if (!cmp) { printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + vfree(unc); - free_page((unsigned long)page); + for (i = 0; i < LZO_CMP_PAGES; i++) + free_page((unsigned long)page[i]); + return -ENOMEM; } @@ -787,6 +801,7 @@ static int load_image_lzo(struct swap_map_handle *handle, if (!m) m = 1; nr_pages = 0; + bio = NULL; do_gettimeofday(&start); error = snapshot_write_next(snapshot); @@ -794,11 +809,11 @@ static int load_image_lzo(struct swap_map_handle *handle, goto out_finish; for (;;) { - error = swap_read_page(handle, page, NULL); /* sync */ + error = swap_read_page(handle, page[0], NULL); /* sync */ if (error) break; - cmp_len = *(size_t *)page; + cmp_len = *(size_t *)page[0]; if (unlikely(!cmp_len || cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { printk(KERN_ERR "PM: Invalid LZO compressed length\n"); @@ -806,13 +821,20 @@ static int load_image_lzo(struct swap_map_handle *handle, break; } - memcpy(cmp, page, PAGE_SIZE); - for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { - error = swap_read_page(handle, page, NULL); /* sync */ + for (off = PAGE_SIZE, i = 1; + off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { + error = swap_read_page(handle, page[i], &bio); if (error) goto out_finish; + } - memcpy(cmp + off, page, PAGE_SIZE); + error = hib_wait_on_bio_chain(&bio); /* need all data now */ + if (error) + goto out_finish; + + for (off = 0, i = 0; + off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { + memcpy(cmp + off, page[i], PAGE_SIZE); } unc_len = LZO_UNC_SIZE; @@ -857,7 +879,8 @@ out_finish: vfree(cmp); vfree(unc); - free_page((unsigned long)page); + for (i = 0; i < LZO_CMP_PAGES; i++) + free_page((unsigned long)page[i]); return error; } -- cgit v1.2.3 From c9e664f1fdf34aa8cede047b206deaa8f1945af0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 3 Dec 2010 22:57:45 +0100 Subject: PM / Hibernate: Fix memory corruption related to swap There is a problem that swap pages allocated before the creation of a hibernation image can be released and used for storing the contents of different memory pages while the image is being saved. Since the kernel stored in the image doesn't know of that, it causes memory corruption to occur after resume from hibernation, especially on systems with relatively small RAM that need to swap often. This issue can be addressed by keeping the GFP_IOFS bits clear in gfp_allowed_mask during the entire hibernation, including the saving of the image, until the system is finally turned off or the hibernation is aborted. Unfortunately, for this purpose it's necessary to rework the way in which the hibernate and suspend code manipulates gfp_allowed_mask. This change is based on an earlier patch from Hugh Dickins. Signed-off-by: Rafael J. Wysocki Reported-by: Ondrej Zary Acked-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Cc: stable@kernel.org --- kernel/power/hibernate.c | 22 ++++++++++++---------- kernel/power/suspend.c | 5 ++--- kernel/power/user.c | 2 ++ 3 files changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 657272e91d0..048d0b51483 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -327,7 +327,6 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { int error; - gfp_t saved_mask; error = platform_begin(platform_mode); if (error) @@ -339,7 +338,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); + pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -348,7 +347,10 @@ int hibernation_snapshot(int platform_mode) goto Recover_platform; error = create_image(platform_mode); - /* Control returns here after successful restore */ + /* + * Control returns here (1) after the image has been created or the + * image creation has failed and (2) after a successful restore. + */ Resume_devices: /* We may need to release the preallocated image pages here. */ @@ -357,7 +359,10 @@ int hibernation_snapshot(int platform_mode) dpm_resume_end(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - set_gfp_allowed_mask(saved_mask); + + if (error || !in_suspend) + pm_restore_gfp_mask(); + resume_console(); Close: platform_end(platform_mode); @@ -452,17 +457,16 @@ static int resume_target_kernel(bool platform_mode) int hibernation_restore(int platform_mode) { int error; - gfp_t saved_mask; pm_prepare_console(); suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); + pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); dpm_resume_end(PMSG_RECOVER); } - set_gfp_allowed_mask(saved_mask); + pm_restore_gfp_mask(); resume_console(); pm_restore_console(); return error; @@ -476,7 +480,6 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { int error; - gfp_t saved_mask; if (!hibernation_ops) return -ENOSYS; @@ -492,7 +495,6 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -536,7 +538,6 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - set_gfp_allowed_mask(saved_mask); resume_console(); Close: @@ -646,6 +647,7 @@ int hibernate(void) swsusp_free(); if (!error) power_down(); + pm_restore_gfp_mask(); } else { pr_debug("PM: Image restored successfully.\n"); } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 7335952ee47..ecf770509d0 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -197,7 +197,6 @@ static int suspend_enter(suspend_state_t state) int suspend_devices_and_enter(suspend_state_t state) { int error; - gfp_t saved_mask; if (!suspend_ops) return -ENOSYS; @@ -208,7 +207,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); + pm_restrict_gfp_mask(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -225,7 +224,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - set_gfp_allowed_mask(saved_mask); + pm_restore_gfp_mask(); resume_console(); Close: if (suspend_ops->end) diff --git a/kernel/power/user.c b/kernel/power/user.c index e819e17877c..1b2ea31e6bd 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; + pm_restore_gfp_mask(); thaw_processes(); usermodehelper_enable(); data->frozen = 0; @@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = -EPERM; break; } + pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); if (!error) error = put_user(in_suspend, (int __user *)arg); -- cgit v1.2.3