From f123c98e7f168e949b283690693695f988332c3d Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 6 Jan 2011 15:08:29 -0500
Subject: rtmutex: Fix comment about why new_owner can be NULL in wake_futex_pi()

The comment explaining why rt_mutex_next_owner() can return NULL in
wake_futex_pi() describes a case that is not the normal one. Tracing shows
that the more likely cause is that the waiter simply timed out. But because
the waiter originally caused contention on the futex, the owner will go into
the kernel when it unlocks the lock. Then it will hit this code path and
rt_mutex_next_owner() will return NULL.

Cc: Thomas Gleixner
Signed-off-by: Steven Rostedt
Signed-off-by: Ingo Molnar
---
 kernel/futex.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92e691..5696d38cc71 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -791,10 +791,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 
         /*
-         * This happens when we have stolen the lock and the original
-         * pending owner did not enqueue itself back on the rt_mutex.
-         * Thats not a tragedy. We know that way, that a lock waiter
-         * is on the fly. We make the futex_q waiter the pending owner.
+         * It is possible that the next waiter (the one that brought
+         * this owner to the kernel) timed out and is no longer
+         * waiting on the lock.
          */
         if (!new_owner)
                 new_owner = this->task;
-- 
cgit v1.2.3


From bd3bfe9eda94d3c050830217c1e1c338808de5b2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Tue, 11 Jan 2011 12:42:00 -0200
Subject: perf evsel: Fix order of event list deletion

We need to defer calling perf_evsel_list__delete() until after the
atexit-registered routines have run, because we still need to traverse the
events being recorded at that time, at least on 'perf record'.

This fixes the problem reported by Thomas Renninger where cmd_record, called
by cmd_timechart, would not write the tracing data to the perf.data file
header: at atexit time (control+C on 'perf timechart record') the evsel_list
would already be empty, having been deleted by run_builtin(), and 'perf
timechart' would then die with "no trace data in the file" when trying to
process such a perf.data file.

Problem introduced in 70d544d.
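
The ordering hazard being fixed is the usual one with atexit(): hooks
registered with atexit() run only after the command function has already
returned, so any shared state they rely on must not be torn down on that
return path. A minimal standalone C sketch of the pattern (illustrative
only, not taken from the perf sources; the names are made up):

#include <stdio.h>
#include <stdlib.h>

static int *event_list;   /* stands in for the evsel list shared with the exit hook */

static void write_header(void)
{
        /* Runs via atexit(), i.e. only after main()/the command has returned. */
        if (event_list)
                printf("writing header for event %d\n", event_list[0]);
        else
                printf("event list already gone - nothing written\n");
}

int main(void)
{
        event_list = malloc(sizeof(*event_list));
        if (!event_list)
                return 1;
        event_list[0] = 42;

        atexit(write_header);

        /*
         * Wrong ordering: freeing here, before exit, is what run_builtin() was
         * effectively doing to the evsel list - the atexit hook then finds nothing.
         */
        free(event_list);
        event_list = NULL;

        return 0;       /* write_header() runs now, too late to see the list */
}

Moving perf_evsel_list__delete() out of run_builtin() and into the
individual commands' cleanup paths is the equivalent of deferring the
free() above until after the exit hook has done its work.
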
Reported-by: Thomas Renninger
Cc: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Thomas Renninger
Cc: Tom Zanussi
LKML-Reference:
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/builtin-record.c | 1 +
 tools/perf/builtin-stat.c   | 1 +
 tools/perf/builtin-top.c    | 1 +
 tools/perf/perf.c           | 2 --
 4 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 7069bd3e90b..aa7ece39765 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -480,6 +480,7 @@ static void atexit_header(void)
                 process_buildids();
                 perf_header__write(&session->header, output, true);
                 perf_session__delete(session);
+                perf_evsel_list__delete();
                 symbol__exit();
         }
 }
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c385a63ebfd..0ff11d9b13b 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -743,6 +743,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 out_free_fd:
         list_for_each_entry(pos, &evsel_list, node)
                 perf_evsel__free_stat_priv(pos);
+        perf_evsel_list__delete();
 out:
         thread_map__delete(threads);
         threads = NULL;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 6ce4042421b..4b995ee099c 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1490,6 +1490,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 out_free_fd:
         list_for_each_entry(pos, &evsel_list, node)
                 perf_evsel__free_mmap(pos);
+        perf_evsel_list__delete();
 
         return status;
 }
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 5b1ecd66bb3..595d0f4a710 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -286,8 +286,6 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
         status = p->fn(argc, argv, prefix);
         exit_browser(status);
 
-        perf_evsel_list__delete();
-
         if (status)
                 return status & 0xff;
 
-- 
cgit v1.2.3


From cc841580aa58ad7498b23e282859d07f8b721e24 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Tue, 11 Jan 2011 15:16:52 -0200
Subject: perf top: Fix annotate segv

Before, we had sym_counter, which was initialized to zero and used as an
index into the global attrs variable. Now we have a list of evsel entries,
and sym_counter became sym_evsel, which remained initialized to zero (NULL):
b00m.

Fix it by initializing it to the first entry in the evsel list.
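
list_entry() is the kernel's container_of() idiom: given a pointer to an
embedded struct list_head (here evsel_list.next, the first node on the
list), it computes the address of the structure containing it. A small
self-contained sketch of that idiom (simplified; the struct and variable
names below are illustrative, not the actual perf definitions):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

/* Same pointer arithmetic the kernel's list_entry()/container_of() performs. */
#define list_entry(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_evsel {
        int idx;
        struct list_head node;  /* linkage embedded in the containing struct */
};

int main(void)
{
        struct fake_evsel first = { .idx = 7 };
        struct list_head evsel_list = { .next = &first.node, .prev = &first.node };

        first.node.next = &evsel_list;
        first.node.prev = &evsel_list;

        /* The first entry is the container of the list head's ->next pointer. */
        struct fake_evsel *sym_evsel =
                list_entry(evsel_list.next, struct fake_evsel, node);

        printf("first evsel idx = %d\n", sym_evsel->idx); /* prints 7 */
        return 0;
}
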
Bug-introduced: 69aad6f
Reported-by: Kirill Smelkov
Tested-by: Kirill Smelkov
Cc: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Kirill Smelkov
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Tom Zanussi
LKML-Reference:
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/builtin-top.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 4b995ee099c..568b1950e63 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1473,6 +1473,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
                 pos->attr.sample_period = default_interval;
         }
 
+        sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node);
+
         symbol_conf.priv_size = (sizeof(struct sym_entry) +
                                  (nr_counters + 1) * sizeof(unsigned long));
 
-- 
cgit v1.2.3


From 4ad9f594d7199c99f6b1b3ef88c64bd5920a4592 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Tue, 11 Jan 2011 16:58:54 -0200
Subject: Revert "perf tools: Emit clearer message for sys_perf_event_open ENOENT return"

This reverts commit aa7bc7ef73efc46d7c3a0e185eefaf85744aec98. It removed the
fallback from hardware profiling to software profiling, e.g. in a VM with no
PMU.

Reported-by: David Ahern
Cc: David Ahern
Cc: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Tom Zanussi
LKML-Reference:
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/builtin-record.c | 3 ---
 tools/perf/builtin-top.c    | 2 --
 2 files changed, 5 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index aa7ece39765..1210e6484ad 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -331,9 +331,6 @@ try_again:
                 else if (err == ENODEV && cpu_list) {
                         die("No such device - did you specify"
                                 " an out-of-range profile CPU?\n");
-                } else if (err == ENOENT) {
-                        die("%s event is not supported. ",
-                            event_name(evsel));
                 } else if (err == EINVAL && sample_id_all_avail) {
                         /*
                          * Old kernel, no attr->sample_id_type_all field
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 568b1950e63..05344c6210a 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1247,8 +1247,6 @@ try_again:
                         die("Permission error - are you root?\n"
                                 "\t Consider tweaking"
                                 " /proc/sys/kernel/perf_event_paranoid.\n");
-                if (err == ENOENT)
-                        die("%s event is not supported. ", event_name(evsel));
                 /*
                  * If it's cycles then fall back to hrtimer
                  * based cpu-clock-tick sw counter, which
-- 
cgit v1.2.3


From 9378b63ccb32b9c071dab155c96357ad1e52a709 Mon Sep 17 00:00:00 2001
From: Tony Luck
Date: Wed, 12 Jan 2011 00:50:37 -0800
Subject: x86, ia64, acpi: Clean up x86-ism in drivers/acpi/numa.c

As pointed out by Linus, the CONFIG_X86 ifdef in drivers/acpi/numa.c is ugly.

Builds and boots on ia64 (both normally and with maxcpus=8 to limit the
number of cpus).

Signed-off-by: Tony Luck
Acked-by: Yinghai Lu
Cc: Linus Torvalds
Cc: Wu Fengguang
Cc: Bjorn Helgaas
Cc: Len Brown
LKML-Reference: <4D2D6B5D.4080208@kernel.org>
Signed-off-by: Ingo Molnar
---
 drivers/acpi/numa.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index d9926afec11..5eb25eb3ea4 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -275,23 +275,19 @@ acpi_table_parse_srat(enum acpi_srat_type id,
 int __init acpi_numa_init(void)
 {
         int ret = 0;
-        int nr_cpu_entries = nr_cpu_ids;
 
-#ifdef CONFIG_X86
         /*
          * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
          * SRAT cpu entries could have different order with that in MADT.
          * So go over all cpu entries in SRAT to get apicid to node mapping.
          */
-        nr_cpu_entries = MAX_LOCAL_APIC;
-#endif
 
         /* SRAT: Static Resource Affinity Table */
         if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
                 acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
-                                      acpi_parse_x2apic_affinity, nr_cpu_entries);
+                                      acpi_parse_x2apic_affinity, 0);
                 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
-                                      acpi_parse_processor_affinity, nr_cpu_entries);
+                                      acpi_parse_processor_affinity, 0);
                 ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
                                             acpi_parse_memory_affinity,
                                             NR_NODE_MEMBLKS);
-- 
cgit v1.2.3


From 5fdade95f26372154459347dfb9f60721d22cfc7 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre
Date: Tue, 11 Jan 2011 12:18:12 -0500
Subject: time: Rename misnamed minsec argument of clocks_calc_mult_shift()

The minsec argument to clocks_calc_mult_shift() is misnamed. It is used to
clamp the magnitude of the mult factor so that a multiplication with any
value in the given range won't overflow a 64 bit result. Let's rename it to
match the actual usage.

Signed-off-by: Nicolas Pitre
Acked-by: John Stultz
LKML-Reference:
Signed-off-by: Thomas Gleixner
---
 kernel/time/clocksource.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4..8588abcac07 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
  * @shift:  pointer to shift variable
  * @from:   frequency to convert from
  * @to:     frequency to convert to
- * @minsec: guaranteed runtime conversion range in seconds
+ * @maxsec: guaranteed runtime conversion range in seconds
  *
  * The function evaluates the shift/mult pair for the scaled math
  * operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
  * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
  * event @to is the counter frequency and @from is NSEC_PER_SEC.
  *
- * The @minsec conversion range argument controls the time frame in
+ * The @maxsec conversion range argument controls the time frame in
  * seconds which must be covered by the runtime conversion with the
  * calculated mult and shift factors. This guarantees that no 64bit
  * overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
 * chosing smaller mult and shift factors.
 */
void
-clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
 {
         u64 tmp;
         u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
          * Calculate the shift factor which is limiting the conversion
          * range:
          */
-        tmp = ((u64)minsec * from) >> 32;
+        tmp = ((u64)maxsec * from) >> 32;
         while (tmp) {
                 tmp >>=1;
                 sftacc--;
-- 
cgit v1.2.3


From afa14e7c553ebe45844d76208f66017a43abd0e2 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten
Date: Tue, 11 Jan 2011 17:59:38 -0600
Subject: timekeeping: Make local variables static

Signed-off-by: H Hartley Sweeten
Cc: John Stultz
LKML-Reference: <0D753D10438DA54287A00B027084269764CE0E54B7@AUSP01VMBX24.collaborationhost.net>
Signed-off-by: Thomas Gleixner
---
 kernel/time/timekeeping.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5bb86da8200..eef7452bd8a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -49,7 +49,7 @@ struct timekeeper {
         u32 mult;
 };
 
-struct timekeeper timekeeper;
+static struct timekeeper timekeeper;
 
 /**
  * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -164,7 +164,7 @@ static struct timespec total_sleep_time;
 /*
  * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
  */
-struct timespec raw_time;
+static struct timespec raw_time;
 
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
-- 
cgit v1.2.3


From 9710118bd4e7f3406865171cb9b9c94547c1c2f9 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Wed, 12 Jan 2011 10:29:05 +0100
Subject: perf sched: Fix list of events, dropping unsupported ':r' modifier

Looks to me like the :r modifier is not supported anymore, so remove it from
the list of events.

Cc: Corey Ashford
Cc: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Robert Richter
LKML-Reference:
Signed-off-by: Stephane Eranian
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/builtin-sched.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index abd4b8497bc..29e7ffd8569 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1843,15 +1843,15 @@ static const char *record_args[] = {
         "-f",
         "-m", "1024",
         "-c", "1",
-        "-e", "sched:sched_switch:r",
-        "-e", "sched:sched_stat_wait:r",
-        "-e", "sched:sched_stat_sleep:r",
-        "-e", "sched:sched_stat_iowait:r",
-        "-e", "sched:sched_stat_runtime:r",
-        "-e", "sched:sched_process_exit:r",
-        "-e", "sched:sched_process_fork:r",
-        "-e", "sched:sched_wakeup:r",
-        "-e", "sched:sched_migrate_task:r",
+        "-e", "sched:sched_switch",
+        "-e", "sched:sched_stat_wait",
+        "-e", "sched:sched_stat_sleep",
+        "-e", "sched:sched_stat_iowait",
+        "-e", "sched:sched_stat_runtime",
+        "-e", "sched:sched_process_exit",
+        "-e", "sched:sched_process_fork",
+        "-e", "sched:sched_wakeup",
+        "-e", "sched:sched_migrate_task",
 };
 
 static int __cmd_record(int argc, const char **argv)
-- 
cgit v1.2.3


From acac03fa15a8684bb60489ed87b5aae5258c0838 Mon Sep 17 00:00:00 2001
From: Kirill Smelkov
Date: Wed, 12 Jan 2011 17:59:36 +0300
Subject: perf record: Add "nodelay" mode, disabled by default

Sometimes there is a need to use perf in "live-log" mode. The problem is
that, for infrequent events, the actual output is largely delayed because
perf record reads sample data in whole pages.

So for such scenarios, add a flag for perf record to go into "nodelay" mode.
Use it with something like this, e.g. to track what's going on in icmp_rcv
while ping is running:

(1) $ perf probe -L icmp_rcv | grep -U8 '^ *43\>'
                        goto error;
                }

     38        if (!pskb_pull(skb, sizeof(*icmph)))
                        goto error;

                icmph = icmp_hdr(skb);

     43        ICMPMSGIN_INC_STATS_BH(net, icmph->type);
                /*
                 *      18 is the highest 'known' ICMP type. Anything else is a mystery
                 *
                 *      RFC 1122: 3.2.2  Unknown ICMP messages types MUST be silently
                 *                discarded.
                 */
     50        if (icmph->type > NR_ICMP_TYPES)
                        goto error;

    $ perf probe icmp_rcv:43 'type=icmph->type'

(2) $ cat trace-icmp.py
    [...]
    def trace_begin():
            print "in trace_begin"

    def trace_end():
            print "in trace_end"

    def probe__icmp_rcv(event_name, context, common_cpu,
                common_secs, common_nsecs, common_pid, common_comm,
                __probe_ip, type):
            print_header(event_name, common_cpu, common_secs, common_nsecs,
                    common_pid, common_comm)

            print "__probe_ip=%u, type=%u\n" % \
                    (__probe_ip, type),
    [...]

(3) $ perf record -a -D -e probe:icmp_rcv -o - | \
        perf script -i - -s trace-icmp.py

Thanks to Peter Zijlstra for pointing out how to do it.

Cc: Arnaldo Carvalho de Melo
Cc: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Tom Zanussi
LKML-Reference: <20110112140613.GA11698@tugrik.mns.mnsspb.ru>
Signed-off-by: Kirill Smelkov
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/Documentation/perf-record.txt | 3 +++
 tools/perf/builtin-record.c              | 8 ++++++++
 2 files changed, 11 insertions(+)

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 52462ae2645..e032716c839 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -61,6 +61,9 @@ OPTIONS
 -r::
 --realtime=::
         Collect data with this RT SCHED_FIFO priority.
+-D::
+--no-delay::
+        Collect data without buffering.
 -A::
 --append::
         Append to the output file to do incremental profiling.
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 1210e6484ad..df6064ad9bf 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -49,6 +49,7 @@ static int pipe_output = 0;
 static const char *output_name = "perf.data";
 static int group = 0;
 static int realtime_prio = 0;
+static bool nodelay = false;
 static bool raw_samples = false;
 static bool sample_id_all_avail = true;
 static bool system_wide = false;
@@ -307,6 +308,11 @@ static void create_counter(struct perf_evsel *evsel, int cpu)
                 attr->sample_type |= PERF_SAMPLE_CPU;
         }
 
+        if (nodelay) {
+                attr->watermark = 0;
+                attr->wakeup_events = 1;
+        }
+
         attr->mmap = track;
         attr->comm = track;
         attr->inherit = !no_inherit;
@@ -843,6 +849,8 @@ const struct option record_options[] = {
                     "record events on existing thread id"),
         OPT_INTEGER('r', "realtime", &realtime_prio,
                     "collect data with this RT SCHED_FIFO priority"),
+        OPT_BOOLEAN('D', "no-delay", &nodelay,
+                    "collect data without buffering"),
         OPT_BOOLEAN('R', "raw-samples", &raw_samples,
                     "collect raw sample records from all opened counters"),
         OPT_BOOLEAN('a', "all-cpus", &system_wide,
-- 
cgit v1.2.3


From c072a388d59a1d48e36864d0e66f42d71745be1c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 7 Jan 2011 02:33:47 -0800
Subject: rcu: demote SRCU_SYNCHRONIZE_DELAY from kernel-parameter status

Because the adaptive synchronize_srcu_expedited() approach has worked very
well in testing, remove the kernel parameter and replace it by a
C-preprocessor macro.

If someone finds problems with this approach, a more complex and aggressively
adaptive approach might be required.

Longer term, SRCU will be merged with the other RCU implementations, at which
point synchronize_srcu_expedited() will be event driven, just as
synchronize_sched_expedited() currently is. At that point, there will be no
need for this adaptive approach.

Reported-by: Linus Torvalds
Signed-off-by: Paul E. McKenney
---
 init/Kconfig  | 15 ---------------
 kernel/srcu.c | 15 +++++++++++++--
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 526ec1c7456..e11bc793a91 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -497,21 +497,6 @@ config RCU_BOOST_DELAY
 
           Accept the default if unsure.
 
-config SRCU_SYNCHRONIZE_DELAY
-        int "Microseconds to delay before waiting for readers"
-        range 0 20
-        default 10
-        help
-          This option controls how long SRCU delays before entering its
-          loop waiting on SRCU readers.  The purpose of this loop is
-          to avoid the unconditional context-switch penalty that would
-          otherwise be incurred if there was an active SRCU reader,
-          in a manner similar to adaptive locking schemes.  This should
-          be set to be a bit longer than the common-case SRCU read-side
-          critical-section overhead.
-
-          Accept the default if unsure.
-
 endmenu # "RCU Subsystem"
 
 config IKCONFIG
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 98d8c1e80ed..73ce23feaea 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -155,6 +155,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited().  We spin for a fixed time period
+ * (defined below) to allow SRCU readers to exit their read-side critical
+ * sections.  If there are still some readers after 10 microseconds,
+ * we repeatedly block for 1-millisecond time periods.  This approach
+ * has done well in testing, so there is no need for a config parameter.
+ */
+#define SYNCHRONIZE_SRCU_READER_DELAY 10
+
 /*
  * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
  */
@@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
          * will have finished executing.  We initially give readers
          * an arbitrarily chosen 10 microseconds to get out of their
          * SRCU read-side critical sections, then loop waiting 1/HZ
-         * seconds per iteration.
+         * seconds per iteration.  The 10-microsecond value has done
+         * very well in testing.
          */
 
         if (srcu_readers_active_idx(sp, idx))
-                udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
+                udelay(SYNCHRONIZE_SRCU_READER_DELAY);
 
         while (srcu_readers_active_idx(sp, idx))
                 schedule_timeout_interruptible(1);
-- 
cgit v1.2.3


From b24efdfdf679cf9b05947c531971905fc727dd40 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 12 Jan 2011 14:18:11 -0800
Subject: rcu: avoid pointless blocked-task warnings

If the RCU callback-processing kthread has nothing to do, it parks in a
wait_event(). If RCU remains idle for more than two minutes, the kernel
complains about this. This commit changes from wait_event() to
wait_event_interruptible() to prevent the kernel from complaining just
because RCU is idle.

Reported-by: Russell King
Signed-off-by: Paul E. McKenney
Signed-off-by: Paul E. McKenney
Tested-by: Thomas Weber
Tested-by: Russell King
---
 kernel/rcutiny.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 03449372474..0c343b9a46d 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -189,7 +189,8 @@ static int rcu_kthread(void *arg)
         unsigned long flags;
 
         for (;;) {
-                wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
+                wait_event_interruptible(rcu_kthread_wq,
+                                         have_rcu_kthread_work != 0);
                 morework = rcu_boost();
                 local_irq_save(flags);
                 work = have_rcu_kthread_work;
-- 
cgit v1.2.3


From c94fbe1d9e1e9b1a1f82eb0b53b1cf53bcf9712b Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 14 Jan 2011 11:25:58 -0500
Subject: tracing: Only process module tracepoints once

The commit:

  9f987b3141f086de27832514aad9f50a53f754
  tracing: Include module.h in define_trace.h

only solved half the problem. If the trace/events/module.h header is included
at the time define_trace.h is processed (or in ftrace.h within it), the
module.h TRACE_SYSTEM will override the current TRACE_SYSTEM macro.

Since define_trace.h is included when CREATE_TRACE_POINTS is set, and the
first thing it does is to #undef CREATE_TRACE_POINTS, placing the module.h
TRACE_SYSTEM inside a #ifdef CREATE_TRACE_POINTS prevents it from overriding
the TRACE_SYSTEM that is being processed, while still processing the module.h
tracepoints when the module code defines CREATE_TRACE_POINTS and includes the
trace/events/module.h header.

As with commit 9f987b3141, this is only an issue if module.h is not included
before the trace/events/.h file is included, which (luckily) has not happened
yet.

Reported-by: Peter Zijlstra
Signed-off-by: Steven Rostedt
---
 include/trace/events/module.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index c7bb2f0482f..c6bae36547e 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -1,5 +1,15 @@
+/*
+ * Because linux/module.h has tracepoints in the header, and ftrace.h
+ * eventually includes this file, define_trace.h includes linux/module.h
+ * But we do not want the module.h to override the TRACE_SYSTEM macro
+ * variable that define_trace.h is processing, so we only set it
+ * when module events are being processed, which would happen when
+ * CREATE_TRACE_POINTS is defined.
+ */
+#ifdef CREATE_TRACE_POINTS
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM module
+#endif
 
 #if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_MODULE_H
-- 
cgit v1.2.3


From 62627bec8a601c5679bf3d20a2096a1206d61b71 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Fri, 14 Jan 2011 09:06:28 -0800
Subject: x86: tsc: Fix calibration refinement conditionals to avoid divide by zero

Konrad Wilk reported that the new delayed calibration crashes with a divide
by zero on Xen. The reason is that Xen sets the pmtimer address, but reading
from it returns 0xffffff. That results in ref_start and ref_stop being the
same value, so the delta is zero, which causes the divide by zero later in
the calculation.

The conditional (!hpet && !ref_start && !ref_stop), which sanity checks the
calibration reference values, doesn't really make sense: if the refs are
null but hpet is on, we still want to break out. The divide by zero could
also be triggered by chance if both reads from the hardware returned exactly
the same value (due to hardware wrapping).

So checking whether both ref values are the same handles both the case where
we don't have the hardware (both null) and the case where they happen to be
the same value (either because of invalid hardware or by chance), avoiding
the divide by zero.

So checking if both the ref values are the same should handle if we don't have hardware (both null) or if they are the same value (either by invalid hardware, or by chance), avoiding the div by zero issue. [ tglx: Applied the same fix to native_calibrate_tsc() where this check was copied from ] Reported-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk Signed-off-by: John Stultz LKML-Reference: <1295024788-15619-1-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 463901efdba..ae09f970e62 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -464,7 +464,7 @@ unsigned long native_calibrate_tsc(void) tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); /* hpet or pmtimer available ? */ - if (!hpet && !ref1 && !ref2) + if (ref1 == ref2) continue; /* Check, whether the sampling was disturbed by an SMI */ @@ -935,7 +935,7 @@ static void tsc_refine_calibration_work(struct work_struct *work) tsc_stop = tsc_read_refs(&ref_stop, hpet); /* hpet or pmtimer available ? */ - if (!hpet && !ref_start && !ref_stop) + if (ref_start == ref_stop) goto out; /* Check, whether the sampling was disturbed by an SMI */ -- cgit v1.2.3 From 6550904ddbc3c286798a87edf95eeebcc62bc58a Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 13 Jan 2011 16:06:44 -0800 Subject: x86, mrst: Set correct APB timer IRQ affinity for secondary cpu Offlining the secondary CPU causes the timer irq affinity to be set to CPU 0. When the secondary CPU is back online again, the wrong irq affinity will be used. This patch ensures secondary per CPU timer always has the correct IRQ affinity when enabled. Signed-off-by: Jacob Pan LKML-Reference: <1294963604-18111-1-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. Peter Anvin Cc: 2.6.37 --- arch/x86/kernel/apb_timer.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 7c9ab59653e..51ef31a89be 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -313,14 +313,16 @@ static void apbt_setup_irq(struct apbt_dev *adev) if (adev->irq == 0) return; + irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); + /* APB timer irqs are set up as mp_irqs, timer is edge type */ + __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); + if (system_state == SYSTEM_BOOTING) { - irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); - irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); - /* APB timer irqs are set up as mp_irqs, timer is edge type */ - __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); if (request_irq(adev->irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - adev->name, adev)) { + IRQF_TIMER | IRQF_DISABLED | + IRQF_NOBALANCING, + adev->name, adev)) { printk(KERN_ERR "Failed request IRQ for APBT%d\n", adev->num); } -- cgit v1.2.3 From 76d1f7bfcd5872056902c5a88b5fcd5d4d00a7a9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 14 Jan 2011 11:57:06 -0800 Subject: x86, olpc: Add missing Kconfig dependencies OLPC uses select for OLPC_OPENFIRMWARE, which means OLPC has to enforce the dependencies for OLPC_OPENFIRMWARE. Make sure it does so. Signed-off-by: H. 
Cc: Daniel Drake
Cc: Andres Salomon
Cc: Grant Likely
LKML-Reference: <20100923162846.D8D409D401B@zog.reactivated.net>
Cc: 2.6.37
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b6fccb07123..184bc887279 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2064,6 +2064,7 @@ config OLPC
         bool "One Laptop Per Child support"
         select GPIOLIB
         select OLPC_OPENFIRMWARE
+        depends on !X86_64 && !X86_PAE
         ---help---
           Add support for detecting the unique features of the OLPC
           XO hardware.
-- 
cgit v1.2.3


From 7f85803a26f304e698c673838aab06cc6d4d6e59 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Fri, 14 Jan 2011 17:49:02 +0800
Subject: tracing: Remove syscall_exit_fields

There is no need for syscall_exit_fields, as the syscall exit event class can
already host the fields in its own structure, like most other trace events do
by default. Use that default behavior instead.

Following this scheme, we no longer need to override the get_fields()
callback of the syscall exit event class either. Hence both
syscall_exit_fields and syscall_get_exit_fields() can be removed.

Also changed some indentation to keep the following under 80 characters:

".fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),"

Acked-by: Frederic Weisbecker
Signed-off-by: Lai Jiangshan
LKML-Reference: <4D301C0E.8090408@cn.fujitsu.com>
Signed-off-by: Steven Rostedt
---
 kernel/trace/trace_syscalls.c | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb..b706529b4fc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
 static int syscall_enter_define_fields(struct ftrace_event_call *call);
 static int syscall_exit_define_fields(struct ftrace_event_call *call);
 
-/* All syscall exit events have the same fields */
-static LIST_HEAD(syscall_exit_fields);
-
 static struct list_head *
 syscall_get_enter_fields(struct ftrace_event_call *call)
 {
@@ -34,34 +31,28 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
         return &entry->enter_fields;
 }
 
-static struct list_head *
-syscall_get_exit_fields(struct ftrace_event_call *call)
-{
-        return &syscall_exit_fields;
-}
-
 struct trace_event_functions enter_syscall_print_funcs = {
-        .trace          = print_syscall_enter,
+        .trace          = print_syscall_enter,
 };
 
 struct trace_event_functions exit_syscall_print_funcs = {
-        .trace          = print_syscall_exit,
+        .trace          = print_syscall_exit,
 };
 
 struct ftrace_event_class event_class_syscall_enter = {
-        .system         = "syscalls",
-        .reg            = syscall_enter_register,
-        .define_fields  = syscall_enter_define_fields,
-        .get_fields     = syscall_get_enter_fields,
-        .raw_init       = init_syscall_trace,
+        .system         = "syscalls",
+        .reg            = syscall_enter_register,
+        .define_fields  = syscall_enter_define_fields,
+        .get_fields     = syscall_get_enter_fields,
+        .raw_init       = init_syscall_trace,
 };
 
 struct ftrace_event_class event_class_syscall_exit = {
-        .system         = "syscalls",
-        .reg            = syscall_exit_register,
-        .define_fields  = syscall_exit_define_fields,
-        .get_fields     = syscall_get_exit_fields,
-        .raw_init       = init_syscall_trace,
+        .system         = "syscalls",
+        .reg            = syscall_exit_register,
+        .define_fields  = syscall_exit_define_fields,
+        .fields         = LIST_HEAD_INIT(event_class_syscall_exit.fields),
+        .raw_init       = init_syscall_trace,
 };
 
 extern unsigned long __start_syscalls_metadata[];
-- 
cgit v1.2.3