From f123c98e7f168e949b283690693695f988332c3d Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 6 Jan 2011 15:08:29 -0500
Subject: rtmutex: Fix comment about why new_owner can be NULL in wake_futex_pi()

The comment explaining why rt_mutex_next_owner() can return NULL in
wake_futex_pi() describes a case that is not the normal one. Tracing shows
that the more likely cause is that the waiter simply timed out. But because
the waiter originally caused contention on the futex, the owner will go into
the kernel when it unlocks the lock. Then it will hit this code path and
rt_mutex_next_owner() will return NULL.

Cc: Thomas Gleixner
Signed-off-by: Steven Rostedt
Signed-off-by: Ingo Molnar
---
 kernel/futex.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92e691..5696d38cc71 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -791,10 +791,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 
         /*
-         * This happens when we have stolen the lock and the original
-         * pending owner did not enqueue itself back on the rt_mutex.
-         * Thats not a tragedy. We know that way, that a lock waiter
-         * is on the fly. We make the futex_q waiter the pending owner.
+         * It is possible that the next waiter (the one that brought
+         * this owner to the kernel) timed out and is no longer
+         * waiting on the lock.
          */
         if (!new_owner)
                 new_owner = this->task;
-- 
cgit v1.2.3


From bd3bfe9eda94d3c050830217c1e1c338808de5b2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Tue, 11 Jan 2011 12:42:00 -0200
Subject: perf evsel: Fix order of event list deletion

We need to defer calling perf_evsel_list__delete() until after the
atexit-registered routines have run, because we still need to traverse the
events being recorded at that time, at least on 'perf record'.

This fixes the problem reported by Thomas Renninger where cmd_record, called
by cmd_timechart, would not write the tracing data to the perf.data file
header: at atexit time (control+C on 'perf timechart record') the evsel_list
would already be empty, having been deleted by run_builtin(), and 'perf
timechart' would then die with "no trace data in the file" when trying to
process such a perf.data file.

Problem introduced in 70d544d.
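
The ordering hazard being fixed is the usual one with atexit(): hooks
registered with atexit() run only after the command function has already
returned, so any shared state they rely on must not be torn down on that
return path. A minimal standalone C sketch of the pattern (illustrative
only, not taken from the perf sources; the names are made up):

#include <stdio.h>
#include <stdlib.h>

static int *event_list;   /* stands in for the evsel list shared with the exit hook */

static void write_header(void)
{
        /* Runs via atexit(), i.e. only after main()/the command has returned. */
        if (event_list)
                printf("writing header for event %d\n", event_list[0]);
        else
                printf("event list already gone - nothing written\n");
}

int main(void)
{
        event_list = malloc(sizeof(*event_list));
        if (!event_list)
                return 1;
        event_list[0] = 42;

        atexit(write_header);

        /*
         * Wrong ordering: freeing here, before exit, is what run_builtin() was
         * effectively doing to the evsel list - the atexit hook then finds nothing.
         */
        free(event_list);
        event_list = NULL;

        return 0;       /* write_header() runs now, too late to see the list */
}

Moving perf_evsel_list__delete() out of run_builtin() and into the
individual commands' cleanup paths is the equivalent of deferring the
free() above until after the exit hook has done its work.
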
Reported-by: Thomas Renninger
Cc: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Thomas Renninger
Cc: Tom Zanussi
LKML-Reference:
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/builtin-record.c | 1 +
 tools/perf/builtin-stat.c   | 1 +
 tools/perf/builtin-top.c    | 1 +
 tools/perf/perf.c           | 2 --
 4 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 7069bd3e90b..aa7ece39765 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -480,6 +480,7 @@ static void atexit_header(void)
                 process_buildids();
                 perf_header__write(&session->header, output, true);
                 perf_session__delete(session);
+                perf_evsel_list__delete();
                 symbol__exit();
         }
 }
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c385a63ebfd..0ff11d9b13b 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -743,6 +743,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 out_free_fd:
         list_for_each_entry(pos, &evsel_list, node)
                 perf_evsel__free_stat_priv(pos);
+        perf_evsel_list__delete();
 out:
         thread_map__delete(threads);
         threads = NULL;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 6ce4042421b..4b995ee099c 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1490,6 +1490,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 out_free_fd:
         list_for_each_entry(pos, &evsel_list, node)
                 perf_evsel__free_mmap(pos);
+        perf_evsel_list__delete();
 
         return status;
 }
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 5b1ecd66bb3..595d0f4a710 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -286,8 +286,6 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
         status = p->fn(argc, argv, prefix);
         exit_browser(status);
 
-        perf_evsel_list__delete();
-
         if (status)
                 return status & 0xff;
 
-- 
cgit v1.2.3


From cc841580aa58ad7498b23e282859d07f8b721e24 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Tue, 11 Jan 2011 15:16:52 -0200
Subject: perf top: Fix annotate segv

Before, we had sym_counter, which was initialized to zero and used as an
index into the global attrs variable. Now we have a list of evsel entries,
and sym_counter became sym_evsel, which remained initialized to zero (NULL):
b00m.

Fix it by initializing it to the first entry in the evsel list.
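
list_entry() is the kernel's container_of() idiom: given a pointer to an
embedded struct list_head (here evsel_list.next, the first node on the
list), it computes the address of the structure containing it. A small
self-contained sketch of that idiom (simplified; the struct and variable
names below are illustrative, not the actual perf definitions):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

/* Same pointer arithmetic the kernel's list_entry()/container_of() performs. */
#define list_entry(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_evsel {
        int idx;
        struct list_head node;  /* linkage embedded in the containing struct */
};

int main(void)
{
        struct fake_evsel first = { .idx = 7 };
        struct list_head evsel_list = { .next = &first.node, .prev = &first.node };

        first.node.next = &evsel_list;
        first.node.prev = &evsel_list;

        /* The first entry is the container of the list head's ->next pointer. */
        struct fake_evsel *sym_evsel =
                list_entry(evsel_list.next, struct fake_evsel, node);

        printf("first evsel idx = %d\n", sym_evsel->idx); /* prints 7 */
        return 0;
}
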
Bug-introduced: 69aad6f
Reported-by: Kirill Smelkov
Tested-by: Kirill Smelkov
Cc: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Kirill Smelkov
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Tom Zanussi
LKML-Reference:
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/builtin-top.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 4b995ee099c..568b1950e63 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1473,6 +1473,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
                 pos->attr.sample_period = default_interval;
         }
 
+        sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node);
+
         symbol_conf.priv_size = (sizeof(struct sym_entry) +
                                  (nr_counters + 1) * sizeof(unsigned long));
 
-- 
cgit v1.2.3


From 4ad9f594d7199c99f6b1b3ef88c64bd5920a4592 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Tue, 11 Jan 2011 16:58:54 -0200
Subject: Revert "perf tools: Emit clearer message for sys_perf_event_open ENOENT return"

This reverts commit aa7bc7ef73efc46d7c3a0e185eefaf85744aec98. It removed the
fallback from hardware profiling to software profiling, e.g. in a VM with no
PMU.

Reported-by: David Ahern
Cc: David Ahern
Cc: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Tom Zanussi
LKML-Reference:
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/builtin-record.c | 3 ---
 tools/perf/builtin-top.c    | 2 --
 2 files changed, 5 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index aa7ece39765..1210e6484ad 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -331,9 +331,6 @@ try_again:
                 else if (err == ENODEV && cpu_list) {
                         die("No such device - did you specify"
                                 " an out-of-range profile CPU?\n");
-                } else if (err == ENOENT) {
-                        die("%s event is not supported. ",
-                            event_name(evsel));
                 } else if (err == EINVAL && sample_id_all_avail) {
                         /*
                          * Old kernel, no attr->sample_id_type_all field
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 568b1950e63..05344c6210a 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1247,8 +1247,6 @@ try_again:
                         die("Permission error - are you root?\n"
                                 "\t Consider tweaking"
                                 " /proc/sys/kernel/perf_event_paranoid.\n");
-                if (err == ENOENT)
-                        die("%s event is not supported. ", event_name(evsel));
                 /*
                  * If it's cycles then fall back to hrtimer
                  * based cpu-clock-tick sw counter, which
-- 
cgit v1.2.3


From 9378b63ccb32b9c071dab155c96357ad1e52a709 Mon Sep 17 00:00:00 2001
From: Tony Luck
Date: Wed, 12 Jan 2011 00:50:37 -0800
Subject: x86, ia64, acpi: Clean up x86-ism in drivers/acpi/numa.c

As pointed out by Linus, the CONFIG_X86 ifdef in drivers/acpi/numa.c is ugly.

Builds and boots on ia64 (both normally and with maxcpus=8 to limit the
number of cpus).

Signed-off-by: Tony Luck
Acked-by: Yinghai Lu
Cc: Linus Torvalds
Cc: Wu Fengguang
Cc: Bjorn Helgaas
Cc: Len Brown
LKML-Reference: <4D2D6B5D.4080208@kernel.org>
Signed-off-by: Ingo Molnar
---
 drivers/acpi/numa.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index d9926afec11..5eb25eb3ea4 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -275,23 +275,19 @@ acpi_table_parse_srat(enum acpi_srat_type id,
 int __init acpi_numa_init(void)
 {
         int ret = 0;
-        int nr_cpu_entries = nr_cpu_ids;
 
-#ifdef CONFIG_X86
         /*
          * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
          * SRAT cpu entries could have different order with that in MADT.
          * So go over all cpu entries in SRAT to get apicid to node mapping.
          */
-        nr_cpu_entries = MAX_LOCAL_APIC;
-#endif
 
         /* SRAT: Static Resource Affinity Table */
         if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
                 acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
-                                      acpi_parse_x2apic_affinity, nr_cpu_entries);
+                                      acpi_parse_x2apic_affinity, 0);
                 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
-                                      acpi_parse_processor_affinity, nr_cpu_entries);
+                                      acpi_parse_processor_affinity, 0);
                 ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
                                             acpi_parse_memory_affinity,
                                             NR_NODE_MEMBLKS);
-- 
cgit v1.2.3


From 5fdade95f26372154459347dfb9f60721d22cfc7 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre
Date: Tue, 11 Jan 2011 12:18:12 -0500
Subject: time: Rename misnamed minsec argument of clocks_calc_mult_shift()

The minsec argument to clocks_calc_mult_shift() is misnamed. It is used to
clamp the magnitude of the mult factor so that a multiplication with any
value in the given range won't overflow a 64 bit result. Let's rename it to
match the actual usage.

Signed-off-by: Nicolas Pitre
Acked-by: John Stultz
LKML-Reference:
Signed-off-by: Thomas Gleixner
---
 kernel/time/clocksource.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4..8588abcac07 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
  * @shift:  pointer to shift variable
  * @from:   frequency to convert from
  * @to:     frequency to convert to
- * @minsec: guaranteed runtime conversion range in seconds
+ * @maxsec: guaranteed runtime conversion range in seconds
  *
  * The function evaluates the shift/mult pair for the scaled math
  * operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
  * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
  * event @to is the counter frequency and @from is NSEC_PER_SEC.
  *
- * The @minsec conversion range argument controls the time frame in
+ * The @maxsec conversion range argument controls the time frame in
  * seconds which must be covered by the runtime conversion with the
  * calculated mult and shift factors. This guarantees that no 64bit
  * overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
 * chosing smaller mult and shift factors.
 */
void
-clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
 {
         u64 tmp;
         u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
          * Calculate the shift factor which is limiting the conversion
          * range:
          */
-        tmp = ((u64)minsec * from) >> 32;
+        tmp = ((u64)maxsec * from) >> 32;
         while (tmp) {
                 tmp >>=1;
                 sftacc--;
-- 
cgit v1.2.3


From afa14e7c553ebe45844d76208f66017a43abd0e2 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten
Date: Tue, 11 Jan 2011 17:59:38 -0600
Subject: timekeeping: Make local variables static

Signed-off-by: H Hartley Sweeten
Cc: John Stultz
LKML-Reference: <0D753D10438DA54287A00B027084269764CE0E54B7@AUSP01VMBX24.collaborationhost.net>
Signed-off-by: Thomas Gleixner
---
 kernel/time/timekeeping.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5bb86da8200..eef7452bd8a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -49,7 +49,7 @@ struct timekeeper {
         u32 mult;
 };
 
-struct timekeeper timekeeper;
+static struct timekeeper timekeeper;
 
 /**
  * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -164,7 +164,7 @@ static struct timespec total_sleep_time;
 /*
  * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
  */
-struct timespec raw_time;
+static struct timespec raw_time;
 
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
-- 
cgit v1.2.3


From 9710118bd4e7f3406865171cb9b9c94547c1c2f9 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Wed, 12 Jan 2011 10:29:05 +0100
Subject: perf sched: Fix list of events, dropping unsupported ':r' modifier

Looks to me like the :r modifier is not supported anymore, so remove it from
the list of events.

Cc: Corey Ashford
Cc: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Robert Richter
LKML-Reference:
Signed-off-by: Stephane Eranian
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/builtin-sched.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index abd4b8497bc..29e7ffd8569 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1843,15 +1843,15 @@ static const char *record_args[] = {
         "-f",
         "-m", "1024",
         "-c", "1",
-        "-e", "sched:sched_switch:r",
-        "-e", "sched:sched_stat_wait:r",
-        "-e", "sched:sched_stat_sleep:r",
-        "-e", "sched:sched_stat_iowait:r",
-        "-e", "sched:sched_stat_runtime:r",
-        "-e", "sched:sched_process_exit:r",
-        "-e", "sched:sched_process_fork:r",
-        "-e", "sched:sched_wakeup:r",
-        "-e", "sched:sched_migrate_task:r",
+        "-e", "sched:sched_switch",
+        "-e", "sched:sched_stat_wait",
+        "-e", "sched:sched_stat_sleep",
+        "-e", "sched:sched_stat_iowait",
+        "-e", "sched:sched_stat_runtime",
+        "-e", "sched:sched_process_exit",
+        "-e", "sched:sched_process_fork",
+        "-e", "sched:sched_wakeup",
+        "-e", "sched:sched_migrate_task",
 };
 
 static int __cmd_record(int argc, const char **argv)
-- 
cgit v1.2.3


From acac03fa15a8684bb60489ed87b5aae5258c0838 Mon Sep 17 00:00:00 2001
From: Kirill Smelkov
Date: Wed, 12 Jan 2011 17:59:36 +0300
Subject: perf record: Add "nodelay" mode, disabled by default

Sometimes there is a need to use perf in "live-log" mode. The problem is
that, for infrequent events, the actual output is largely delayed because
perf record reads sample data in whole pages.

So for such scenarios, add a flag for perf record to go into "nodelay" mode.
Use it with something like this, e.g. to track what's going on in icmp_rcv
while ping is running:

(1) $ perf probe -L icmp_rcv | grep -U8 '^ *43\>'
                        goto error;
                }

     38        if (!pskb_pull(skb, sizeof(*icmph)))
                        goto error;

                icmph = icmp_hdr(skb);

     43        ICMPMSGIN_INC_STATS_BH(net, icmph->type);
                /*
                 *      18 is the highest 'known' ICMP type. Anything else is a mystery
                 *
                 *      RFC 1122: 3.2.2  Unknown ICMP messages types MUST be silently
                 *                discarded.
                 */
     50        if (icmph->type > NR_ICMP_TYPES)
                        goto error;

    $ perf probe icmp_rcv:43 'type=icmph->type'

(2) $ cat trace-icmp.py
    [...]
    def trace_begin():
            print "in trace_begin"

    def trace_end():
            print "in trace_end"

    def probe__icmp_rcv(event_name, context, common_cpu,
                common_secs, common_nsecs, common_pid, common_comm,
                __probe_ip, type):
            print_header(event_name, common_cpu, common_secs, common_nsecs,
                    common_pid, common_comm)

            print "__probe_ip=%u, type=%u\n" % \
                    (__probe_ip, type),
    [...]

(3) $ perf record -a -D -e probe:icmp_rcv -o - | \
        perf script -i - -s trace-icmp.py

Thanks to Peter Zijlstra for pointing out how to do it.

Cc: Arnaldo Carvalho de Melo
Cc: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Tom Zanussi
LKML-Reference: <20110112140613.GA11698@tugrik.mns.mnsspb.ru>
Signed-off-by: Kirill Smelkov
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/Documentation/perf-record.txt | 3 +++
 tools/perf/builtin-record.c              | 8 ++++++++
 2 files changed, 11 insertions(+)

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 52462ae2645..e032716c839 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -61,6 +61,9 @@ OPTIONS
 -r::
 --realtime=::
         Collect data with this RT SCHED_FIFO priority.
+-D::
+--no-delay::
+        Collect data without buffering.
 -A::
 --append::
         Append to the output file to do incremental profiling.
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 1210e6484ad..df6064ad9bf 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -49,6 +49,7 @@ static int pipe_output = 0;
 static const char *output_name = "perf.data";
 static int group = 0;
 static int realtime_prio = 0;
+static bool nodelay = false;
 static bool raw_samples = false;
 static bool sample_id_all_avail = true;
 static bool system_wide = false;
@@ -307,6 +308,11 @@ static void create_counter(struct perf_evsel *evsel, int cpu)
                 attr->sample_type |= PERF_SAMPLE_CPU;
         }
 
+        if (nodelay) {
+                attr->watermark = 0;
+                attr->wakeup_events = 1;
+        }
+
         attr->mmap = track;
         attr->comm = track;
         attr->inherit = !no_inherit;
@@ -843,6 +849,8 @@ const struct option record_options[] = {
                     "record events on existing thread id"),
         OPT_INTEGER('r', "realtime", &realtime_prio,
                     "collect data with this RT SCHED_FIFO priority"),
+        OPT_BOOLEAN('D', "no-delay", &nodelay,
+                    "collect data without buffering"),
         OPT_BOOLEAN('R', "raw-samples", &raw_samples,
                     "collect raw sample records from all opened counters"),
         OPT_BOOLEAN('a', "all-cpus", &system_wide,
-- 
cgit v1.2.3


From c072a388d59a1d48e36864d0e66f42d71745be1c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 7 Jan 2011 02:33:47 -0800
Subject: rcu: demote SRCU_SYNCHRONIZE_DELAY from kernel-parameter status

Because the adaptive synchronize_srcu_expedited() approach has worked very
well in testing, remove the kernel parameter and replace it by a
C-preprocessor macro.

If someone finds problems with this approach, a more complex and aggressively
adaptive approach might be required.

Longer term, SRCU will be merged with the other RCU implementations, at which
point synchronize_srcu_expedited() will be event driven, just as
synchronize_sched_expedited() currently is. At that point, there will be no
need for this adaptive approach.

Reported-by: Linus Torvalds
Signed-off-by: Paul E. McKenney
---
 init/Kconfig  | 15 ---------------
 kernel/srcu.c | 15 +++++++++++++--
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 526ec1c7456..e11bc793a91 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -497,21 +497,6 @@ config RCU_BOOST_DELAY
 
           Accept the default if unsure.
 
-config SRCU_SYNCHRONIZE_DELAY
-        int "Microseconds to delay before waiting for readers"
-        range 0 20
-        default 10
-        help
-          This option controls how long SRCU delays before entering its
-          loop waiting on SRCU readers.  The purpose of this loop is
-          to avoid the unconditional context-switch penalty that would
-          otherwise be incurred if there was an active SRCU reader,
-          in a manner similar to adaptive locking schemes.  This should
-          be set to be a bit longer than the common-case SRCU read-side
-          critical-section overhead.
-
-          Accept the default if unsure.
-
 endmenu # "RCU Subsystem"
 
 config IKCONFIG
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 98d8c1e80ed..73ce23feaea 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -155,6 +155,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited().  We spin for a fixed time period
+ * (defined below) to allow SRCU readers to exit their read-side critical
+ * sections.  If there are still some readers after 10 microseconds,
+ * we repeatedly block for 1-millisecond time periods.  This approach
+ * has done well in testing, so there is no need for a config parameter.
+ */
+#define SYNCHRONIZE_SRCU_READER_DELAY 10
+
 /*
  * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
  */
@@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
          * will have finished executing.  We initially give readers
          * an arbitrarily chosen 10 microseconds to get out of their
          * SRCU read-side critical sections, then loop waiting 1/HZ
-         * seconds per iteration.
+         * seconds per iteration.  The 10-microsecond value has done
+         * very well in testing.
          */
 
         if (srcu_readers_active_idx(sp, idx))
-                udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
+                udelay(SYNCHRONIZE_SRCU_READER_DELAY);
 
         while (srcu_readers_active_idx(sp, idx))
                 schedule_timeout_interruptible(1);
-- 
cgit v1.2.3


From b24efdfdf679cf9b05947c531971905fc727dd40 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 12 Jan 2011 14:18:11 -0800
Subject: rcu: avoid pointless blocked-task warnings

If the RCU callback-processing kthread has nothing to do, it parks in a
wait_event(). If RCU remains idle for more than two minutes, the kernel
complains about this. This commit changes from wait_event() to
wait_event_interruptible() to prevent the kernel from complaining just
because RCU is idle.

Reported-by: Russell King
Signed-off-by: Paul E. McKenney
Signed-off-by: Paul E. McKenney
Tested-by: Thomas Weber
Tested-by: Russell King
---
 kernel/rcutiny.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 03449372474..0c343b9a46d 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -189,7 +189,8 @@ static int rcu_kthread(void *arg)
         unsigned long flags;
 
         for (;;) {
-                wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
+                wait_event_interruptible(rcu_kthread_wq,
+                                         have_rcu_kthread_work != 0);
                 morework = rcu_boost();
                 local_irq_save(flags);
                 work = have_rcu_kthread_work;
-- 
cgit v1.2.3


From c94fbe1d9e1e9b1a1f82eb0b53b1cf53bcf9712b Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 14 Jan 2011 11:25:58 -0500
Subject: tracing: Only process module tracepoints once

The commit:

  9f987b3141f086de27832514aad9f50a53f754
  tracing: Include module.h in define_trace.h

only solved half the problem. If the trace/events/module.h header is included
at the time define_trace.h is processed (or in ftrace.h within it), the
module.h TRACE_SYSTEM will override the current TRACE_SYSTEM macro.

Since define_trace.h is included when CREATE_TRACE_POINTS is set, and the
first thing it does is to #undef CREATE_TRACE_POINTS, placing the module.h
TRACE_SYSTEM inside a #ifdef CREATE_TRACE_POINTS prevents it from overriding
the TRACE_SYSTEM that is being processed, while still processing the module.h
tracepoints when the module code defines CREATE_TRACE_POINTS and includes the
trace/events/module.h header.

As with commit 9f987b3141, this is only an issue if module.h is not included
before the trace/events/.h file is included, which (luckily) has not happened
yet.

Reported-by: Peter Zijlstra
Signed-off-by: Steven Rostedt
---
 include/trace/events/module.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index c7bb2f0482f..c6bae36547e 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -1,5 +1,15 @@
+/*
+ * Because linux/module.h has tracepoints in the header, and ftrace.h
+ * eventually includes this file, define_trace.h includes linux/module.h
+ * But we do not want the module.h to override the TRACE_SYSTEM macro
+ * variable that define_trace.h is processing, so we only set it
+ * when module events are being processed, which would happen when
+ * CREATE_TRACE_POINTS is defined.
+ */
+#ifdef CREATE_TRACE_POINTS
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM module
+#endif
 
 #if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_MODULE_H
-- 
cgit v1.2.3


From 62627bec8a601c5679bf3d20a2096a1206d61b71 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Fri, 14 Jan 2011 09:06:28 -0800
Subject: x86: tsc: Fix calibration refinement conditionals to avoid divide by zero

Konrad Wilk reported that the new delayed calibration crashes with a divide
by zero on Xen. The reason is that Xen sets the pmtimer address, but reading
from it returns 0xffffff. That results in ref_start and ref_stop being the
same value, so the delta is zero, which causes the divide by zero later in
the calculation.

The conditional (!hpet && !ref_start && !ref_stop), which sanity checks the
calibration reference values, doesn't really make sense: if the refs are
null but hpet is on, we still want to break out. The divide by zero could
also be triggered by chance if both reads from the hardware returned exactly
the same value (due to hardware wrapping).

So checking whether both ref values are the same handles both the case where
we don't have the hardware (both null) and the case where they happen to be
the same value (either because of invalid hardware or by chance), avoiding
the divide by zero.

So checking if both the ref values are the same should handle if we don't have hardware (both null) or if they are the same value (either by invalid hardware, or by chance), avoiding the div by zero issue. [ tglx: Applied the same fix to native_calibrate_tsc() where this check was copied from ] Reported-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk Signed-off-by: John Stultz LKML-Reference: <1295024788-15619-1-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 463901efdba..ae09f970e62 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -464,7 +464,7 @@ unsigned long native_calibrate_tsc(void) tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); /* hpet or pmtimer available ? */ - if (!hpet && !ref1 && !ref2) + if (ref1 == ref2) continue; /* Check, whether the sampling was disturbed by an SMI */ @@ -935,7 +935,7 @@ static void tsc_refine_calibration_work(struct work_struct *work) tsc_stop = tsc_read_refs(&ref_stop, hpet); /* hpet or pmtimer available ? */ - if (!hpet && !ref_start && !ref_stop) + if (ref_start == ref_stop) goto out; /* Check, whether the sampling was disturbed by an SMI */ -- cgit v1.2.3 From 6550904ddbc3c286798a87edf95eeebcc62bc58a Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 13 Jan 2011 16:06:44 -0800 Subject: x86, mrst: Set correct APB timer IRQ affinity for secondary cpu Offlining the secondary CPU causes the timer irq affinity to be set to CPU 0. When the secondary CPU is back online again, the wrong irq affinity will be used. This patch ensures secondary per CPU timer always has the correct IRQ affinity when enabled. Signed-off-by: Jacob Pan LKML-Reference: <1294963604-18111-1-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. Peter Anvin Cc: 2.6.37 --- arch/x86/kernel/apb_timer.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 7c9ab59653e..51ef31a89be 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -313,14 +313,16 @@ static void apbt_setup_irq(struct apbt_dev *adev) if (adev->irq == 0) return; + irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); + /* APB timer irqs are set up as mp_irqs, timer is edge type */ + __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); + if (system_state == SYSTEM_BOOTING) { - irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); - irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); - /* APB timer irqs are set up as mp_irqs, timer is edge type */ - __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); if (request_irq(adev->irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - adev->name, adev)) { + IRQF_TIMER | IRQF_DISABLED | + IRQF_NOBALANCING, + adev->name, adev)) { printk(KERN_ERR "Failed request IRQ for APBT%d\n", adev->num); } -- cgit v1.2.3 From 76d1f7bfcd5872056902c5a88b5fcd5d4d00a7a9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 14 Jan 2011 11:57:06 -0800 Subject: x86, olpc: Add missing Kconfig dependencies OLPC uses select for OLPC_OPENFIRMWARE, which means OLPC has to enforce the dependencies for OLPC_OPENFIRMWARE. Make sure it does so. Signed-off-by: H. 
Cc: Daniel Drake
Cc: Andres Salomon
Cc: Grant Likely
LKML-Reference: <20100923162846.D8D409D401B@zog.reactivated.net>
Cc: 2.6.37
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b6fccb07123..184bc887279 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2064,6 +2064,7 @@ config OLPC
         bool "One Laptop Per Child support"
         select GPIOLIB
         select OLPC_OPENFIRMWARE
+        depends on !X86_64 && !X86_PAE
         ---help---
           Add support for detecting the unique features of the OLPC
           XO hardware.
-- 
cgit v1.2.3


From 7f85803a26f304e698c673838aab06cc6d4d6e59 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Fri, 14 Jan 2011 17:49:02 +0800
Subject: tracing: Remove syscall_exit_fields

There is no need for syscall_exit_fields, as the syscall exit event class can
already host the fields in its own structure, like most other trace events do
by default. Use that default behavior instead.

Following this scheme, we no longer need to override the get_fields()
callback of the syscall exit event class either. Hence both
syscall_exit_fields and syscall_get_exit_fields() can be removed.

Also changed some indentation to keep the following under 80 characters:

".fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),"

Acked-by: Frederic Weisbecker
Signed-off-by: Lai Jiangshan
LKML-Reference: <4D301C0E.8090408@cn.fujitsu.com>
Signed-off-by: Steven Rostedt
---
 kernel/trace/trace_syscalls.c | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb..b706529b4fc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
 static int syscall_enter_define_fields(struct ftrace_event_call *call);
 static int syscall_exit_define_fields(struct ftrace_event_call *call);
 
-/* All syscall exit events have the same fields */
-static LIST_HEAD(syscall_exit_fields);
-
 static struct list_head *
 syscall_get_enter_fields(struct ftrace_event_call *call)
 {
@@ -34,34 +31,28 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
         return &entry->enter_fields;
 }
 
-static struct list_head *
-syscall_get_exit_fields(struct ftrace_event_call *call)
-{
-        return &syscall_exit_fields;
-}
-
 struct trace_event_functions enter_syscall_print_funcs = {
-        .trace          = print_syscall_enter,
+        .trace          = print_syscall_enter,
 };
 
 struct trace_event_functions exit_syscall_print_funcs = {
-        .trace          = print_syscall_exit,
+        .trace          = print_syscall_exit,
 };
 
 struct ftrace_event_class event_class_syscall_enter = {
-        .system         = "syscalls",
-        .reg            = syscall_enter_register,
-        .define_fields  = syscall_enter_define_fields,
-        .get_fields     = syscall_get_enter_fields,
-        .raw_init       = init_syscall_trace,
+        .system         = "syscalls",
+        .reg            = syscall_enter_register,
+        .define_fields  = syscall_enter_define_fields,
+        .get_fields     = syscall_get_enter_fields,
+        .raw_init       = init_syscall_trace,
 };
 
 struct ftrace_event_class event_class_syscall_exit = {
-        .system         = "syscalls",
-        .reg            = syscall_exit_register,
-        .define_fields  = syscall_exit_define_fields,
-        .get_fields     = syscall_get_exit_fields,
-        .raw_init       = init_syscall_trace,
+        .system         = "syscalls",
+        .reg            = syscall_exit_register,
+        .define_fields  = syscall_exit_define_fields,
+        .fields         = LIST_HEAD_INIT(event_class_syscall_exit.fields),
+        .raw_init       = init_syscall_trace,
 };
 
 extern unsigned long __start_syscalls_metadata[];
-- 
cgit v1.2.3