From e34dc2e0edac58ea567b2e3dc727945e88e30874 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 18 Nov 2014 16:30:01 +0800 Subject: rcu: Revert "Allow post-unlock reference for rt_mutex" to avoid priority-inversion The patch dfeb9765ce3c ("Allow post-unlock reference for rt_mutex") ensured that rcu-boost remains safe even when the rt_mutex has a post-unlock reference. But an rt_mutex allowing a post-unlock reference is definitely a bug, and that bug was fixed by commit 27e35715df54 ("rtmutex: Plug slow unlock race"). This fix made the previous patch (dfeb9765ce3c) unnecessary. Even worse, the priority inversion introduced by the previous patch still exists. rcu_read_unlock_special() { rt_mutex_unlock(&rnp->boost_mtx); /* Priority inversion: * the current task has just been deboosted and may immediately be preempted * as a low-priority task; it could then wait a long time before being * rescheduled, while the rcu-booster sleeps waiting on this low-priority task. * This priority inversion prevents the rcu-booster from working as expected. */ complete(&rnp->boost_completion); } Just revert the patch to avoid this. Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 5 ----- kernel/rcu/tree_plugin.h | 8 +------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bbdc45d8d74f..dddb2987171c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -172,11 +172,6 @@ struct rcu_node { /* queued on this rcu_node structure that */ /* are blocking the current grace period, */ /* there can be no such task. */ - struct completion boost_completion; - /* Used to ensure that the rt_mutex used */ - /* to carry out the boosting is fully */ - /* released with no future boostee accesses */ - /* before that rt_mutex is re-initialized. */ struct rt_mutex boost_mtx; /* Used only for the priority-boosting */ /* side effect, not as a lock. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c1d7f27bd38f..c2d838b218ef 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -398,10 +398,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (drop_boost_mutex) { + if (drop_boost_mutex) rt_mutex_unlock(&rnp->boost_mtx); - complete(&rnp->boost_completion); - } #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -1175,15 +1173,11 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ - /* Wait for boostee to be done w/boost_mtx before reinitializing. 
*/ - wait_for_completion(&rnp->boost_completion); - return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; } -- cgit v1.2.3 From 3d34c029a845b91dce978088ebef0bc8acdcd281 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Mon, 27 Apr 2015 22:53:08 +0200 Subject: arm64: Mark PMU interrupt IRQF_NO_THREAD Mark the PMU interrupts as non-threadable, as is the case with arch/arm: d9c3365 ARM: 7813/1: Mark pmu interupt IRQF_NO_THREAD Suggested-by: Sebastian Andrzej Siewior Signed-off-by: Anders Roxell --- arch/arm64/kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index aa29ecb4f800..70dcde67a808 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -461,7 +461,7 @@ armpmu_reserve_hardware(struct arm_pmu *armpmu) } err = request_irq(irq, armpmu->handle_irq, - IRQF_NOBALANCING, + IRQF_NOBALANCING | IRQF_NO_THREAD, "arm-pmu", armpmu); if (err) { pr_err("unable to request IRQ%d for ARM PMU counters\n", -- cgit v1.2.3 From e39fcef6fa667444c4a88e194bad27de82aed34a Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Mon, 27 Apr 2015 22:53:09 +0200 Subject: arm64: Allow forced irq threading Now its safe to allow forced interrupt threading for arm64, all timer interrupts and the perf interrupt are marked NO_THREAD, as is the case with arch/arm: da0ec6f ARM: 7814/2: Allow forced irq threading Suggested-by: Sebastian Andrzej Siewior Signed-off-by: Anders Roxell --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index dc2d66cdf311..b8882e5afe7e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -61,6 +61,7 @@ config ARM64 select HAVE_RCU_TABLE_FREE select HAVE_SYSCALL_TRACEPOINTS select IRQ_DOMAIN + select IRQ_FORCED_THREADING select MODULES_USE_ELF_RELA select NO_BOOTMEM select OF -- cgit v1.2.3 From 38865cfcfe7cbca2121057a8aa8cc2232496d11d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Jul 2011 11:04:08 +0200 Subject: early-printk-consolidate.patch Signed-off-by: Thomas Gleixner --- arch/sparc/kernel/setup_32.c | 1 + arch/sparc/kernel/setup_64.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index baef495c06bd..3548e8591416 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -309,6 +309,7 @@ void __init setup_arch(char **cmdline_p) boot_flags_init(*cmdline_p); + early_console = &prom_early_console; register_console(&prom_early_console); printk("ARCH: "); diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c index c38d19fc27ba..5e78950e2cbd 100644 --- a/arch/sparc/kernel/setup_64.c +++ b/arch/sparc/kernel/setup_64.c @@ -563,6 +563,12 @@ static void __init init_sparc64_elf_hwcap(void) pause_patch(); } +static inline void register_prom_console(void) +{ + early_console = &prom_early_console; + register_console(&prom_early_console); +} + void __init setup_arch(char **cmdline_p) { /* Initialize PROM console and command line. 
*/ @@ -574,7 +580,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_EARLYFB if (btext_find_display()) #endif - register_console(&prom_early_console); + register_prom_console(); if (tlb_type == hypervisor) printk("ARCH: SUN4V\n"); -- cgit v1.2.3 From 5c1cd16655ab6f86e0a6a6abf25c335f692c5130 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 30 Aug 2013 21:16:08 +0400 Subject: sparc: provide EARLY_PRINTK for SPARC sparc does not have the CONFIG_EARLY_PRINTK option. So early-printk-consolidate.patch breaks compilation: arch/sparc/built-in.o: In function `setup_arch': (.init.text+0x15e4): undefined reference to `early_console' arch/sparc/built-in.o: In function `setup_arch': (.init.text+0x15ec): undefined reference to `early_console' The below addition fixes that. Signed-off-by: Kirill Tkhai Signed-off-by: Sebastian Andrzej Siewior --- arch/sparc/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 96ac69c5eba0..5e340d55f99b 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -528,6 +528,10 @@ menu "Executable file formats" source "fs/Kconfig.binfmt" +config EARLY_PRINTK + bool + default y + config COMPAT bool depends on SPARC64 -- cgit v1.2.3 From 46742597465ac867ca2c5ebbd257abaa7b516578 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Fri, 13 Dec 2013 09:44:41 +0530 Subject: sparc64: use generic rwsem spinlocks rt Signed-off-by: Allen Pais Signed-off-by: Sebastian Andrzej Siewior --- arch/sparc/Kconfig | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 5e340d55f99b..950b5733fdff 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -182,12 +182,10 @@ config NR_CPUS source kernel/Kconfig.hz config RWSEM_GENERIC_SPINLOCK - bool - default y if SPARC32 + def_bool PREEMPT_RT_FULL config RWSEM_XCHGADD_ALGORITHM - bool - default y if SPARC64 + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL config GENERIC_HWEIGHT bool -- cgit v1.2.3 From 3bc35a40f95cdeca81218b0cdd7895192acf88ea Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 19 Mar 2013 14:44:30 +0100 Subject: kernel/SRCU: provide a static initializer There are static initializer macros for three of the four possible notifier types: ATOMIC_NOTIFIER_HEAD() BLOCKING_NOTIFIER_HEAD() RAW_NOTIFIER_HEAD() This patch provides a static initializer for the fourth type (the SRCU notifier chain) to make the set complete. Signed-off-by: Sebastian Andrzej Siewior --- include/linux/notifier.h | 34 +++++++++++++++++++++++++--------- include/linux/srcu.h | 9 +++++---- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/include/linux/notifier.h b/include/linux/notifier.h index d14a4c362465..2e4414a0c1c4 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -6,7 +6,7 @@ * * Alan Cox */ - + #ifndef _LINUX_NOTIFIER_H #define _LINUX_NOTIFIER_H #include @@ -42,9 +42,7 @@ * in srcu_notifier_call_chain(): no cache bounces and no memory barriers. * As compensation, srcu_notifier_chain_unregister() is rather expensive. * SRCU notifier chains should be used when the chain will be called very - often but notifier_blocks will seldom be removed. Also, SRCU notifier - chains are slightly more difficult to use because they require special - runtime initialization. + often but notifier_blocks will seldom be removed. 
*/ typedef int (*notifier_fn_t)(struct notifier_block *nb, @@ -88,7 +86,7 @@ struct srcu_notifier_head { (name)->head = NULL; \ } while (0) -/* srcu_notifier_heads must be initialized and cleaned up dynamically */ +/* srcu_notifier_heads must be cleaned up dynamically */ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); #define srcu_cleanup_notifier_head(name) \ cleanup_srcu_struct(&(name)->srcu); @@ -101,7 +99,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); .head = NULL } #define RAW_NOTIFIER_INIT(name) { \ .head = NULL } -/* srcu_notifier_heads cannot be initialized statically */ + +#define SRCU_NOTIFIER_INIT(name, pcpu) \ + { \ + .mutex = __MUTEX_INITIALIZER(name.mutex), \ + .head = NULL, \ + .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \ + } #define ATOMIC_NOTIFIER_HEAD(name) \ struct atomic_notifier_head name = \ @@ -113,6 +117,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); struct raw_notifier_head name = \ RAW_NOTIFIER_INIT(name) +#define _SRCU_NOTIFIER_HEAD(name, mod) \ + static DEFINE_PER_CPU(struct srcu_struct_array, \ + name##_head_srcu_array); \ + mod struct srcu_notifier_head name = \ + SRCU_NOTIFIER_INIT(name, name##_head_srcu_array) + +#define SRCU_NOTIFIER_HEAD(name) \ + _SRCU_NOTIFIER_HEAD(name, ) + +#define SRCU_NOTIFIER_HEAD_STATIC(name) \ + _SRCU_NOTIFIER_HEAD(name, static) + #ifdef __KERNEL__ extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh, @@ -182,12 +198,12 @@ static inline int notifier_to_errno(int ret) /* * Declared notifiers so far. I can imagine quite a few more chains - * over time (eg laptop power reset chains, reboot chain (to clean + * over time (eg laptop power reset chains, reboot chain (to clean * device units up), device [un]mount chain, module load/unload chain, - * low memory chain, screenblank chain (for plug in modular screenblankers) + * low memory chain, screenblank chain (for plug in modular screenblankers) * VC switch chains (for loadable kernel svgalib VC switch helpers) etc... */ - + /* CPU notfiers are defined in include/linux/cpu.h. 
*/ /* netdevice notifiers are defined in include/linux/netdevice.h */ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a2783cb5d275..f12ffc644adb 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp); void process_srcu(struct work_struct *work); -#define __SRCU_STRUCT_INIT(name) \ +#define __SRCU_STRUCT_INIT(name, pcpu_name) \ { \ .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ + .per_cpu_ref = &pcpu_name, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ .running = false, \ .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ @@ -104,11 +104,12 @@ void process_srcu(struct work_struct *work); */ #define DEFINE_SRCU(name) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - struct srcu_struct name = __SRCU_STRUCT_INIT(name); + struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array); #define DEFINE_STATIC_SRCU(name) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - static struct srcu_struct name = __SRCU_STRUCT_INIT(name); + static struct srcu_struct name = __SRCU_STRUCT_INIT(\ + name, name##_srcu_array); /** * call_srcu() - Queue a callback for invocation after an SRCU grace period -- cgit v1.2.3 From c6691deb8ecdb2efd5d7eeb0bd8c79a1eac014c6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 12 Feb 2015 16:01:13 +0100 Subject: gpio: omap: use raw locks for locking This patch converts gpio_bank.lock from a spin_lock into a raw_spin_lock. The call path to access this lock is always under a raw_spin_lock, for instance - __setup_irq() holds &desc->lock with irq off + __irq_set_trigger() + omap_gpio_irq_type() - handle_level_irq() (runs with irqs off therefore raw locks) + mask_ack_irq() + omap_gpio_mask_irq() This fixes the obvious backtrace on -RT. However, the locking vs. calling context is still not right, and this is not limited to -RT: - omap_gpio_irq_type() is called with IRQs off and has a conditional call to pm_runtime_get_sync(), which may sleep. Whether or not that actually happens, pm_runtime_get_sync() should not be called with irqs off. - omap_gpio_debounce() is holding the lock with IRQs off. + omap2_set_gpio_debounce() + clk_prepare_enable() + clk_prepare() this one might sleep. The number of users of gpiod_set_debounce() / gpio_set_debounce() looks low, but this is still not good. 
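As a minimal illustrative sketch of the conversion pattern described above (not part of this patch; the struct and function names below are made up): on PREEMPT_RT a spinlock_t becomes a sleeping lock, while a raw_spinlock_t keeps hard IRQs disabled and therefore stays safe inside the irq-chip callbacks listed above.

#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct example_bank {
	raw_spinlock_t lock;	/* was: spinlock_t lock; */
	u32 irq_usage;
};

static void example_mask_irq(struct example_bank *bank, unsigned int bit)
{
	unsigned long flags;

	/* Never sleeps, even on -RT, so it may be called from
	 * handle_level_irq()/mask_ack_irq(), which run with IRQs off. */
	raw_spin_lock_irqsave(&bank->lock, flags);
	bank->irq_usage &= ~BIT(bit);
	raw_spin_unlock_irqrestore(&bank->lock, flags);
}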
Signed-off-by: Sebastian Andrzej Siewior --- drivers/gpio/gpio-omap.c | 74 ++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c index 415682f69214..da96b23ae203 100644 --- a/drivers/gpio/gpio-omap.c +++ b/drivers/gpio/gpio-omap.c @@ -57,7 +57,7 @@ struct gpio_bank { u32 saved_datain; u32 level_mask; u32 toggle_mask; - spinlock_t lock; + raw_spinlock_t lock; struct gpio_chip chip; struct clk *dbck; u32 mod_usage; @@ -503,19 +503,19 @@ static int omap_gpio_irq_type(struct irq_data *d, unsigned type) (type & (IRQ_TYPE_LEVEL_LOW|IRQ_TYPE_LEVEL_HIGH))) return -EINVAL; - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); offset = GPIO_INDEX(bank, gpio); retval = omap_set_gpio_triggering(bank, offset, type); if (!LINE_USED(bank->mod_usage, offset)) { omap_enable_gpio_module(bank, offset); omap_set_gpio_direction(bank, offset, 1); } else if (!omap_gpio_is_input(bank, BIT(offset))) { - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); return -EINVAL; } bank->irq_usage |= BIT(GPIO_INDEX(bank, gpio)); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) __irq_set_handler_locked(d->irq, handle_level_irq); @@ -633,14 +633,14 @@ static int omap_set_gpio_wakeup(struct gpio_bank *bank, int gpio, int enable) return -EINVAL; } - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); if (enable) bank->context.wake_en |= gpio_bit; else bank->context.wake_en &= ~gpio_bit; writel_relaxed(bank->context.wake_en, bank->base + bank->regs->wkup_en); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); return 0; } @@ -675,7 +675,7 @@ static int omap_gpio_request(struct gpio_chip *chip, unsigned offset) if (!BANK_USED(bank)) pm_runtime_get_sync(bank->dev); - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); /* Set trigger to none. You need to enable the desired trigger with * request_irq() or set_irq_type(). Only do this if the IRQ line has * not already been requested. 
@@ -685,7 +685,7 @@ static int omap_gpio_request(struct gpio_chip *chip, unsigned offset) omap_enable_gpio_module(bank, offset); } bank->mod_usage |= BIT(offset); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); return 0; } @@ -695,11 +695,11 @@ static void omap_gpio_free(struct gpio_chip *chip, unsigned offset) struct gpio_bank *bank = container_of(chip, struct gpio_bank, chip); unsigned long flags; - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); bank->mod_usage &= ~(BIT(offset)); omap_disable_gpio_module(bank, offset); omap_reset_gpio(bank, bank->chip.base + offset); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); /* * If this is the last gpio to be freed in the bank, @@ -799,12 +799,12 @@ static void omap_gpio_irq_shutdown(struct irq_data *d) unsigned long flags; unsigned offset = GPIO_INDEX(bank, gpio); - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); gpio_unlock_as_irq(&bank->chip, offset); bank->irq_usage &= ~(BIT(offset)); omap_disable_gpio_module(bank, offset); omap_reset_gpio(bank, gpio); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); /* * If this is the last IRQ to be freed in the bank, @@ -828,10 +828,10 @@ static void omap_gpio_mask_irq(struct irq_data *d) unsigned int gpio = omap_irq_to_gpio(bank, d->hwirq); unsigned long flags; - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); omap_set_gpio_irqenable(bank, gpio, 0); omap_set_gpio_triggering(bank, GPIO_INDEX(bank, gpio), IRQ_TYPE_NONE); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); } static void omap_gpio_unmask_irq(struct irq_data *d) @@ -842,7 +842,7 @@ static void omap_gpio_unmask_irq(struct irq_data *d) u32 trigger = irqd_get_trigger_type(d); unsigned long flags; - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); if (trigger) omap_set_gpio_triggering(bank, GPIO_INDEX(bank, gpio), trigger); @@ -854,7 +854,7 @@ static void omap_gpio_unmask_irq(struct irq_data *d) } omap_set_gpio_irqenable(bank, gpio, 1); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); } /*---------------------------------------------------------------------*/ @@ -867,9 +867,9 @@ static int omap_mpuio_suspend_noirq(struct device *dev) OMAP_MPUIO_GPIO_MASKIT / bank->stride; unsigned long flags; - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); writel_relaxed(0xffff & ~bank->context.wake_en, mask_reg); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); return 0; } @@ -882,9 +882,9 @@ static int omap_mpuio_resume_noirq(struct device *dev) OMAP_MPUIO_GPIO_MASKIT / bank->stride; unsigned long flags; - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); writel_relaxed(bank->context.wake_en, mask_reg); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); return 0; } @@ -930,9 +930,9 @@ static int omap_gpio_get_direction(struct gpio_chip *chip, unsigned offset) bank = container_of(chip, struct gpio_bank, chip); reg = bank->base + bank->regs->direction; - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); dir = !!(readl_relaxed(reg) & BIT(offset)); - spin_unlock_irqrestore(&bank->lock, flags); + 
raw_spin_unlock_irqrestore(&bank->lock, flags); return dir; } @@ -942,9 +942,9 @@ static int omap_gpio_input(struct gpio_chip *chip, unsigned offset) unsigned long flags; bank = container_of(chip, struct gpio_bank, chip); - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); omap_set_gpio_direction(bank, offset, 1); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); return 0; } @@ -968,10 +968,10 @@ static int omap_gpio_output(struct gpio_chip *chip, unsigned offset, int value) unsigned long flags; bank = container_of(chip, struct gpio_bank, chip); - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); bank->set_dataout(bank, offset, value); omap_set_gpio_direction(bank, offset, 0); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); return 0; } @@ -983,9 +983,9 @@ static int omap_gpio_debounce(struct gpio_chip *chip, unsigned offset, bank = container_of(chip, struct gpio_bank, chip); - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); omap2_set_gpio_debounce(bank, offset, debounce); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); return 0; } @@ -996,9 +996,9 @@ static void omap_gpio_set(struct gpio_chip *chip, unsigned offset, int value) unsigned long flags; bank = container_of(chip, struct gpio_bank, chip); - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); bank->set_dataout(bank, offset, value); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); } /*---------------------------------------------------------------------*/ @@ -1223,7 +1223,7 @@ static int omap_gpio_probe(struct platform_device *pdev) else bank->set_dataout = omap_set_gpio_dataout_mask; - spin_lock_init(&bank->lock); + raw_spin_lock_init(&bank->lock); /* Static mapping, never released */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); @@ -1270,7 +1270,7 @@ static int omap_gpio_runtime_suspend(struct device *dev) unsigned long flags; u32 wake_low, wake_hi; - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); /* * Only edges can generate a wakeup event to the PRCM. 
@@ -1323,7 +1323,7 @@ update_gpio_context_count: bank->get_context_loss_count(bank->dev); omap_gpio_dbck_disable(bank); - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); return 0; } @@ -1338,7 +1338,7 @@ static int omap_gpio_runtime_resume(struct device *dev) unsigned long flags; int c; - spin_lock_irqsave(&bank->lock, flags); + raw_spin_lock_irqsave(&bank->lock, flags); /* * On the first resume during the probe, the context has not @@ -1374,14 +1374,14 @@ static int omap_gpio_runtime_resume(struct device *dev) if (c != bank->context_loss_count) { omap_gpio_restore_context(bank); } else { - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); return 0; } } } if (!bank->workaround_enabled) { - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); return 0; } @@ -1436,7 +1436,7 @@ static int omap_gpio_runtime_resume(struct device *dev) } bank->workaround_enabled = false; - spin_unlock_irqrestore(&bank->lock, flags); + raw_spin_unlock_irqrestore(&bank->lock, flags); return 0; } -- cgit v1.2.3 From ed37bcf7e20ba1bfa7583b8a38215775ff1da679 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jun 2011 19:47:02 +0200 Subject: block: Shorten interrupt disabled regions Moving the blk_sched_flush_plug() call out of the interrupt/preempt disabled region in the scheduler allows us to replace local_irq_save/restore(flags) by local_irq_disable/enable() in blk_flush_plug(). Now instead of doing this we disable interrupts explicitly when we lock the request_queue and reenable them when we drop the lock. That allows interrupts to be handled when the plug list contains requests for more than one queue. Aside from that, this change makes the scope of the irq disabled region more obvious. The current code confused the hell out of me when looking at: local_irq_save(flags); spin_lock(q->queue_lock); ... queue_unplugged(q...); scsi_request_fn(); spin_unlock(q->queue_lock); spin_lock(shost->host_lock); spin_unlock_irq(shost->host_lock); -------------------^^^ ???? spin_lock_irq(q->queue_lock); spin_unlock(q->lock); local_irq_restore(flags); Also add a comment to __blk_run_queue() documenting that q->request_fn() can drop q->queue_lock and reenable interrupts, but must return with q->queue_lock held and interrupts disabled. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Tejun Heo Cc: Jens Axboe Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20110622174919.025446432@linutronix.de --- block/blk-core.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 93f9152fc271..8a2074a111ea 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3037,7 +3037,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, blk_run_queue_async(q); else __blk_run_queue(q); - spin_unlock(q->queue_lock); + spin_unlock_irq(q->queue_lock); } static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) @@ -3085,7 +3085,6 @@ EXPORT_SYMBOL(blk_check_plugged); void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; - unsigned long flags; struct request *rq; LIST_HEAD(list); unsigned int depth; @@ -3105,11 +3104,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) q = NULL; depth = 0; - /* - * Save and disable interrupts here, to avoid doing it for every - * queue lock we have to take. 
- */ - local_irq_save(flags); while (!list_empty(&list)) { rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); @@ -3122,7 +3116,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) queue_unplugged(q, depth, from_schedule); q = rq->q; depth = 0; - spin_lock(q->queue_lock); + spin_lock_irq(q->queue_lock); } /* @@ -3149,8 +3143,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (q) queue_unplugged(q, depth, from_schedule); - - local_irq_restore(flags); } void blk_finish_plug(struct blk_plug *plug) -- cgit v1.2.3 From 60092632b722e869fb0541b9239f45c67b4473e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Feb 2013 22:36:59 +0100 Subject: timekeeping-split-jiffies-lock.patch Signed-off-by: Thomas Gleixner --- kernel/time/jiffies.c | 7 ++++--- kernel/time/tick-common.c | 10 ++++++---- kernel/time/tick-internal.h | 3 ++- kernel/time/tick-sched.c | 19 ++++++++++++------- kernel/time/timekeeping.c | 6 ++++-- 5 files changed, 28 insertions(+), 17 deletions(-) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a6a5bf53e86d..23d7203cc8af 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -73,7 +73,8 @@ static struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); +__cacheline_aligned_in_smp seqcount_t jiffies_seq; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) @@ -82,9 +83,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); ret = jiffies_64; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7efeedf53ebd..e3dd07d0e96b 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -78,13 +78,15 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -146,9 +148,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); next = tick_next_period; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 366aeb4f2c66..8e118bb8ee1c 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,7 +6,8 @@ #include "timekeeping.h" -extern seqlock_t jiffies_lock; +extern raw_spinlock_t jiffies_lock; +extern seqcount_t jiffies_seq; #define CS_NAME_LEN 32 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 8c30ef7a2b70..ad413989896a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now) return; /* Reevalute with jiffies_lock held */ - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); delta = 
ktime_sub(now, last_jiffies_update); if (delta.tv64 >= tick_period.tv64) { @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } else { - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return; } - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ if (last_jiffies_update.tv64 == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return period; } @@ -580,10 +585,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); last_update = last_jiffies_update; last_jiffies = jiffies; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || arch_needs_cpu() || irq_work_needs_cpu()) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ec1791fae965..57dbbb7d50c0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1814,8 +1814,10 @@ EXPORT_SYMBOL(hardpps); */ void xtime_update(unsigned long ticks) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); do_timer(ticks); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } -- cgit v1.2.3 From c47063e6e3816c23d03bb7745e7394dccd42806e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 Jul 2013 15:45:51 +0200 Subject: vtime-split-lock-and-seqcount.patch Signed-off-by: Thomas Gleixner --- include/linux/init_task.h | 3 ++- include/linux/sched.h | 3 ++- kernel/fork.c | 3 ++- kernel/sched/cputime.c | 62 ++++++++++++++++++++++++++++++----------------- 4 files changed, 46 insertions(+), 25 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 77fc43f8fb72..43e8c0ebe108 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -149,7 +149,8 @@ extern struct task_group root_task_group; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN # define INIT_VTIME(tsk) \ - .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \ + .vtime_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.vtime_lock), \ + .vtime_seq = SEQCNT_ZERO(tsk.vtime_seq), \ .vtime_snap = 0, \ .vtime_snap_whence = VTIME_SYS, #else diff --git a/include/linux/sched.h b/include/linux/sched.h index 5e344bbe63ec..a0e66efca1fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1371,7 +1371,8 @@ struct task_struct { struct cputime prev_cputime; #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqlock_t vtime_seqlock; + raw_spinlock_t vtime_lock; + seqcount_t vtime_seq; unsigned long long vtime_snap; enum { VTIME_SLEEPING = 0, diff --git a/kernel/fork.c b/kernel/fork.c index 9b7d746d6d62..865b701d64a8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1291,7 +1291,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, 
p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqlock_init(&p->vtime_seqlock); + raw_spin_lock_init(&p->vtime_lock); + seqcount_init(&p->vtime_seq); p->vtime_snap = 0; p->vtime_snap_whence = VTIME_SLEEPING; #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8394b1ee600c..2da134c9bae7 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -675,37 +675,45 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + raw_spin_lock(&tsk->vtime_lock); + write_seqcount_begin(&tsk->vtime_seq); __vtime_account_system(tsk); - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seq); + raw_spin_unlock(&tsk->vtime_lock); } void vtime_gen_account_irq_exit(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + raw_spin_lock(&tsk->vtime_lock); + write_seqcount_begin(&tsk->vtime_seq); __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seq); + raw_spin_unlock(&tsk->vtime_lock); } void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; - write_seqlock(&tsk->vtime_seqlock); + raw_spin_lock(&tsk->vtime_lock); + write_seqcount_begin(&tsk->vtime_seq); delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seq); + raw_spin_unlock(&tsk->vtime_lock); } void vtime_user_enter(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + raw_spin_lock(&tsk->vtime_lock); + write_seqcount_begin(&tsk->vtime_seq); __vtime_account_system(tsk); tsk->vtime_snap_whence = VTIME_USER; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seq); + raw_spin_unlock(&tsk->vtime_lock); } void vtime_guest_enter(struct task_struct *tsk) @@ -717,19 +725,23 @@ void vtime_guest_enter(struct task_struct *tsk) * synchronization against the reader (task_gtime()) * that can thus safely catch up with a tickless delta. 
*/ - write_seqlock(&tsk->vtime_seqlock); + raw_spin_lock(&tsk->vtime_lock); + write_seqcount_begin(&tsk->vtime_seq); __vtime_account_system(tsk); current->flags |= PF_VCPU; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seq); + raw_spin_unlock(&tsk->vtime_lock); } EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + raw_spin_lock(&tsk->vtime_lock); + write_seqcount_begin(&tsk->vtime_seq); __vtime_account_system(tsk); current->flags &= ~PF_VCPU; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seq); + raw_spin_unlock(&tsk->vtime_lock); } EXPORT_SYMBOL_GPL(vtime_guest_exit); @@ -742,24 +754,30 @@ void vtime_account_idle(struct task_struct *tsk) void arch_vtime_task_switch(struct task_struct *prev) { - write_seqlock(&prev->vtime_seqlock); + raw_spin_lock(&prev->vtime_lock); + write_seqcount_begin(&prev->vtime_seq); prev->vtime_snap_whence = VTIME_SLEEPING; - write_sequnlock(&prev->vtime_seqlock); + write_seqcount_end(&prev->vtime_seq); + raw_spin_unlock(&prev->vtime_lock); - write_seqlock(¤t->vtime_seqlock); + raw_spin_lock(¤t->vtime_lock); + write_seqcount_begin(¤t->vtime_seq); current->vtime_snap_whence = VTIME_SYS; current->vtime_snap = sched_clock_cpu(smp_processor_id()); - write_sequnlock(¤t->vtime_seqlock); + write_seqcount_end(¤t->vtime_seq); + raw_spin_unlock(¤t->vtime_lock); } void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; - write_seqlock_irqsave(&t->vtime_seqlock, flags); + raw_spin_lock_irqsave(&t->vtime_lock, flags); + write_seqcount_begin(&t->vtime_seq); t->vtime_snap_whence = VTIME_SYS; t->vtime_snap = sched_clock_cpu(cpu); - write_sequnlock_irqrestore(&t->vtime_seqlock, flags); + write_seqcount_end(&t->vtime_seq); + raw_spin_unlock_irqrestore(&t->vtime_lock, flags); } cputime_t task_gtime(struct task_struct *t) @@ -768,13 +786,13 @@ cputime_t task_gtime(struct task_struct *t) cputime_t gtime; do { - seq = read_seqbegin(&t->vtime_seqlock); + seq = read_seqcount_begin(&t->vtime_seq); gtime = t->gtime; if (t->flags & PF_VCPU) gtime += vtime_delta(t); - } while (read_seqretry(&t->vtime_seqlock, seq)); + } while (read_seqcount_retry(&t->vtime_seq, seq)); return gtime; } @@ -797,7 +815,7 @@ fetch_task_cputime(struct task_struct *t, *udelta = 0; *sdelta = 0; - seq = read_seqbegin(&t->vtime_seqlock); + seq = read_seqcount_begin(&t->vtime_seq); if (u_dst) *u_dst = *u_src; @@ -821,7 +839,7 @@ fetch_task_cputime(struct task_struct *t, if (t->vtime_snap_whence == VTIME_SYS) *sdelta = delta; } - } while (read_seqretry(&t->vtime_seqlock, seq)); + } while (read_seqcount_retry(&t->vtime_seq, seq)); } -- cgit v1.2.3 From 86a498de7272541e33cc91d85bc1884a3e2964c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 21:32:10 +0200 Subject: mips-enable-interrupts-in-signal.patch Signed-off-by: Thomas Gleixner --- arch/mips/kernel/signal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 16f1e4f2bf3c..00ede39a386b 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -613,6 +613,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { local_irq_enable(); + preempt_check_resched(); user_exit(); -- cgit v1.2.3 From d4a042c3d94753586db9bca384add559123479d7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 Sep 2011 12:24:30 -0500 Subject: tracing: Account for preempt off in preempt_schedule() The 
preempt_schedule() uses the preempt_disable_notrace() version because it can cause infinite recursion by the function tracer as the function tracer uses preempt_enable_notrace() which may call back into the preempt_schedule() code as the NEED_RESCHED is still set and the PREEMPT_ACTIVE has not been set yet. See commit: d1f74e20b5b064a130cd0743a256c2d3cfe84010 that made this change. The preemptoff and preemptirqsoff latency tracers require the first and last preempt count modifiers to enable tracing. But this skips the checks. Since we can not convert them back to the non notrace version, we can use the idle() hooks for the latency tracers here. That is, the start/stop_critical_timings() works well to manually start and stop the latency tracer for preempt off timings. Signed-off-by: Steven Rostedt Signed-off-by: Clark Williams Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b794bde3f5e1..e79055548bb2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2924,7 +2924,16 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) do { __preempt_count_add(PREEMPT_ACTIVE); + /* + * The add/subtract must not be traced by the function + * tracer. But we still want to account for the + * preempt off latency tracer. Since the _notrace versions + * of add/subtract skip the accounting for latency tracer + * we must force it manually. + */ + start_critical_timings(); __schedule(); + stop_critical_timings(); __preempt_count_sub(PREEMPT_ACTIVE); /* -- cgit v1.2.3 From 08002109f2a2078e9bb82bba8c2310be16e92f54 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Sep 2011 19:57:12 +0200 Subject: signal-revert-ptrace-preempt-magic.patch Signed-off-by: Thomas Gleixner --- kernel/signal.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 8f0876f9f6dd..252b63de5a52 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1889,15 +1889,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); read_unlock(&tasklist_lock); - preempt_enable_no_resched(); freezable_schedule(); } else { /* -- cgit v1.2.3 From 8506a9a577d44fa4007aeaebc61f0c5719fdd3f2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 29 Aug 2013 18:21:04 +0200 Subject: ptrace: fix ptrace vs tasklist_lock race As explained by Alexander Fyodorov : |read_lock(&tasklist_lock) in ptrace_stop() is converted to mutex on RT kernel, |and it can remove __TASK_TRACED from task->state (by moving it to |task->saved_state). If parent does wait() on child followed by a sys_ptrace |call, the following race can happen: | |- child sets __TASK_TRACED in ptrace_stop() |- parent does wait() which eventually calls wait_task_stopped() and returns | child's pid |- child blocks on read_lock(&tasklist_lock) in ptrace_stop() and moves | __TASK_TRACED flag to saved_state |- parent calls sys_ptrace, which calls ptrace_check_attach() and wait_task_inactive() The patch is based on his initial patch where an additional check is added in case the __TASK_TRACED moved to ->saved_state. The pi_lock is taken in case the caller is interrupted between looking into ->state and ->saved_state. 
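A minimal sketch of the resulting check (illustrative only; the helper name is made up, but it mirrors check_task_state() added to kernel/sched/core.c in the diff below): on -RT a task blocked on the rtmutex-based tasklist_lock parks its original state in ->saved_state, so both fields must be read under ->pi_lock to get a stable answer.

#include <linux/sched.h>

static bool example_state_matches(struct task_struct *p, long match_state)
{
	bool match = false;

	/* pi_lock guards against the state moving between ->state and
	 * ->saved_state while we look at it. */
	raw_spin_lock_irq(&p->pi_lock);
	if (p->state == match_state || p->saved_state == match_state)
		match = true;
	raw_spin_unlock_irq(&p->pi_lock);

	return match;
}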
Signed-off-by: Sebastian Andrzej Siewior --- include/linux/sched.h | 48 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/ptrace.c | 7 ++++++- kernel/sched/core.c | 19 ++++++++++++++++--- 3 files changed, 67 insertions(+), 7 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a0e66efca1fa..51cc17a6855b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -235,10 +235,7 @@ extern char ___assert_task_state[1 - 2*!!( TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD) -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) -#define task_is_stopped_or_traced(task) \ - ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ (task->flags & PF_FROZEN) == 0) @@ -2751,6 +2748,51 @@ static inline int signal_pending_state(long state, struct task_struct *p) return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } +static inline bool __task_is_stopped_or_traced(struct task_struct *task) +{ + if (task->state & (__TASK_STOPPED | __TASK_TRACED)) + return true; +#ifdef CONFIG_PREEMPT_RT_FULL + if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED)) + return true; +#endif + return false; +} + +static inline bool task_is_stopped_or_traced(struct task_struct *task) +{ + bool traced_stopped; + +#ifdef CONFIG_PREEMPT_RT_FULL + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + traced_stopped = __task_is_stopped_or_traced(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); +#else + traced_stopped = __task_is_stopped_or_traced(task); +#endif + return traced_stopped; +} + +static inline bool task_is_traced(struct task_struct *task) +{ + bool traced = false; + + if (task->state & __TASK_TRACED) + return true; +#ifdef CONFIG_PREEMPT_RT_FULL + /* in case the task is sleeping on tasklist_lock */ + raw_spin_lock_irq(&task->pi_lock); + if (task->state & __TASK_TRACED) + traced = true; + else if (task->saved_state & __TASK_TRACED) + traced = true; + raw_spin_unlock_irq(&task->pi_lock); +#endif + return traced; +} + /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54e75226c2c4..7e32e6c268a0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -129,7 +129,12 @@ static bool ptrace_freeze_traced(struct task_struct *task) spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !__fatal_signal_pending(task)) { - task->state = __TASK_TRACED; + raw_spin_lock_irq(&task->pi_lock); + if (task->state & __TASK_TRACED) + task->state = __TASK_TRACED; + else + task->saved_state = __TASK_TRACED; + raw_spin_unlock_irq(&task->pi_lock); ret = true; } spin_unlock_irq(&task->sighand->siglock); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e79055548bb2..3602645aff14 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1198,6 +1198,18 @@ struct migration_arg { static int migration_cpu_stop(void *data); +static bool check_task_state(struct task_struct *p, long match_state) +{ + bool match = false; + + raw_spin_lock_irq(&p->pi_lock); + if (p->state == match_state || p->saved_state == match_state) + match = true; + raw_spin_unlock_irq(&p->pi_lock); + + return match; +} + /* * wait_task_inactive - wait for a thread to unschedule. 
* @@ -1242,7 +1254,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * is actually now running somewhere else! */ while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) + if (match_state && !check_task_state(p, match_state)) return 0; cpu_relax(); } @@ -1257,7 +1269,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; - if (!match_state || p->state == match_state) + if (!match_state || p->state == match_state || + p->saved_state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &flags); @@ -1792,7 +1805,7 @@ out: */ int wake_up_process(struct task_struct *p) { - WARN_ON(task_is_stopped_or_traced(p)); + WARN_ON(__task_is_stopped_or_traced(p)); return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); -- cgit v1.2.3 From f076c857e963f490c446f92f3000165422103a89 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Mon, 19 Sep 2011 14:51:14 -0700 Subject: preempt-rt: Convert arm boot_lock to raw The arm boot_lock is used by the secondary processor startup code. The locking task is the idle thread, which has idle->sched_class == &idle_sched_class. idle_sched_class->enqueue_task == NULL, so if the idle task blocks on the lock, the attempt to wake it when the lock becomes available will fail: try_to_wake_up() ... activate_task() enqueue_task() p->sched_class->enqueue_task(rq, p, flags) Fix by converting boot_lock to a raw spin lock. Signed-off-by: Frank Rowand Link: http://lkml.kernel.org/r/4E77B952.3010606@am.sony.com Signed-off-by: Thomas Gleixner --- arch/arm/mach-exynos/platsmp.c | 12 ++++++------ arch/arm/mach-hisi/platmcpm.c | 26 +++++++++++++------------- arch/arm/mach-omap2/omap-smp.c | 10 +++++----- arch/arm/mach-prima2/platsmp.c | 10 +++++----- arch/arm/mach-qcom/platsmp.c | 10 +++++----- arch/arm/mach-spear/platsmp.c | 10 +++++----- arch/arm/mach-sti/platsmp.c | 10 +++++----- arch/arm/mach-ux500/platsmp.c | 10 +++++----- arch/arm/plat-versatile/platsmp.c | 10 +++++----- 9 files changed, 54 insertions(+), 54 deletions(-) diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c index 41ae28d69e6f..9b96bc41b1d3 100644 --- a/arch/arm/mach-exynos/platsmp.c +++ b/arch/arm/mach-exynos/platsmp.c @@ -137,7 +137,7 @@ static void __iomem *scu_base_addr(void) return (void __iomem *)(S5P_VA_SCU); } -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static void exynos_secondary_init(unsigned int cpu) { @@ -150,8 +150,8 @@ static void exynos_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. 
*/ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -165,7 +165,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) * Set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -192,7 +192,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) if (timeout == 0) { printk(KERN_ERR "cpu1 power enable failed"); - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return -ETIMEDOUT; } } @@ -242,7 +242,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) * calibrations, then wait for it to finish */ fail: - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? ret : 0; } diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c index 280f3f14f77c..bc2ed95c0e62 100644 --- a/arch/arm/mach-hisi/platmcpm.c +++ b/arch/arm/mach-hisi/platmcpm.c @@ -57,7 +57,7 @@ static void __iomem *sysctrl, *fabric; static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER]; -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static u32 fabric_phys_addr; /* * [0]: bootwrapper physical address @@ -104,7 +104,7 @@ static int hip04_mcpm_power_up(unsigned int cpu, unsigned int cluster) if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER) return -EINVAL; - spin_lock_irq(&boot_lock); + raw_spin_lock_irq(&boot_lock); if (hip04_cpu_table[cluster][cpu]) goto out; @@ -133,7 +133,7 @@ static int hip04_mcpm_power_up(unsigned int cpu, unsigned int cluster) udelay(20); out: hip04_cpu_table[cluster][cpu]++; - spin_unlock_irq(&boot_lock); + raw_spin_unlock_irq(&boot_lock); return 0; } @@ -149,7 +149,7 @@ static void hip04_mcpm_power_down(void) __mcpm_cpu_going_down(cpu, cluster); - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); BUG_ON(__mcpm_cluster_state(cluster) != CLUSTER_UP); hip04_cpu_table[cluster][cpu]--; if (hip04_cpu_table[cluster][cpu] == 1) { @@ -162,7 +162,7 @@ static void hip04_mcpm_power_down(void) last_man = hip04_cluster_is_down(cluster); if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) { - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); /* Since it's Cortex A15, disable L2 prefetching. */ asm volatile( "mcr p15, 1, %0, c15, c0, 3 \n\t" @@ -173,7 +173,7 @@ static void hip04_mcpm_power_down(void) hip04_set_snoop_filter(cluster, 0); __mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN); } else { - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); v7_exit_coherency_flush(louis); } @@ -192,7 +192,7 @@ static int hip04_mcpm_wait_for_powerdown(unsigned int cpu, unsigned int cluster) cpu >= HIP04_MAX_CPUS_PER_CLUSTER); count = TIMEOUT_MSEC / POLL_MSEC; - spin_lock_irq(&boot_lock); + raw_spin_lock_irq(&boot_lock); for (tries = 0; tries < count; tries++) { if (hip04_cpu_table[cluster][cpu]) { ret = -EBUSY; @@ -202,10 +202,10 @@ static int hip04_mcpm_wait_for_powerdown(unsigned int cpu, unsigned int cluster) data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster)); if (data & CORE_WFI_STATUS(cpu)) break; - spin_unlock_irq(&boot_lock); + raw_spin_unlock_irq(&boot_lock); /* Wait for clean L2 when the whole cluster is down. 
*/ msleep(POLL_MSEC); - spin_lock_irq(&boot_lock); + raw_spin_lock_irq(&boot_lock); } if (tries >= count) goto err; @@ -220,10 +220,10 @@ static int hip04_mcpm_wait_for_powerdown(unsigned int cpu, unsigned int cluster) } if (tries >= count) goto err; - spin_unlock_irq(&boot_lock); + raw_spin_unlock_irq(&boot_lock); return 0; err: - spin_unlock_irq(&boot_lock); + raw_spin_unlock_irq(&boot_lock); return ret; } @@ -235,10 +235,10 @@ static void hip04_mcpm_powered_up(void) cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0); cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1); - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); if (!hip04_cpu_table[cluster][cpu]) hip04_cpu_table[cluster][cpu] = 1; - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); } static void __naked hip04_mcpm_power_up_setup(unsigned int affinity_level) diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c index 5305ec7341ec..19732b56088b 100644 --- a/arch/arm/mach-omap2/omap-smp.c +++ b/arch/arm/mach-omap2/omap-smp.c @@ -43,7 +43,7 @@ /* SCU base address */ static void __iomem *scu_base; -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); void __iomem *omap4_get_scu_base(void) { @@ -74,8 +74,8 @@ static void omap4_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -89,7 +89,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle) * Set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * Update the AuxCoreBoot0 with boot state for secondary core. @@ -166,7 +166,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle) * Now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return 0; } diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c index 335c12e92262..2bb5796792b2 100644 --- a/arch/arm/mach-prima2/platsmp.c +++ b/arch/arm/mach-prima2/platsmp.c @@ -23,7 +23,7 @@ static void __iomem *scu_base; static void __iomem *rsc_base; -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static struct map_desc scu_io_desc __initdata = { .length = SZ_4K, @@ -56,8 +56,8 @@ static void sirfsoc_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static struct of_device_id rsc_ids[] = { @@ -95,7 +95,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle) /* make sure write buffer is drained */ mb(); - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -127,7 +127,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle) * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? 
-ENOSYS : 0; } diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c index d6908569ecaf..80c5e4fc5910 100644 --- a/arch/arm/mach-qcom/platsmp.c +++ b/arch/arm/mach-qcom/platsmp.c @@ -46,7 +46,7 @@ extern void secondary_startup(void); -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); #ifdef CONFIG_HOTPLUG_CPU static void __ref qcom_cpu_die(unsigned int cpu) @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int scss_release_secondary(unsigned int cpu) @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int)) * set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * Send the secondary CPU a soft interrupt, thereby causing @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int)) * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return ret; } diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c index fd4297713d67..b0553b2c2d53 100644 --- a/arch/arm/mach-spear/platsmp.c +++ b/arch/arm/mach-spear/platsmp.c @@ -32,7 +32,7 @@ static void write_pen_release(int val) sync_cache_w(&pen_release); } -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static void __iomem *scu_base = IOMEM(VA_SCU_BASE); @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle) * set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle) * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? -ENOSYS : 0; } diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c index d4b624f8dfcb..56d4028122f5 100644 --- a/arch/arm/mach-sti/platsmp.c +++ b/arch/arm/mach-sti/platsmp.c @@ -34,7 +34,7 @@ static void write_pen_release(int val) sync_cache_w(&pen_release); } -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static void sti_secondary_init(unsigned int cpu) { @@ -49,8 +49,8 @@ static void sti_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. 
*/ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -61,7 +61,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle) * set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -92,7 +92,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle) * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? -ENOSYS : 0; } diff --git a/arch/arm/mach-ux500/platsmp.c b/arch/arm/mach-ux500/platsmp.c index a44967f3168c..3af22a4836bf 100644 --- a/arch/arm/mach-ux500/platsmp.c +++ b/arch/arm/mach-ux500/platsmp.c @@ -51,7 +51,7 @@ static void __iomem *scu_base_addr(void) return NULL; } -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); static void ux500_secondary_init(unsigned int cpu) { @@ -64,8 +64,8 @@ static void ux500_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } static int ux500_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -76,7 +76,7 @@ static int ux500_boot_secondary(unsigned int cpu, struct task_struct *idle) * set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * The secondary processor is waiting to be released from @@ -97,7 +97,7 @@ static int ux500_boot_secondary(unsigned int cpu, struct task_struct *idle) * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? -ENOSYS : 0; } diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c index 53feb90c840c..b4a8d54fc3f3 100644 --- a/arch/arm/plat-versatile/platsmp.c +++ b/arch/arm/plat-versatile/platsmp.c @@ -30,7 +30,7 @@ static void write_pen_release(int val) sync_cache_w(&pen_release); } -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); void versatile_secondary_init(unsigned int cpu) { @@ -43,8 +43,8 @@ void versatile_secondary_init(unsigned int cpu) /* * Synchronise with the boot thread. */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); + raw_spin_lock(&boot_lock); + raw_spin_unlock(&boot_lock); } int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle) @@ -55,7 +55,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle) * Set synchronisation state between this boot processor * and the secondary one */ - spin_lock(&boot_lock); + raw_spin_lock(&boot_lock); /* * This is really belt and braces; we hold unintended secondary @@ -85,7 +85,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle) * now the secondary core is starting up let it run its * calibrations, then wait for it to finish */ - spin_unlock(&boot_lock); + raw_spin_unlock(&boot_lock); return pen_release != -1 ? 
-ENOSYS : 0; } -- cgit v1.2.3 From 763c4191be9b7a946b88a0d815737c72d6adae25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:29:20 -0500 Subject: posix-timers: Prevent broadcast signals Posix timers should not send broadcast signals and kernel only signals. Prevent it. Signed-off-by: Thomas Gleixner --- kernel/time/posix-timers.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31ea01f42e1f..35239e500f93 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -499,6 +499,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; + int sig = event->sigev_signo; if ((event->sigev_notify & SIGEV_THREAD_ID ) && (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || @@ -507,7 +508,8 @@ static struct pid *good_sigevent(sigevent_t * event) return NULL; if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) + (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) || + sig_kernel_coredump(sig))) return NULL; return task_pid(rtn); -- cgit v1.2.3 From 52074fb91048cb13a47aec02d5c5cdb6a1ae072e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:56 -0500 Subject: signals: Allow rt tasks to cache one sigqueue struct To avoid allocation allow rt tasks to cache one sigqueue struct in task struct. Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 1 + include/linux/signal.h | 1 + kernel/exit.c | 2 +- kernel/fork.c | 1 + kernel/signal.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 84 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 51cc17a6855b..cca2295df002 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1417,6 +1417,7 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; + struct sigqueue *sigqueue_cache; sigset_t blocked, real_blocked; sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ diff --git a/include/linux/signal.h b/include/linux/signal.h index ab1e0392b5ac..66215b20aa8d 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -218,6 +218,7 @@ static inline void init_sigpending(struct sigpending *sig) } extern void flush_sigqueue(struct sigpending *queue); +extern void flush_task_sigqueue(struct task_struct *tsk); /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ static inline int valid_signal(unsigned long sig) diff --git a/kernel/exit.c b/kernel/exit.c index 2116aace6c85..285ae8468de4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -147,7 +147,7 @@ static void __exit_signal(struct task_struct *tsk) * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 
*/ - flush_sigqueue(&tsk->pending); + flush_task_sigqueue(tsk); tsk->sighand = NULL; spin_unlock(&sighand->siglock); diff --git a/kernel/fork.c b/kernel/fork.c index 865b701d64a8..5099883025a5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1284,6 +1284,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); + p->sigqueue_cache = NULL; p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; diff --git a/kernel/signal.c b/kernel/signal.c index 252b63de5a52..c79ca57c9e6f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -352,13 +353,45 @@ static bool task_participate_group_stop(struct task_struct *task) return false; } +#ifdef __HAVE_ARCH_CMPXCHG +static inline struct sigqueue *get_task_cache(struct task_struct *t) +{ + struct sigqueue *q = t->sigqueue_cache; + + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) + return NULL; + return q; +} + +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) +{ + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) + return 0; + return 1; +} + +#else + +static inline struct sigqueue *get_task_cache(struct task_struct *t) +{ + return NULL; +} + +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) +{ + return 1; +} + +#endif + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue * -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags, + int override_rlimit, int fromslab) { struct sigqueue *q = NULL; struct user_struct *user; @@ -375,7 +408,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi if (override_rlimit || atomic_read(&user->sigpending) <= task_rlimit(t, RLIMIT_SIGPENDING)) { - q = kmem_cache_alloc(sigqueue_cachep, flags); + if (!fromslab) + q = get_task_cache(t); + if (!q) + q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } @@ -392,6 +428,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi return q; } +static struct sigqueue * +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, + int override_rlimit) +{ + return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0); +} + static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) @@ -401,6 +444,21 @@ static void __sigqueue_free(struct sigqueue *q) kmem_cache_free(sigqueue_cachep, q); } +static void sigqueue_free_current(struct sigqueue *q) +{ + struct user_struct *up; + + if (q->flags & SIGQUEUE_PREALLOC) + return; + + up = q->user; + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { + atomic_dec(&up->sigpending); + free_uid(up); + } else + __sigqueue_free(q); +} + void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; @@ -413,6 +471,21 @@ void flush_sigqueue(struct sigpending *queue) } } +/* + * Called from __exit_signal. Flush tsk->pending and + * tsk->sigqueue_cache + */ +void flush_task_sigqueue(struct task_struct *tsk) +{ + struct sigqueue *q; + + flush_sigqueue(&tsk->pending); + + q = get_task_cache(tsk); + if (q) + kmem_cache_free(sigqueue_cachep, q); +} + /* * Flush all pending signals for a task. 
*/ @@ -565,7 +638,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); - __sigqueue_free(first); + sigqueue_free_current(first); } else { /* * Ok, it wasn't in the queue. This must be @@ -611,6 +684,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { int signr; + WARN_ON_ONCE(tsk != current); + /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ @@ -1528,7 +1603,8 @@ EXPORT_SYMBOL(kill_pid); */ struct sigqueue *sigqueue_alloc(void) { - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); + /* Preallocated sigqueue objects always from the slabcache ! */ + struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1); if (q) q->flags |= SIGQUEUE_PREALLOC; -- cgit v1.2.3 From 0e5127c0eb115ee94e1934a39a88969732fa14b7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 May 2015 00:22:53 +0200 Subject: signal/x86: Delay calling signals in atomic On x86_64 we must disable preemption before we enable interrupts for stack faults, int3 and debugging, because the current task is using a per CPU debug stack defined by the IST. If we schedule out, another task can come in and use the same stack and cause the stack to be corrupted and crash the kernel on return. When CONFIG_PREEMPT_RT_FULL is enabled, spin_locks become mutexes, and one of these is the spin lock used in signal handling. Some of the debug code (int3) causes do_trap() to send a signal. This function calls a spin lock that has been converted to a mutex and has the possibility to sleep. If this happens, the above issues with the corrupted stack is possible. Instead of calling the signal right away, for PREEMPT_RT and x86_64, the signal information is stored on the stacks task_struct and TIF_NOTIFY_RESUME is set. Then on exit of the trap, the signal resume code will send the signal when preemption is enabled. [ rostedt: Switched from #ifdef CONFIG_PREEMPT_RT_FULL to ARCH_RT_DELAYS_SIGNAL_SEND and added comments to the code. ] Cc: stable-rt@vger.kernel.org Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/signal.h | 13 +++++++++++++ arch/x86/kernel/signal.c | 8 ++++++++ include/linux/sched.h | 4 ++++ kernel/signal.c | 37 +++++++++++++++++++++++++++++++++++-- 4 files changed, 60 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 31eab867e6d3..b1b08a28c72a 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -23,6 +23,19 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; +/* + * Because some traps use the IST stack, we must keep preemption + * disabled while calling do_trap(), but do_trap() may call + * force_sig_info() which will grab the signal spin_locks for the + * task, which in PREEMPT_RT_FULL are mutexes. By defining + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the + * trap. 
+ */ +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_X86_64) +#define ARCH_RT_DELAYS_SIGNAL_SEND +#endif + #ifndef CONFIG_COMPAT typedef sigset_t compat_sigset_t; #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ed37a768d0fc..37d2cc072f1b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -746,6 +746,14 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (unlikely(current->forced_info.si_signo)) { + struct task_struct *t = current; + force_sig_info(t->forced_info.si_signo, &t->forced_info, t); + t->forced_info.si_signo = 0; + } +#endif + if (thread_info_flags & _TIF_UPROBE) uprobe_notify_resume(regs); diff --git a/include/linux/sched.h b/include/linux/sched.h index cca2295df002..91c0ab3544f0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1422,6 +1422,10 @@ struct task_struct { sigset_t blocked, real_blocked; sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ struct sigpending pending; +#ifdef CONFIG_PREEMPT_RT_FULL + /* TODO: move me into ->restart_block ? */ + struct siginfo forced_info; +#endif unsigned long sas_ss_sp; size_t sas_ss_size; diff --git a/kernel/signal.c b/kernel/signal.c index c79ca57c9e6f..c8ea69a738c7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1282,8 +1282,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, * We don't want to have recursive SIGSEGV's etc, for example, * that is why we also clear SIGNAL_UNKILLABLE. */ -int -force_sig_info(int sig, struct siginfo *info, struct task_struct *t) +static int +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t) { unsigned long int flags; int ret, blocked, ignored; @@ -1308,6 +1308,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) return ret; } +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t) +{ +/* + * On some archs, PREEMPT_RT has to delay sending a signal from a trap + * since it can not enable preemption, and the signal code's spin_locks + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will + * send the signal on exit of the trap. + */ +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (in_atomic()) { + if (WARN_ON_ONCE(t != current)) + return 0; + if (WARN_ON_ONCE(t->forced_info.si_signo)) + return 0; + + if (is_si_special(info)) { + WARN_ON_ONCE(info != SEND_SIG_PRIV); + t->forced_info.si_signo = sig; + t->forced_info.si_errno = 0; + t->forced_info.si_code = SI_KERNEL; + t->forced_info.si_pid = 0; + t->forced_info.si_uid = 0; + } else { + t->forced_info = *info; + } + + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + return 0; + } +#endif + return do_force_sig_info(sig, info, t); +} + /* * Nuke all other threads in the group. */ -- cgit v1.2.3 From 7bb5a46f20599b92b5979f8b4543a0b0ad3eb2c8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:30 -0500 Subject: drivers: random: Reduce preempt disabled region No need to keep preemption disabled across the whole function. 
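As a purely illustrative sketch (not part of the diff below), the sampling path then runs preemptible end to end; only names that appear in the hunk are used, the rest is elided:

	static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
	{
		/* preempt_disable() removed: sampling needs no CPU pinning */
		sample.jiffies = jiffies;
		sample.cycles = random_get_entropy();
		sample.num = num;
		...
		credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
		/* preempt_enable() removed as well */
	}

The entropy accounting done in between is assumed to be serialized by the pool's own locking, which is why the preempt-disabled section can go away.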
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- drivers/char/random.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 9cd6968e2f92..9a07ef22fab1 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -776,8 +776,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) } sample; long delta, delta2, delta3; - preempt_disable(); - sample.jiffies = jiffies; sample.cycles = random_get_entropy(); sample.num = num; @@ -818,7 +816,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) */ credit_entropy_bits(r, min_t(int, fls(delta>>1), 11)); } - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, -- cgit v1.2.3 From fa9e8398f8ad0328fa2f968ff007803f2cbf2b69 Mon Sep 17 00:00:00 2001 From: Benedikt Spranger Date: Sat, 6 Mar 2010 17:47:10 +0100 Subject: ARM: AT91: PIT: Remove irq handler when clock event is unused MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setup and remove the interrupt handler in clock event mode selection. This avoids calling the (shared) interrupt handler when the device is not used. Signed-off-by: Benedikt Spranger Signed-off-by: Thomas Gleixner [bigeasy: redo the patch with NR_IRQS_LEGACY which is probably required since commit 8fe82a55 ("ARM: at91: sparse irq support") which is included since v3.6. Patch based on what Sami Pietikäinen suggested]. Signed-off-by: Sebastian Andrzej Siewior --- arch/arm/mach-at91/at91rm9200_time.c | 1 + drivers/clocksource/timer-atmel-pit.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/arch/arm/mach-at91/at91rm9200_time.c b/arch/arm/mach-at91/at91rm9200_time.c index 7fd13aef9827..93491b1195fa 100644 --- a/arch/arm/mach-at91/at91rm9200_time.c +++ b/arch/arm/mach-at91/at91rm9200_time.c @@ -135,6 +135,7 @@ clkevt32k_mode(enum clock_event_mode mode, struct clock_event_device *dev) break; case CLOCK_EVT_MODE_SHUTDOWN: case CLOCK_EVT_MODE_UNUSED: + remove_irq(NR_IRQS_LEGACY + AT91_ID_SYS, &at91rm9200_timer_irq); case CLOCK_EVT_MODE_RESUME: irqmask = 0; break; diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c index d5289098b3df..109ad45835a0 100644 --- a/drivers/clocksource/timer-atmel-pit.c +++ b/drivers/clocksource/timer-atmel-pit.c @@ -90,6 +90,7 @@ static cycle_t read_pit_clk(struct clocksource *cs) return elapsed; } +static struct irqaction at91sam926x_pit_irq; /* * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16) */ @@ -100,6 +101,8 @@ pit_clkevt_mode(enum clock_event_mode mode, struct clock_event_device *dev) switch (mode) { case CLOCK_EVT_MODE_PERIODIC: + /* Set up irq handler */ + setup_irq(at91sam926x_pit_irq.irq, &at91sam926x_pit_irq); /* update clocksource counter */ data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR)); pit_write(data->base, AT91_PIT_MR, @@ -113,6 +116,7 @@ pit_clkevt_mode(enum clock_event_mode mode, struct clock_event_device *dev) /* disable irq, leaving the clocksource active */ pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN); + remove_irq(at91sam926x_pit_irq.irq, &at91sam926x_pit_irq); break; case CLOCK_EVT_MODE_RESUME: break; -- cgit v1.2.3 From 872413d2f9aca72e6b0fd432fd186205b62f129b Mon Sep 17 00:00:00 2001 From: Benedikt Spranger Date: Mon, 8 Mar 2010 18:57:04 +0100 Subject: clocksource: TCLIB: Allow higher clock rates for clock events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit As default the TCLIB uses the 32KiHz base clock rate for clock events. Add a compile time selection to allow higher clock resulution. (fixed up by Sami Pietikäinen ) Signed-off-by: Benedikt Spranger Signed-off-by: Thomas Gleixner --- drivers/clocksource/tcb_clksrc.c | 37 ++++++++++++++++++++++--------------- drivers/misc/Kconfig | 12 ++++++++++-- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c index 8bdbc45c6dad..43f1c6bc6e28 100644 --- a/drivers/clocksource/tcb_clksrc.c +++ b/drivers/clocksource/tcb_clksrc.c @@ -23,8 +23,7 @@ * this 32 bit free-running counter. the second channel is not used. * * - The third channel may be used to provide a 16-bit clockevent - * source, used in either periodic or oneshot mode. This runs - * at 32 KiHZ, and can handle delays of up to two seconds. + * source, used in either periodic or oneshot mode. * * A boot clocksource and clockevent source are also currently needed, * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so @@ -74,6 +73,7 @@ static struct clocksource clksrc = { struct tc_clkevt_device { struct clock_event_device clkevt; struct clk *clk; + u32 freq; void __iomem *regs; }; @@ -82,13 +82,6 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt) return container_of(clkevt, struct tc_clkevt_device, clkevt); } -/* For now, we always use the 32K clock ... this optimizes for NO_HZ, - * because using one of the divided clocks would usually mean the - * tick rate can never be less than several dozen Hz (vs 0.5 Hz). - * - * A divided clock could be good for high resolution timers, since - * 30.5 usec resolution can seem "low". - */ static u32 timer_clock; static void tc_mode(enum clock_event_mode m, struct clock_event_device *d) @@ -111,11 +104,12 @@ static void tc_mode(enum clock_event_mode m, struct clock_event_device *d) case CLOCK_EVT_MODE_PERIODIC: clk_enable(tcd->clk); - /* slow clock, count up to RC, then irq and restart */ + /* count up to RC, then irq and restart */ __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR)); - __raw_writel((32768 + HZ/2) / HZ, tcaddr + ATMEL_TC_REG(2, RC)); + __raw_writel((tcd->freq + HZ / 2) / HZ, + tcaddr + ATMEL_TC_REG(2, RC)); /* Enable clock and interrupts on RC compare */ __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER)); @@ -128,7 +122,7 @@ static void tc_mode(enum clock_event_mode m, struct clock_event_device *d) case CLOCK_EVT_MODE_ONESHOT: clk_enable(tcd->clk); - /* slow clock, count up to RC, then irq and stop */ + /* count up to RC, then irq and stop */ __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR)); @@ -157,8 +151,12 @@ static struct tc_clkevt_device clkevt = { .name = "tc_clkevt", .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK /* Should be lower than at91rm9200's system timer */ .rating = 125, +#else + .rating = 200, +#endif .set_next_event = tc_next_event, .set_mode = tc_mode, }, @@ -178,8 +176,9 @@ static irqreturn_t ch2_irq(int irq, void *handle) return IRQ_NONE; } -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx) +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx) { + unsigned divisor = atmel_tc_divisors[divisor_idx]; int ret; struct clk *t2_clk = tc->clk[2]; int irq = tc->irq[2]; @@ -193,7 +192,11 @@ static int 
__init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx) clkevt.regs = tc->regs; clkevt.clk = t2_clk; - timer_clock = clk32k_divisor_idx; + timer_clock = divisor_idx; + if (!divisor) + clkevt.freq = 32768; + else + clkevt.freq = clk_get_rate(t2_clk) / divisor; clkevt.clkevt.cpumask = cpumask_of(0); @@ -203,7 +206,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx) return ret; } - clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff); + clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff); return ret; } @@ -340,7 +343,11 @@ static int __init tcb_clksrc_init(void) goto err_disable_t1; /* channel 2: periodic and oneshot timer support */ +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK ret = setup_clkevents(tc, clk32k_divisor_idx); +#else + ret = setup_clkevents(tc, best_divisor_idx); +#endif if (ret) goto err_unregister_clksrc; diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index bbeb4516facf..a91797de3f90 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -69,8 +69,7 @@ config ATMEL_TCB_CLKSRC are combined to make a single 32-bit timer. When GENERIC_CLOCKEVENTS is defined, the third timer channel - may be used as a clock event device supporting oneshot mode - (delays of up to two seconds) based on the 32 KiHz clock. + may be used as a clock event device supporting oneshot mode. config ATMEL_TCB_CLKSRC_BLOCK int @@ -84,6 +83,15 @@ config ATMEL_TCB_CLKSRC_BLOCK TC can be used for other purposes, such as PWM generation and interval timing. +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK + bool "TC Block use 32 KiHz clock" + depends on ATMEL_TCB_CLKSRC + default y + help + Select this to use 32 KiHz base clock rate as TC block clock + source for clock events. + + config DUMMY_IRQ tristate "Dummy IRQ handler" default n -- cgit v1.2.3 From 2e6526f2428ed3faf5a49f337d7522d709f14c3b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:24 -0500 Subject: drivers/net: Use disable_irq_nosync() in 8139too Use disable_irq_nosync() instead of disable_irq() as this might be called in atomic context with netpoll. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- drivers/net/ethernet/realtek/8139too.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c index 007b38cce69a..7858f2bd894d 100644 --- a/drivers/net/ethernet/realtek/8139too.c +++ b/drivers/net/ethernet/realtek/8139too.c @@ -2215,7 +2215,7 @@ static void rtl8139_poll_controller(struct net_device *dev) struct rtl8139_private *tp = netdev_priv(dev); const int irq = tp->pci_dev->irq; - disable_irq(irq); + disable_irq_nosync(irq); rtl8139_interrupt(irq, dev); enable_irq(irq); } -- cgit v1.2.3 From 4003247af66377604ad35f29d8bbe94d5dd1f04e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:37 -0500 Subject: mm: Prepare decoupling the page fault disabling logic Add a pagefault_disabled variable to task_struct to allow decoupling the pagefault-disabled logic from the preempt count. 
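As an intermediate step the helpers become out-of-line and keep the preempt count while additionally tracking the state per task. A condensed view of the mm/memory.c hunk below:

	void pagefault_disable(void)
	{
		preempt_count_inc();
		current->pagefault_disabled++;
		/* make sure the store is issued before a pagefault can hit */
		barrier();
	}

	void pagefault_enable(void)
	{
		/* issue the last loads/stores before re-enabling the handler */
		barrier();
		current->pagefault_disabled--;
		preempt_enable();	/* preempt_count_dec() on !CONFIG_PREEMPT */
	}

Carrying both counters in this transition patch allows the per-arch fault handlers to be converted one by one before the preempt count is finally dropped.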
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 1 + include/linux/uaccess.h | 32 +++----------------------------- kernel/fork.c | 1 + mm/memory.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 35 insertions(+), 29 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 91c0ab3544f0..88df56d44d2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1463,6 +1463,7 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif + int pagefault_disabled; #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; unsigned long hardirq_enable_ip; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ecd3319dac33..9414a1b48f5c 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -6,36 +6,10 @@ /* * These routines enable/disable the pagefault handler in that - * it will not take any locks and go straight to the fixup table. - * - * They have great resemblance to the preempt_disable/enable calls - * and in fact they are identical; this is because currently there is - * no other way to make the pagefault handlers do this. So we do - * disable preemption but we don't necessarily care about that. + * it will not take any MM locks and go straight to the fixup table. */ -static inline void pagefault_disable(void) -{ - preempt_count_inc(); - /* - * make sure to have issued the store before a pagefault - * can hit. - */ - barrier(); -} - -static inline void pagefault_enable(void) -{ -#ifndef CONFIG_PREEMPT - /* - * make sure to issue those last loads/stores before enabling - * the pagefault handler again. - */ - barrier(); - preempt_count_dec(); -#else - preempt_enable(); -#endif -} +extern void pagefault_disable(void); +extern void pagefault_enable(void); #ifndef ARCH_HAS_NOCACHE_UACCESS diff --git a/kernel/fork.c b/kernel/fork.c index 5099883025a5..f4d669c788c5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1344,6 +1344,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->hardirq_context = 0; p->softirq_context = 0; #endif + p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; diff --git a/mm/memory.c b/mm/memory.c index 90fb265b32b6..922073f7d1c5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3244,6 +3244,36 @@ unlock: return 0; } +void pagefault_disable(void) +{ + preempt_count_inc(); + current->pagefault_disabled++; + /* + * make sure to have issued the store before a pagefault + * can hit. + */ + barrier(); +} +EXPORT_SYMBOL(pagefault_disable); + +void pagefault_enable(void) +{ +#ifndef CONFIG_PREEMPT + /* + * make sure to issue those last loads/stores before enabling + * the pagefault handler again. + */ + barrier(); + current->pagefault_disabled--; + preempt_count_dec(); +#else + barrier(); + current->pagefault_disabled--; + preempt_enable(); +#endif +} +EXPORT_SYMBOL(pagefault_enable); + /* * By the time we get here, we already hold the mm semaphore * -- cgit v1.2.3 From f0ca9c02dda31e5ed60a6b14488edcad362b5d3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 17 Mar 2011 11:32:28 +0100 Subject: mm: Fixup all fault handlers to check current->pagefault_disable Necessary for decoupling pagefault disable from preempt count. 
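The per-architecture change is mechanical; schematically, every fault handler's bail-out test grows the new per-task check (the exact context and label differ per arch, see the hunks below):

	/* before */
	if (in_atomic() || !mm)
		goto no_context;

	/* after */
	if (in_atomic() || !mm || current->pagefault_disabled)
		goto no_context;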
Signed-off-by: Thomas Gleixner --- arch/alpha/mm/fault.c | 2 +- arch/arm/mm/fault.c | 2 +- arch/avr32/mm/fault.c | 3 ++- arch/cris/mm/fault.c | 2 +- arch/frv/mm/fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m32r/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/microblaze/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/mn10300/mm/fault.c | 2 +- arch/parisc/mm/fault.c | 2 +- arch/powerpc/mm/fault.c | 2 +- arch/s390/mm/fault.c | 3 ++- arch/score/mm/fault.c | 2 +- arch/sh/mm/fault.c | 2 +- arch/sparc/mm/fault_32.c | 2 +- arch/sparc/mm/fault_64.c | 2 +- arch/tile/mm/fault.c | 2 +- arch/um/kernel/trap.c | 2 +- arch/x86/mm/fault.c | 2 +- arch/xtensa/mm/fault.c | 2 +- 22 files changed, 24 insertions(+), 22 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 9d0ac091a52a..5b59e11a96d8 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -107,7 +107,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* If we're in an interrupt context, or have no user context, we must not take the fault. */ - if (!mm || in_atomic()) + if (!mm || in_atomic() || current->pagefault_disabled) goto no_context; #ifdef CONFIG_ALPHA_LARGE_VMALLOC diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index eb8830a4c5ed..9e92a6ca6137 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -277,7 +277,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; if (user_mode(regs)) diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index d223a8b57c1e..ddb8290e8378 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -81,7 +81,8 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs) * If we're in an interrupt or have no user context, we must * not take the fault... */ - if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM)) + if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM) || + current->pagefault_disabled) goto no_context; local_irq_enable(); diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 2686a7aa8ec8..544670a37dea 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -113,7 +113,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, * user context, we must not take the fault. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; if (user_mode(regs)) diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index ec4917ddf678..45204939aabc 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; if (user_mode(__frame)) diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index ba5ba7accd0d..c4fb062736dd 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -96,7 +96,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re /* * If we're in an interrupt or have no user context, we must not take the fault.. 
*/ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; #ifdef CONFIG_VIRTUAL_MEM_MAP diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index e3d4d4890104..ec9ef1059ff4 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -114,7 +114,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, * If we're in an interrupt or have no user context or are running in an * atomic region then we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled goto bad_area_nosemaphore; if (error_code & ACE_USERMODE) diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index b2f04aee46ec..af31d699b026 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -81,7 +81,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; if (user_mode(regs)) diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index d46a5ebb7570..eb12f4b61798 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -107,7 +107,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11) is_write = 0; - if (unlikely(in_atomic() || !mm)) { + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { if (kernel_mode(regs)) goto bad_area_nosemaphore; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 70ab5d664332..66d4a13d914e 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -89,7 +89,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; if (user_mode(regs)) diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 0c2cc5d39c8e..160b3b2af9f9 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code, * If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index e5120e653240..60f970803ce4 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -207,7 +207,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, int fault; unsigned int flags; - if (in_atomic()) + if (in_atomic() || current->pagefault_disabled) goto no_context; tsk = current; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index f06b56baf0b3..7896366e6bf2 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -273,7 +273,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, if (!arch_irq_disabled_regs(regs)) local_irq_enable(); - if (in_atomic() || mm == NULL) { + if (in_atomic() || mm == NULL || current->pagefault_disabled) { if (!user_mode(regs)) { rc = SIGSEGV; goto bail; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index fbe8f2cf9245..f22e2379baec 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -435,7 +435,8 @@ static inline int do_exception(struct pt_regs *regs, int access) * user context. */ fault = VM_FAULT_BADCONTEXT; - if (unlikely(!user_space_fault(regs) || in_atomic() || !mm)) + if (unlikely(!user_space_fault(regs) || in_atomic() || !mm || + tsk->pagefault_disabled)) goto out; address = trans_exc_code & __FAIL_ADDR_MASK; diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 6860beb2a280..dae63aec5422 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -73,7 +73,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; if (user_mode(regs)) diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index a58fec9b55e0..204624f71a4b 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -440,7 +440,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm)) { + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 70d817154fe8..e8adda91e74e 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -196,7 +196,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 479823249429..c43fdcbcf804 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -330,7 +330,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_enabled) goto intr_or_no_mm; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index c6d2a76d91a8..925b686f27a0 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -357,7 +357,7 @@ static int handle_page_fault(struct pt_regs *regs, * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ - if (in_atomic() || !mm) { + if (in_atomic() || !mm || current->pagefault_disabled) { vma = NULL; /* happy compiler */ goto bad_area_nosemaphore; } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 209617302df8..015933a10b52 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -38,7 +38,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, * If the fault was during atomic operation, don't take the fault, just * fail. */ - if (in_atomic()) + if (in_atomic() || current->pagefault_disabled) goto out_nosemaphore; if (is_user) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4d8ee827cb14..0c94489cbb8b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1128,7 +1128,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm)) { + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 9e3571a6535c..27f563e5c8fe 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs) /* If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) { + if (in_atomic() || !mm || current->pagefault_disabled) { bad_page_fault(regs, address, SIGSEGV); return; } -- cgit v1.2.3 From e0fab34143eb00d52c7e0847c7a09526047fd8fd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2011 15:31:31 +0200 Subject: mm: pagefault_disabled() Wrap the test for pagefault_disabled() into a helper, this allows us to remove the need for current->pagefault_disabled on !-rt kernels. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3yy517m8zsi9fpsf14xfaqkw@git.kernel.org --- arch/alpha/mm/fault.c | 2 +- arch/arm/mm/fault.c | 2 +- arch/avr32/mm/fault.c | 3 +-- arch/cris/mm/fault.c | 2 +- arch/frv/mm/fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m32r/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/microblaze/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/mn10300/mm/fault.c | 2 +- arch/parisc/mm/fault.c | 2 +- arch/powerpc/mm/fault.c | 2 +- arch/s390/mm/fault.c | 2 +- arch/score/mm/fault.c | 2 +- arch/sh/mm/fault.c | 2 +- arch/sparc/mm/fault_32.c | 2 +- arch/sparc/mm/fault_64.c | 2 +- arch/tile/mm/fault.c | 2 +- arch/um/kernel/trap.c | 2 +- arch/x86/mm/fault.c | 2 +- arch/xtensa/mm/fault.c | 2 +- include/linux/sched.h | 14 ++++++++++++++ kernel/fork.c | 2 ++ 24 files changed, 38 insertions(+), 23 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 5b59e11a96d8..870f626674e8 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -107,7 +107,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* If we're in an interrupt context, or have no user context, we must not take the fault. 
*/ - if (!mm || in_atomic() || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; #ifdef CONFIG_ALPHA_LARGE_VMALLOC diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 9e92a6ca6137..b40d4bab8e07 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -277,7 +277,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; if (user_mode(regs)) diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index ddb8290e8378..8a7fb9343cbf 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -81,8 +81,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs) * If we're in an interrupt or have no user context, we must * not take the fault... */ - if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM) || - current->pagefault_disabled) + if (!mm || regs->sr & SYSREG_BIT(GM) || pagefault_disabled()) goto no_context; local_irq_enable(); diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 544670a37dea..b1a5631d95bf 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -113,7 +113,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, * user context, we must not take the fault. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; if (user_mode(regs)) diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 45204939aabc..c90d23f50579 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; if (user_mode(__frame)) diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index c4fb062736dd..f9f7d083cfb4 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -96,7 +96,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re /* * If we're in an interrupt or have no user context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; #ifdef CONFIG_VIRTUAL_MEM_MAP diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index ec9ef1059ff4..7069ad371f72 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -114,7 +114,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, * If we're in an interrupt or have no user context or are running in an * atomic region then we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled + if (!mm || pagefault_disabled()) goto bad_area_nosemaphore; if (error_code & ACE_USERMODE) diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index af31d699b026..5de2621aab37 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -81,7 +81,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; if (user_mode(regs)) diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index eb12f4b61798..914ae86b8c96 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -107,7 +107,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11) is_write = 0; - if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { + if (unlikely(!mm || pagefault_disabled())) { if (kernel_mode(regs)) goto bad_area_nosemaphore; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 66d4a13d914e..5b41b84d9092 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -89,7 +89,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto bad_area_nosemaphore; if (user_mode(regs)) diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 160b3b2af9f9..54305d255ced 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 60f970803ce4..065c957adbff 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -207,7 +207,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, int fault; unsigned int flags; - if (in_atomic() || current->pagefault_disabled) + if (pagefault_disabled()) goto no_context; tsk = current; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 7896366e6bf2..1e949bd22909 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -273,7 +273,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, if (!arch_irq_disabled_regs(regs)) local_irq_enable(); - if (in_atomic() || mm == NULL || current->pagefault_disabled) { + if (in_atomic() || mm == NULL || pagefault_disabled()) { if (!user_mode(regs)) { rc = SIGSEGV; goto bail; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index f22e2379baec..43ec237a17e2 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -435,7 +435,7 @@ static inline int do_exception(struct pt_regs *regs, int access) * user context. */ fault = VM_FAULT_BADCONTEXT; - if (unlikely(!user_space_fault(regs) || in_atomic() || !mm || + if (unlikely(!user_space_fault(regs) || !mm || tsk->pagefault_disabled)) goto out; diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index dae63aec5422..b946291e60d1 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -73,7 +73,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, * If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto bad_area_nosemaphore; if (user_mode(regs)) diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 204624f71a4b..364cae5806ce 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -440,7 +440,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { + if (unlikely(!mm || pagefault_disabled())) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index e8adda91e74e..7c04a49477af 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -196,7 +196,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_disabled) + if (!mm || pagefault_disabled()) goto no_context; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index c43fdcbcf804..28afbdf069e1 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -330,7 +330,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm || current->pagefault_enabled) + if (!mm || pagefault_disabled()) goto intr_or_no_mm; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 925b686f27a0..4cdab266bcc3 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -357,7 +357,7 @@ static int handle_page_fault(struct pt_regs *regs, * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ - if (in_atomic() || !mm || current->pagefault_disabled) { + if (!mm || pagefault_disabled()) { vma = NULL; /* happy compiler */ goto bad_area_nosemaphore; } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 015933a10b52..6126d3a23382 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -38,7 +38,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, * If the fault was during atomic operation, don't take the fault, just * fail. */ - if (in_atomic() || current->pagefault_disabled) + if (pagefault_disabled()) goto out_nosemaphore; if (is_user) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 0c94489cbb8b..2bb8e7d2882b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1128,7 +1128,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { + if (unlikely(!mm || pagefault_disabled())) { bad_area_nosemaphore(regs, error_code, address); return; } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 27f563e5c8fe..9696ab258ee6 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs) /* If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_atomic() || !mm || current->pagefault_disabled) { + if (!mm || pagefault_disabled()) { bad_page_fault(regs, address, SIGSEGV); return; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 88df56d44d2f..ccbf3690c058 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -56,6 +56,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -1463,7 +1464,9 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif +#ifdef CONFIG_PREEMPT_RT_FULL int pagefault_disabled; +#endif #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; unsigned long hardirq_enable_ip; @@ -1704,6 +1707,17 @@ static inline bool should_numa_migrate_memory(struct task_struct *p, } #endif +#ifdef CONFIG_PREEMPT_RT_FULL +static inline bool cur_pf_disabled(void) { return current->pagefault_disabled; } +#else +static inline bool cur_pf_disabled(void) { return false; } +#endif + +static inline bool pagefault_disabled(void) +{ + return in_atomic() || cur_pf_disabled(); +} + static inline struct pid *task_pid(struct task_struct *task) { return task->pids[PIDTYPE_PID].pid; diff --git a/kernel/fork.c b/kernel/fork.c index f4d669c788c5..b43d8908b8d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1344,7 +1344,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->hardirq_context = 0; p->softirq_context = 0; #endif +#ifdef CONFIG_PREEMPT_RT_FULL p->pagefault_disabled = 0; +#endif #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; -- cgit v1.2.3 From 0ca3bab8edd728431ffd4eff8ffedf9067b0f8d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Aug 2011 17:16:58 +0200 Subject: mm: raw_pagefault_disable Adding migrate_disable() to pagefault_disable() to preserve the per-cpu thing for kmap_atomic might not have been the best of choices. But short of adding preempt_disable/migrate_disable foo all over the kmap code it still seems the best way. It does however yield the below borkage as well as wreck !-rt builds since !-rt does rely on pagefault_disable() not preempting. So fix all that up by adding raw_pagefault_disable(). [] warn_slowpath_common+0x85/0x9d [] warn_slowpath_fmt+0x46/0x48 [] ? _raw_spin_lock+0x6c/0x73 [] ? watchdog_overflow_callback+0x9b/0xd0 [] watchdog_overflow_callback+0xb7/0xd0 [] __perf_event_overflow+0x11c/0x1fe [] ? perf_event_update_userpage+0x149/0x151 [] ? perf_event_task_disable+0x7c/0x7c [] perf_event_overflow+0x14/0x16 [] x86_pmu_handle_irq+0xcb/0x108 [] perf_event_nmi_handler+0x46/0x91 [] notifier_call_chain+0x79/0xa6 [] __atomic_notifier_call_chain+0x66/0x98 [] ? notifier_call_chain+0xa6/0xa6 [] atomic_notifier_call_chain+0x14/0x16 [] notify_die+0x2e/0x30 [] do_nmi+0x7e/0x22b [] nmi+0x1a/0x2c [] ? sub_preempt_count+0x4b/0xaa <> [] delay_tsc+0xac/0xd1 [] __delay+0xf/0x11 [] do_raw_spin_lock+0xd2/0x13c [] _raw_spin_lock_irqsave+0x6b/0x85 [] ? task_rq_lock+0x35/0x8d [] task_rq_lock+0x35/0x8d [] migrate_disable+0x65/0x12c [] pagefault_disable+0xe/0x1f [] dump_trace+0x21f/0x2e2 [] show_trace_log_lvl+0x54/0x5d [] show_trace+0x15/0x17 [] dump_stack+0x77/0x80 [] spin_bug+0x9c/0xa3 [] ? task_rq_lock+0x50/0x8d [] do_raw_spin_lock+0x47/0x13c [] _raw_spin_lock+0x60/0x73 [] ? task_rq_lock+0x50/0x8d [] task_rq_lock+0x50/0x8d [] migrate_disable+0x65/0x12c [] pagefault_disable+0xe/0x1f [] dump_trace+0x21f/0x2e2 [] save_stack_trace+0x2f/0x4c [] save_trace+0x3f/0xaf [] mark_lock+0x228/0x530 [] __lock_acquire+0x662/0x1812 [] ? native_sched_clock+0x37/0x6d [] ? 
trace_hardirqs_off_caller+0x1f/0x99 [] ? sched_rt_period_timer+0xbd/0x218 [] lock_acquire+0x145/0x18a [] ? sched_rt_period_timer+0xbd/0x218 [] _raw_spin_lock+0x40/0x73 [] ? sched_rt_period_timer+0xbd/0x218 [] sched_rt_period_timer+0xbd/0x218 [] __run_hrtimer+0x1e4/0x347 [] ? can_migrate_task.clone.82+0x14a/0x14a [] hrtimer_interrupt+0xee/0x1d6 [] ? add_preempt_count+0xae/0xb2 [] smp_apic_timer_interrupt+0x85/0x98 [] apic_timer_interrupt+0x13/0x20 Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-31keae8mkjiv8esq4rl76cib@git.kernel.org --- include/linux/uaccess.h | 40 ++++++++++++++++++++++++++++++++++++++-- mm/memory.c | 8 ++------ 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 9414a1b48f5c..303a282920e5 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -8,8 +8,44 @@ * These routines enable/disable the pagefault handler in that * it will not take any MM locks and go straight to the fixup table. */ +static inline void raw_pagefault_disable(void) +{ + preempt_count_inc(); + /* + * make sure to have issued the store before a pagefault + * can hit. + */ + barrier(); +} + +static inline void raw_pagefault_enable(void) +{ +#ifndef CONFIG_PREEMPT + /* + * make sure to issue those last loads/stores before enabling + * the pagefault handler again. + */ + barrier(); + preempt_count_dec(); +#else + preempt_enable(); +#endif +} + +#ifndef CONFIG_PREEMPT_RT_FULL +static inline void pagefault_disable(void) +{ + raw_pagefault_disable(); +} + +static inline void pagefault_enable(void) +{ + raw_pagefault_enable(); +} +#else extern void pagefault_disable(void); extern void pagefault_enable(void); +#endif #ifndef ARCH_HAS_NOCACHE_UACCESS @@ -50,9 +86,9 @@ static inline unsigned long __copy_from_user_nocache(void *to, mm_segment_t old_fs = get_fs(); \ \ set_fs(KERNEL_DS); \ - pagefault_disable(); \ + raw_pagefault_disable(); \ ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \ - pagefault_enable(); \ + raw_pagefault_enable(); \ set_fs(old_fs); \ ret; \ }) diff --git a/mm/memory.c b/mm/memory.c index 922073f7d1c5..096323f48cc6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3244,6 +3244,7 @@ unlock: return 0; } +#ifdef CONFIG_PREEMPT_RT_FULL void pagefault_disable(void) { preempt_count_inc(); @@ -3258,21 +3259,16 @@ EXPORT_SYMBOL(pagefault_disable); void pagefault_enable(void) { -#ifndef CONFIG_PREEMPT /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); current->pagefault_disabled--; - preempt_count_dec(); -#else - barrier(); - current->pagefault_disabled--; preempt_enable(); -#endif } EXPORT_SYMBOL(pagefault_enable); +#endif /* * By the time we get here, we already hold the mm semaphore -- cgit v1.2.3 From ee7d734476ef3fbf5bddfa6907abbb2a1ef52610 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 22:06:27 +0200 Subject: mm: Remove preempt count from pagefault disable/enable Now that all users are cleaned up, we can remove the preemption count. 
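The end state on PREEMPT_RT_FULL is a plain per-task counter, roughly:

	void pagefault_disable(void)
	{
		current->pagefault_disabled++;
		/* make sure the store is issued before a pagefault can hit */
		barrier();
	}

	void pagefault_enable(void)
	{
		/* issue the last loads/stores before re-enabling the handler */
		barrier();
		current->pagefault_disabled--;
	}

The fault handlers test pagefault_disabled(), i.e. in_atomic() or a non-zero counter, so a fault raised from such a section still goes straight to the fixup table without taking mm locks.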
Signed-off-by: Thomas Gleixner --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 096323f48cc6..e38f53b792d9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3247,7 +3247,6 @@ unlock: #ifdef CONFIG_PREEMPT_RT_FULL void pagefault_disable(void) { - preempt_count_inc(); current->pagefault_disabled++; /* * make sure to have issued the store before a pagefault @@ -3265,7 +3264,6 @@ void pagefault_enable(void) */ barrier(); current->pagefault_disabled--; - preempt_enable(); } EXPORT_SYMBOL(pagefault_enable); #endif -- cgit v1.2.3 From 131336ddab8ce6a0434034d50c48053d0c9bcb39 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jul 2010 10:29:00 +0200 Subject: suspend: Prevent might sleep splats timekeeping suspend/resume calls read_persistant_clock() which takes rtc_lock. That results in might sleep warnings because at that point we run with interrupts disabled. We cannot convert rtc_lock to a raw spinlock as that would trigger other might sleep warnings. As a temporary workaround we disable the might sleep warnings by setting system_state to SYSTEM_SUSPEND before calling sysdev_suspend() and restoring it to SYSTEM_RUNNING afer sysdev_resume(). Needs to be revisited. Signed-off-by: Thomas Gleixner --- include/linux/kernel.h | 1 + kernel/power/hibernate.c | 7 +++++++ kernel/power/suspend.c | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3d770f5564b8..8df5e327ec55 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -451,6 +451,7 @@ extern enum system_states { SYSTEM_HALT, SYSTEM_POWER_OFF, SYSTEM_RESTART, + SYSTEM_SUSPEND, } system_state; #define TAINT_PROPRIETARY_MODULE 0 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1f35a3478f3c..f7170869157b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -287,6 +287,8 @@ static int create_image(int platform_mode) local_irq_disable(); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " @@ -316,6 +318,7 @@ static int create_image(int platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: @@ -439,6 +442,7 @@ static int resume_target_kernel(bool platform_mode) goto Enable_cpus; local_irq_disable(); + system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) @@ -472,6 +476,7 @@ static int resume_target_kernel(bool platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: @@ -557,6 +562,7 @@ int hibernation_platform_enter(void) goto Platform_finish; local_irq_disable(); + system_state = SYSTEM_SUSPEND; syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; @@ -569,6 +575,7 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); + system_state = SYSTEM_RUNNING; local_irq_enable(); enable_nonboot_cpus(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c347e3ce3a55..ae509484125c 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -318,6 +318,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (!error) { *wakeup = pm_wakeup_pending(); @@ -332,6 +334,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) syscore_resume(); } + system_state = SYSTEM_RUNNING; + arch_suspend_enable_irqs(); 
BUG_ON(irqs_disabled()); -- cgit v1.2.3 From 66843fbcd6bc6a61df4f34ee926db24b4c6f020e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jun 2011 11:24:35 +0200 Subject: mm-page-alloc-use-list-last-entry.patch Signed-off-by: Thomas Gleixner --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c32cb64a1277..7e3f1634389c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -702,7 +702,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, do { int mt; /* migratetype of the to-be-freed page */ - page = list_entry(list->prev, struct page, lru); + page = list_last_entry(list, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); mt = get_freepage_migratetype(page); -- cgit v1.2.3 From b6823e455c01601c3d064a37ca9fd03bdee72ff0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Jul 2011 21:24:27 +0200 Subject: rwsem-inlcude-fix.patch Signed-off-by: Thomas Gleixner --- include/linux/pid.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/pid.h b/include/linux/pid.h index 23705a53abba..2cc64b779f03 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -2,6 +2,7 @@ #define _LINUX_PID_H #include +#include enum pid_type { -- cgit v1.2.3 From daadc37d2a2717b6b30bd774d50d739662fcb39b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Nov 2011 10:52:34 +0100 Subject: sysctl-include-fix.patch Signed-off-by: Thomas Gleixner --- include/linux/sysctl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b7361f831226..22050201d20e 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -25,6 +25,7 @@ #include #include #include +#include #include /* For the /proc/sys support */ -- cgit v1.2.3 From 1316f8f55ef81105648c7be75fdc0fae697a28d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 10:59:58 +0200 Subject: net-flip-lock-dep-thingy.patch ======================================================= [ INFO: possible circular locking dependency detected ] 3.0.0-rc3+ #26 ------------------------------------------------------- ip/1104 is trying to acquire lock: (local_softirq_lock){+.+...}, at: [] __local_lock+0x25/0x68 but task is already holding lock: (sk_lock-AF_INET){+.+...}, at: [] lock_sock+0x10/0x12 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (sk_lock-AF_INET){+.+...}: [] lock_acquire+0x103/0x12e [] lock_sock_nested+0x82/0x92 [] lock_sock+0x10/0x12 [] tcp_close+0x1b/0x355 [] inet_release+0xc3/0xcd [] sock_release+0x1f/0x74 [] sock_close+0x27/0x2b [] fput+0x11d/0x1e3 [] filp_close+0x70/0x7b [] sys_close+0xf8/0x13d [] system_call_fastpath+0x16/0x1b -> #0 (local_softirq_lock){+.+...}: [] __lock_acquire+0xacc/0xdc8 [] lock_acquire+0x103/0x12e [] _raw_spin_lock+0x3b/0x4a [] __local_lock+0x25/0x68 [] local_bh_disable+0x36/0x3b [] _raw_write_lock_bh+0x16/0x4f [] tcp_close+0x159/0x355 [] inet_release+0xc3/0xcd [] sock_release+0x1f/0x74 [] sock_close+0x27/0x2b [] fput+0x11d/0x1e3 [] filp_close+0x70/0x7b [] sys_close+0xf8/0x13d [] system_call_fastpath+0x16/0x1b other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sk_lock-AF_INET); lock(local_softirq_lock); lock(sk_lock-AF_INET); lock(local_softirq_lock); *** DEADLOCK *** 1 lock held by ip/1104: #0: (sk_lock-AF_INET){+.+...}, at: [] lock_sock+0x10/0x12 stack backtrace: Pid: 1104, comm: ip Not tainted 3.0.0-rc3+ #26 Call Trace: [] print_circular_bug+0x1f8/0x209 [] __lock_acquire+0xacc/0xdc8 [] ? __local_lock+0x25/0x68 [] lock_acquire+0x103/0x12e [] ? __local_lock+0x25/0x68 [] ? get_parent_ip+0x11/0x41 [] _raw_spin_lock+0x3b/0x4a [] ? __local_lock+0x25/0x68 [] ? get_parent_ip+0x28/0x41 [] __local_lock+0x25/0x68 [] local_bh_disable+0x36/0x3b [] ? lock_sock+0x10/0x12 [] _raw_write_lock_bh+0x16/0x4f [] tcp_close+0x159/0x355 [] inet_release+0xc3/0xcd [] sock_release+0x1f/0x74 [] sock_close+0x27/0x2b [] fput+0x11d/0x1e3 [] filp_close+0x70/0x7b [] sys_close+0xf8/0x13d [] system_call_fastpath+0x16/0x1b Signed-off-by: Thomas Gleixner --- net/core/sock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 852acbc52f96..2ac6eec30de3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2345,12 +2345,11 @@ void lock_sock_nested(struct sock *sk, int subclass) if (sk->sk_lock.owned) __lock_sock(sk); sk->sk_lock.owned = 1; - spin_unlock(&sk->sk_lock.slock); + spin_unlock_bh(&sk->sk_lock.slock); /* * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); - local_bh_enable(); } EXPORT_SYMBOL(lock_sock_nested); -- cgit v1.2.3 From cd97db5219e7a18893d56dfee1ec0faca216d66d Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 5 Mar 2014 00:49:47 +0100 Subject: net: sched: dev_deactivate_many(): use msleep(1) instead of yield() to wait for outstanding qdisc_run calls On PREEMPT_RT enabled systems the interrupt handler run as threads at prio 50 (by default). If a high priority userspace process tries to shut down a busy network interface it might spin in a yield loop waiting for the device to become idle. With the interrupt thread having a lower priority than the looping process it might never be scheduled and so result in a deadlock on UP systems. 
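As a minimal sketch (simplified, assuming the helpers visible in net/sched/sch_generic.c; not a verbatim copy of dev_deactivate_many()), the shape of the wait loop after this change is:

/* Sleep instead of yield() so the threaded irq handler can finish. */
static void wait_for_qdiscs_sketch(struct list_head *head)
{
	struct net_device *dev;

	list_for_each_entry(dev, head, close_list)
		while (some_qdisc_is_busy(dev))
			msleep(1);	/* was yield(): starved the prio-50 irq thread */
}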
With Magic SysRq the following backtrace can be produced: > test_app R running 0 174 168 0x00000000 > [] (__schedule+0x220/0x3fc) from [] (preempt_schedule_irq+0x48/0x80) > [] (preempt_schedule_irq+0x48/0x80) from [] (svc_preempt+0x8/0x20) > [] (svc_preempt+0x8/0x20) from [] (local_bh_enable+0x18/0x88) > [] (local_bh_enable+0x18/0x88) from [] (dev_deactivate_many+0x220/0x264) > [] (dev_deactivate_many+0x220/0x264) from [] (__dev_close_many+0x64/0xd4) > [] (__dev_close_many+0x64/0xd4) from [] (__dev_close+0x28/0x3c) > [] (__dev_close+0x28/0x3c) from [] (__dev_change_flags+0x88/0x130) > [] (__dev_change_flags+0x88/0x130) from [] (dev_change_flags+0x10/0x48) > [] (dev_change_flags+0x10/0x48) from [] (do_setlink+0x370/0x7ec) > [] (do_setlink+0x370/0x7ec) from [] (rtnl_newlink+0x2b4/0x450) > [] (rtnl_newlink+0x2b4/0x450) from [] (rtnetlink_rcv_msg+0x158/0x1f4) > [] (rtnetlink_rcv_msg+0x158/0x1f4) from [] (netlink_rcv_skb+0xac/0xc0) > [] (netlink_rcv_skb+0xac/0xc0) from [] (rtnetlink_rcv+0x18/0x24) > [] (rtnetlink_rcv+0x18/0x24) from [] (netlink_unicast+0x13c/0x198) > [] (netlink_unicast+0x13c/0x198) from [] (netlink_sendmsg+0x264/0x2e0) > [] (netlink_sendmsg+0x264/0x2e0) from [] (sock_sendmsg+0x78/0x98) > [] (sock_sendmsg+0x78/0x98) from [] (___sys_sendmsg.part.25+0x268/0x278) > [] (___sys_sendmsg.part.25+0x268/0x278) from [] (__sys_sendmsg+0x48/0x78) > [] (__sys_sendmsg+0x48/0x78) from [] (ret_fast_syscall+0x0/0x2c) This patch works around the problem by replacing yield() by msleep(1), giving the interrupt thread time to finish, similar to other changes contained in the rt patch set. Using wait_for_completion() instead would probably be a better solution. Cc: stable-rt@vger.kernel.org Signed-off-by: Marc Kleine-Budde Signed-off-by: Sebastian Andrzej Siewior --- net/sched/sch_generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6efca30894aa..1e346523fc50 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -894,7 +894,7 @@ void dev_deactivate_many(struct list_head *head) /* Wait for outstanding qdisc_run calls. 
*/ list_for_each_entry(dev, head, close_list) while (some_qdisc_is_busy(dev)) - yield(); + msleep(1); } void dev_deactivate(struct net_device *dev) -- cgit v1.2.3 From a443c64a1f99eafd6f724465740ffc5caef49b2a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 15:44:15 +0200 Subject: softirq-thread-do-softirq.patch Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 2 +- net/core/dev.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 69517a24bc50..f2eee2f304c4 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -418,7 +418,7 @@ struct softirq_action asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); - +static inline void thread_do_softirq(void) { do_softirq(); } #ifdef __ARCH_HAS_DO_SOFTIRQ void do_softirq_own_stack(void); #else diff --git a/net/core/dev.c b/net/core/dev.c index 5cdbc1bd9783..885aec4c5fd2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3405,7 +3405,7 @@ int netif_rx_ni(struct sk_buff *skb) preempt_disable(); err = netif_rx_internal(skb); if (local_softirq_pending()) - do_softirq(); + thread_do_softirq(); preempt_enable(); return err; -- cgit v1.2.3 From 6a7c4ce5a3c69e9adc426c5eec63325557b012f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 15:46:49 +0200 Subject: softirq-split-out-code.patch Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 70 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index 0699add19164..822cc641f89b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -77,6 +77,43 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } +static void handle_pending_softirqs(u32 pending) +{ + struct softirq_action *h = softirq_vec; + int softirq_bit; + + local_irq_enable(); + + h = softirq_vec; + + while ((softirq_bit = ffs(pending))) { + unsigned int vec_nr; + int prev_count; + + h += softirq_bit - 1; + + vec_nr = h - softirq_vec; + prev_count = preempt_count(); + + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); + h->action(h); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", + vec_nr, softirq_to_name[vec_nr], h->action, + prev_count, preempt_count()); + preempt_count_set(prev_count); + } + h++; + pending >>= softirq_bit; + } + + rcu_bh_qs(); + local_irq_disable(); +} + /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -228,10 +265,8 @@ asmlinkage __visible void __do_softirq(void) unsigned long end = jiffies + MAX_SOFTIRQ_TIME; unsigned long old_flags = current->flags; int max_restart = MAX_SOFTIRQ_RESTART; - struct softirq_action *h; bool in_hardirq; __u32 pending; - int softirq_bit; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -250,36 +285,7 @@ restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - local_irq_enable(); - - h = softirq_vec; - - while ((softirq_bit = ffs(pending))) { - unsigned int vec_nr; - int prev_count; - - h += softirq_bit - 1; - - vec_nr = h - softirq_vec; - prev_count = preempt_count(); - - kstat_incr_softirqs_this_cpu(vec_nr); - - trace_softirq_entry(vec_nr); - h->action(h); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - pr_err("huh, entered softirq %u %s 
%p with preempt_count %08x, exited with %08x?\n", - vec_nr, softirq_to_name[vec_nr], h->action, - prev_count, preempt_count()); - preempt_count_set(prev_count); - } - h++; - pending >>= softirq_bit; - } - - rcu_bh_qs(); - local_irq_disable(); + handle_pending_softirqs(pending); pending = local_softirq_pending(); if (pending) { -- cgit v1.2.3 From 72d76b8078443783d01b6a76cc96dda081510298 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:27 -0500 Subject: x86: Do not unmask io_apic when interrupt is in progress With threaded interrupts we might see an interrupt in progress on migration. Do not unmask it when this is the case. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1183d545da1e..791ffdafd674 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2494,7 +2494,8 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) { /* If we are moving the irq we need to mask it */ - if (unlikely(irqd_is_setaffinity_pending(data))) { + if (unlikely(irqd_is_setaffinity_pending(data) && + !irqd_irq_inprogress(data))) { mask_ioapic(cfg); return true; } -- cgit v1.2.3 From 9479df50d4c65932758991fd4762dac580102093 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 May 2015 00:23:27 +0200 Subject: x86: Do not disable preemption in int3 on 32bit Preemption must be disabled before enabling interrupts in do_trap on x86_64 because the stack in use for int3 and debug is a per CPU stack set by th IST. But 32bit does not have an IST and the stack still belongs to the current task and there is no problem in scheduling out the task. Keep preemption enabled on X86_32 when enabling interrupts for do_trap(). The name of the function is changed from preempt_conditional_sti/cli() to conditional_sti/cli_ist(), to annotate that this function is used when the stack is on the IST. Cc: stable-rt@vger.kernel.org Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner --- arch/x86/kernel/traps.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 07ab8e9733c5..4f0662bb6705 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -87,9 +87,21 @@ static inline void conditional_sti(struct pt_regs *regs) local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) +static inline void conditional_sti_ist(struct pt_regs *regs) { +#ifdef CONFIG_X86_64 + /* + * X86_64 uses a per CPU stack on the IST for certain traps + * like int3. The task can not be preempted when using one + * of these stacks, thus preemption must be disabled, otherwise + * the stack can be corrupted if the task is scheduled out, + * and another task comes in and uses this stack. + * + * On x86_32 the task keeps its own stack and it is OK if the + * task schedules out. 
+ */ preempt_count_inc(); +#endif if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -100,11 +112,13 @@ static inline void conditional_cli(struct pt_regs *regs) local_irq_disable(); } -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void conditional_cli_ist(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); +#ifdef CONFIG_X86_64 preempt_count_dec(); +#endif } static nokprobe_inline int @@ -372,9 +386,9 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) * as we may switch to the interrupt stack. */ debug_stack_usage_inc(); - preempt_conditional_sti(regs); + conditional_sti_ist(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + conditional_cli_ist(regs); debug_stack_usage_dec(); exit: exception_exit(prev_state); @@ -517,12 +531,12 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); + conditional_sti_ist(regs); if (regs->flags & X86_VM_MASK) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); - preempt_conditional_cli(regs); + conditional_cli_ist(regs); debug_stack_usage_dec(); goto exit; } @@ -542,7 +556,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); - preempt_conditional_cli(regs); + conditional_cli_ist(regs); debug_stack_usage_dec(); exit: -- cgit v1.2.3 From d537c1d0e3a1f4ac723c080f5e2f5ea95fe25f83 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Dec 2011 00:07:16 +0100 Subject: pci: Use __wake_up_all_locked pci_unblock_user_cfg_access() The waitqueue is protected by the pci_lock, so we can just avoid to lock the waitqueue lock itself. That prevents the might_sleep()/scheduling while atomic problem on RT Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- drivers/pci/access.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 49dd766852ba..446c58b5467b 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -434,7 +434,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev) WARN_ON(!dev->block_cfg_access); dev->block_cfg_access = 0; - wake_up_all(&pci_cfg_wait); + wake_up_all_locked(&pci_cfg_wait); raw_spin_unlock_irqrestore(&pci_lock, flags); } EXPORT_SYMBOL_GPL(pci_cfg_access_unlock); -- cgit v1.2.3 From c4e4ac7e09fd9df7acd3bfd19d4abf2c4e858b06 Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Tue, 19 Jul 2011 14:03:41 +0100 Subject: latency-hist.patch This patch provides a recording mechanism to store data of potential sources of system latencies. The recordings separately determine the latency caused by a delayed timer expiration, by a delayed wakeup of the related user space program and by the sum of both. The histograms can be enabled and reset individually. The data are accessible via the debug filesystem. For details please consult Documentation/trace/histograms.txt. 
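As an illustration of the interface, a hypothetical userspace helper (not part of this patch) that enables the wakeup histogram and dumps CPU0's data through the debugfs files this patch creates; it assumes debugfs is mounted at /sys/kernel/debug:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *enable = "/sys/kernel/debug/tracing/latency_hist/enable/wakeup";
	const char *cpu0   = "/sys/kernel/debug/tracing/latency_hist/wakeup/CPU0";
	char line[128];
	FILE *f;

	f = fopen(enable, "w");
	if (!f)
		return EXIT_FAILURE;
	fputs("1\n", f);			/* any non-zero value enables the histogram */
	fclose(f);

	f = fopen(cpu0, "r");
	if (!f)
		return EXIT_FAILURE;
	while (fgets(line, sizeof(line), f))	/* "<usecs> <samples>" per line */
		fputs(line, stdout);
	fclose(f);

	return 0;
}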
Signed-off-by: Carsten Emde Signed-off-by: Thomas Gleixner --- Documentation/trace/histograms.txt | 186 ++++++ include/linux/hrtimer.h | 3 + include/linux/sched.h | 6 + include/trace/events/hist.h | 72 +++ include/trace/events/latency_hist.h | 29 + kernel/time/hrtimer.c | 21 + kernel/trace/Kconfig | 104 ++++ kernel/trace/Makefile | 4 + kernel/trace/latency_hist.c | 1178 +++++++++++++++++++++++++++++++++++ kernel/trace/trace_irqsoff.c | 11 + 10 files changed, 1614 insertions(+) create mode 100644 Documentation/trace/histograms.txt create mode 100644 include/trace/events/hist.h create mode 100644 include/trace/events/latency_hist.h create mode 100644 kernel/trace/latency_hist.c diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt new file mode 100644 index 000000000000..6f2aeabf7faa --- /dev/null +++ b/Documentation/trace/histograms.txt @@ -0,0 +1,186 @@ + Using the Linux Kernel Latency Histograms + + +This document gives a short explanation how to enable, configure and use +latency histograms. Latency histograms are primarily relevant in the +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT) +and are used in the quality management of the Linux real-time +capabilities. + + +* Purpose of latency histograms + +A latency histogram continuously accumulates the frequencies of latency +data. There are two types of histograms +- potential sources of latencies +- effective latencies + + +* Potential sources of latencies + +Potential sources of latencies are code segments where interrupts, +preemption or both are disabled (aka critical sections). To create +histograms of potential sources of latency, the kernel stores the time +stamp at the start of a critical section, determines the time elapsed +when the end of the section is reached, and increments the frequency +counter of that latency value - irrespective of whether any concurrently +running process is affected by latency or not. +- Configuration items (in the Kernel hacking/Tracers submenu) + CONFIG_INTERRUPT_OFF_LATENCY + CONFIG_PREEMPT_OFF_LATENCY + + +* Effective latencies + +Effective latencies are actually occuring during wakeup of a process. To +determine effective latencies, the kernel stores the time stamp when a +process is scheduled to be woken up, and determines the duration of the +wakeup time shortly before control is passed over to this process. Note +that the apparent latency in user space may be somewhat longer, since the +process may be interrupted after control is passed over to it but before +the execution in user space takes place. Simply measuring the interval +between enqueuing and wakeup may also not appropriate in cases when a +process is scheduled as a result of a timer expiration. The timer may have +missed its deadline, e.g. due to disabled interrupts, but this latency +would not be registered. Therefore, the offsets of missed timers are +recorded in a separate histogram. If both wakeup latency and missed timer +offsets are configured and enabled, a third histogram may be enabled that +records the overall latency as a sum of the timer latency, if any, and the +wakeup latency. This histogram is called "timerandwakeup". +- Configuration items (in the Kernel hacking/Tracers submenu) + CONFIG_WAKEUP_LATENCY + CONFIG_MISSED_TIMER_OFSETS + + +* Usage + +The interface to the administration of the latency histograms is located +in the debugfs file system. 
To mount it, either enter + +mount -t sysfs nodev /sys +mount -t debugfs nodev /sys/kernel/debug + +from shell command line level, or add + +nodev /sys sysfs defaults 0 0 +nodev /sys/kernel/debug debugfs defaults 0 0 + +to the file /etc/fstab. All latency histogram related files are then +available in the directory /sys/kernel/debug/tracing/latency_hist. A +particular histogram type is enabled by writing non-zero to the related +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory. +Select "preemptirqsoff" for the histograms of potential sources of +latencies and "wakeup" for histograms of effective latencies etc. The +histogram data - one per CPU - are available in the files + +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx + +The histograms are reset by writing non-zero to the file "reset" in a +particular latency directory. To reset all latency data, use + +#!/bin/sh + +TRACINGDIR=/sys/kernel/debug/tracing +HISTDIR=$TRACINGDIR/latency_hist + +if test -d $HISTDIR +then + cd $HISTDIR + for i in `find . | grep /reset$` + do + echo 1 >$i + done +fi + + +* Data format + +Latency data are stored with a resolution of one microsecond. The +maximum latency is 10,240 microseconds. The data are only valid, if the +overflow register is empty. Every output line contains the latency in +microseconds in the first row and the number of samples in the second +row. To display only lines with a positive latency count, use, for +example, + +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0 + +#Minimum latency: 0 microseconds. +#Average latency: 0 microseconds. +#Maximum latency: 25 microseconds. +#Total samples: 3104770694 +#There are 0 samples greater or equal than 10240 microseconds +#usecs samples + 0 2984486876 + 1 49843506 + 2 58219047 + 3 5348126 + 4 2187960 + 5 3388262 + 6 959289 + 7 208294 + 8 40420 + 9 4485 + 10 14918 + 11 18340 + 12 25052 + 13 19455 + 14 5602 + 15 969 + 16 47 + 17 18 + 18 14 + 19 1 + 20 3 + 21 2 + 22 5 + 23 2 + 25 1 + + +* Wakeup latency of a selected process + +To only collect wakeup latency data of a particular process, write the +PID of the requested process to + +/sys/kernel/debug/tracing/latency_hist/wakeup/pid + +PIDs are not considered, if this variable is set to 0. + + +* Details of the process with the highest wakeup latency so far + +Selected data of the process that suffered from the highest wakeup +latency that occurred in a particular CPU are available in the file + +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx. + +In addition, other relevant system data at the time when the +latency occurred are given. + +The format of the data is (all in one line): + () \ +<- + +The value of is only relevant in the combined timer +and wakeup latency recording. In the wakeup recording, it is +always 0, in the missed_timer_offsets recording, it is the same +as . + +When retrospectively searching for the origin of a latency and +tracing was not enabled, it may be helpful to know the name and +some basic data of the task that (finally) was switching to the +late real-tlme task. 
In addition to the victim's data, also the +data of the possible culprit are therefore displayed after the +"<-" symbol. + +Finally, the timestamp of the time when the latency occurred +in . after the most recent system boot +is provided. + +These data are also reset when the wakeup histogram is reset. diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a036d058a249..bd6483d99678 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -111,6 +111,9 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + ktime_t praecox; +#endif #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; diff --git a/include/linux/sched.h b/include/linux/sched.h index ccbf3690c058..916d161f992f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1651,6 +1651,12 @@ struct task_struct { unsigned long trace; /* bitmask and counter of trace recursion */ unsigned long trace_recursion; +#ifdef CONFIG_WAKEUP_LATENCY_HIST + u64 preempt_timestamp_hist; +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + long timer_offset; +#endif +#endif #endif /* CONFIG_TRACING */ #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ unsigned int memcg_kmem_skip_account; diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h new file mode 100644 index 000000000000..6122e4286177 --- /dev/null +++ b/include/trace/events/hist.h @@ -0,0 +1,72 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hist + +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HIST_H + +#include "latency_hist.h" +#include + +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST) +#define trace_preemptirqsoff_hist(a, b) +#else +TRACE_EVENT(preemptirqsoff_hist, + + TP_PROTO(int reason, int starthist), + + TP_ARGS(reason, starthist), + + TP_STRUCT__entry( + __field(int, reason) + __field(int, starthist) + ), + + TP_fast_assign( + __entry->reason = reason; + __entry->starthist = starthist; + ), + + TP_printk("reason=%s starthist=%s", getaction(__entry->reason), + __entry->starthist ? "start" : "stop") +); +#endif + +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST +#define trace_hrtimer_interrupt(a, b, c, d) +#else +TRACE_EVENT(hrtimer_interrupt, + + TP_PROTO(int cpu, long long offset, struct task_struct *curr, + struct task_struct *task), + + TP_ARGS(cpu, offset, curr, task), + + TP_STRUCT__entry( + __field(int, cpu) + __field(long long, offset) + __array(char, ccomm, TASK_COMM_LEN) + __field(int, cprio) + __array(char, tcomm, TASK_COMM_LEN) + __field(int, tprio) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->offset = offset; + memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN); + __entry->cprio = curr->prio; + memcpy(__entry->tcomm, task != NULL ? task->comm : "", + task != NULL ? TASK_COMM_LEN : 7); + __entry->tprio = task != NULL ? 
task->prio : -1; + ), + + TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]", + __entry->cpu, __entry->offset, __entry->ccomm, + __entry->cprio, __entry->tcomm, __entry->tprio) +); +#endif + +#endif /* _TRACE_HIST_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h new file mode 100644 index 000000000000..d3f2fbd560b1 --- /dev/null +++ b/include/trace/events/latency_hist.h @@ -0,0 +1,29 @@ +#ifndef _LATENCY_HIST_H +#define _LATENCY_HIST_H + +enum hist_action { + IRQS_ON, + PREEMPT_ON, + TRACE_STOP, + IRQS_OFF, + PREEMPT_OFF, + TRACE_START, +}; + +static char *actions[] = { + "IRQS_ON", + "PREEMPT_ON", + "TRACE_STOP", + "IRQS_OFF", + "PREEMPT_OFF", + "TRACE_START", +}; + +static inline char *getaction(int action) +{ + if (action >= 0 && action <= sizeof(actions)/sizeof(actions[0])) + return actions[action]; + return "unknown"; +} + +#endif /* _LATENCY_HIST_H */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d8c724cda37b..275533901e09 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -53,6 +53,7 @@ #include #include +#include #include "timekeeping.h" @@ -949,7 +950,16 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); timer_stats_hrtimer_set_start_info(timer); +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + { + ktime_t now = new_base->get_time(); + if (ktime_to_ns(tim) < ktime_to_ns(now)) + timer->praecox = now; + else + timer->praecox = ktime_set(0, 0); + } +#endif leftmost = enqueue_hrtimer(timer, new_base); if (!leftmost) { @@ -1236,6 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) #ifdef CONFIG_HIGH_RES_TIMERS +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer); + /* * High resolution timer interrupt * Called with interrupts disabled @@ -1279,6 +1291,15 @@ retry: timer = container_of(node, struct hrtimer, node); + trace_hrtimer_interrupt(raw_smp_processor_id(), + ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ? + timer->praecox : hrtimer_get_expires(timer), + basenow)), + current, + timer->function == hrtimer_wakeup ? + container_of(timer, struct hrtimer_sleeper, + timer)->task : NULL); + /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5da09c899dd..20505a806a2b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -187,6 +187,24 @@ config IRQSOFF_TRACER enabled. This option and the preempt-off timing option can be used together or separately.) +config INTERRUPT_OFF_HIST + bool "Interrupts-off Latency Histogram" + depends on IRQSOFF_TRACER + help + This option generates continuously updated histograms (one per cpu) + of the duration of time periods with interrupts disabled. The + histograms are disabled by default. To enable them, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff + + If PREEMPT_OFF_HIST is also selected, additional histograms (one + per cpu) are generated that accumulate the duration of time periods + when both interrupts and preemption are disabled. The histogram data + will be located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/irqsoff + config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n @@ -211,6 +229,24 @@ config PREEMPT_TRACER enabled. 
This option and the irqs-off timing option can be used together or separately.) +config PREEMPT_OFF_HIST + bool "Preemption-off Latency Histogram" + depends on PREEMPT_TRACER + help + This option generates continuously updated histograms (one per cpu) + of the duration of time periods with preemption disabled. The + histograms are disabled by default. To enable them, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff + + If INTERRUPT_OFF_HIST is also selected, additional histograms (one + per cpu) are generated that accumulate the duration of time periods + when both interrupts and preemption are disabled. The histogram data + will be located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/preemptoff + config SCHED_TRACER bool "Scheduling Latency Tracer" select GENERIC_TRACER @@ -221,6 +257,74 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. +config WAKEUP_LATENCY_HIST + bool "Scheduling Latency Histogram" + depends on SCHED_TRACER + help + This option generates continuously updated histograms (one per cpu) + of the scheduling latency of the highest priority task. + The histograms are disabled by default. To enable them, write a + non-zero number to + + /sys/kernel/debug/tracing/latency_hist/enable/wakeup + + Two different algorithms are used, one to determine the latency of + processes that exclusively use the highest priority of the system and + another one to determine the latency of processes that share the + highest system priority with other processes. The former is used to + improve hardware and system software, the latter to optimize the + priority design of a given system. The histogram data will be + located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/wakeup + + and + + /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio + + If both Scheduling Latency Histogram and Missed Timer Offsets + Histogram are selected, additional histogram data will be collected + that contain, in addition to the wakeup latency, the timer latency, in + case the wakeup was triggered by an expired timer. These histograms + are available in the + + /sys/kernel/debug/tracing/latency_hist/timerandwakeup + + directory. They reflect the apparent interrupt and scheduling latency + and are best suitable to determine the worst-case latency of a given + system. To enable these histograms, write a non-zero number to + + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup + +config MISSED_TIMER_OFFSETS_HIST + depends on HIGH_RES_TIMERS + select GENERIC_TRACER + bool "Missed Timer Offsets Histogram" + help + Generate a histogram of missed timer offsets in microseconds. The + histograms are disabled by default. To enable them, write a non-zero + number to + + /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets + + The histogram data will be located in the debug file system at + + /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets + + If both Scheduling Latency Histogram and Missed Timer Offsets + Histogram are selected, additional histogram data will be collected + that contain, in addition to the wakeup latency, the timer latency, in + case the wakeup was triggered by an expired timer. These histograms + are available in the + + /sys/kernel/debug/tracing/latency_hist/timerandwakeup + + directory. 
They reflect the apparent interrupt and scheduling latency + and are best suitable to determine the worst-case latency of a given + system. To enable these histograms, write a non-zero number to + + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup + config ENABLE_DEFAULT_TRACERS bool "Trace process context switches and events" depends on !GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 67d6369ddf83..b896e048e96d 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -36,6 +36,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c new file mode 100644 index 000000000000..66a69eb5329c --- /dev/null +++ b/kernel/trace/latency_hist.c @@ -0,0 +1,1178 @@ +/* + * kernel/trace/latency_hist.c + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + * Converted to work with the new latency tracer. + * Copyright (C) 2008 Red Hat, Inc. + * Steven Rostedt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include + +#define NSECS_PER_USECS 1000L + +#define CREATE_TRACE_POINTS +#include + +enum { + IRQSOFF_LATENCY = 0, + PREEMPTOFF_LATENCY, + PREEMPTIRQSOFF_LATENCY, + WAKEUP_LATENCY, + WAKEUP_LATENCY_SHAREDPRIO, + MISSED_TIMER_OFFSETS, + TIMERANDWAKEUP_LATENCY, + MAX_LATENCY_TYPE, +}; + +#define MAX_ENTRY_NUM 10240 + +struct hist_data { + atomic_t hist_mode; /* 0 log, 1 don't log */ + long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */ + long min_lat; + long max_lat; + unsigned long long below_hist_bound_samples; + unsigned long long above_hist_bound_samples; + long long accumulate_lat; + unsigned long long total_samples; + unsigned long long hist_array[MAX_ENTRY_NUM]; +}; + +struct enable_data { + int latency_type; + int enabled; +}; + +static char *latency_hist_dir_root = "latency_hist"; + +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist); +static char *irqsoff_hist_dir = "irqsoff"; +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start); +static DEFINE_PER_CPU(int, hist_irqsoff_counting); +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist); +static char *preemptoff_hist_dir = "preemptoff"; +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start); +static DEFINE_PER_CPU(int, hist_preemptoff_counting); +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist); +static char *preemptirqsoff_hist_dir = "preemptirqsoff"; +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start); +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting); +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST) +static notrace void 
probe_preemptirqsoff_hist(void *v, int reason, int start); +static struct enable_data preemptirqsoff_enabled_data = { + .latency_type = PREEMPTIRQSOFF_LATENCY, + .enabled = 0, +}; +#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +struct maxlatproc_data { + char comm[FIELD_SIZEOF(struct task_struct, comm)]; + char current_comm[FIELD_SIZEOF(struct task_struct, comm)]; + int pid; + int current_pid; + int prio; + int current_prio; + long latency; + long timeroffset; + cycle_t timestamp; +}; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist); +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio); +static char *wakeup_latency_hist_dir = "wakeup"; +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio"; +static notrace void probe_wakeup_latency_hist_start(void *v, + struct task_struct *p, int success); +static notrace void probe_wakeup_latency_hist_stop(void *v, + struct task_struct *prev, struct task_struct *next); +static notrace void probe_sched_migrate_task(void *, + struct task_struct *task, int cpu); +static struct enable_data wakeup_latency_enabled_data = { + .latency_type = WAKEUP_LATENCY, + .enabled = 0, +}; +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc); +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio); +static DEFINE_PER_CPU(struct task_struct *, wakeup_task); +static DEFINE_PER_CPU(int, wakeup_sharedprio); +static unsigned long wakeup_pid; +#endif + +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets); +static char *missed_timer_offsets_dir = "missed_timer_offsets"; +static notrace void probe_hrtimer_interrupt(void *v, int cpu, + long long offset, struct task_struct *curr, struct task_struct *task); +static struct enable_data missed_timer_offsets_enabled_data = { + .latency_type = MISSED_TIMER_OFFSETS, + .enabled = 0, +}; +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc); +static unsigned long missed_timer_offsets_pid; +#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist); +static char *timerandwakeup_latency_hist_dir = "timerandwakeup"; +static struct enable_data timerandwakeup_enabled_data = { + .latency_type = TIMERANDWAKEUP_LATENCY, + .enabled = 0, +}; +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc); +#endif + +void notrace latency_hist(int latency_type, int cpu, long latency, + long timeroffset, cycle_t stop, + struct task_struct *p) +{ + struct hist_data *my_hist; +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + struct maxlatproc_data *mp = NULL; +#endif + + if (!cpu_possible(cpu) || latency_type < 0 || + latency_type >= MAX_LATENCY_TYPE) + return; + + switch (latency_type) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + case IRQSOFF_LATENCY: + my_hist = &per_cpu(irqsoff_hist, cpu); + break; +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPTOFF_LATENCY: + my_hist = &per_cpu(preemptoff_hist, cpu); + break; +#endif +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + my_hist = &per_cpu(preemptirqsoff_hist, cpu); + break; +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + my_hist = &per_cpu(wakeup_latency_hist, cpu); + mp = &per_cpu(wakeup_maxlatproc, cpu); + break; + case 
WAKEUP_LATENCY_SHAREDPRIO: + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu); + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu); + break; +#endif +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + my_hist = &per_cpu(missed_timer_offsets, cpu); + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu); + break; +#endif +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + case TIMERANDWAKEUP_LATENCY: + my_hist = &per_cpu(timerandwakeup_latency_hist, cpu); + mp = &per_cpu(timerandwakeup_maxlatproc, cpu); + break; +#endif + + default: + return; + } + + latency += my_hist->offset; + + if (atomic_read(&my_hist->hist_mode) == 0) + return; + + if (latency < 0 || latency >= MAX_ENTRY_NUM) { + if (latency < 0) + my_hist->below_hist_bound_samples++; + else + my_hist->above_hist_bound_samples++; + } else + my_hist->hist_array[latency]++; + + if (unlikely(latency > my_hist->max_lat || + my_hist->min_lat == LONG_MAX)) { +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + if (latency_type == WAKEUP_LATENCY || + latency_type == WAKEUP_LATENCY_SHAREDPRIO || + latency_type == MISSED_TIMER_OFFSETS || + latency_type == TIMERANDWAKEUP_LATENCY) { + strncpy(mp->comm, p->comm, sizeof(mp->comm)); + strncpy(mp->current_comm, current->comm, + sizeof(mp->current_comm)); + mp->pid = task_pid_nr(p); + mp->current_pid = task_pid_nr(current); + mp->prio = p->prio; + mp->current_prio = current->prio; + mp->latency = latency; + mp->timeroffset = timeroffset; + mp->timestamp = stop; + } +#endif + my_hist->max_lat = latency; + } + if (unlikely(latency < my_hist->min_lat)) + my_hist->min_lat = latency; + my_hist->total_samples++; + my_hist->accumulate_lat += latency; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + loff_t *index_ptr = NULL; + loff_t index = *pos; + struct hist_data *my_hist = m->private; + + if (index == 0) { + char minstr[32], avgstr[32], maxstr[32]; + + atomic_dec(&my_hist->hist_mode); + + if (likely(my_hist->total_samples)) { + long avg = (long) div64_s64(my_hist->accumulate_lat, + my_hist->total_samples); + snprintf(minstr, sizeof(minstr), "%ld", + my_hist->min_lat - my_hist->offset); + snprintf(avgstr, sizeof(avgstr), "%ld", + avg - my_hist->offset); + snprintf(maxstr, sizeof(maxstr), "%ld", + my_hist->max_lat - my_hist->offset); + } else { + strcpy(minstr, ""); + strcpy(avgstr, minstr); + strcpy(maxstr, minstr); + } + + seq_printf(m, "#Minimum latency: %s microseconds\n" + "#Average latency: %s microseconds\n" + "#Maximum latency: %s microseconds\n" + "#Total samples: %llu\n" + "#There are %llu samples lower than %ld" + " microseconds.\n" + "#There are %llu samples greater or equal" + " than %ld microseconds.\n" + "#usecs\t%16s\n", + minstr, avgstr, maxstr, + my_hist->total_samples, + my_hist->below_hist_bound_samples, + -my_hist->offset, + my_hist->above_hist_bound_samples, + MAX_ENTRY_NUM - my_hist->offset, + "samples"); + } + if (index < MAX_ENTRY_NUM) { + index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL); + if (index_ptr) + *index_ptr = index; + } + + return index_ptr; +} + +static void *l_next(struct seq_file *m, void *p, loff_t *pos) +{ + loff_t *index_ptr = p; + struct hist_data *my_hist = m->private; + + if (++*pos >= MAX_ENTRY_NUM) { + atomic_inc(&my_hist->hist_mode); + return NULL; + } + *index_ptr = *pos; + return index_ptr; +} + +static void l_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static int l_show(struct seq_file *m, void *p) +{ + int index = *(loff_t *) p; 
+ struct hist_data *my_hist = m->private; + + seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset, + my_hist->hist_array[index]); + return 0; +} + +static const struct seq_operations latency_hist_seq_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static int latency_hist_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &latency_hist_seq_op); + if (!ret) { + struct seq_file *seq = file->private_data; + seq->private = inode->i_private; + } + return ret; +} + +static const struct file_operations latency_hist_fops = { + .open = latency_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static void clear_maxlatprocdata(struct maxlatproc_data *mp) +{ + mp->comm[0] = mp->current_comm[0] = '\0'; + mp->prio = mp->current_prio = mp->pid = mp->current_pid = + mp->latency = mp->timeroffset = -1; + mp->timestamp = 0; +} +#endif + +static void hist_reset(struct hist_data *hist) +{ + atomic_dec(&hist->hist_mode); + + memset(hist->hist_array, 0, sizeof(hist->hist_array)); + hist->below_hist_bound_samples = 0ULL; + hist->above_hist_bound_samples = 0ULL; + hist->min_lat = LONG_MAX; + hist->max_lat = LONG_MIN; + hist->total_samples = 0ULL; + hist->accumulate_lat = 0LL; + + atomic_inc(&hist->hist_mode); +} + +static ssize_t +latency_hist_reset(struct file *file, const char __user *a, + size_t size, loff_t *off) +{ + int cpu; + struct hist_data *hist = NULL; +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + struct maxlatproc_data *mp = NULL; +#endif + off_t latency_type = (off_t) file->private_data; + + for_each_online_cpu(cpu) { + + switch (latency_type) { +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPTOFF_LATENCY: + hist = &per_cpu(preemptoff_hist, cpu); + break; +#endif +#ifdef CONFIG_INTERRUPT_OFF_HIST + case IRQSOFF_LATENCY: + hist = &per_cpu(irqsoff_hist, cpu); + break; +#endif +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + hist = &per_cpu(preemptirqsoff_hist, cpu); + break; +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + hist = &per_cpu(wakeup_latency_hist, cpu); + mp = &per_cpu(wakeup_maxlatproc, cpu); + break; + case WAKEUP_LATENCY_SHAREDPRIO: + hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu); + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu); + break; +#endif +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + hist = &per_cpu(missed_timer_offsets, cpu); + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu); + break; +#endif +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + case TIMERANDWAKEUP_LATENCY: + hist = &per_cpu(timerandwakeup_latency_hist, cpu); + mp = &per_cpu(timerandwakeup_maxlatproc, cpu); + break; +#endif + } + + hist_reset(hist); +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + if (latency_type == WAKEUP_LATENCY || + latency_type == WAKEUP_LATENCY_SHAREDPRIO || + latency_type == MISSED_TIMER_OFFSETS || + latency_type == TIMERANDWAKEUP_LATENCY) + clear_maxlatprocdata(mp); +#endif + } + + return size; +} + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static ssize_t +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + unsigned long *this_pid = file->private_data; + 
+ r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t do_pid(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + unsigned long pid; + unsigned long *this_pid = file->private_data; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = '\0'; + + if (kstrtoul(buf, 10, &pid)) + return -EINVAL; + + *this_pid = pid; + + return cnt; +} +#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static ssize_t +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int r; + struct maxlatproc_data *mp = file->private_data; + int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8); + unsigned long long t; + unsigned long usecs, secs; + char *buf; + + if (mp->pid == -1 || mp->current_pid == -1) { + buf = "(none)\n"; + return simple_read_from_buffer(ubuf, cnt, ppos, buf, + strlen(buf)); + } + + buf = kmalloc(strmaxlen, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + t = ns2usecs(mp->timestamp); + usecs = do_div(t, USEC_PER_SEC); + secs = (unsigned long) t; + r = snprintf(buf, strmaxlen, + "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid, + MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm, + mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm, + secs, usecs); + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + kfree(buf); + return r; +} +#endif + +static ssize_t +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + char buf[64]; + struct enable_data *ed = file->private_data; + int r; + + r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + char buf[64]; + long enable; + struct enable_data *ed = file->private_data; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (kstrtoul(buf, 10, &enable)) + return -EINVAL; + + if ((enable && ed->enabled) || (!enable && !ed->enabled)) + return cnt; + + if (enable) { + int ret; + + switch (ed->latency_type) { +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + ret = register_trace_preemptirqsoff_hist( + probe_preemptirqsoff_hist, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_preemptirqsoff_hist " + "to trace_preemptirqsoff_hist\n"); + return ret; + } + break; +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + ret = register_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_wakeup_latency_hist_start " + "to trace_sched_wakeup\n"); + return ret; + } + ret = register_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_wakeup_latency_hist_start " + "to trace_sched_wakeup_new\n"); + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + return ret; + } + ret = register_trace_sched_switch( + probe_wakeup_latency_hist_stop, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_wakeup_latency_hist_stop " + "to trace_sched_switch\n"); + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + 
unregister_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start, NULL); + return ret; + } + ret = register_trace_sched_migrate_task( + probe_sched_migrate_task, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_sched_migrate_task " + "to trace_sched_migrate_task\n"); + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_switch( + probe_wakeup_latency_hist_stop, NULL); + return ret; + } + break; +#endif +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + ret = register_trace_hrtimer_interrupt( + probe_hrtimer_interrupt, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't assign " + "probe_hrtimer_interrupt " + "to trace_hrtimer_interrupt\n"); + return ret; + } + break; +#endif +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + case TIMERANDWAKEUP_LATENCY: + if (!wakeup_latency_enabled_data.enabled || + !missed_timer_offsets_enabled_data.enabled) + return -EINVAL; + break; +#endif + default: + break; + } + } else { + switch (ed->latency_type) { +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPTIRQSOFF_LATENCY: + { + int cpu; + + unregister_trace_preemptirqsoff_hist( + probe_preemptirqsoff_hist, NULL); + for_each_online_cpu(cpu) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + per_cpu(hist_irqsoff_counting, + cpu) = 0; +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST + per_cpu(hist_preemptoff_counting, + cpu) = 0; +#endif +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + per_cpu(hist_preemptirqsoff_counting, + cpu) = 0; +#endif + } + } + break; +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + { + int cpu; + + unregister_trace_sched_wakeup( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_wakeup_new( + probe_wakeup_latency_hist_start, NULL); + unregister_trace_sched_switch( + probe_wakeup_latency_hist_stop, NULL); + unregister_trace_sched_migrate_task( + probe_sched_migrate_task, NULL); + + for_each_online_cpu(cpu) { + per_cpu(wakeup_task, cpu) = NULL; + per_cpu(wakeup_sharedprio, cpu) = 0; + } + } +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + timerandwakeup_enabled_data.enabled = 0; +#endif + break; +#endif +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + case MISSED_TIMER_OFFSETS: + unregister_trace_hrtimer_interrupt( + probe_hrtimer_interrupt, NULL); +#ifdef CONFIG_WAKEUP_LATENCY_HIST + timerandwakeup_enabled_data.enabled = 0; +#endif + break; +#endif + default: + break; + } + } + ed->enabled = enable; + return cnt; +} + +static const struct file_operations latency_hist_reset_fops = { + .open = tracing_open_generic, + .write = latency_hist_reset, +}; + +static const struct file_operations enable_fops = { + .open = tracing_open_generic, + .read = show_enable, + .write = do_enable, +}; + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) +static const struct file_operations pid_fops = { + .open = tracing_open_generic, + .read = show_pid, + .write = do_pid, +}; + +static const struct file_operations maxlatproc_fops = { + .open = tracing_open_generic, + .read = show_maxlatproc, +}; +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) +static notrace void probe_preemptirqsoff_hist(void *v, int reason, + int starthist) +{ + int cpu = raw_smp_processor_id(); + int time_set = 0; + + if (starthist) { + cycle_t uninitialized_var(start); + + if 
(!preempt_count() && !irqs_disabled()) + return; + +#ifdef CONFIG_INTERRUPT_OFF_HIST + if ((reason == IRQS_OFF || reason == TRACE_START) && + !per_cpu(hist_irqsoff_counting, cpu)) { + per_cpu(hist_irqsoff_counting, cpu) = 1; + start = ftrace_now(cpu); + time_set++; + per_cpu(hist_irqsoff_start, cpu) = start; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + if ((reason == PREEMPT_OFF || reason == TRACE_START) && + !per_cpu(hist_preemptoff_counting, cpu)) { + per_cpu(hist_preemptoff_counting, cpu) = 1; + if (!(time_set++)) + start = ftrace_now(cpu); + per_cpu(hist_preemptoff_start, cpu) = start; + } +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + if (per_cpu(hist_irqsoff_counting, cpu) && + per_cpu(hist_preemptoff_counting, cpu) && + !per_cpu(hist_preemptirqsoff_counting, cpu)) { + per_cpu(hist_preemptirqsoff_counting, cpu) = 1; + if (!time_set) + start = ftrace_now(cpu); + per_cpu(hist_preemptirqsoff_start, cpu) = start; + } +#endif + } else { + cycle_t uninitialized_var(stop); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + if ((reason == IRQS_ON || reason == TRACE_STOP) && + per_cpu(hist_irqsoff_counting, cpu)) { + cycle_t start = per_cpu(hist_irqsoff_start, cpu); + + stop = ftrace_now(cpu); + time_set++; + if (start) { + long latency = ((long) (stop - start)) / + NSECS_PER_USECS; + + latency_hist(IRQSOFF_LATENCY, cpu, latency, 0, + stop, NULL); + } + per_cpu(hist_irqsoff_counting, cpu) = 0; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + if ((reason == PREEMPT_ON || reason == TRACE_STOP) && + per_cpu(hist_preemptoff_counting, cpu)) { + cycle_t start = per_cpu(hist_preemptoff_start, cpu); + + if (!(time_set++)) + stop = ftrace_now(cpu); + if (start) { + long latency = ((long) (stop - start)) / + NSECS_PER_USECS; + + latency_hist(PREEMPTOFF_LATENCY, cpu, latency, + 0, stop, NULL); + } + per_cpu(hist_preemptoff_counting, cpu) = 0; + } +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + if ((!per_cpu(hist_irqsoff_counting, cpu) || + !per_cpu(hist_preemptoff_counting, cpu)) && + per_cpu(hist_preemptirqsoff_counting, cpu)) { + cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu); + + if (!time_set) + stop = ftrace_now(cpu); + if (start) { + long latency = ((long) (stop - start)) / + NSECS_PER_USECS; + + latency_hist(PREEMPTIRQSOFF_LATENCY, cpu, + latency, 0, stop, NULL); + } + per_cpu(hist_preemptirqsoff_counting, cpu) = 0; + } +#endif + } +} +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_RAW_SPINLOCK(wakeup_lock); +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task, + int cpu) +{ + int old_cpu = task_cpu(task); + + if (cpu != old_cpu) { + unsigned long flags; + struct task_struct *cpu_wakeup_task; + + raw_spin_lock_irqsave(&wakeup_lock, flags); + + cpu_wakeup_task = per_cpu(wakeup_task, old_cpu); + if (task == cpu_wakeup_task) { + put_task_struct(cpu_wakeup_task); + per_cpu(wakeup_task, old_cpu) = NULL; + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task; + get_task_struct(cpu_wakeup_task); + } + + raw_spin_unlock_irqrestore(&wakeup_lock, flags); + } +} + +static notrace void probe_wakeup_latency_hist_start(void *v, + struct task_struct *p, int success) +{ + unsigned long flags; + struct task_struct *curr = current; + int cpu = task_cpu(p); + struct task_struct *cpu_wakeup_task; + + raw_spin_lock_irqsave(&wakeup_lock, flags); + + cpu_wakeup_task = per_cpu(wakeup_task, cpu); + + if (wakeup_pid) { + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) || + p->prio == curr->prio) + 
per_cpu(wakeup_sharedprio, cpu) = 1; + if (likely(wakeup_pid != task_pid_nr(p))) + goto out; + } else { + if (likely(!rt_task(p)) || + (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) || + p->prio > curr->prio) + goto out; + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) || + p->prio == curr->prio) + per_cpu(wakeup_sharedprio, cpu) = 1; + } + + if (cpu_wakeup_task) + put_task_struct(cpu_wakeup_task); + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p; + get_task_struct(cpu_wakeup_task); + cpu_wakeup_task->preempt_timestamp_hist = + ftrace_now(raw_smp_processor_id()); +out: + raw_spin_unlock_irqrestore(&wakeup_lock, flags); +} + +static notrace void probe_wakeup_latency_hist_stop(void *v, + struct task_struct *prev, struct task_struct *next) +{ + unsigned long flags; + int cpu = task_cpu(next); + long latency; + cycle_t stop; + struct task_struct *cpu_wakeup_task; + + raw_spin_lock_irqsave(&wakeup_lock, flags); + + cpu_wakeup_task = per_cpu(wakeup_task, cpu); + + if (cpu_wakeup_task == NULL) + goto out; + + /* Already running? */ + if (unlikely(current == cpu_wakeup_task)) + goto out_reset; + + if (next != cpu_wakeup_task) { + if (next->prio < cpu_wakeup_task->prio) + goto out_reset; + + if (next->prio == cpu_wakeup_task->prio) + per_cpu(wakeup_sharedprio, cpu) = 1; + + goto out; + } + + if (current->prio == cpu_wakeup_task->prio) + per_cpu(wakeup_sharedprio, cpu) = 1; + + /* + * The task we are waiting for is about to be switched to. + * Calculate latency and store it in histogram. + */ + stop = ftrace_now(raw_smp_processor_id()); + + latency = ((long) (stop - next->preempt_timestamp_hist)) / + NSECS_PER_USECS; + + if (per_cpu(wakeup_sharedprio, cpu)) { + latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop, + next); + per_cpu(wakeup_sharedprio, cpu) = 0; + } else { + latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next); +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + if (timerandwakeup_enabled_data.enabled) { + latency_hist(TIMERANDWAKEUP_LATENCY, cpu, + next->timer_offset + latency, next->timer_offset, + stop, next); + } +#endif + } + +out_reset: +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + next->timer_offset = 0; +#endif + put_task_struct(cpu_wakeup_task); + per_cpu(wakeup_task, cpu) = NULL; +out: + raw_spin_unlock_irqrestore(&wakeup_lock, flags); +} +#endif + +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST +static notrace void probe_hrtimer_interrupt(void *v, int cpu, + long long latency_ns, struct task_struct *curr, + struct task_struct *task) +{ + if (latency_ns <= 0 && task != NULL && rt_task(task) && + (task->prio < curr->prio || + (task->prio == curr->prio && + !cpumask_test_cpu(cpu, &task->cpus_allowed)))) { + long latency; + cycle_t now; + + if (missed_timer_offsets_pid) { + if (likely(missed_timer_offsets_pid != + task_pid_nr(task))) + return; + } + + now = ftrace_now(cpu); + latency = (long) div_s64(-latency_ns, NSECS_PER_USECS); + latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now, + task); +#ifdef CONFIG_WAKEUP_LATENCY_HIST + task->timer_offset = latency; +#endif + } +} +#endif + +static __init int latency_hist_init(void) +{ + struct dentry *latency_hist_root = NULL; + struct dentry *dentry; +#ifdef CONFIG_WAKEUP_LATENCY_HIST + struct dentry *dentry_sharedprio; +#endif + struct dentry *entry; + struct dentry *enable_root; + int i = 0; + struct hist_data *my_hist; + char name[64]; + char *cpufmt = "CPU%d"; +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + char *cpufmt_maxlatproc = "max_latency-CPU%d"; + struct 
maxlatproc_data *mp = NULL; +#endif + + dentry = tracing_init_dentry(); + latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry); + enable_root = debugfs_create_dir("enable", latency_hist_root); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(irqsoff_hist, i), &latency_hist_fops); + my_hist = &per_cpu(irqsoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops); +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + dentry = debugfs_create_dir(preemptoff_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preemptoff_hist, i), &latency_hist_fops); + my_hist = &per_cpu(preemptoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops); +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + dentry = debugfs_create_dir(preemptirqsoff_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops); + my_hist = &per_cpu(preemptirqsoff_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops); +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) + entry = debugfs_create_file("preemptirqsoff", 0644, + enable_root, (void *)&preemptirqsoff_enabled_data, + &enable_fops); +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + dentry = debugfs_create_dir(wakeup_latency_hist_dir, + latency_hist_root); + dentry_sharedprio = debugfs_create_dir( + wakeup_latency_hist_dir_sharedprio, dentry); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(wakeup_latency_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(wakeup_latency_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + + entry = debugfs_create_file(name, 0444, dentry_sharedprio, + &per_cpu(wakeup_latency_hist_sharedprio, i), + &latency_hist_fops); + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + + sprintf(name, cpufmt_maxlatproc, i); + + mp = &per_cpu(wakeup_maxlatproc, i); + entry = debugfs_create_file(name, 0444, dentry, mp, + &maxlatproc_fops); + clear_maxlatprocdata(mp); + + mp = &per_cpu(wakeup_maxlatproc_sharedprio, i); + entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp, + &maxlatproc_fops); + clear_maxlatprocdata(mp); + } + entry = debugfs_create_file("pid", 0644, dentry, + (void *)&wakeup_pid, &pid_fops); + entry = debugfs_create_file("reset", 0644, dentry, + (void *)WAKEUP_LATENCY, &latency_hist_reset_fops); + entry = debugfs_create_file("reset", 0644, dentry_sharedprio, + (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops); + entry = debugfs_create_file("wakeup", 0644, + enable_root, (void *)&wakeup_latency_enabled_data, + &enable_fops); +#endif + +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST 
+ dentry = debugfs_create_dir(missed_timer_offsets_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(missed_timer_offsets, i), &latency_hist_fops); + my_hist = &per_cpu(missed_timer_offsets, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + + sprintf(name, cpufmt_maxlatproc, i); + mp = &per_cpu(missed_timer_offsets_maxlatproc, i); + entry = debugfs_create_file(name, 0444, dentry, mp, + &maxlatproc_fops); + clear_maxlatprocdata(mp); + } + entry = debugfs_create_file("pid", 0644, dentry, + (void *)&missed_timer_offsets_pid, &pid_fops); + entry = debugfs_create_file("reset", 0644, dentry, + (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops); + entry = debugfs_create_file("missed_timer_offsets", 0644, + enable_root, (void *)&missed_timer_offsets_enabled_data, + &enable_fops); +#endif + +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \ + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) + dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + sprintf(name, cpufmt, i); + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(timerandwakeup_latency_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(timerandwakeup_latency_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = LONG_MAX; + + sprintf(name, cpufmt_maxlatproc, i); + mp = &per_cpu(timerandwakeup_maxlatproc, i); + entry = debugfs_create_file(name, 0444, dentry, mp, + &maxlatproc_fops); + clear_maxlatprocdata(mp); + } + entry = debugfs_create_file("reset", 0644, dentry, + (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops); + entry = debugfs_create_file("timerandwakeup", 0644, + enable_root, (void *)&timerandwakeup_enabled_data, + &enable_fops); +#endif + return 0; +} + +device_initcall(latency_hist_init); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9bb104f748d0..de4f1a812c41 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -17,6 +17,7 @@ #include #include "trace.h" +#include static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -435,11 +436,13 @@ void start_critical_timings(void) { if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(TRACE_START, 1); } EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { + trace_preemptirqsoff_hist(TRACE_STOP, 0); if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -449,6 +452,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_PROVE_LOCKING void time_hardirqs_on(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } @@ -457,6 +461,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) { if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } #else /* !CONFIG_PROVE_LOCKING */ @@ -482,6 +487,7 @@ inline void print_irqtrace_events(struct task_struct *curr) */ void trace_hardirqs_on(void) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -491,11 +497,13 @@ void trace_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } 
EXPORT_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } @@ -505,6 +513,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -514,12 +523,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(PREEMPT_ON, 0); if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(PREEMPT_ON, 1); if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); } -- cgit v1.2.3 From 0988db8a8f6db3bc283a5d5bb9f0a760b86e78ba Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Tue, 19 Jul 2011 13:53:12 +0100 Subject: hwlatdetect.patch Jon Masters developed this wonderful SMI detector. For details please consult Documentation/hwlat_detector.txt. It could be ported to Linux 3.0 RT without any major change. Signed-off-by: Carsten Emde --- Documentation/hwlat_detector.txt | 64 ++ drivers/misc/Kconfig | 29 + drivers/misc/Makefile | 1 + drivers/misc/hwlat_detector.c | 1212 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 1306 insertions(+) create mode 100644 Documentation/hwlat_detector.txt create mode 100644 drivers/misc/hwlat_detector.c diff --git a/Documentation/hwlat_detector.txt b/Documentation/hwlat_detector.txt new file mode 100644 index 000000000000..cb61516483d3 --- /dev/null +++ b/Documentation/hwlat_detector.txt @@ -0,0 +1,64 @@ +Introduction: +------------- + +The module hwlat_detector is a special purpose kernel module that is used to +detect large system latencies induced by the behavior of certain underlying +hardware or firmware, independent of Linux itself. The code was developed +originally to detect SMIs (System Management Interrupts) on x86 systems, +however there is nothing x86 specific about this patchset. It was +originally written for use by the "RT" patch since the Real Time +kernel is highly latency sensitive. + +SMIs are usually not serviced by the Linux kernel, which typically does not +even know that they are occurring. SMIs are instead set up by BIOS code +and are serviced by BIOS code, usually for "critical" events such as +management of thermal sensors and fans. Sometimes though, SMIs are used for +other tasks and those tasks can spend an inordinate amount of time in the +handler (sometimes measured in milliseconds). Obviously this is a problem if +you are trying to keep event service latencies down in the microsecond range. + +The hardware latency detector works by hogging all of the cpus for configurable +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter +for some period, then looking for gaps in the TSC data. Any gap indicates a +time when the polling was interrupted, and since the machine is stopped and +interrupts turned off the only thing that could do that would be an SMI. + +Note that the SMI detector should *NEVER* be used in a production environment. +It is intended to be run manually to determine if the hardware platform has a +problem with long system firmware service routines.
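In pseudocode, the sampling step described above boils down to roughly the following (a simplified sketch only; the real loop lives in get_sample() in drivers/misc/hwlat_detector.c, added later in this patch):

        start = ktime_get();
        do {
                t1 = ktime_get();       /* two back-to-back readings */
                t2 = ktime_get();
                diff = ktime_to_us(ktime_sub(t2, t1));
                if (diff > sample)
                        sample = diff;  /* keep the widest gap seen */
        } while (ktime_to_us(ktime_sub(t2, start)) <= sample_width);
        /* a "sample" above the threshold indicates an SMI-like interruption */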
+ +Usage: +------ + +Loading the module hwlat_detector passing the parameter "enabled=1" (or by +setting the "enable" entry in "hwlat_detector" debugfs toggled on) is the only +step required to start the hwlat_detector. It is possible to redefine the +threshold in microseconds (us) above which latency spikes will be taken +into account (parameter "threshold="). + +Example: + + # modprobe hwlat_detector enabled=1 threshold=100 + +After the module is loaded, it creates a directory named "hwlat_detector" under +the debugfs mountpoint, "/debug/hwlat_detector" for this text. It is necessary +to have debugfs mounted, which might be on /sys/debug on your system. + +The /debug/hwlat_detector interface contains the following files: + +count - number of latency spikes observed since last reset +enable - a global enable/disable toggle (0/1), resets count +max - maximum hardware latency actually observed (usecs) +sample - a pipe from which to read current raw sample data + in the format + (can be opened O_NONBLOCK for a single sample) +threshold - minimum latency value to be considered (usecs) +width - time period to sample with CPUs held (usecs) + must be less than the total window size (enforced) +window - total period of sampling, width being inside (usecs) + +By default we will set width to 500,000 and window to 1,000,000, meaning that +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we +observe any latencies that exceed the threshold (initially 100 usecs), +then we write to a global sample ring buffer of 8K samples, which is +consumed by reading from the "sample" (pipe) debugfs file interface. diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index a91797de3f90..d31416bfe325 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -121,6 +121,35 @@ config IBM_ASM for information on the specific driver level and support statement for your IBM server. +config HWLAT_DETECTOR + tristate "Testing module to detect hardware-induced latencies" + depends on DEBUG_FS + depends on RING_BUFFER + default m + ---help--- + A simple hardware latency detector. Use this module to detect + large latencies introduced by the behavior of the underlying + system firmware external to Linux. We do this using periodic + use of stop_machine to grab all available CPUs and measure + for unexplainable gaps in the CPU timestamp counter(s). By + default, the module is not enabled until the "enable" file + within the "hwlat_detector" debugfs directory is toggled. + + This module is often used to detect SMI (System Management + Interrupts) on x86 systems, though is not x86 specific. To + this end, we default to using a sample window of 1 second, + during which we will sample for 0.5 seconds. If an SMI or + similar event occurs during that time, it is recorded + into an 8K samples global ring buffer until retreived. + + WARNING: This software should never be enabled (it can be built + but should not be turned on after it is loaded) in a production + environment where high latencies are a concern since the + sampling mechanism actually introduces latencies for + regular tasks while the CPU(s) are being held. 
+ + If unsure, say N + config PHANTOM tristate "Sensable PHANToM (PCI)" depends on PCI diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 7d5c4cd118c4..6a8e39388cf9 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_C2PORT) += c2port/ obj-$(CONFIG_HMC6352) += hmc6352.o obj-y += eeprom/ obj-y += cb710/ +obj-$(CONFIG_HWLAT_DETECTOR) += hwlat_detector.o obj-$(CONFIG_SPEAR13XX_PCIE_GADGET) += spear13xx_pcie_gadget.o obj-$(CONFIG_VMWARE_BALLOON) += vmw_balloon.o obj-$(CONFIG_ARM_CHARLCD) += arm-charlcd.o diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c new file mode 100644 index 000000000000..6864f3c24c8b --- /dev/null +++ b/drivers/misc/hwlat_detector.c @@ -0,0 +1,1212 @@ +/* + * hwlat_detector.c - A simple Hardware Latency detector. + * + * Use this module to detect large system latencies induced by the behavior of + * certain underlying system hardware or firmware, independent of Linux itself. + * The code was developed originally to detect the presence of SMIs on Intel + * and AMD systems, although there is no dependency upon x86 herein. + * + * The classical example usage of this module is in detecting the presence of + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a + * somewhat special form of hardware interrupt spawned from earlier CPU debug + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge + * LPC (or other device) to generate a special interrupt under certain + * circumstances, for example, upon expiration of a special SMI timer device, + * due to certain external thermal readings, on certain I/O address accesses, + * and other situations. An SMI hits a special CPU pin, triggers a special + * SMI mode (complete with special memory map), and the OS is unaware. + * + * Although certain hardware-inducing latencies are necessary (for example, + * a modern system often requires an SMI handler for correct thermal control + * and remote management) they can wreak havoc upon any OS-level performance + * guarantees toward low-latency, especially when the OS is not even made + * aware of the presence of these interrupts. For this reason, we need a + * somewhat brute force mechanism to detect these interrupts. In this case, + * we do it by hogging all of the CPU(s) for configurable timer intervals, + * sampling the built-in CPU timer, looking for discontiguous readings. + * + * WARNING: This implementation necessarily introduces latencies. Therefore, + * you should NEVER use this module in a production environment + * requiring any kind of low-latency performance guarantee(s). + * + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. + * + * Includes useful feedback from Clark Williams + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUF_SIZE_DEFAULT 262144UL /* 8K*(sizeof(entry)) */ +#define BUF_FLAGS (RB_FL_OVERWRITE) /* no block on full */ +#define U64STR_SIZE 22 /* 20 digits max */ + +#define VERSION "1.0.0" +#define BANNER "hwlat_detector: " +#define DRVNAME "hwlat_detector" +#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ +#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ +#define DEFAULT_LAT_THRESHOLD 10 /* 10us */ + +/* Module metadata */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jon Masters "); +MODULE_DESCRIPTION("A simple hardware latency detector"); +MODULE_VERSION(VERSION); + +/* Module parameters */ + +static int debug; +static int enabled; +static int threshold; + +module_param(debug, int, 0); /* enable debug */ +module_param(enabled, int, 0); /* enable detector */ +module_param(threshold, int, 0); /* latency threshold */ + +/* Buffering and sampling */ + +static struct ring_buffer *ring_buffer; /* sample buffer */ +static DEFINE_MUTEX(ring_buffer_mutex); /* lock changes */ +static unsigned long buf_size = BUF_SIZE_DEFAULT; +static struct task_struct *kthread; /* sampling thread */ + +/* DebugFS filesystem entries */ + +static struct dentry *debug_dir; /* debugfs directory */ +static struct dentry *debug_max; /* maximum TSC delta */ +static struct dentry *debug_count; /* total detect count */ +static struct dentry *debug_sample_width; /* sample width us */ +static struct dentry *debug_sample_window; /* sample window us */ +static struct dentry *debug_sample; /* raw samples us */ +static struct dentry *debug_threshold; /* threshold us */ +static struct dentry *debug_enable; /* enable/disable */ + +/* Individual samples and global state */ + +struct sample; /* latency sample */ +struct data; /* Global state */ + +/* Sampling functions */ +static int __buffer_add_sample(struct sample *sample); +static struct sample *buffer_get_sample(struct sample *sample); +static int get_sample(void *unused); + +/* Threading and state */ +static int kthread_fn(void *unused); +static int start_kthread(void); +static int stop_kthread(void); +static void __reset_stats(void); +static int init_stats(void); + +/* Debugfs interface */ +static ssize_t simple_data_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos, const u64 *entry); +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, u64 *entry); +static int debug_sample_fopen(struct inode *inode, struct file *filp); +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); +static int debug_sample_release(struct inode *inode, struct file *filp); +static int debug_enable_fopen(struct inode *inode, struct file *filp); +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); +static ssize_t debug_enable_fwrite(struct file *file, + const char __user *user_buffer, + size_t user_size, loff_t *offset); + +/* Initialization functions */ +static int init_debugfs(void); +static void free_debugfs(void); +static int detector_init(void); +static void detector_exit(void); + +/* Individual latency samples are stored here when detected and packed into + * the ring_buffer circular buffer, where they are overwritten when + * more than buf_size/sizeof(sample) samples are received. 
*/ +struct sample { + u64 seqnum; /* unique sequence */ + u64 duration; /* ktime delta */ + struct timespec timestamp; /* wall time */ + unsigned long lost; +}; + +/* keep the global state somewhere. Mostly used under stop_machine. */ +static struct data { + + struct mutex lock; /* protect changes */ + + u64 count; /* total since reset */ + u64 max_sample; /* max hardware latency */ + u64 threshold; /* sample threshold level */ + + u64 sample_window; /* total sampling window (on+off) */ + u64 sample_width; /* active sampling portion of window */ + + atomic_t sample_open; /* whether the sample file is open */ + + wait_queue_head_t wq; /* waitqeue for new sample values */ + +} data; + +/** + * __buffer_add_sample - add a new latency sample recording to the ring buffer + * @sample: The new latency sample value + * + * This receives a new latency sample and records it in a global ring buffer. + * No additional locking is used in this case - suited for stop_machine use. + */ +static int __buffer_add_sample(struct sample *sample) +{ + return ring_buffer_write(ring_buffer, + sizeof(struct sample), sample); +} + +/** + * buffer_get_sample - remove a hardware latency sample from the ring buffer + * @sample: Pre-allocated storage for the sample + * + * This retrieves a hardware latency sample from the global circular buffer + */ +static struct sample *buffer_get_sample(struct sample *sample) +{ + struct ring_buffer_event *e = NULL; + struct sample *s = NULL; + unsigned int cpu = 0; + + if (!sample) + return NULL; + + mutex_lock(&ring_buffer_mutex); + for_each_online_cpu(cpu) { + e = ring_buffer_consume(ring_buffer, cpu, NULL, &sample->lost); + if (e) + break; + } + + if (e) { + s = ring_buffer_event_data(e); + memcpy(sample, s, sizeof(struct sample)); + } else + sample = NULL; + mutex_unlock(&ring_buffer_mutex); + + return sample; +} + +/** + * get_sample - sample the CPU TSC and look for likely hardware latencies + * @unused: This is not used but is a part of the stop_machine API + * + * Used to repeatedly capture the CPU TSC (or similar), looking for potential + * hardware-induced latency. Called under stop_machine, with data.lock held. + */ +static int get_sample(void *unused) +{ + ktime_t start, t1, t2; + s64 diff, total = 0; + u64 sample = 0; + int ret = 1; + + start = ktime_get(); /* start timestamp */ + + do { + + t1 = ktime_get(); /* we'll look for a discontinuity */ + t2 = ktime_get(); + + total = ktime_to_us(ktime_sub(t2, start)); /* sample width */ + diff = ktime_to_us(ktime_sub(t2, t1)); /* current diff */ + + /* This shouldn't happen */ + if (diff < 0) { + pr_err(BANNER "time running backwards\n"); + goto out; + } + + if (diff > sample) + sample = diff; /* only want highest value */ + + } while (total <= data.sample_width); + + /* If we exceed the threshold value, we have found a hardware latency */ + if (sample > data.threshold) { + struct sample s; + + data.count++; + s.seqnum = data.count; + s.duration = sample; + s.timestamp = CURRENT_TIME; + __buffer_add_sample(&s); + + /* Keep a running maximum ever recorded hardware latency */ + if (sample > data.max_sample) + data.max_sample = sample; + } + + ret = 0; +out: + return ret; +} + +/* + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread + * @unused: A required part of the kthread API. + * + * Used to periodically sample the CPU TSC via a call to get_sample. We + * use stop_machine, whith does (intentionally) introduce latency since we + * need to ensure nothing else might be running (and thus pre-empting). 
+ * Obviously this should never be used in production environments. + * + * stop_machine will schedule us typically only on CPU0 which is fine for + * almost every real-world hardware latency situation - but we might later + * generalize this if we find there are any actual systems with alternate + * SMI delivery or other non CPU0 hardware latencies. + */ +static int kthread_fn(void *unused) +{ + int err = 0; + u64 interval = 0; + + while (!kthread_should_stop()) { + + mutex_lock(&data.lock); + + err = stop_machine(get_sample, unused, 0); + if (err) { + /* Houston, we have a problem */ + mutex_unlock(&data.lock); + goto err_out; + } + + wake_up(&data.wq); /* wake up reader(s) */ + + interval = data.sample_window - data.sample_width; + do_div(interval, USEC_PER_MSEC); /* modifies interval value */ + + mutex_unlock(&data.lock); + + if (msleep_interruptible(interval)) + goto out; + } + goto out; +err_out: + pr_err(BANNER "could not call stop_machine, disabling\n"); + enabled = 0; +out: + return err; + +} + +/** + * start_kthread - Kick off the hardware latency sampling/detector kthread + * + * This starts a kernel thread that will sit and sample the CPU timestamp + * counter (TSC or similar) and look for potential hardware latencies. + */ +static int start_kthread(void) +{ + kthread = kthread_run(kthread_fn, NULL, + DRVNAME); + if (IS_ERR(kthread)) { + pr_err(BANNER "could not start sampling thread\n"); + enabled = 0; + return -ENOMEM; + } + + return 0; +} + +/** + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop + * + * This kicks the running hardware latency sampling/detector kernel thread and + * tells it to stop sampling now. Use this on unload and at system shutdown. + */ +static int stop_kthread(void) +{ + int ret; + + ret = kthread_stop(kthread); + + return ret; +} + +/** + * __reset_stats - Reset statistics for the hardware latency detector + * + * We use data to store various statistics and global state. We call this + * function in order to reset those when "enable" is toggled on or off, and + * also at initialization. Should be called with data.lock held. + */ +static void __reset_stats(void) +{ + data.count = 0; + data.max_sample = 0; + ring_buffer_reset(ring_buffer); /* flush out old sample entries */ +} + +/** + * init_stats - Setup global state statistics for the hardware latency detector + * + * We use data to store various statistics and global state. We also use + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware + * induced system latencies. This function initializes these structures and + * allocates the global ring buffer also.
+ */ +static int init_stats(void) +{ + int ret = -ENOMEM; + + mutex_init(&data.lock); + init_waitqueue_head(&data.wq); + atomic_set(&data.sample_open, 0); + + ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS); + + if (WARN(!ring_buffer, KERN_ERR BANNER + "failed to allocate ring buffer!\n")) + goto out; + + __reset_stats(); + data.threshold = DEFAULT_LAT_THRESHOLD; /* threshold us */ + data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */ + data.sample_width = DEFAULT_SAMPLE_WIDTH; /* width us */ + + ret = 0; + +out: + return ret; + +} + +/* + * simple_data_read - Wrapper read function for global state debugfs entries + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * @entry: The entry to read from + * + * This function provides a generic read implementation for the global state + * "data" structure debugfs filesystem entries. It would be nice to use + * simple_attr_read directly, but we need to make sure that the data.lock + * spinlock is held during the actual read (even though we likely won't ever + * actually race here as the updater runs under a stop_machine context). + */ +static ssize_t simple_data_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos, const u64 *entry) +{ + char buf[U64STR_SIZE]; + u64 val = 0; + int len = 0; + + memset(buf, 0, sizeof(buf)); + + if (!entry) + return -EFAULT; + + mutex_lock(&data.lock); + val = *entry; + mutex_unlock(&data.lock); + + len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); + +} + +/* + * simple_data_write - Wrapper write function for global state debugfs entries + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to write value from + * @cnt: The maximum number of bytes to write + * @ppos: The current "file" position + * @entry: The entry to write to + * + * This function provides a generic write implementation for the global state + * "data" structure debugfs filesystem entries. It would be nice to use + * simple_attr_write directly, but we need to make sure that the data.lock + * spinlock is held during the actual write (even though we likely won't ever + * actually race here as the updater runs under a stop_machine context). + */ +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, u64 *entry) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = kstrtoull(buf, 10, &val); + if (err) + return -EINVAL; + + mutex_lock(&data.lock); + *entry = val; + mutex_unlock(&data.lock); + + return csize; +} + +/** + * debug_count_fopen - Open function for "count" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "count" debugfs + * interface to the hardware latency detector. 
+ */ +static int debug_count_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_count_fread - Read function for "count" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "count" debugfs + * interface to the hardware latency detector. Can be used to read the + * number of latency readings exceeding the configured threshold since + * the detector was last reset (e.g. by writing a zero into "count"). + */ +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.count); +} + +/** + * debug_count_fwrite - Write function for "count" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "count" debugfs + * interface to the hardware latency detector. Can be used to write a + * desired value, especially to zero the total count. + */ +static ssize_t debug_count_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + return simple_data_write(filp, ubuf, cnt, ppos, &data.count); +} + +/** + * debug_enable_fopen - Dummy open function for "enable" debugfs interface + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "enable" debugfs + * interface to the hardware latency detector. + */ +static int debug_enable_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_enable_fread - Read function for "enable" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "enable" debugfs + * interface to the hardware latency detector. Can be used to determine + * whether the detector is currently enabled ("0\n" or "1\n" returned). + */ +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[4]; + + if ((cnt < sizeof(buf)) || (*ppos)) + return 0; + + buf[0] = enabled ? '1' : '0'; + buf[1] = '\n'; + buf[2] = '\0'; + if (copy_to_user(ubuf, buf, strlen(buf))) + return -EFAULT; + return *ppos = strlen(buf); +} + +/** + * debug_enable_fwrite - Write function for "enable" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "enable" debugfs + * interface to the hardware latency detector. Can be used to enable or + * disable the detector, which will have the side-effect of possibly + * also resetting the global stats and kicking off the measuring + * kthread (on an enable) or the converse (upon a disable). 
+ */ +static ssize_t debug_enable_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[4]; + int csize = min(cnt, sizeof(buf)); + long val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[sizeof(buf)-1] = '\0'; /* just in case */ + err = kstrtoul(buf, 10, &val); + if (0 != err) + return -EINVAL; + + if (val) { + if (enabled) + goto unlock; + enabled = 1; + __reset_stats(); + if (start_kthread()) + return -EFAULT; + } else { + if (!enabled) + goto unlock; + enabled = 0; + err = stop_kthread(); + if (err) { + pr_err(BANNER "cannot stop kthread\n"); + return -EFAULT; + } + wake_up(&data.wq); /* reader(s) should return */ + } +unlock: + return csize; +} + +/** + * debug_max_fopen - Open function for "max" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "max" debugfs + * interface to the hardware latency detector. + */ +static int debug_max_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_max_fread - Read function for "max" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "max" debugfs + * interface to the hardware latency detector. Can be used to determine + * the maximum latency value observed since it was last reset. + */ +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample); +} + +/** + * debug_max_fwrite - Write function for "max" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "max" debugfs + * interface to the hardware latency detector. Can be used to reset the + * maximum or set it to some other desired value - if, then, subsequent + * measurements exceed this value, the maximum will be updated. + */ +static ssize_t debug_max_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample); +} + + +/** + * debug_sample_fopen - An open function for "sample" debugfs interface + * @inode: The in-kernel inode representation of this debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function handles opening the "sample" file within the hardware + * latency detector debugfs directory interface. This file is used to read + * raw samples from the global ring_buffer and allows the user to see a + * running latency history. Can be opened blocking or non-blocking, + * affecting whether it behaves as a buffer read pipe, or does not. + * Implements simple locking to prevent multiple simultaneous use. 
+ */ +static int debug_sample_fopen(struct inode *inode, struct file *filp) +{ + if (!atomic_add_unless(&data.sample_open, 1, 1)) + return -EBUSY; + else + return 0; +} + +/** + * debug_sample_fread - A read function for "sample" debugfs interface + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that will contain the samples read + * @cnt: The maximum bytes to read from the debugfs "file" + * @ppos: The current position in the debugfs "file" + * + * This function handles reading from the "sample" file within the hardware + * latency detector debugfs directory interface. This file is used to read + * raw samples from the global ring_buffer and allows the user to see a + * running latency history. By default this will block pending a new + * value written into the sample buffer, unless there are already a + * number of value(s) waiting in the buffer, or the sample file was + * previously opened in a non-blocking mode of operation. + */ +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int len = 0; + char buf[64]; + struct sample *sample = NULL; + + if (!enabled) + return 0; + + sample = kzalloc(sizeof(struct sample), GFP_KERNEL); + if (!sample) + return -ENOMEM; + + while (!buffer_get_sample(sample)) { + + DEFINE_WAIT(wait); + + if (filp->f_flags & O_NONBLOCK) { + len = -EAGAIN; + goto out; + } + + prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&data.wq, &wait); + + if (signal_pending(current)) { + len = -EINTR; + goto out; + } + + if (!enabled) { /* enable was toggled */ + len = 0; + goto out; + } + } + + len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\n", + sample->timestamp.tv_sec, + sample->timestamp.tv_nsec, + sample->duration); + + + /* handling partial reads is more trouble than it's worth */ + if (len > cnt) + goto out; + + if (copy_to_user(ubuf, buf, len)) + len = -EFAULT; + +out: + kfree(sample); + return len; +} + +/** + * debug_sample_release - Release function for "sample" debugfs interface + * @inode: The in-kernel inode represenation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function completes the close of the debugfs interface "sample" file. + * Frees the sample_open "lock" so that other users may open the interface. + */ +static int debug_sample_release(struct inode *inode, struct file *filp) +{ + atomic_dec(&data.sample_open); + + return 0; +} + +/** + * debug_threshold_fopen - Open function for "threshold" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "threshold" debugfs + * interface to the hardware latency detector. + */ +static int debug_threshold_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_threshold_fread - Read function for "threshold" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "threshold" debugfs + * interface to the hardware latency detector. It can be used to determine + * the current threshold level at which a latency will be recorded in the + * global ring buffer, typically on the order of 10us. 
+ */ +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold); +} + +/** + * debug_threshold_fwrite - Write function for "threshold" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "threshold" debugfs + * interface to the hardware latency detector. It can be used to configure + * the threshold level at which any subsequently detected latencies will + * be recorded into the global ring buffer. + */ +static ssize_t debug_threshold_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + int ret; + + ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold); + + if (enabled) + wake_up_process(kthread); + + return ret; +} + +/** + * debug_width_fopen - Open function for "width" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "width" debugfs + * interface to the hardware latency detector. + */ +static int debug_width_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_width_fread - Read function for "width" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "width" debugfs + * interface to the hardware latency detector. It can be used to determine + * for how many us of the total window us we will actively sample for any + * hardware-induced latency periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. + */ +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width); +} + +/** + * debug_width_fwrite - Write function for "width" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "width" debugfs + * interface to the hardware latency detector. It can be used to configure + * for how many us of the total window us we will actively sample for any + * hardware-induced latency periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. It + * is enforced that width is less than the total window size.
+ */ +static ssize_t debug_width_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = kstrtoull(buf, 10, &val); + if (0 != err) + return -EINVAL; + + mutex_lock(&data.lock); + if (val < data.sample_window) + data.sample_width = val; + else { + mutex_unlock(&data.lock); + return -EINVAL; + } + mutex_unlock(&data.lock); + + if (enabled) + wake_up_process(kthread); + + return csize; +} + +/** + * debug_window_fopen - Open function for "window" debugfs entry + * @inode: The in-kernel inode representation of the debugfs "file" + * @filp: The active open file structure for the debugfs "file" + * + * This function provides an open implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. + */ +static int debug_window_fopen(struct inode *inode, struct file *filp) +{ + return 0; +} + +/** + * debug_window_fread - Read function for "window" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a read implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to read the total window size. + */ +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window); +} + +/** + * debug_window_fwrite - Write function for "window" debugfs entry + * @filp: The active open file structure for the debugfs "file" + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in the debugfs "file" + * + * This function provides a write implementation for the "window" debugfs + * interface to the hardware latency detector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to write a new total window size. It + * is enforced that any value written must be greater than the sample width + * size, or an error results.
+ */ +static ssize_t debug_window_fwrite(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + char buf[U64STR_SIZE]; + int csize = min(cnt, sizeof(buf)); + u64 val = 0; + int err = 0; + + memset(buf, '\0', sizeof(buf)); + if (copy_from_user(buf, ubuf, csize)) + return -EFAULT; + + buf[U64STR_SIZE-1] = '\0'; /* just in case */ + err = kstrtoull(buf, 10, &val); + if (0 != err) + return -EINVAL; + + mutex_lock(&data.lock); + if (data.sample_width < val) + data.sample_window = val; + else { + mutex_unlock(&data.lock); + return -EINVAL; + } + mutex_unlock(&data.lock); + + return csize; +} + +/* + * Function pointers for the "count" debugfs file operations + */ +static const struct file_operations count_fops = { + .open = debug_count_fopen, + .read = debug_count_fread, + .write = debug_count_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "enable" debugfs file operations + */ +static const struct file_operations enable_fops = { + .open = debug_enable_fopen, + .read = debug_enable_fread, + .write = debug_enable_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "max" debugfs file operations + */ +static const struct file_operations max_fops = { + .open = debug_max_fopen, + .read = debug_max_fread, + .write = debug_max_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "sample" debugfs file operations + */ +static const struct file_operations sample_fops = { + .open = debug_sample_fopen, + .read = debug_sample_fread, + .release = debug_sample_release, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "threshold" debugfs file operations + */ +static const struct file_operations threshold_fops = { + .open = debug_threshold_fopen, + .read = debug_threshold_fread, + .write = debug_threshold_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "width" debugfs file operations + */ +static const struct file_operations width_fops = { + .open = debug_width_fopen, + .read = debug_width_fread, + .write = debug_width_fwrite, + .owner = THIS_MODULE, +}; + +/* + * Function pointers for the "window" debugfs file operations + */ +static const struct file_operations window_fops = { + .open = debug_window_fopen, + .read = debug_window_fread, + .write = debug_window_fwrite, + .owner = THIS_MODULE, +}; + +/** + * init_debugfs - A function to initialize the debugfs interface files + * + * This function creates entries in debugfs for "hwlat_detector", including + * files to read values from the detector, current samples, and the + * maximum sample that has been captured since the hardware latency + * dectector was started. 
+ */ +static int init_debugfs(void) +{ + int ret = -ENOMEM; + + debug_dir = debugfs_create_dir(DRVNAME, NULL); + if (!debug_dir) + goto err_debug_dir; + + debug_sample = debugfs_create_file("sample", 0444, + debug_dir, NULL, + &sample_fops); + if (!debug_sample) + goto err_sample; + + debug_count = debugfs_create_file("count", 0444, + debug_dir, NULL, + &count_fops); + if (!debug_count) + goto err_count; + + debug_max = debugfs_create_file("max", 0444, + debug_dir, NULL, + &max_fops); + if (!debug_max) + goto err_max; + + debug_sample_window = debugfs_create_file("window", 0644, + debug_dir, NULL, + &window_fops); + if (!debug_sample_window) + goto err_window; + + debug_sample_width = debugfs_create_file("width", 0644, + debug_dir, NULL, + &width_fops); + if (!debug_sample_width) + goto err_width; + + debug_threshold = debugfs_create_file("threshold", 0644, + debug_dir, NULL, + &threshold_fops); + if (!debug_threshold) + goto err_threshold; + + debug_enable = debugfs_create_file("enable", 0644, + debug_dir, &enabled, + &enable_fops); + if (!debug_enable) + goto err_enable; + + else { + ret = 0; + goto out; + } + +err_enable: + debugfs_remove(debug_threshold); +err_threshold: + debugfs_remove(debug_sample_width); +err_width: + debugfs_remove(debug_sample_window); +err_window: + debugfs_remove(debug_max); +err_max: + debugfs_remove(debug_count); +err_count: + debugfs_remove(debug_sample); +err_sample: + debugfs_remove(debug_dir); +err_debug_dir: +out: + return ret; +} + +/** + * free_debugfs - A function to cleanup the debugfs file interface + */ +static void free_debugfs(void) +{ + /* could also use a debugfs_remove_recursive */ + debugfs_remove(debug_enable); + debugfs_remove(debug_threshold); + debugfs_remove(debug_sample_width); + debugfs_remove(debug_sample_window); + debugfs_remove(debug_max); + debugfs_remove(debug_count); + debugfs_remove(debug_sample); + debugfs_remove(debug_dir); +} + +/** + * detector_init - Standard module initialization code + */ +static int detector_init(void) +{ + int ret = -ENOMEM; + + pr_info(BANNER "version %s\n", VERSION); + + ret = init_stats(); + if (0 != ret) + goto out; + + ret = init_debugfs(); + if (0 != ret) + goto err_stats; + + if (enabled) + ret = start_kthread(); + + goto out; + +err_stats: + ring_buffer_free(ring_buffer); +out: + return ret; + +} + +/** + * detector_exit - Standard module cleanup code + */ +static void detector_exit(void) +{ + int err; + + if (enabled) { + enabled = 0; + err = stop_kthread(); + if (err) + pr_err(BANNER "cannot stop kthread\n"); + } + + free_debugfs(); + ring_buffer_free(ring_buffer); /* free up the ring buffer */ + +} + +module_init(detector_init); +module_exit(detector_exit); -- cgit v1.2.3 From eace5aa46f014029cc118ab04c218772dab58341 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Aug 2013 17:33:25 -0400 Subject: hwlat-detector: Update hwlat_detector to add outer loop detection The hwlat_detector reads two timestamps in a row, then reports any gap between those calls. The problem is, it misses everything between the second reading of the time stamp and the first reading of the time stamp in the next loop. That's where most of the time is spent, which means, chances are likely that it will miss all hardware latencies. This defeats the purpose. By also testing the first time stamp against the previous loop's second time stamp (the outer loop), we are more likely to find a latency.
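In outline, the fix is to remember the previous iteration's second timestamp and also check the gap leading into the current iteration (a simplified sketch; the actual diff follows below):

        t1 = ktime_get();       /* first reading of this iteration */
        t2 = ktime_get();

        if (last_t2.tv64) {
                /* gap between the previous t2 and this t1: the outer loop */
                diff = ktime_to_us(ktime_sub(t1, last_t2));
                if (diff > outer_sample)
                        outer_sample = diff;
        }
        last_t2 = t2;

A sample is then reported whenever either the inner or the outer gap exceeds the threshold.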
Setting the threshold to 1, here's what the report now looks like: 1347415723.0232202770 0 2 1347415725.0234202822 0 2 1347415727.0236202875 0 2 1347415729.0238202928 0 2 1347415731.0240202980 0 2 1347415734.0243203061 0 2 1347415736.0245203113 0 2 1347415738.0247203166 2 0 1347415740.0249203219 0 3 1347415742.0251203272 0 3 1347415743.0252203299 0 3 1347415745.0254203351 0 2 1347415747.0256203404 0 2 1347415749.0258203457 0 2 1347415751.0260203510 0 2 1347415754.0263203589 0 2 1347415756.0265203642 0 2 1347415758.0267203695 0 2 1347415760.0269203748 0 2 1347415762.0271203801 0 2 1347415764.0273203853 2 0 There's some hardware latency that takes 2 microseconds to run. Signed-off-by: Steven Rostedt Signed-off-by: Sebastian Andrzej Siewior --- drivers/misc/hwlat_detector.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c index 6864f3c24c8b..c07e85932cbf 100644 --- a/drivers/misc/hwlat_detector.c +++ b/drivers/misc/hwlat_detector.c @@ -143,6 +143,7 @@ static void detector_exit(void); struct sample { u64 seqnum; /* unique sequence */ u64 duration; /* ktime delta */ + u64 outer_duration; /* ktime delta (outer loop) */ struct timespec timestamp; /* wall time */ unsigned long lost; }; @@ -219,11 +220,13 @@ static struct sample *buffer_get_sample(struct sample *sample) */ static int get_sample(void *unused) { - ktime_t start, t1, t2; + ktime_t start, t1, t2, last_t2; s64 diff, total = 0; u64 sample = 0; + u64 outer_sample = 0; int ret = 1; + last_t2.tv64 = 0; start = ktime_get(); /* start timestamp */ do { @@ -231,7 +234,22 @@ static int get_sample(void *unused) t1 = ktime_get(); /* we'll look for a discontinuity */ t2 = ktime_get(); + if (last_t2.tv64) { + /* Check the delta from outer loop (t2 to next t1) */ + diff = ktime_to_us(ktime_sub(t1, last_t2)); + /* This shouldn't happen */ + if (diff < 0) { + pr_err(BANNER "time running backwards\n"); + goto out; + } + if (diff > outer_sample) + outer_sample = diff; + } + last_t2 = t2; + total = ktime_to_us(ktime_sub(t2, start)); /* sample width */ + + /* This checks the inner loop (t1 to t2) */ diff = ktime_to_us(ktime_sub(t2, t1)); /* current diff */ /* This shouldn't happen */ @@ -246,12 +264,13 @@ static int get_sample(void *unused) } while (total <= data.sample_width); /* If we exceed the threshold value, we have found a hardware latency */ - if (sample > data.threshold) { + if (sample > data.threshold || outer_sample > data.threshold) { struct sample s; data.count++; s.seqnum = data.count; s.duration = sample; + s.outer_duration = outer_sample; s.timestamp = CURRENT_TIME; __buffer_add_sample(&s); @@ -738,10 +757,11 @@ static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf, } } - len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\n", - sample->timestamp.tv_sec, - sample->timestamp.tv_nsec, - sample->duration); + len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\t%llu\n", + sample->timestamp.tv_sec, + sample->timestamp.tv_nsec, + sample->duration, + sample->outer_duration); /* handling partial reads is more trouble than it's worth */ -- cgit v1.2.3 From 40d1682c83bfa922f5d7fa2330085f30823b40ad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Aug 2013 17:33:26 -0400 Subject: hwlat-detector: Use trace_clock_local if available As ktime_get() calls into the timing code which does a read_seq(), it may be affected by other CPUS that touch that lock. 
To remove this dependency, use the trace_clock_local() which is already exported for module use. If CONFIG_TRACING is enabled, use that as the clock, otherwise use ktime_get(). Signed-off-by: Steven Rostedt Signed-off-by: Sebastian Andrzej Siewior --- drivers/misc/hwlat_detector.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c index c07e85932cbf..0fcc0e38df42 100644 --- a/drivers/misc/hwlat_detector.c +++ b/drivers/misc/hwlat_detector.c @@ -51,6 +51,7 @@ #include #include #include +#include #define BUF_SIZE_DEFAULT 262144UL /* 8K*(sizeof(entry)) */ #define BUF_FLAGS (RB_FL_OVERWRITE) /* no block on full */ @@ -211,6 +212,21 @@ static struct sample *buffer_get_sample(struct sample *sample) return sample; } +#ifndef CONFIG_TRACING +#define time_type ktime_t +#define time_get() ktime_get() +#define time_to_us(x) ktime_to_us(x) +#define time_sub(a, b) ktime_sub(a, b) +#define init_time(a, b) (a).tv64 = b +#define time_u64(a) ((a).tv64) +#else +#define time_type u64 +#define time_get() trace_clock_local() +#define time_to_us(x) div_u64(x, 1000) +#define time_sub(a, b) ((a) - (b)) +#define init_time(a, b) (a = b) +#define time_u64(a) a +#endif /** * get_sample - sample the CPU TSC and look for likely hardware latencies * @unused: This is not used but is a part of the stop_machine API @@ -220,23 +236,23 @@ static struct sample *buffer_get_sample(struct sample *sample) */ static int get_sample(void *unused) { - ktime_t start, t1, t2, last_t2; + time_type start, t1, t2, last_t2; s64 diff, total = 0; u64 sample = 0; u64 outer_sample = 0; int ret = 1; - last_t2.tv64 = 0; - start = ktime_get(); /* start timestamp */ + init_time(last_t2, 0); + start = time_get(); /* start timestamp */ do { - t1 = ktime_get(); /* we'll look for a discontinuity */ - t2 = ktime_get(); + t1 = time_get(); /* we'll look for a discontinuity */ + t2 = time_get(); - if (last_t2.tv64) { + if (time_u64(last_t2)) { /* Check the delta from outer loop (t2 to next t1) */ - diff = ktime_to_us(ktime_sub(t1, last_t2)); + diff = time_to_us(time_sub(t1, last_t2)); /* This shouldn't happen */ if (diff < 0) { pr_err(BANNER "time running backwards\n"); @@ -247,10 +263,10 @@ static int get_sample(void *unused) } last_t2 = t2; - total = ktime_to_us(ktime_sub(t2, start)); /* sample width */ + total = time_to_us(time_sub(t2, start)); /* sample width */ /* This checks the inner loop (t1 to t2) */ - diff = ktime_to_us(ktime_sub(t2, t1)); /* current diff */ + diff = time_to_us(time_sub(t2, t1)); /* current diff */ /* This shouldn't happen */ if (diff < 0) { -- cgit v1.2.3 From 0ef1e79ff2851a82c78ec323df0ba7ff39e2183c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Aug 2013 17:33:27 -0400 Subject: hwlat-detector: Use thread instead of stop machine There's no reason to use stop machine to search for hardware latency. Simply disabling interrupts while running the loop will do enough to check if something comes in that wasn't disabled by interrupts being off, which is exactly what stop machine does. Instead of using stop machine, just have the thread disable interrupts while it checks for hardware latency. 
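stop_machine() keeps every other CPU spinning with interrupts off, which is far more disruption than the detector needs: it only cares about gaps on the sampling CPU, and anything that can still steal time while local interrupts are disabled (SMIs and the like) is exactly what it is trying to catch. The resulting loop, condensed to a sketch (the complete change to kthread_fn() is in the diff below), is simply:

	while (!kthread_should_stop()) {
		mutex_lock(&data.lock);

		local_irq_disable();		/* keep interrupts out, nothing more */
		ret = get_sample();
		local_irq_enable();

		if (ret > 0)
			wake_up(&data.wq);	/* a latency was recorded */

		mutex_unlock(&data.lock);

		if (msleep_interruptible(interval))
			break;
	}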
Signed-off-by: Steven Rostedt Signed-off-by: Sebastian Andrzej Siewior --- drivers/misc/hwlat_detector.c | 60 +++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c index 0fcc0e38df42..6e88113bc098 100644 --- a/drivers/misc/hwlat_detector.c +++ b/drivers/misc/hwlat_detector.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -107,7 +106,6 @@ struct data; /* Global state */ /* Sampling functions */ static int __buffer_add_sample(struct sample *sample); static struct sample *buffer_get_sample(struct sample *sample); -static int get_sample(void *unused); /* Threading and state */ static int kthread_fn(void *unused); @@ -149,7 +147,7 @@ struct sample { unsigned long lost; }; -/* keep the global state somewhere. Mostly used under stop_machine. */ +/* keep the global state somewhere. */ static struct data { struct mutex lock; /* protect changes */ @@ -172,7 +170,7 @@ static struct data { * @sample: The new latency sample value * * This receives a new latency sample and records it in a global ring buffer. - * No additional locking is used in this case - suited for stop_machine use. + * No additional locking is used in this case. */ static int __buffer_add_sample(struct sample *sample) { @@ -229,18 +227,18 @@ static struct sample *buffer_get_sample(struct sample *sample) #endif /** * get_sample - sample the CPU TSC and look for likely hardware latencies - * @unused: This is not used but is a part of the stop_machine API * * Used to repeatedly capture the CPU TSC (or similar), looking for potential - * hardware-induced latency. Called under stop_machine, with data.lock held. + * hardware-induced latency. Called with interrupts disabled and with + * data.lock held. */ -static int get_sample(void *unused) +static int get_sample(void) { time_type start, t1, t2, last_t2; s64 diff, total = 0; u64 sample = 0; u64 outer_sample = 0; - int ret = 1; + int ret = -1; init_time(last_t2, 0); start = time_get(); /* start timestamp */ @@ -279,10 +277,14 @@ static int get_sample(void *unused) } while (total <= data.sample_width); + ret = 0; + /* If we exceed the threshold value, we have found a hardware latency */ if (sample > data.threshold || outer_sample > data.threshold) { struct sample s; + ret = 1; + data.count++; s.seqnum = data.count; s.duration = sample; @@ -295,7 +297,6 @@ static int get_sample(void *unused) data.max_sample = sample; } - ret = 0; out: return ret; } @@ -305,32 +306,30 @@ out: * @unused: A required part of the kthread API. * * Used to periodically sample the CPU TSC via a call to get_sample. We - * use stop_machine, whith does (intentionally) introduce latency since we + * disable interrupts, which does (intentionally) introduce latency since we * need to ensure nothing else might be running (and thus pre-empting). * Obviously this should never be used in production environments. * - * stop_machine will schedule us typically only on CPU0 which is fine for - * almost every real-world hardware latency situation - but we might later - * generalize this if we find there are any actualy systems with alternate - * SMI delivery or other non CPU0 hardware latencies. + * Currently this runs on which ever CPU it was scheduled on, but most + * real-worald hardware latency situations occur across several CPUs, + * but we might later generalize this if we find there are any actualy + * systems with alternate SMI delivery or other hardware latencies. 
*/ static int kthread_fn(void *unused) { - int err = 0; - u64 interval = 0; + int ret; + u64 interval; while (!kthread_should_stop()) { mutex_lock(&data.lock); - err = stop_machine(get_sample, unused, 0); - if (err) { - /* Houston, we have a problem */ - mutex_unlock(&data.lock); - goto err_out; - } + local_irq_disable(); + ret = get_sample(); + local_irq_enable(); - wake_up(&data.wq); /* wake up reader(s) */ + if (ret > 0) + wake_up(&data.wq); /* wake up reader(s) */ interval = data.sample_window - data.sample_width; do_div(interval, USEC_PER_MSEC); /* modifies interval value */ @@ -338,15 +337,10 @@ static int kthread_fn(void *unused) mutex_unlock(&data.lock); if (msleep_interruptible(interval)) - goto out; + break; } - goto out; -err_out: - pr_err(BANNER "could not call stop_machine, disabling\n"); - enabled = 0; -out: - return err; + return 0; } /** @@ -442,8 +436,7 @@ out: * This function provides a generic read implementation for the global state * "data" structure debugfs filesystem entries. It would be nice to use * simple_attr_read directly, but we need to make sure that the data.lock - * spinlock is held during the actual read (even though we likely won't ever - * actually race here as the updater runs under a stop_machine context). + * is held during the actual read. */ static ssize_t simple_data_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos, const u64 *entry) @@ -478,8 +471,7 @@ static ssize_t simple_data_read(struct file *filp, char __user *ubuf, * This function provides a generic write implementation for the global state * "data" structure debugfs filesystem entries. It would be nice to use * simple_attr_write directly, but we need to make sure that the data.lock - * spinlock is held during the actual write (even though we likely won't ever - * actually race here as the updater runs under a stop_machine context). + * is held during the actual write. */ static ssize_t simple_data_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos, u64 *entry) -- cgit v1.2.3 From 7ff29b7a7cca98dfd91bb272464047cc5fcfe9c0 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 30 Aug 2013 07:57:25 +0200 Subject: hwlat-detector: Don't ignore threshold module parameter If the user specified a threshold at module load time, use it. 
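The one-liner below leans on the GNU "?:" shorthand: "x ?: y" evaluates to x when x is non-zero and to y otherwise, without evaluating x twice, so a threshold left at its zero default still falls back to DEFAULT_LAT_THRESHOLD. A stand-alone illustration (values made up):

	#include <stdio.h>

	#define DEFAULT_LAT_THRESHOLD	10	/* stand-in default, usecs */

	int main(void)
	{
		unsigned long threshold = 0;	/* module parameter not given */
		unsigned long eff;

		eff = threshold ?: DEFAULT_LAT_THRESHOLD;
		printf("effective threshold: %lu\n", eff);	/* prints 10 */

		threshold = 250;		/* e.g. loaded with threshold=250 */
		eff = threshold ?: DEFAULT_LAT_THRESHOLD;
		printf("effective threshold: %lu\n", eff);	/* prints 250 */
		return 0;
	}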
Cc: stable-rt@vger.kernel.org Acked-by: Steven Rostedt Signed-off-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior --- drivers/misc/hwlat_detector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c index 6e88113bc098..2429c4331e68 100644 --- a/drivers/misc/hwlat_detector.c +++ b/drivers/misc/hwlat_detector.c @@ -414,7 +414,7 @@ static int init_stats(void) goto out; __reset_stats(); - data.threshold = DEFAULT_LAT_THRESHOLD; /* threshold us */ + data.threshold = threshold ?: DEFAULT_LAT_THRESHOLD; /* threshold us */ data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */ data.sample_width = DEFAULT_SAMPLE_WIDTH; /* width us */ -- cgit v1.2.3 From 6c44bc000f60ab6e57afa9890ba9f9aede95bcbf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 Jul 2011 17:58:40 +0200 Subject: printk-kill.patch Signed-off-by: Thomas Gleixner --- include/linux/printk.h | 3 +- kernel/printk/printk.c | 79 +++++++++++++++++++++++++++++++++++--------------- kernel/watchdog.c | 14 +++++++-- 3 files changed, 70 insertions(+), 26 deletions(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index d78125f73ac4..9fd2d6fa4489 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -119,9 +119,11 @@ int no_printk(const char *fmt, ...) extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); void early_vprintk(const char *fmt, va_list ap); +extern void printk_kill(void); #else static inline __printf(1, 2) __cold void early_printk(const char *s, ...) { } +static inline void printk_kill(void) { } #endif #ifdef CONFIG_PRINTK @@ -155,7 +157,6 @@ extern int __printk_ratelimit(const char *func); #define printk_ratelimit() __printk_ratelimit(__func__) extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); - extern int printk_delay_msec; extern int dmesg_restrict; extern int kptr_restrict; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bf95fdad4d96..87846c32e7f3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1613,6 +1613,55 @@ static size_t cont_print_text(char *text, size_t size) return textlen; } +#ifdef CONFIG_EARLY_PRINTK +struct console *early_console; + +void early_vprintk(const char *fmt, va_list ap) +{ + if (early_console) { + char buf[512]; + int n = vscnprintf(buf, sizeof(buf), fmt, ap); + + early_console->write(early_console, buf, n); + } +} + +asmlinkage void early_printk(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + early_vprintk(fmt, ap); + va_end(ap); +} + +/* + * This is independent of any log levels - a global + * kill switch that turns off all of printk. + * + * Used by the NMI watchdog if early-printk is enabled. 
+ */ +static bool __read_mostly printk_killswitch; + +void printk_kill(void) +{ + printk_killswitch = true; +} + +static int forced_early_printk(const char *fmt, va_list ap) +{ + if (!printk_killswitch) + return 0; + early_vprintk(fmt, ap); + return 1; +} +#else +static inline int forced_early_printk(const char *fmt, va_list ap) +{ + return 0; +} +#endif + asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) @@ -1629,6 +1678,13 @@ asmlinkage int vprintk_emit(int facility, int level, /* cpu currently holding logbuf_lock in this function */ static volatile unsigned int logbuf_cpu = UINT_MAX; + /* + * Fall back to early_printk if a debugging subsystem has + * killed printk output + */ + if (unlikely(forced_early_printk(fmt, args))) + return 1; + if (level == SCHED_MESSAGE_LOGLEVEL) { level = -1; in_sched = true; @@ -1878,29 +1934,6 @@ static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ -#ifdef CONFIG_EARLY_PRINTK -struct console *early_console; - -void early_vprintk(const char *fmt, va_list ap) -{ - if (early_console) { - char buf[512]; - int n = vscnprintf(buf, sizeof(buf), fmt, ap); - - early_console->write(early_console, buf, n); - } -} - -asmlinkage __visible void early_printk(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - early_vprintk(fmt, ap); - va_end(ap); -} -#endif - static int __add_preferred_console(char *name, int idx, char *options, char *brl_options) { diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 70bf11815f84..085bd3c641c7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -248,6 +248,8 @@ static int is_softlockup(unsigned long touch_ts) #ifdef CONFIG_HARDLOCKUP_DETECTOR +static DEFINE_RAW_SPINLOCK(watchdog_output_lock); + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -281,13 +283,21 @@ static void watchdog_overflow_callback(struct perf_event *event, /* only print hardlockups once */ if (__this_cpu_read(hard_watchdog_warn) == true) return; + /* + * If early-printk is enabled then make sure we do not + * lock up in printk() and kill console logging: + */ + printk_kill(); - if (hardlockup_panic) + if (hardlockup_panic) { panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - else + } else { + raw_spin_lock(&watchdog_output_lock); WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + raw_spin_unlock(&watchdog_output_lock); + } __this_cpu_write(hard_watchdog_warn, true); return; -- cgit v1.2.3 From 9b81f07f4d14657808011b4cfc09de53fad7f801 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2011 14:29:33 +0200 Subject: printk: 'force_early_printk' boot param to help with debugging Gives me an option to screw printk and actually see what the machine says. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1314967289.1301.11.camel@twins Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-ykb97nsfmobq44xketrxs977@git.kernel.org --- kernel/printk/printk.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 87846c32e7f3..c2cc106455f3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1643,6 +1643,13 @@ asmlinkage void early_printk(const char *fmt, ...) 
*/ static bool __read_mostly printk_killswitch; +static int __init force_early_printk_setup(char *str) +{ + printk_killswitch = true; + return 0; +} +early_param("force_early_printk", force_early_printk_setup); + void printk_kill(void) { printk_killswitch = true; -- cgit v1.2.3 From 31ad91627dc74c9029b6bda45645f5302a953faf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jun 2011 12:39:57 +0200 Subject: rt-preempt-base-config.patch Signed-off-by: Thomas Gleixner --- kernel/Kconfig.preempt | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 3f9c97419f02..c669134982ec 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,3 +1,10 @@ +config PREEMPT + bool + select PREEMPT_COUNT + +config PREEMPT_RT_BASE + bool + select PREEMPT choice prompt "Preemption Model" @@ -33,9 +40,9 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT +config PREEMPT__LL bool "Preemptible Kernel (Low-Latency Desktop)" - select PREEMPT_COUNT + select PREEMPT select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help This option reduces the latency of the kernel by making @@ -52,6 +59,14 @@ config PREEMPT embedded system with latency requirements in the milliseconds range. +config PREEMPT_RTB + bool "Preemptible Kernel (Basic RT)" + select PREEMPT_RT_BASE + help + This option is basically the same as (Low-Latency Desktop) but + enables changes which are preliminary for the full preemptible + RT kernel. + endchoice config PREEMPT_COUNT -- cgit v1.2.3 From d17add1c54676213d40799428cadeb08afe0a914 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 24 Jul 2011 12:11:43 +0200 Subject: kconfig-disable-a-few-options-rt.patch Disable stuff which is known to have issues on RT Signed-off-by: Thomas Gleixner --- arch/Kconfig | 1 + mm/Kconfig | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/Kconfig b/arch/Kconfig index 05d7a8a458d5..84d340b061a9 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -6,6 +6,7 @@ config OPROFILE tristate "OProfile system profiling" depends on PROFILING depends on HAVE_OPROFILE + depends on !PREEMPT_RT_FULL select RING_BUFFER select RING_BUFFER_ALLOW_SWAP help diff --git a/mm/Kconfig b/mm/Kconfig index 1d1ae6b078fd..296e4e048db8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -408,7 +408,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL select COMPACTION help Transparent Hugepages allows the kernel to use huge pages and -- cgit v1.2.3 From 22e4a832d6e6640f2f73f2747facf20689eda937 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 14:58:57 +0200 Subject: kconfig-preempt-rt-full.patch Signed-off-by: Thomas Gleixner --- init/Makefile | 2 +- kernel/Kconfig.preempt | 8 ++++++++ scripts/mkcompile_h | 4 +++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/init/Makefile b/init/Makefile index 7bc47ee31c36..88cf473554e0 100644 --- a/init/Makefile +++ b/init/Makefile @@ -33,4 +33,4 @@ silent_chk_compile.h = : include/generated/compile.h: FORCE @$($(quiet)chk_compile.h) $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)" + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)" diff --git 
a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c669134982ec..f8a2982bdbde 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -67,6 +67,14 @@ config PREEMPT_RTB enables changes which are preliminary for the full preemptible RT kernel. +config PREEMPT_RT_FULL + bool "Fully Preemptible Kernel (RT)" + depends on IRQ_FORCED_THREADING + select PREEMPT_RT_BASE + select PREEMPT_RCU + help + All and everything + endchoice config PREEMPT_COUNT diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h index 6fdc97ef6023..523e0420d7f0 100755 --- a/scripts/mkcompile_h +++ b/scripts/mkcompile_h @@ -4,7 +4,8 @@ TARGET=$1 ARCH=$2 SMP=$3 PREEMPT=$4 -CC=$5 +RT=$5 +CC=$6 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; } @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION" CONFIG_FLAGS="" if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" # Truncate to maximum length -- cgit v1.2.3 From aad4f8db11d550a4622f1a19ba46929f00c688e4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:58 -0500 Subject: bug: BUG_ON/WARN_ON variants dependend on RT/!RT Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-generic/bug.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 630dd2372238..850e4d993a88 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -206,6 +206,20 @@ extern void warn_slowpath_null(const char *file, const int line); # define WARN_ON_SMP(x) ({0;}) #endif +#ifdef CONFIG_PREEMPT_RT_BASE +# define BUG_ON_RT(c) BUG_ON(c) +# define BUG_ON_NONRT(c) do { } while (0) +# define WARN_ON_RT(condition) WARN_ON(condition) +# define WARN_ON_NONRT(condition) do { } while (0) +# define WARN_ON_ONCE_NONRT(condition) do { } while (0) +#else +# define BUG_ON_RT(c) do { } while (0) +# define BUG_ON_NONRT(c) BUG_ON(c) +# define WARN_ON_RT(condition) do { } while (0) +# define WARN_ON_NONRT(condition) WARN_ON(condition) +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition) +#endif + #endif /* __ASSEMBLY__ */ #endif -- cgit v1.2.3 From 07fbf2993a1e6f846887fac7aebbb683975a51a3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 22:34:14 +0200 Subject: rt: local_irq_* variants depending on RT/!RT Add local_irq_*_(no)rt variant which are mainly used to break interrupt disabled sections on PREEMPT_RT or to explicitely disable interrupts on PREEMPT_RT. 
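A typical conversion site then looks like the fragment below (illustrative only — frob_dev, FROB_CTRL and frob_hw_kick() are invented names): on !RT the section genuinely runs with interrupts disabled, while on PREEMPT_RT_FULL the _nort variants leave interrupts untouched and the code relies on whatever lock already serializes it.

	static void frob_hw_kick(struct frob_dev *fd)
	{
		unsigned long flags;

		local_irq_save_nort(flags);	/* irqs off on !RT, flags snapshot only on RT */
		writel(fd->ctrl, fd->base + FROB_CTRL);
		local_irq_restore_nort(flags);
	}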
Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 2 +- include/linux/irqflags.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f2eee2f304c4..265b87a48a73 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -180,7 +180,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); #ifdef CONFIG_LOCKDEP # define local_irq_enable_in_hardirq() do { } while (0) #else -# define local_irq_enable_in_hardirq() local_irq_enable() +# define local_irq_enable_in_hardirq() local_irq_enable_nort() #endif extern void disable_irq_nosync(unsigned int irq); diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index d176d658fe25..cc05eb7f6df4 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -147,4 +147,23 @@ #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ +/* + * local_irq* variants depending on RT/!RT + */ +#ifdef CONFIG_PREEMPT_RT_FULL +# define local_irq_disable_nort() do { } while (0) +# define local_irq_enable_nort() do { } while (0) +# define local_irq_save_nort(flags) local_save_flags(flags) +# define local_irq_restore_nort(flags) (void)(flags) +# define local_irq_disable_rt() local_irq_disable() +# define local_irq_enable_rt() local_irq_enable() +#else +# define local_irq_disable_nort() local_irq_disable() +# define local_irq_enable_nort() local_irq_enable() +# define local_irq_save_nort(flags) local_irq_save(flags) +# define local_irq_restore_nort(flags) local_irq_restore(flags) +# define local_irq_disable_rt() do { } while (0) +# define local_irq_enable_rt() do { } while (0) +#endif + #endif -- cgit v1.2.3 From f95ff2613996a85afb7e319c0f82180e5e3b4201 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2009 12:38:56 +0200 Subject: preempt: Provide preempt_*_(no)rt variants RT needs a few preempt_disable/enable points which are not necessary otherwise. Implement variants to avoid #ifdeffery. Signed-off-by: Thomas Gleixner --- include/linux/preempt.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index de83b4eb1642..18508708f22b 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -47,7 +47,11 @@ do { \ preempt_count_dec(); \ } while (0) -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#ifdef CONFIG_PREEMPT_RT_BASE +# define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#else +# define preempt_enable_no_resched() preempt_enable() +#endif #ifdef CONFIG_PREEMPT #define preempt_enable() \ @@ -144,6 +148,18 @@ do { \ set_preempt_need_resched(); \ } while (0) +#ifdef CONFIG_PREEMPT_RT_FULL +# define preempt_disable_rt() preempt_disable() +# define preempt_enable_rt() preempt_enable() +# define preempt_disable_nort() barrier() +# define preempt_enable_nort() barrier() +#else +# define preempt_disable_rt() barrier() +# define preempt_enable_rt() barrier() +# define preempt_disable_nort() preempt_disable() +# define preempt_enable_nort() preempt_enable() +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; -- cgit v1.2.3 From 32b28cd1d719e1d472882b335cc070461ce2ed63 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Jul 2009 08:44:29 -0500 Subject: ata: Do not disable interrupts in ide code for preempt-rt Use the local_irq_*_nort variants. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- drivers/ata/libata-sff.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 2e86e3b85266..22ebbbd42c3f 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf, unsigned long flags; unsigned int consumed; - local_irq_save(flags); + local_irq_save_nort(flags); consumed = ata_sff_data_xfer32(dev, buf, buflen, rw); - local_irq_restore(flags); + local_irq_restore_nort(flags); return consumed; } @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) unsigned long flags; /* FIXME: use a bounce buffer */ - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page); /* do the actual data transfer */ @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) do_write); kunmap_atomic(buf); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { buf = page_address(page); ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size, @@ -864,7 +864,7 @@ next_sg: unsigned long flags; /* FIXME: use bounce buffer */ - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page); /* do the actual data transfer */ @@ -872,7 +872,7 @@ next_sg: count, rw); kunmap_atomic(buf); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { buf = page_address(page); consumed = ap->ops->sff_data_xfer(dev, buf + offset, -- cgit v1.2.3 From 69ef71ed869e51169342a0d6d957fe601f885cec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:16 -0500 Subject: ide: Do not disable interrupts for PREEMPT-RT Use the local_irq_*_nort variants. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- drivers/ide/alim15x3.c | 4 ++-- drivers/ide/hpt366.c | 4 ++-- drivers/ide/ide-io-std.c | 8 ++++---- drivers/ide/ide-io.c | 2 +- drivers/ide/ide-iops.c | 4 ++-- drivers/ide/ide-probe.c | 4 ++-- drivers/ide/ide-taskfile.c | 6 +++--- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index 36f76e28a0bf..394f142f90c7 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev) isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL); - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision < 0xC2) { /* @@ -325,7 +325,7 @@ out: } pci_dev_put(north); pci_dev_put(isa_dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index 696b6c1ec940..0d0a96629b73 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -1241,7 +1241,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif, dma_old = inb(base + 2); - local_irq_save(flags); + local_irq_save_nort(flags); dma_new = dma_old; pci_read_config_byte(dev, hwif->channel ? 
0x4b : 0x43, &masterdma); @@ -1252,7 +1252,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif, if (dma_new != dma_old) outb(dma_new, base + 2); - local_irq_restore(flags); + local_irq_restore_nort(flags); printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n", hwif->name, base, base + 7); diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 19763977568c..4169433faab5 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned long uninitialized_var(flags); if ((io_32bit & 2) && !mmio) { - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(io_ports->nsect_addr); } @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, insl(data_addr, buf, words); if ((io_32bit & 2) && !mmio) - local_irq_restore(flags); + local_irq_restore_nort(flags); if (((len + 1) & 3) < 2) return; @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, unsigned long uninitialized_var(flags); if ((io_32bit & 2) && !mmio) { - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(io_ports->nsect_addr); } @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, outsl(data_addr, buf, words); if ((io_32bit & 2) && !mmio) - local_irq_restore(flags); + local_irq_restore_nort(flags); if (((len + 1) & 3) < 2) return; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 177db6d5b2f5..079ae6bebf18 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data) /* disable_irq_nosync ?? */ disable_irq(hwif->irq); /* local CPU only, as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwif->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 376f2dc410c5..f014dd1b73dc 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad, if ((stat & ATA_BUSY) == 0) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *rstat = stat; return -EBUSY; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it again. 
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index a3d3b1733c49..3eff5828ecfc 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id) int bswap = 1; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* read 512 bytes of id info */ hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE); - local_irq_restore(flags); + local_irq_restore_nort(flags); drive->dev_flags |= IDE_DFLAG_ID_READ; #ifdef DEBUG diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index dabb88b1cbec..2cecea72520a 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, page_is_high = PageHighMem(page); if (page_is_high) - local_irq_save(flags); + local_irq_save_nort(flags); buf = kmap_atomic(page) + offset; @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, kunmap_atomic(buf); if (page_is_high) - local_irq_restore(flags); + local_irq_restore_nort(flags); len -= nr_bytes; } @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, } if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0) - local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE); -- cgit v1.2.3 From b541a1f547dc89311cffb4462e3b5457ac398e0d Mon Sep 17 00:00:00 2001 From: Sven-Thorsten Dietrich Date: Fri, 3 Jul 2009 08:30:35 -0500 Subject: infiniband: Mellanox IB driver patch use _nort() primitives Fixes in_atomic stack-dump, when Mellanox module is loaded into the RT Kernel. Michael S. Tsirkin sayeth: "Basically, if you just make spin_lock_irqsave (and spin_lock_irq) not disable interrupts for non-raw spinlocks, I think all of infiniband will be fine without changes." Signed-off-by: Sven-Thorsten Dietrich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ffb83b5f7e80..b0f4e42a0f96 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -796,7 +796,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) ipoib_mcast_stop_thread(dev, 0); - local_irq_save(flags); + local_irq_save_nort(flags); netif_addr_lock(dev); spin_lock(&priv->lock); @@ -878,7 +878,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) spin_unlock(&priv->lock); netif_addr_unlock(dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* We have to cancel outside of the spinlock */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { -- cgit v1.2.3 From c2c630e19b676561f6b7b7fcdc842c0ac7535ef2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:16 -0500 Subject: input: gameport: Do not disable interrupts on PREEMPT_RT Use the _nort() primitives. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- drivers/input/gameport/gameport.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index e29c04e2aff4..412ab00c7534 100644 --- a/drivers/input/gameport/gameport.c +++ b/drivers/input/gameport/gameport.c @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport) tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); GET_TIME(t1); for (t = 0; t < 50; t++) gameport_read(gameport); GET_TIME(t2); GET_TIME(t3); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; } @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport) tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); rdtscl(t1); for (t = 0; t < 50; t++) gameport_read(gameport); rdtscl(t2); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; } -- cgit v1.2.3 From c069db6319fa0eb9824f069b82fb304a4dc2c180 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 23:06:05 +0200 Subject: core: Do not disable interrupts on RT in kernel/users.c Use the local_irq_*_nort variants to reduce latencies in RT. The code is serialized by the locks. No need to disable interrupts. Signed-off-by: Thomas Gleixner --- kernel/user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/user.c b/kernel/user.c index 2d09940c9632..ea7d0ec75f81 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -158,11 +158,11 @@ void free_uid(struct user_struct *up) if (!up) return; - local_irq_save(flags); + local_irq_save_nort(flags); if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) free_user(up, flags); else - local_irq_restore(flags); + local_irq_restore_nort(flags); } struct user_struct *alloc_uid(kuid_t uid) -- cgit v1.2.3 From bd32e240593045fbb37d7245f75606391a0a9def Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:44:33 -0500 Subject: core: Do not disable interrupts on RT in res_counter.c Frederic Weisbecker reported this warning: [ 45.228562] BUG: sleeping function called from invalid context at kernel/rtmutex.c:683 [ 45.228571] in_atomic(): 0, irqs_disabled(): 1, pid: 4290, name: ntpdate [ 45.228576] INFO: lockdep is turned off. [ 45.228580] irq event stamp: 0 [ 45.228583] hardirqs last enabled at (0): [<(null)>] (null) [ 45.228589] hardirqs last disabled at (0): [] copy_process+0x68d/0x1500 [ 45.228602] softirqs last enabled at (0): [] copy_process+0x68d/0x1500 [ 45.228609] softirqs last disabled at (0): [<(null)>] (null) [ 45.228617] Pid: 4290, comm: ntpdate Tainted: G W 2.6.29-rc4-rt1-tip #1 [ 45.228622] Call Trace: [ 45.228632] [] ? print_irqtrace_events+0xd0/0xe0 [ 45.228639] [] __might_sleep+0x113/0x130 [ 45.228646] [] rt_spin_lock+0xa1/0xb0 [ 45.228653] [] res_counter_charge+0x5d/0x130 [ 45.228660] [] __mem_cgroup_try_charge+0x7f/0x180 [ 45.228667] [] mem_cgroup_charge_common+0x57/0x90 [ 45.228674] [] ? ftrace_call+0x5/0x2b [ 45.228680] [] mem_cgroup_newpage_charge+0x5d/0x60 [ 45.228688] [] __do_fault+0x29e/0x4c0 [ 45.228694] [] ? rt_spin_unlock+0x23/0x80 [ 45.228700] [] handle_mm_fault+0x205/0x890 [ 45.228707] [] ? ftrace_call+0x5/0x2b [ 45.228714] [] do_page_fault+0x11e/0x2a0 [ 45.228720] [] page_fault+0x25/0x30 [ 45.228727] [] ? __clear_user+0x3d/0x70 [ 45.228733] [] ? 
__clear_user+0x21/0x70 The reason is the raw IRQ flag use of kernel/res_counter.c. The irq flags tricks there seem a bit pointless: it cannot protect the c->parent linkage because local_irq_save() is only per CPU. So replace it with _nort(). This code needs a second look. Reported-by: Frederic Weisbecker Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/res_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/res_counter.c b/kernel/res_counter.c index e791130f85a7..75715182b1ac 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -59,7 +59,7 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val, r = ret = 0; *limit_fail_at = NULL; - local_irq_save(flags); + local_irq_save_nort(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); r = res_counter_charge_locked(c, val, force); @@ -79,7 +79,7 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val, spin_unlock(&u->lock); } } - local_irq_restore(flags); + local_irq_restore_nort(flags); return ret; } @@ -104,7 +104,7 @@ u64 res_counter_uncharge_until(struct res_counter *counter, struct res_counter *c; u64 ret = 0; - local_irq_save(flags); + local_irq_save_nort(flags); for (c = counter; c != top; c = c->parent) { u64 r; spin_lock(&c->lock); @@ -113,7 +113,7 @@ u64 res_counter_uncharge_until(struct res_counter *counter, ret = r; spin_unlock(&c->lock); } - local_irq_restore(flags); + local_irq_restore_nort(flags); return ret; } -- cgit v1.2.3 From f0876507eb9946394656923dd396ae6858e6c119 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 8 Nov 2013 17:34:54 +0100 Subject: usb: use _nort in giveback Since commit 94dfd7ed ("USB: HCD: support giveback of URB in tasklet context") I see |BUG: sleeping function called from invalid context at kernel/rtmutex.c:673 |in_atomic(): 0, irqs_disabled(): 1, pid: 109, name: irq/11-uhci_hcd |no locks held by irq/11-uhci_hcd/109. |irq event stamp: 440 |hardirqs last enabled at (439): [] _raw_spin_unlock_irqrestore+0x75/0x90 |hardirqs last disabled at (440): [] __usb_hcd_giveback_urb+0x46/0xc0 |softirqs last enabled at (0): [] copy_process.part.52+0x511/0x1510 |softirqs last disabled at (0): [< (null)>] (null) |CPU: 3 PID: 109 Comm: irq/11-uhci_hcd Not tainted 3.12.0-rt0-rc1+ #13 |Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 | 0000000000000000 ffff8800db9ffbe0 ffffffff8169f064 0000000000000000 | ffff8800db9ffbf8 ffffffff810b2122 ffff88020f03e888 ffff8800db9ffc18 | ffffffff816a6944 ffffffff810b5748 ffff88020f03c000 ffff8800db9ffc50 |Call Trace: | [] dump_stack+0x4e/0x8f | [] __might_sleep+0x112/0x190 | [] rt_spin_lock+0x24/0x60 | [] hid_ctrl+0x3b/0x190 | [] __usb_hcd_giveback_urb+0x4f/0xc0 | [] usb_hcd_giveback_urb+0x3f/0x140 | [] uhci_giveback_urb+0xaf/0x280 | [] uhci_scan_schedule+0x47a/0xb10 | [] uhci_irq+0xa6/0x1a0 | [] usb_hcd_irq+0x28/0x40 | [] irq_forced_thread_fn+0x23/0x70 | [] irq_thread+0x10f/0x150 | [] kthread+0xcd/0xe0 | [] ret_from_fork+0x7c/0xb0 on -RT we run threaded so no need to disable interrupts. 
Signed-off-by: Sebastian Andrzej Siewior --- drivers/usb/core/hcd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 0009fc847eee..e4d9fe74ff4e 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1681,9 +1681,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb) * and no one may trigger the above deadlock situation when * running complete() in tasklet. */ - local_irq_save(flags); + local_irq_save_nort(flags); urb->complete(urb); - local_irq_restore(flags); + local_irq_restore_nort(flags); usb_anchor_resume_wakeups(anchor); atomic_dec(&urb->use_count); -- cgit v1.2.3 From 8f450dafaeb1fa0eae2f9d7f717f1b6605de1129 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:34 -0500 Subject: mm: scatterlist dont disable irqs on RT Signed-off-by: Thomas Gleixner --- lib/scatterlist.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index c9f2e8c6ccc9..f6d1f8899dca 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -592,7 +592,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) flush_kernel_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { - WARN_ON_ONCE(preemptible()); + WARN_ON_ONCE(!pagefault_disabled()); kunmap_atomic(miter->addr); } else kunmap(miter->page); @@ -637,7 +637,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, if (!sg_miter_skip(&miter, skip)) return false; - local_irq_save(flags); + local_irq_save_nort(flags); while (sg_miter_next(&miter) && offset < buflen) { unsigned int len; @@ -654,7 +654,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, sg_miter_stop(&miter); - local_irq_restore(flags); + local_irq_restore_nort(flags); return offset; } -- cgit v1.2.3 From 483e42da2065cdae3708c9cb24d63a737d71a145 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 29 Jan 2015 17:19:44 +0100 Subject: mm/workingset: do not protect workingset_shadow_nodes with irq off workingset_shadow_nodes is protected by local_irq_disable(). Some users use spin_lock_irq(). Replace the irq/on with a local_lock(). Rename workingset_shadow_nodes so I catch users of it which will be introduced later. 
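Reduced to a sketch, the pattern is the one below (shadow_node_track() is a hypothetical wrapper; DEFINE_LOCAL_IRQ_LOCK() and local_lock()/local_unlock() are the RT patch set's local-lock primitives used throughout the diff). On a !RT kernel the local lock still shuts out interrupts or preemption as before, depending on the variant, while on PREEMPT_RT it becomes a per-CPU sleeping lock, keeping the shadow-node accounting preemptible:

	DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);

	static void shadow_node_track(struct radix_tree_node *node)
	{
		local_lock(workingset_shadow_lock);
		list_lru_add(&__workingset_shadow_nodes, &node->private_list);
		local_unlock(workingset_shadow_lock);
	}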
Signed-off-by: Sebastian Andrzej Siewior --- include/linux/swap.h | 4 +++- mm/filemap.c | 11 ++++++++--- mm/truncate.c | 7 +++++-- mm/workingset.c | 23 ++++++++++++----------- 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 37a585beef5c..00aa9b4a8273 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -11,6 +11,7 @@ #include #include #include +#include #include struct notifier_block; @@ -260,7 +261,8 @@ struct swap_info_struct { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); -extern struct list_lru workingset_shadow_nodes; +extern struct list_lru __workingset_shadow_nodes; +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock); static inline unsigned int workingset_node_pages(struct radix_tree_node *node) { diff --git a/mm/filemap.c b/mm/filemap.c index 37beab98b416..9159e0454fba 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -168,7 +168,9 @@ static void page_cache_tree_delete(struct address_space *mapping, if (!workingset_node_pages(node) && list_empty(&node->private_list)) { node->private_data = mapping; - list_lru_add(&workingset_shadow_nodes, &node->private_list); + local_lock(workingset_shadow_lock); + list_lru_add(&__workingset_shadow_nodes, &node->private_list); + local_unlock(workingset_shadow_lock); } } @@ -535,9 +537,12 @@ static int page_cache_tree_insert(struct address_space *mapping, * node->private_list is protected by * mapping->tree_lock. */ - if (!list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, + if (!list_empty(&node->private_list)) { + local_lock(workingset_shadow_lock); + list_lru_del(&__workingset_shadow_nodes, &node->private_list); + local_unlock(workingset_shadow_lock); + } } return 0; } diff --git a/mm/truncate.c b/mm/truncate.c index f1e4d6052369..a06369d67fa4 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -56,8 +56,11 @@ static void clear_exceptional_entry(struct address_space *mapping, * protected by mapping->tree_lock. */ if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, &node->private_list); + !list_empty(&node->private_list)) { + local_lock(workingset_shadow_lock); + list_lru_del(&__workingset_shadow_nodes, &node->private_list); + local_unlock(workingset_shadow_lock); + } __radix_tree_delete_node(&mapping->page_tree, node); unlock: spin_unlock_irq(&mapping->tree_lock); diff --git a/mm/workingset.c b/mm/workingset.c index f7216fa7da27..0decb9ca8db8 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -264,7 +264,8 @@ void workingset_activation(struct page *page) * point where they would still be useful. 
*/ -struct list_lru workingset_shadow_nodes; +struct list_lru __workingset_shadow_nodes; +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock); static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) @@ -274,9 +275,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, unsigned long pages; /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ - local_irq_disable(); - shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); - local_irq_enable(); + local_lock_irq(workingset_shadow_lock); + shadow_nodes = list_lru_count_node(&__workingset_shadow_nodes, sc->nid); + local_unlock_irq(workingset_shadow_lock); pages = node_present_pages(sc->nid); /* @@ -362,9 +363,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, spin_unlock(&mapping->tree_lock); ret = LRU_REMOVED_RETRY; out: - local_irq_enable(); + local_unlock_irq(workingset_shadow_lock); cond_resched(); - local_irq_disable(); + local_lock_irq(workingset_shadow_lock); spin_lock(lru_lock); return ret; } @@ -375,10 +376,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, unsigned long ret; /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ - local_irq_disable(); - ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, + local_lock_irq(workingset_shadow_lock); + ret = list_lru_walk_node(&__workingset_shadow_nodes, sc->nid, shadow_lru_isolate, NULL, &sc->nr_to_scan); - local_irq_enable(); + local_unlock_irq(workingset_shadow_lock); return ret; } @@ -399,7 +400,7 @@ static int __init workingset_init(void) { int ret; - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key); if (ret) goto err; ret = register_shrinker(&workingset_shadow_shrinker); @@ -407,7 +408,7 @@ static int __init workingset_init(void) goto err_list_lru; return 0; err_list_lru: - list_lru_destroy(&workingset_shadow_nodes); + list_lru_destroy(&__workingset_shadow_nodes); err: return ret; } -- cgit v1.2.3 From 12b98357431e0694d51ee0dd0048167d47c2dda1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Jul 2011 08:07:08 +0200 Subject: signal-fix-up-rcu-wreckage.patch Signed-off-by: Thomas Gleixner --- kernel/signal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index c8ea69a738c7..237aec98fa7e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1375,12 +1375,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, * Disable interrupts early to avoid deadlocks. * See rcu_read_unlock() comment header for details. 
*/ - local_irq_save(*flags); + local_irq_save_nort(*flags); rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); if (unlikely(sighand == NULL)) { rcu_read_unlock(); - local_irq_restore(*flags); + local_irq_restore_nort(*flags); break; } @@ -1391,7 +1391,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, } spin_unlock(&sighand->siglock); rcu_read_unlock(); - local_irq_restore(*flags); + local_irq_restore_nort(*flags); } return sighand; -- cgit v1.2.3 From 3b76cd7d22bc00c2a031a6b1992512fa70889a05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Jul 2011 21:05:33 +0200 Subject: net-wireless-warn-nort.patch Signed-off-by: Thomas Gleixner --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 631d59f540d1..cfda615c6f78 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3360,7 +3360,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) struct ieee80211_supported_band *sband; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - WARN_ON_ONCE(softirq_count() == 0); + WARN_ON_ONCE_NONRT(softirq_count() == 0); if (WARN_ON(status->band >= IEEE80211_NUM_BANDS)) goto drop; -- cgit v1.2.3 From 50272ba524a84867838cccfbed149f0ca8bb8ab2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Mar 2011 09:18:52 +0100 Subject: buffer_head: Replace bh_uptodate_lock for -rt Wrap the bit_spin_lock calls into a separate inline and add the RT replacements with a real spinlock. Signed-off-by: Thomas Gleixner --- fs/buffer.c | 21 +++++++-------------- fs/ntfs/aops.c | 10 +++------- include/linux/buffer_head.h | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 20805db2c987..8b9768093844 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -301,8 +301,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * decide that the page is now completely done. 
*/ first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + flags = bh_uptodate_lock_irqsave(first); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -315,8 +314,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + bh_uptodate_unlock_irqrestore(first, flags); /* * If none of the buffers had errors and they are all @@ -328,9 +326,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - return; + bh_uptodate_unlock_irqrestore(first, flags); } /* @@ -358,8 +354,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + flags = bh_uptodate_lock_irqsave(first); clear_buffer_async_write(bh); unlock_buffer(bh); @@ -371,15 +366,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + bh_uptodate_unlock_irqrestore(first, flags); end_page_writeback(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - return; + bh_uptodate_unlock_irqrestore(first, flags); } EXPORT_SYMBOL(end_buffer_async_write); @@ -3325,6 +3317,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); + buffer_head_init_locks(ret); preempt_disable(); __this_cpu_inc(bh_accounting.nr); recalc_bh_state(); diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 7521e11db728..39b08f5b8c76 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) "0x%llx.", (unsigned long long)bh->b_blocknr); } first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + flags = bh_uptodate_lock_irqsave(first); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); + bh_uptodate_unlock_irqrestore(first, flags); /* * If none of the buffers had errors then we can set the page uptodate, * but we first have to perform the post read mst fixups, if the @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) unlock_page(page); return; still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - return; + bh_uptodate_unlock_irqrestore(first, flags); } /** diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 73b45225a7ca..e6503a3de9eb 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -75,8 +75,42 @@ struct buffer_head { struct address_space *b_assoc_map; /* mapping this buffer is associated with */ atomic_t b_count; /* users using this buffer_head */ +#ifdef CONFIG_PREEMPT_RT_BASE + spinlock_t b_uptodate_lock; +#endif }; +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh) +{ + unsigned long flags; + +#ifndef 
CONFIG_PREEMPT_RT_BASE + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &bh->b_state); +#else + spin_lock_irqsave(&bh->b_uptodate_lock, flags); +#endif + return flags; +} + +static inline void +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags) +{ +#ifndef CONFIG_PREEMPT_RT_BASE + bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state); + local_irq_restore(flags); +#else + spin_unlock_irqrestore(&bh->b_uptodate_lock, flags); +#endif +} + +static inline void buffer_head_init_locks(struct buffer_head *bh) +{ +#ifdef CONFIG_PREEMPT_RT_BASE + spin_lock_init(&bh->b_uptodate_lock); +#endif +} + /* * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() * and buffer_foo() functions. -- cgit v1.2.3 From 31bfe59851db4a29f02366cc1313fc03aaf22947 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Mar 2011 10:11:25 +0100 Subject: fs: jbd/jbd2: Make state lock and journal head lock rt safe bit_spin_locks break under RT. Based on a previous patch from Steven Rostedt Signed-off-by: Thomas Gleixner -- include/linux/buffer_head.h | 10 ++++++++++ include/linux/jbd_common.h | 24 ++++++++++++++++++++++++ 2 files changed, 34 insertions(+) --- include/linux/buffer_head.h | 10 ++++++++++ include/linux/jbd_common.h | 24 ++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e6503a3de9eb..ea72f52032d7 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -77,6 +77,11 @@ struct buffer_head { atomic_t b_count; /* users using this buffer_head */ #ifdef CONFIG_PREEMPT_RT_BASE spinlock_t b_uptodate_lock; +#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || \ + defined(CONFIG_JBD2) || defined(CONFIG_JBD2_MODULE) + spinlock_t b_state_lock; + spinlock_t b_journal_head_lock; +#endif #endif }; @@ -108,6 +113,11 @@ static inline void buffer_head_init_locks(struct buffer_head *bh) { #ifdef CONFIG_PREEMPT_RT_BASE spin_lock_init(&bh->b_uptodate_lock); +#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || \ + defined(CONFIG_JBD2) || defined(CONFIG_JBD2_MODULE) + spin_lock_init(&bh->b_state_lock); + spin_lock_init(&bh->b_journal_head_lock); +#endif #endif } diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h index 3dc53432355f..a90a6f5ca899 100644 --- a/include/linux/jbd_common.h +++ b/include/linux/jbd_common.h @@ -15,32 +15,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) static inline void jbd_lock_bh_state(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_lock(BH_State, &bh->b_state); +#else + spin_lock(&bh->b_state_lock); +#endif } static inline int jbd_trylock_bh_state(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE return bit_spin_trylock(BH_State, &bh->b_state); +#else + return spin_trylock(&bh->b_state_lock); +#endif } static inline int jbd_is_locked_bh_state(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE return bit_spin_is_locked(BH_State, &bh->b_state); +#else + return spin_is_locked(&bh->b_state_lock); +#endif } static inline void jbd_unlock_bh_state(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_unlock(BH_State, &bh->b_state); +#else + spin_unlock(&bh->b_state_lock); +#endif } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_lock(BH_JournalHead, &bh->b_state); +#else + spin_lock(&bh->b_journal_head_lock); +#endif } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { +#ifndef 
CONFIG_PREEMPT_RT_BASE bit_spin_unlock(BH_JournalHead, &bh->b_state); +#else + spin_unlock(&bh->b_journal_head_lock); +#endif } #endif -- cgit v1.2.3 From a8b6ade50bb5194bcb164e4bf33967beb0437b4b Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 21 Jun 2013 15:07:25 -0400 Subject: list_bl.h: make list head locking RT safe As per changes in include/linux/jbd_common.h for avoiding the bit_spin_locks on RT ("fs: jbd/jbd2: Make state lock and journal head lock rt safe") we do the same thing here. We use the non atomic __set_bit and __clear_bit inside the scope of the lock to preserve the ability of the existing LIST_DEBUG code to use the zero'th bit in the sanity checks. As a bit spinlock, we had no lockdep visibility into the usage of the list head locking. Now, if we were to implement it as a standard non-raw spinlock, we would see: BUG: sleeping function called from invalid context at kernel/rtmutex.c:658 in_atomic(): 1, irqs_disabled(): 0, pid: 122, name: udevd 5 locks held by udevd/122: #0: (&sb->s_type->i_mutex_key#7/1){+.+.+.}, at: [] lock_rename+0xe8/0xf0 #1: (rename_lock){+.+...}, at: [] d_move+0x2c/0x60 #2: (&dentry->d_lock){+.+...}, at: [] dentry_lock_for_move+0xf3/0x130 #3: (&dentry->d_lock/2){+.+...}, at: [] dentry_lock_for_move+0xc4/0x130 #4: (&dentry->d_lock/3){+.+...}, at: [] dentry_lock_for_move+0xd7/0x130 Pid: 122, comm: udevd Not tainted 3.4.47-rt62 #7 Call Trace: [] __might_sleep+0x134/0x1f0 [] rt_spin_lock+0x24/0x60 [] __d_shrink+0x5c/0xa0 [] __d_drop+0x1d/0x40 [] __d_move+0x8e/0x320 [] d_move+0x3e/0x60 [] vfs_rename+0x198/0x4c0 [] sys_renameat+0x213/0x240 [] ? _raw_spin_unlock+0x35/0x60 [] ? do_page_fault+0x1ec/0x4b0 [] ? retint_swapgs+0xe/0x13 [] ? trace_hardirqs_on_thunk+0x3a/0x3f [] sys_rename+0x1b/0x20 [] system_call_fastpath+0x1a/0x1f Since we are only taking the lock during short lived list operations, lets assume for now that it being raw won't be a significant latency concern. 
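For callers of the API nothing changes; a minimal usage sketch of the lock/unlock pair around a list update (illustrative only, with made-up names, not part of this patch):

#include <linux/list_bl.h>

static struct hlist_bl_head demo_head;		/* example object only */

static void demo_init(void)
{
	/* with this patch INIT_HLIST_BL_HEAD() also initializes the raw spinlock on RT */
	INIT_HLIST_BL_HEAD(&demo_head);
}

static void demo_add(struct hlist_bl_node *n)
{
	hlist_bl_lock(&demo_head);	/* bit spinlock on !RT; raw spinlock plus __set_bit(0) on RT */
	hlist_bl_add_head(n, &demo_head);
	hlist_bl_unlock(&demo_head);
}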
Cc: stable-rt@vger.kernel.org Signed-off-by: Paul Gortmaker Signed-off-by: Sebastian Andrzej Siewior --- include/linux/list_bl.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index 2eb88556c5c5..d8876a0cf036 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -2,6 +2,7 @@ #define _LINUX_LIST_BL_H #include +#include #include /* @@ -32,13 +33,22 @@ struct hlist_bl_head { struct hlist_bl_node *first; +#ifdef CONFIG_PREEMPT_RT_BASE + raw_spinlock_t lock; +#endif }; struct hlist_bl_node { struct hlist_bl_node *next, **pprev; }; -#define INIT_HLIST_BL_HEAD(ptr) \ - ((ptr)->first = NULL) + +static inline void INIT_HLIST_BL_HEAD(struct hlist_bl_head *h) +{ + h->first = NULL; +#ifdef CONFIG_PREEMPT_RT_BASE + raw_spin_lock_init(&h->lock); +#endif +} static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) { @@ -117,12 +127,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n) static inline void hlist_bl_lock(struct hlist_bl_head *b) { +#ifndef CONFIG_PREEMPT_RT_BASE bit_spin_lock(0, (unsigned long *)b); +#else + raw_spin_lock(&b->lock); +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + __set_bit(0, (unsigned long *)b); +#endif +#endif } static inline void hlist_bl_unlock(struct hlist_bl_head *b) { +#ifndef CONFIG_PREEMPT_RT_BASE __bit_spin_unlock(0, (unsigned long *)b); +#else +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + __clear_bit(0, (unsigned long *)b); +#endif + raw_spin_unlock(&b->lock); +#endif } static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) -- cgit v1.2.3 From 0588971072d4574ae70f391bc874d6da73fc8dfa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Mar 2011 10:22:04 +0100 Subject: genirq: Disable DEBUG_SHIRQ for rt Signed-off-by: Thomas Gleixner --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4e35a5d767ed..7c1ae17d8dad 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -639,7 +639,7 @@ endmenu # "Memory Debugging" config DEBUG_SHIRQ bool "Debug shared IRQ handlers" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !PREEMPT_RT_BASE help Enable this to generate a spurious interrupt as soon as a shared interrupt handler is registered, and just before one is deregistered. 
-- cgit v1.2.3 From 29bfe1259139d2e81361af3823199eff69a42b4f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:57 -0500 Subject: genirq: disable irqpoll on -rt Creates long latencies for no value Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index e2514b0e439e..903a69c45689 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -444,6 +444,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); static int __init irqfixup_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT_BASE + pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n"); + return 1; +#endif irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @@ -456,6 +460,10 @@ module_param(irqfixup, int, 0644); static int __init irqpoll_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT_BASE + pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n"); + return 1; +#endif irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); -- cgit v1.2.3 From e3d46ed2a246abed5422aa2165ce67cf0910516f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 3 Apr 2011 11:57:29 +0200 Subject: genirq-force-threading.patch Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 6 +++++- kernel/irq/manage.c | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 265b87a48a73..eb28fe605162 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -358,9 +358,13 @@ static inline int disable_irq_wake(unsigned int irq) #ifdef CONFIG_IRQ_FORCED_THREADING +# ifndef CONFIG_PREEMPT_RT_BASE extern bool force_irqthreads; +# else +# define force_irqthreads (true) +# endif #else -#define force_irqthreads (0) +#define force_irqthreads (false) #endif #ifndef __ARCH_SET_SOFTIRQ_PENDING diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a9104b4608b..8bff377ba8ef 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -22,6 +22,7 @@ #include "internals.h" #ifdef CONFIG_IRQ_FORCED_THREADING +# ifndef CONFIG_PREEMPT_RT_BASE __read_mostly bool force_irqthreads; static int __init setup_forced_irqthreads(char *arg) @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg) return 0; } early_param("threadirqs", setup_forced_irqthreads); +# endif #endif static void __synchronize_hardirq(struct irq_desc *desc) -- cgit v1.2.3 From ce55405e35618cbfa1f3924268d2fa88e9476a96 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 21 Aug 2013 17:48:46 +0200 Subject: genirq: do not invoke the affinity callback via a workqueue Joe Korty reported, that __irq_set_affinity_locked() schedules a workqueue while holding a rawlock which results in a might_sleep() warning. This patch moves the invokation into a process context so that we only wakeup() a process while holding the lock. 
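The building block that makes this safe is that waking an already created kthread never sleeps, so it may be done while holding a raw spinlock; a minimal sketch of the pattern with made-up names (the actual change follows):

#include <linux/kthread.h>
#include <linux/sched.h>

static struct task_struct *demo_helper;	/* assumed to be created at init time via kthread_run() */

/* May be called from atomic context, e.g. under a raw spinlock. */
static void demo_kick_helper(void)
{
	wake_up_process(demo_helper);	/* a plain task wakeup, never blocks */
}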
Signed-off-by: Sebastian Andrzej Siewior --- include/linux/interrupt.h | 1 + kernel/irq/manage.c | 79 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 77 insertions(+), 3 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index eb28fe605162..536223286563 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -210,6 +210,7 @@ struct irq_affinity_notify { unsigned int irq; struct kref kref; struct work_struct work; + struct list_head list; void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask); void (*release)(struct kref *ref); }; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8bff377ba8ef..3f79018535bf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -175,6 +175,62 @@ static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } #endif +#ifdef CONFIG_PREEMPT_RT_FULL +static void _irq_affinity_notify(struct irq_affinity_notify *notify); +static struct task_struct *set_affinity_helper; +static LIST_HEAD(affinity_list); +static DEFINE_RAW_SPINLOCK(affinity_list_lock); + +static int set_affinity_thread(void *unused) +{ + while (1) { + struct irq_affinity_notify *notify; + int empty; + + set_current_state(TASK_INTERRUPTIBLE); + + raw_spin_lock_irq(&affinity_list_lock); + empty = list_empty(&affinity_list); + raw_spin_unlock_irq(&affinity_list_lock); + + if (empty) + schedule(); + if (kthread_should_stop()) + break; + set_current_state(TASK_RUNNING); +try_next: + notify = NULL; + + raw_spin_lock_irq(&affinity_list_lock); + if (!list_empty(&affinity_list)) { + notify = list_first_entry(&affinity_list, + struct irq_affinity_notify, list); + list_del_init(¬ify->list); + } + raw_spin_unlock_irq(&affinity_list_lock); + + if (!notify) + continue; + _irq_affinity_notify(notify); + goto try_next; + } + return 0; +} + +static void init_helper_thread(void) +{ + if (set_affinity_helper) + return; + set_affinity_helper = kthread_run(set_affinity_thread, NULL, + "affinity-cb"); + WARN_ON(IS_ERR(set_affinity_helper)); +} +#else + +static inline void init_helper_thread(void) { } + +#endif + int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -213,7 +269,17 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); + +#ifdef CONFIG_PREEMPT_RT_FULL + raw_spin_lock(&affinity_list_lock); + if (list_empty(&desc->affinity_notify->list)) + list_add_tail(&affinity_list, + &desc->affinity_notify->list); + raw_spin_unlock(&affinity_list_lock); + wake_up_process(set_affinity_helper); +#else schedule_work(&desc->affinity_notify->work); +#endif } irqd_set(data, IRQD_AFFINITY_SET); @@ -248,10 +314,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); -static void irq_affinity_notify(struct work_struct *work) +static void _irq_affinity_notify(struct irq_affinity_notify *notify) { - struct irq_affinity_notify *notify = - container_of(work, struct irq_affinity_notify, work); struct irq_desc *desc = irq_to_desc(notify->irq); cpumask_var_t cpumask; unsigned long flags; @@ -273,6 +337,13 @@ out: kref_put(¬ify->kref, notify->release); } +static void irq_affinity_notify(struct work_struct *work) +{ + struct irq_affinity_notify *notify = + container_of(work, struct irq_affinity_notify, work); + _irq_affinity_notify(notify); +} + /** * irq_set_affinity_notifier - control notification of IRQ affinity changes * @irq: 
Interrupt for which to enable/disable notification @@ -302,6 +373,8 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) notify->irq = irq; kref_init(¬ify->kref); INIT_WORK(¬ify->work, irq_affinity_notify); + INIT_LIST_HEAD(¬ify->list); + init_helper_thread(); } raw_spin_lock_irqsave(&desc->lock, flags); -- cgit v1.2.3 From 2df9495c8edd043cf867cade084dbb1740782519 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 20 Jun 2009 11:36:54 +0200 Subject: drivers/net: fix livelock issues Preempt-RT runs into a live lock issue with the NETDEV_TX_LOCKED micro optimization. The reason is that the softirq thread is rescheduling itself on that return value. Depending on priorities it starts to monoplize the CPU and livelock on UP systems. Remove it. Signed-off-by: Thomas Gleixner --- drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 6 +----- drivers/net/ethernet/atheros/atl1e/atl1e_main.c | 3 +-- drivers/net/ethernet/chelsio/cxgb/sge.c | 3 +-- drivers/net/ethernet/neterion/s2io.c | 7 +------ drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c | 6 ++---- drivers/net/ethernet/tehuti/tehuti.c | 9 ++------- drivers/net/rionet.c | 6 +----- 7 files changed, 9 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c index 72fb86b9aa24..6f05c33d1589 100644 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c @@ -2213,11 +2213,7 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb, } tpd_req = atl1c_cal_tpd_req(skb); - if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) { - if (netif_msg_pktdata(adapter)) - dev_info(&adapter->pdev->dev, "tx locked\n"); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&adapter->tx_lock, flags); if (atl1c_tpd_avail(adapter, type) < tpd_req) { /* no enough descriptor, just stop queue */ diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index 2326579f9454..a042658ff8af 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -1880,8 +1880,7 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb, return NETDEV_TX_OK; } tpd_req = atl1e_cal_tdp_req(skb); - if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) - return NETDEV_TX_LOCKED; + spin_lock_irqsave(&adapter->tx_lock, flags); if (atl1e_tpd_avail(adapter) < tpd_req) { /* no enough descriptor, just stop queue */ diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c index 4c5879389003..1adb83c01741 100644 --- a/drivers/net/ethernet/chelsio/cxgb/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb/sge.c @@ -1663,8 +1663,7 @@ static int t1_sge_tx(struct sk_buff *skb, struct adapter *adapter, struct cmdQ *q = &sge->cmdQ[qid]; unsigned int credits, pidx, genbit, count, use_sched_skb = 0; - if (!spin_trylock(&q->lock)) - return NETDEV_TX_LOCKED; + spin_lock(&q->lock); reclaim_completed_tx(sge, q); diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index f5e4b820128b..631175b5df3b 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -4084,12 +4084,7 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev) [skb->priority & (MAX_TX_FIFOS - 1)]; fifo = &mac_control->fifos[queue]; - if (do_spin_lock) - spin_lock_irqsave(&fifo->tx_lock, flags); - else { - if 
(unlikely(!spin_trylock_irqsave(&fifo->tx_lock, flags))) - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&fifo->tx_lock, flags); if (sp->config.multiq) { if (__netif_subqueue_stopped(dev, fifo->fifo_no)) { diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c index 3b98b263bad0..ca4add749410 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c @@ -2137,10 +2137,8 @@ static int pch_gbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev) struct pch_gbe_tx_ring *tx_ring = adapter->tx_ring; unsigned long flags; - if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) { - /* Collision - tell upper layer to requeue */ - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&tx_ring->tx_lock, flags); + if (unlikely(!PCH_GBE_DESC_UNUSED(tx_ring))) { netif_stop_queue(netdev); spin_unlock_irqrestore(&tx_ring->tx_lock, flags); diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c index 6ab36d9ff2ab..6b7d989c4630 100644 --- a/drivers/net/ethernet/tehuti/tehuti.c +++ b/drivers/net/ethernet/tehuti/tehuti.c @@ -1629,13 +1629,8 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb, unsigned long flags; ENTER; - local_irq_save(flags); - if (!spin_trylock(&priv->tx_lock)) { - local_irq_restore(flags); - DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n", - BDX_DRV_NAME, ndev->name); - return NETDEV_TX_LOCKED; - } + + spin_lock_irqsave(&priv->tx_lock, flags); /* build tx descriptor */ BDX_ASSERT(f->m.wptr >= f->m.memsz); /* started with valid wptr */ diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index dac7a0d9bb46..e17f600ac9ac 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -174,11 +174,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev) unsigned long flags; int add_num = 1; - local_irq_save(flags); - if (!spin_trylock(&rnet->tx_lock)) { - local_irq_restore(flags); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&rnet->tx_lock, flags); if (is_multicast_ether_addr(eth->h_dest)) add_num = nets[rnet->mport->id].nact; -- cgit v1.2.3 From 59b5edddd17c989339002e5b212c906e87722929 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Jul 2009 08:30:00 -0500 Subject: drivers/net: vortex fix locking issues Argh, cut and paste wasn't enough... Use this patch instead. It needs an irq disable. But, believe it or not, on SMP this is actually better. If the irq is shared (as it is in Mark's case), we don't stop the irq of other devices from being handled on another CPU (unfortunately for Mark, he pinned all interrupts to one CPU). Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner drivers/net/ethernet/3com/3c59x.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) Signed-off-by: Ingo Molnar --- drivers/net/ethernet/3com/3c59x.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index 41095ebad97f..b0a0cb22aec4 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev) { struct vortex_private *vp = netdev_priv(dev); unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); (vp->full_bus_master_rx ? 
boomerang_interrupt:vortex_interrupt)(dev->irq,dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -1916,12 +1916,12 @@ static void vortex_tx_timeout(struct net_device *dev) * Block interrupts because vortex_interrupt does a bare spin_lock() */ unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev); else vortex_interrupt(dev->irq, dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } } -- cgit v1.2.3 From c862d5ace733a9474da3f6580d705e6f7b4762a2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 25 Mar 2014 18:34:20 +0100 Subject: net: gianfar: do not disable interrupts each per-queue lock is taken with spin_lock_irqsave() except in the case where all of them are taken for some kind of serialisation. As an optimisation local_irq_save() is used so that lock_tx_qs() and lock_rx_qs() can use just the spin_lock() variant instead. On RT local_irq_save() behaves differently so we use the nort() variant. Lockdep screems easily by "ethtool -K eth0 rx off tx off" What remains is missing lockdep annotation that makes lockdep think lock_tx_qs() may cause a dead lock. Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- drivers/net/ethernet/freescale/gianfar.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 4fdf0aa16978..bb7aaf0d9a3f 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1483,7 +1483,7 @@ static int gfar_suspend(struct device *dev) if (netif_running(ndev)) { - local_irq_save(flags); + local_irq_save_nort(flags); lock_tx_qs(priv); gfar_halt_nodisable(priv); @@ -1499,7 +1499,7 @@ static int gfar_suspend(struct device *dev) gfar_write(®s->maccfg1, tempval); unlock_tx_qs(priv); - local_irq_restore(flags); + local_irq_restore_nort(flags); disable_napi(priv); @@ -1541,7 +1541,7 @@ static int gfar_resume(struct device *dev) /* Disable Magic Packet mode, in case something * else woke us up. 
*/ - local_irq_save(flags); + local_irq_save_nort(flags); lock_tx_qs(priv); tempval = gfar_read(®s->maccfg2); @@ -1551,7 +1551,7 @@ static int gfar_resume(struct device *dev) gfar_start(priv); unlock_tx_qs(priv); - local_irq_restore(flags); + local_irq_restore_nort(flags); netif_device_attach(ndev); @@ -3307,14 +3307,14 @@ static irqreturn_t gfar_error(int irq, void *grp_id) dev->stats.tx_dropped++; atomic64_inc(&priv->extra_stats.tx_underrun); - local_irq_save(flags); + local_irq_save_nort(flags); lock_tx_qs(priv); /* Reactivate the Tx Queues */ gfar_write(®s->tstat, gfargrp->tstat); unlock_tx_qs(priv); - local_irq_restore(flags); + local_irq_restore_nort(flags); } netif_dbg(priv, tx_err, dev, "Transmit Error\n"); } -- cgit v1.2.3 From 5f6d8b54e43a89c7f23bfab3e205b6262057aa29 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jun 2011 18:40:37 +0200 Subject: local-var.patch Signed-off-by: Thomas Gleixner --- include/linux/percpu.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a3aa63e47637..bd65cb68baed 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -23,6 +23,11 @@ PERCPU_MODULE_RESERVE) #endif +#define get_local_var(var) get_cpu_var(var) +#define put_local_var(var) put_cpu_var(var) +#define get_local_ptr(var) get_cpu_ptr(var) +#define put_local_ptr(var) put_cpu_ptr(var) + /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10) -- cgit v1.2.3 From d37cfb6012ff5f9572247535342e05e73c34a98d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Jun 2011 09:03:47 +0200 Subject: rt-local-irq-lock.patch Signed-off-by: Thomas Gleixner --- include/linux/locallock.h | 254 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) create mode 100644 include/linux/locallock.h diff --git a/include/linux/locallock.h b/include/linux/locallock.h new file mode 100644 index 000000000000..e7bd8be71f8c --- /dev/null +++ b/include/linux/locallock.h @@ -0,0 +1,254 @@ +#ifndef _LINUX_LOCALLOCK_H +#define _LINUX_LOCALLOCK_H + +#include +#include + +#ifdef CONFIG_PREEMPT_RT_BASE + +#ifdef CONFIG_DEBUG_SPINLOCK +# define LL_WARN(cond) WARN_ON(cond) +#else +# define LL_WARN(cond) do { } while (0) +#endif + +/* + * per cpu lock based substitute for local_irq_*() + */ +struct local_irq_lock { + spinlock_t lock; + struct task_struct *owner; + int nestcnt; + unsigned long flags; +}; + +#define DEFINE_LOCAL_IRQ_LOCK(lvar) \ + DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \ + .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) } + +#define DECLARE_LOCAL_IRQ_LOCK(lvar) \ + DECLARE_PER_CPU(struct local_irq_lock, lvar) + +#define local_irq_lock_init(lvar) \ + do { \ + int __cpu; \ + for_each_possible_cpu(__cpu) \ + spin_lock_init(&per_cpu(lvar, __cpu).lock); \ + } while (0) + +static inline void __local_lock(struct local_irq_lock *lv) +{ + if (lv->owner != current) { + spin_lock(&lv->lock); + LL_WARN(lv->owner); + LL_WARN(lv->nestcnt); + lv->owner = current; + } + lv->nestcnt++; +} + +#define local_lock(lvar) \ + do { __local_lock(&get_local_var(lvar)); } while (0) + +static inline int __local_trylock(struct local_irq_lock *lv) +{ + if (lv->owner != current && spin_trylock(&lv->lock)) { + LL_WARN(lv->owner); + LL_WARN(lv->nestcnt); + lv->owner = current; + lv->nestcnt = 1; + return 1; + } + return 0; +} + +#define local_trylock(lvar) \ + ({ \ + int __locked; \ + __locked = __local_trylock(&get_local_var(lvar)); \ + if (!__locked) \ + 
put_local_var(lvar); \ + __locked; \ + }) + +static inline void __local_unlock(struct local_irq_lock *lv) +{ + LL_WARN(lv->nestcnt == 0); + LL_WARN(lv->owner != current); + if (--lv->nestcnt) + return; + + lv->owner = NULL; + spin_unlock(&lv->lock); +} + +#define local_unlock(lvar) \ + do { \ + __local_unlock(&__get_cpu_var(lvar)); \ + put_local_var(lvar); \ + } while (0) + +static inline void __local_lock_irq(struct local_irq_lock *lv) +{ + spin_lock_irqsave(&lv->lock, lv->flags); + LL_WARN(lv->owner); + LL_WARN(lv->nestcnt); + lv->owner = current; + lv->nestcnt = 1; +} + +#define local_lock_irq(lvar) \ + do { __local_lock_irq(&get_local_var(lvar)); } while (0) + +#define local_lock_irq_on(lvar, cpu) \ + do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0) + +static inline void __local_unlock_irq(struct local_irq_lock *lv) +{ + LL_WARN(!lv->nestcnt); + LL_WARN(lv->owner != current); + lv->owner = NULL; + lv->nestcnt = 0; + spin_unlock_irq(&lv->lock); +} + +#define local_unlock_irq(lvar) \ + do { \ + __local_unlock_irq(&__get_cpu_var(lvar)); \ + put_local_var(lvar); \ + } while (0) + +#define local_unlock_irq_on(lvar, cpu) \ + do { \ + __local_unlock_irq(&per_cpu(lvar, cpu)); \ + } while (0) + +static inline int __local_lock_irqsave(struct local_irq_lock *lv) +{ + if (lv->owner != current) { + __local_lock_irq(lv); + return 0; + } else { + lv->nestcnt++; + return 1; + } +} + +#define local_lock_irqsave(lvar, _flags) \ + do { \ + if (__local_lock_irqsave(&get_local_var(lvar))) \ + put_local_var(lvar); \ + _flags = __get_cpu_var(lvar).flags; \ + } while (0) + +#define local_lock_irqsave_on(lvar, _flags, cpu) \ + do { \ + __local_lock_irqsave(&per_cpu(lvar, cpu)); \ + _flags = per_cpu(lvar, cpu).flags; \ + } while (0) + +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv, + unsigned long flags) +{ + LL_WARN(!lv->nestcnt); + LL_WARN(lv->owner != current); + if (--lv->nestcnt) + return 0; + + lv->owner = NULL; + spin_unlock_irqrestore(&lv->lock, lv->flags); + return 1; +} + +#define local_unlock_irqrestore(lvar, flags) \ + do { \ + if (__local_unlock_irqrestore(&__get_cpu_var(lvar), flags)) \ + put_local_var(lvar); \ + } while (0) + +#define local_unlock_irqrestore_on(lvar, flags, cpu) \ + do { \ + __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \ + } while (0) + +#define local_spin_trylock_irq(lvar, lock) \ + ({ \ + int __locked; \ + local_lock_irq(lvar); \ + __locked = spin_trylock(lock); \ + if (!__locked) \ + local_unlock_irq(lvar); \ + __locked; \ + }) + +#define local_spin_lock_irq(lvar, lock) \ + do { \ + local_lock_irq(lvar); \ + spin_lock(lock); \ + } while (0) + +#define local_spin_unlock_irq(lvar, lock) \ + do { \ + spin_unlock(lock); \ + local_unlock_irq(lvar); \ + } while (0) + +#define local_spin_lock_irqsave(lvar, lock, flags) \ + do { \ + local_lock_irqsave(lvar, flags); \ + spin_lock(lock); \ + } while (0) + +#define local_spin_unlock_irqrestore(lvar, lock, flags) \ + do { \ + spin_unlock(lock); \ + local_unlock_irqrestore(lvar, flags); \ + } while (0) + +#define get_locked_var(lvar, var) \ + (*({ \ + local_lock(lvar); \ + &__get_cpu_var(var); \ + })) + +#define put_locked_var(lvar, var) local_unlock(lvar) + +#define local_lock_cpu(lvar) \ + ({ \ + local_lock(lvar); \ + smp_processor_id(); \ + }) + +#define local_unlock_cpu(lvar) local_unlock(lvar) + +#else /* PREEMPT_RT_BASE */ + +#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar +#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar + +static inline void 
local_irq_lock_init(int lvar) { } + +#define local_lock(lvar) preempt_disable() +#define local_unlock(lvar) preempt_enable() +#define local_lock_irq(lvar) local_irq_disable() +#define local_unlock_irq(lvar) local_irq_enable() +#define local_lock_irqsave(lvar, flags) local_irq_save(flags) +#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags) + +#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock) +#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock) +#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock) +#define local_spin_lock_irqsave(lvar, lock, flags) \ + spin_lock_irqsave(lock, flags) +#define local_spin_unlock_irqrestore(lvar, lock, flags) \ + spin_unlock_irqrestore(lock, flags) + +#define get_locked_var(lvar, var) get_cpu_var(var) +#define put_locked_var(lvar, var) put_cpu_var(var) + +#define local_lock_cpu(lvar) get_cpu() +#define local_unlock_cpu(lvar) put_cpu() + +#endif + +#endif -- cgit v1.2.3 From d52e2345208c87cb6b0f07318ff395578bc6d3c9 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Fri, 17 Jan 2014 20:41:58 +0100 Subject: use local spin_locks in local_lock Drop recursive call to migrate_disabel/enable for local_*lock* api reported by Steven Rostedt. local_lock will call migrate_disable via get_local_var - call tree is get_locked_var `-> local_lock(lvar) `-> __local_lock(&get_local_var(lvar)); `--> # define get_local_var(var) (*({ migrate_disable(); &__get_cpu_var(var); })) \ thus there should be no need to call migrate_disable/enable recursively in spin_try/lock/unlock. This patch addes a spin_trylock_local and replaces the migration disabling calls by the local calls. This patch is incomplete as it does not yet cover the _irq/_irqsave variants by local locks. This patch requires the API cleanup in kernel/softirq.c or it would break softirq_lock/unlock with respect to migration. Signed-off-by: Nicholas Mc Guire Signed-off-by: Sebastian Andrzej Siewior --- include/linux/locallock.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/include/linux/locallock.h b/include/linux/locallock.h index e7bd8be71f8c..eb338ce64aa9 100644 --- a/include/linux/locallock.h +++ b/include/linux/locallock.h @@ -36,10 +36,20 @@ struct local_irq_lock { spin_lock_init(&per_cpu(lvar, __cpu).lock); \ } while (0) +/* + * spin_lock|trylock|unlock_local flavour that does not migrate disable + * used for __local_lock|trylock|unlock where get_local_var/put_local_var + * already takes care of the migrate_disable/enable + * for CONFIG_PREEMPT_BASE map to the normal spin_* calls. 
+ */ +# define spin_lock_local(lock) spin_lock(lock) +# define spin_trylock_local(lock) spin_trylock(lock) +# define spin_unlock_local(lock) spin_unlock(lock) + static inline void __local_lock(struct local_irq_lock *lv) { if (lv->owner != current) { - spin_lock(&lv->lock); + spin_lock_local(&lv->lock); LL_WARN(lv->owner); LL_WARN(lv->nestcnt); lv->owner = current; @@ -52,7 +62,7 @@ static inline void __local_lock(struct local_irq_lock *lv) static inline int __local_trylock(struct local_irq_lock *lv) { - if (lv->owner != current && spin_trylock(&lv->lock)) { + if (lv->owner != current && spin_trylock_local(&lv->lock)) { LL_WARN(lv->owner); LL_WARN(lv->nestcnt); lv->owner = current; @@ -79,7 +89,7 @@ static inline void __local_unlock(struct local_irq_lock *lv) return; lv->owner = NULL; - spin_unlock(&lv->lock); + spin_unlock_local(&lv->lock); } #define local_unlock(lvar) \ @@ -211,7 +221,7 @@ static inline int __local_unlock_irqrestore(struct local_irq_lock *lv, &__get_cpu_var(var); \ })) -#define put_locked_var(lvar, var) local_unlock(lvar) +#define put_locked_var(lvar, var) local_unlock(lvar); #define local_lock_cpu(lvar) \ ({ \ -- cgit v1.2.3 From 94aa9b84f698a848cd2e0bdd9daef0946db3b1c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jun 2011 15:42:38 +0200 Subject: cpu-rt-variants.patch Signed-off-by: Thomas Gleixner --- include/linux/smp.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/smp.h b/include/linux/smp.h index 93dff5fff524..b8614d80936d 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -178,6 +178,14 @@ static inline void wake_up_all_idle_cpus(void) { } #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() +#ifndef CONFIG_PREEMPT_RT_FULL +# define get_cpu_light() get_cpu() +# define put_cpu_light() put_cpu() +#else +# define get_cpu_light() ({ migrate_disable(); smp_processor_id(); }) +# define put_cpu_light() migrate_enable() +#endif + /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: -- cgit v1.2.3 From 708a5e6a9147f7d38815c383147ffc853682ac5f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:37 -0500 Subject: mm: page_alloc: rt-friendly per-cpu pages rt-friendly per-cpu pages: convert the irqs-off per-cpu locking method into a preemptible, explicit-per-cpu-locks method. 
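The conversion pattern applied below is mechanical; a minimal sketch using the locallock API introduced earlier (hypothetical names, not kernel code):

#include <linux/locallock.h>

static DEFINE_LOCAL_IRQ_LOCK(demo_lock);

static void demo_touch_percpu_state(void)
{
	unsigned long flags;

	local_lock_irqsave(demo_lock, flags);		/* was: local_irq_save(flags) */
	/* update this CPU's private data here */
	local_unlock_irqrestore(demo_lock, flags);	/* was: local_irq_restore(flags) */
}

On a !PREEMPT_RT_BASE kernel this compiles down to the original local_irq_save()/local_irq_restore() pair, so non-RT behaviour is unchanged; on an RT kernel it takes the per-CPU lock defined above instead of hard-disabling interrupts.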
Contains fixes from: Peter Zijlstra Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- mm/page_alloc.c | 53 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e3f1634389c..fe0b95e8320d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -230,6 +231,18 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif +static DEFINE_LOCAL_IRQ_LOCK(pa_lock); + +#ifdef CONFIG_PREEMPT_RT_BASE +# define cpu_lock_irqsave(cpu, flags) \ + spin_lock_irqsave(&per_cpu(pa_lock, cpu).lock, flags) +# define cpu_unlock_irqrestore(cpu, flags) \ + spin_unlock_irqrestore(&per_cpu(pa_lock, cpu).lock, flags) +#else +# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags) +# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags) +#endif + int page_group_by_mobility_disabled __read_mostly; void set_pageblock_migratetype(struct page *page, int migratetype) @@ -773,11 +786,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) return; migratetype = get_pfnblock_migratetype(page, pfn); - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); __count_vm_events(PGFREE, 1 << order); set_freepage_migratetype(page, migratetype); free_one_page(page_zone(page), page, pfn, order, migratetype); - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); } void __init __free_pages_bootmem(struct page *page, unsigned int order) @@ -1253,14 +1266,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) unsigned long flags; int to_drain, batch; - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); batch = ACCESS_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) { free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; } - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); } #endif @@ -1280,7 +1293,7 @@ static void drain_pages(unsigned int cpu) struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - local_irq_save(flags); + cpu_lock_irqsave(cpu, flags); pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; @@ -1288,7 +1301,7 @@ static void drain_pages(unsigned int cpu) free_pcppages_bulk(zone, pcp->count, pcp); pcp->count = 0; } - local_irq_restore(flags); + cpu_unlock_irqrestore(cpu, flags); } } @@ -1341,7 +1354,12 @@ void drain_all_pages(void) else cpumask_clear_cpu(cpu, &cpus_with_pcps); } +#ifndef CONFIG_PREEMPT_RT_BASE on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); +#else + for_each_cpu(cpu, &cpus_with_pcps) + drain_pages(cpu); +#endif } #ifdef CONFIG_HIBERNATION @@ -1397,7 +1415,7 @@ void free_hot_cold_page(struct page *page, bool cold) migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); __count_vm_event(PGFREE); /* @@ -1428,7 +1446,7 @@ void free_hot_cold_page(struct page *page, bool cold) } out: - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); } /* @@ -1558,7 +1576,7 @@ again: struct per_cpu_pages *pcp; struct list_head *list; - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; if (list_empty(list)) { @@ -1590,13 +1608,15 @@ again: */ WARN_ON_ONCE(order > 1); } - spin_lock_irqsave(&zone->lock, flags); + local_spin_lock_irqsave(pa_lock, &zone->lock, flags); 
page = __rmqueue(zone, order, migratetype); - spin_unlock(&zone->lock); - if (!page) + if (!page) { + spin_unlock(&zone->lock); goto failed; + } __mod_zone_freepage_state(zone, -(1 << order), get_freepage_migratetype(page)); + spin_unlock(&zone->lock); } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); @@ -1606,7 +1626,7 @@ again: __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); VM_BUG_ON_PAGE(bad_range(zone, page), page); if (prep_new_page(page, order, gfp_flags)) @@ -1614,7 +1634,7 @@ again: return page; failed: - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); return NULL; } @@ -5565,6 +5585,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, void __init page_alloc_init(void) { hotcpu_notifier(page_alloc_cpu_notify, 0); + local_irq_lock_init(pa_lock); } /* @@ -6459,7 +6480,7 @@ void zone_pcp_reset(struct zone *zone) struct per_cpu_pageset *pset; /* avoid races with drain_pages() */ - local_irq_save(flags); + local_lock_irqsave(pa_lock, flags); if (zone->pageset != &boot_pageset) { for_each_online_cpu(cpu) { pset = per_cpu_ptr(zone->pageset, cpu); @@ -6468,7 +6489,7 @@ void zone_pcp_reset(struct zone *zone) free_percpu(zone->pageset); zone->pageset = &boot_pageset; } - local_irq_restore(flags); + local_unlock_irqrestore(pa_lock, flags); } #ifdef CONFIG_MEMORY_HOTREMOVE -- cgit v1.2.3 From a74e5477d85e7c760cb8da6144a99917f07d205d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2009 08:44:37 -0500 Subject: mm: page_alloc reduce lock sections further Split out the pages which are to be freed into a separate list and call free_pages_bulk() outside of the percpu page allocator locks. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- mm/page_alloc.c | 83 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 23 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe0b95e8320d..afbc3dfa2b0c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -667,7 +667,7 @@ static inline int free_pages_check(struct page *page) } /* - * Frees a number of pages from the PCP lists + * Frees a number of pages which have been collected from the pcp lists. * Assumes all pages on list are in same zone, and of same order. * count is the number of pages to free. * @@ -678,18 +678,51 @@ static inline int free_pages_check(struct page *page) * pinned" detection logic. 
*/ static void free_pcppages_bulk(struct zone *zone, int count, - struct per_cpu_pages *pcp) + struct list_head *list) { - int migratetype = 0; - int batch_free = 0; int to_free = count; unsigned long nr_scanned; + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); - spin_lock(&zone->lock); nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + while (!list_empty(list)) { + struct page *page = list_first_entry(list, struct page, lru); + int mt; /* migratetype of the to-be-freed page */ + + /* must delete as __free_one_page list manipulates */ + list_del(&page->lru); + + mt = get_freepage_migratetype(page); + if (unlikely(has_isolate_pageblock(zone))) + mt = get_pageblock_migratetype(page); + + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ + __free_one_page(page, page_to_pfn(page), zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); + to_free--; + } + WARN_ON(to_free != 0); + spin_unlock_irqrestore(&zone->lock, flags); +} + +/* + * Moves a number of pages from the PCP lists to free list which + * is freed outside of the locked region. + * + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free. + */ +static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src, + struct list_head *dst) +{ + int migratetype = 0; + int batch_free = 0; + while (to_free) { struct page *page; struct list_head *list; @@ -705,7 +738,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free++; if (++migratetype == MIGRATE_PCPTYPES) migratetype = 0; - list = &pcp->lists[migratetype]; + list = &src->lists[migratetype]; } while (list_empty(list)); /* This is the only non-empty list. Free them all. */ @@ -713,21 +746,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = to_free; do { - int mt; /* migratetype of the to-be-freed page */ - page = list_last_entry(list, struct page, lru); - /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - mt = get_freepage_migratetype(page); - if (unlikely(has_isolate_pageblock(zone))) - mt = get_pageblock_migratetype(page); - - /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, page_to_pfn(page), zone, 0, mt); - trace_mm_page_pcpu_drain(page, 0, mt); + list_add(&page->lru, dst); } while (--to_free && --batch_free && !list_empty(list)); } - spin_unlock(&zone->lock); } static void free_one_page(struct zone *zone, @@ -736,7 +759,9 @@ static void free_one_page(struct zone *zone, int migratetype) { unsigned long nr_scanned; - spin_lock(&zone->lock); + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); @@ -746,7 +771,7 @@ static void free_one_page(struct zone *zone, migratetype = get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static bool free_pages_prepare(struct page *page, unsigned int order) @@ -1264,16 +1289,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; + LIST_HEAD(dst); int to_drain, batch; local_lock_irqsave(pa_lock, flags); batch = ACCESS_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) { - free_pcppages_bulk(zone, to_drain, pcp); + 
isolate_pcp_pages(to_drain, pcp, &dst); pcp->count -= to_drain; } local_unlock_irqrestore(pa_lock, flags); + free_pcppages_bulk(zone, to_drain, &dst); } #endif @@ -1292,16 +1319,21 @@ static void drain_pages(unsigned int cpu) for_each_populated_zone(zone) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; + LIST_HEAD(dst); + int count; cpu_lock_irqsave(cpu, flags); pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - if (pcp->count) { - free_pcppages_bulk(zone, pcp->count, pcp); + count = pcp->count; + if (count) { + isolate_pcp_pages(count, pcp, &dst); pcp->count = 0; } cpu_unlock_irqrestore(cpu, flags); + if (count) + free_pcppages_bulk(zone, count, &dst); } } @@ -1441,8 +1473,13 @@ void free_hot_cold_page(struct page *page, bool cold) pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = ACCESS_ONCE(pcp->batch); - free_pcppages_bulk(zone, batch, pcp); + LIST_HEAD(dst); + + isolate_pcp_pages(batch, pcp, &dst); pcp->count -= batch; + local_unlock_irqrestore(pa_lock, flags); + free_pcppages_bulk(zone, batch, &dst); + return; } out: -- cgit v1.2.3 From 5cfeabf0e0492dbca3349b2f5a5509b4e49075fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Jul 2011 16:47:49 +0200 Subject: mm-page-alloc-fix.patch Signed-off-by: Thomas Gleixner --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index afbc3dfa2b0c..db0e478362f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2382,8 +2382,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, count_vm_event(COMPACTSTALL); /* Page migration frees to the PCP lists but we want merging */ - drain_pages(get_cpu()); - put_cpu(); + drain_pages(get_cpu_light()); + put_cpu_light(); page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, -- cgit v1.2.3 From 29e25725e8e348610546789f9aa6df6a8b909f36 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:51 -0500 Subject: mm: convert swap to percpu locked Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- mm/swap.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 8a12b33936b4..acb833351464 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "internal.h" @@ -44,6 +45,9 @@ static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock); +static DEFINE_LOCAL_IRQ_LOCK(swapvec_lock); + /* * This path almost never happens for VM activity - pages are normally * freed via pagevecs. But it gets used by networking. 
@@ -473,11 +477,11 @@ void rotate_reclaimable_page(struct page *page) unsigned long flags; page_cache_get(page); - local_irq_save(flags); + local_lock_irqsave(rotate_lock, flags); pvec = this_cpu_ptr(&lru_rotate_pvecs); if (!pagevec_add(pvec, page)) pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(rotate_lock, flags); } } @@ -528,12 +532,13 @@ static bool need_activate_page_drain(int cpu) void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + struct pagevec *pvec = &get_locked_var(swapvec_lock, + activate_page_pvecs); page_cache_get(page); if (!pagevec_add(pvec, page)) pagevec_lru_move_fn(pvec, __activate_page, NULL); - put_cpu_var(activate_page_pvecs); + put_locked_var(swapvec_lock, activate_page_pvecs); } } @@ -559,7 +564,7 @@ void activate_page(struct page *page) static void __lru_cache_activate_page(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec); int i; /* @@ -581,7 +586,7 @@ static void __lru_cache_activate_page(struct page *page) } } - put_cpu_var(lru_add_pvec); + put_locked_var(swapvec_lock, lru_add_pvec); } /* @@ -620,13 +625,13 @@ EXPORT_SYMBOL(mark_page_accessed); static void __lru_cache_add(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec); page_cache_get(page); if (!pagevec_space(pvec)) __pagevec_lru_add(pvec); pagevec_add(pvec, page); - put_cpu_var(lru_add_pvec); + put_locked_var(swapvec_lock, lru_add_pvec); } /** @@ -806,9 +811,9 @@ void lru_add_drain_cpu(int cpu) unsigned long flags; /* No harm done if a racing interrupt already did this */ - local_irq_save(flags); + local_lock_irqsave(rotate_lock, flags); pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(rotate_lock, flags); } pvec = &per_cpu(lru_deactivate_pvecs, cpu); @@ -836,18 +841,19 @@ void deactivate_page(struct page *page) return; if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + struct pagevec *pvec = &get_locked_var(swapvec_lock, + lru_deactivate_pvecs); if (!pagevec_add(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + put_locked_var(swapvec_lock, lru_deactivate_pvecs); } } void lru_add_drain(void) { - lru_add_drain_cpu(get_cpu()); - put_cpu(); + lru_add_drain_cpu(local_lock_cpu(swapvec_lock)); + local_unlock_cpu(swapvec_lock); } static void lru_add_drain_per_cpu(struct work_struct *dummy) -- cgit v1.2.3 From 0c1333825d3fbe8439517d968af4e956d21352d3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:13 -0500 Subject: mm: make vmstat -rt aware Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/vmstat.h | 4 ++++ mm/vmstat.c | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 82e7db7f7100..3feaf770a8bd 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states); */ static inline void __count_vm_event(enum vm_event_item item) { + preempt_disable_rt(); raw_cpu_inc(vm_event_states.event[item]); + preempt_enable_rt(); } static inline void count_vm_event(enum vm_event_item item) @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item) static inline void 
__count_vm_events(enum vm_event_item item, long delta) { + preempt_disable_rt(); raw_cpu_add(vm_event_states.event[item], delta); + preempt_enable_rt(); } static inline void count_vm_events(enum vm_event_item item, long delta) diff --git a/mm/vmstat.c b/mm/vmstat.c index 4590aa42b6cd..d65d0c30a536 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -221,6 +221,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long x; long t; + preempt_disable_rt(); x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -230,6 +231,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, x = 0; } __this_cpu_write(*p, x); + preempt_enable_rt(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -262,6 +264,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + preempt_disable_rt(); v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -270,6 +273,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v + overstep, zone, item); __this_cpu_write(*p, -overstep); } + preempt_enable_rt(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) @@ -284,6 +288,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + preempt_disable_rt(); v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -292,6 +297,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v - overstep, zone, item); __this_cpu_write(*p, overstep); } + preempt_enable_rt(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) -- cgit v1.2.3 From 24a71b791d7cceb59d6b5ba18bc2ef0cd5937d79 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Sat, 1 Oct 2011 18:58:13 -0700 Subject: ARM: Initialize ptl->lock for vector page Without this patch, ARM can not use SPLIT_PTLOCK_CPUS if PREEMPT_RT_FULL=y because vectors_user_mapping() creates a VM_ALWAYSDUMP mapping of the vector page (address 0xffff0000), but no ptl->lock has been allocated for the page. An attempt to coredump that page will result in a kernel NULL pointer dereference when follow_page() attempts to lock the page. The call tree to the NULL pointer dereference is: do_notify_resume() get_signal_to_deliver() do_coredump() elf_core_dump() get_dump_page() __get_user_pages() follow_page() pte_offset_map_lock() <----- a #define ... rt_spin_lock() The underlying problem is exposed by mm-shrink-the-page-frame-to-rt-size.patch. Signed-off-by: Frank Rowand Cc: Frank Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4E87C535.2030907@am.sony.com Signed-off-by: Thomas Gleixner --- arch/arm/kernel/process.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index fe972a2f3df3..4e16ee3fb0e6 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -431,6 +431,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) } #ifdef CONFIG_MMU +/* + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not + * initialized by pgtable_page_ctor() then a coredump of the vector page will + * fail. 
+ */ +static int __init vectors_user_mapping_init_page(void) +{ + struct page *page; + unsigned long addr = 0xffff0000; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset_k(addr); + pud = pud_offset(pgd, addr); + pmd = pmd_offset(pud, addr); + page = pmd_page(*(pmd)); + + pgtable_page_ctor(page); + + return 0; +} +late_initcall(vectors_user_mapping_init_page); + #ifdef CONFIG_KUSER_HELPERS /* * The vectors page is always readable from user space for the -- cgit v1.2.3 From dd74871269b9d67a1bcd81fcfade87cf2bfe6431 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 10:33:09 +0100 Subject: mm: bounce: Use local_irq_save_nort kmap_atomic() is preemptible on RT. Signed-off-by: Thomas Gleixner --- block/bounce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bounce.c b/block/bounce.c index ab21ba203d5c..97b734aed81a 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -54,11 +54,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) unsigned long flags; unsigned char *vto; - local_irq_save(flags); + local_irq_save_nort(flags); vto = kmap_atomic(to->bv_page); memcpy(vto + to->bv_offset, vfrom, to->bv_len); kunmap_atomic(vto); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #else /* CONFIG_HIGHMEM */ -- cgit v1.2.3 From 63de16b36005325b07cdb3eeed644aa0caef48f6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:44:03 -0500 Subject: mm: Allow only slub on RT Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- init/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 2081a4d3d917..c3fb8ceb3ecd 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1677,6 +1677,7 @@ choice config SLAB bool "SLAB" + depends on !PREEMPT_RT_FULL help The regular slab allocator that is established and known to work well in all environments. It organizes cache hot objects in @@ -1695,6 +1696,7 @@ config SLUB config SLOB depends on EXPERT bool "SLOB (Simple Allocator)" + depends on !PREEMPT_RT_FULL help SLOB replaces the stock allocator with a drastically simpler allocator. SLOB is generally more space efficient but -- cgit v1.2.3 From 3aefd560fecc08181f8df7a988172152c890c7e9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Oct 2012 10:32:35 +0100 Subject: mm: Enable SLUB for RT Make SLUB RT aware and remove the restriction in Kconfig. Signed-off-by: Thomas Gleixner --- mm/slab.h | 4 +++ mm/slub.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 95 insertions(+), 27 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index ab019e63e3c2..c5d1c194f2ca 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -315,7 +315,11 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) * The slab lists for all objects. 
*/ struct kmem_cache_node { +#ifdef CONFIG_SLUB + raw_spinlock_t list_lock; +#else spinlock_t list_lock; +#endif #ifdef CONFIG_SLAB struct list_head slabs_partial; /* partial list first, better asm code */ diff --git a/mm/slub.c b/mm/slub.c index ae7b9f1ad394..69cbb579a3e6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1044,7 +1044,7 @@ static noinline struct kmem_cache_node *free_debug_processing( { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - spin_lock_irqsave(&n->list_lock, *flags); + raw_spin_lock_irqsave(&n->list_lock, *flags); slab_lock(page); if (!check_slab(s, page)) @@ -1091,7 +1091,7 @@ out: fail: slab_unlock(page); - spin_unlock_irqrestore(&n->list_lock, *flags); + raw_spin_unlock_irqrestore(&n->list_lock, *flags); slab_fix(s, "Object at 0x%p not freed", object); return NULL; } @@ -1219,6 +1219,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, #endif /* CONFIG_SLUB_DEBUG */ +struct slub_free_list { + raw_spinlock_t lock; + struct list_head list; +}; +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list); + /* * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. @@ -1306,7 +1312,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= gfp_allowed_mask; +#ifdef CONFIG_PREEMPT_RT_FULL + if (system_state == SYSTEM_RUNNING) +#else if (flags & __GFP_WAIT) +#endif local_irq_enable(); flags |= s->allocflags; @@ -1347,7 +1357,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } +#ifdef CONFIG_PREEMPT_RT_FULL + if (system_state == SYSTEM_RUNNING) +#else if (flags & __GFP_WAIT) +#endif local_irq_disable(); if (!page) return NULL; @@ -1442,6 +1456,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page) memcg_uncharge_slab(s, order); } +static void free_delayed(struct list_head *h) +{ + while(!list_empty(h)) { + struct page *page = list_first_entry(h, struct page, lru); + + list_del(&page->lru); + __free_slab(page->slab_cache, page); + } +} + #define need_reserve_slab_rcu \ (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) @@ -1476,6 +1500,12 @@ static void free_slab(struct kmem_cache *s, struct page *page) } call_rcu(head, rcu_free_slab); + } else if (irqs_disabled()) { + struct slub_free_list *f = &__get_cpu_var(slub_free_list); + + raw_spin_lock(&f->lock); + list_add(&page->lru, &f->list); + raw_spin_unlock(&f->lock); } else __free_slab(s, page); } @@ -1589,7 +1619,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!n || !n->nr_partial) return NULL; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); list_for_each_entry_safe(page, page2, &n->partial, lru) { void *t; @@ -1614,7 +1644,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, break; } - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); return object; } @@ -1860,7 +1890,7 @@ redo: * that acquire_slab() will see a slab page that * is frozen */ - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } } else { m = M_FULL; @@ -1871,7 +1901,7 @@ redo: * slabs from diagnostic functions will not see * any frozen slabs. 
*/ - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } } @@ -1906,7 +1936,7 @@ redo: goto redo; if (lock) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); if (m == M_FREE) { stat(s, DEACTIVATE_EMPTY); @@ -1938,10 +1968,10 @@ static void unfreeze_partials(struct kmem_cache *s, n2 = get_node(s, page_to_nid(page)); if (n != n2) { if (n) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); n = n2; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } do { @@ -1970,7 +2000,7 @@ static void unfreeze_partials(struct kmem_cache *s, } if (n) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); while (discard_page) { page = discard_page; @@ -2008,14 +2038,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) pobjects = oldpage->pobjects; pages = oldpage->pages; if (drain && pobjects > s->cpu_partial) { + struct slub_free_list *f; unsigned long flags; + LIST_HEAD(tofree); /* * partial array is full. Move the existing * set to the per node partial list. */ local_irq_save(flags); unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + f = &__get_cpu_var(slub_free_list); + raw_spin_lock(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock(&f->lock); local_irq_restore(flags); + free_delayed(&tofree); oldpage = NULL; pobjects = 0; pages = 0; @@ -2079,7 +2116,22 @@ static bool has_cpu_slab(int cpu, void *info) static void flush_all(struct kmem_cache *s) { + LIST_HEAD(tofree); + int cpu; + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); + for_each_online_cpu(cpu) { + struct slub_free_list *f; + + if (!has_cpu_slab(cpu, s)) + continue; + + f = &per_cpu(slub_free_list, cpu); + raw_spin_lock_irq(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock_irq(&f->lock); + free_delayed(&tofree); + } } /* @@ -2115,10 +2167,10 @@ static unsigned long count_partial(struct kmem_cache_node *n, unsigned long x = 0; struct page *page; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) x += get_count(page); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return x; } #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ @@ -2255,9 +2307,11 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { + struct slub_free_list *f; void *freelist; struct page *page; unsigned long flags; + LIST_HEAD(tofree); local_irq_save(flags); #ifdef CONFIG_PREEMPT @@ -2325,7 +2379,13 @@ load_freelist: VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); +out: + f = &__get_cpu_var(slub_free_list); + raw_spin_lock(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock(&f->lock); local_irq_restore(flags); + free_delayed(&tofree); return freelist; new_slab: @@ -2342,8 +2402,7 @@ new_slab: if (unlikely(!freelist)) { slab_out_of_memory(s, gfpflags, node); - local_irq_restore(flags); - return NULL; + goto out; } page = c->page; @@ -2358,8 +2417,7 @@ new_slab: deactivate_slab(s, page, get_freepointer(s, freelist)); c->page = NULL; c->freelist = NULL; - local_irq_restore(flags); - return freelist; + goto out; } /* @@ -2531,7 +2589,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, do { if (unlikely(n)) { - spin_unlock_irqrestore(&n->list_lock, flags); + 
raw_spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } prior = page->freelist; @@ -2563,7 +2621,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * Otherwise the list_lock will synchronize with * other processors updating the list of slabs. */ - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); } } @@ -2605,7 +2663,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: @@ -2620,7 +2678,7 @@ slab_empty: remove_full(s, n, page); } - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); } @@ -2816,7 +2874,7 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; - spin_lock_init(&n->list_lock); + raw_spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG atomic_long_set(&n->nr_slabs, 0); @@ -3373,7 +3431,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) for (i = 0; i < objects; i++) INIT_LIST_HEAD(slabs_by_inuse + i); - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); /* * Build lists indexed by the items in use in each slab. @@ -3394,7 +3452,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) for (i = objects - 1; i > 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ list_for_each_entry_safe(page, t, slabs_by_inuse, lru) @@ -3567,6 +3625,12 @@ void __init kmem_cache_init(void) { static __initdata struct kmem_cache boot_kmem_cache, boot_kmem_cache_node; + int cpu; + + for_each_possible_cpu(cpu) { + raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock); + INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list); + } if (debug_guardpage_minorder()) slub_max_order = 0; @@ -3815,7 +3879,7 @@ static int validate_slab_node(struct kmem_cache *s, struct page *page; unsigned long flags; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) { validate_slab_slab(s, page, map); @@ -3837,7 +3901,7 @@ static int validate_slab_node(struct kmem_cache *s, s->name, count, atomic_long_read(&n->nr_slabs)); out: - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return count; } @@ -4025,12 +4089,12 @@ static int list_locations(struct kmem_cache *s, char *buf, if (!atomic_long_read(&n->nr_slabs)) continue; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) process_slab(&t, s, page, alloc, map); list_for_each_entry(page, &n->full, lru) process_slab(&t, s, page, alloc, map); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); } for (i = 0; i < t.count; i++) { -- cgit v1.2.3 From afe77af0aefdae66812fe8f25942c736534eb870 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 12:08:15 +0100 Subject: slub: Enable irqs for __GFP_WAIT SYSTEM_RUNNING might be too late for enabling interrupts. Allocations with GFP_WAIT can happen before that. So use this as an indicator. 
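For readability, the consolidated check that the hunks below introduce boils down to the following sketch (names are the ones used by the patch; this is illustrative only, not part of the patch to apply):

    bool enableirqs;

    enableirqs = (flags & __GFP_WAIT) != 0;
#ifdef CONFIG_PREEMPT_RT_FULL
    /* On RT, also allow enabling once the system is up and running. */
    enableirqs |= system_state == SYSTEM_RUNNING;
#endif
    if (enableirqs)
        local_irq_enable();
    /* ... perform the page allocation ... */
    if (enableirqs)
        local_irq_disable();

Computing the decision once into a local flag also keeps the enable and disable sites from diverging.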
Signed-off-by: Thomas Gleixner --- mm/slub.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 69cbb579a3e6..534609a0326a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1309,14 +1309,15 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + bool enableirqs; flags &= gfp_allowed_mask; + enableirqs = (flags & __GFP_WAIT) != 0; #ifdef CONFIG_PREEMPT_RT_FULL - if (system_state == SYSTEM_RUNNING) -#else - if (flags & __GFP_WAIT) + enableirqs |= system_state == SYSTEM_RUNNING; #endif + if (enableirqs) local_irq_enable(); flags |= s->allocflags; @@ -1357,11 +1358,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } -#ifdef CONFIG_PREEMPT_RT_FULL - if (system_state == SYSTEM_RUNNING) -#else - if (flags & __GFP_WAIT) -#endif + if (enableirqs) local_irq_disable(); if (!page) return NULL; -- cgit v1.2.3 From 62d8aafdf9e93b3a4ec09208cbf64615962905b3 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 May 2015 00:23:35 +0200 Subject: slub: delay ctor until the object is requested It seems that allocation of plenty objects causes latency on ARM since that code can not be preempted Signed-off-by: Sebastian Andrzej Siewior --- mm/slub.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 534609a0326a..72bb06beaabc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1376,8 +1376,10 @@ static void setup_object(struct kmem_cache *s, struct page *page, void *object) { setup_object_debug(s, page, object); +#ifndef CONFIG_PREEMPT_RT_FULL if (unlikely(s->ctor)) s->ctor(object); +#endif } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -2499,6 +2501,10 @@ redo: if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->object_size); +#ifdef CONFIG_PREEMPT_RT_FULL + if (unlikely(s->ctor) && object) + s->ctor(object); +#endif slab_post_alloc_hook(s, gfpflags, object); -- cgit v1.2.3 From e60b544454e9d7826ce5140784e5d67eafc39b79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 27 Sep 2012 11:11:46 +0200 Subject: mm: page_alloc: Use local_lock_on() instead of plain spinlock The plain spinlock while sufficient does not update the local_lock internals. Use a proper local_lock function instead to ease debugging. 
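For readers unfamiliar with the -rt local locks: the per-CPU lock carries extra bookkeeping that only its own helpers maintain, so taking the embedded spinlock directly is "sufficient" for exclusion but leaves that state stale. A hedged user-space analogue (plain C with pthreads; nothing here is kernel API) of the same mismatch:

    #include <assert.h>
    #include <pthread.h>
    #include <stdbool.h>

    /* A lock wrapper with debug state, loosely analogous to a local lock. */
    struct wrapped_lock {
        pthread_mutex_t mtx;
        bool owned;           /* bookkeeping only the wrapper updates */
        unsigned int nestcnt;
    };

    static struct wrapped_lock pa_lock = { PTHREAD_MUTEX_INITIALIZER, false, 0 };

    static void wrapped_lock_acquire(struct wrapped_lock *l)
    {
        pthread_mutex_lock(&l->mtx);
        l->owned = true;      /* skipped when callers grab the raw mutex */
        l->nestcnt++;
    }

    static void wrapped_lock_release(struct wrapped_lock *l)
    {
        /* Trips if the matching acquire bypassed the wrapper. */
        assert(l->owned && l->nestcnt > 0);
        if (--l->nestcnt == 0)
            l->owned = false;
        pthread_mutex_unlock(&l->mtx);
    }

    int main(void)
    {
        wrapped_lock_acquire(&pa_lock);    /* state stays consistent */
        wrapped_lock_release(&pa_lock);

        pthread_mutex_lock(&pa_lock.mtx);  /* locks fine, but owned/nestcnt are stale */
        pthread_mutex_unlock(&pa_lock.mtx);
        return 0;
    }

The patch below avoids exactly this by routing the remote-CPU path through local_lock_irqsave_on()/local_unlock_irqrestore_on() instead of spin-locking the embedded lock.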
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index db0e478362f3..6c18699a36e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -235,9 +235,9 @@ static DEFINE_LOCAL_IRQ_LOCK(pa_lock); #ifdef CONFIG_PREEMPT_RT_BASE # define cpu_lock_irqsave(cpu, flags) \ - spin_lock_irqsave(&per_cpu(pa_lock, cpu).lock, flags) + local_lock_irqsave_on(pa_lock, flags, cpu) # define cpu_unlock_irqrestore(cpu, flags) \ - spin_unlock_irqrestore(&per_cpu(pa_lock, cpu).lock, flags) + local_unlock_irqrestore_on(pa_lock, flags, cpu) #else # define cpu_lock_irqsave(cpu, flags) local_irq_save(flags) # define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags) -- cgit v1.2.3 From ccd4edbfd164a9449541e847b39743995c9b15ff Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 30 Oct 2013 11:48:33 -0700 Subject: mm/memcontrol: Don't call schedule_work_on in preemption disabled context The following trace is triggered when running ltp oom test cases: BUG: sleeping function called from invalid context at kernel/rtmutex.c:659 in_atomic(): 1, irqs_disabled(): 0, pid: 17188, name: oom03 Preemption disabled at:[] mem_cgroup_reclaim+0x90/0xe0 CPU: 2 PID: 17188 Comm: oom03 Not tainted 3.10.10-rt3 #2 Hardware name: Intel Corporation Calpella platform/MATXM-CORE-411-B, BIOS 4.6.3 08/18/2010 ffff88007684d730 ffff880070df9b58 ffffffff8169918d ffff880070df9b70 ffffffff8106db31 ffff88007688b4a0 ffff880070df9b88 ffffffff8169d9c0 ffff88007688b4a0 ffff880070df9bc8 ffffffff81059da1 0000000170df9bb0 Call Trace: [] dump_stack+0x19/0x1b [] __might_sleep+0xf1/0x170 [] rt_spin_lock+0x20/0x50 [] queue_work_on+0x61/0x100 [] drain_all_stock+0xe1/0x1c0 [] mem_cgroup_reclaim+0x90/0xe0 [] __mem_cgroup_try_charge+0x41a/0xc40 [] ? release_pages+0x1b1/0x1f0 [] ? sched_exec+0x40/0xb0 [] mem_cgroup_charge_common+0x37/0x70 [] mem_cgroup_newpage_charge+0x26/0x30 [] handle_pte_fault+0x618/0x840 [] ? unpin_current_cpu+0x16/0x70 [] ? migrate_enable+0xd4/0x200 [] handle_mm_fault+0x145/0x1e0 [] __do_page_fault+0x1a1/0x4c0 [] ? preempt_schedule_irq+0x4b/0x70 [] ? retint_kernel+0x37/0x40 [] do_page_fault+0xe/0x10 [] page_fault+0x22/0x30 So, to prevent schedule_work_on from being called in preempt disabled context, replace the pair of get/put_cpu() to get/put_cpu_light(). Cc: stable-rt@vger.kernel.org Signed-off-by: Yang Shi Signed-off-by: Sebastian Andrzej Siewior --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d72bdc3ca09b..8a41fb42413f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2397,7 +2397,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) /* Notify other cpus that system-wide "drain" is running */ get_online_cpus(); - curcpu = get_cpu(); + curcpu = get_cpu_light(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; @@ -2414,7 +2414,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) schedule_work_on(cpu, &stock->work); } } - put_cpu(); + put_cpu_light(); if (!sync) goto out; -- cgit v1.2.3 From d84990d3afc37062fac1c659cc990470f6c2de0b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 28 Jan 2015 17:14:16 +0100 Subject: mm/memcontrol: do no disable interrupts There are a few local_irq_disable() which then take sleeping locks. This patch converts them local locks. 
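Condensed, the conversion pattern (all identifiers are the ones introduced by the diff below; shown here only as a sketch) is:

    static DEFINE_LOCAL_IRQ_LOCK(event_lock);

        /* before: local_irq_disable();  -- followed by code that can
         * end up in sleeping locks, which is invalid on RT */
        local_lock_irq(event_lock);
        mem_cgroup_charge_statistics(memcg, page, nr_pages);
        memcg_check_events(memcg, page);
        local_unlock_irq(event_lock);
        /* before: local_irq_enable(); */

On a non-RT build the local_lock_* helpers map back to the plain interrupt-disable variants, so behaviour there is unchanged.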
Signed-off-by: Sebastian Andrzej Siewior --- mm/memcontrol.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8a41fb42413f..498e8844a142 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -60,6 +60,8 @@ #include #include #include +#include + #include "slab.h" #include @@ -87,6 +89,7 @@ static int really_do_swap_account __initdata; #define do_swap_account 0 #endif +static DEFINE_LOCAL_IRQ_LOCK(event_lock); static const char * const mem_cgroup_stat_names[] = { "cache", @@ -3419,12 +3422,12 @@ static int mem_cgroup_move_account(struct page *page, move_unlock_mem_cgroup(from, &flags); ret = 0; - local_irq_disable(); + local_lock_irq(event_lock); mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); - local_irq_enable(); + local_unlock_irq(event_lock); out_unlock: unlock_page(page); out: @@ -6406,10 +6409,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, VM_BUG_ON_PAGE(!PageTransHuge(page), page); } - local_irq_disable(); + local_lock_irq(event_lock); mem_cgroup_charge_statistics(memcg, page, nr_pages); memcg_check_events(memcg, page); - local_irq_enable(); + local_unlock_irq(event_lock); if (do_swap_account && PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; @@ -6468,14 +6471,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, memcg_oom_recover(memcg); } - local_irq_save(flags); + local_lock_irqsave(event_lock, flags); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); memcg_check_events(memcg, dummy_page); - local_irq_restore(flags); + local_unlock_irqrestore(event_lock, flags); } static void uncharge_list(struct list_head *page_list) -- cgit v1.2.3 From 3f0c2c5c4cd368ea4402f76f96dfc70a496bd682 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:33:18 +0200 Subject: radix-tree-rt-aware.patch Signed-off-by: Thomas Gleixner --- include/linux/radix-tree.h | 7 ++++++- lib/radix-tree.c | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 33170dbd9db4..933ff6977c96 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -277,8 +277,13 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long *indices, unsigned long first_index, unsigned int max_items); +#ifndef CONFIG_PREEMPT_RT_FULL int radix_tree_preload(gfp_t gfp_mask); int radix_tree_maybe_preload(gfp_t gfp_mask); +#else +static inline int radix_tree_preload(gfp_t gm) { return 0; } +static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; } +#endif void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); @@ -303,7 +308,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item); static inline void radix_tree_preload_end(void) { - preempt_enable(); + preempt_enable_nort(); } /** diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 
3291a8e37490..f783149480e5 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -195,12 +195,13 @@ radix_tree_node_alloc(struct radix_tree_root *root) * succeed in getting a node here (and never reach * kmem_cache_alloc) */ - rtp = this_cpu_ptr(&radix_tree_preloads); + rtp = &get_cpu_var(radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; rtp->nodes[rtp->nr - 1] = NULL; rtp->nr--; } + put_cpu_var(radix_tree_preloads); /* * Update the allocation stack trace as this is more useful * for debugging. @@ -240,6 +241,7 @@ radix_tree_node_free(struct radix_tree_node *node) call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } +#ifndef CONFIG_PREEMPT_RT_FULL /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On @@ -305,6 +307,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) return 0; } EXPORT_SYMBOL(radix_tree_maybe_preload); +#endif /* * Return the maximum key which can be store into a -- cgit v1.2.3 From 8c554352a5184ee6f3635c38e8d5cb7d360137ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 May 2015 00:23:36 +0200 Subject: panic: skip get_random_bytes for RT_FULL in init_oops_id --- kernel/panic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/panic.c b/kernel/panic.c index cf80672b7924..8ff8e53b0c0c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -384,9 +384,11 @@ static u64 oops_id; static int init_oops_id(void) { +#ifndef CONFIG_PREEMPT_RT_FULL if (!oops_id) get_random_bytes(&oops_id, sizeof(oops_id)); else +#endif oops_id++; return 0; -- cgit v1.2.3 From 0452e2c302c5b66f2a0626205da27f8f38310a82 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:12 -0500 Subject: ipc: Make the ipc code -rt aware RT serializes the code with the (rt)spinlock but keeps preemption enabled. Some parts of the code need to be atomic nevertheless. Protect it with preempt_disable/enable_rt pairts. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- ipc/mqueue.c | 5 +++++ ipc/msg.c | 16 +++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 4fcf39af1776..7cdf5e554d17 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -923,12 +923,17 @@ static inline void pipelined_send(struct mqueue_inode_info *info, struct msg_msg *message, struct ext_wait_queue *receiver) { + /* + * Keep them in one critical section for PREEMPT_RT: + */ + preempt_disable_rt(); receiver->msg = message; list_del(&receiver->list); receiver->state = STATE_PENDING; wake_up_process(receiver->task); smp_wmb(); receiver->state = STATE_READY; + preempt_enable_rt(); } /* pipelined_receive() - if there is task waiting in sys_mq_timedsend() diff --git a/ipc/msg.c b/ipc/msg.c index c5d8e3749985..7b3235eddd7f 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -188,6 +188,12 @@ static void expunge_all(struct msg_queue *msq, int res) struct msg_receiver *msr, *t; list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { + /* + * Make sure that the wakeup doesnt preempt + * this CPU prematurely. 
(on PREEMPT_RT) + */ + preempt_disable_rt(); + msr->r_msg = NULL; /* initialize expunge ordering */ wake_up_process(msr->r_tsk); /* @@ -198,6 +204,8 @@ static void expunge_all(struct msg_queue *msq, int res) */ smp_mb(); msr->r_msg = ERR_PTR(res); + + preempt_enable_rt(); } } @@ -574,6 +582,11 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) if (testmsg(msg, msr->r_msgtype, msr->r_mode) && !security_msg_queue_msgrcv(msq, msg, msr->r_tsk, msr->r_msgtype, msr->r_mode)) { + /* + * Make sure that the wakeup doesnt preempt + * this CPU prematurely. (on PREEMPT_RT) + */ + preempt_disable_rt(); list_del(&msr->r_list); if (msr->r_maxsize < msg->m_ts) { @@ -595,12 +608,13 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) */ smp_mb(); msr->r_msg = msg; + preempt_enable_rt(); return 1; } + preempt_enable_rt(); } } - return 0; } -- cgit v1.2.3 From 3c4055ed554d2d896e5b0968a0e5fde5751fea33 Mon Sep 17 00:00:00 2001 From: KOBAYASHI Yoshitake Date: Sat, 23 Jul 2011 11:57:36 +0900 Subject: ipc/mqueue: Add a critical section to avoid a deadlock (Repost for v3.0-rt1 and changed the distination addreses) I have tested the following patch on v3.0-rt1 with PREEMPT_RT_FULL. In POSIX message queue, if a sender process uses SCHED_FIFO and has a higher priority than a receiver process, the sender will be stuck at ipc/mqueue.c:452 452 while (ewp->state == STATE_PENDING) 453 cpu_relax(); Description of the problem (receiver process) 1. receiver changes sender's state to STATE_PENDING (mqueue.c:846) 2. wake up sender process and "switch to sender" (mqueue.c:847) Note: This context switch only happens in PREEMPT_RT_FULL kernel. (sender process) 3. sender check the own state in above loop (mqueue.c:452-453) *. receiver will never wake up and cannot change sender's state to STATE_READY because sender has higher priority Signed-off-by: Yoshitake Kobayashi Cc: viro@zeniv.linux.org.uk Cc: dchinner@redhat.com Cc: npiggin@kernel.dk Cc: hch@lst.de Cc: arnd@arndb.de Link: http://lkml.kernel.org/r/4E2A38A0.1090601@toshiba.co.jp Signed-off-by: Thomas Gleixner --- ipc/mqueue.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 7cdf5e554d17..516902313dc3 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -947,13 +947,18 @@ static inline void pipelined_receive(struct mqueue_inode_info *info) wake_up_interruptible(&info->wait_q); return; } - if (msg_insert(sender->msg, info)) - return; - list_del(&sender->list); - sender->state = STATE_PENDING; - wake_up_process(sender->task); - smp_wmb(); - sender->state = STATE_READY; + /* + * Keep them in one critical section for PREEMPT_RT: + */ + preempt_disable_rt(); + if (!msg_insert(sender->msg, info)) { + list_del(&sender->list); + sender->state = STATE_PENDING; + wake_up_process(sender->task); + smp_wmb(); + sender->state = STATE_READY; + } + preempt_enable_rt(); } SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, -- cgit v1.2.3 From 3bb2b648ca58c1022e40ca324a9bba6d87e41378 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:44:07 -0500 Subject: relay: fix timer madness remove timer calls (!!!) from deep within the tracing infrastructure. This was totally bogus code that can cause lockups and worse. Poll the buffer every 2 jiffies for now. 
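The resulting wakeup path is self-contained in timer context; condensed from the hunks below (sketch only):

    static void wakeup_readers(unsigned long data)
    {
        struct rchan_buf *buf = (struct rchan_buf *)data;

        wake_up_interruptible(&buf->read_wait);
        /* Stupid polling for now: re-arm ourselves instead of being
         * kicked from relay_switch_subbuf(), which may run while
         * scheduler locks are held. */
        mod_timer(&buf->timer, jiffies + 1);
    }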
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/relay.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/kernel/relay.c b/kernel/relay.c index 5a56d3c8dc03..822e7dab6a65 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -339,6 +339,10 @@ static void wakeup_readers(unsigned long data) { struct rchan_buf *buf = (struct rchan_buf *)data; wake_up_interruptible(&buf->read_wait); + /* + * Stupid polling for now: + */ + mod_timer(&buf->timer, jiffies + 1); } /** @@ -356,6 +360,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); + mod_timer(&buf->timer, jiffies + 1); } else del_timer_sync(&buf->timer); @@ -739,15 +744,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) else buf->early_bytes += buf->chan->subbuf_size - buf->padding[old_subbuf]; - smp_mb(); - if (waitqueue_active(&buf->read_wait)) - /* - * Calling wake_up_interruptible() from here - * will deadlock if we happen to be logging - * from the scheduler (trying to re-grab - * rq->lock), so defer it. - */ - mod_timer(&buf->timer, jiffies + 1); } old = buf->data; -- cgit v1.2.3 From db03fb87d32abf55997577b29e5c4959f4a7d593 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:34 -0500 Subject: timers: prepare for full preemption When softirqs can be preempted we need to make sure that cancelling the timer from the active thread can not deadlock vs. a running timer callback. Add a waitqueue to resolve that. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 2 +- kernel/sched/core.c | 8 ++++++-- kernel/time/timer.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 8c5a197e1587..5fcd72c57ebe 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -241,7 +241,7 @@ extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) extern int del_timer_sync(struct timer_list *timer); #else # define del_timer_sync(t) del_timer(t) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3602645aff14..c42dfd8b6e76 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -650,12 +650,14 @@ void resched_cpu(int cpu) */ int get_nohz_timer_target(int pinned) { - int cpu = smp_processor_id(); + int cpu; int i; struct sched_domain *sd; + preempt_disable_rt(); + cpu = smp_processor_id(); if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) - return cpu; + goto preempt_en_rt; rcu_read_lock(); for_each_domain(cpu, sd) { @@ -668,6 +670,8 @@ int get_nohz_timer_target(int pinned) } unlock: rcu_read_unlock(); +preempt_en_rt: + preempt_enable_rt(); return cpu; } /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3260ffdb368f..c006603a2be5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -78,6 +78,7 @@ struct tvec_root { struct tvec_base { spinlock_t lock; struct timer_list *running_timer; + wait_queue_head_t wait_for_running_timer; unsigned long timer_jiffies; unsigned long next_timer; unsigned long active_timers; @@ -969,6 +970,29 @@ void add_timer_on(struct timer_list *timer, int cpu) } EXPORT_SYMBOL_GPL(add_timer_on); +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * Wait for a running timer + */ +static void 
wait_for_running_timer(struct timer_list *timer) +{ + struct tvec_base *base = timer->base; + + if (base->running_timer == timer) + wait_event(base->wait_for_running_timer, + base->running_timer != timer); +} + +# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_tunning_timer) +#else +static inline void wait_for_running_timer(struct timer_list *timer) +{ + cpu_relax(); +} + +# define wakeup_timer_waiters(b) do { } while (0) +#endif + /** * del_timer - deactive a timer. * @timer: the timer to be deactivated @@ -1026,7 +1050,7 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -1086,7 +1110,7 @@ int del_timer_sync(struct timer_list *timer) int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; - cpu_relax(); + wait_for_running_timer(timer); } } EXPORT_SYMBOL(del_timer_sync); @@ -1207,15 +1231,17 @@ static inline void __run_timers(struct tvec_base *base) if (irqsafe) { spin_unlock(&base->lock); call_timer_fn(timer, fn, data); + base->running_timer = NULL; spin_lock(&base->lock); } else { spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); + base->running_timer = NULL; spin_lock_irq(&base->lock); } } } - base->running_timer = NULL; + wake_up(&base->wait_for_running_timer); spin_unlock_irq(&base->lock); } @@ -1574,6 +1600,7 @@ static int init_timers_cpu(int cpu) base = per_cpu(tvec_bases, cpu); } + init_waitqueue_head(&base->wait_for_running_timer); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); -- cgit v1.2.3 From 611eab31104589e32ac14d3b88bd16b1b0fbcdac Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Wed, 17 Apr 2013 17:44:16 +0800 Subject: timers: prepare for full preemption improve wake_up should do nothing on the nort, so we should use wakeup_timer_waiters, also fix a spell mistake. 
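Taken together with the previous patch, the RT wait/wake pairing ends up as sketched below (condensed from the two diffs; the !RT stubs keep the old spinning behaviour and compile the wakeup away):

    #ifdef CONFIG_PREEMPT_RT_FULL
    static void wait_for_running_timer(struct timer_list *timer)
    {
        struct tvec_base *base = timer->base;

        if (base->running_timer == timer)
            wait_event(base->wait_for_running_timer,
                       base->running_timer != timer);
    }
    # define wakeup_timer_waiters(b)    wake_up(&(b)->wait_for_running_timer)
    #else
    static inline void wait_for_running_timer(struct timer_list *timer)
    {
        cpu_relax();
    }
    # define wakeup_timer_waiters(b)    do { } while (0)
    #endif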
Cc: stable-rt@vger.kernel.org Signed-off-by: Zhao Hongjiang [bigeasy: s/CONFIG_PREEMPT_RT_BASE/CONFIG_PREEMPT_RT_FULL/] Signed-off-by: Sebastian Andrzej Siewior --- kernel/time/timer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index c006603a2be5..eeda9e12a919 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -78,7 +78,9 @@ struct tvec_root { struct tvec_base { spinlock_t lock; struct timer_list *running_timer; +#ifdef CONFIG_PREEMPT_RT_FULL wait_queue_head_t wait_for_running_timer; +#endif unsigned long timer_jiffies; unsigned long next_timer; unsigned long active_timers; @@ -983,7 +985,7 @@ static void wait_for_running_timer(struct timer_list *timer) base->running_timer != timer); } -# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_tunning_timer) +# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_running_timer) #else static inline void wait_for_running_timer(struct timer_list *timer) { @@ -1241,7 +1243,7 @@ static inline void __run_timers(struct tvec_base *base) } } } - wake_up(&base->wait_for_running_timer); + wakeup_timer_waiters(base); spin_unlock_irq(&base->lock); } @@ -1600,7 +1602,9 @@ static int init_timers_cpu(int cpu) base = per_cpu(tvec_bases, cpu); } +#ifdef CONFIG_PREEMPT_RT_FULL init_waitqueue_head(&base->wait_for_running_timer); +#endif for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); -- cgit v1.2.3 From 439d3c48ecbf58dc6f530a14ca3b49a7e3fe8207 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:20 -0500 Subject: timers: preempt-rt support Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index eeda9e12a919..1eb2ffed1f0a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1383,7 +1383,17 @@ unsigned long get_next_timer_interrupt(unsigned long now) if (cpu_is_offline(smp_processor_id())) return expires; +#ifdef CONFIG_PREEMPT_RT_FULL + /* + * On PREEMPT_RT we cannot sleep here. If the trylock does not + * succeed then we return the worst-case 'expires in 1 tick' + * value: + */ + if (!spin_trylock(&base->lock)) + return now + 1; +#else spin_lock(&base->lock); +#endif if (base->active_timers) { if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); @@ -1393,7 +1403,6 @@ unsigned long get_next_timer_interrupt(unsigned long now) if (time_before_eq(expires, now)) return now; - return cmp_next_hrtimer_event(now, expires); } #endif @@ -1644,7 +1653,7 @@ static void migrate_timers(int cpu) BUG_ON(cpu_online(cpu)); old_base = per_cpu(tvec_bases, cpu); - new_base = get_cpu_var(tvec_bases); + new_base = get_local_var(tvec_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. @@ -1665,7 +1674,7 @@ static void migrate_timers(int cpu) spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); - put_cpu_var(tvec_bases); + put_local_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From 2f4c14465cd6140f34fc861121d744f9e5b1f358 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Aug 2009 11:56:45 +0200 Subject: timer: delay waking softirqs from the jiffy tick People were complaining about broken balancing with the recent -rt series. 
A look at /proc/sched_debug yielded: cpu#0, 2393.874 MHz .nr_running : 0 .load : 0 .cpu_load[0] : 177522 .cpu_load[1] : 177522 .cpu_load[2] : 177522 .cpu_load[3] : 177522 .cpu_load[4] : 177522 cpu#1, 2393.874 MHz .nr_running : 4 .load : 4096 .cpu_load[0] : 181618 .cpu_load[1] : 180850 .cpu_load[2] : 180274 .cpu_load[3] : 179938 .cpu_load[4] : 179758 Which indicated the cpu_load computation was hosed, the 177522 value indicates that there is one RT task runnable. Initially I thought the old problem of calculating the cpu_load from a softirq had re-surfaced, however looking at the code shows its being done from scheduler_tick(). [ we really should fix this RT/cfs interaction some day... ] A few trace_printk()s later: sirq-timer/1-19 [001] 174.289744: 19: 50:S ==> [001] 0:140:R -0 [001] 174.290724: enqueue_task_rt: adding task: 19/sirq-timer/1 with load: 177522 -0 [001] 174.290725: 0:140:R + [001] 19: 50:S sirq-timer/1 -0 [001] 174.290730: scheduler_tick: current load: 177522 -0 [001] 174.290732: scheduler_tick: current: 0/swapper -0 [001] 174.290736: 0:140:R ==> [001] 19: 50:R sirq-timer/1 sirq-timer/1-19 [001] 174.290741: dequeue_task_rt: removing task: 19/sirq-timer/1 with load: 177522 sirq-timer/1-19 [001] 174.290743: 19: 50:S ==> [001] 0:140:R We see that we always raise the timer softirq before doing the load calculation. Avoid this by re-ordering the scheduler_tick() call in update_process_times() to occur before we deal with timers. This lowers the load back to sanity and restores regular load-balancing behaviour. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1eb2ffed1f0a..576c366b2615 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1418,13 +1418,13 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); + scheduler_tick(); run_local_timers(); rcu_check_callbacks(cpu, user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_tick(); #endif - scheduler_tick(); run_posix_cpu_timers(p); } -- cgit v1.2.3 From 4e5eb692259cf2900e47370511e9bdf3a8c314ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Jul 2011 15:23:39 +0200 Subject: timers: Avoid the switch timers base set to NULL trick on RT On RT that code is preemptible, so we cannot assign NULL to timers base as a preempter would spin forever in lock_timer_base(). Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 576c366b2615..6e4b959e4e79 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -761,6 +761,36 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } } +#ifndef CONFIG_PREEMPT_RT_FULL +static inline struct tvec_base *switch_timer_base(struct timer_list *timer, + struct tvec_base *old, + struct tvec_base *new) +{ + /* See the comment in lock_timer_base() */ + timer_set_base(timer, NULL); + spin_unlock(&old->lock); + spin_lock(&new->lock); + timer_set_base(timer, new); + return new; +} +#else +static inline struct tvec_base *switch_timer_base(struct timer_list *timer, + struct tvec_base *old, + struct tvec_base *new) +{ + /* + * We cannot do the above because we might be preempted and + * then the preempter would see NULL and loop forever. 
+ */ + if (spin_trylock(&new->lock)) { + timer_set_base(timer, new); + spin_unlock(&old->lock); + return new; + } + return old; +} +#endif + static inline int __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only, int pinned) @@ -791,14 +821,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, * handler yet has not finished. This also guarantees that * the timer is serialized wrt itself. */ - if (likely(base->running_timer != timer)) { - /* See the comment in lock_timer_base() */ - timer_set_base(timer, NULL); - spin_unlock(&base->lock); - base = new_base; - spin_lock(&base->lock); - timer_set_base(timer, base); - } + if (likely(base->running_timer != timer)) + base = switch_timer_base(timer, base, new_base); } timer->expires = expires; -- cgit v1.2.3 From 299539d7705025242c99ab6a19116b0d2f53fa5d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:29:34 -0500 Subject: hrtimers: prepare full preemption Make cancellation of a running callback in softirq context safe against preemption. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 10 ++++++++++ kernel/time/hrtimer.c | 33 ++++++++++++++++++++++++++++++++- kernel/time/itimer.c | 1 + kernel/time/posix-timers.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 1 deletion(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index bd6483d99678..5a3110c11539 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -194,6 +194,9 @@ struct hrtimer_cpu_base { unsigned long nr_retries; unsigned long nr_hangs; ktime_t max_hang_time; +#endif +#ifdef CONFIG_PREEMPT_RT_BASE + wait_queue_head_t wait; #endif struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; }; @@ -382,6 +385,13 @@ static inline int hrtimer_restart(struct hrtimer *timer) return hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* Softirq preemption could deadlock timer removal */ +#ifdef CONFIG_PREEMPT_RT_BASE + extern void hrtimer_wait_for_timer(const struct hrtimer *timer); +#else +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0) +#endif + /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 275533901e09..e90b019c924e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -820,6 +820,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) } EXPORT_SYMBOL_GPL(hrtimer_forward); +#ifdef CONFIG_PREEMPT_RT_BASE +# define wake_up_timer_waiters(b) wake_up(&(b)->wait) + +/** + * hrtimer_wait_for_timer - Wait for a running timer + * + * @timer: timer to wait for + * + * The function waits in case the timers callback function is + * currently executed on the waitqueue of the timer base. The + * waitqueue is woken up after the timer callback function has + * finished execution. 
+ */ +void hrtimer_wait_for_timer(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + + if (base && base->cpu_base && !hrtimer_hres_active(base->cpu_base)) + wait_event(base->cpu_base->wait, + !(timer->state & HRTIMER_STATE_CALLBACK)); +} + +#else +# define wake_up_timer_waiters(b) do { } while (0) +#endif + /* * enqueue_hrtimer - internal function to (re)start a timer * @@ -1082,7 +1108,7 @@ int hrtimer_cancel(struct hrtimer *timer) if (ret >= 0) return ret; - cpu_relax(); + hrtimer_wait_for_timer(timer); } } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -1492,6 +1518,8 @@ void hrtimer_run_queues(void) } raw_spin_unlock(&cpu_base->lock); } + + wake_up_timer_waiters(cpu_base); } /* @@ -1653,6 +1681,9 @@ static void init_hrtimers_cpu(int cpu) cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); +#ifdef CONFIG_PREEMPT_RT_BASE + init_waitqueue_head(&cpu_base->wait); +#endif } #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 8d262b467573..d0513909d663 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -213,6 +213,7 @@ again: /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); + hrtimer_wait_for_timer(&tsk->signal->real_timer); goto again; } expires = timeval_to_ktime(value->it_value); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 35239e500f93..0f5d7eae61f0 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -821,6 +821,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) return overrun; } +/* + * Protected by RCU! + */ +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr) +{ +#ifdef CONFIG_PREEMPT_RT_FULL + if (kc->timer_set == common_timer_set) + hrtimer_wait_for_timer(&timr->it.real.timer); + else + /* FIXME: Whacky hack for posix-cpu-timers */ + schedule_timeout(1); +#endif +} + /* Set a POSIX.1b interval timer. */ /* timr->it_lock is taken. */ static int @@ -898,6 +912,7 @@ retry: if (!timr) return -EINVAL; + rcu_read_lock(); kc = clockid_to_kclock(timr->it_clock); if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; @@ -906,9 +921,12 @@ retry: unlock_timer(timr, flag); if (error == TIMER_RETRY) { + timer_wait_for_callback(kc, timr); rtn = NULL; // We already got the old time... 
+ rcu_read_unlock(); goto retry; } + rcu_read_unlock(); if (old_setting && !error && copy_to_user(old_setting, &old_spec, sizeof (old_spec))) @@ -946,10 +964,15 @@ retry_delete: if (!timer) return -EINVAL; + rcu_read_lock(); if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + timer_wait_for_callback(clockid_to_kclock(timer->it_clock), + timer); + rcu_read_unlock(); goto retry_delete; } + rcu_read_unlock(); spin_lock(¤t->sighand->siglock); list_del(&timer->list); @@ -975,8 +998,18 @@ static void itimer_delete(struct k_itimer *timer) retry_delete: spin_lock_irqsave(&timer->it_lock, flags); + /* On RT we can race with a deletion */ + if (!timer->it_signal) { + unlock_timer(timer, flags); + return; + } + if (timer_delete_hook(timer) == TIMER_RETRY) { + rcu_read_lock(); unlock_timer(timer, flags); + timer_wait_for_callback(clockid_to_kclock(timer->it_clock), + timer); + rcu_read_unlock(); goto retry_delete; } list_del(&timer->list); -- cgit v1.2.3 From 059fa4ffb81421250e37b85e5b6fe9b517846a85 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:31 -0500 Subject: hrtimer: fixup hrtimer callback changes for preempt-rt In preempt-rt we can not call the callbacks which take sleeping locks from the timer interrupt context. Bring back the softirq split for now, until we fixed the signal delivery problem for real. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 3 + kernel/sched/core.c | 1 + kernel/sched/rt.c | 1 + kernel/time/hrtimer.c | 219 +++++++++++++++++++++++++++++++++++++++++------ kernel/time/tick-sched.c | 1 + kernel/watchdog.c | 1 + 6 files changed, 200 insertions(+), 26 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5a3110c11539..32a0af13f438 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -111,6 +111,8 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; + struct list_head cb_entry; + int irqsafe; #ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST ktime_t praecox; #endif @@ -150,6 +152,7 @@ struct hrtimer_clock_base { int index; clockid_t clockid; struct timerqueue_head active; + struct list_head expired; ktime_t resolution; ktime_t (*get_time)(void); ktime_t softirq_time; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c42dfd8b6e76..8bdc95ce668d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -516,6 +516,7 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.irqsafe = 1; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 20bca398084a..fe2eef21ade8 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -43,6 +43,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.irqsafe = 1; rt_b->rt_period_timer.function = sched_rt_period_timer; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e90b019c924e..456cbf24741a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -569,8 +569,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, * When the callback is running, we do not reprogram the clock event * device. 
The timer callback is either running on a different CPU or * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. + * reprogramming is handled at the end of the hrtimer_interrupt. */ if (hrtimer_callback_running(timer)) return 0; @@ -605,6 +604,9 @@ static int hrtimer_reprogram(struct hrtimer *timer, return res; } +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now); +static int hrtimer_rt_defer(struct hrtimer *timer); + /* * Initialize the high resolution related parts of cpu_base */ @@ -614,6 +616,21 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base, + int wakeup) +{ + if (!hrtimer_reprogram(timer, base)) + return 0; + if (!wakeup) + return -ETIME; +#ifdef CONFIG_PREEMPT_RT_BASE + if (!hrtimer_rt_defer(timer)) + return -ETIME; +#endif + return 1; +} + static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; @@ -695,6 +712,13 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base, + int wakeup) +{ + return 0; +} + static inline int hrtimer_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { @@ -702,7 +726,6 @@ static inline int hrtimer_reprogram(struct hrtimer *timer, } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void retrigger_next_event(void *arg) { } - #endif /* CONFIG_HIGH_RES_TIMERS */ /* @@ -837,9 +860,9 @@ void hrtimer_wait_for_timer(const struct hrtimer *timer) { struct hrtimer_clock_base *base = timer->base; - if (base && base->cpu_base && !hrtimer_hres_active(base->cpu_base)) + if (base && base->cpu_base && !timer->irqsafe) wait_event(base->cpu_base->wait, - !(timer->state & HRTIMER_STATE_CALLBACK)); + !(timer->state & HRTIMER_STATE_CALLBACK)); } #else @@ -889,6 +912,11 @@ static void __remove_hrtimer(struct hrtimer *timer, if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; + if (unlikely(!list_empty(&timer->cb_entry))) { + list_del_init(&timer->cb_entry); + goto out; + } + next_timer = timerqueue_getnext(&base->active); timerqueue_del(&base->active, &timer->node); if (&timer->node == next_timer) { @@ -999,15 +1027,26 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * on dynticks target. */ wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && - hrtimer_reprogram(timer, new_base)) { + } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases)) { + + ret = hrtimer_enqueue_reprogram(timer, new_base, wakeup); + if (ret < 0) { + /* + * In case we failed to reprogram the timer (mostly + * because out current timer is already elapsed), + * remove it again and report a failure. This avoids + * stale base->first entries. + */ + debug_deactivate(timer); + __remove_hrtimer(timer, new_base, + timer->state & HRTIMER_STATE_CALLBACK, 0); + } else if (ret > 0) { /* * Only allow reprogramming if the new base is on this CPU. * (it might still be on another CPU if the timer was pending) * * XXX send_remote_softirq() ? 
*/ - if (wakeup) { /* * We need to drop cpu_base->lock to avoid a * lock ordering issue vs. rq->lock. @@ -1015,9 +1054,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, raw_spin_unlock(&new_base->cpu_base->lock); raise_softirq_irqoff(HRTIMER_SOFTIRQ); local_irq_restore(flags); - return ret; - } else { - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + return 0; } } @@ -1187,6 +1224,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = hrtimer_clockid_to_base(clock_id); timer->base = &cpu_base->clock_base[base]; + INIT_LIST_HEAD(&timer->cb_entry); timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS @@ -1270,10 +1308,128 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) timer->state &= ~HRTIMER_STATE_CALLBACK; } -#ifdef CONFIG_HIGH_RES_TIMERS - static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer); +#ifdef CONFIG_PREEMPT_RT_BASE +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + /* + * Note, we clear the callback flag before we requeue the + * timer otherwise we trigger the callback_running() check + * in hrtimer_reprogram(). + */ + timer->state &= ~HRTIMER_STATE_CALLBACK; + + if (restart != HRTIMER_NORESTART) { + BUG_ON(hrtimer_active(timer)); + /* + * Enqueue the timer, if it's the leftmost timer then + * we need to reprogram it. + */ + if (!enqueue_hrtimer(timer, base)) + return; + +#ifndef CONFIG_HIGH_RES_TIMERS + } +#else + if (base->cpu_base->hres_active && + hrtimer_reprogram(timer, base)) + goto requeue; + + } else if (hrtimer_active(timer)) { + /* + * If the timer was rearmed on another CPU, reprogram + * the event device. + */ + if (&timer->node == base->active.next && + base->cpu_base->hres_active && + hrtimer_reprogram(timer, base)) + goto requeue; + } + return; + +requeue: + /* + * Timer is expired. Thus move it from tree to pending list + * again. + */ + __remove_hrtimer(timer, base, timer->state, 0); + list_add_tail(&timer->cb_entry, &base->expired); +#endif +} + +/* + * The changes in mainline which removed the callback modes from + * hrtimer are not yet working with -rt. The non wakeup_process() + * based callbacks which involve sleeping locks need to be treated + * seperately. + */ +static void hrtimer_rt_run_pending(void) +{ + enum hrtimer_restart (*fn)(struct hrtimer *); + struct hrtimer_cpu_base *cpu_base; + struct hrtimer_clock_base *base; + struct hrtimer *timer; + int index, restart; + + local_irq_disable(); + cpu_base = &per_cpu(hrtimer_bases, smp_processor_id()); + + raw_spin_lock(&cpu_base->lock); + + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { + base = &cpu_base->clock_base[index]; + + while (!list_empty(&base->expired)) { + timer = list_first_entry(&base->expired, + struct hrtimer, cb_entry); + + /* + * Same as the above __run_hrtimer function + * just we run with interrupts enabled. 
+ */ + debug_hrtimer_deactivate(timer); + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + fn = timer->function; + + raw_spin_unlock_irq(&cpu_base->lock); + restart = fn(timer); + raw_spin_lock_irq(&cpu_base->lock); + + hrtimer_rt_reprogram(restart, timer, base); + } + } + + raw_spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); +} + +static int hrtimer_rt_defer(struct hrtimer *timer) +{ + if (timer->irqsafe) + return 0; + + __remove_hrtimer(timer, timer->base, timer->state, 0); + list_add_tail(&timer->cb_entry, &timer->base->expired); + return 1; +} + +#else + +static inline void hrtimer_rt_run_pending(void) +{ + hrtimer_peek_ahead_timers(); +} + +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; } + +#endif + +#ifdef CONFIG_HIGH_RES_TIMERS + /* * High resolution timer interrupt * Called with interrupts disabled @@ -1282,7 +1438,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; - int i, retries = 0; + int i, retries = 0, raise = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; @@ -1351,7 +1507,10 @@ retry: break; } - __run_hrtimer(timer, &basenow); + if (!hrtimer_rt_defer(timer)) + __run_hrtimer(timer, &basenow); + else + raise = 1; } } @@ -1366,6 +1525,10 @@ retry: if (expires_next.tv64 == KTIME_MAX || !tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; + + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + return; } @@ -1445,18 +1608,18 @@ void hrtimer_peek_ahead_timers(void) __hrtimer_peek_ahead_timers(); local_irq_restore(flags); } - -static void run_hrtimer_softirq(struct softirq_action *h) -{ - hrtimer_peek_ahead_timers(); -} - #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ + +static void run_hrtimer_softirq(struct softirq_action *h) +{ + hrtimer_rt_run_pending(); +} + /* * Called from timer softirq every jiffy, expire hrtimers: * @@ -1489,7 +1652,7 @@ void hrtimer_run_queues(void) struct timerqueue_node *node; struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base; - int index, gettime = 1; + int index, gettime = 1, raise = 0; if (hrtimer_hres_active()) return; @@ -1514,12 +1677,16 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - __run_hrtimer(timer, &base->softirq_time); + if (!hrtimer_rt_defer(timer)) + __run_hrtimer(timer, &base->softirq_time); + else + raise = 1; } raw_spin_unlock(&cpu_base->lock); } - wake_up_timer_waiters(cpu_base); + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); } /* @@ -1541,6 +1708,7 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; + sl->timer.irqsafe = 1; sl->task = task; } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); @@ -1677,6 +1845,7 @@ static void init_hrtimers_cpu(int cpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; timerqueue_init_head(&cpu_base->clock_base[i].active); + INIT_LIST_HEAD(&cpu_base->clock_base[i].expired); } cpu_base->cpu = cpu; @@ -1796,9 +1965,7 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS 
open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif } /** diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ad413989896a..54c358be4d8e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1161,6 +1161,7 @@ void tick_setup_sched_timer(void) * Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ts->sched_timer.irqsafe = 1; ts->sched_timer.function = tick_sched_timer; /* Get the next period (per cpu) */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 085bd3c641c7..64f44cfb3bdc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -440,6 +440,7 @@ static void watchdog_enable(unsigned int cpu) /* kick off the timer for the hardlockup detector */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; + hrtimer->irqsafe = 1; /* Enable the perf event */ watchdog_nmi_enable(cpu); -- cgit v1.2.3 From 6a126e5054c45d0351ac99df5cf2c11e6dd0dd2a Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 13 May 2014 15:30:20 +0200 Subject: sched/deadline: dl_task_timer has to be irqsafe As for rt_period_timer, dl_task_timer has to be irqsafe. Signed-off-by: Juri Lelli Signed-off-by: Sebastian Andrzej Siewior --- kernel/sched/deadline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 40a97c3d8aba..1ded49d7d652 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -570,6 +570,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); timer->function = dl_task_timer; + timer->irqsafe = 1; } static -- cgit v1.2.3 From 1377b30d5db0eeae88c905e539925216d2c4340f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jan 2012 11:08:40 +0100 Subject: timer-fd: Prevent live lock If hrtimer_try_to_cancel() requires a retry, then depending on the priority setting te retry loop might prevent timer callback completion on RT. Prevent that by waiting for completion on RT, no change for a non RT kernel. Reported-by: Sankara Muthukrishnan Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- fs/timerfd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/timerfd.c b/fs/timerfd.c index b46ffa94372a..d75bc1b55c3c 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -449,7 +449,10 @@ static int do_timerfd_settime(int ufd, int flags, break; } spin_unlock_irq(&ctx->wqh.lock); - cpu_relax(); + if (isalarm(ctx)) + hrtimer_wait_for_timer(&ctx->t.alarm.timer); + else + hrtimer_wait_for_timer(&ctx->t.tmr); } /* -- cgit v1.2.3 From 91c1fcdb00db6a77ca6d5a2b8931cc5bf1c7cf5a Mon Sep 17 00:00:00 2001 From: Watanabe Date: Sun, 28 Oct 2012 11:13:44 +0100 Subject: hrtimer: Raise softirq if hrtimer irq stalled When the hrtimer stall detection hits the softirq is not raised. 
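Condensed, the fix funnels every exit of hrtimer_interrupt() through a single label so the deferred-timer softirq is raised even on the early-return/stall paths (sketch based on the hunks below, not a standalone snippet):

        if (expires_next.tv64 == KTIME_MAX ||
            !tick_program_event(expires_next, 0)) {
            cpu_base->hang_detected = 0;
            goto out;
        }
        /* ... hang detection and retry limiting ... */
    out:
        if (raise)
            raise_softirq_irqoff(HRTIMER_SOFTIRQ);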
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- kernel/time/hrtimer.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 456cbf24741a..24dcdb215272 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1525,11 +1525,7 @@ retry: if (expires_next.tv64 == KTIME_MAX || !tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; - - if (raise) - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - - return; + goto out; } /* @@ -1573,6 +1569,9 @@ retry: tick_program_event(expires_next, 1); printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); +out: + if (raise) + raise_softirq_irqoff(HRTIMER_SOFTIRQ); } /* -- cgit v1.2.3 From 771d110858f5fd46befb9df3fc059e5722fee5be Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 16 Sep 2013 14:09:19 -0700 Subject: hrtimer: Move schedule_work call to helper thread When run ltp leapsec_timer test, the following call trace is caught: BUG: sleeping function called from invalid context at kernel/rtmutex.c:659 in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: swapper/1 Preemption disabled at:[] cpu_startup_entry+0x133/0x310 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.10.10-rt3 #2 Hardware name: Intel Corporation Calpella platform/MATXM-CORE-411-B, BIOS 4.6.3 08/18/2010 ffffffff81c2f800 ffff880076843e40 ffffffff8169918d ffff880076843e58 ffffffff8106db31 ffff88007684b4a0 ffff880076843e70 ffffffff8169d9c0 ffff88007684b4a0 ffff880076843eb0 ffffffff81059da1 0000001876851200 Call Trace: [] dump_stack+0x19/0x1b [] __might_sleep+0xf1/0x170 [] rt_spin_lock+0x20/0x50 [] queue_work_on+0x61/0x100 [] clock_was_set_delayed+0x21/0x30 [] do_timer+0x40e/0x660 [] tick_do_update_jiffies64+0xf7/0x140 [] tick_check_idle+0x92/0xc0 [] irq_enter+0x57/0x70 [] smp_apic_timer_interrupt+0x3e/0x9b [] apic_timer_interrupt+0x6a/0x70 [] ? cpuidle_enter_state+0x4c/0xc0 [] cpuidle_idle_call+0xd8/0x2d0 [] arch_cpu_idle+0xe/0x30 [] cpu_startup_entry+0x19e/0x310 [] start_secondary+0x1ad/0x1b0 The clock_was_set_delayed is called in hard IRQ handler (timer interrupt), which calls schedule_work. Under PREEMPT_RT_FULL, schedule_work calls spinlocks which could sleep, so it's not safe to call schedule_work in interrupt context. Reference upstream commit b68d61c705ef02384c0538b8d9374545097899ca (rt,ntp: Move call to schedule_delayed_work() to helper thread) from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git, which makes a similar change. add a helper thread which does the call to schedule_work and wake up that thread instead of calling schedule_work directly. Cc: stable-rt@vger.kernel.org Signed-off-by: Yang Shi Signed-off-by: Sebastian Andrzej Siewior --- kernel/time/hrtimer.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 24dcdb215272..20a6809b4b02 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -696,6 +697,44 @@ static void clock_was_set_work(struct work_struct *work) static DECLARE_WORK(hrtimer_work, clock_was_set_work); +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * RT can not call schedule_work from real interrupt context. + * Need to make a thread to do the real work. 
+ */ +static struct task_struct *clock_set_delay_thread; +static bool do_clock_set_delay; + +static int run_clock_set_delay(void *ignore) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (do_clock_set_delay) { + do_clock_set_delay = false; + schedule_work(&hrtimer_work); + } + schedule(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +void clock_was_set_delayed(void) +{ + do_clock_set_delay = true; + /* Make visible before waking up process */ + smp_wmb(); + wake_up_process(clock_set_delay_thread); +} + +static __init int create_clock_set_delay_thread(void) +{ + clock_set_delay_thread = kthread_run(run_clock_set_delay, NULL, "kclksetdelayd"); + BUG_ON(!clock_set_delay_thread); + return 0; +} +early_initcall(create_clock_set_delay_thread); +#else /* PREEMPT_RT_FULL */ /* * Called from timekeeping and resume code to reprogramm the hrtimer * interrupt device on all cpus. @@ -704,6 +743,7 @@ void clock_was_set_delayed(void) { schedule_work(&hrtimer_work); } +#endif #else -- cgit v1.2.3 From be7daa4caee7627e6b8d5ec63d3af4fd6447b0fc Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 3 Jul 2009 08:29:58 -0500 Subject: posix-timers: thread posix-cpu-timers on -rt posix-cpu-timer code takes non -rt safe locks in hard irq context. Move it to a thread. [ 3.0 fixes from Peter Zijlstra ] Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner --- include/linux/init_task.h | 7 ++ include/linux/sched.h | 3 + kernel/fork.c | 3 + kernel/time/posix-cpu-timers.c | 183 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 190 insertions(+), 6 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 43e8c0ebe108..dc778344d298 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -147,6 +147,12 @@ extern struct task_group root_task_group; # define INIT_PERF_EVENTS(tsk) #endif +#ifdef CONFIG_PREEMPT_RT_BASE +# define INIT_TIMER_LIST .posix_timer_list = NULL, +#else +# define INIT_TIMER_LIST +#endif + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN # define INIT_VTIME(tsk) \ .vtime_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.vtime_lock), \ @@ -220,6 +226,7 @@ extern struct task_group root_task_group; .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ + INIT_TIMER_LIST \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 916d161f992f..cf2145a61663 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1386,6 +1386,9 @@ struct task_struct { struct task_cputime cputime_expires; struct list_head cpu_timers[3]; +#ifdef CONFIG_PREEMPT_RT_BASE + struct task_struct *posix_timer_list; +#endif /* process credentials */ const struct cred __rcu *real_cred; /* objective and real subjective task diff --git a/kernel/fork.c b/kernel/fork.c index b43d8908b8d6..576374586884 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1157,6 +1157,9 @@ static void rt_mutex_init_task(struct task_struct *p) */ static void posix_cpu_timers_init(struct task_struct *tsk) { +#ifdef CONFIG_PREEMPT_RT_BASE + tsk->posix_timer_list = NULL; +#endif tsk->cputime_expires.prof_exp = 0; tsk->cputime_expires.virt_exp = 0; tsk->cputime_expires.sched_exp = 0; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a16b67859e2a..65a297b1cb83 100644 --- a/kernel/time/posix-cpu-timers.c +++ 
b/kernel/time/posix-cpu-timers.c @@ -3,6 +3,7 @@ */ #include +#include #include #include #include @@ -626,7 +627,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, /* * Disarm any old timer after extracting its expiry time. */ - WARN_ON_ONCE(!irqs_disabled()); + WARN_ON_ONCE_NONRT(!irqs_disabled()); ret = 0; old_incr = timer->it.cpu.incr; @@ -1047,7 +1048,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - WARN_ON_ONCE(!irqs_disabled()); + WARN_ON_ONCE_NONRT(!irqs_disabled()); arm_timer(timer); unlock_task_sighand(p, &flags); @@ -1113,10 +1114,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk) sig = tsk->signal; if (sig->cputimer.running) { struct task_cputime group_sample; + unsigned long flags; - raw_spin_lock(&sig->cputimer.lock); + raw_spin_lock_irqsave(&sig->cputimer.lock, flags); group_sample = sig->cputimer.cputime; - raw_spin_unlock(&sig->cputimer.lock); + raw_spin_unlock_irqrestore(&sig->cputimer.lock, flags); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; @@ -1130,13 +1132,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk) * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. */ -void run_posix_cpu_timers(struct task_struct *tsk) +static void __run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; unsigned long flags; - WARN_ON_ONCE(!irqs_disabled()); + WARN_ON_ONCE_NONRT(!irqs_disabled()); /* * The fast path checks that there are no expired thread or thread @@ -1194,6 +1196,175 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } +#ifdef CONFIG_PREEMPT_RT_BASE +#include +#include +DEFINE_PER_CPU(struct task_struct *, posix_timer_task); +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); + +static int posix_cpu_timers_thread(void *data) +{ + int cpu = (long)data; + + BUG_ON(per_cpu(posix_timer_task,cpu) != current); + + while (!kthread_should_stop()) { + struct task_struct *tsk = NULL; + struct task_struct *next = NULL; + + if (cpu_is_offline(cpu)) + goto wait_to_die; + + /* grab task list */ + raw_local_irq_disable(); + tsk = per_cpu(posix_timer_tasklist, cpu); + per_cpu(posix_timer_tasklist, cpu) = NULL; + raw_local_irq_enable(); + + /* its possible the list is empty, just return */ + if (!tsk) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + continue; + } + + /* Process task list */ + while (1) { + /* save next */ + next = tsk->posix_timer_list; + + /* run the task timers, clear its ptr and + * unreference it + */ + __run_posix_cpu_timers(tsk); + tsk->posix_timer_list = NULL; + put_task_struct(tsk); + + /* check if this is the last on the list */ + if (next == tsk) + break; + tsk = next; + } + } + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +void run_posix_cpu_timers(struct task_struct *tsk) +{ + unsigned long cpu = smp_processor_id(); + struct task_struct *tasklist; + + BUG_ON(!irqs_disabled()); + if(!per_cpu(posix_timer_task, cpu)) + return; + /* get per-cpu references */ + tasklist = per_cpu(posix_timer_tasklist, cpu); + + /* check to see if we're already queued */ + if (!tsk->posix_timer_list) { + get_task_struct(tsk); + if (tasklist) { + tsk->posix_timer_list = tasklist; + } else { + /* 
+ * The list is terminated by a self-pointing + * task_struct + */ + tsk->posix_timer_list = tsk; + } + per_cpu(posix_timer_tasklist, cpu) = tsk; + } + /* XXX signal the thread somehow */ + wake_up_process(per_cpu(posix_timer_task, cpu)); +} + +/* + * posix_cpu_thread_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int posix_cpu_thread_call(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct sched_param param; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(posix_cpu_timers_thread, hcpu, + "posix_cpu_timers/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio to avoid getting starved */ + param.sched_priority = MAX_RT_PRIO-1; + sched_setscheduler(p, SCHED_FIFO, ¶m); + per_cpu(posix_timer_task,cpu) = p; + break; + case CPU_ONLINE: + /* Strictly unneccessary, as first user will wake it. */ + wake_up_process(per_cpu(posix_timer_task,cpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. */ + kthread_bind(per_cpu(posix_timer_task, cpu), + cpumask_any(cpu_online_mask)); + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; + case CPU_DEAD: + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block posix_cpu_thread_notifier = { + .notifier_call = posix_cpu_thread_call, + .priority = 10 +}; + +static int __init posix_cpu_thread_init(void) +{ + void *hcpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + unsigned long cpu; + + /* init the per-cpu posix_timer_tasklets */ + for_each_possible_cpu(cpu) + per_cpu(posix_timer_tasklist, cpu) = NULL; + + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu); + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu); + register_cpu_notifier(&posix_cpu_thread_notifier); + return 0; +} +early_initcall(posix_cpu_thread_init); +#else /* CONFIG_PREEMPT_RT_BASE */ +void run_posix_cpu_timers(struct task_struct *tsk) +{ + __run_posix_cpu_timers(tsk); +} +#endif /* CONFIG_PREEMPT_RT_BASE */ + /* * Set one of the process-wide special case CPU timers or RLIMIT_CPU. * The tsk->sighand->siglock must be held by the caller. -- cgit v1.2.3 From 32c23ce7f43eb7a6949c1d546b93d43dfb330ad6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 3 Jul 2009 08:30:00 -0500 Subject: posix-timers: Shorten posix_cpu_timers/ kernel thread names Shorten the softirq kernel thread names because they always overflow the limited comm length, appearing as "posix_cpu_timer" CPU# times. 
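[ Illustrative sketch, not part of the patch: task comm names are clipped to TASK_COMM_LEN (16 bytes including the trailing NUL), so "posix_cpu_timers/%d" always truncates to the same 15-character "posix_cpu_timer" while the shorter name keeps the CPU number visible. A standalone demonstration using snprintf() as an analogue of the kernel's comm formatting: ]

#include <stdio.h>

#define TASK_COMM_LEN 16	/* kernel limit, includes the trailing NUL */

int main(void)
{
	char comm[TASK_COMM_LEN];
	int cpu;

	for (cpu = 0; cpu < 2; cpu++) {
		snprintf(comm, sizeof(comm), "posix_cpu_timers/%d", cpu);
		printf("old name, cpu%d: %s\n", cpu, comm);	/* "posix_cpu_timer" */
		snprintf(comm, sizeof(comm), "posixcputmr/%d", cpu);
		printf("new name, cpu%d: %s\n", cpu, comm);	/* "posixcputmr/0", ... */
	}
	return 0;
}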
Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/posix-cpu-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 65a297b1cb83..a2a4e485b960 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1303,7 +1303,7 @@ static int posix_cpu_thread_call(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: p = kthread_create(posix_cpu_timers_thread, hcpu, - "posix_cpu_timers/%d",cpu); + "posixcputmr/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; p->flags |= PF_NOFREEZE; -- cgit v1.2.3 From 194466974aa17702dbc898e2dced484224049bdf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 08:44:44 -0500 Subject: posix-timers: Avoid wakeups when no timers are active Waking the thread even when no timers are scheduled is useless. Signed-off-by: Thomas Gleixner --- kernel/time/posix-cpu-timers.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a2a4e485b960..79450c994e07 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1260,6 +1260,21 @@ wait_to_die: return 0; } +static inline int __fastpath_timer_check(struct task_struct *tsk) +{ + /* tsk == current, ensure it is safe to use ->signal/sighand */ + if (unlikely(tsk->exit_state)) + return 0; + + if (!task_cputime_zero(&tsk->cputime_expires)) + return 1; + + if (!task_cputime_zero(&tsk->signal->cputime_expires)) + return 1; + + return 0; +} + void run_posix_cpu_timers(struct task_struct *tsk) { unsigned long cpu = smp_processor_id(); @@ -1272,7 +1287,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) tasklist = per_cpu(posix_timer_tasklist, cpu); /* check to see if we're already queued */ - if (!tsk->posix_timer_list) { + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) { get_task_struct(tsk); if (tasklist) { tsk->posix_timer_list = tasklist; @@ -1284,9 +1299,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) tsk->posix_timer_list = tsk; } per_cpu(posix_timer_tasklist, cpu) = tsk; + + wake_up_process(per_cpu(posix_timer_task, cpu)); } - /* XXX signal the thread somehow */ - wake_up_process(per_cpu(posix_timer_task, cpu)); } /* -- cgit v1.2.3 From ac9fcb69c9a57d83a17eea6940fd91909f772244 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 May 2011 16:59:16 +0200 Subject: sched-delay-put-task.patch Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 13 +++++++++++++ kernel/fork.c | 15 ++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index cf2145a61663..82071d6e5c9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1677,6 +1677,9 @@ struct task_struct { unsigned int sequential_io; unsigned int sequential_io_avg; #endif +#ifdef CONFIG_PREEMPT_RT_BASE + struct rcu_head put_rcu; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ @@ -1880,6 +1883,15 @@ extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#ifdef CONFIG_PREEMPT_RT_BASE +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->put_rcu, __put_task_struct_cb); +} +#else extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) @@ -1887,6 +1899,7 @@ static inline void put_task_struct(struct task_struct *t) if (atomic_dec_and_test(&t->usage)) __put_task_struct(t); } +#endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void task_cputime(struct task_struct *t, diff --git a/kernel/fork.c b/kernel/fork.c index 576374586884..6a0fd81c7505 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -233,7 +233,9 @@ static inline void put_signal_struct(struct signal_struct *sig) if (atomic_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } - +#ifdef CONFIG_PREEMPT_RT_BASE +static +#endif void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); @@ -249,7 +251,18 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } +#ifndef CONFIG_PREEMPT_RT_BASE EXPORT_SYMBOL_GPL(__put_task_struct); +#else +void __put_task_struct_cb(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu); + + __put_task_struct(tsk); + +} +EXPORT_SYMBOL_GPL(__put_task_struct_cb); +#endif void __init __weak arch_task_cache_init(void) { } -- cgit v1.2.3 From b18e8611b2d4fe021852162f44c0550bbfbb9922 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Jun 2011 12:12:51 +0200 Subject: sched-limit-nr-migrate.patch Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8bdc95ce668d..60c12834cc26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -280,7 +280,11 @@ late_initcall(sched_init_debug); * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ +#ifndef CONFIG_PREEMPT_RT_FULL const_debug unsigned int sysctl_sched_nr_migrate = 32; +#else +const_debug unsigned int sysctl_sched_nr_migrate = 8; +#endif /* * period over which we average the RT time consumption, measured -- cgit v1.2.3 From a532b471f2d367387db29966727cdcb2809d5b4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Jun 2011 12:20:33 +0200 Subject: sched-mmdrop-delayed.patch Needs thread context (pgd_lock) -> ifdeffed. 
workqueues wont work with RT Signed-off-by: Thomas Gleixner --- include/linux/mm_types.h | 4 ++++ include/linux/sched.h | 12 ++++++++++++ kernel/fork.c | 13 +++++++++++++ kernel/sched/core.c | 18 ++++++++++++++++-- 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6e0b286649f1..5e4b03bcf1a0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -454,6 +455,9 @@ struct mm_struct { bool tlb_flush_pending; #endif struct uprobes_state uprobes_state; +#ifdef CONFIG_PREEMPT_RT_BASE + struct rcu_head delayed_drop; +#endif }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index 82071d6e5c9f..fda0f1af7f8c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2446,12 +2446,24 @@ extern struct mm_struct * mm_alloc(void); /* mmdrop drops the mm and the page tables */ extern void __mmdrop(struct mm_struct *); + static inline void mmdrop(struct mm_struct * mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +#ifdef CONFIG_PREEMPT_RT_BASE +extern void __mmdrop_delayed(struct rcu_head *rhp); +static inline void mmdrop_delayed(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +# define mmdrop_delayed(mm) mmdrop(mm) +#endif + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ diff --git a/kernel/fork.c b/kernel/fork.c index 6a0fd81c7505..023f585bf86b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -656,6 +656,19 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +#ifdef CONFIG_PREEMPT_RT_BASE +/* + * RCU callback for delayed mm drop. Not strictly rcu, but we don't + * want another facility to make this work. + */ +void __mmdrop_delayed(struct rcu_head *rhp) +{ + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); + + __mmdrop(mm); +} +#endif + /* * Decrement the use count and release all resources for an mm. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 60c12834cc26..a590bc9fa885 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2292,8 +2292,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); + /* + * We use mmdrop_delayed() here so we don't have to do the + * full __mmdrop() when we are the last user. + */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -4916,6 +4920,8 @@ static int migration_cpu_stop(void *data) #ifdef CONFIG_HOTPLUG_CPU +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm); + /* * Ensures that the idle task is using init_mm right before its cpu goes * offline. @@ -4930,7 +4936,11 @@ void idle_task_exit(void) switch_mm(mm, &init_mm, current); finish_arch_post_lock_switch(); } - mmdrop(mm); + /* + * Defer the cleanup to an alive cpu. On RT we can neither + * call mmdrop() nor mmdrop_delayed() from here. 
+ */ + per_cpu(idle_last_mm, smp_processor_id()) = mm; } /* @@ -5273,6 +5283,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD: calc_load_migrate(rq); + if (per_cpu(idle_last_mm, cpu)) { + mmdrop(per_cpu(idle_last_mm, cpu)); + per_cpu(idle_last_mm, cpu) = NULL; + } break; #endif } -- cgit v1.2.3 From f8501f1f0e7bafbfa97ff9c1c90bc8a57a62d3a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jun 2011 09:21:04 +0200 Subject: sched-rt-mutex-wakeup.patch Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 2 ++ kernel/sched/core.c | 31 ++++++++++++++++++++++++++++++- kernel/sched/sched.h | 1 + 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index fda0f1af7f8c..87fd0df1d2bb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1232,6 +1232,7 @@ enum perf_event_task_context { struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ + volatile long saved_state; /* saved state for "spinlock sleepers" */ void *stack; atomic_t usage; unsigned int flags; /* per process flags, defined below */ @@ -2330,6 +2331,7 @@ extern void xtime_update(unsigned long ticks); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); +extern int wake_up_lock_sleeper(struct task_struct * tsk); extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a590bc9fa885..2c3c83c275b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1721,8 +1721,25 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); - if (!(p->state & state)) + if (!(p->state & state)) { + /* + * The task might be running due to a spinlock sleeper + * wakeup. Check the saved state and set it to running + * if the wakeup condition is true. + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) { + if (p->saved_state & state) + p->saved_state = TASK_RUNNING; + } goto out; + } + + /* + * If this is a regular wakeup, then we can unconditionally + * clear the saved state of a "lock sleeper". + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) + p->saved_state = TASK_RUNNING; success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -1819,6 +1836,18 @@ int wake_up_process(struct task_struct *p) } EXPORT_SYMBOL(wake_up_process); +/** + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" + * @p: The process to be woken up. + * + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate + * the nature of the wakeup. 
+ */ +int wake_up_lock_sleeper(struct task_struct *p) +{ + return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER); +} + int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2df8ef067cc5..bb191c584970 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1018,6 +1018,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x4 /* internal use, task got migrated */ +#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution -- cgit v1.2.3 From f79caa952c078ee282bbbc20e3fcd93e04569b46 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jun 2011 09:19:06 +0200 Subject: sched-might-sleep-do-not-account-rcu-depth.patch Signed-off-by: Thomas Gleixner --- include/linux/rcupdate.h | 7 +++++++ kernel/sched/core.c | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 53ff1a752d7e..86bff6fcac2e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -231,6 +231,11 @@ void synchronize_rcu(void); * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ #define rcu_preempt_depth() (current->rcu_read_lock_nesting) +#ifndef CONFIG_PREEMPT_RT_FULL +#define sched_rcu_preempt_depth() rcu_preempt_depth() +#else +static inline int sched_rcu_preempt_depth(void) { return 0; } +#endif #else /* #ifdef CONFIG_PREEMPT_RCU */ @@ -254,6 +259,8 @@ static inline int rcu_preempt_depth(void) return 0; } +#define sched_rcu_preempt_depth() rcu_preempt_depth() + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* Internal to kernel */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2c3c83c275b5..ebe0dab443ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7257,7 +7257,8 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + + sched_rcu_preempt_depth(); return (nested == preempt_offset); } -- cgit v1.2.3 From 48256d0cb6180b225a2a8ee3a0045c28fa233d1f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jun 2011 11:25:03 +0200 Subject: sched-cond-resched.patch Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ebe0dab443ba..221990a80edf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4300,9 +4300,16 @@ SYSCALL_DEFINE0(sched_yield) static void __cond_resched(void) { - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); + do { + __preempt_count_add(PREEMPT_ACTIVE); + __schedule(); + __preempt_count_sub(PREEMPT_ACTIVE); + /* + * Check again in case we missed a preemption + * opportunity between schedule and now. 
+ */ + barrier(); + } while (need_resched()); } int __sched _cond_resched(void) -- cgit v1.2.3 From 87c99b2dd5805c3f51c742d99f974674494b6ba9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jul 2011 09:56:44 +0200 Subject: cond-resched-softirq-fix.patch Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 4 ++++ kernel/sched/core.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 87fd0df1d2bb..d7606a8873ff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2876,12 +2876,16 @@ extern int __cond_resched_lock(spinlock_t *lock); __cond_resched_lock(lock); \ }) +#ifndef CONFIG_PREEMPT_RT_FULL extern int __cond_resched_softirq(void); #define cond_resched_softirq() ({ \ __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ __cond_resched_softirq(); \ }) +#else +# define cond_resched_softirq() cond_resched() +#endif static inline void cond_resched_rcu(void) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 221990a80edf..9f25e1ff71f9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4350,6 +4350,7 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); +#ifndef CONFIG_PREEMPT_RT_FULL int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -4363,6 +4364,7 @@ int __sched __cond_resched_softirq(void) return 0; } EXPORT_SYMBOL(__cond_resched_softirq); +#endif /** * yield - yield the current processor to other threads. -- cgit v1.2.3 From fbae77de8e7b0ed4490806aaaf343b46a6e1577a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 22:51:33 +0200 Subject: cond-resched-lock-rt-tweak.patch Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d7606a8873ff..dd00341f535f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2865,7 +2865,7 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); -#ifdef CONFIG_PREEMPT_COUNT +#if defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT_FULL) #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET #else #define PREEMPT_LOCK_OFFSET 0 -- cgit v1.2.3 From 8a631ded124373d735582ec1e10fbfc3d6e0616e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Sep 2011 16:42:35 +0200 Subject: sched-disable-ttwu-queue.patch Signed-off-by: Thomas Gleixner --- kernel/sched/features.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..10552a509306 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -50,12 +50,15 @@ SCHED_FEAT(LB_BIAS, true) */ SCHED_FEAT(NONTASK_CAPACITY, true) +#ifdef CONFIG_PREEMPT_RT_FULL +SCHED_FEAT(TTWU_QUEUE, false) +#else /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ SCHED_FEAT(TTWU_QUEUE, true) - +#endif SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) -- cgit v1.2.3 From 70c879e54cb6918539428413a0511371fc548ab6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 17:03:52 +0200 Subject: sched: Disable CONFIG_RT_GROUP_SCHED on RT Carsten reported problems when running: taskset 01 chrt -f 1 sleep 1 from within rc.local on a F15 machine. The task stays running and never gets on the run queue because some of the run queues have rt_throttled=1 which does not go away. 
Works nice from a ssh login shell. Disabling CONFIG_RT_GROUP_SCHED solves that as well. Signed-off-by: Thomas Gleixner --- init/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/init/Kconfig b/init/Kconfig index c3fb8ceb3ecd..1e747ee4f780 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1106,6 +1106,7 @@ config CFS_BANDWIDTH config RT_GROUP_SCHED bool "Group scheduling for SCHED_RR/FIFO" depends on CGROUP_SCHED + depends on !PREEMPT_RT_FULL default n help This feature lets you explicitly allocate real CPU bandwidth -- cgit v1.2.3 From 51a69a8d78def5f37ae217b4cbb461b75010e809 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Dec 2011 21:42:19 +0100 Subject: sched: ttwu: Return success when only changing the saved_state value When a task blocks on a rt lock, it saves the current state in p->saved_state, so a lock related wake up will not destroy the original state. When a real wakeup happens, while the task is running due to a lock wakeup already, we update p->saved_state to TASK_RUNNING, but we do not return success, which might cause another wakeup in the waitqueue code and the task remains in the waitqueue list. Return success in that case as well. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9f25e1ff71f9..684ac9c50017 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1728,8 +1728,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * if the wakeup condition is true. */ if (!(wake_flags & WF_LOCK_SLEEPER)) { - if (p->saved_state & state) + if (p->saved_state & state) { p->saved_state = TASK_RUNNING; + success = 1; + } } goto out; } -- cgit v1.2.3 From 1cb2be0a1ed5ab7f759546e071f33e07e2733411 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 18 Mar 2013 15:12:49 -0400 Subject: sched/workqueue: Only wake up idle workers if not blocked on sleeping spin lock In -rt, most spin_locks() turn into mutexes. One of these spin_lock conversions is performed on the workqueue gcwq->lock. When the idle worker is worken, the first thing it will do is grab that same lock and it too will block, possibly jumping into the same code, but because nr_running would already be decremented it prevents an infinite loop. But this is still a waste of CPU cycles, and it doesn't follow the method of mainline, as new workers should only be woken when a worker thread is truly going to sleep, and not just blocked on a spin_lock(). Check the saved_state too before waking up new workers. Cc: stable-rt@vger.kernel.org Signed-off-by: Steven Rostedt Signed-off-by: Sebastian Andrzej Siewior --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 684ac9c50017..46edadf76c68 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2868,8 +2868,10 @@ need_resched: * If a worker went to sleep, notify and ask workqueue * whether it wants to wake up a task to maintain * concurrency. + * Only call wake up if prev isn't blocked on a sleeping + * spin lock. 
*/ - if (prev->flags & PF_WQ_WORKER) { + if (prev->flags & PF_WQ_WORKER && !prev->saved_state) { struct task_struct *to_wakeup; to_wakeup = wq_worker_sleeping(prev, cpu); -- cgit v1.2.3 From fbc69406534e46286d00dcae6c7c6962023c2fac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 08:30:27 -0500 Subject: stop_machine: convert stop_machine_run() to PREEMPT_RT Instead of playing with non-preemption, introduce explicit startup serialization. This is more robust and cleaner as well. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner [bigeasy: XXX: stopper_lock -> stop_cpus_lock] --- kernel/stop_machine.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 695f0c6cd169..b2da47ffe077 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -467,6 +467,16 @@ repeat: struct cpu_stop_done *done = work->done; char ksym_buf[KSYM_NAME_LEN] __maybe_unused; + /* + * Wait until the stopper finished scheduling on all + * cpus + */ + lg_global_lock(&stop_cpus_lock); + /* + * Let other cpu threads continue as well + */ + lg_global_unlock(&stop_cpus_lock); + /* cpu stop callbacks are not allowed to sleep */ preempt_disable(); -- cgit v1.2.3 From 747fb8823c9cc469e3ee604d064c849c8a24360c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 11:01:51 +0200 Subject: stomp-machine-raw-lock.patch Signed-off-by: Thomas Gleixner --- kernel/stop_machine.c | 64 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b2da47ffe077..c27b8ef75c3a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -30,12 +30,12 @@ struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ bool executed; /* actually executed? */ int ret; /* collected return value */ - struct completion completion; /* fired if nr_todo reaches 0 */ + struct task_struct *waiter; /* woken when nr_todo reaches 0 */ }; /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { - spinlock_t lock; + raw_spinlock_t lock; bool enabled; /* is this stopper enabled? 
*/ struct list_head works; /* list of pending works */ }; @@ -56,7 +56,7 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { memset(done, 0, sizeof(*done)); atomic_set(&done->nr_todo, nr_todo); - init_completion(&done->completion); + done->waiter = current; } /* signal completion unless @done is NULL */ @@ -65,8 +65,10 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) if (done) { if (executed) done->executed = true; - if (atomic_dec_and_test(&done->nr_todo)) - complete(&done->completion); + if (atomic_dec_and_test(&done->nr_todo)) { + wake_up_process(done->waiter); + done->waiter = NULL; + } } } @@ -78,7 +80,7 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) unsigned long flags; - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); if (stopper->enabled) { list_add_tail(&work->list, &stopper->works); @@ -86,7 +88,23 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) } else cpu_stop_signal_done(work->done, false); - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); +} + +static void wait_for_stop_done(struct cpu_stop_done *done) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + while (atomic_read(&done->nr_todo)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + /* + * We need to wait until cpu_stop_signal_done() has cleared + * done->waiter. + */ + while (done->waiter) + cpu_relax(); + set_current_state(TASK_RUNNING); } /** @@ -120,7 +138,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) cpu_stop_init_done(&done, 1); cpu_stop_queue_work(cpu, &work); - wait_for_completion(&done.completion); + wait_for_stop_done(&done); return done.executed ? done.ret : -ENOENT; } @@ -297,7 +315,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * lg_local_unlock(&stop_cpus_lock); preempt_enable(); - wait_for_completion(&done.completion); + wait_for_stop_done(&done); return done.executed ? done.ret : -ENOENT; } @@ -360,7 +378,7 @@ static int __stop_cpus(const struct cpumask *cpumask, cpu_stop_init_done(&done, cpumask_weight(cpumask)); queue_stop_cpus_work(cpumask, fn, arg, &done); - wait_for_completion(&done.completion); + wait_for_stop_done(&done); return done.executed ? done.ret : -ENOENT; } @@ -439,9 +457,9 @@ static int cpu_stop_should_run(unsigned int cpu) unsigned long flags; int run; - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); run = !list_empty(&stopper->works); - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); return run; } @@ -453,13 +471,13 @@ static void cpu_stopper_thread(unsigned int cpu) repeat: work = NULL; - spin_lock_irq(&stopper->lock); + raw_spin_lock_irq(&stopper->lock); if (!list_empty(&stopper->works)) { work = list_first_entry(&stopper->works, struct cpu_stop_work, list); list_del_init(&work->list); } - spin_unlock_irq(&stopper->lock); + raw_spin_unlock_irq(&stopper->lock); if (work) { cpu_stop_fn_t fn = work->fn; @@ -491,7 +509,13 @@ repeat: kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, ksym_buf), arg); + /* + * Make sure that the wakeup and setting done->waiter + * to NULL is atomic. 
+ */ + local_irq_disable(); cpu_stop_signal_done(done, true); + local_irq_enable(); goto repeat; } } @@ -510,20 +534,20 @@ static void cpu_stop_park(unsigned int cpu) unsigned long flags; /* drain remaining works */ - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); list_for_each_entry(work, &stopper->works, list) cpu_stop_signal_done(work->done, false); stopper->enabled = false; - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); } static void cpu_stop_unpark(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - spin_lock_irq(&stopper->lock); + raw_spin_lock_irq(&stopper->lock); stopper->enabled = true; - spin_unlock_irq(&stopper->lock); + raw_spin_unlock_irq(&stopper->lock); } static struct smp_hotplug_thread cpu_stop_threads = { @@ -545,7 +569,7 @@ static int __init cpu_stop_init(void) for_each_possible_cpu(cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - spin_lock_init(&stopper->lock); + raw_spin_lock_init(&stopper->lock); INIT_LIST_HEAD(&stopper->works); } @@ -648,7 +672,7 @@ int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, ret = multi_cpu_stop(&msdata); /* Busy wait for completion. */ - while (!completion_done(&done.completion)) + while (atomic_read(&done.nr_todo)) cpu_relax(); mutex_unlock(&stop_cpus_mutex); -- cgit v1.2.3 From 74616fd04174c4ea71f5fa55e0ce74ee35c6cb79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Jun 2011 12:36:06 +0200 Subject: hotplug: Lightweight get online cpus get_online_cpus() is a heavy weight function which involves a global mutex. migrate_disable() wants a simpler construct which prevents only a CPU from going doing while a task is in a migrate disabled section. Implement a per cpu lockless mechanism, which serializes only in the real unplug case on a global mutex. That serialization affects only tasks on the cpu which should be brought down. Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 4 ++ kernel/cpu.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 1 deletion(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b2d9a43012b2..da87af1b3e33 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -217,6 +217,8 @@ extern bool try_get_online_cpus(void); extern void put_online_cpus(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); +extern void pin_current_cpu(void); +extern void unpin_current_cpu(void); #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri) #define __hotcpu_notifier(fn, pri) __cpu_notifier(fn, pri) #define register_hotcpu_notifier(nb) register_cpu_notifier(nb) @@ -235,6 +237,8 @@ static inline void cpu_hotplug_done(void) {} #define put_online_cpus() do { } while (0) #define cpu_hotplug_disable() do { } while (0) #define cpu_hotplug_enable() do { } while (0) +static inline void pin_current_cpu(void) { } +static inline void unpin_current_cpu(void) { } #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) #define __hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) /* These aren't inline functions due to a GCC bug. 
*/ diff --git a/kernel/cpu.c b/kernel/cpu.c index 90a3d017b90c..127a53f7e45d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -86,6 +86,101 @@ static struct { #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) +struct hotplug_pcp { + struct task_struct *unplug; + int refcount; + struct completion synced; +}; + +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); + +/** + * pin_current_cpu - Prevent the current cpu from being unplugged + * + * Lightweight version of get_online_cpus() to prevent cpu from being + * unplugged when code runs in a migration disabled region. + * + * Must be called with preemption disabled (preempt_count = 1)! + */ +void pin_current_cpu(void) +{ + struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp); + +retry: + if (!hp->unplug || hp->refcount || preempt_count() > 1 || + hp->unplug == current) { + hp->refcount++; + return; + } + preempt_enable(); + mutex_lock(&cpu_hotplug.lock); + mutex_unlock(&cpu_hotplug.lock); + preempt_disable(); + goto retry; +} + +/** + * unpin_current_cpu - Allow unplug of current cpu + * + * Must be called with preemption or interrupts disabled! + */ +void unpin_current_cpu(void) +{ + struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp); + + WARN_ON(hp->refcount <= 0); + + /* This is safe. sync_unplug_thread is pinned to this cpu */ + if (!--hp->refcount && hp->unplug && hp->unplug != current) + wake_up_process(hp->unplug); +} + +/* + * FIXME: Is this really correct under all circumstances ? + */ +static int sync_unplug_thread(void *data) +{ + struct hotplug_pcp *hp = data; + + preempt_disable(); + hp->unplug = current; + set_current_state(TASK_UNINTERRUPTIBLE); + while (hp->refcount) { + schedule_preempt_disabled(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + set_current_state(TASK_RUNNING); + preempt_enable(); + complete(&hp->synced); + return 0; +} + +/* + * Start the sync_unplug_thread on the target cpu and wait for it to + * complete. + */ +static int cpu_unplug_begin(unsigned int cpu) +{ + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); + struct task_struct *tsk; + + init_completion(&hp->synced); + tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d\n", cpu); + if (IS_ERR(tsk)) + return (PTR_ERR(tsk)); + kthread_bind(tsk, cpu); + wake_up_process(tsk); + wait_for_completion(&hp->synced); + return 0; +} + +static void cpu_unplug_done(unsigned int cpu) +{ + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); + + hp->unplug = NULL; +} + void get_online_cpus(void) { might_sleep(); @@ -349,13 +444,14 @@ static int __ref take_cpu_down(void *_param) /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { - int err, nr_calls = 0; + int mycpu, err, nr_calls = 0; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { .mod = mod, .hcpu = hcpu, }; + cpumask_var_t cpumask; if (num_online_cpus() == 1) return -EBUSY; @@ -363,7 +459,27 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; + /* Move the downtaker off the unplug cpu */ + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu)); + set_cpus_allowed_ptr(current, cpumask); + free_cpumask_var(cpumask); + preempt_disable(); + mycpu = smp_processor_id(); + if (mycpu == cpu) { + printk(KERN_ERR "Yuck! 
Still on unplug CPU\n!"); + preempt_enable(); + return -EBUSY; + } + preempt_enable(); + cpu_hotplug_begin(); + err = cpu_unplug_begin(cpu); + if (err) { + printk("cpu_unplug_begin(%d) failed\n", cpu); + goto out_cancel; + } err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { @@ -423,6 +539,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); out_release: + cpu_unplug_done(cpu); +out_cancel: cpu_hotplug_done(); if (!err) cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); -- cgit v1.2.3 From ed5e23b8a3e8a023f3966001862086e4d1efcc71 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 16 Oct 2011 18:56:43 +0800 Subject: hotplug: sync_unplug: No " " in task name Otherwise the output will look a little odd. Signed-off-by: Yong Zhang Link: http://lkml.kernel.org/r/1318762607-2261-2-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 127a53f7e45d..dc3ccb47e776 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -165,7 +165,7 @@ static int cpu_unplug_begin(unsigned int cpu) struct task_struct *tsk; init_completion(&hp->synced); - tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d\n", cpu); + tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); if (IS_ERR(tsk)) return (PTR_ERR(tsk)); kthread_bind(tsk, cpu); -- cgit v1.2.3 From 505fe7e663c2d81460d8e5de46c0632bdffb054f Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Thu, 28 Jul 2011 11:16:00 +0800 Subject: hotplug: Reread hotplug_pcp on pin_current_cpu() retry When retry happens, it's likely that the task has been migrated to another cpu (except unplug failed), but it still derefernces the original hotplug_pcp per cpu data. Update the pointer to hotplug_pcp in the retry path, so it points to the current cpu. 
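[ Illustrative sketch, not part of the patch: a userspace analogue of the bug pattern - a pointer derived from the current CPU goes stale across any window in which the task may migrate, so it has to be re-read after the retry label instead of being cached once up front. hp_state and this_cpu_state() are made-up names for the example: ]

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>

struct hp_state { int refcount; };
static struct hp_state state[128];	/* stand-in for the per-cpu data */

static struct hp_state *this_cpu_state(void)
{
	int cpu = sched_getcpu();

	return &state[cpu < 0 ? 0 : cpu % 128];
}

int main(void)
{
	struct hp_state *hp;

retry:
	hp = this_cpu_state();	/* re-read on every retry, as the fix does */
	if (hp->refcount < 0) {
		sched_yield();	/* stand-in for the blocking mutex_lock() window */
		goto retry;	/* may resume on a different CPU here */
	}
	hp->refcount++;
	printf("pinned on cpu %d, refcount %d\n", sched_getcpu(), hp->refcount);
	return 0;
}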
Signed-off-by: Yong Zhang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20110728031600.GA338@windriver.com Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index dc3ccb47e776..f69d51e12e68 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -104,9 +104,11 @@ static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); */ void pin_current_cpu(void) { - struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp); + struct hotplug_pcp *hp; retry: + hp = &__get_cpu_var(hotplug_pcp); + if (!hp->unplug || hp->refcount || preempt_count() > 1 || hp->unplug == current) { hp->refcount++; -- cgit v1.2.3 From 67204191469104b46e4b82f935e7b053918764f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Jun 2011 13:26:08 +0200 Subject: sched-migrate-disable.patch Signed-off-by: Thomas Gleixner --- include/linux/preempt.h | 8 +++++ include/linux/sched.h | 13 ++++++-- kernel/sched/core.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++--- lib/smp_processor_id.c | 5 +-- 4 files changed, 104 insertions(+), 9 deletions(-) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 18508708f22b..f05be4a6fc65 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -148,6 +148,14 @@ do { \ set_preempt_need_resched(); \ } while (0) +#ifdef CONFIG_SMP +extern void migrate_disable(void); +extern void migrate_enable(void); +#else +# define migrate_disable() barrier() +# define migrate_enable() barrier() +#endif + #ifdef CONFIG_PREEMPT_RT_FULL # define preempt_disable_rt() preempt_disable() # define preempt_enable_rt() preempt_enable() diff --git a/include/linux/sched.h b/include/linux/sched.h index dd00341f535f..75a5be31e966 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1269,6 +1269,7 @@ struct task_struct { #endif unsigned int policy; + int migrate_disable; int nr_cpus_allowed; cpumask_t cpus_allowed; @@ -1683,9 +1684,6 @@ struct task_struct { #endif }; -/* Future-safe accessor for struct task_struct's cpus_allowed. */ -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) - #define TNF_MIGRATED 0x01 #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 @@ -3052,6 +3050,15 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +/* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p) +{ + if (p->migrate_disable) + return cpumask_of(task_cpu(p)); + + return &p->cpus_allowed; +} + extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46edadf76c68..8a735b738baf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4772,11 +4772,13 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu) void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class && p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); + if (!p->migrate_disable) { + if (p->sched_class && p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); + } cpumask_copy(&p->cpus_allowed, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); } /* @@ -4822,7 +4824,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask) || p->migrate_disable) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); @@ -4842,6 +4844,83 @@ out: } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +void migrate_disable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + preempt_disable(); + if (p->migrate_disable) { + p->migrate_disable++; + preempt_enable(); + return; + } + + pin_current_cpu(); + if (unlikely(!scheduler_running)) { + p->migrate_disable = 1; + preempt_enable(); + return; + } + rq = task_rq_lock(p, &flags); + p->migrate_disable = 1; + mask = tsk_cpus_allowed(p); + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (!cpumask_equal(&p->cpus_allowed, mask)) { + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + } + task_rq_unlock(rq, p, &flags); + preempt_enable(); +} +EXPORT_SYMBOL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + WARN_ON_ONCE(p->migrate_disable <= 0); + + preempt_disable(); + if (p->migrate_disable > 1) { + p->migrate_disable--; + preempt_enable(); + return; + } + + if (unlikely(!scheduler_running)) { + p->migrate_disable = 0; + unpin_current_cpu(); + preempt_enable(); + return; + } + + rq = task_rq_lock(p, &flags); + p->migrate_disable = 0; + mask = tsk_cpus_allowed(p); + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (!cpumask_equal(&p->cpus_allowed, mask)) { + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + } + + task_rq_unlock(rq, p, &flags); + unpin_current_cpu(); + preempt_enable(); +} +EXPORT_SYMBOL(migrate_enable); + /* * Move (not current) task off this cpu, onto dest cpu. 
We're doing * this because either it can't run here any more (set_cpus_allowed() diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 1afec32de6f2..4822c5d268a8 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1, if (!printk_ratelimit()) goto out_enable; - printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n", - what1, what2, preempt_count() - 1, current->comm, current->pid); + printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n", + what1, what2, preempt_count() - 1, current->migrate_disable, + current->comm, current->pid); print_symbol("caller is %s\n", (long)__builtin_return_address(0)); dump_stack(); -- cgit v1.2.3 From b1347c1bd49b18a1161bc2d2e45173b3215692d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 19:35:29 +0200 Subject: hotplug-use-migrate-disable.patch Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index f69d51e12e68..1d6ad09c0592 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -467,14 +467,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu)); set_cpus_allowed_ptr(current, cpumask); free_cpumask_var(cpumask); - preempt_disable(); + migrate_disable(); mycpu = smp_processor_id(); if (mycpu == cpu) { printk(KERN_ERR "Yuck! Still on unplug CPU\n!"); - preempt_enable(); + migrate_enable(); return -EBUSY; } - preempt_enable(); cpu_hotplug_begin(); err = cpu_unplug_begin(cpu); @@ -543,6 +542,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) out_release: cpu_unplug_done(cpu); out_cancel: + migrate_enable(); cpu_hotplug_done(); if (!err) cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); -- cgit v1.2.3 From 0de90b64c3810517072b601733de56b0ea4de0e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:56:42 +0200 Subject: ftrace-migrate-disable-tracing.patch Signed-off-by: Thomas Gleixner --- include/linux/ftrace_event.h | 2 ++ kernel/trace/trace.c | 9 ++++++--- kernel/trace/trace_events.c | 1 + kernel/trace/trace_output.c | 5 +++++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 28672e87e910..d4db37ee5f2d 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -61,6 +61,8 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + unsigned short migrate_disable; + unsigned short padding; }; #define FTRACE_MAX_EVENT \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 72c71345db81..9e11323e07aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1590,6 +1590,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); + + entry->migrate_disable = (tsk) ? 
tsk->migrate_disable & 0xFF : 0; } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); @@ -2514,9 +2516,10 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# |||| / _--=> migrate-disable\n"); + seq_puts(m, "# ||||| / delay \n"); + seq_puts(m, "# cmd pid |||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_event_info(struct trace_buffer *buf, struct seq_file *m) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1b0df1e504f0..ec2a38a9fe93 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -162,6 +162,7 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(unsigned short, migrate_disable); return ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c6977d5a9b12..8a0b28f796b8 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -454,6 +454,11 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); + if (entry->migrate_disable) + ret = trace_seq_printf(s, "%x", entry->migrate_disable); + else + ret = trace_seq_putc(s, '.'); + return ret; } -- cgit v1.2.3 From b93d4515814c3992fb6904c251376a3eee3aefce Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 16 Nov 2011 13:19:35 -0500 Subject: tracing: Show padding as unsigned short RT added two bytes to trace migrate disable counting to the trace events and used two bytes of the padding to make the change. The structures and all were updated correctly, but the display in the event formats was not: cat /debug/tracing/events/sched/sched_switch/format name: sched_switch ID: 51 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:unsigned short common_migrate_disable; offset:8; size:2; signed:0; field:int common_padding; offset:10; size:2; signed:0; The field for common_padding has the correct size and offset, but the use of "int" might confuse some parsers (and people that are reading it). This needs to be changed to "unsigned short". 
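With the one-line change below applied, the last two common fields in that format file should read roughly as follows (illustrative only, reconstructed from the offsets quoted above):
	field:unsigned short common_migrate_disable;	offset:8;	size:2;	signed:0;
	field:unsigned short common_padding;	offset:10;	size:2;	signed:0;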
Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/r/1321467575.4181.36.camel@frodo Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/trace/trace_events.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ec2a38a9fe93..53e9ae056872 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -163,6 +163,7 @@ static int trace_define_common_fields(void) __common_field(unsigned char, preempt_count); __common_field(int, pid); __common_field(unsigned short, migrate_disable); + __common_field(unsigned short, padding); return ret; } -- cgit v1.2.3 From a42051f671315f07d9d915c3d1c1ed2b1a53dff4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 19:48:20 +0200 Subject: migrate-disable-rt-variant.patch Signed-off-by: Thomas Gleixner --- include/linux/preempt.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index f05be4a6fc65..c54b96597c09 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -161,11 +161,15 @@ extern void migrate_enable(void); # define preempt_enable_rt() preempt_enable() # define preempt_disable_nort() barrier() # define preempt_enable_nort() barrier() +# define migrate_disable_rt() migrate_disable() +# define migrate_enable_rt() migrate_enable() #else # define preempt_disable_rt() barrier() # define preempt_enable_rt() barrier() # define preempt_disable_nort() preempt_disable() # define preempt_enable_nort() preempt_enable() +# define migrate_disable_rt() barrier() +# define migrate_enable_rt() barrier() #endif #ifdef CONFIG_PREEMPT_NOTIFIERS -- cgit v1.2.3 From a6d59f2f41408d5cdd913ff4744e229b027ee36e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2011 15:03:35 +0200 Subject: sched: Optimize migrate_disable Change from task_rq_lock() to raw_spin_lock(&rq->lock) to avoid a few atomic ops. See comment on why it should be safe. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-cbz6hkl5r5mvwtx5s3tor2y6@git.kernel.org --- kernel/sched/core.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8a735b738baf..4f443ff3299a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4864,7 +4864,19 @@ void migrate_disable(void) preempt_enable(); return; } - rq = task_rq_lock(p, &flags); + + /* + * Since this is always current we can get away with only locking + * rq->lock, the ->cpus_allowed value can normally only be changed + * while holding both p->pi_lock and rq->lock, but seeing that this + * it current, we cannot actually be waking up, so all code that + * relies on serialization against p->pi_lock is out of scope. + * + * Taking rq->lock serializes us against things like + * set_cpus_allowed_ptr() that can still happen concurrently. + */ + rq = this_rq(); + raw_spin_lock_irqsave(&rq->lock, flags); p->migrate_disable = 1; mask = tsk_cpus_allowed(p); @@ -4875,7 +4887,7 @@ void migrate_disable(void) p->sched_class->set_cpus_allowed(p, mask); p->nr_cpus_allowed = cpumask_weight(mask); } - task_rq_unlock(rq, p, &flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); preempt_enable(); } EXPORT_SYMBOL(migrate_disable); @@ -4903,7 +4915,11 @@ void migrate_enable(void) return; } - rq = task_rq_lock(p, &flags); + /* + * See comment in migrate_disable(). 
+ */ + rq = this_rq(); + raw_spin_lock_irqsave(&rq->lock, flags); p->migrate_disable = 0; mask = tsk_cpus_allowed(p); @@ -4915,7 +4931,7 @@ void migrate_enable(void) p->nr_cpus_allowed = cpumask_weight(mask); } - task_rq_unlock(rq, p, &flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); unpin_current_cpu(); preempt_enable(); } -- cgit v1.2.3 From d431015350b9534ac757fae6f155e0c0a5eda170 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2011 15:14:58 +0200 Subject: sched: Generic migrate_disable Make migrate_disable() be a preempt_disable() for !rt kernels. This allows generic code to use it but still enforces that these code sections stay relatively small. A preemptible migrate_disable() accessible for general use would allow people growing arbitrary per-cpu crap instead of clean these things up. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-275i87sl8e1jcamtchmehonm@git.kernel.org --- include/linux/preempt.h | 21 +++++++++------------ include/linux/sched.h | 13 +++++++++++++ include/linux/smp.h | 9 ++------- kernel/sched/core.c | 6 ++++-- kernel/trace/trace.c | 2 +- lib/smp_processor_id.c | 2 +- 6 files changed, 30 insertions(+), 23 deletions(-) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index c54b96597c09..9f33663ff29b 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -148,28 +148,25 @@ do { \ set_preempt_need_resched(); \ } while (0) -#ifdef CONFIG_SMP -extern void migrate_disable(void); -extern void migrate_enable(void); -#else -# define migrate_disable() barrier() -# define migrate_enable() barrier() -#endif - #ifdef CONFIG_PREEMPT_RT_FULL # define preempt_disable_rt() preempt_disable() # define preempt_enable_rt() preempt_enable() # define preempt_disable_nort() barrier() # define preempt_enable_nort() barrier() -# define migrate_disable_rt() migrate_disable() -# define migrate_enable_rt() migrate_enable() +# ifdef CONFIG_SMP + extern void migrate_disable(void); + extern void migrate_enable(void); +# else /* CONFIG_SMP */ +# define migrate_disable() barrier() +# define migrate_enable() barrier() +# endif /* CONFIG_SMP */ #else # define preempt_disable_rt() barrier() # define preempt_enable_rt() barrier() # define preempt_disable_nort() preempt_disable() # define preempt_enable_nort() preempt_enable() -# define migrate_disable_rt() barrier() -# define migrate_enable_rt() barrier() +# define migrate_disable() preempt_disable() +# define migrate_enable() preempt_enable() #endif #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/include/linux/sched.h b/include/linux/sched.h index 75a5be31e966..fe1754d6bbf4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1269,7 +1269,9 @@ struct task_struct { #endif unsigned int policy; +#ifdef CONFIG_PREEMPT_RT_FULL int migrate_disable; +#endif int nr_cpus_allowed; cpumask_t cpus_allowed; @@ -3050,11 +3052,22 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +static inline int __migrate_disabled(struct task_struct *p) +{ +#ifdef CONFIG_PREEMPT_RT_FULL + return p->migrate_disable; +#else + return 0; +#endif +} + /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p) { +#ifdef CONFIG_PREEMPT_RT_FULL if (p->migrate_disable) return cpumask_of(task_cpu(p)); +#endif return &p->cpus_allowed; } diff --git a/include/linux/smp.h b/include/linux/smp.h index b8614d80936d..42c3d2ca1d75 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -178,13 +178,8 @@ static inline void wake_up_all_idle_cpus(void) { } #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() -#ifndef CONFIG_PREEMPT_RT_FULL -# define get_cpu_light() get_cpu() -# define put_cpu_light() put_cpu() -#else -# define get_cpu_light() ({ migrate_disable(); smp_processor_id(); }) -# define put_cpu_light() migrate_enable() -#endif +#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); }) +#define put_cpu_light() migrate_enable() /* * Callback to arch code if there's nosmp or maxcpus=0 on the diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4f443ff3299a..9b531d539b87 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4772,7 +4772,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu) void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (!p->migrate_disable) { + if (!__migrate_disabled(p)) { if (p->sched_class && p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); @@ -4824,7 +4824,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask) || p->migrate_disable) + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); @@ -4844,6 +4844,7 @@ out: } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +#ifdef CONFIG_PREEMPT_RT_FULL void migrate_disable(void) { struct task_struct *p = current; @@ -4936,6 +4937,7 @@ void migrate_enable(void) preempt_enable(); } EXPORT_SYMBOL(migrate_enable); +#endif /* CONFIG_PREEMPT_RT_FULL */ /* * Move (not current) task off this cpu, onto dest cpu. We're doing diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9e11323e07aa..9d1a3f265049 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1591,7 +1591,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); - entry->migrate_disable = (tsk) ? tsk->migrate_disable & 0xFF : 0; + entry->migrate_disable = (tsk) ? 
__migrate_disabled(tsk) & 0xFF : 0; } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 4822c5d268a8..11fa431046a8 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -40,7 +40,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1, goto out_enable; printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n", - what1, what2, preempt_count() - 1, current->migrate_disable, + what1, what2, preempt_count() - 1, __migrate_disabled(current), current->comm, current->pid); print_symbol("caller is %s\n", (long)__builtin_return_address(0)); -- cgit v1.2.3 From 650bfb199eb9153432908da1208a56226312d068 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 23 Aug 2011 16:12:43 +0200 Subject: sched, rt: Fix migrate_enable() thinko Assigning mask = tsk_cpus_allowed(p) after p->migrate_disable = 0 ensures that we won't see a mask change.. no push/pull, we stack tasks on one CPU. Also add a couple fields to sched_debug for the next guy. [ Build fix from Stratos Psomadakis ] Signed-off-by: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1314108763.6689.4.camel@marge.simson.net Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 4 +++- kernel/sched/debug.c | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9b531d539b87..143c381c9811 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4921,12 +4921,14 @@ void migrate_enable(void) */ rq = this_rq(); raw_spin_lock_irqsave(&rq->lock, flags); - p->migrate_disable = 0; mask = tsk_cpus_allowed(p); + p->migrate_disable = 0; WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); if (!cpumask_equal(&p->cpus_allowed, mask)) { + /* Get the mask now that migration is enabled */ + mask = tsk_cpus_allowed(p); if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, mask); p->nr_cpus_allowed = cpumask_weight(mask); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ce33780d8f20..20a1246da14d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -256,6 +256,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) P(rt_throttled); PN(rt_time); PN(rt_runtime); +#ifdef CONFIG_SMP + P(rt_nr_migratory); +#endif #undef PN #undef P @@ -634,6 +637,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #endif P(policy); P(prio); +#ifdef CONFIG_PREEMPT_RT_FULL + P(migrate_disable); +#endif + P(nr_cpus_allowed); #undef PN #undef __PN #undef P -- cgit v1.2.3 From 552d3004f7af6d68c52c4656549319cc8bb468cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2011 14:29:27 +0200 Subject: sched: teach migrate_disable about atomic contexts [] spin_bug+0x94/0xa8 [] do_raw_spin_lock+0x43/0xea [] _raw_spin_lock_irqsave+0x6b/0x85 [] ? migrate_disable+0x75/0x12d [] ? pin_current_cpu+0x36/0xb0 [] migrate_disable+0x75/0x12d [] pagefault_disable+0xe/0x1f [] copy_from_user_nmi+0x74/0xe6 [] perf_callchain_user+0xf3/0x135 Now clearly we can't go around taking locks from NMI context, cure this by short-circuiting migrate_disable() when we're in an atomic context already. 
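Roughly, the short-circuit described above amounts to the following sketch (illustrative only; the real hunks are in the diff below, and migrate_enable() gets the same check with a decrement):

	void migrate_disable(void)
	{
		struct task_struct *p = current;

		if (in_atomic()) {
			/* Atomic context (e.g. NMI): take no locks, do not pin the CPU. */
	#ifdef CONFIG_SCHED_DEBUG
			p->migrate_disable_atomic++;	/* imbalance is caught by WARN_ON_ONCE() on the normal path */
	#endif
			return;
		}
		/* ... normal path: pin the CPU, take rq->lock, update affinity ... */
	}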
Add some extra debugging to avoid things like: preempt_disable() migrate_disable(); preempt_enable(); migrate_enable(); Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1314967297.1301.14.camel@twins Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-wbot4vsmwhi8vmbf83hsclk6@git.kernel.org --- include/linux/sched.h | 3 +++ kernel/sched/core.c | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index fe1754d6bbf4..2fd3c5848097 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1271,6 +1271,9 @@ struct task_struct { unsigned int policy; #ifdef CONFIG_PREEMPT_RT_FULL int migrate_disable; +# ifdef CONFIG_SCHED_DEBUG + int migrate_disable_atomic; +# endif #endif int nr_cpus_allowed; cpumask_t cpus_allowed; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 143c381c9811..11dc30611e52 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4852,6 +4852,17 @@ void migrate_disable(void) unsigned long flags; struct rq *rq; + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic++; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif + preempt_disable(); if (p->migrate_disable) { p->migrate_disable++; @@ -4900,6 +4911,16 @@ void migrate_enable(void) unsigned long flags; struct rq *rq; + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic--; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif WARN_ON_ONCE(p->migrate_disable <= 0); preempt_disable(); -- cgit v1.2.3 From ca5602d44cd22b27b96be9cea7be6ebc4a1863f5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 Sep 2011 08:40:23 -0400 Subject: sched: Postpone actual migration disable to schedule The migrate_disable() can cause a bit of overhead to the RT kernel, as changing the affinity is expensive to do at every lock encountered. As a running task cannot migrate, the actual disabling of migration does not need to occur until the task is about to schedule out. In most cases, a task that disables migration will enable it before it schedules, making this change improve performance tremendously. [ Frank Rowand: UP compile fix ] Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Clark Williams Link: http://lkml.kernel.org/r/20110927124422.779693167@goodmis.org Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 251 +++++++++++++++++++++++++++------------------- 1 file changed, 132 insertions(+), 119 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 11dc30611e52..d376e35a504d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2753,6 +2753,135 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); } +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP) +#define MIGRATE_DISABLE_SET_AFFIN (1<<30) /* Can't make a negative */ +#define migrate_disabled_updated(p) ((p)->migrate_disable & MIGRATE_DISABLE_SET_AFFIN) +#define migrate_disable_count(p) ((p)->migrate_disable & ~MIGRATE_DISABLE_SET_AFFIN) + +static inline void update_migrate_disable(struct task_struct *p) +{ + const struct cpumask *mask; + + if (likely(!p->migrate_disable)) + return; + + /* Did we already update affinity?
*/ + if (unlikely(migrate_disabled_updated(p))) + return; + + /* + * Since this is always current we can get away with only locking + * rq->lock, the ->cpus_allowed value can normally only be changed + * while holding both p->pi_lock and rq->lock, but seeing that this + * is current, we cannot actually be waking up, so all code that + * relies on serialization against p->pi_lock is out of scope. + * + * Having rq->lock serializes us against things like + * set_cpus_allowed_ptr() that can still happen concurrently. + */ + mask = tsk_cpus_allowed(p); + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (!cpumask_equal(&p->cpus_allowed, mask)) { + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + + /* Let migrate_enable know to fix things back up */ + p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; + } +} + +void migrate_disable(void) +{ + struct task_struct *p = current; + + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic++; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif + + preempt_disable(); + if (p->migrate_disable) { + p->migrate_disable++; + preempt_enable(); + return; + } + + pin_current_cpu(); + p->migrate_disable = 1; + preempt_enable(); +} +EXPORT_SYMBOL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic--; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif + WARN_ON_ONCE(p->migrate_disable <= 0); + + preempt_disable(); + if (migrate_disable_count(p) > 1) { + p->migrate_disable--; + preempt_enable(); + return; + } + + if (unlikely(migrate_disabled_updated(p))) { + /* + * See comment in update_migrate_disable() about locking. + */ + rq = this_rq(); + raw_spin_lock_irqsave(&rq->lock, flags); + mask = tsk_cpus_allowed(p); + /* + * Clearing migrate_disable causes tsk_cpus_allowed to + * show the tasks original cpu affinity. 
+ */ + p->migrate_disable = 0; + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (unlikely(!cpumask_equal(&p->cpus_allowed, mask))) { + /* Get the mask now that migration is enabled */ + mask = tsk_cpus_allowed(p); + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + } else + p->migrate_disable = 0; + + unpin_current_cpu(); + preempt_enable(); +} +EXPORT_SYMBOL(migrate_enable); +#else +static inline void update_migrate_disable(struct task_struct *p) { } +#define migrate_disabled_updated(p) 0 +#endif + /* * Pick up the highest-prio task: */ @@ -2856,6 +2985,8 @@ need_resched: smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); + update_migrate_disable(prev); + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) { @@ -4772,7 +4903,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu) void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (!__migrate_disabled(p)) { + if (!migrate_disabled_updated(p)) { if (p->sched_class && p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); @@ -4844,124 +4975,6 @@ out: } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -#ifdef CONFIG_PREEMPT_RT_FULL -void migrate_disable(void) -{ - struct task_struct *p = current; - const struct cpumask *mask; - unsigned long flags; - struct rq *rq; - - if (in_atomic()) { -#ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic++; -#endif - return; - } - -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(p->migrate_disable_atomic); -#endif - - preempt_disable(); - if (p->migrate_disable) { - p->migrate_disable++; - preempt_enable(); - return; - } - - pin_current_cpu(); - if (unlikely(!scheduler_running)) { - p->migrate_disable = 1; - preempt_enable(); - return; - } - - /* - * Since this is always current we can get away with only locking - * rq->lock, the ->cpus_allowed value can normally only be changed - * while holding both p->pi_lock and rq->lock, but seeing that this - * it current, we cannot actually be waking up, so all code that - * relies on serialization against p->pi_lock is out of scope. - * - * Taking rq->lock serializes us against things like - * set_cpus_allowed_ptr() that can still happen concurrently. 
- */ - rq = this_rq(); - raw_spin_lock_irqsave(&rq->lock, flags); - p->migrate_disable = 1; - mask = tsk_cpus_allowed(p); - - WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); - - if (!cpumask_equal(&p->cpus_allowed, mask)) { - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - preempt_enable(); -} -EXPORT_SYMBOL(migrate_disable); - -void migrate_enable(void) -{ - struct task_struct *p = current; - const struct cpumask *mask; - unsigned long flags; - struct rq *rq; - - if (in_atomic()) { -#ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic--; -#endif - return; - } - -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(p->migrate_disable_atomic); -#endif - WARN_ON_ONCE(p->migrate_disable <= 0); - - preempt_disable(); - if (p->migrate_disable > 1) { - p->migrate_disable--; - preempt_enable(); - return; - } - - if (unlikely(!scheduler_running)) { - p->migrate_disable = 0; - unpin_current_cpu(); - preempt_enable(); - return; - } - - /* - * See comment in migrate_disable(). - */ - rq = this_rq(); - raw_spin_lock_irqsave(&rq->lock, flags); - mask = tsk_cpus_allowed(p); - p->migrate_disable = 0; - - WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); - - if (!cpumask_equal(&p->cpus_allowed, mask)) { - /* Get the mask now that migration is enabled */ - mask = tsk_cpus_allowed(p); - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - } - - raw_spin_unlock_irqrestore(&rq->lock, flags); - unpin_current_cpu(); - preempt_enable(); -} -EXPORT_SYMBOL(migrate_enable); -#endif /* CONFIG_PREEMPT_RT_FULL */ - /* * Move (not current) task off this cpu, onto dest cpu. We're doing * this because either it can't run here any more (set_cpus_allowed() -- cgit v1.2.3 From 4dbc6fd31afbc225e4cebffe68cf8b1cdd9d6c2f Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Wed, 20 Nov 2013 07:22:09 +0800 Subject: allow preemption in recursive migrate_disable call Minor cleanup in migrate_disable/migrate_enable. The recursive case does not need to disable preemption as it is "pinned" to the current cpu any way so it is safe to preempt it. Signed-off-by: Nicholas Mc Guire Signed-off-by: Sebastian Andrzej Siewior --- kernel/sched/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d376e35a504d..fc831a0ff24e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2808,13 +2808,12 @@ void migrate_disable(void) WARN_ON_ONCE(p->migrate_disable_atomic); #endif - preempt_disable(); if (p->migrate_disable) { p->migrate_disable++; - preempt_enable(); return; } + preempt_disable(); pin_current_cpu(); p->migrate_disable = 1; preempt_enable(); @@ -2840,13 +2839,12 @@ void migrate_enable(void) #endif WARN_ON_ONCE(p->migrate_disable <= 0); - preempt_disable(); if (migrate_disable_count(p) > 1) { p->migrate_disable--; - preempt_enable(); return; } + preempt_disable(); if (unlikely(migrate_disabled_updated(p))) { /* * See comment in update_migrate_disable() about locking. 
-- cgit v1.2.3 From d125a000d32da92bc4cff5e598969588ba80c8cb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Sep 2011 08:40:24 -0400 Subject: sched: Do not compare cpu masks in scheduler Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Clark Williams Link: http://lkml.kernel.org/r/20110927124423.128129033@goodmis.org Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc831a0ff24e..467697d7d8be 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2781,16 +2781,12 @@ static inline void update_migrate_disable(struct task_struct *p) */ mask = tsk_cpus_allowed(p); - WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); - if (!cpumask_equal(&p->cpus_allowed, mask)) { - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - - /* Let migrate_enable know to fix things back up */ - p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; - } + /* Let migrate_enable know to fix things back up */ + p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; } void migrate_disable(void) -- cgit v1.2.3 From 7337e56d00a05ca707f598107b84596fb3a41f86 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Sep 2011 08:40:25 -0400 Subject: sched: Have migrate_disable ignore bounded threads Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Clark Williams Link: http://lkml.kernel.org/r/20110927124423.567944215@goodmis.org Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 467697d7d8be..54649402a613 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2793,7 +2793,7 @@ void migrate_disable(void) { struct task_struct *p = current; - if (in_atomic()) { + if (in_atomic() || p->flags & PF_NO_SETAFFINITY) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic++; #endif @@ -2823,7 +2823,7 @@ void migrate_enable(void) unsigned long flags; struct rq *rq; - if (in_atomic()) { + if (in_atomic() || p->flags & PF_NO_SETAFFINITY) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic--; #endif @@ -2843,26 +2843,21 @@ void migrate_enable(void) preempt_disable(); if (unlikely(migrate_disabled_updated(p))) { /* - * See comment in update_migrate_disable() about locking. + * Undo whatever update_migrate_disable() did, also see there + * about locking. */ rq = this_rq(); raw_spin_lock_irqsave(&rq->lock, flags); - mask = tsk_cpus_allowed(p); + /* * Clearing migrate_disable causes tsk_cpus_allowed to * show the tasks original cpu affinity. 
*/ p->migrate_disable = 0; - - WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); - - if (unlikely(!cpumask_equal(&p->cpus_allowed, mask))) { - /* Get the mask now that migration is enabled */ - mask = tsk_cpus_allowed(p); - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - } + mask = tsk_cpus_allowed(p); + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); raw_spin_unlock_irqrestore(&rq->lock, flags); } else p->migrate_disable = 0; -- cgit v1.2.3 From a278e3dc11016751a267d2c7c338c7eec121ba68 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Mon, 24 Mar 2014 13:18:48 +0100 Subject: sched: dont calculate hweight in update_migrate_disable() Proposal for a minor optimization in update_migrate_disable - its only a few instructions saved but those are in the hot path of locks so it might be worth it When being scheduled out while migrate_disable > 0 and migrate_disabled_updated is not yet set we end up here (kernel/sched/core.c): static inline void update_migrate_disable(struct task_struct *p) { ... mask = tsk_cpus_allowed(p); if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, mask); p->nr_cpus_allowed = cpumask_weight(mask); as we only can get here if migrate_disable > 0 there is no need to calculate the cpumask_weight(mask) as tsk_cpus_allowed in that case will return cpumask_of(task_cpu(p)) which only can have a hamming weight of 1 anyway. So we can simply do: p->nr_cpus_allowed = 1; without changing the behavior. Reviewed-by: Steven Rostedt Signed-off-by: Nicholas Mc Guire Signed-off-by: Sebastian Andrzej Siewior --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 54649402a613..037bb470d357 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2783,7 +2783,8 @@ static inline void update_migrate_disable(struct task_struct *p) if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); + /* mask==cpumask_of(task_cpu(p)) which has a cpumask_weight==1 */ + p->nr_cpus_allowed = 1; /* Let migrate_enable know to fix things back up */ p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; -- cgit v1.2.3 From d9d250769deb0022a97dd87d3a6dde823d7de5a0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Nov 2011 20:48:36 +0100 Subject: sched-clear-pf-thread-bound-on-fallback-rq.patch Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 037bb470d357..831d3ae428d1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1425,6 +1425,12 @@ out: } } + /* + * Clear PF_NO_SETAFFINITY, otherwise we wreckage + * migrate_disable/enable. See optimization for + * PF_NO_SETAFFINITY tasks there. 
+ */ + p->flags &= ~PF_NO_SETAFFINITY; return dest_cpu; } -- cgit v1.2.3 From 21a3c863c8f0c708ebb7626142bc9e303ea2b0f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 16:29:27 +0200 Subject: net-netif_rx_ni-migrate-disable.patch Signed-off-by: Thomas Gleixner --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 885aec4c5fd2..4c9bf576d382 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3402,11 +3402,11 @@ int netif_rx_ni(struct sk_buff *skb) trace_netif_rx_ni_entry(skb); - preempt_disable(); + migrate_disable(); err = netif_rx_internal(skb); if (local_softirq_pending()) thread_do_softirq(); - preempt_enable(); + migrate_enable(); return err; } -- cgit v1.2.3 From 5b13722299e2a7d2b75674d27eeb93c6fe706b22 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 18 Feb 2015 16:05:28 +0100 Subject: sunrpc: make svc_xprt_do_enqueue() use get_cpu_light() |BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:915 |in_atomic(): 1, irqs_disabled(): 0, pid: 3194, name: rpc.nfsd |Preemption disabled at:[] svc_xprt_received+0x4b/0xc0 [sunrpc] |CPU: 6 PID: 3194 Comm: rpc.nfsd Not tainted 3.18.7-rt1 #9 |Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.404 11/06/2014 | ffff880409630000 ffff8800d9a33c78 ffffffff815bdeb5 0000000000000002 | 0000000000000000 ffff8800d9a33c98 ffffffff81073c86 ffff880408dd6008 | ffff880408dd6000 ffff8800d9a33cb8 ffffffff815c3d84 ffff88040b3ac000 |Call Trace: | [] dump_stack+0x4f/0x9e | [] __might_sleep+0xe6/0x150 | [] rt_spin_lock+0x24/0x50 | [] svc_xprt_do_enqueue+0x80/0x230 [sunrpc] | [] svc_xprt_received+0x4b/0xc0 [sunrpc] | [] svc_add_new_perm_xprt+0x6d/0x80 [sunrpc] | [] svc_addsock+0x143/0x200 [sunrpc] | [] write_ports+0x28c/0x340 [nfsd] | [] nfsctl_transaction_write+0x4c/0x80 [nfsd] | [] vfs_write+0xb3/0x1d0 | [] SyS_write+0x49/0xb0 | [] system_call_fastpath+0x16/0x1b Cc: stable-rt@vger.kernel.org Signed-off-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior --- net/sunrpc/svc_xprt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c179ca2a5aa4..ac93e74d5515 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -357,7 +357,7 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt) return; } - cpu = get_cpu(); + cpu = get_cpu_light(); pool = svc_pool_for_cpu(xprt->xpt_server, cpu); spin_lock_bh(&pool->sp_lock); @@ -390,7 +390,7 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt) } spin_unlock_bh(&pool->sp_lock); - put_cpu(); + put_cpu_light(); } /* -- cgit v1.2.3 From 19a0e9b332d4783e091b5dfd8106880c13f97cdd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jul 2009 13:16:38 -0500 Subject: softirq: Sanitize softirq pending for NOHZ/RT Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 1 + kernel/softirq.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/time/tick-sched.c | 9 +------ 3 files changed, 65 insertions(+), 8 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 536223286563..f01e331b0b31 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -439,6 +439,7 @@ extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); +extern void softirq_check_pending_idle(void); DECLARE_PER_CPU(struct task_struct *, ksoftirqd); diff --git a/kernel/softirq.c b/kernel/softirq.c index 
822cc641f89b..e33a3cf67786 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -62,6 +62,69 @@ const char * const softirq_to_name[NR_SOFTIRQS] = { "TASKLET", "SCHED", "HRTIMER", "RCU" }; +#ifdef CONFIG_NO_HZ_COMMON +# ifdef CONFIG_PREEMPT_RT_FULL +/* + * On preempt-rt a softirq might be blocked on a lock. There might be + * no other runnable task on this CPU because the lock owner runs on + * some other CPU. So we have to go into idle with the pending bit + * set. Therefor we need to check this otherwise we warn about false + * positives which confuses users and defeats the whole purpose of + * this test. + * + * This code is called with interrupts disabled. + */ +void softirq_check_pending_idle(void) +{ + static int rate_limit; + u32 warnpending = 0, pending; + + if (rate_limit >= 10) + return; + + pending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK; + if (pending) { + struct task_struct *tsk; + + tsk = __get_cpu_var(ksoftirqd); + /* + * The wakeup code in rtmutex.c wakes up the task + * _before_ it sets pi_blocked_on to NULL under + * tsk->pi_lock. So we need to check for both: state + * and pi_blocked_on. + */ + raw_spin_lock(&tsk->pi_lock); + + if (!tsk->pi_blocked_on && !(tsk->state == TASK_RUNNING)) + warnpending = 1; + + raw_spin_unlock(&tsk->pi_lock); + } + + if (warnpending) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + pending); + rate_limit++; + } +} +# else +/* + * On !PREEMPT_RT we just printk rate limited: + */ +void softirq_check_pending_idle(void) +{ + static int rate_limit; + + if (rate_limit < 10 && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + local_softirq_pending()); + rate_limit++; + } +} +# endif +#endif + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 54c358be4d8e..4e9e92ab8b33 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -766,14 +766,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; - - if (ratelimit < 10 && - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - pr_warn("NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } + softirq_check_pending_idle(); return false; } -- cgit v1.2.3 From 79902faac5205d768e86980d280d8c1aae2af2ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 18:51:23 +0200 Subject: lockdep-rt.patch Signed-off-by: Thomas Gleixner --- include/linux/irqflags.h | 10 +++++++--- kernel/locking/lockdep.c | 2 ++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index cc05eb7f6df4..097782947852 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -25,8 +25,6 @@ # define trace_softirqs_enabled(p) ((p)->softirqs_enabled) # define trace_hardirq_enter() do { current->hardirq_context++; } while (0) # define trace_hardirq_exit() do { current->hardirq_context--; } while (0) -# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) -# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else # define trace_hardirqs_on() do { } while (0) @@ -39,9 +37,15 @@ # define trace_softirqs_enabled(p) 0 # define trace_hardirq_enter() do { } 
while (0) # define trace_hardirq_exit() do { } while (0) +# define INIT_TRACE_IRQFLAGS +#endif + +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL) +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) +# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) +#else # define lockdep_softirq_enter() do { } while (0) # define lockdep_softirq_exit() do { } while (0) -# define INIT_TRACE_IRQFLAGS #endif #if defined(CONFIG_IRQSOFF_TRACER) || \ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 88d0d4420ad2..fce6f6f69aad 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3542,6 +3542,7 @@ static void check_flags(unsigned long flags) } } +#ifndef CONFIG_PREEMPT_RT_FULL /* * We dont accurately track softirq state in e.g. * hardirq contexts (such as on 4KSTACKS), so only @@ -3556,6 +3557,7 @@ static void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } +#endif if (!debug_locks) print_irqtrace_events(current); -- cgit v1.2.3 From caecce1584aa4c547229c33283fd81aa3f1d776a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:51:45 +0200 Subject: mutex-no-spin-on-rt.patch Signed-off-by: Thomas Gleixner --- kernel/Kconfig.locks | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 76768ee812b2..68212ae72f3f 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW config MUTEX_SPIN_ON_OWNER def_bool y - depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW + depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL config RWSEM_SPIN_ON_OWNER def_bool y - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW + depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL config ARCH_USE_QUEUE_RWLOCK bool -- cgit v1.2.3 From a48abd28f8819881c68f5681a59a2daafd66dc82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 15:57:18 +0200 Subject: softirq-local-lock.patch Signed-off-by: Thomas Gleixner --- include/linux/bottom_half.h | 12 +++ include/linux/interrupt.h | 10 +++ include/linux/preempt_mask.h | 15 +++- include/linux/sched.h | 1 + init/main.c | 1 + kernel/softirq.c | 186 ++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 220 insertions(+), 5 deletions(-) diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 86c12c93e3cf..8ca9389352f2 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -4,6 +4,17 @@ #include #include +#ifdef CONFIG_PREEMPT_RT_FULL + +extern void local_bh_disable(void); +extern void _local_bh_enable(void); +extern void local_bh_enable(void); +extern void local_bh_enable_ip(unsigned long ip); +extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); +extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt); + +#else + #ifdef CONFIG_TRACE_IRQFLAGS extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); #else @@ -31,5 +42,6 @@ static inline void local_bh_enable(void) { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } +#endif #endif /* _LINUX_BH_H */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f01e331b0b31..eab2235cf3c1 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -423,7 +423,11 @@ struct softirq_action asmlinkage void do_softirq(void); asmlinkage void 
__do_softirq(void); +#ifndef CONFIG_PREEMPT_RT_FULL static inline void thread_do_softirq(void) { do_softirq(); } +#else +extern void thread_do_softirq(void); +#endif #ifdef __ARCH_HAS_DO_SOFTIRQ void do_softirq_own_stack(void); #else @@ -598,6 +602,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer) tasklet_kill(&ttimer->tasklet); } +#ifdef CONFIG_PREEMPT_RT_FULL +extern void softirq_early_init(void); +#else +static inline void softirq_early_init(void) { } +#endif + /* * Autoprobing for irqs: * diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h index dbeec4d4a3be..c7e373dc7314 100644 --- a/include/linux/preempt_mask.h +++ b/include/linux/preempt_mask.h @@ -44,16 +44,26 @@ #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) +#ifndef CONFIG_PREEMPT_RT_FULL +# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) +#else +# define SOFTIRQ_DISABLE_OFFSET (0) +#endif #define PREEMPT_ACTIVE_BITS 1 #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) #define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) -#define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ | NMI_MASK)) +#ifndef CONFIG_PREEMPT_RT_FULL +# define softirq_count() (preempt_count() & SOFTIRQ_MASK) +# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) +#else +# define softirq_count() (0UL) +extern int in_serving_softirq(void); +#endif /* * Are we doing bottom half or hardware interrupt processing? @@ -64,7 +74,6 @@ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) /* * Are we in NMI context? diff --git a/include/linux/sched.h b/include/linux/sched.h index 2fd3c5848097..c50d2681a943 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1686,6 +1686,7 @@ struct task_struct { #endif #ifdef CONFIG_PREEMPT_RT_BASE struct rcu_head put_rcu; + int softirq_nestcnt; #endif }; diff --git a/init/main.c b/init/main.c index 321d0ceb26d3..d70175c70223 100644 --- a/init/main.c +++ b/init/main.c @@ -525,6 +525,7 @@ asmlinkage __visible void __init start_kernel(void) * Interrupts are still disabled. 
Do necessary setups, then * enable them */ + softirq_early_init(); boot_cpu_init(); page_address_init(); pr_notice("%s", linux_banner); diff --git a/kernel/softirq.c b/kernel/softirq.c index e33a3cf67786..bbf51fc01e76 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -177,6 +178,7 @@ static void handle_pending_softirqs(u32 pending) local_irq_disable(); } +#ifndef CONFIG_PREEMPT_RT_FULL /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -384,6 +386,182 @@ asmlinkage __visible void do_softirq(void) local_irq_restore(flags); } +static inline void local_bh_disable_nort(void) { local_bh_disable(); } +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } + +#else /* !PREEMPT_RT_FULL */ + +/* + * On RT we serialize softirq execution with a cpu local lock + */ +static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); +static DEFINE_PER_CPU(struct task_struct *, local_softirq_runner); + +asmlinkage void __do_softirq(void); + +void __init softirq_early_init(void) +{ + local_irq_lock_init(local_softirq_lock); +} + +static void __local_bh_disable(void) +{ + migrate_disable(); + current->softirq_nestcnt++; +} + +void local_bh_disable(void) +{ + __local_bh_disable(); +} +EXPORT_SYMBOL(local_bh_disable); + +void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) +{ + __local_bh_disable(); + if (cnt & PREEMPT_CHECK_OFFSET) + preempt_disable(); +} + +static void __local_bh_enable(void) +{ + if (WARN_ON(current->softirq_nestcnt == 0)) + return; + + if ((current->softirq_nestcnt == 1) && + local_softirq_pending() && + local_trylock(local_softirq_lock)) { + + local_irq_disable(); + if (local_softirq_pending()) + __do_softirq(); + local_irq_enable(); + local_unlock(local_softirq_lock); + WARN_ON(current->softirq_nestcnt != 1); + } + current->softirq_nestcnt--; + migrate_enable(); +} + +void local_bh_enable(void) +{ + __local_bh_enable(); +} +EXPORT_SYMBOL(local_bh_enable); + +extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) +{ + __local_bh_enable(); + if (cnt & PREEMPT_CHECK_OFFSET) + preempt_enable(); +} + +void local_bh_enable_ip(unsigned long ip) +{ + local_bh_enable(); +} +EXPORT_SYMBOL(local_bh_enable_ip); + +/* For tracing */ +int notrace __in_softirq(void) +{ + if (__get_cpu_var(local_softirq_lock).owner == current) + return __get_cpu_var(local_softirq_lock).nestcnt; + return 0; +} + +int in_serving_softirq(void) +{ + int res; + + preempt_disable(); + res = __get_cpu_var(local_softirq_runner) == current; + preempt_enable(); + return res; +} +EXPORT_SYMBOL(in_serving_softirq); + +/* + * Called with bh and local interrupts disabled. For full RT cpu must + * be pinned. + */ +asmlinkage void __do_softirq(void) +{ + u32 pending = local_softirq_pending(); + int cpu = smp_processor_id(); + + current->softirq_nestcnt++; + + /* Reset the pending bitmask before enabling irqs */ + set_softirq_pending(0); + + __get_cpu_var(local_softirq_runner) = current; + + lockdep_softirq_enter(); + + handle_pending_softirqs(pending, cpu); + + pending = local_softirq_pending(); + if (pending) + wakeup_softirqd(); + + lockdep_softirq_exit(); + __get_cpu_var(local_softirq_runner) = NULL; + + current->softirq_nestcnt--; +} + +static int __thread_do_softirq(int cpu) +{ + /* + * Prevent the current cpu from going offline. + * pin_current_cpu() can reenable preemption and block on the + * hotplug mutex. 
When it returns, the current cpu is + * pinned. It might be the wrong one, but the offline check + * below catches that. + */ + pin_current_cpu(); + /* + * If called from ksoftirqd (cpu >= 0) we need to check + * whether we are on the wrong cpu due to cpu offlining. If + * called via thread_do_softirq() no action required. + */ + if (cpu >= 0 && cpu_is_offline(cpu)) { + unpin_current_cpu(); + return -1; + } + preempt_enable(); + local_lock(local_softirq_lock); + local_irq_disable(); + /* + * We cannot switch stacks on RT as we want to be able to + * schedule! + */ + if (local_softirq_pending()) + __do_softirq(); + local_unlock(local_softirq_lock); + unpin_current_cpu(); + preempt_disable(); + local_irq_enable(); + return 0; +} + +/* + * Called from netif_rx_ni(). Preemption enabled. + */ +void thread_do_softirq(void) +{ + if (!in_serving_softirq()) { + preempt_disable(); + __thread_do_softirq(-1); + preempt_enable(); + } +} + +static inline void local_bh_disable_nort(void) { } +static inline void _local_bh_enable_nort(void) { } + +#endif /* PREEMPT_RT_FULL */ /* * Enter an interrupt context. */ @@ -395,9 +573,9 @@ void irq_enter(void) * Prevent raise_softirq from needlessly waking up ksoftirqd * here, as softirq will be serviced on return from interrupt. */ - local_bh_disable(); + local_bh_disable_nort(); tick_irq_enter(); - _local_bh_enable(); + _local_bh_enable_nort(); } __irq_enter(); @@ -405,6 +583,7 @@ void irq_enter(void) static inline void invoke_softirq(void) { +#ifndef CONFIG_PREEMPT_RT_FULL if (!force_irqthreads) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* @@ -424,6 +603,9 @@ static inline void invoke_softirq(void) } else { wakeup_softirqd(); } +#else + wakeup_softirqd(); +#endif } static inline void tick_irq_exit(void) -- cgit v1.2.3 From 7d949d001c490fdc58dad9d86de1afadca4a9371 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 13:59:17 +0200 Subject: softirq-disable-softirq-stacks-for-rt.patch Signed-off-by: Thomas Gleixner --- arch/powerpc/kernel/irq.c | 2 ++ arch/powerpc/kernel/misc_32.S | 2 ++ arch/powerpc/kernel/misc_64.S | 2 ++ arch/sh/kernel/irq.c | 2 ++ arch/sparc/kernel/irq_64.c | 2 ++ arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irq_32.c | 2 ++ include/linux/interrupt.h | 8 ++++---- 8 files changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index c14383575fe8..ec023fd7bbde 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -615,6 +615,7 @@ void irq_ctx_init(void) } } +#ifndef CONFIG_PREEMPT_RT_FULL void do_softirq_own_stack(void) { struct thread_info *curtp, *irqtp; @@ -632,6 +633,7 @@ void do_softirq_own_stack(void) if (irqtp->flags) set_bits(irqtp->flags, &curtp->flags); } +#endif irq_hw_number_t virq_to_hw(unsigned int virq) { diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 7c6bb4b17b49..e9dfe2270e93 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -40,6 +40,7 @@ * We store the saved ksp_limit in the unused part * of the STACK_FRAME_OVERHEAD */ +#ifndef CONFIG_PREEMPT_RT_FULL _GLOBAL(call_do_softirq) mflr r0 stw r0,4(r1) @@ -56,6 +57,7 @@ _GLOBAL(call_do_softirq) stw r10,THREAD+KSP_LIMIT(r2) mtlr r0 blr +#endif /* * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp); diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 4e314b90c75d..8a7238dd2f4b 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -29,6 +29,7 @@ .text 
+#ifndef CONFIG_PREEMPT_RT_FULL _GLOBAL(call_do_softirq) mflr r0 std r0,16(r1) @@ -39,6 +40,7 @@ _GLOBAL(call_do_softirq) ld r0,16(r1) mtlr r0 blr +#endif _GLOBAL(call_do_irq) mflr r0 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 65a1ecd77f96..d5dd95e66791 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -149,6 +149,7 @@ void irq_ctx_exit(int cpu) hardirq_ctx[cpu] = NULL; } +#ifndef CONFIG_PREEMPT_RT_FULL void do_softirq_own_stack(void) { struct thread_info *curctx; @@ -176,6 +177,7 @@ void do_softirq_own_stack(void) "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr" ); } +#endif #else static inline void handle_one_irq(unsigned int irq) { diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 4033c23bdfa6..763cd88b4e92 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -849,6 +849,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs) set_irq_regs(old_regs); } +#ifndef CONFIG_PREEMPT_RT_FULL void do_softirq_own_stack(void) { void *orig_sp, *sp = softirq_stack[smp_processor_id()]; @@ -863,6 +864,7 @@ void do_softirq_own_stack(void) __asm__ __volatile__("mov %0, %%sp" : : "r" (orig_sp)); } +#endif #ifdef CONFIG_HOTPLUG_CPU void fixup_irqs(void) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f1dc27f457f1..544b24771f6d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1119,6 +1119,7 @@ bad_gs: jmp 2b .previous +#ifndef CONFIG_PREEMPT_RT_FULL /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(do_softirq_own_stack) CFI_STARTPROC @@ -1138,6 +1139,7 @@ ENTRY(do_softirq_own_stack) ret CFI_ENDPROC END(do_softirq_own_stack) +#endif #ifdef CONFIG_XEN idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 63ce838e5a54..b889f5ba4fa8 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -142,6 +142,7 @@ void irq_ctx_init(int cpu) cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu)); } +#ifndef CONFIG_PREEMPT_RT_FULL void do_softirq_own_stack(void) { struct thread_info *curstk; @@ -160,6 +161,7 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } +#endif bool handle_irq(unsigned irq, struct pt_regs *regs) { diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index eab2235cf3c1..1f77757dddf9 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -421,13 +421,10 @@ struct softirq_action void (*action)(struct softirq_action *); }; +#ifndef CONFIG_PREEMPT_RT_FULL asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); -#ifndef CONFIG_PREEMPT_RT_FULL static inline void thread_do_softirq(void) { do_softirq(); } -#else -extern void thread_do_softirq(void); -#endif #ifdef __ARCH_HAS_DO_SOFTIRQ void do_softirq_own_stack(void); #else @@ -436,6 +433,9 @@ static inline void do_softirq_own_stack(void) __do_softirq(); } #endif +#else +extern void thread_do_softirq(void); +#endif extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); -- cgit v1.2.3 From 75683141d34776459812316a3126b01de822fd30 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Jul 2011 21:06:43 +0200 Subject: softirq-make-fifo.patch Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/softirq.c b/kernel/softirq.c index bbf51fc01e76..dd326a5889a1 100644 --- a/kernel/softirq.c +++ 
b/kernel/softirq.c @@ -388,6 +388,8 @@ asmlinkage __visible void do_softirq(void) static inline void local_bh_disable_nort(void) { local_bh_disable(); } static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } +static void ksoftirqd_set_sched_params(unsigned int cpu) { } +static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } #else /* !PREEMPT_RT_FULL */ @@ -561,6 +563,20 @@ void thread_do_softirq(void) static inline void local_bh_disable_nort(void) { } static inline void _local_bh_enable_nort(void) { } +static inline void ksoftirqd_set_sched_params(unsigned int cpu) +{ + struct sched_param param = { .sched_priority = 1 }; + + sched_setscheduler(current, SCHED_FIFO, &param); +} + +static inline void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) +{ + struct sched_param param = { .sched_priority = 0 }; + + sched_setscheduler(current, SCHED_NORMAL, &param); +} + #endif /* PREEMPT_RT_FULL */ /* * Enter an interrupt context. @@ -994,6 +1010,8 @@ static struct notifier_block cpu_nfb = { static struct smp_hotplug_thread softirq_threads = { .store = &ksoftirqd, + .setup = ksoftirqd_set_sched_params, + .cleanup = ksoftirqd_clr_sched_params, .thread_should_run = ksoftirqd_should_run, .thread_fn = run_ksoftirqd, .thread_comm = "ksoftirqd/%u", -- cgit v1.2.3 From 9f9e75c09a7b03e7bec77197cf65d4b08568e711 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Nov 2011 20:18:22 -0500 Subject: tasklet: Prevent tasklets from going into infinite spin in RT When CONFIG_PREEMPT_RT_FULL is enabled, tasklets run as threads, and spinlocks turn into mutexes. But this can cause issues with tasks disabling tasklets. A tasklet runs under ksoftirqd, and if a tasklet is disabled with tasklet_disable(), the tasklet count is increased. When a tasklet runs, it checks this counter and if it is set, it adds itself back on the softirq queue and returns. The problem arises in RT because ksoftirqd will see that a softirq is ready to run (the tasklet softirq just re-armed itself), and will not sleep, but instead run the softirqs again. The tasklet softirq will still see that the count is non-zero and will not execute the tasklet, and will requeue itself on the softirq again, which will cause ksoftirqd to run it again and again and again. It gets worse because ksoftirqd runs as a real-time thread. If it preempted the task that disabled tasklets, and that task has migration disabled, or can't run for other reasons, the tasklet softirq will never run because the count will never be zero, and ksoftirqd will go into an infinite loop. As an RT task, this becomes a big problem. This is a hack solution to have tasklet_disable() stop tasklets, and when a tasklet runs, instead of requeueing the tasklet on the softirq, it delays it. When tasklet_enable() is called, and tasklets are waiting, then tasklet_enable() will kick the tasklets to continue. This prevents the lockup from ksoftirqd going into an infinite loop. [ rostedt@goodmis.org: ported to 3.0-rt ] Signed-off-by: Ingo Molnar Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 39 ++++----- kernel/softirq.c | 206 +++++++++++++++++++++++++++------------- 2 files changed, 168 insertions(+), 77 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 1f77757dddf9..3fe60d859818 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -465,8 +465,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void) to be executed on some cpu at least once after this.
* If the tasklet is already scheduled, but its execution is still not started, it will be executed only once. - * If this tasklet is already running on another CPU (or schedule is called - from tasklet itself), it is rescheduled for later. + * If this tasklet is already running on another CPU, it is rescheduled + for later. + * Schedule must not be called from the tasklet itself (a lockup occurs) * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. @@ -491,27 +492,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data } enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */ + TASKLET_STATE_PENDING /* Tasklet is pending */ }; -#ifdef CONFIG_SMP +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED) +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN) +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING) + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } +static inline int tasklet_tryunlock(struct tasklet_struct *t) +{ + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN; +} + static inline void tasklet_unlock(struct tasklet_struct *t) { smp_mb__before_atomic(); clear_bit(TASKLET_STATE_RUN, &(t)->state); } -static inline void tasklet_unlock_wait(struct tasklet_struct *t) -{ - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } -} +extern void tasklet_unlock_wait(struct tasklet_struct *t); + #else #define tasklet_trylock(t) 1 +#define tasklet_tryunlock(t) 1 #define tasklet_unlock_wait(t) do { } while (0) #define tasklet_unlock(t) do { } while (0) #endif @@ -560,17 +570,8 @@ static inline void tasklet_disable(struct tasklet_struct *t) smp_mb(); } -static inline void tasklet_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic(); - atomic_dec(&t->count); -} - -static inline void tasklet_hi_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic(); - atomic_dec(&t->count); -} +extern void tasklet_enable(struct tasklet_struct *t); +extern void tasklet_hi_enable(struct tasklet_struct *t); extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index dd326a5889a1..2ce7609d6041 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -709,15 +710,45 @@ struct tasklet_head { static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); +static void inline +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) +{ + if (tasklet_trylock(t)) { +again: + /* We may have been preempted before tasklet_trylock + * and __tasklet_action may have already run. + * So double check the sched bit while the takslet + * is locked before adding it to the list. + */ + if (test_bit(TASKLET_STATE_SCHED, &t->state)) { + t->next = NULL; + *head->tail = t; + head->tail = &(t->next); + raise_softirq_irqoff(nr); + tasklet_unlock(t); + } else { + /* This is subtle. 
If we hit the corner case above + * It is possible that we get preempted right here, + * and another task has successfully called + * tasklet_schedule(), then this function, and + * failed on the trylock. Thus we must be sure + * before releasing the tasklet lock, that the + * SCHED_BIT is clear. Otherwise the tasklet + * may get its SCHED_BIT set, but not added to the + * list + */ + if (!tasklet_tryunlock(t)) + goto again; + } + } +} + void __tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; local_irq_save(flags); - t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - raise_softirq_irqoff(TASKLET_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(__tasklet_schedule); @@ -727,10 +758,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(__tasklet_hi_schedule); @@ -739,48 +767,116 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) { BUG_ON(!irqs_disabled()); - t->next = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, t); - __raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_hi_schedule(t); } EXPORT_SYMBOL(__tasklet_hi_schedule_first); -static void tasklet_action(struct softirq_action *a) +void tasklet_enable(struct tasklet_struct *t) { - struct tasklet_struct *list; + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_schedule(t); +} +EXPORT_SYMBOL(tasklet_enable); - local_irq_disable(); - list = __this_cpu_read(tasklet_vec.head); - __this_cpu_write(tasklet_vec.head, NULL); - __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head)); - local_irq_enable(); +void tasklet_hi_enable(struct tasklet_struct *t) +{ + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_hi_schedule(t); +} +EXPORT_SYMBOL(tasklet_hi_enable); + +static void __tasklet_action(struct softirq_action *a, + struct tasklet_struct *list) +{ + int loops = 1000000; while (list) { struct tasklet_struct *t = list; list = list->next; - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, - &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); + /* + * Should always succeed - after a tasklist got on the + * list (after getting the SCHED bit set from 0 to 1), + * nothing but the tasklet softirq it got queued to can + * lock it: + */ + if (!tasklet_trylock(t)) { + WARN_ON(1); + continue; } - local_irq_disable(); t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - __raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_enable(); + + /* + * If we cannot handle the tasklet because it's disabled, + * mark it as pending. tasklet_enable() will later + * re-schedule the tasklet. 
+ */ + if (unlikely(atomic_read(&t->count))) { +out_disabled: + /* implicit unlock: */ + wmb(); + t->state = TASKLET_STATEF_PENDING; + continue; + } + + /* + * After this point on the tasklet might be rescheduled + * on another CPU, but it can only be added to another + * CPU's tasklet list if we unlock the tasklet (which we + * dont do yet). + */ + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + WARN_ON(1); + +again: + t->func(t->data); + + /* + * Try to unlock the tasklet. We must use cmpxchg, because + * another CPU might have scheduled or disabled the tasklet. + * We only allow the STATE_RUN -> 0 transition here. + */ + while (!tasklet_tryunlock(t)) { + /* + * If it got disabled meanwhile, bail out: + */ + if (atomic_read(&t->count)) + goto out_disabled; + /* + * If it got scheduled meanwhile, re-execute + * the tasklet function: + */ + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + goto again; + if (!--loops) { + printk("hm, tasklet state: %08lx\n", t->state); + WARN_ON(1); + tasklet_unlock(t); + break; + } + } } } +static void tasklet_action(struct softirq_action *a) +{ + struct tasklet_struct *list; + + local_irq_disable(); + list = __get_cpu_var(tasklet_vec).head; + __get_cpu_var(tasklet_vec).head = NULL; + __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; + local_irq_enable(); + + __tasklet_action(a, list); +} + static void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -791,30 +887,7 @@ static void tasklet_hi_action(struct softirq_action *a) __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head)); local_irq_enable(); - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, - &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } - - local_irq_disable(); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } + __tasklet_action(a, list); } void tasklet_init(struct tasklet_struct *t, @@ -835,7 +908,7 @@ void tasklet_kill(struct tasklet_struct *t) while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { do { - yield(); + msleep(1); } while (test_bit(TASKLET_STATE_SCHED, &t->state)); } tasklet_unlock_wait(t); @@ -909,6 +982,23 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) +void tasklet_unlock_wait(struct tasklet_struct *t) +{ + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { + /* + * Hack for now to avoid this busy-loop: + */ +#ifdef CONFIG_PREEMPT_RT_FULL + msleep(1); +#else + barrier(); +#endif + } +} +EXPORT_SYMBOL(tasklet_unlock_wait); +#endif + static int ksoftirqd_should_run(unsigned int cpu) { return local_softirq_pending(); -- cgit v1.2.3 From 61d1f4b9690276c53f6c058c2068dfa00c4dd478 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Jan 2012 13:01:27 +0100 Subject: genirq: Allow disabling of softirq processing in irq thread context The processing of softirqs in irq thread context is a performance gain for the non-rt workloads of a system, but it's counterproductive for interrupts which are explicitely related to the realtime workload. Allow such interrupts to prevent softirq processing in their thread context. 
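To make the opt-out concrete, here is a minimal, self-contained userspace C sketch of the idea; it is not part of the patch, and names such as FAKE_IRQF_NO_SOFTIRQ_CALL, fake_irqaction and pending_work are invented for illustration. A per-handler flag decides whether the handler's thread also drains deferred "softirq-like" work when it finishes, mirroring the local_bh_enable() versus _local_bh_enable() split the patch introduces in irq_forced_thread_fn().

/*
 * Userspace model only -- not kernel code. A handler whose flag is set
 * skips deferred-work processing in its own context; everything else
 * behaves as before.
 */
#include <stdio.h>

#define FAKE_IRQF_NO_SOFTIRQ_CALL 0x1	/* stands in for IRQF_NO_SOFTIRQ_CALL */

struct fake_irqaction {
	const char *name;
	unsigned int flags;
	void (*thread_fn)(void);
};

static int pending_work;	/* stands in for local_softirq_pending() */

static void raise_deferred_work(void)
{
	pending_work++;
}

static void run_deferred_work(void)	/* the "softirq processing" step */
{
	while (pending_work) {
		pending_work--;
		printf("  deferred work item processed\n");
	}
}

/*
 * Mirrors irq_forced_thread_fn(): run the handler, then either process
 * deferred work in this context (local_bh_enable()) or leave it alone
 * (_local_bh_enable()) depending on the per-handler flag.
 */
static void run_threaded_handler(const struct fake_irqaction *act)
{
	printf("%s: handler runs\n", act->name);
	act->thread_fn();

	if (act->flags & FAKE_IRQF_NO_SOFTIRQ_CALL)
		printf("%s: RT handler, deferred work left for ksoftirqd\n", act->name);
	else
		run_deferred_work();
}

static void ordinary_handler(void)	{ raise_deferred_work(); }
static void rt_critical_handler(void)	{ /* must not raise deferred work */ }

int main(void)
{
	struct fake_irqaction eth = { "eth0", 0, ordinary_handler };
	struct fake_irqaction ctl = { "motor-ctl", FAKE_IRQF_NO_SOFTIRQ_CALL, rt_critical_handler };

	run_threaded_handler(&eth);	/* drains the queue itself */
	run_threaded_handler(&ctl);	/* skips softirq processing */
	return 0;
}

A driver with real-time constraints would then request its interrupt with the new flag ORed into the flags it already passes to request_irq(), which is only safe as long as its handler never raises softirqs itself.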
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- include/linux/interrupt.h | 2 ++ include/linux/irq.h | 4 +++- kernel/irq/manage.c | 13 ++++++++++++- kernel/irq/settings.h | 12 ++++++++++++ kernel/softirq.c | 7 +++++++ 5 files changed, 36 insertions(+), 2 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 3fe60d859818..33cfbc085a94 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -57,6 +57,7 @@ * IRQF_NO_THREAD - Interrupt cannot be threaded * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device * resume time. + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT) */ #define IRQF_DISABLED 0x00000020 #define IRQF_SHARED 0x00000080 @@ -70,6 +71,7 @@ #define IRQF_FORCE_RESUME 0x00008000 #define IRQF_NO_THREAD 0x00010000 #define IRQF_EARLY_RESUME 0x00020000 +#define IRQF_NO_SOFTIRQ_CALL 0x00080000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) diff --git a/include/linux/irq.h b/include/linux/irq.h index 03f48d936f66..86a08759072b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -73,6 +73,7 @@ typedef void (*irq_preflow_handler_t)(struct irq_data *data); * IRQ_IS_POLLED - Always polled by another interrupt. Exclude * it from the spurious interrupt detection * mechanism and from core side polling. + * IRQ_NO_SOFTIRQ_CALL - No softirq processing in the irq thread context (RT) */ enum { IRQ_TYPE_NONE = 0x00000000, @@ -98,13 +99,14 @@ enum { IRQ_NOTHREAD = (1 << 16), IRQ_PER_CPU_DEVID = (1 << 17), IRQ_IS_POLLED = (1 << 18), + IRQ_NO_SOFTIRQ_CALL = (1 << 19), }; #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \ - IRQ_IS_POLLED) + IRQ_IS_POLLED | IRQ_NO_SOFTIRQ_CALL) #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3f79018535bf..3691d0e778bf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -863,7 +863,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) local_bh_disable(); ret = action->thread_fn(action->irq, action->dev_id); irq_finalize_oneshot(desc, action); - local_bh_enable(); + /* + * Interrupts which have real time requirements can be set up + * to avoid softirq processing in the thread handler. This is + * safe as these interrupts do not raise soft interrupts. 
+ */ + if (irq_settings_no_softirq_call(desc)) + _local_bh_enable(); + else + local_bh_enable(); return ret; } @@ -1259,6 +1267,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); } + if (new->flags & IRQF_NO_SOFTIRQ_CALL) + irq_settings_set_no_softirq_call(desc); + /* Set default affinity mask once everything is setup */ setup_affinity(irq, desc, mask); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 3320b84cc60f..34b803b89d41 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -15,6 +15,7 @@ enum { _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, _IRQ_IS_POLLED = IRQ_IS_POLLED, + _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -28,6 +29,7 @@ enum { #define IRQ_NESTED_THREAD GOT_YOU_MORON #define IRQ_PER_CPU_DEVID GOT_YOU_MORON #define IRQ_IS_POLLED GOT_YOU_MORON +#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -38,6 +40,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); } +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL; +} + +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL; +} + static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_PER_CPU; diff --git a/kernel/softirq.c b/kernel/softirq.c index 2ce7609d6041..41afcc5d7151 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -465,6 +465,13 @@ void local_bh_enable_ip(unsigned long ip) } EXPORT_SYMBOL(local_bh_enable_ip); +void _local_bh_enable(void) +{ + current->softirq_nestcnt--; + migrate_enable(); +} +EXPORT_SYMBOL(_local_bh_enable); + /* For tracing */ int notrace __in_softirq(void) { -- cgit v1.2.3 From 109ecd02bd1359450500b3b903844ec06d86f7ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Jun 2011 20:42:16 +0200 Subject: local-vars-migrate-disable.patch Signed-off-by: Thomas Gleixner --- include/linux/percpu.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index bd65cb68baed..208d86d80cba 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -23,11 +23,35 @@ PERCPU_MODULE_RESERVE) #endif +#ifdef CONFIG_PREEMPT_RT_FULL + +#define get_local_var(var) (*({ \ + migrate_disable(); \ + &__get_cpu_var(var); })) + +#define put_local_var(var) do { \ + (void)&(var); \ + migrate_enable(); \ +} while (0) + +# define get_local_ptr(var) ({ \ + migrate_disable(); \ + this_cpu_ptr(var); }) + +# define put_local_ptr(var) do { \ + (void)(var); \ + migrate_enable(); \ +} while (0) + +#else + #define get_local_var(var) get_cpu_var(var) #define put_local_var(var) put_cpu_var(var) #define get_local_ptr(var) get_cpu_ptr(var) #define put_local_ptr(var) put_cpu_ptr(var) +#endif + /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10) -- cgit v1.2.3 From 038ce813c6ee75055444cab57aada37a7d426c55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 6 Apr 2010 16:51:31 +0200 Subject: md: raid5: Make raid5_percpu handling RT aware __raid_run_ops() disables preemption with get_cpu() around the access to the raid5_percpu variables. 
That causes scheduling while atomic spews on RT. Serialize the access to the percpu data with a lock and keep the code preemptible. Reported-by: Udo van den Heuvel Signed-off-by: Thomas Gleixner Tested-by: Udo van den Heuvel --- drivers/md/raid5.c | 7 +++++-- drivers/md/raid5.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8577cc7db47e..1169bce67b89 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1649,8 +1649,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) struct raid5_percpu *percpu; unsigned long cpu; - cpu = get_cpu(); + cpu = get_cpu_light(); percpu = per_cpu_ptr(conf->percpu, cpu); + spin_lock(&percpu->lock); if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; @@ -1702,7 +1703,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } - put_cpu(); + spin_unlock(&percpu->lock); + put_cpu_light(); } static int grow_one_stripe(struct r5conf *conf, int hash) @@ -5708,6 +5710,7 @@ static int raid5_alloc_percpu(struct r5conf *conf) __func__, cpu); break; } + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); } put_online_cpus(); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index d59f5ca743cd..9aa9af4ab1aa 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -457,6 +457,7 @@ struct r5conf { int recovery_disabled; /* per cpu variables */ struct raid5_percpu { + spinlock_t lock; /* Protection for -RT */ struct page *spare_page; /* Used when checking P/Q in raid6 */ void *scribble; /* space for constructing buffer * lists and performing address -- cgit v1.2.3 From a26b62bb7ccfb5b7c9cb796198225b418f186838 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jun 2011 11:04:15 +0200 Subject: rtmutex-futex-prepare-rt.patch Signed-off-by: Thomas Gleixner --- kernel/futex.c | 77 +++++++++++++++++++++++++++++++++-------- kernel/locking/rtmutex.c | 33 +++++++++++++++--- kernel/locking/rtmutex_common.h | 2 ++ 3 files changed, 92 insertions(+), 20 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 63678b573d61..636210a780e8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1705,6 +1705,16 @@ retry_private: requeue_pi_wake_futex(this, &key2, hb2); drop_count++; continue; + } else if (ret == -EAGAIN) { + /* + * Waiter was woken by timeout or + * signal and has set pi_blocked_on to + * PI_WAKEUP_INPROGRESS before we + * tried to enqueue it on the rtmutex. + */ + this->pi_state = NULL; + free_pi_state(pi_state); + continue; } else if (ret) { /* -EDEADLK */ this->pi_state = NULL; @@ -2549,7 +2559,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; - struct futex_hash_bucket *hb; + struct futex_hash_bucket *hb, *hb2; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; int res, ret; @@ -2608,20 +2618,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ futex_wait_queue_me(hb, &q, to); - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); - spin_unlock(&hb->lock); - if (ret) - goto out_put_keys; + /* + * On RT we must avoid races with requeue and trying to block + * on two mutexes (hb->lock and uaddr2's rtmutex) by + * serializing access to pi_blocked_on with pi_lock. + */ + raw_spin_lock_irq(&current->pi_lock); + if (current->pi_blocked_on) { + /* + * We have been requeued or are in the process of + * being requeued. + */ + raw_spin_unlock_irq(&current->pi_lock); + } else { + /* + * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS + * prevents a concurrent requeue from moving us to the + * uaddr2 rtmutex. After that we can safely acquire + * (and possibly block on) hb->lock. + */ + current->pi_blocked_on = PI_WAKEUP_INPROGRESS; + raw_spin_unlock_irq(&current->pi_lock); + + spin_lock(&hb->lock); + + /* + * Clean up pi_blocked_on. We might leak it otherwise + * when we succeeded with the hb->lock in the fast + * path. + */ + raw_spin_lock_irq(&current->pi_lock); + current->pi_blocked_on = NULL; + raw_spin_unlock_irq(&current->pi_lock); + + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; + } /* - * In order for us to be here, we know our q.key == key2, and since - * we took the hb->lock above, we also know that futex_requeue() has - * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquisition by the requeue code. The - * futex_requeue dropped our key1 reference and incremented our key2 - * reference count. + * In order to be here, we have either been requeued, are in + * the process of being requeued, or requeue successfully + * acquired uaddr2 on our behalf. If pi_blocked_on was + * non-null above, we may be racing with a requeue. Do not + * rely on q->lock_ptr to be hb2->lock until after blocking on + * hb->lock or hb2->lock. The futex_requeue dropped our key1 + * reference and incremented our key2 reference count. */ + hb2 = hash_futex(&key2); /* Check if the requeue code acquired the second futex for us. */ if (!q.rt_waiter) { @@ -2630,9 +2675,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * did a lock-steal - fix up the PI-state in that case. */ if (q.pi_state && (q.pi_state->owner != current)) { - spin_lock(q.lock_ptr); + spin_lock(&hb2->lock); + BUG_ON(&hb2->lock != q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - spin_unlock(q.lock_ptr); + spin_unlock(&hb2->lock); } } else { /* @@ -2645,7 +2691,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); debug_rt_mutex_free_waiter(&rt_waiter); - spin_lock(q.lock_ptr); + spin_lock(&hb2->lock); + BUG_ON(&hb2->lock != q.lock_ptr); /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 41d53e515914..24fa327b1077 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -69,6 +69,11 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) clear_rt_mutex_waiters(lock); } +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) +{ + return waiter && waiter != PI_WAKEUP_INPROGRESS; +} + /* * We can speed up the acquire/release, if the architecture * supports cmpxchg and if there's no debugging state to be set up @@ -477,7 +482,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * reached or the state of the chain has changed while we * dropped the locks. */ - if (!waiter) + if (!rt_mutex_real_waiter(waiter)) goto out_unlock_pi; /* @@ -894,6 +899,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* + * In the case of futex requeue PI, this will be a proxy + * lock. The task will wake unaware that it is enqueueed on + * this lock. Avoid blocking on two locks and corrupting + * pi_blocked_on via the PI_WAKEUP_INPROGRESS + * flag. futex_wait_requeue_pi() sets this when it wakes up + * before requeue (due to a signal or timeout). Do not enqueue + * the task if PI_WAKEUP_INPROGRESS is set. + */ + if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return -EAGAIN; + } + + BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)); + __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -917,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, rt_mutex_enqueue_pi(owner, waiter); __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) + if (rt_mutex_real_waiter(owner->pi_blocked_on)) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { chain_walk = 1; @@ -1008,7 +1030,7 @@ static void remove_waiter(struct rt_mutex *lock, { bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); - struct rt_mutex *next_lock; + struct rt_mutex *next_lock = NULL; unsigned long flags; raw_spin_lock_irqsave(¤t->pi_lock, flags); @@ -1033,7 +1055,8 @@ static void remove_waiter(struct rt_mutex *lock, __rt_mutex_adjust_prio(owner); /* Store the lock on which owner is blocked or NULL */ - next_lock = task_blocked_on_lock(owner); + if (rt_mutex_real_waiter(owner->pi_blocked_on)) + next_lock = task_blocked_on_lock(owner); raw_spin_unlock_irqrestore(&owner->pi_lock, flags); @@ -1069,7 +1092,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || (waiter->prio == task->prio && + if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio && !dl_prio(task->prio))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 855212501407..50405fe5b818 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -119,6 +119,8 @@ enum rtmutex_chainwalk { /* * PI-futex support (proxy locking functions, etc.): */ +#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) + extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); -- cgit v1.2.3 From 1c7432eb1816e598884cb5392ba88956184e2386 Mon Sep 17 00:00:00 2001 From: Steven Rostedt 
Date: Wed, 20 May 2015 00:23:42 +0200 Subject: futex: Fix bug on when a requeued RT task times out Requeue with timeout causes a bug with PREEMPT_RT_FULL. The bug comes from a timed out condition. TASK 1 TASK 2 ------ ------ futex_wait_requeue_pi() futex_wait_queue_me() double_lock_hb(); raw_spin_lock(pi_lock); if (current->pi_blocked_on) { } else { current->pi_blocked_on = PI_WAKE_INPROGRESS; run_spin_unlock(pi_lock); spin_lock(hb->lock); <-- blocked! plist_for_each_entry_safe(this) { rt_mutex_start_proxy_lock(); task_blocks_on_rt_mutex(); BUG_ON(task->pi_blocked_on)!!!! The BUG_ON() actually has a check for PI_WAKE_INPROGRESS, but the problem is that, after TASK 1 sets PI_WAKE_INPROGRESS, it then tries to grab the hb->lock, which it fails to do so. As the hb->lock is a mutex, it will block and set the "pi_blocked_on" to the hb->lock. When TASK 2 goes to requeue it, the check for PI_WAKE_INPROGESS fails because the task1's pi_blocked_on is no longer set to that, but instead, set to the hb->lock. The fix: When calling rt_mutex_start_proxy_lock() a check is made to see if the proxy tasks pi_blocked_on is set. If so, exit out early. Otherwise set it to a new flag PI_REQUEUE_INPROGRESS, which notifies the proxy task that it is being requeued, and will handle things appropriately. Cc: stable-rt@vger.kernel.org Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex.c | 32 +++++++++++++++++++++++++++++++- kernel/locking/rtmutex_common.h | 1 + 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 24fa327b1077..669c2eef8a04 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -71,7 +71,8 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) { - return waiter && waiter != PI_WAKEUP_INPROGRESS; + return waiter && waiter != PI_WAKEUP_INPROGRESS && + waiter != PI_REQUEUE_INPROGRESS; } /* @@ -1581,6 +1582,35 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, return 1; } +#ifdef CONFIG_PREEMPT_RT_FULL + /* + * In PREEMPT_RT there's an added race. + * If the task, that we are about to requeue, times out, + * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue + * to skip this task. But right after the task sets + * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then + * block on the spin_lock(&hb->lock), which in RT is an rtmutex. + * This will replace the PI_WAKEUP_INPROGRESS with the actual + * lock that it blocks on. We *must not* place this task + * on this proxy lock in that case. + * + * To prevent this race, we first take the task's pi_lock + * and check if it has updated its pi_blocked_on. If it has, + * we assume that it woke up and we return -EAGAIN. + * Otherwise, we set the task's pi_blocked_on to + * PI_REQUEUE_INPROGRESS, so that if the task is waking up + * it will know that we are in the process of requeuing it. 
+ */ + raw_spin_lock_irq(&task->pi_lock); + if (task->pi_blocked_on) { + raw_spin_unlock_irq(&task->pi_lock); + raw_spin_unlock(&lock->wait_lock); + return -EAGAIN; + } + task->pi_blocked_on = PI_REQUEUE_INPROGRESS; + raw_spin_unlock_irq(&task->pi_lock); +#endif + /* We enforce deadlock detection for futexes */ ret = task_blocks_on_rt_mutex(lock, waiter, task, RT_MUTEX_FULL_CHAINWALK); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 50405fe5b818..f231e7510817 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -120,6 +120,7 @@ enum rtmutex_chainwalk { * PI-futex support (proxy locking functions, etc.): */ #define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) +#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2) extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, -- cgit v1.2.3 From 4193b9cf442a1b83966a4a4991df4971b2c3d7ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Mar 2013 11:17:42 +0100 Subject: futex: Ensure lock/unlock symmetry versus pi_lock and hash bucket lock In exit_pi_state_list() we have the following locking construct: spin_lock(&hb->lock); raw_spin_lock_irq(&curr->pi_lock); ... spin_unlock(&hb->lock); In !RT this works, but on RT the migrate_enable() function which is called from spin_unlock() sees atomic context due to the held pi_lock and just decrements the migrate_disable_atomic counter of the task. Now the next call to migrate_disable() sees the counter being negative and issues a warning. That check should be in migrate_enable() already. Fix this by dropping pi_lock before unlocking hb->lock and reacquiring pi_lock afterwards. This is safe as the loop code reevaluates head again under the pi_lock.
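The ordering rule behind the fix can be modelled outside the kernel. Below is a small, single-threaded C sketch (invented names, compile with -pthread) in which pi_lock stands in for the raw spinlock and hb_lock for the sleeping spinlock-turned-rtmutex; the assertion plays the role of the migrate_disable counter check and fires if the sleeping lock is ever released while the raw lock is still held.

/*
 * Illustration only -- not the kernel code. Models why hb->lock must not
 * be unlocked while pi_lock is held on RT: the sleeping-lock unlock path
 * ends in migrate_enable(), which must not run in atomic context.
 */
#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hb_lock = PTHREAD_MUTEX_INITIALIZER; /* sleeping lock on RT */
static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER; /* raw spinlock */
static int atomic_depth;	/* models "in atomic context" while a raw lock is held */

static void raw_lock(void)    { pthread_mutex_lock(&pi_lock); atomic_depth++; }
static void raw_unlock(void)  { atomic_depth--; pthread_mutex_unlock(&pi_lock); }

static void sleeping_unlock(void)
{
	/* models the migrate_enable()/migrate_disable() counter check */
	assert(atomic_depth == 0 && "sleeping lock released inside a raw lock");
	pthread_mutex_unlock(&hb_lock);
}

int main(void)
{
	pthread_mutex_lock(&hb_lock);	/* spin_lock(&hb->lock) */
	raw_lock();			/* raw_spin_lock_irq(&curr->pi_lock) */

	/* ... examine the PI state under both locks ... */

	/*
	 * Calling sleeping_unlock() right here, with pi_lock still held,
	 * is the broken ordering this patch removes. The fix releases the
	 * raw lock first, drops the sleeping lock, then re-takes the raw
	 * lock and re-evaluates the list head.
	 */
	raw_unlock();
	sleeping_unlock();
	raw_lock();
	/* ... re-check 'head' under pi_lock, as the loop does ... */
	raw_unlock();

	printf("lock/unlock symmetry preserved\n");
	return 0;
}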
Reported-by: Yong Zhang Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior --- kernel/futex.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/futex.c b/kernel/futex.c index 636210a780e8..21cbb3a8fdc7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -738,7 +738,9 @@ void exit_pi_state_list(struct task_struct *curr) * task still owns the PI-state: */ if (head->next != next) { + raw_spin_unlock_irq(&curr->pi_lock); spin_unlock(&hb->lock); + raw_spin_lock_irq(&curr->pi_lock); continue; } -- cgit v1.2.3 From e56012e404db087589c22493ee582d1f4e893655 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Jun 2011 11:43:52 +0200 Subject: rtmutex-lock-killable.patch Signed-off-by: Thomas Gleixner --- include/linux/rtmutex.h | 1 + kernel/locking/rtmutex.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 1abba5ce2a2f..51dc12e3860c 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -91,6 +91,7 @@ extern void rt_mutex_destroy(struct rt_mutex *lock); extern void rt_mutex_lock(struct rt_mutex *lock); extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); +extern int rt_mutex_lock_killable(struct rt_mutex *lock); extern int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 669c2eef8a04..eaf2bf7805c7 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1435,6 +1435,25 @@ int rt_mutex_timed_futex_lock(struct rt_mutex *lock, rt_mutex_slowlock); } +/** + * rt_mutex_lock_killable - lock a rt_mutex killable + * + * @lock: the rt_mutex to be locked + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int __sched rt_mutex_lock_killable(struct rt_mutex *lock) +{ + might_sleep(); + + return rt_mutex_fastlock(lock, TASK_KILLABLE, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + /** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided -- cgit v1.2.3 From 78d3443c4c04fe3feef95dc9bb3926dc12e415c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jun 2011 11:21:25 +0200 Subject: rt-mutex-add-sleeping-spinlocks-support.patch Signed-off-by: Thomas Gleixner --- include/linux/rtmutex.h | 27 ++- kernel/futex.c | 5 +- kernel/locking/rtmutex.c | 379 +++++++++++++++++++++++++++++++++++++--- kernel/locking/rtmutex_common.h | 11 ++ 4 files changed, 392 insertions(+), 30 deletions(-) diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 51dc12e3860c..8c9c53d1c0c9 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -18,6 +18,10 @@ extern int max_lock_depth; /* for sysctl */ +#ifdef CONFIG_DEBUG_MUTEXES +#include +#endif + /** * The rt_mutex structure * @@ -31,8 +35,8 @@ struct rt_mutex { struct rb_root waiters; struct rb_node *waiters_leftmost; struct task_struct *owner; -#ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; +#ifdef CONFIG_DEBUG_RT_MUTEXES const char *name, *file; int line; void *magic; @@ -55,22 +59,33 @@ struct hrtimer_sleeper; # define rt_mutex_debug_check_no_locks_held(task) do { } while (0) #endif +# define rt_mutex_init(mutex) \ + do { \ + raw_spin_lock_init(&(mutex)->wait_lock); \ + __rt_mutex_init(mutex, #mutex); \ + } while (0) + #ifdef CONFIG_DEBUG_RT_MUTEXES # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) 
\ , .name = #mutexname, .file = __FILE__, .line = __LINE__ -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __func__) extern void rt_mutex_debug_task_free(struct task_struct *tsk); #else # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL) # define rt_mutex_debug_task_free(t) do { } while (0) #endif -#define __RT_MUTEX_INITIALIZER(mutexname) \ - { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ , .waiters = RB_ROOT \ , .owner = NULL \ - __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} + __DEBUG_RT_MUTEX_INITIALIZER(mutexname) + +#define __RT_MUTEX_INITIALIZER(mutexname) \ + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) } + +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \ + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ + , .save_state = 1 } #define DEFINE_RT_MUTEX(mutexname) \ struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) diff --git a/kernel/futex.c b/kernel/futex.c index 21cbb3a8fdc7..647ff4b3a150 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2586,10 +2586,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. */ - debug_rt_mutex_init_waiter(&rt_waiter); - RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); - RB_CLEAR_NODE(&rt_waiter.tree_entry); - rt_waiter.task = NULL; + rt_mutex_init_waiter(&rt_waiter, false); ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index eaf2bf7805c7..43f845c132c3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -7,6 +7,11 @@ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen + * Adaptive Spinlocks: + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, + * and Peter Morreale, + * Adaptive Spinlocks simplification: + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt * * See Documentation/locking/rt-mutex-design.txt for details. */ @@ -339,6 +344,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, return debug_rt_mutex_detect_deadlock(waiter, chwalk); } +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter) +{ + if (waiter->savestate) + wake_up_lock_sleeper(waiter->task); + else + wake_up_process(waiter->task); +} + /* * Max number of times we'll walk the boosting chain: */ @@ -645,13 +658,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * follow here. This is the end of the chain we are walking. */ if (!rt_mutex_owner(lock)) { + struct rt_mutex_waiter *lock_top_waiter; + /* * If the requeue [7] above changed the top waiter, * then we need to wake the new top waiter up to try * to get the lock. 
*/ - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) - wake_up_process(rt_mutex_top_waiter(lock)->task); + lock_top_waiter = rt_mutex_top_waiter(lock); + if (prerequeue_top_waiter != lock_top_waiter) + rt_mutex_wake_waiter(lock_top_waiter); raw_spin_unlock(&lock->wait_lock); return 0; } @@ -744,6 +760,25 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, return ret; } + +#define STEAL_NORMAL 0 +#define STEAL_LATERAL 1 + +/* + * Note that RT tasks are excluded from lateral-steals to prevent the + * introduction of an unbounded latency + */ +static inline int lock_is_stealable(struct task_struct *task, + struct task_struct *pendowner, int mode) +{ + if (mode == STEAL_NORMAL || rt_task(task)) { + if (task->prio >= pendowner->prio) + return 0; + } else if (task->prio > pendowner->prio) + return 0; + return 1; +} + /* * Try to take an rt-mutex * @@ -754,8 +789,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * @waiter: The waiter that is queued to the lock's wait list if the * callsite called task_blocked_on_lock(), otherwise NULL */ -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) +static int __try_to_take_rt_mutex(struct rt_mutex *lock, + struct task_struct *task, + struct rt_mutex_waiter *waiter, int mode) { unsigned long flags; @@ -794,8 +830,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * If waiter is not the highest priority waiter of * @lock, give up. */ - if (waiter != rt_mutex_top_waiter(lock)) + if (waiter != rt_mutex_top_waiter(lock)) { + /* XXX lock_is_stealable() ? */ return 0; + } /* * We can acquire the lock. Remove the waiter from the @@ -813,14 +851,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * not need to be dequeued. */ if (rt_mutex_has_waiters(lock)) { - /* - * If @task->prio is greater than or equal to - * the top waiter priority (kernel view), - * @task lost. - */ - if (task->prio >= rt_mutex_top_waiter(lock)->prio) - return 0; + struct task_struct *pown = rt_mutex_top_waiter(lock)->task; + if (task != pown && !lock_is_stealable(task, pown, mode)) + return 0; /* * The current top waiter stays enqueued. We * don't have to change anything in the lock @@ -869,6 +903,314 @@ takeit: return 1; } +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * preemptible spin_lock functions: + */ +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + might_sleep(); + + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) + rt_mutex_deadlock_account_lock(lock, current); + else + slowfn(lock); +} + +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} +#ifdef CONFIG_SMP +/* + * Note that owner is a speculative pointer and dereferencing relies + * on rcu_read_lock() and the check against the lock owner. + */ +static int adaptive_wait(struct rt_mutex *lock, + struct task_struct *owner) +{ + int res = 0; + + rcu_read_lock(); + for (;;) { + if (owner != rt_mutex_owner(lock)) + break; + /* + * Ensure that owner->on_cpu is dereferenced _after_ + * checking the above to be valid. 
+ */ + barrier(); + if (!owner->on_cpu) { + res = 1; + break; + } + cpu_relax(); + } + rcu_read_unlock(); + return res; +} +#else +static int adaptive_wait(struct rt_mutex *lock, + struct task_struct *orig_owner) +{ + return 1; +} +#endif + +# define pi_lock(lock) raw_spin_lock_irq(lock) +# define pi_unlock(lock) raw_spin_unlock_irq(lock) + +static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + enum rtmutex_chainwalk chwalk); +/* + * Slow path lock function spin_lock style: this variant is very + * careful not to miss any non-lock wakeups. + * + * We store the current state under p->pi_lock in p->saved_state and + * the try_to_wake_up() code handles this accordingly. + */ +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) +{ + struct task_struct *lock_owner, *self = current; + struct rt_mutex_waiter waiter, *top_waiter; + int ret; + + rt_mutex_init_waiter(&waiter, true); + + raw_spin_lock(&lock->wait_lock); + + if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) { + raw_spin_unlock(&lock->wait_lock); + return; + } + + BUG_ON(rt_mutex_owner(lock) == self); + + /* + * We save whatever state the task is in and we'll restore it + * after acquiring the lock taking real wakeups into account + * as well. We are serialized via pi_lock against wakeups. See + * try_to_wake_up(). + */ + pi_lock(&self->pi_lock); + self->saved_state = self->state; + __set_current_state(TASK_UNINTERRUPTIBLE); + pi_unlock(&self->pi_lock); + + ret = task_blocks_on_rt_mutex(lock, &waiter, self, 0); + BUG_ON(ret); + + for (;;) { + /* Try to acquire the lock again. */ + if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL)) + break; + + top_waiter = rt_mutex_top_waiter(lock); + lock_owner = rt_mutex_owner(lock); + + raw_spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(&waiter); + + if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) + schedule_rt_mutex(lock); + + raw_spin_lock(&lock->wait_lock); + + pi_lock(&self->pi_lock); + __set_current_state(TASK_UNINTERRUPTIBLE); + pi_unlock(&self->pi_lock); + } + + /* + * Restore the task state to current->saved_state. We set it + * to the original state above and the try_to_wake_up() code + * has possibly updated it when a real (non-rtmutex) wakeup + * happened while we were blocked. Clear saved_state so + * try_to_wakeup() does not get confused. + */ + pi_lock(&self->pi_lock); + __set_current_state(self->saved_state); + self->saved_state = TASK_RUNNING; + pi_unlock(&self->pi_lock); + + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. 
We might have to fix that up: + */ + fixup_rt_mutex_waiters(lock); + + BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock)); + BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry)); + + raw_spin_unlock(&lock->wait_lock); + + debug_rt_mutex_free_waiter(&waiter); +} + +static void wakeup_next_waiter(struct rt_mutex *lock); +/* + * Slow path to release a rt_mutex spin_lock style + */ +static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) +{ + raw_spin_lock(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + raw_spin_unlock(&lock->wait_lock); + return; + } + + wakeup_next_waiter(lock); + + raw_spin_unlock(&lock->wait_lock); + + /* Undo pi boosting.when necessary */ + rt_mutex_adjust_prio(current); +} + +void __lockfunc rt_spin_lock(spinlock_t *lock) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock); + +void __lockfunc __rt_spin_lock(struct rt_mutex *lock) +{ + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); +} +EXPORT_SYMBOL(__rt_spin_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock_nested); +#endif + +void __lockfunc rt_spin_unlock(spinlock_t *lock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + spin_release(&lock->dep_map, 1, _RET_IP_); + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) +{ + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(__rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * (like raw spinlocks do), we lock and unlock, to force the kernel to + * schedule if there's contention: + */ +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_unlock_wait); + +int __lockfunc rt_spin_trylock(spinlock_t *lock) +{ + int ret; + + migrate_disable(); + ret = rt_mutex_trylock(&lock->lock); + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + else + migrate_enable(); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock) +{ + int ret; + + local_bh_disable(); + ret = rt_mutex_trylock(&lock->lock); + if (ret) { + migrate_disable(); + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + } else + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_bh); + +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) +{ + int ret; + + *flags = 0; + migrate_disable(); + ret = rt_mutex_trylock(&lock->lock); + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + else + migrate_enable(); + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_irqsave); + +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. 
it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + migrate_disable(); + rt_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + rt_spin_unlock(lock); + migrate_enable(); + return 0; +} +EXPORT_SYMBOL(atomic_dec_and_spin_lock); + + void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif +} +EXPORT_SYMBOL(__rt_spin_lock_init); + +#endif /* PREEMPT_RT_FULL */ + +static inline int +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) +{ + return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL); +} + /* * Task blocks on lock. * @@ -1017,7 +1359,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * long as we hold lock->wait_lock. The waiter task needs to * acquire it in order to dequeue the waiter. */ - wake_up_process(waiter->task); + rt_mutex_wake_waiter(waiter); } /* @@ -1099,11 +1441,11 @@ void rt_mutex_adjust_pi(struct task_struct *task) return; } next_lock = waiter->lock; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, next_lock, NULL, task); } @@ -1188,9 +1530,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, struct rt_mutex_waiter waiter; int ret = 0; - debug_rt_mutex_init_waiter(&waiter); - RB_CLEAR_NODE(&waiter.pi_tree_entry); - RB_CLEAR_NODE(&waiter.tree_entry); + rt_mutex_init_waiter(&waiter, false); raw_spin_lock(&lock->wait_lock); @@ -1532,13 +1872,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; - raw_spin_lock_init(&lock->wait_lock); lock->waiters = RB_ROOT; lock->waiters_leftmost = NULL; debug_rt_mutex_init(lock, name); } -EXPORT_SYMBOL_GPL(__rt_mutex_init); +EXPORT_SYMBOL(__rt_mutex_init); /** * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a @@ -1553,7 +1892,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { - __rt_mutex_init(lock, NULL); + rt_mutex_init(lock); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); rt_mutex_deadlock_account_lock(lock, proxy_owner); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index f231e7510817..c6dcda5e53af 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -49,6 +49,7 @@ struct rt_mutex_waiter { struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex *lock; + bool savestate; #ifdef CONFIG_DEBUG_RT_MUTEXES unsigned long ip; struct pid *deadlock_task_pid; @@ -141,4 +142,14 @@ extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper # include "rtmutex.h" #endif +static inline void +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate) +{ + debug_rt_mutex_init_waiter(waiter); + waiter->task = NULL; + waiter->savestate = savestate; + RB_CLEAR_NODE(&waiter->pi_tree_entry); + RB_CLEAR_NODE(&waiter->tree_entry); +} + #endif -- cgit v1.2.3 From 6eb9dc48ed5a854c61ff586e6424a136ac0aa4b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 19:34:01 +0200 Subject: 
spinlock-types-separate-raw.patch Signed-off-by: Thomas Gleixner --- include/linux/rwlock_types.h | 4 ++ include/linux/spinlock_types.h | 74 +------------------------------------ include/linux/spinlock_types_nort.h | 33 +++++++++++++++++ include/linux/spinlock_types_raw.h | 56 ++++++++++++++++++++++++++++ 4 files changed, 95 insertions(+), 72 deletions(-) create mode 100644 include/linux/spinlock_types_nort.h create mode 100644 include/linux/spinlock_types_raw.h diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index cc0072e93e36..5317cd957292 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -1,6 +1,10 @@ #ifndef __LINUX_RWLOCK_TYPES_H #define __LINUX_RWLOCK_TYPES_H +#if !defined(__LINUX_SPINLOCK_TYPES_H) +# error "Do not include directly, include spinlock_types.h" +#endif + /* * include/linux/rwlock_types.h - generic rwlock type definitions * and initializers diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 73548eb13a5d..5c8664d57fb8 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -9,79 +9,9 @@ * Released under the General Public License (GPL). */ -#if defined(CONFIG_SMP) -# include -#else -# include -#endif +#include -#include - -typedef struct raw_spinlock { - arch_spinlock_t raw_lock; -#ifdef CONFIG_GENERIC_LOCKBREAK - unsigned int break_lock; -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - unsigned int magic, owner_cpu; - void *owner; -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} raw_spinlock_t; - -#define SPINLOCK_MAGIC 0xdead4ead - -#define SPINLOCK_OWNER_INIT ((void *)-1L) - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } -#else -# define SPIN_DEP_MAP_INIT(lockname) -#endif - -#ifdef CONFIG_DEBUG_SPINLOCK -# define SPIN_DEBUG_INIT(lockname) \ - .magic = SPINLOCK_MAGIC, \ - .owner_cpu = -1, \ - .owner = SPINLOCK_OWNER_INIT, -#else -# define SPIN_DEBUG_INIT(lockname) -#endif - -#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ - { \ - .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ - SPIN_DEBUG_INIT(lockname) \ - SPIN_DEP_MAP_INIT(lockname) } - -#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ - (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) - -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) - -typedef struct spinlock { - union { - struct raw_spinlock rlock; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) - struct { - u8 __padding[LOCK_PADSIZE]; - struct lockdep_map dep_map; - }; -#endif - }; -} spinlock_t; - -#define __SPIN_LOCK_INITIALIZER(lockname) \ - { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } } - -#define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname) - -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) +#include #include diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h new file mode 100644 index 000000000000..f1dac1fb1d6a --- /dev/null +++ b/include/linux/spinlock_types_nort.h @@ -0,0 +1,33 @@ +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H +#define __LINUX_SPINLOCK_TYPES_NORT_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +#error "Do not include directly. 
Include spinlock_types.h instead" +#endif + +/* + * The non RT version maps spinlocks to raw_spinlocks + */ +typedef struct spinlock { + union { + struct raw_spinlock rlock; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) + struct { + u8 __padding[LOCK_PADSIZE]; + struct lockdep_map dep_map; + }; +#endif + }; +} spinlock_t; + +#define __SPIN_LOCK_INITIALIZER(lockname) \ + { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } } + +#define __SPIN_LOCK_UNLOCKED(lockname) \ + (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname) + +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) + +#endif diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h new file mode 100644 index 000000000000..edffc4d53fc9 --- /dev/null +++ b/include/linux/spinlock_types_raw.h @@ -0,0 +1,56 @@ +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H +#define __LINUX_SPINLOCK_TYPES_RAW_H + +#if defined(CONFIG_SMP) +# include +#else +# include +#endif + +#include + +typedef struct raw_spinlock { + arch_spinlock_t raw_lock; +#ifdef CONFIG_GENERIC_LOCKBREAK + unsigned int break_lock; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + unsigned int magic, owner_cpu; + void *owner; +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} raw_spinlock_t; + +#define SPINLOCK_MAGIC 0xdead4ead + +#define SPINLOCK_OWNER_INIT ((void *)-1L) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } +#else +# define SPIN_DEP_MAP_INIT(lockname) +#endif + +#ifdef CONFIG_DEBUG_SPINLOCK +# define SPIN_DEBUG_INIT(lockname) \ + .magic = SPINLOCK_MAGIC, \ + .owner_cpu = -1, \ + .owner = SPINLOCK_OWNER_INIT, +#else +# define SPIN_DEBUG_INIT(lockname) +#endif + +#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ + { \ + .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ + SPIN_DEBUG_INIT(lockname) \ + SPIN_DEP_MAP_INIT(lockname) } + +#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ + (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) + +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) + +#endif -- cgit v1.2.3 From 10510d3c5d6e80b19819cee95d099dcd2a0ca071 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 20:06:39 +0200 Subject: rtmutex-avoid-include-hell.patch Signed-off-by: Thomas Gleixner --- include/linux/rtmutex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 8c9c53d1c0c9..d5a04ea47a13 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -14,7 +14,7 @@ #include #include -#include +#include extern int max_lock_depth; /* for sysctl */ -- cgit v1.2.3 From 8cd746a897ed068a06b34decb16234a641c8a086 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 19:43:35 +0200 Subject: rt-add-rt-spinlocks.patch Signed-off-by: Thomas Gleixner --- include/linux/rwlock_types_rt.h | 33 +++++++++++++++++++++++++ include/linux/spinlock_types.h | 11 ++++++--- include/linux/spinlock_types_rt.h | 51 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 3 deletions(-) create mode 100644 include/linux/rwlock_types_rt.h create mode 100644 include/linux/spinlock_types_rt.h diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h new file mode 100644 index 000000000000..b13832119591 --- /dev/null +++ b/include/linux/rwlock_types_rt.h @@ -0,0 +1,33 @@ +#ifndef __LINUX_RWLOCK_TYPES_RT_H +#define __LINUX_RWLOCK_TYPES_RT_H + +#ifndef __LINUX_SPINLOCK_TYPES_H 
+#error "Do not include directly. Include spinlock_types.h instead" +#endif + +/* + * rwlocks - rtmutex which allows single reader recursion + */ +typedef struct { + struct rt_mutex lock; + int read_depth; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} rwlock_t; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } +#else +# define RW_DEP_MAP_INIT(lockname) +#endif + +#define __RW_LOCK_UNLOCKED(name) \ + { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \ + RW_DEP_MAP_INIT(name) } + +#define DEFINE_RWLOCK(name) \ + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) + +#endif diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 5c8664d57fb8..10bac715ea96 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -11,8 +11,13 @@ #include -#include - -#include +#ifndef CONFIG_PREEMPT_RT_FULL +# include +# include +#else +# include +# include +# include +#endif #endif /* __LINUX_SPINLOCK_TYPES_H */ diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h new file mode 100644 index 000000000000..9fd431967abc --- /dev/null +++ b/include/linux/spinlock_types_rt.h @@ -0,0 +1,51 @@ +#ifndef __LINUX_SPINLOCK_TYPES_RT_H +#define __LINUX_SPINLOCK_TYPES_RT_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +#error "Do not include directly. Include spinlock_types.h instead" +#endif + +#include + +/* + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field: + */ +typedef struct spinlock { + struct rt_mutex lock; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} spinlock_t; + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define __RT_SPIN_INITIALIZER(name) \ + { \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + .save_state = 1, \ + .file = __FILE__, \ + .line = __LINE__ , \ + } +#else +# define __RT_SPIN_INITIALIZER(name) \ + { \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + .save_state = 1, \ + } +#endif + +/* +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock) +*/ + +#define __SPIN_LOCK_UNLOCKED(name) \ + { .lock = __RT_SPIN_INITIALIZER(name.lock), \ + SPIN_DEP_MAP_INIT(name) } + +#define __DEFINE_SPINLOCK(name) \ + spinlock_t name = __SPIN_LOCK_UNLOCKED(name) + +#define DEFINE_SPINLOCK(name) \ + spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name) + +#endif -- cgit v1.2.3 From 6e2c7f1a25a378bac1710cf2c027d80250c5fc2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 20:56:22 +0200 Subject: rt-add-rt-to-mutex-headers.patch Signed-off-by: Thomas Gleixner --- include/linux/mutex.h | 20 ++++++++---- include/linux/mutex_rt.h | 84 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 7 deletions(-) create mode 100644 include/linux/mutex_rt.h diff --git a/include/linux/mutex.h b/include/linux/mutex.h index cc31498fc526..a9143091a468 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -19,6 +19,17 @@ #include #include +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ + , .dep_map = { .name = #lockname } +#else +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +#endif + +#ifdef CONFIG_PREEMPT_RT_FULL +# include +#else + /* * Simple, straightforward mutexes with strict semantics: * @@ -100,13 +111,6 @@ do { \ static inline void mutex_destroy(struct mutex *lock) {} #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define 
__DEP_MAP_MUTEX_INITIALIZER(lockname) \ - , .dep_map = { .name = #lockname } -#else -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) -#endif - #define __MUTEX_INITIALIZER(lockname) \ { .count = ATOMIC_INIT(1) \ , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ @@ -174,6 +178,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); +#endif /* !PREEMPT_RT_FULL */ + extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); #endif /* __LINUX_MUTEX_H */ diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h new file mode 100644 index 000000000000..c38a44b14da5 --- /dev/null +++ b/include/linux/mutex_rt.h @@ -0,0 +1,84 @@ +#ifndef __LINUX_MUTEX_RT_H +#define __LINUX_MUTEX_RT_H + +#ifndef __LINUX_MUTEX_H +#error "Please include mutex.h" +#endif + +#include + +/* FIXME: Just for __lockfunc */ +#include + +struct mutex { + struct rt_mutex lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __MUTEX_INITIALIZER(mutexname) \ + { \ + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ + } + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) + +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key); +extern void __lockfunc _mutex_lock(struct mutex *lock); +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); +extern int __lockfunc _mutex_lock_killable(struct mutex *lock); +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_trylock(struct mutex *lock); +extern void __lockfunc _mutex_unlock(struct mutex *lock); + +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) +#define mutex_lock(l) _mutex_lock(l) +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) +#define mutex_lock_killable(l) _mutex_lock_killable(l) +#define mutex_trylock(l) _mutex_trylock(l) +#define mutex_unlock(l) _mutex_unlock(l) +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible_nested(l, s) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable_nested(l, s) + +# define mutex_lock_nest_lock(lock, nest_lock) \ +do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ +} while (0) + +#else +# define mutex_lock_nested(l, s) _mutex_lock(l) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible(l) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable(l) +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) +#endif + +# define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_init(&(mutex)->lock); \ + __mutex_do_init((mutex), #mutex, &__key); \ +} while (0) + +# define __mutex_init(mutex, name, key) \ +do { \ + rt_mutex_init(&(mutex)->lock); \ + __mutex_do_init((mutex), name, key); \ +} while (0) + +#endif -- cgit v1.2.3 From d0119f9b8edd940d2719746b143b5de4eca9c3c7 
Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jun 2011 21:02:53 +0200 Subject: rwsem-add-rt-variant.patch Signed-off-by: Thomas Gleixner --- include/linux/rwsem.h | 6 +++ include/linux/rwsem_rt.h | 134 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/locking/Makefile | 2 + 3 files changed, 142 insertions(+) create mode 100644 include/linux/rwsem_rt.h diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8f498cdde280..2b2148431f14 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -18,6 +18,10 @@ #include #endif +#ifdef CONFIG_PREEMPT_RT_FULL +#include +#else /* PREEMPT_RT_FULL */ + struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -177,4 +181,6 @@ extern void up_read_non_owner(struct rw_semaphore *sem); # define up_read_non_owner(sem) up_read(sem) #endif +#endif /* !PREEMPT_RT_FULL */ + #endif /* _LINUX_RWSEM_H */ diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h new file mode 100644 index 000000000000..924c2d274ab5 --- /dev/null +++ b/include/linux/rwsem_rt.h @@ -0,0 +1,134 @@ +#ifndef _LINUX_RWSEM_RT_H +#define _LINUX_RWSEM_RT_H + +#ifndef _LINUX_RWSEM_H +#error "Include rwsem.h" +#endif + +/* + * RW-semaphores are a spinlock plus a reader-depth count. + * + * Note that the semantics are different from the usual + * Linux rw-sems, in PREEMPT_RT mode we do not allow + * multiple readers to hold the lock at once, we only allow + * a read-lock owner to read-lock recursively. This is + * better for latency, makes the implementation inherently + * fair and makes it simpler as well. + */ + +#include + +struct rw_semaphore { + struct rt_mutex lock; + int read_depth; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __RWSEM_INITIALIZER(name) \ + { .lock = __RT_MUTEX_INITIALIZER(name.lock), \ + RW_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(lockname) \ + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) + +extern void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name, + struct lock_class_key *key); + +#define __rt_init_rwsem(sem, name, key) \ + do { \ + rt_mutex_init(&(sem)->lock); \ + __rt_rwsem_init((sem), (name), (key));\ + } while (0) + +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key) + +# define rt_init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_init_rwsem((sem), #sem, &__key); \ +} while (0) + +extern void rt_down_write(struct rw_semaphore *rwsem); +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass); +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass); +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem, + struct lockdep_map *nest); +extern void rt_down_read(struct rw_semaphore *rwsem); +extern int rt_down_write_trylock(struct rw_semaphore *rwsem); +extern int rt_down_read_trylock(struct rw_semaphore *rwsem); +extern void rt_up_read(struct rw_semaphore *rwsem); +extern void rt_up_write(struct rw_semaphore *rwsem); +extern void rt_downgrade_write(struct rw_semaphore *rwsem); + +#define init_rwsem(sem) rt_init_rwsem(sem) +#define rwsem_is_locked(s) rt_mutex_is_locked(&(s)->lock) + +static inline int rwsem_is_contended(struct rw_semaphore *sem) +{ + /* rt_mutex_has_waiters() */ + return !RB_EMPTY_ROOT(&sem->lock.waiters); +} + +static inline void down_read(struct rw_semaphore *sem) +{ + rt_down_read(sem); +} + +static inline int down_read_trylock(struct rw_semaphore *sem) +{ + return rt_down_read_trylock(sem); +} + +static inline void down_write(struct 
rw_semaphore *sem) +{ + rt_down_write(sem); +} + +static inline int down_write_trylock(struct rw_semaphore *sem) +{ + return rt_down_write_trylock(sem); +} + +static inline void up_read(struct rw_semaphore *sem) +{ + rt_up_read(sem); +} + +static inline void up_write(struct rw_semaphore *sem) +{ + rt_up_write(sem); +} + +static inline void downgrade_write(struct rw_semaphore *sem) +{ + rt_downgrade_write(sem); +} + +static inline void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + return rt_down_read_nested(sem, subclass); +} + +static inline void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + rt_down_write_nested(sem, subclass); +} +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void down_write_nest_lock(struct rw_semaphore *sem, + struct rw_semaphore *nest_lock) +{ + rt_down_write_nested_lock(sem, &nest_lock->dep_map); +} + +#else + +static inline void down_write_nest_lock(struct rw_semaphore *sem, + struct rw_semaphore *nest_lock) +{ + rt_down_write_nested_lock(sem, NULL); +} +#endif +#endif diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8541bfdfd232..e68edfb1a53e 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -21,8 +21,10 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o +ifneq ($(CONFIG_PREEMPT_RT_FULL),y) obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o +endif obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o -- cgit v1.2.3 From f34fc9030f48569ab704d39481d4ec6a247dac11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Jul 2009 19:39:56 +0200 Subject: rt: Add the preempt-rt lock replacement APIs Map spinlocks, rwlocks, rw_semaphores and semaphores to the rt_mutex based locking functions for preempt-rt. Signed-off-by: Thomas Gleixner --- include/linux/locallock.h | 6 + include/linux/rwlock_rt.h | 123 +++++++++++ include/linux/spinlock.h | 12 +- include/linux/spinlock_api_smp.h | 4 +- include/linux/spinlock_rt.h | 155 ++++++++++++++ kernel/locking/Makefile | 7 +- kernel/locking/rt.c | 442 +++++++++++++++++++++++++++++++++++++++ kernel/locking/spinlock.c | 7 + kernel/locking/spinlock_debug.c | 5 + 9 files changed, 758 insertions(+), 3 deletions(-) create mode 100644 include/linux/rwlock_rt.h create mode 100644 include/linux/spinlock_rt.h create mode 100644 kernel/locking/rt.c diff --git a/include/linux/locallock.h b/include/linux/locallock.h index eb338ce64aa9..21653e9bfa20 100644 --- a/include/linux/locallock.h +++ b/include/linux/locallock.h @@ -42,9 +42,15 @@ struct local_irq_lock { * already takes care of the migrate_disable/enable * for CONFIG_PREEMPT_BASE map to the normal spin_* calls. 
*/ +#ifdef CONFIG_PREEMPT_RT_FULL +# define spin_lock_local(lock) rt_spin_lock(lock) +# define spin_trylock_local(lock) rt_spin_trylock(lock) +# define spin_unlock_local(lock) rt_spin_unlock(lock) +#else # define spin_lock_local(lock) spin_lock(lock) # define spin_trylock_local(lock) spin_trylock(lock) # define spin_unlock_local(lock) spin_unlock(lock) +#endif static inline void __local_lock(struct local_irq_lock *lv) { diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h new file mode 100644 index 000000000000..853ee367fef4 --- /dev/null +++ b/include/linux/rwlock_rt.h @@ -0,0 +1,123 @@ +#ifndef __LINUX_RWLOCK_RT_H +#define __LINUX_RWLOCK_RT_H + +#ifndef __LINUX_SPINLOCK_H +#error Do not include directly. Use spinlock.h +#endif + +#define rwlock_init(rwl) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_init(&(rwl)->lock); \ + __rt_rwlock_init(rwl, #rwl, &__key); \ +} while (0) + +extern void __lockfunc rt_write_lock(rwlock_t *rwlock); +extern void __lockfunc rt_read_lock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags); +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock); +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); + +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock)) +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock)) + +#define write_trylock_irqsave(lock, flags) \ + __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags)) + +#define read_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + migrate_disable(); \ + flags = rt_read_lock_irqsave(lock); \ + } while (0) + +#define write_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + migrate_disable(); \ + flags = rt_write_lock_irqsave(lock); \ + } while (0) + +#define read_lock(lock) \ + do { \ + migrate_disable(); \ + rt_read_lock(lock); \ + } while (0) + +#define read_lock_bh(lock) \ + do { \ + local_bh_disable(); \ + migrate_disable(); \ + rt_read_lock(lock); \ + } while (0) + +#define read_lock_irq(lock) read_lock(lock) + +#define write_lock(lock) \ + do { \ + migrate_disable(); \ + rt_write_lock(lock); \ + } while (0) + +#define write_lock_bh(lock) \ + do { \ + local_bh_disable(); \ + migrate_disable(); \ + rt_write_lock(lock); \ + } while (0) + +#define write_lock_irq(lock) write_lock(lock) + +#define read_unlock(lock) \ + do { \ + rt_read_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#define read_unlock_bh(lock) \ + do { \ + rt_read_unlock(lock); \ + migrate_enable(); \ + local_bh_enable(); \ + } while (0) + +#define read_unlock_irq(lock) read_unlock(lock) + +#define write_unlock(lock) \ + do { \ + rt_write_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#define write_unlock_bh(lock) \ + do { \ + rt_write_unlock(lock); \ + migrate_enable(); \ + local_bh_enable(); \ + } while (0) + +#define write_unlock_irq(lock) write_unlock(lock) + +#define read_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_read_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#define write_unlock_irqrestore(lock, flags) \ + do { \ + 
typecheck(unsigned long, flags); \ + (void) flags; \ + rt_write_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#endif diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 262ba4ef9a8e..98e8026598f5 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -278,7 +278,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock)) /* Include rwlock functions */ -#include +#ifdef CONFIG_PREEMPT_RT_FULL +# include +#else +# include +#endif /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: @@ -289,6 +293,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) # include #endif +#ifdef CONFIG_PREEMPT_RT_FULL +# include +#else /* PREEMPT_RT_FULL */ + /* * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ @@ -418,4 +426,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); #define atomic_dec_and_lock(atomic, lock) \ __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) +#endif /* !PREEMPT_RT_FULL */ + #endif /* __LINUX_SPINLOCK_H */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 42dfab89e740..29d99ae5a8ab 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -187,6 +187,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) return 0; } -#include +#ifndef CONFIG_PREEMPT_RT_FULL +# include +#endif #endif /* __LINUX_SPINLOCK_API_SMP_H */ diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h new file mode 100644 index 000000000000..621d857ec546 --- /dev/null +++ b/include/linux/spinlock_rt.h @@ -0,0 +1,155 @@ +#ifndef __LINUX_SPINLOCK_RT_H +#define __LINUX_SPINLOCK_RT_H + +#ifndef __LINUX_SPINLOCK_H +#error Do not include directly. Use spinlock.h +#endif + +#include + +extern void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key); + +#define spin_lock_init(slock) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_init(&(slock)->lock); \ + __rt_spin_lock_init(slock, #slock, &__key); \ +} while (0) + +extern void __lockfunc rt_spin_lock(spinlock_t *lock); +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock); +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); +extern void __lockfunc rt_spin_unlock(spinlock_t *lock); +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock); +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock); +extern int __lockfunc rt_spin_trylock(spinlock_t *lock); +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock); + +/* + * lockdep-less calls, for derived types like rwlock: + * (for trylock they can use rt_mutex_trylock() directly. 
+ */ +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); + +#define spin_lock(lock) \ + do { \ + migrate_disable(); \ + rt_spin_lock(lock); \ + } while (0) + +#define spin_lock_bh(lock) \ + do { \ + local_bh_disable(); \ + migrate_disable(); \ + rt_spin_lock(lock); \ + } while (0) + +#define spin_lock_irq(lock) spin_lock(lock) + +#define spin_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) + +#ifdef CONFIG_LOCKDEP +# define spin_lock_nested(lock, subclass) \ + do { \ + migrate_disable(); \ + rt_spin_lock_nested(lock, subclass); \ + } while (0) + +# define spin_lock_irqsave_nested(lock, flags, subclass) \ + do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + migrate_disable(); \ + rt_spin_lock_nested(lock, subclass); \ + } while (0) +#else +# define spin_lock_nested(lock, subclass) spin_lock(lock) + +# define spin_lock_irqsave_nested(lock, flags, subclass) \ + do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + spin_lock(lock); \ + } while (0) +#endif + +#define spin_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + spin_lock(lock); \ + } while (0) + +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock) +{ + unsigned long flags = 0; +#ifdef CONFIG_TRACE_IRQFLAGS + flags = rt_spin_lock_trace_flags(lock); +#else + spin_lock(lock); /* lock_local */ +#endif + return flags; +} + +/* FIXME: we need rt_spin_lock_nest_lock */ +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0) + +#define spin_unlock(lock) \ + do { \ + rt_spin_unlock(lock); \ + migrate_enable(); \ + } while (0) + +#define spin_unlock_bh(lock) \ + do { \ + rt_spin_unlock(lock); \ + migrate_enable(); \ + local_bh_enable(); \ + } while (0) + +#define spin_unlock_irq(lock) spin_unlock(lock) + +#define spin_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + spin_unlock(lock); \ + } while (0) + +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock)) +#define spin_trylock_irq(lock) spin_trylock(lock) + +#define spin_trylock_irqsave(lock, flags) \ + rt_spin_trylock_irqsave(lock, &(flags)) + +#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock) + +#ifdef CONFIG_GENERIC_LOCKBREAK +# define spin_is_contended(lock) ((lock)->break_lock) +#else +# define spin_is_contended(lock) (((void)(lock), 0)) +#endif + +static inline int spin_can_lock(spinlock_t *lock) +{ + return !rt_mutex_is_locked(&lock->lock); +} + +static inline int spin_is_locked(spinlock_t *lock) +{ + return rt_mutex_is_locked(&lock->lock); +} + +static inline void assert_spin_locked(spinlock_t *lock) +{ + BUG_ON(!spin_is_locked(lock)); +} + +#define atomic_dec_and_lock(atomic, lock) \ + atomic_dec_and_spin_lock(atomic, lock) + +#endif diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index e68edfb1a53e..5cb3588be5c5 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o +obj-y += semaphore.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -8,7 +8,11 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg endif +ifneq ($(CONFIG_PREEMPT_RT_FULL),y) +obj-y += mutex.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +obj-y += rwsem.o +endif obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o @@ -26,5 +30,6 @@ obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += 
rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o endif obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c new file mode 100644 index 000000000000..2852623d46a7 --- /dev/null +++ b/kernel/locking/rt.c @@ -0,0 +1,442 @@ +/* + * kernel/rt.c + * + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * historic credit for proving that Linux spinlocks can be implemented via + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow + * and others) who prototyped it on 2.4 and did lots of comparative + * research and analysis; TimeSys, for proving that you can implement a + * fully preemptible kernel via the use of IRQ threading and mutexes; + * Bill Huey for persuasively arguing on lkml that the mutex model is the + * right one; and to MontaVista, who ported pmutexes to 2.6. + * + * This code is a from-scratch implementation and is not based on pmutexes, + * but the idea of converting spinlocks to mutexes is used here too. + * + * lock debugging, locking tree, deadlock detection: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + * + * Includes portions of the generic R/W semaphore implementation from: + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli + * - Derived also from comments by Linus + * + * Pending ownership of locks and ownership stealing: + * + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt + * + * (also by Steven Rostedt) + * - Converted single pi_lock to individual task locks. + * + * By Esben Nielsen: + * Doing priority inheritance with help of the scheduler. + * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * - major rework based on Esben Nielsens initial patch + * - replaced thread_info references by task_struct refs + * - removed task->pending_owner dependency + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks + * in the scheduler return path as discussed with Steven Rostedt + * + * Copyright (C) 2006, Kihon Technologies Inc. + * Steven Rostedt + * - debugged and patched Thomas Gleixner's rework. + * - added back the cmpxchg to the rework. + * - turned atomic require back on for SMP. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtmutex_common.h" + +/* + * struct mutex functions + */ +void __mutex_do_init(struct mutex *mutex, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); + lockdep_init_map(&mutex->dep_map, name, key, 0); +#endif + mutex->lock.save_state = 0; +} +EXPORT_SYMBOL(__mutex_do_init); + +void __lockfunc _mutex_lock(struct mutex *lock) +{ + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock); + +int __lockfunc _mutex_lock_interruptible(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_lock_interruptible(&lock->lock); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible); + +int __lockfunc _mutex_lock_killable(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_lock_killable(&lock->lock); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) +{ + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock_nested); + +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) +{ + mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock_nest_lock); + +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); + ret = rt_mutex_lock_interruptible(&lock->lock); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible_nested); + +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = rt_mutex_lock_killable(&lock->lock); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable_nested); +#endif + +int __lockfunc _mutex_trylock(struct mutex *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(_mutex_trylock); + +void __lockfunc _mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_unlock); + +/* + * rwlock_t functions + */ +int __lockfunc rt_write_trylock(rwlock_t *rwlock) +{ + int ret = rt_mutex_trylock(&rwlock->lock); + + migrate_disable(); + if (ret) + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + else + migrate_enable(); + + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) +{ + int ret; + + *flags = 0; + migrate_disable(); + ret = rt_write_trylock(rwlock); + if (!ret) + migrate_enable(); + return ret; +} +EXPORT_SYMBOL(rt_write_trylock_irqsave); + +int __lockfunc rt_read_trylock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + int ret = 1; + + /* + * recursive read locks succeed when current owns the 
lock, + * but not when read_depth == 0 which means that the lock is + * write locked. + */ + migrate_disable(); + if (rt_mutex_owner(lock) != current) + ret = rt_mutex_trylock(lock); + else if (!rwlock->read_depth) + ret = 0; + + if (ret) { + rwlock->read_depth++; + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + } else + migrate_enable(); + + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +void __lockfunc rt_write_lock(rwlock_t *rwlock) +{ + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + __rt_spin_lock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_lock); + +void __lockfunc rt_read_lock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + + /* + * recursive read locks succeed when current owns the lock + */ + if (rt_mutex_owner(lock) != current) + __rt_spin_lock(lock); + rwlock->read_depth++; +} + +EXPORT_SYMBOL(rt_read_lock); + +void __lockfunc rt_write_unlock(rwlock_t *rwlock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_unlock); + +void __lockfunc rt_read_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + + /* Release the lock only when read_depth is down to 0 */ + if (--rwlock->read_depth == 0) + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_read_unlock); + +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock) +{ + rt_write_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_write_lock_irqsave); + +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock) +{ + rt_read_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_read_lock_irqsave); + +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); + lockdep_init_map(&rwlock->dep_map, name, key, 0); +#endif + rwlock->lock.save_state = 1; + rwlock->read_depth = 0; +} +EXPORT_SYMBOL(__rt_rwlock_init); + +/* + * rw_semaphores + */ + +void rt_up_write(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_write); + +void rt_up_read(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + if (--rwsem->read_depth == 0) + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_read); + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void rt_downgrade_write(struct rw_semaphore *rwsem) +{ + BUG_ON(rt_mutex_owner(&rwsem->lock) != current); + rwsem->read_depth = 1; +} +EXPORT_SYMBOL(rt_downgrade_write); + +int rt_down_write_trylock(struct rw_semaphore *rwsem) +{ + int ret = rt_mutex_trylock(&rwsem->lock); + + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(rt_down_write_trylock); + +void rt_down_write(struct rw_semaphore *rwsem) +{ + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write); + +void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) +{ + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write_nested); + +int rt_down_read_trylock(struct rw_semaphore *rwsem) +{ + struct rt_mutex *lock = &rwsem->lock; + int ret = 1; + + /* + * recursive read locks succeed when current owns 
the rwsem, + * but not when read_depth == 0 which means that the rwsem is + * write locked. + */ + if (rt_mutex_owner(lock) != current) + ret = rt_mutex_trylock(&rwsem->lock); + else if (!rwsem->read_depth) + ret = 0; + + if (ret) { + rwsem->read_depth++; + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + } + return ret; +} +EXPORT_SYMBOL(rt_down_read_trylock); + +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) +{ + struct rt_mutex *lock = &rwsem->lock; + + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); + + if (rt_mutex_owner(lock) != current) + rt_mutex_lock(&rwsem->lock); + rwsem->read_depth++; +} + +void rt_down_read(struct rw_semaphore *rwsem) +{ + __rt_down_read(rwsem, 0); +} +EXPORT_SYMBOL(rt_down_read); + +void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass) +{ + __rt_down_read(rwsem, subclass); +} +EXPORT_SYMBOL(rt_down_read_nested); + +void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem)); + lockdep_init_map(&rwsem->dep_map, name, key, 0); +#endif + rwsem->read_depth = 0; + rwsem->lock.save_state = 0; +} +EXPORT_SYMBOL(__rt_rwsem_init); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..5c76166f88e2 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_bh() */ BUILD_LOCK_OPS(spin, raw_spinlock); + +#ifndef CONFIG_PREEMPT_RT_FULL BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); +#endif #endif @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif +#ifndef CONFIG_PREEMPT_RT_FULL + #ifndef CONFIG_INLINE_READ_TRYLOCK int __lockfunc _raw_read_trylock(rwlock_t *lock) { @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) EXPORT_SYMBOL(_raw_write_unlock_bh); #endif +#endif /* !PREEMPT_RT_FULL */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 0374a596cffa..94970338d518 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, EXPORT_SYMBOL(__raw_spin_lock_init); +#ifndef CONFIG_PREEMPT_RT_FULL void __rwlock_init(rwlock_t *lock, const char *name, struct lock_class_key *key) { @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, } EXPORT_SYMBOL(__rwlock_init); +#endif static void spin_dump(raw_spinlock_t *lock, const char *msg) { @@ -159,6 +161,7 @@ 
void do_raw_spin_unlock(raw_spinlock_t *lock) arch_spin_unlock(&lock->raw_lock); } +#ifndef CONFIG_PREEMPT_RT_FULL static void rwlock_bug(rwlock_t *lock, const char *msg) { if (!debug_locks_off()) @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock) debug_write_unlock(lock); arch_write_unlock(&lock->raw_lock); } + +#endif -- cgit v1.2.3 From 4dee06a8f3ba216e23daaace5ed711058dbffcc1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 2 May 2014 10:53:30 +0200 Subject: rwsem-rt: Do not allow readers to nest The readers of mainline rwsems are not allowed to nest, the rwsems in the PREEMPT_RT kernel should not nest either. Signed-off-by: Steven Rostedt Signed-off-by: Sebastian Andrzej Siewior --- include/linux/rwsem_rt.h | 1 - kernel/locking/rt.c | 29 ++++++----------------------- 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h index 924c2d274ab5..0065b08fbb7a 100644 --- a/include/linux/rwsem_rt.h +++ b/include/linux/rwsem_rt.h @@ -20,7 +20,6 @@ struct rw_semaphore { struct rt_mutex lock; - int read_depth; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c index 2852623d46a7..307983c639c3 100644 --- a/kernel/locking/rt.c +++ b/kernel/locking/rt.c @@ -316,8 +316,7 @@ EXPORT_SYMBOL(rt_up_write); void rt_up_read(struct rw_semaphore *rwsem) { rwsem_release(&rwsem->dep_map, 1, _RET_IP_); - if (--rwsem->read_depth == 0) - rt_mutex_unlock(&rwsem->lock); + rt_mutex_unlock(&rwsem->lock); } EXPORT_SYMBOL(rt_up_read); @@ -328,7 +327,6 @@ EXPORT_SYMBOL(rt_up_read); void rt_downgrade_write(struct rw_semaphore *rwsem) { BUG_ON(rt_mutex_owner(&rwsem->lock) != current); - rwsem->read_depth = 1; } EXPORT_SYMBOL(rt_downgrade_write); @@ -358,23 +356,12 @@ EXPORT_SYMBOL(rt_down_write_nested); int rt_down_read_trylock(struct rw_semaphore *rwsem) { - struct rt_mutex *lock = &rwsem->lock; - int ret = 1; - - /* - * recursive read locks succeed when current owns the rwsem, - * but not when read_depth == 0 which means that the rwsem is - * write locked. - */ - if (rt_mutex_owner(lock) != current) - ret = rt_mutex_trylock(&rwsem->lock); - else if (!rwsem->read_depth) - ret = 0; + int ret; - if (ret) { - rwsem->read_depth++; + ret = rt_mutex_trylock(&rwsem->lock); + if (ret) rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); - } + return ret; } EXPORT_SYMBOL(rt_down_read_trylock); @@ -384,10 +371,7 @@ static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) struct rt_mutex *lock = &rwsem->lock; rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); - - if (rt_mutex_owner(lock) != current) - rt_mutex_lock(&rwsem->lock); - rwsem->read_depth++; + rt_mutex_lock(&rwsem->lock); } void rt_down_read(struct rw_semaphore *rwsem) @@ -412,7 +396,6 @@ void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name, debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem)); lockdep_init_map(&rwsem->dep_map, name, key, 0); #endif - rwsem->read_depth = 0; rwsem->lock.save_state = 0; } EXPORT_SYMBOL(__rt_rwsem_init); -- cgit v1.2.3 From f5b5352f78fe52871027f73dd21206929e999492 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 25 Feb 2015 12:16:43 +0100 Subject: Revert "rwsem-rt: Do not allow readers to nest" This behaviour is required by cpufreq and its logic is "okay": It does a read_lock followed by a try_read_lock. Lockdep warns if one try a read_lock twice in -RT and vanilla so it should be good. 
We still only allow multiple readers as long as they are in the same process. Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- include/linux/rwsem_rt.h | 1 + kernel/locking/rt.c | 29 +++++++++++++++++++++++------ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h index 0065b08fbb7a..924c2d274ab5 100644 --- a/include/linux/rwsem_rt.h +++ b/include/linux/rwsem_rt.h @@ -20,6 +20,7 @@ struct rw_semaphore { struct rt_mutex lock; + int read_depth; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c index 307983c639c3..2852623d46a7 100644 --- a/kernel/locking/rt.c +++ b/kernel/locking/rt.c @@ -316,7 +316,8 @@ EXPORT_SYMBOL(rt_up_write); void rt_up_read(struct rw_semaphore *rwsem) { rwsem_release(&rwsem->dep_map, 1, _RET_IP_); - rt_mutex_unlock(&rwsem->lock); + if (--rwsem->read_depth == 0) + rt_mutex_unlock(&rwsem->lock); } EXPORT_SYMBOL(rt_up_read); @@ -327,6 +328,7 @@ EXPORT_SYMBOL(rt_up_read); void rt_downgrade_write(struct rw_semaphore *rwsem) { BUG_ON(rt_mutex_owner(&rwsem->lock) != current); + rwsem->read_depth = 1; } EXPORT_SYMBOL(rt_downgrade_write); @@ -356,12 +358,23 @@ EXPORT_SYMBOL(rt_down_write_nested); int rt_down_read_trylock(struct rw_semaphore *rwsem) { - int ret; + struct rt_mutex *lock = &rwsem->lock; + int ret = 1; - ret = rt_mutex_trylock(&rwsem->lock); - if (ret) - rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + /* + * recursive read locks succeed when current owns the rwsem, + * but not when read_depth == 0 which means that the rwsem is + * write locked. + */ + if (rt_mutex_owner(lock) != current) + ret = rt_mutex_trylock(&rwsem->lock); + else if (!rwsem->read_depth) + ret = 0; + if (ret) { + rwsem->read_depth++; + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + } return ret; } EXPORT_SYMBOL(rt_down_read_trylock); @@ -371,7 +384,10 @@ static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) struct rt_mutex *lock = &rwsem->lock; rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); - rt_mutex_lock(&rwsem->lock); + + if (rt_mutex_owner(lock) != current) + rt_mutex_lock(&rwsem->lock); + rwsem->read_depth++; } void rt_down_read(struct rw_semaphore *rwsem) @@ -396,6 +412,7 @@ void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name, debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem)); lockdep_init_map(&rwsem->dep_map, name, key, 0); #endif + rwsem->read_depth = 0; rwsem->lock.save_state = 0; } EXPORT_SYMBOL(__rt_rwsem_init); -- cgit v1.2.3 From 7d3a0ce4dc35d88dd4fc7e107126a91cf1b25da5 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Thu, 21 Nov 2013 22:52:30 -0500 Subject: condition migration_disable on lock acquisition No need to unconditionally migrate_disable (what is it protecting?) and re-enable on failure to acquire the lock. This patch moves the migrate_disable to be conditioned on successful lock acquisition only.
Signed-off-by: Nicholas Mc Guire Signed-off-by: Sebastian Andrzej Siewior --- kernel/locking/rt.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c index 2852623d46a7..e000d1dcd1cc 100644 --- a/kernel/locking/rt.c +++ b/kernel/locking/rt.c @@ -182,11 +182,10 @@ int __lockfunc rt_write_trylock(rwlock_t *rwlock) { int ret = rt_mutex_trylock(&rwlock->lock); - migrate_disable(); - if (ret) + if (ret) { rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); - else - migrate_enable(); + migrate_disable(); + } return ret; } -- cgit v1.2.3 From c084379f7376f58d7c61bf6f3e7ca0ac97ecdae8 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Fri, 29 Nov 2013 00:19:41 -0500 Subject: migrate_disable pushd down in atomic_dec_and_spin_lock Signed-off-by: Nicholas Mc Guire Signed-off-by: Sebastian Andrzej Siewior --- kernel/locking/rtmutex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 43f845c132c3..7ba0e6e445ef 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1179,12 +1179,12 @@ int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; - migrate_disable(); rt_spin_lock(lock); - if (atomic_dec_and_test(atomic)) + if (atomic_dec_and_test(atomic)){ + migrate_disable(); return 1; + } rt_spin_unlock(lock); - migrate_enable(); return 0; } EXPORT_SYMBOL(atomic_dec_and_spin_lock); -- cgit v1.2.3 From 93d805624c59c604b8f32cae33b01be3e3c55113 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 2 May 2014 17:32:30 +0200 Subject: Revert "migrate_disable pushd down in atomic_dec_and_spin_lock" This reverts commit ff9c870c3e27d58c9512fad122e91436681fee5a. Cc: stable-rt@vger.kernel.org --- kernel/locking/rtmutex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7ba0e6e445ef..43f845c132c3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1179,12 +1179,12 @@ int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) /* Subtract 1 from counter unless that drops it to 0 (ie. 
it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; + migrate_disable(); rt_spin_lock(lock); - if (atomic_dec_and_test(atomic)){ - migrate_disable(); + if (atomic_dec_and_test(atomic)) return 1; - } rt_spin_unlock(lock); + migrate_enable(); return 0; } EXPORT_SYMBOL(atomic_dec_and_spin_lock); -- cgit v1.2.3 From 43ac917e45d9f8ad3e7cfabdb29c41bc41358f48 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Fri, 29 Nov 2013 00:17:27 -0500 Subject: migrate_disable pushd down in rt_spin_trylock_irqsave Signed-off-by: Nicholas Mc Guire Signed-off-by: Sebastian Andrzej Siewior --- kernel/locking/rtmutex.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 43f845c132c3..d2aaba66b271 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1164,12 +1164,11 @@ int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) int ret; *flags = 0; - migrate_disable(); ret = rt_mutex_trylock(&lock->lock); - if (ret) + if (ret) { + migrate_disable(); spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - else - migrate_enable(); + } return ret; } EXPORT_SYMBOL(rt_spin_trylock_irqsave); -- cgit v1.2.3 From e0ce5c9ef9d9905da8328e959142d668098d2227 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Fri, 29 Nov 2013 00:21:59 -0500 Subject: migrate_disable pushd down in rt_write_trylock_irqsave Signed-off-by: Nicholas Mc Guire Signed-off-by: Sebastian Andrzej Siewior --- kernel/locking/rt.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c index e000d1dcd1cc..de6c8af0dace 100644 --- a/kernel/locking/rt.c +++ b/kernel/locking/rt.c @@ -196,10 +196,9 @@ int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) int ret; *flags = 0; - migrate_disable(); ret = rt_write_trylock(rwlock); - if (!ret) - migrate_enable(); + if (ret) + migrate_disable(); return ret; } EXPORT_SYMBOL(rt_write_trylock_irqsave); -- cgit v1.2.3 From c42c9ed29579b9140da779cae4b3b43b7ffa888f Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Thu, 2 Jan 2014 10:18:42 +0100 Subject: write_lock migrate_disable pushdown to rt_write_lock pushdown of migrate_disable/enable from write_*lock* to the rt_write_*lock* api level general mapping of write_*lock* to mutexes: write_*lock* `-> rt_write_*lock* `-> __spin_lock (the sleeping __spin_lock) `-> rt_mutex write_*lock*s are non-recursive so we have two lock chains to consider - write_trylock*/write_unlock - write_lock*/write_unlock for both paths the migrate_disable/enable must be balanced. write_trylock* mapping: write_trylock_irqsave `-> rt_write_trylock_irqsave write_trylock \ `--------> rt_write_trylock ret = rt_mutex_trylock rt_mutex_fasttrylock rt_mutex_cmpxchg if (ret) migrate_disable write_lock* mapping: write_lock_irqsave `-> rt_write_lock_irqsave write_lock_irq -> write_lock ----. \ write_lock_bh -+ \ `-> rt_write_lock __rt_spin_lock() rt_spin_lock_fastlock() rt_mutex_cmpxchg() migrate_disable() write_unlock* mapping: write_unlock_irqrestore. write_unlock_bh -------+ write_unlock_irq -> write_unlock ----------+ `-> rt_write_unlock() __rt_spin_unlock() rt_spin_lock_fastunlock() rt_mutex_cmpxchg() migrate_enable() So calls to migrate_disable/enable() are better placed at the rt_write_* level of lock/trylock/unlock as all of the write_*lock* API has this as a common path.
This approach to write_*_bh also eliminates the concerns raised with regards to api inbalances (write_lock_bh -> write_unlock+local_bh_enable) Tested-by: Carsten Emde Signed-off-by: Nicholas Mc Guire Signed-off-by: Sebastian Andrzej Siewior --- include/linux/rwlock_rt.h | 6 ------ kernel/locking/rt.c | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h index 853ee367fef4..a276fae2740a 100644 --- a/include/linux/rwlock_rt.h +++ b/include/linux/rwlock_rt.h @@ -40,7 +40,6 @@ extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key #define write_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ - migrate_disable(); \ flags = rt_write_lock_irqsave(lock); \ } while (0) @@ -61,14 +60,12 @@ extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key #define write_lock(lock) \ do { \ - migrate_disable(); \ rt_write_lock(lock); \ } while (0) #define write_lock_bh(lock) \ do { \ local_bh_disable(); \ - migrate_disable(); \ rt_write_lock(lock); \ } while (0) @@ -92,13 +89,11 @@ extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key #define write_unlock(lock) \ do { \ rt_write_unlock(lock); \ - migrate_enable(); \ } while (0) #define write_unlock_bh(lock) \ do { \ rt_write_unlock(lock); \ - migrate_enable(); \ local_bh_enable(); \ } while (0) @@ -117,7 +112,6 @@ extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key typecheck(unsigned long, flags); \ (void) flags; \ rt_write_unlock(lock); \ - migrate_enable(); \ } while (0) #endif diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c index de6c8af0dace..d75edad93af0 100644 --- a/kernel/locking/rt.c +++ b/kernel/locking/rt.c @@ -197,8 +197,6 @@ int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) *flags = 0; ret = rt_write_trylock(rwlock); - if (ret) - migrate_disable(); return ret; } EXPORT_SYMBOL(rt_write_trylock_irqsave); @@ -232,6 +230,7 @@ EXPORT_SYMBOL(rt_read_trylock); void __lockfunc rt_write_lock(rwlock_t *rwlock) { rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + migrate_disable(); __rt_spin_lock(&rwlock->lock); } EXPORT_SYMBOL(rt_write_lock); @@ -257,6 +256,7 @@ void __lockfunc rt_write_unlock(rwlock_t *rwlock) /* NOTE: we always pass in '1' for nested, for simplicity */ rwlock_release(&rwlock->dep_map, 1, _RET_IP_); __rt_spin_unlock(&rwlock->lock); + migrate_enable(); } EXPORT_SYMBOL(rt_write_unlock); -- cgit v1.2.3 From de177d38a97929faec7f25a4e8957b42e9f031e4 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Thu, 2 Jan 2014 10:19:15 +0100 Subject: read_lock migrate_disable pushdown to rt_read_lock pushdown of migrate_disable/enable from read_*lock* to the rt_read_*lock* api level general mapping to mutexes: read_*lock* `-> rt_read_*lock* `-> __spin_lock (the sleeping spin locks) `-> rt_mutex The real read_lock* mapping: read_lock_irqsave -. read_lock_irq `-> rt_read_lock_irqsave() `->read_lock ---------. 
\ read_lock_bh ------+ \ `--> rt_read_lock() if (rt_mutex_owner(lock) != current){ `-> __rt_spin_lock() rt_spin_lock_fastlock() `->rt_mutex_cmpxchg() migrate_disable() } rwlock->read_depth++; read_trylock mapping: read_trylock `-> rt_read_trylock if (rt_mutex_owner(lock) != current){ `-> rt_mutex_trylock() rt_mutex_fasttrylock() rt_mutex_cmpxchg() migrate_disable() } rwlock->read_depth++; read_unlock* mapping: read_unlock_bh --------+ read_unlock_irq -------+ read_unlock_irqrestore + read_unlock -----------+ `-> rt_read_unlock() if(--rwlock->read_depth==0){ `-> __rt_spin_unlock() rt_spin_lock_fastunlock() `-> rt_mutex_cmpxchg() migrate_disable() } So calls to migrate_disable/enable() are better placed at the rt_read_* level of lock/trylock/unlock as all of the read_*lock* API has this as a common path. In the rt_read* API of lock/trylock/unlock the nesting level is already being recorded in rwlock->read_depth, so we can push down the migrate disable/enable to that level and condition it on the read_depth going from 0 to 1 -> migrate_disable and 1 to 0 -> migrate_enable. This eliminates the recursive calls that were needed when migrate_disable/enable was done at the read_*lock* level. The approach to read_*_bh also eliminates the concerns raised with the regards to api inbalances (read_lock_bh -> read_unlock+local_bh_enable) Tested-by: Carsten Emde Signed-off-by: Nicholas Mc Guire Signed-off-by: Sebastian Andrzej Siewior --- include/linux/rwlock_rt.h | 6 ------ kernel/locking/rt.c | 20 +++++++++++++------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h index a276fae2740a..e85a5dfb1468 100644 --- a/include/linux/rwlock_rt.h +++ b/include/linux/rwlock_rt.h @@ -33,7 +33,6 @@ extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key #define read_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ - migrate_disable(); \ flags = rt_read_lock_irqsave(lock); \ } while (0) @@ -45,14 +44,12 @@ extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key #define read_lock(lock) \ do { \ - migrate_disable(); \ rt_read_lock(lock); \ } while (0) #define read_lock_bh(lock) \ do { \ local_bh_disable(); \ - migrate_disable(); \ rt_read_lock(lock); \ } while (0) @@ -74,13 +71,11 @@ extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key #define read_unlock(lock) \ do { \ rt_read_unlock(lock); \ - migrate_enable(); \ } while (0) #define read_unlock_bh(lock) \ do { \ rt_read_unlock(lock); \ - migrate_enable(); \ local_bh_enable(); \ } while (0) @@ -104,7 +99,6 @@ extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key typecheck(unsigned long, flags); \ (void) flags; \ rt_read_unlock(lock); \ - migrate_enable(); \ } while (0) #define write_unlock_irqrestore(lock, flags) \ diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c index d75edad93af0..906322a61ffd 100644 --- a/kernel/locking/rt.c +++ b/kernel/locking/rt.c @@ -211,17 +211,19 @@ int __lockfunc rt_read_trylock(rwlock_t *rwlock) * but not when read_depth == 0 which means that the lock is * write locked. 
*/ - migrate_disable(); - if (rt_mutex_owner(lock) != current) + if (rt_mutex_owner(lock) != current) { ret = rt_mutex_trylock(lock); - else if (!rwlock->read_depth) + if (ret) + migrate_disable(); + + } else if (!rwlock->read_depth) { ret = 0; + } if (ret) { rwlock->read_depth++; rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); - } else - migrate_enable(); + } return ret; } @@ -244,8 +246,10 @@ void __lockfunc rt_read_lock(rwlock_t *rwlock) /* * recursive read locks succeed when current owns the lock */ - if (rt_mutex_owner(lock) != current) + if (rt_mutex_owner(lock) != current) { __rt_spin_lock(lock); + migrate_disable(); + } rwlock->read_depth++; } @@ -265,8 +269,10 @@ void __lockfunc rt_read_unlock(rwlock_t *rwlock) rwlock_release(&rwlock->dep_map, 1, _RET_IP_); /* Release the lock only when read_depth is down to 0 */ - if (--rwlock->read_depth == 0) + if (--rwlock->read_depth == 0) { __rt_spin_unlock(&rwlock->lock); + migrate_enable(); + } } EXPORT_SYMBOL(rt_read_unlock); -- cgit v1.2.3 From 0530d7aee5444f9c2411e5edcbff38f4333b775f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 29 Apr 2014 20:13:08 -0400 Subject: rwlock: disable migration before taking a lock If there are no complaints about it, I'm going to add this to the 3.12-rt stable tree. Without it, the cpu hotplug stress test fails horribly, and I won't release a stable kernel that does that. Signed-off-by: Steven Rostedt --- kernel/locking/rt.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c index 906322a61ffd..2e7efaf1d8b2 100644 --- a/kernel/locking/rt.c +++ b/kernel/locking/rt.c @@ -180,12 +180,14 @@ EXPORT_SYMBOL(_mutex_unlock); */ int __lockfunc rt_write_trylock(rwlock_t *rwlock) { - int ret = rt_mutex_trylock(&rwlock->lock); + int ret; - if (ret) { + migrate_disable(); + ret = rt_mutex_trylock(&rwlock->lock); + if (ret) rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); - migrate_disable(); - } + else + migrate_enable(); return ret; } @@ -212,9 +214,10 @@ int __lockfunc rt_read_trylock(rwlock_t *rwlock) * write locked. */ if (rt_mutex_owner(lock) != current) { + migrate_disable(); ret = rt_mutex_trylock(lock); - if (ret) - migrate_disable(); + if (!ret) + migrate_enable(); } else if (!rwlock->read_depth) { ret = 0; @@ -247,8 +250,8 @@ void __lockfunc rt_read_lock(rwlock_t *rwlock) * recursive read locks succeed when current owns the lock */ if (rt_mutex_owner(lock) != current) { - __rt_spin_lock(lock); migrate_disable(); + __rt_spin_lock(lock); } rwlock->read_depth++; } -- cgit v1.2.3 From 5e5528a05fe3642f7f8f56dff5c45bc3451ff820 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Sat, 8 Feb 2014 12:39:20 +0100 Subject: rt: Cleanup of unnecessary do while 0 in read/write _lock() With the migration pushdown a few of the do{ }while(0) loops became obsolete but got left over - this patch only removes this fallout.
Patch applies on top of 3.12.9-rt13 Signed-off-by: Nicholas Mc Guire Signed-off-by: Sebastian Andrzej Siewior --- include/linux/rwlock_rt.h | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h index e85a5dfb1468..49ed2d45d3be 100644 --- a/include/linux/rwlock_rt.h +++ b/include/linux/rwlock_rt.h @@ -42,10 +42,7 @@ extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key flags = rt_write_lock_irqsave(lock); \ } while (0) -#define read_lock(lock) \ - do { \ - rt_read_lock(lock); \ - } while (0) +#define read_lock(lock) rt_read_lock(lock) #define read_lock_bh(lock) \ do { \ @@ -55,10 +52,7 @@ extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key #define read_lock_irq(lock) read_lock(lock) -#define write_lock(lock) \ - do { \ - rt_write_lock(lock); \ - } while (0) +#define write_lock(lock) rt_write_lock(lock) #define write_lock_bh(lock) \ do { \ @@ -68,10 +62,7 @@ extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key #define write_lock_irq(lock) write_lock(lock) -#define read_unlock(lock) \ - do { \ - rt_read_unlock(lock); \ - } while (0) +#define read_unlock(lock) rt_read_unlock(lock) #define read_unlock_bh(lock) \ do { \ @@ -81,10 +72,7 @@ extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key #define read_unlock_irq(lock) read_unlock(lock) -#define write_unlock(lock) \ - do { \ - rt_write_unlock(lock); \ - } while (0) +#define write_unlock(lock) rt_write_unlock(lock) #define write_unlock_bh(lock) \ do { \ -- cgit v1.2.3 From 61c9d79aff424dfb46a9b96925284e2b6da881bc Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 8 Apr 2013 16:08:46 +0200 Subject: percpu-rwsem: compile fix The shortcut on mainline skips lockdep. No idea why this is a good thing. Signed-off-by: Sebastian Andrzej Siewior --- kernel/locking/percpu-rwsem.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 652a8ee8efe9..2db0f42d5c64 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -84,8 +84,12 @@ void percpu_down_read(struct percpu_rw_semaphore *brw) down_read(&brw->rw_sem); atomic_inc(&brw->slow_read_ctr); +#ifdef CONFIG_PREEMPT_RT_FULL + up_read(&brw->rw_sem); +#else /* avoid up_read()->rwsem_release() */ __up_read(&brw->rw_sem); +#endif } void percpu_up_read(struct percpu_rw_semaphore *brw) -- cgit v1.2.3 From a82f6e3dacf7d3133e193e09caff2ce6a6e4029c Mon Sep 17 00:00:00 2001 From: Brad Mouring Date: Wed, 14 Jan 2015 15:11:38 -0600 Subject: rtmutex.c: Fix incorrect waiter check In task_blocks_on_lock, there's a null check on pi_blocked_on of the task_struct. This pointer can encode the fact that the task that contains the pointer is waking (preventing requeuing) and therefore is non-null.
Use the inline function to avoid dereferencing an invalid "pointer" Signed-off-by: Brad Mouring Reported-by: Ben Shelton Reviewed-by: T Makphaibulchoke Tested-by: T Makphaibulchoke Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- kernel/locking/rtmutex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index d2aaba66b271..e88de061853a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -359,7 +359,8 @@ int max_lock_depth = 1024; static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) { - return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; + return rt_mutex_real_waiter(p->pi_blocked_on) ? + p->pi_blocked_on->lock : NULL; } /* -- cgit v1.2.3 From 9c99968e776f0a9998424039da4b8e069c5e5b64 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 28 Oct 2013 09:36:37 +0100 Subject: rtmutex: add a first shot of ww_mutex lockdep says: | -------------------------------------------------------------------------- | | Wound/wait tests | | --------------------- | ww api failures: ok | ok | ok | | ww contexts mixing: ok | ok | | finishing ww context: ok | ok | ok | ok | | locking mismatches: ok | ok | ok | | EDEADLK handling: ok | ok | ok | ok | ok | ok | ok | ok | ok | ok | | spinlock nest unlocked: ok | | ----------------------------------------------------- | |block | try |context| | ----------------------------------------------------- | context: ok | ok | ok | | try: ok | ok | ok | | block: ok | ok | ok | | spinlock: ok | ok | ok | Signed-off-by: Sebastian Andrzej Siewior --- kernel/locking/rtmutex.c | 239 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 227 insertions(+), 12 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index e88de061853a..fd28865b5077 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "rtmutex_common.h" @@ -1204,6 +1205,40 @@ EXPORT_SYMBOL(__rt_spin_lock_init); #endif /* PREEMPT_RT_FULL */ +#ifdef CONFIG_PREEMPT_RT_FULL + static inline int __sched +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) +{ + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock); + struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); + + if (!hold_ctx) + return 0; + + if (unlikely(ctx == hold_ctx)) + return -EALREADY; + + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(ctx->contending_lock); + ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} +#else + static inline int __sched +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) +{ + BUG(); + return 0; +} + +#endif + static inline int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) @@ -1463,7 +1498,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + struct ww_acquire_ctx *ww_ctx) { int ret = 0; @@ -1486,6 +1522,12 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } + if (ww_ctx && ww_ctx->acquired > 0) { + ret = __mutex_lock_check_stamp(lock, ww_ctx); + if (ret) + break; + } + raw_spin_unlock(&lock->wait_lock); 
debug_rt_mutex_print_deadlock(waiter); @@ -1519,13 +1561,90 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, } } +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, + struct ww_acquire_ctx *ww_ctx) +{ +#ifdef CONFIG_DEBUG_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. + * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } + + /* + * Naughty, using a different class will lead to undefined behavior! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; +} + +#ifdef CONFIG_PREEMPT_RT_FULL +static void ww_mutex_account_lock(struct rt_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock); + struct rt_mutex_waiter *waiter, *n; + + /* + * This branch gets optimized out for the common case, + * and is only important for ww_mutex_lock. + */ + ww_mutex_lock_acquired(ww, ww_ctx); + ww->ctx = ww_ctx; + + /* + * Give any possible sleeping processes the chance to wake up, + * so they can recheck if they have to back off. + */ + rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters, + tree_entry) { + /* XXX debug rt mutex waiter wakeup */ + + BUG_ON(waiter->lock != lock); + rt_mutex_wake_waiter(waiter); + } +} + +#else + +static void ww_mutex_account_lock(struct rt_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ + BUG(); +} +#endif + /* * Slow path lock function: */ static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk) + enum rtmutex_chainwalk chwalk, + struct ww_acquire_ctx *ww_ctx) { struct rt_mutex_waiter waiter; int ret = 0; @@ -1536,6 +1655,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { + if (ww_ctx) + ww_mutex_account_lock(lock, ww_ctx); raw_spin_unlock(&lock->wait_lock); return 0; } @@ -1552,7 +1673,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); if (likely(!ret)) - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, + ww_ctx); set_current_state(TASK_RUNNING); @@ -1560,6 +1682,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (rt_mutex_has_waiters(lock)) remove_waiter(lock, &waiter); rt_mutex_handle_deadlock(ret, chwalk, &waiter); + } else if (ww_ctx) { + ww_mutex_account_lock(lock, ww_ctx); } /* @@ -1684,31 +1808,36 @@ rt_mutex_slowunlock(struct rt_mutex *lock) */ static inline int rt_mutex_fastlock(struct rt_mutex *lock, int state, + struct ww_acquire_ctx *ww_ctx, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk)) + enum rtmutex_chainwalk chwalk, + struct ww_acquire_ctx *ww_ctx)) { if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { 
rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, + ww_ctx); } static inline int rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, enum rtmutex_chainwalk chwalk, + struct ww_acquire_ctx *ww_ctx, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk)) + enum rtmutex_chainwalk chwalk, + struct ww_acquire_ctx *ww_ctx)) { if (chwalk == RT_MUTEX_MIN_CHAINWALK && likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, timeout, chwalk); + return slowfn(lock, state, timeout, chwalk, ww_ctx); } static inline int @@ -1741,7 +1870,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock) { might_sleep(); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock); @@ -1758,7 +1887,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { might_sleep(); - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); @@ -1771,7 +1900,7 @@ int rt_mutex_timed_futex_lock(struct rt_mutex *lock, might_sleep(); return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - RT_MUTEX_FULL_CHAINWALK, + RT_MUTEX_FULL_CHAINWALK, NULL, rt_mutex_slowlock); } @@ -1790,7 +1919,7 @@ int __sched rt_mutex_lock_killable(struct rt_mutex *lock) { might_sleep(); - return rt_mutex_fastlock(lock, TASK_KILLABLE, rt_mutex_slowlock); + return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); @@ -1814,6 +1943,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, RT_MUTEX_MIN_CHAINWALK, + NULL, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); @@ -2038,7 +2168,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL); set_current_state(TASK_RUNNING); @@ -2055,3 +2185,88 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, return ret; } + +static inline int +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH + unsigned tmp; + + if (ctx->deadlock_inject_countdown-- == 0) { + tmp = ctx->deadlock_inject_interval; + if (tmp > UINT_MAX/4) + tmp = UINT_MAX; + else + tmp = tmp*2 + tmp + tmp/2; + + ctx->deadlock_inject_interval = tmp; + ctx->deadlock_inject_countdown = tmp; + ctx->contending_lock = lock; + + ww_mutex_unlock(lock); + + return -EDEADLK; + } +#endif + + return 0; +} + +#ifdef CONFIG_PREEMPT_RT_FULL +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + int ret; + + might_sleep(); + + mutex_acquire(&lock->base.dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx); + if (ret) + mutex_release(&lock->base.dep_map, 1, _RET_IP_); + else if (!ret && ww_ctx->acquired > 1) + return ww_mutex_deadlock_injection(lock, ww_ctx); + + return ret; +} 
+EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); + +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + int ret; + + might_sleep(); + + mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, + _RET_IP_); + ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx); + if (ret) + mutex_release(&lock->base.dep_map, 1, _RET_IP_); + else if (!ret && ww_ctx->acquired > 1) + return ww_mutex_deadlock_injection(lock, ww_ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock); + +void __sched ww_mutex_unlock(struct ww_mutex *lock) +{ + /* + * The unlocking fastpath is the 0->1 transition from 'locked' + * into 'unlocked' state: + */ + if (lock->ctx) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); +#endif + if (lock->ctx->acquired > 0) + lock->ctx->acquired--; + lock->ctx = NULL; + } + + mutex_release(&lock->base.dep_map, 1, _RET_IP_); + rt_mutex_unlock(&lock->base.lock); +} +EXPORT_SYMBOL(ww_mutex_unlock); +#endif -- cgit v1.2.3 From 48f29af5330ea6cfaca0c77ea4ad3a95214784c0 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 2 Jun 2014 15:12:44 +0200 Subject: rt,locking: fix __ww_mutex_lock_interruptible() lockdep annotation Using mutex_acquire_nest() as used in __ww_mutex_lock() fixes the splat below. Remove superfluous line break in __ww_mutex_lock() as well. |============================================= |[ INFO: possible recursive locking detected ] |3.14.4-rt5 #26 Not tainted |--------------------------------------------- |Xorg/4298 is trying to acquire lock: | (reservation_ww_class_mutex){+.+.+.}, at: [] nouveau_gem_ioctl_pushbuf+0x870/0x19f0 [nouveau] |but task is already holding lock: | (reservation_ww_class_mutex){+.+.+.}, at: [] nouveau_gem_ioctl_pushbuf+0x870/0x19f0 [nouveau] |other info that might help us debug this: | Possible unsafe locking scenario: | CPU0 | ---- | lock(reservation_ww_class_mutex); | lock(reservation_ww_class_mutex); | | *** DEADLOCK *** | | May be due to missing lock nesting notation | |3 locks held by Xorg/4298: | #0: (&cli->mutex){+.+.+.}, at: [] nouveau_abi16_get+0x2b/0x100 [nouveau] | #1: (reservation_ww_class_acquire){+.+...}, at: [] drm_ioctl+0x4d2/0x610 [drm] | #2: (reservation_ww_class_mutex){+.+.+.}, at: [] nouveau_gem_ioctl_pushbuf+0x870/0x19f0 [nouveau] Cc: stable-rt@vger.kernel.org Signed-off-by: Mike Galbraith --- kernel/locking/rtmutex.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index fd28865b5077..f8001d5a4da7 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -2220,7 +2220,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_c might_sleep(); - mutex_acquire(&lock->base.dep_map, 0, 0, _RET_IP_); + mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_); ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx); if (ret) mutex_release(&lock->base.dep_map, 1, _RET_IP_); @@ -2238,8 +2238,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) might_sleep(); - mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, - _RET_IP_); + mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_); ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx); if (ret) mutex_release(&lock->base.dep_map, 1, _RET_IP_); -- cgit v1.2.3 From db468683bba7c98d2739e947d12b3ee03832aead Mon Sep 17 00:00:00 2001 From: Gustavo Bittencourt Date: 
Tue, 20 Jan 2015 18:02:29 -0200 Subject: rtmutex: enable deadlock detection in ww_mutex_lock functions The functions ww_mutex_lock_interruptible and ww_mutex_lock should return -EDEADLK when faced with a deadlock. To do so, the parameter detect_deadlock in rt_mutex_slowlock must be TRUE. This patch corrects potential deadlocks when running PREEMPT_RT with the nouveau driver. Cc: stable-rt@vger.kernel.org Signed-off-by: Gustavo Bittencourt Signed-off-by: Sebastian Andrzej Siewior --- kernel/locking/rtmutex.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index f8001d5a4da7..090cc725f38a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -2221,7 +2221,8 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_c might_sleep(); mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_); - ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx); + ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, + RT_MUTEX_FULL_CHAINWALK, ww_ctx); if (ret) mutex_release(&lock->base.dep_map, 1, _RET_IP_); else if (!ret && ww_ctx->acquired > 1) @@ -2239,7 +2240,8 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) might_sleep(); mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_); - ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx); + ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, + RT_MUTEX_FULL_CHAINWALK, ww_ctx); if (ret) mutex_release(&lock->base.dep_map, 1, _RET_IP_); else if (!ret && ww_ctx->acquired > 1) -- cgit v1.2.3 From f1280df10464a9e91afdd32853ba596851894459 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 26 Feb 2015 09:02:05 +0100 Subject: locking: ww_mutex: fix ww_mutex vs self-deadlock If the caller already holds the mutex, task_blocks_on_rt_mutex() returns -EDEADLK, we proceed directly to rt_mutex_handle_deadlock() where it's instant game over. Let ww_mutexes return EDEADLK/EALREADY as they want to instead.
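For context, the reason ww_mutex users need to actually see -EDEADLK is the standard wound/wait backoff loop they build on top of the API. The sketch below is only an illustration and not part of this patch; lock_pair() and demo_ww_class are made up, while the ww_acquire_*/ww_mutex_* calls are the regular in-tree interface:

#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(demo_ww_class);

/*
 * Take two ww_mutexes of demo_ww_class in either order. The caller is
 * expected to have done ww_acquire_init(ctx, &demo_ww_class) before and
 * to do the unlocks plus ww_acquire_fini(ctx) afterwards.
 */
static void lock_pair(struct ww_mutex *m1, struct ww_mutex *m2,
		      struct ww_acquire_ctx *ctx)
{
	struct ww_mutex *held = NULL;	/* lock taken via ww_mutex_lock_slow() */
	int err;

	for (;;) {
		if (held != m1) {
			err = ww_mutex_lock(m1, ctx);
			if (err == -EDEADLK) {
				/* wounded: drop everything, then sleep on m1 */
				if (held)
					ww_mutex_unlock(held);
				ww_mutex_lock_slow(m1, ctx);
				held = m1;
				continue;
			}
		}
		if (held != m2) {
			err = ww_mutex_lock(m2, ctx);
			if (err == -EDEADLK) {
				ww_mutex_unlock(m1);
				ww_mutex_lock_slow(m2, ctx);
				held = m2;
				continue;
			}
		}
		break;
	}
	ww_acquire_done(ctx);
}

If rt_mutex_handle_deadlock() swallows the -EDEADLK first, a loop like this never gets the chance to back off, which is what the change below avoids.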
Signed-off-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior --- kernel/locking/rtmutex.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 090cc725f38a..a5b076bbb584 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1673,15 +1673,21 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); if (likely(!ret)) - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, - ww_ctx); + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, ww_ctx); + else if (ww_ctx) { + /* ww_mutex received EDEADLK, let it become EALREADY */ + ret = __mutex_lock_check_stamp(lock, ww_ctx); + BUG_ON(!ret); + } set_current_state(TASK_RUNNING); if (unlikely(ret)) { if (rt_mutex_has_waiters(lock)) remove_waiter(lock, &waiter); - rt_mutex_handle_deadlock(ret, chwalk, &waiter); + /* ww_mutex want to report EDEADLK/EALREADY, let them */ + if (!ww_ctx) + rt_mutex_handle_deadlock(ret, chwalk, &waiter); } else if (ww_ctx) { ww_mutex_account_lock(lock, ww_ctx); } @@ -2221,8 +2227,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_c might_sleep(); mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_); - ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, - RT_MUTEX_FULL_CHAINWALK, ww_ctx); + ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx); if (ret) mutex_release(&lock->base.dep_map, 1, _RET_IP_); else if (!ret && ww_ctx->acquired > 1) @@ -2240,8 +2245,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) might_sleep(); mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_); - ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, - RT_MUTEX_FULL_CHAINWALK, ww_ctx); + ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx); if (ret) mutex_release(&lock->base.dep_map, 1, _RET_IP_); else if (!ret && ww_ctx->acquired > 1) @@ -2253,11 +2257,13 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock); void __sched ww_mutex_unlock(struct ww_mutex *lock) { + int nest = !!lock->ctx; + /* * The unlocking fastpath is the 0->1 transition from 'locked' * into 'unlocked' state: */ - if (lock->ctx) { + if (nest) { #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); #endif @@ -2266,7 +2272,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) lock->ctx = NULL; } - mutex_release(&lock->base.dep_map, 1, _RET_IP_); + mutex_release(&lock->base.dep_map, nest, _RET_IP_); rt_mutex_unlock(&lock->base.lock); } EXPORT_SYMBOL(ww_mutex_unlock); -- cgit v1.2.3 From 5bd4d8bfd00f253a9d398cd5628867e46d68da9f Mon Sep 17 00:00:00 2001 From: John Kacur Date: Mon, 19 Sep 2011 11:09:27 +0200 Subject: rwlocks: Fix section mismatch This fixes the following build error for the preempt-rt kernel. make kernel/fork.o CC kernel/fork.o kernel/fork.c:90: error: section of tasklist_lock conflicts with previous declaration make[2]: *** [kernel/fork.o] Error 1 make[1]: *** [kernel/fork.o] Error 2 The rt kernel cache aligns the RWLOCK in DEFINE_RWLOCK by default. The non-rt kernels explicitly cache align only the tasklist_lock in kernel/fork.c That can create a build conflict. This fixes the build problem by making the non-rt kernels cache align RWLOCKs by default. The side effect is that the other RWLOCKs are also cache aligned for non-rt. This is a short term solution for rt only. 
The longer term solution would be to push the cache aligned DEFINE_RWLOCK to mainline. If there are objections, then we could create a DEFINE_RWLOCK_CACHE_ALIGNED or something of that nature. Comments? Objections? Signed-off-by: John Kacur Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.LFD.2.00.1109191104010.23118@localhost6.localdomain6 Signed-off-by: Thomas Gleixner --- include/linux/rwlock_types.h | 3 ++- kernel/fork.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index 5317cd957292..d0da966ad7a0 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -47,6 +47,7 @@ typedef struct { RW_DEP_MAP_INIT(lockname) } #endif -#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) +#define DEFINE_RWLOCK(name) \ + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) #endif /* __LINUX_RWLOCK_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 023f585bf86b..669e2ca7bf71 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,7 +97,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; -__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +DEFINE_RWLOCK(tasklist_lock); /* outer */ #ifdef CONFIG_PROVE_RCU int lockdep_tasklist_lock_is_held(void) -- cgit v1.2.3 From c34083eb07d4766dbf26873b4ef64ddfa07cf77d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 22:08:38 +0200 Subject: timer-handle-idle-trylock-in-get-next-timer-irq.patch Signed-off-by: Thomas Gleixner --- include/linux/spinlock_rt.h | 12 +++++++++++- kernel/locking/rtmutex.c | 7 +------ kernel/time/timer.c | 9 +++++++-- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index 621d857ec546..447c457950a4 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -50,7 +50,17 @@ extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); #define spin_lock_irq(lock) spin_lock(lock) -#define spin_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) +#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) + +#define spin_trylock(lock) \ +({ \ + int __locked; \ + migrate_disable(); \ + __locked = spin_do_trylock(lock); \ + if (!__locked) \ + migrate_enable(); \ + __locked; \ +}) #ifdef CONFIG_LOCKDEP # define spin_lock_nested(lock, subclass) \ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a5b076bbb584..a15062c9d7cb 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1133,15 +1133,10 @@ EXPORT_SYMBOL(rt_spin_unlock_wait); int __lockfunc rt_spin_trylock(spinlock_t *lock) { - int ret; + int ret = rt_mutex_trylock(&lock->lock); - migrate_disable(); - ret = rt_mutex_trylock(&lock->lock); if (ret) spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - else - migrate_enable(); - return ret; } EXPORT_SYMBOL(rt_spin_trylock); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 6e4b959e4e79..91c9149264a6 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1411,9 +1411,10 @@ unsigned long get_next_timer_interrupt(unsigned long now) /* * On PREEMPT_RT we cannot sleep here. If the trylock does not * succeed then we return the worst-case 'expires in 1 tick' - * value: + * value. We use the rt functions here directly to avoid a + * migrate_disable() call. 
*/ - if (!spin_trylock(&base->lock)) + if (!spin_do_trylock(&base->lock)) return now + 1; #else spin_lock(&base->lock); @@ -1423,7 +1424,11 @@ unsigned long get_next_timer_interrupt(unsigned long now) base->next_timer = __next_timer_interrupt(base); expires = base->next_timer; } +#ifdef CONFIG_PREEMPT_RT_FULL + rt_spin_unlock(&base->lock); +#else spin_unlock(&base->lock); +#endif if (time_before_eq(expires, now)) return now; -- cgit v1.2.3 From 98251c3ea1b9e23f0df41c428fafda6ec5143a1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 7 Nov 2013 12:21:11 +0100 Subject: timers: do not raise softirq unconditionally Mike, On Thu, 7 Nov 2013, Mike Galbraith wrote: > On Thu, 2013-11-07 at 04:26 +0100, Mike Galbraith wrote: > > On Wed, 2013-11-06 at 18:49 +0100, Thomas Gleixner wrote: > > > > I bet you are trying to work around some of the side effects of the > > > occasional tick which is still necessary despite of full nohz, right? > > > > Nope, I wanted to check out cost of nohz_full for rt, and found that it > > doesn't work at all instead, looked, and found that the sole running > > task has just awakened ksoftirqd when it wants to shut the tick down, so > > that shutdown never happens. > > Like so in virgin 3.10-rt. Box is x3550 M3 booted nowatchdog > rcu_nocbs=1-3 nohz_full=1-3, and CPUs1-3 are completely isolated via > cpusets as well. well, that very same problem is in mainline if you add "threadirqs" to the command line. But we can be smart about this. The untested patch below should address that issue. If that works on mainline we can adapt it for RT (needs a trylock(&base->lock) there). Though it's not a full solution. It needs some thought versus the softirq code of timers. Assume we have only one timer queued 1000 ticks into the future. So this change will cause the timer softirq not to be called until that timer expires and then the timer softirq is going to do 1000 loops until it catches up with jiffies. That's anything but pretty ... What worries me more is this one: pert-5229 [003] d..h1.. 684.482618: softirq_raise: vec=9 [action=RCU] The CPU has no callbacks as you shoved them over to cpu 0, so why is the RCU softirq raised? 
Thanks, tglx ------------------ Message-id: |CONFIG_NO_HZ_FULL + CONFIG_PREEMPT_RT_FULL = nogo Signed-off-by: Sebastian Andrzej Siewior --- include/linux/hrtimer.h | 3 +-- kernel/time/hrtimer.c | 31 +++++++------------------------ kernel/time/timer.c | 28 +++++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 32a0af13f438..50fc06dfe54a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -455,9 +455,8 @@ extern int schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, const enum hrtimer_mode mode, int clock); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); -/* Soft interrupt function to run the hrtimer queues: */ +/* Called from the periodic timer tick */ extern void hrtimer_run_queues(void); -extern void hrtimer_run_pending(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 20a6809b4b02..3d9b13d709f6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1659,30 +1659,6 @@ static void run_hrtimer_softirq(struct softirq_action *h) hrtimer_rt_run_pending(); } -/* - * Called from timer softirq every jiffy, expire hrtimers: - * - * For HRT its the fall back code to run the softirq in the timer - * softirq context in case the hrtimer initialization failed or has - * not been done yet. - */ -void hrtimer_run_pending(void) -{ - if (hrtimer_hres_active()) - return; - - /* - * This _is_ ugly: We have to check in the softirq context, - * whether we can switch to highres and / or nohz mode. The - * clocksource switch happens in the timer interrupt with - * xtime_lock held. Notification from there only sets the - * check bit in the tick_oneshot code, otherwise we might - * deadlock vs. xtime_lock. - */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - hrtimer_switch_to_hres(); -} - /* * Called from hardirq context every jiffy */ @@ -1696,6 +1672,13 @@ void hrtimer_run_queues(void) if (hrtimer_hres_active()) return; + /* + * Check whether we can switch to highres mode. + */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()) + && hrtimer_switch_to_hres()) + return; + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { base = &cpu_base->clock_base[index]; if (!timerqueue_getnext(&base->active)) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 91c9149264a6..e27db8e40f75 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1464,8 +1464,6 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __this_cpu_read(tvec_bases); - hrtimer_run_pending(); - if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } @@ -1475,8 +1473,32 @@ static void run_timer_softirq(struct softirq_action *h) */ void run_local_timers(void) { + struct tvec_base *base = __this_cpu_read(tvec_bases); + hrtimer_run_queues(); - raise_softirq(TIMER_SOFTIRQ); + /* + * We can access this lockless as we are in the timer + * interrupt. If there are no timers queued, nothing to do in + * the timer softirq. 
+ */ +#ifdef CONFIG_PREEMPT_RT_FULL + if (!spin_do_trylock(&base->lock)) { + raise_softirq(TIMER_SOFTIRQ); + return; + } +#endif + if (!base->active_timers) + goto out; + + /* Check whether the next pending timer has expired */ + if (time_before_eq(base->next_timer, jiffies)) + raise_softirq(TIMER_SOFTIRQ); +out: +#ifdef CONFIG_PREEMPT_RT_FULL + rt_spin_unlock_after_trylock_in_irq(&base->lock); +#endif + /* The ; ensures that gcc won't complain in the !RT case */ + ; } #ifdef __ARCH_WANT_SYS_ALARM -- cgit v1.2.3 From e7b3243462308ef2e39a6e4c66cd5e7acb81762a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Jan 2014 15:09:33 -0500 Subject: timer: Raise softirq if there's irq_work [ Talking with Sebastian on IRC, it seems that doing the irq_work_run() from the interrupt in -rt is a bad thing. Here we simply raise the softirq if there's irq work to do. This too boots on my i7 ] After trying hard to figure out why my i7 box was locking up with the new active_timers code, that does not run the timer softirq if there are no active timers, I took an extra look at the softirq handler and noticed that it doesn't just run timer softirqs, it also runs irq work. This was the bug that was locking up the system. It wasn't missing a timer, it was missing irq work. By always doing the irq work callbacks, the system boots fine. The missing irq work callback was the RCU's sp_wakeup() function. No need to check for defined(CONFIG_IRQ_WORK). When that's not set the "irq_work_needs_cpu()" is a static inline that returns false. Signed-off-by: Steven Rostedt Signed-off-by: Sebastian Andrzej Siewior --- kernel/time/timer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e27db8e40f75..f322befedd91 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1487,8 +1487,13 @@ void run_local_timers(void) return; } #endif - if (!base->active_timers) - goto out; + if (!base->active_timers) { +#ifdef CONFIG_PREEMPT_RT_FULL + /* On RT, irq work runs from softirq */ + if (!irq_work_needs_cpu()) +#endif + goto out; + } /* Check whether the next pending timer has expired */ if (time_before_eq(base->next_timer, jiffies)) -- cgit v1.2.3 From b8f7e2b33b9c9bc663d0eecbb7d06beffa51aaa2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Jan 2014 12:07:57 -0500 Subject: timer/rt: Always raise the softirq if there's irq_work to be done It was previously discovered that some systems would hang on boot up with a previous version of 3.12-rt. This was due to RCU using irq_work, and RT defers the irq_work to a softirq. But if there's no active timers, the softirq will not be raised, and RCU work will not get done, causing the system to hang. The fix was to check that if there was no active timers but irq_work to be done, then we should raise the softirq. But this fix was not 100% correct. It left out the case that there were active timers that were not expired yet. This would have the softirq not get raised even if there was irq work to be done. If there is irq_work to be done, then we must raise the timer softirq regardless of if there is active timers or whether they are expired or not. The softirq can handle those cases. But we can never ignore irq_work. As it is only PREEMPT_RT_FULL that requires irq_work to be done in the softirq, we can pull out the check in the active_timers condition, and make the code a bit cleaner by having the irq_work check separate, and put the code in with the other #ifdef PREEMPT_RT. 
If there is irq_work to be done, there's no need to check the active timers or if they are expired. Just raise the timer softirq and be done with it. Otherwise, we can do the timer checks just like we do with non-rt. Signed-off-by: Steven Rostedt Signed-off-by: Sebastian Andrzej Siewior --- kernel/time/timer.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f322befedd91..593911b5163b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1482,18 +1482,20 @@ void run_local_timers(void) * the timer softirq. */ #ifdef CONFIG_PREEMPT_RT_FULL + /* On RT, irq work runs from softirq */ + if (irq_work_needs_cpu()) { + raise_softirq(TIMER_SOFTIRQ); + return; + } + if (!spin_do_trylock(&base->lock)) { raise_softirq(TIMER_SOFTIRQ); return; } #endif - if (!base->active_timers) { -#ifdef CONFIG_PREEMPT_RT_FULL - /* On RT, irq work runs from softirq */ - if (!irq_work_needs_cpu()) -#endif - goto out; - } + + if (!base->active_timers) + goto out; /* Check whether the next pending timer has expired */ if (time_before_eq(base->next_timer, jiffies)) -- cgit v1.2.3 From a5f859b90837860d0c41f8649994a0d2e5410af3 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 15 Nov 2013 15:46:50 +0100 Subject: rtmutex: use a trylock for waiter lock in trylock Mike Galbraith captured the following: | >#11 [ffff88017b243e90] _raw_spin_lock at ffffffff815d2596 | >#12 [ffff88017b243e90] rt_mutex_trylock at ffffffff815d15be | >#13 [ffff88017b243eb0] get_next_timer_interrupt at ffffffff81063b42 | >#14 [ffff88017b243f00] tick_nohz_stop_sched_tick at ffffffff810bd1fd | >#15 [ffff88017b243f70] tick_nohz_irq_exit at ffffffff810bd7d2 | >#16 [ffff88017b243f90] irq_exit at ffffffff8105b02d | >#17 [ffff88017b243fb0] reschedule_interrupt at ffffffff815db3dd | >--- --- | >#18 [ffff88017a2a9bc8] reschedule_interrupt at ffffffff815db3dd | > [exception RIP: task_blocks_on_rt_mutex+51] | >#19 [ffff88017a2a9ce0] rt_spin_lock_slowlock at ffffffff815d183c | >#20 [ffff88017a2a9da0] lock_timer_base.isra.35 at ffffffff81061cbf | >#21 [ffff88017a2a9dd0] schedule_timeout at ffffffff815cf1ce | >#22 [ffff88017a2a9e50] rcu_gp_kthread at ffffffff810f9bbb | >#23 [ffff88017a2a9ed0] kthread at ffffffff810796d5 | >#24 [ffff88017a2a9f50] ret_from_fork at ffffffff815da04c lock_timer_base() does a try_lock() which deadlocks on the waiter lock not the lock itself. This patch takes the waiter_lock with trylock so it should work from interrupt context as well. If the fastpath doesn't work and the waiter_lock itself is taken then it seems that the lock itself is taken. This patch also adds "rt_spin_unlock_after_trylock_in_irq" to keep lockdep happy. If we managed to take the wait_lock in the first place we should also be able to take it in the unlock path.
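To make the intended calling convention concrete, here is a small sketch that is not part of the patch; peek_next_timer() is made up for illustration and loosely modelled on the get_next_timer_interrupt()/run_local_timers() users, while spin_do_trylock() and rt_spin_unlock_after_trylock_in_irq() are the helpers referenced in this series:

/*
 * Runs in hard interrupt context on -rt: only peek at the timer base if
 * the sleeping spinlock can be taken without blocking, otherwise return
 * the pessimistic "expires on the next tick" value.
 */
static unsigned long peek_next_timer(struct tvec_base *base, unsigned long now)
{
	unsigned long expires = now + 1;

	if (!spin_do_trylock(&base->lock))
		return expires;

	if (base->active_timers)
		expires = base->next_timer;

	rt_spin_unlock_after_trylock_in_irq(&base->lock);
	return expires;
}

The unlock side has to go through rt_spin_unlock_after_trylock_in_irq() so that the wait_lock is again taken via trylock and lockdep stays happy, mirroring how the lock was acquired.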
Cc: stable-rt@vger.kernel.org Reported-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior --- include/linux/spinlock_rt.h | 1 + kernel/locking/rtmutex.c | 31 +++++++++++++++++++++++++++---- kernel/time/timer.c | 2 +- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index 447c457950a4..ac6f08bde7e8 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -22,6 +22,7 @@ extern void __lockfunc rt_spin_lock(spinlock_t *lock); extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock); extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); extern void __lockfunc rt_spin_unlock(spinlock_t *lock); +extern void __lockfunc rt_spin_unlock_after_trylock_in_irq(spinlock_t *lock); extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock); extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a15062c9d7cb..eadae8632a4f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1061,10 +1061,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock); /* * Slow path to release a rt_mutex spin_lock style */ -static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) +static void __sched __rt_spin_lock_slowunlock(struct rt_mutex *lock) { - raw_spin_lock(&lock->wait_lock); - debug_rt_mutex_unlock(lock); rt_mutex_deadlock_account_unlock(current); @@ -1083,6 +1081,23 @@ static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) rt_mutex_adjust_prio(current); } +static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) +{ + raw_spin_lock(&lock->wait_lock); + __rt_spin_lock_slowunlock(lock); +} + +static void noinline __sched rt_spin_lock_slowunlock_hirq(struct rt_mutex *lock) +{ + int ret; + + do { + ret = raw_spin_trylock(&lock->wait_lock); + } while (!ret); + + __rt_spin_lock_slowunlock(lock); +} + void __lockfunc rt_spin_lock(spinlock_t *lock) { rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); @@ -1113,6 +1128,13 @@ void __lockfunc rt_spin_unlock(spinlock_t *lock) } EXPORT_SYMBOL(rt_spin_unlock); +void __lockfunc rt_spin_unlock_after_trylock_in_irq(spinlock_t *lock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + spin_release(&lock->dep_map, 1, _RET_IP_); + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_hirq); +} + void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) { rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); @@ -1723,7 +1745,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) * The mutex has currently no owner. Lock the wait lock and * try to acquire the lock. 
*/ - raw_spin_lock(&lock->wait_lock); + if (!raw_spin_trylock(&lock->wait_lock)) + return 0; ret = try_to_take_rt_mutex(lock, current, NULL); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 593911b5163b..311a17b73b28 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1425,7 +1425,7 @@ unsigned long get_next_timer_interrupt(unsigned long now) expires = base->next_timer; } #ifdef CONFIG_PREEMPT_RT_FULL - rt_spin_unlock(&base->lock); + rt_spin_unlock_after_trylock_in_irq(&base->lock); #else spin_unlock(&base->lock); #endif -- cgit v1.2.3 From 74142577dbfc313857d1f53f58577e4c5710daf4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 2 May 2014 21:31:50 +0200 Subject: timer: do not spin_trylock() on UP This will avoid a warning coming from the spin-lock debugging code. The lock-avoidance idea is from Steven Rostedt. Signed-off-by: Sebastian Andrzej Siewior --- kernel/time/timer.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 311a17b73b28..a1ab7e326cc2 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1482,6 +1482,19 @@ void run_local_timers(void) * the timer softirq. */ #ifdef CONFIG_PREEMPT_RT_FULL + +#ifndef CONFIG_SMP + /* + * The spin_do_trylock() later may fail as the lock may be hold before + * the interrupt arrived. The spin-lock debugging code will raise a + * warning if the try_lock fails on UP. Since this is only an + * optimization for the FULL_NO_HZ case (not to run the timer softirq on + * an nohz_full CPU) we don't really care and shedule the softirq. + */ + raise_softirq(TIMER_SOFTIRQ); + return; +#endif + /* On RT, irq work runs from softirq */ if (irq_work_needs_cpu()) { raise_softirq(TIMER_SOFTIRQ); return; } -- cgit v1.2.3 From 4e038b5124299d73f8ea8f8a01f6a6c52b209c07 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 28 Jan 2015 14:10:02 +0100 Subject: Revert "timers: do not raise softirq unconditionally" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch I revert here triggers the HRtimer switch from hardirq instead of from softirq. As a result we get a periodic interrupt before the switch is complete (that is, a hrtimer has been programmed) and so the tick still programs periodic mode. Since the timer has been shut down, dev->next_event is set to max and the next increment makes it negative.
And now we wait… Signed-off-by: Sebastian Andrzej Siewior --- include/linux/hrtimer.h | 3 ++- kernel/time/hrtimer.c | 31 ++++++++++++++++++++++++------- kernel/time/timer.c | 46 ++-------------------------------------------- 3 files changed, 28 insertions(+), 52 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 50fc06dfe54a..32a0af13f438 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -455,8 +455,9 @@ extern int schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, const enum hrtimer_mode mode, int clock); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); -/* Called from the periodic timer tick */ +/* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); +extern void hrtimer_run_pending(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3d9b13d709f6..20a6809b4b02 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1659,6 +1659,30 @@ static void run_hrtimer_softirq(struct softirq_action *h) hrtimer_rt_run_pending(); } +/* + * Called from timer softirq every jiffy, expire hrtimers: + * + * For HRT its the fall back code to run the softirq in the timer + * softirq context in case the hrtimer initialization failed or has + * not been done yet. + */ +void hrtimer_run_pending(void) +{ + if (hrtimer_hres_active()) + return; + + /* + * This _is_ ugly: We have to check in the softirq context, + * whether we can switch to highres and / or nohz mode. The + * clocksource switch happens in the timer interrupt with + * xtime_lock held. Notification from there only sets the + * check bit in the tick_oneshot code, otherwise we might + * deadlock vs. xtime_lock. + */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) + hrtimer_switch_to_hres(); +} + /* * Called from hardirq context every jiffy */ @@ -1672,13 +1696,6 @@ void hrtimer_run_queues(void) if (hrtimer_hres_active()) return; - /* - * Check whether we can switch to highres mode. - */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()) - && hrtimer_switch_to_hres()) - return; - for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { base = &cpu_base->clock_base[index]; if (!timerqueue_getnext(&base->active)) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a1ab7e326cc2..ab5f3e458fdf 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1464,6 +1464,8 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __this_cpu_read(tvec_bases); + hrtimer_run_pending(); + if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } @@ -1473,52 +1475,8 @@ static void run_timer_softirq(struct softirq_action *h) */ void run_local_timers(void) { - struct tvec_base *base = __this_cpu_read(tvec_bases); - hrtimer_run_queues(); - /* - * We can access this lockless as we are in the timer - * interrupt. If there are no timers queued, nothing to do in - * the timer softirq. - */ -#ifdef CONFIG_PREEMPT_RT_FULL - -#ifndef CONFIG_SMP - /* - * The spin_do_trylock() later may fail as the lock may be hold before - * the interrupt arrived. The spin-lock debugging code will raise a - * warning if the try_lock fails on UP. Since this is only an - * optimization for the FULL_NO_HZ case (not to run the timer softirq on - * an nohz_full CPU) we don't really care and shedule the softirq. 
- */ raise_softirq(TIMER_SOFTIRQ); - return; -#endif - - /* On RT, irq work runs from softirq */ - if (irq_work_needs_cpu()) { - raise_softirq(TIMER_SOFTIRQ); - return; - } - - if (!spin_do_trylock(&base->lock)) { - raise_softirq(TIMER_SOFTIRQ); - return; - } -#endif - - if (!base->active_timers) - goto out; - - /* Check whether the next pending timer has expired */ - if (time_before_eq(base->next_timer, jiffies)) - raise_softirq(TIMER_SOFTIRQ); -out: -#ifdef CONFIG_PREEMPT_RT_FULL - rt_spin_unlock_after_trylock_in_irq(&base->lock); -#endif - /* The ; ensures that gcc won't complain in the !RT case */ - ; } #ifdef __ARCH_WANT_SYS_ALARM -- cgit v1.2.3 From ca88610c9b5e089c52db4441ea40fd0d0249410d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 13 Aug 2011 00:23:17 +0200 Subject: rcu: Frob softirq test With RT_FULL we get the below wreckage: [ 126.060484] ======================================================= [ 126.060486] [ INFO: possible circular locking dependency detected ] [ 126.060489] 3.0.1-rt10+ #30 [ 126.060490] ------------------------------------------------------- [ 126.060492] irq/24-eth0/1235 is trying to acquire lock: [ 126.060495] (&(lock)->wait_lock#2){+.+...}, at: [] rt_mutex_slowunlock+0x16/0x55 [ 126.060503] [ 126.060504] but task is already holding lock: [ 126.060506] (&p->pi_lock){-...-.}, at: [] try_to_wake_up+0x35/0x429 [ 126.060511] [ 126.060511] which lock already depends on the new lock. [ 126.060513] [ 126.060514] [ 126.060514] the existing dependency chain (in reverse order) is: [ 126.060516] [ 126.060516] -> #1 (&p->pi_lock){-...-.}: [ 126.060519] [] lock_acquire+0x145/0x18a [ 126.060524] [] _raw_spin_lock_irqsave+0x4b/0x85 [ 126.060527] [] task_blocks_on_rt_mutex+0x36/0x20f [ 126.060531] [] rt_mutex_slowlock+0xd1/0x15a [ 126.060534] [] rt_mutex_lock+0x2d/0x2f [ 126.060537] [] rcu_boost+0xad/0xde [ 126.060541] [] rcu_boost_kthread+0x7d/0x9b [ 126.060544] [] kthread+0x99/0xa1 [ 126.060547] [] kernel_thread_helper+0x4/0x10 [ 126.060551] [ 126.060552] -> #0 (&(lock)->wait_lock#2){+.+...}: [ 126.060555] [] __lock_acquire+0x1157/0x1816 [ 126.060558] [] lock_acquire+0x145/0x18a [ 126.060561] [] _raw_spin_lock+0x40/0x73 [ 126.060564] [] rt_mutex_slowunlock+0x16/0x55 [ 126.060566] [] rt_mutex_unlock+0x27/0x29 [ 126.060569] [] rcu_read_unlock_special+0x17e/0x1c4 [ 126.060573] [] __rcu_read_unlock+0x48/0x89 [ 126.060576] [] select_task_rq_rt+0xc7/0xd5 [ 126.060580] [] try_to_wake_up+0x175/0x429 [ 126.060583] [] wake_up_process+0x15/0x17 [ 126.060585] [] wakeup_softirqd+0x24/0x26 [ 126.060590] [] irq_exit+0x49/0x55 [ 126.060593] [] smp_apic_timer_interrupt+0x8a/0x98 [ 126.060597] [] apic_timer_interrupt+0x13/0x20 [ 126.060600] [] irq_forced_thread_fn+0x1b/0x44 [ 126.060603] [] irq_thread+0xde/0x1af [ 126.060606] [] kthread+0x99/0xa1 [ 126.060608] [] kernel_thread_helper+0x4/0x10 [ 126.060611] [ 126.060612] other info that might help us debug this: [ 126.060614] [ 126.060615] Possible unsafe locking scenario: [ 126.060616] [ 126.060617] CPU0 CPU1 [ 126.060619] ---- ---- [ 126.060620] lock(&p->pi_lock); [ 126.060623] lock(&(lock)->wait_lock); [ 126.060625] lock(&p->pi_lock); [ 126.060627] lock(&(lock)->wait_lock); [ 126.060629] [ 126.060629] *** DEADLOCK *** [ 126.060630] [ 126.060632] 1 lock held by irq/24-eth0/1235: [ 126.060633] #0: (&p->pi_lock){-...-.}, at: [] try_to_wake_up+0x35/0x429 [ 126.060638] [ 126.060638] stack backtrace: [ 126.060641] Pid: 1235, comm: irq/24-eth0 Not tainted 3.0.1-rt10+ #30 [ 126.060643] Call Trace: [ 126.060644] [] 
print_circular_bug+0x289/0x29a [ 126.060651] [] __lock_acquire+0x1157/0x1816 [ 126.060655] [] ? trace_hardirqs_off_caller+0x1f/0x99 [ 126.060658] [] ? rt_mutex_slowunlock+0x16/0x55 [ 126.060661] [] lock_acquire+0x145/0x18a [ 126.060664] [] ? rt_mutex_slowunlock+0x16/0x55 [ 126.060668] [] _raw_spin_lock+0x40/0x73 [ 126.060671] [] ? rt_mutex_slowunlock+0x16/0x55 [ 126.060674] [] ? rcu_report_qs_rsp+0x87/0x8c [ 126.060677] [] rt_mutex_slowunlock+0x16/0x55 [ 126.060680] [] ? rcu_read_unlock_special+0x9b/0x1c4 [ 126.060683] [] rt_mutex_unlock+0x27/0x29 [ 126.060687] [] rcu_read_unlock_special+0x17e/0x1c4 [ 126.060690] [] __rcu_read_unlock+0x48/0x89 [ 126.060693] [] select_task_rq_rt+0xc7/0xd5 [ 126.060696] [] ? select_task_rq_rt+0x27/0xd5 [ 126.060701] [] ? clockevents_program_event+0x8e/0x90 [ 126.060704] [] try_to_wake_up+0x175/0x429 [ 126.060708] [] ? tick_program_event+0x1f/0x21 [ 126.060711] [] wake_up_process+0x15/0x17 [ 126.060715] [] wakeup_softirqd+0x24/0x26 [ 126.060718] [] irq_exit+0x49/0x55 [ 126.060721] [] smp_apic_timer_interrupt+0x8a/0x98 [ 126.060724] [] apic_timer_interrupt+0x13/0x20 [ 126.060726] [] ? migrate_disable+0x75/0x12d [ 126.060733] [] ? local_bh_disable+0xe/0x1f [ 126.060736] [] ? local_bh_disable+0x1d/0x1f [ 126.060739] [] irq_forced_thread_fn+0x1b/0x44 [ 126.060742] [] ? _raw_spin_unlock_irq+0x3b/0x59 [ 126.060745] [] irq_thread+0xde/0x1af [ 126.060748] [] ? irq_thread_fn+0x3a/0x3a [ 126.060751] [] ? irq_finalize_oneshot+0xd1/0xd1 [ 126.060754] [] ? irq_finalize_oneshot+0xd1/0xd1 [ 126.060757] [] kthread+0x99/0xa1 [ 126.060761] [] kernel_thread_helper+0x4/0x10 [ 126.060764] [] ? finish_task_switch+0x87/0x10a [ 126.060768] [] ? retint_restore_args+0xe/0xe [ 126.060771] [] ? __init_kthread_worker+0x8c/0x8c [ 126.060774] [] ? gs_change+0xb/0xb Because irq_exit() does: void irq_exit(void) { account_system_vtime(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); ... } Which triggers a wakeup, which uses RCU, now if the interrupted task has t->rcu_read_unlock_special set, the rcu usage from the wakeup will end up in rcu_read_unlock_special(). rcu_read_unlock_special() will test for in_irq(), which will fail as we just decremented preempt_count with IRQ_EXIT_OFFSET, and in_sering_softirq(), which for PREEMPT_RT_FULL reads: int in_serving_softirq(void) { int res; preempt_disable(); res = __get_cpu_var(local_softirq_runner) == current; preempt_enable(); return res; } Which will thus also fail, resulting in the above wreckage. The 'somewhat' ugly solution is to open-code the preempt_count() test in rcu_read_unlock_special(). Also, we're not at all sure how ->rcu_read_unlock_special gets set here... so this is very likely a bandaid and more thought is required. Cc: Paul E. McKenney Signed-off-by: Peter Zijlstra --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c2d838b218ef..05c7a7694540 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -335,7 +335,7 @@ void rcu_read_unlock_special(struct task_struct *t) } /* Hardware IRQ handlers cannot block, complain if they get here. 
*/ - if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { + if (WARN_ON_ONCE(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET))) { local_irq_restore(flags); return; } -- cgit v1.2.3 From 0419badfb4436886f662ad00b38787b86d3e684e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 5 Oct 2011 11:59:38 -0700 Subject: rcu: Merge RCU-bh into RCU-preempt The Linux kernel has long RCU-bh read-side critical sections that intolerably increase scheduling latency under mainline's RCU-bh rules, which include RCU-bh read-side critical sections being non-preemptible. This patch therefore arranges for RCU-bh to be implemented in terms of RCU-preempt for CONFIG_PREEMPT_RT_FULL=y. This has the downside of defeating the purpose of RCU-bh, namely, handling the case where the system is subjected to a network-based denial-of-service attack that keeps at least one CPU doing full-time softirq processing. This issue will be fixed by a later commit. The current commit will need some work to make it appropriate for mainline use, for example, it needs to be extended to cover Tiny RCU. [ paulmck: Added a useful changelog ] Signed-off-by: Thomas Gleixner Signed-off-by: Paul E. McKenney Link: http://lkml.kernel.org/r/20111005185938.GA20403@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/rcupdate.h | 25 +++++++++++++++++++++++++ include/linux/rcutree.h | 18 ++++++++++++++++-- kernel/rcu/tree.c | 16 ++++++++++++++++ kernel/rcu/update.c | 2 ++ 4 files changed, 59 insertions(+), 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 86bff6fcac2e..1f41efc6477b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -147,6 +147,9 @@ void call_rcu(struct rcu_head *head, #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_PREEMPT_RT_FULL +#define call_rcu_bh call_rcu +#else /** * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. @@ -170,6 +173,7 @@ void call_rcu(struct rcu_head *head, */ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head)); +#endif /** * call_rcu_sched() - Queue an RCU for invocation after sched grace period. @@ -266,7 +270,13 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ void rcu_init(void); void rcu_sched_qs(void); + +#ifdef CONFIG_PREEMPT_RT_FULL +static inline void rcu_bh_qs(void) { } +#else void rcu_bh_qs(void); +#endif + void rcu_check_callbacks(int cpu, int user); struct notifier_block; void rcu_idle_enter(void); @@ -437,7 +447,14 @@ extern struct lockdep_map rcu_callback_map; int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); +#ifdef CONFIG_PREEMPT_RT_FULL +static inline int rcu_read_lock_bh_held(void) +{ + return rcu_read_lock_held(); +} +#else int rcu_read_lock_bh_held(void); +#endif /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? 
@@ -962,10 +979,14 @@ static inline void rcu_read_unlock(void) static inline void rcu_read_lock_bh(void) { local_bh_disable(); +#ifdef CONFIG_PREEMPT_RT_FULL + rcu_read_lock(); +#else __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); rcu_lockdep_assert(rcu_is_watching(), "rcu_read_lock_bh() used illegally while idle"); +#endif } /* @@ -975,10 +996,14 @@ static inline void rcu_read_lock_bh(void) */ static inline void rcu_read_unlock_bh(void) { +#ifdef CONFIG_PREEMPT_RT_FULL + rcu_read_unlock(); +#else rcu_lockdep_assert(rcu_is_watching(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); +#endif local_bh_enable(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 3e2f5d432743..b9899eff12f9 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -46,7 +46,11 @@ static inline void rcu_virt_note_context_switch(int cpu) rcu_note_context_switch(cpu); } +#ifdef CONFIG_PREEMPT_RT_FULL +# define synchronize_rcu_bh synchronize_rcu +#else void synchronize_rcu_bh(void); +#endif void synchronize_sched_expedited(void); void synchronize_rcu_expedited(void); @@ -74,7 +78,11 @@ static inline void synchronize_rcu_bh_expedited(void) } void rcu_barrier(void); +#ifdef CONFIG_PREEMPT_RT_FULL +# define rcu_barrier_bh rcu_barrier +#else void rcu_barrier_bh(void); +#endif void rcu_barrier_sched(void); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate); @@ -82,12 +90,10 @@ void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; long rcu_batches_completed(void); -long rcu_batches_completed_bh(void); long rcu_batches_completed_sched(void); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); -void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); void exit_rcu(void); @@ -97,4 +103,12 @@ extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +#ifndef CONFIG_PREEMPT_RT_FULL +void rcu_bh_force_quiescent_state(void); +long rcu_batches_completed_bh(void); +#else +# define rcu_bh_force_quiescent_state rcu_force_quiescent_state +# define rcu_batches_completed_bh rcu_batches_completed +#endif + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9815447d22e0..99a414ded515 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -207,6 +207,7 @@ void rcu_sched_qs(void) } } +#ifndef CONFIG_PREEMPT_RT_FULL void rcu_bh_qs(void) { if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { @@ -216,6 +217,7 @@ void rcu_bh_qs(void) __this_cpu_write(rcu_bh_data.passed_quiesce, 1); } } +#endif static DEFINE_PER_CPU(int, rcu_sched_qs_mask); @@ -336,6 +338,7 @@ long rcu_batches_completed_sched(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); +#ifndef CONFIG_PREEMPT_RT_FULL /* * Return the number of RCU BH batches processed thus far for debug & stats. */ @@ -363,6 +366,13 @@ void rcu_bh_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); +#else +void rcu_force_quiescent_state(void) +{ +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +#endif + /* * Show the state of the grace-period kthreads. */ @@ -2734,6 +2744,7 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_sched); +#ifndef CONFIG_PREEMPT_RT_FULL /* * Queue an RCU callback for invocation after a quicker grace period. 
*/ @@ -2742,6 +2753,7 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu_bh); +#endif /* * Queue an RCU callback for lazy invocation after a grace period. @@ -2833,6 +2845,7 @@ void synchronize_sched(void) } EXPORT_SYMBOL_GPL(synchronize_sched); +#ifndef CONFIG_PREEMPT_RT_FULL /** * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. * @@ -2859,6 +2872,7 @@ void synchronize_rcu_bh(void) wait_rcu_gp(call_rcu_bh); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +#endif /** * get_state_synchronize_rcu - Snapshot current RCU state @@ -3341,6 +3355,7 @@ static void _rcu_barrier(struct rcu_state *rsp) mutex_unlock(&rsp->barrier_mutex); } +#ifndef CONFIG_PREEMPT_RT_FULL /** * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. */ @@ -3349,6 +3364,7 @@ void rcu_barrier_bh(void) _rcu_barrier(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); +#endif /** * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3ef8ba58694e..d80edcf92f94 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -170,6 +170,7 @@ int rcu_read_lock_held(void) } EXPORT_SYMBOL_GPL(rcu_read_lock_held); +#ifndef CONFIG_PREEMPT_RT_FULL /** * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? * @@ -196,6 +197,7 @@ int rcu_read_lock_bh_held(void) return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); +#endif #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -- cgit v1.2.3 From 21ad1533f5e172a9cf485ea8f2e1537db3328668 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Nov 2011 10:57:54 +0100 Subject: rcu-more-fallout.patch Signed-off-by: Thomas Gleixner --- kernel/rcu/tiny.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c0623fc47125..1723df96c689 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -370,6 +370,7 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_sched); +#ifndef CONFIG_PREEMPT_RT_FULL /* * Post an RCU bottom-half callback to be invoked after any subsequent * quiescent state. @@ -379,6 +380,7 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu_bh); +#endif void rcu_init(void) { -- cgit v1.2.3 From 262312713c24334c0b1bc57139f93e87ab39c98f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Oct 2011 11:45:18 -0700 Subject: rcu: Make ksoftirqd do RCU quiescent states Implementing RCU-bh in terms of RCU-preempt makes the system vulnerable to network-based denial-of-service attacks. This patch therefore makes __do_softirq() invoke rcu_bh_qs(), but only when __do_softirq() is running in ksoftirqd context. A wrapper layer in interposed so that other calls to __do_softirq() avoid invoking rcu_bh_qs(). The underlying function __do_softirq_common() does the actual work. The reason that rcu_bh_qs() is bad in these non-ksoftirqd contexts is that there might be a local_bh_enable() inside an RCU-preempt read-side critical section. This local_bh_enable() can invoke __do_softirq() directly, so if __do_softirq() were to invoke rcu_bh_qs() (which just calls rcu_preempt_qs() in the PREEMPT_RT_FULL case), there would be an illegal RCU-preempt quiescent state in the middle of an RCU-preempt read-side critical section. 
Therefore, quiescent states can only happen in cases where __do_softirq() is invoked directly from ksoftirqd. Signed-off-by: Paul E. McKenney Link: http://lkml.kernel.org/r/20111005184518.GA21601@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/rcupdate.h | 6 ------ kernel/rcu/tree.c | 9 ++++++++- kernel/rcu/tree_plugin.h | 8 ++++++-- kernel/softirq.c | 18 ++++++++++++------ 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1f41efc6477b..10b72bd4ec2e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -270,13 +270,7 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ void rcu_init(void); void rcu_sched_qs(void); - -#ifdef CONFIG_PREEMPT_RT_FULL -static inline void rcu_bh_qs(void) { } -#else void rcu_bh_qs(void); -#endif - void rcu_check_callbacks(int cpu, int user); struct notifier_block; void rcu_idle_enter(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 99a414ded515..fffb361b3dd4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -207,7 +207,14 @@ void rcu_sched_qs(void) } } -#ifndef CONFIG_PREEMPT_RT_FULL +#ifdef CONFIG_PREEMPT_RT_FULL +static void rcu_preempt_qs(void); + +void rcu_bh_qs(void) +{ + rcu_preempt_qs(); +} +#else void rcu_bh_qs(void) { if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 05c7a7694540..36d044a001a2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1494,7 +1494,7 @@ static void rcu_prepare_kthreads(int cpu) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#if !defined(CONFIG_RCU_FAST_NO_HZ) +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) /* * Check to see if any future RCU-related work will need to be done @@ -1512,7 +1512,9 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) return rcu_cpu_has_callbacks(cpu, NULL); } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */ +#if !defined(CONFIG_RCU_FAST_NO_HZ) /* * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up * after it. @@ -1609,6 +1611,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) return cbs_ready; } +#ifndef CONFIG_PREEMPT_RT_FULL + /* * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready * to invoke. If the CPU has callbacks, try to advance them. Tell the @@ -1649,7 +1653,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) return 0; } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ - +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */ /* * Prepare a CPU for idle from an RCU perspective. The first major task * is to sense whether nohz mode has been enabled or disabled via sysfs. 
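Before the kernel/softirq.c hunks, it may help to see the control flow they establish in one place. The following is a condensed paraphrase of that diff (function bodies abbreviated, not compilable on its own); the names match the patch:

/* Softirq execution, with quiescent-state reporting made conditional. */
static void handle_pending_softirqs(u32 pending, int need_rcu_bh_qs)
{
	/* ... run the pending softirq handlers ... */
	if (need_rcu_bh_qs)
		rcu_bh_qs();	/* on RT this maps to rcu_preempt_qs() */
	local_irq_disable();
}

static void __do_softirq_common(int need_rcu_bh_qs)
{
	/* ... */
	handle_pending_softirqs(local_softirq_pending(), need_rcu_bh_qs);
	/* ... */
}

/* Reached via local_bh_enable() and similar paths, which may nest inside
 * an RCU-preempt read-side critical section: no quiescent state here.
 * The softirq threads (ksoftirqd and friends) call __do_softirq_common()
 * with a true flag instead, so only they report BH quiescent states. */
asmlinkage void __do_softirq(void)
{
	__do_softirq_common(0);
}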
diff --git a/kernel/softirq.c b/kernel/softirq.c index 41afcc5d7151..27e4ad293d67 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -142,7 +142,7 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -static void handle_pending_softirqs(u32 pending) +static void handle_pending_softirqs(u32 pending, int need_rcu_bh_qs) { struct softirq_action *h = softirq_vec; int softirq_bit; @@ -175,7 +175,8 @@ static void handle_pending_softirqs(u32 pending) pending >>= softirq_bit; } - rcu_bh_qs(); + if (need_rcu_bh_qs) + rcu_bh_qs(); local_irq_disable(); } @@ -351,7 +352,7 @@ restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - handle_pending_softirqs(pending); + handle_pending_softirqs(pending, 1); pending = local_softirq_pending(); if (pending) { @@ -400,7 +401,12 @@ static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); static DEFINE_PER_CPU(struct task_struct *, local_softirq_runner); -asmlinkage void __do_softirq(void); +static void __do_softirq_common(int need_rcu_bh_qs); + +asmlinkage void __do_softirq(void) +{ + __do_softirq_common(0); +} void __init softirq_early_init(void) { @@ -509,7 +515,7 @@ asmlinkage void __do_softirq(void) lockdep_softirq_enter(); - handle_pending_softirqs(pending, cpu); + handle_pending_softirqs(pending, need_rcu_bh_qs); pending = local_softirq_pending(); if (pending) @@ -548,7 +554,7 @@ static int __thread_do_softirq(int cpu) * schedule! */ if (local_softirq_pending()) - __do_softirq(); + __do_softirq_common(cpu >= 0); local_unlock(local_softirq_lock); unpin_current_cpu(); preempt_disable(); -- cgit v1.2.3 From f46fd0def88f53c668640dcd4e576c634724f813 Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Wed, 18 Dec 2013 17:51:49 +0800 Subject: rcutree/rcu_bh_qs: disable irq while calling rcu_preempt_qs() Any callers to the function rcu_preempt_qs() must disable irqs in order to protect the assignment to ->rcu_read_unlock_special. In the RT case, rcu_bh_qs(), as the wrapper of rcu_preempt_qs(), is called in some scenarios where irqs are enabled, as in this path, do_single_softirq() | + local_irq_enable(); + handle_softirq() | | | + rcu_bh_qs() | | | + rcu_preempt_qs() | + local_irq_disable() So disable irqs directly inside rcu_bh_qs() to fix this; otherwise the kernel may freeze sometimes, as observed. This is also safe for any potential rcu_bh_qs() usage added elsewhere in the future. Cc: stable-rt@vger.kernel.org Signed-off-by: Tiejun Chen Signed-off-by: Bin Jiang Signed-off-by: Sebastian Andrzej Siewior --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fffb361b3dd4..140274512c0a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -212,7 +212,12 @@ static void rcu_preempt_qs(void); void rcu_bh_qs(void) { + unsigned long flags; + + /* Callers to this function, rcu_preempt_qs(), must disable irqs.
*/ + local_irq_save(flags); rcu_preempt_qs(); + local_irq_restore(flags); } #else void rcu_bh_qs(void) -- cgit v1.2.3 From fe46c4810a4438dad8ad582990c309de2a7136d1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Jun 2011 11:02:21 +0200 Subject: lglocks-rt.patch Signed-off-by: Thomas Gleixner --- include/linux/lglock.h | 21 +++++++++++++++++-- kernel/locking/lglock.c | 54 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 55 insertions(+), 20 deletions(-) diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 0081f000e34b..0ca9a0e0ba06 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -34,22 +34,39 @@ #endif struct lglock { +#ifndef CONFIG_PREEMPT_RT_FULL arch_spinlock_t __percpu *lock; +#else + struct rt_mutex __percpu *lock; +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lock_key; struct lockdep_map lock_dep_map; #endif }; -#define DEFINE_LGLOCK(name) \ +#ifndef CONFIG_PREEMPT_RT_FULL +# define DEFINE_LGLOCK(name) \ static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ = __ARCH_SPIN_LOCK_UNLOCKED; \ struct lglock name = { .lock = &name ## _lock } -#define DEFINE_STATIC_LGLOCK(name) \ +# define DEFINE_STATIC_LGLOCK(name) \ static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ = __ARCH_SPIN_LOCK_UNLOCKED; \ static struct lglock name = { .lock = &name ## _lock } +#else + +# define DEFINE_LGLOCK(name) \ + static DEFINE_PER_CPU(struct rt_mutex, name ## _lock) \ + = __RT_MUTEX_INITIALIZER( name ## _lock); \ + struct lglock name = { .lock = &name ## _lock } + +# define DEFINE_STATIC_LGLOCK(name) \ + static DEFINE_PER_CPU(struct rt_mutex, name ## _lock) \ + = __RT_MUTEX_INITIALIZER( name ## _lock); \ + static struct lglock name = { .lock = &name ## _lock } +#endif void lg_lock_init(struct lglock *lg, char *name); void lg_local_lock(struct lglock *lg); diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c index 86ae2aebf004..f2356df6a5d1 100644 --- a/kernel/locking/lglock.c +++ b/kernel/locking/lglock.c @@ -4,6 +4,15 @@ #include #include +#ifndef CONFIG_PREEMPT_RT_FULL +# define lg_lock_ptr arch_spinlock_t +# define lg_do_lock(l) arch_spin_lock(l) +# define lg_do_unlock(l) arch_spin_unlock(l) +#else +# define lg_lock_ptr struct rt_mutex +# define lg_do_lock(l) __rt_spin_lock(l) +# define lg_do_unlock(l) __rt_spin_unlock(l) +#endif /* * Note there is no uninit, so lglocks cannot be defined in * modules (but it's fine to use them from there) @@ -12,51 +21,60 @@ void lg_lock_init(struct lglock *lg, char *name) { +#ifdef CONFIG_PREEMPT_RT_FULL + int i; + + for_each_possible_cpu(i) { + struct rt_mutex *lock = per_cpu_ptr(lg->lock, i); + + rt_mutex_init(lock); + } +#endif LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); } EXPORT_SYMBOL(lg_lock_init); void lg_local_lock(struct lglock *lg) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; - preempt_disable(); + migrate_disable(); lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = this_cpu_ptr(lg->lock); - arch_spin_lock(lock); + lg_do_lock(lock); } EXPORT_SYMBOL(lg_local_lock); void lg_local_unlock(struct lglock *lg) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = this_cpu_ptr(lg->lock); - arch_spin_unlock(lock); - preempt_enable(); + lg_do_unlock(lock); + migrate_enable(); } EXPORT_SYMBOL(lg_local_unlock); void lg_local_lock_cpu(struct lglock *lg, int cpu) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; - preempt_disable(); + preempt_disable_nort(); lock_acquire_shared(&lg->lock_dep_map, 0, 
0, NULL, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_lock(lock); + lg_do_lock(lock); } EXPORT_SYMBOL(lg_local_lock_cpu); void lg_local_unlock_cpu(struct lglock *lg, int cpu) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_unlock(lock); - preempt_enable(); + lg_do_unlock(lock); + preempt_enable_nort(); } EXPORT_SYMBOL(lg_local_unlock_cpu); @@ -64,12 +82,12 @@ void lg_global_lock(struct lglock *lg) { int i; - preempt_disable(); + preempt_disable_nort(); lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); for_each_possible_cpu(i) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; lock = per_cpu_ptr(lg->lock, i); - arch_spin_lock(lock); + lg_do_lock(lock); } } EXPORT_SYMBOL(lg_global_lock); @@ -80,10 +98,10 @@ void lg_global_unlock(struct lglock *lg) lock_release(&lg->lock_dep_map, 1, _RET_IP_); for_each_possible_cpu(i) { - arch_spinlock_t *lock; + lg_lock_ptr *lock; lock = per_cpu_ptr(lg->lock, i); - arch_spin_unlock(lock); + lg_do_unlock(lock); } - preempt_enable(); + preempt_enable_nort(); } EXPORT_SYMBOL(lg_global_unlock); -- cgit v1.2.3 From 9d0af9da7fa99461c3afd8d4abf7b7cbd981e9de Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 2 May 2014 13:13:22 +0200 Subject: stomp-machine: create lg_global_trylock_relax() primitive Create lg_global_trylock_relax() for use by stopper thread when it cannot schedule, to deal with stop_cpus_lock, which is now an lglock. Signed-off-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior --- include/linux/lglock.h | 6 ++++++ include/linux/spinlock_rt.h | 1 + kernel/locking/lglock.c | 25 +++++++++++++++++++++++++ kernel/locking/rtmutex.c | 5 +++++ 4 files changed, 37 insertions(+) diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 0ca9a0e0ba06..9603a1500267 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -76,6 +76,12 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu); void lg_global_lock(struct lglock *lg); void lg_global_unlock(struct lglock *lg); +#ifndef CONFIG_PREEMPT_RT_FULL +#define lg_global_trylock_relax(name) lg_global_lock(name) +#else +void lg_global_trylock_relax(struct lglock *lg); +#endif + #else /* When !CONFIG_SMP, map lglock to spinlock */ #define lglock spinlock diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index ac6f08bde7e8..c0d1367d3e10 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -35,6 +35,7 @@ extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock); */ extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); +extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock); #define spin_lock(lock) \ do { \ diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c index f2356df6a5d1..9397974b142f 100644 --- a/kernel/locking/lglock.c +++ b/kernel/locking/lglock.c @@ -105,3 +105,28 @@ void lg_global_unlock(struct lglock *lg) preempt_enable_nort(); } EXPORT_SYMBOL(lg_global_unlock); + +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * HACK: If you use this, you get to keep the pieces. + * Used in queue_stop_cpus_work() when stop machinery + * is called from inactive CPU, so we can't schedule. 
+ */ +# define lg_do_trylock_relax(l) \ + do { \ + while (!__rt_spin_trylock(l)) \ + cpu_relax(); \ + } while (0) + +void lg_global_trylock_relax(struct lglock *lg) +{ + int i; + + lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); + for_each_possible_cpu(i) { + lg_lock_ptr *lock; + lock = per_cpu_ptr(lg->lock, i); + lg_do_trylock_relax(lock); + } +} +#endif diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index eadae8632a4f..397373da53d2 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1153,6 +1153,11 @@ void __lockfunc rt_spin_unlock_wait(spinlock_t *lock) } EXPORT_SYMBOL(rt_spin_unlock_wait); +int __lockfunc __rt_spin_trylock(struct rt_mutex *lock) +{ + return rt_mutex_trylock(lock); +} + int __lockfunc rt_spin_trylock(spinlock_t *lock) { int ret = rt_mutex_trylock(&lock->lock); -- cgit v1.2.3 From 2d061b9f3e21cfbb24d93d3d5657b31f09ce6811 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 2 May 2014 13:13:34 +0200 Subject: stomp-machine: use lg_global_trylock_relax() to dead with stop_cpus_lock lglock If the stop machinery is called from inactive CPU we cannot use lg_global_lock(), because some other stomp machine invocation might be in progress and the lock can be contended. We cannot schedule from this context, so use the lovely new lg_global_trylock_relax() primitive to do what we used to do via one mutex_trylock()/cpu_relax() loop. We now do that trylock()/relax() across an entire herd of locks. Joy. Signed-off-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior --- kernel/stop_machine.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c27b8ef75c3a..1af29ad20970 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -266,7 +266,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * struct irq_cpu_stop_queue_work_info call_args; struct multi_stop_data msdata; - preempt_disable(); + preempt_disable_nort(); msdata = (struct multi_stop_data){ .fn = fn, .data = arg, @@ -299,7 +299,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * * This relies on the stopper workqueues to be FIFO. */ if (!cpu_active(cpu1) || !cpu_active(cpu2)) { - preempt_enable(); + preempt_enable_nort(); return -ENOENT; } @@ -313,7 +313,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * &irq_cpu_stop_queue_work, &call_args, 1); lg_local_unlock(&stop_cpus_lock); - preempt_enable(); + preempt_enable_nort(); wait_for_stop_done(&done); @@ -347,7 +347,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); static void queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, - struct cpu_stop_done *done) + struct cpu_stop_done *done, bool inactive) { struct cpu_stop_work *work; unsigned int cpu; @@ -361,11 +361,13 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, } /* - * Disable preemption while queueing to avoid getting - * preempted by a stopper which might wait for other stoppers - * to enter @fn which can lead to deadlock. + * Make sure that all work is queued on all cpus before + * any of the cpus can execute it. 
*/ - lg_global_lock(&stop_cpus_lock); + if (!inactive) + lg_global_lock(&stop_cpus_lock); + else + lg_global_trylock_relax(&stop_cpus_lock); for_each_cpu(cpu, cpumask) cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); lg_global_unlock(&stop_cpus_lock); @@ -377,7 +379,7 @@ static int __stop_cpus(const struct cpumask *cpumask, struct cpu_stop_done done; cpu_stop_init_done(&done, cpumask_weight(cpumask)); - queue_stop_cpus_work(cpumask, fn, arg, &done); + queue_stop_cpus_work(cpumask, fn, arg, &done, false); wait_for_stop_done(&done); return done.executed ? done.ret : -ENOENT; } @@ -573,6 +575,8 @@ static int __init cpu_stop_init(void) INIT_LIST_HEAD(&stopper->works); } + lg_lock_init(&stop_cpus_lock, "stop_cpus_lock"); + BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); stop_machine_initialized = true; return 0; @@ -668,7 +672,7 @@ int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, set_state(&msdata, MULTI_STOP_PREPARE); cpu_stop_init_done(&done, num_active_cpus()); queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata, - &done); + &done, true); ret = multi_cpu_stop(&msdata); /* Busy wait for completion. */ -- cgit v1.2.3 From a463dd89b0e48de59a4e6eaf23c22998e386c260 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 Jul 2011 13:32:57 +0200 Subject: drivers-tty-fix-omap-lock-crap.patch Signed-off-by: Thomas Gleixner --- drivers/tty/serial/omap-serial.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c index 18c30cabe27f..6f0629ead171 100644 --- a/drivers/tty/serial/omap-serial.c +++ b/drivers/tty/serial/omap-serial.c @@ -1270,13 +1270,10 @@ serial_omap_console_write(struct console *co, const char *s, pm_runtime_get_sync(up->dev); - local_irq_save(flags); - if (up->port.sysrq) - locked = 0; - else if (oops_in_progress) - locked = spin_trylock(&up->port.lock); + if (up->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&up->port.lock, flags); else - spin_lock(&up->port.lock); + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @@ -1305,8 +1302,7 @@ serial_omap_console_write(struct console *co, const char *s, pm_runtime_mark_last_busy(up->dev); pm_runtime_put_autosuspend(up->dev); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int __init -- cgit v1.2.3 From 8c00d220707c981c37b6220151d80470101a834d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Jan 2013 21:36:51 +0100 Subject: drivers-tty-pl011-irq-disable-madness.patch Signed-off-by: Thomas Gleixner --- drivers/tty/serial/amba-pl011.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 02016fcd91b8..5ef2c62bb904 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -1935,13 +1935,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) clk_enable(uap->clk); - local_irq_save(flags); + /* + * local_irq_save(flags); + * + * This local_irq_save() is nonsense. If we come in via sysrq + * handling then interrupts are already disabled. Aside of + * that the port.sysrq check is racy on SMP regardless. 
+ */ if (uap->port.sysrq) locked = 0; else if (oops_in_progress) - locked = spin_trylock(&uap->port.lock); + locked = spin_trylock_irqsave(&uap->port.lock, flags); else - spin_lock(&uap->port.lock); + spin_lock_irqsave(&uap->port.lock, flags); /* * First save the CR then disable the interrupts @@ -1963,8 +1969,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) writew(old_cr, uap->port.membase + UART011_CR); if (locked) - spin_unlock(&uap->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&uap->port.lock, flags); clk_disable(uap->clk); } -- cgit v1.2.3 From 0cc9eed225732f237f0d3c693fec696d1e45b21f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 14 Dec 2011 13:05:54 +0100 Subject: rt: Improve the serial console PASS_LIMIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Beyond the warning: drivers/tty/serial/8250/8250.c:1613:6: warning: unused variable ‘pass_counter’ [-Wunused-variable] the solution of just looping infinitely was ugly - up it to 1 million to give it a chance to continue in some really ugly situation. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- drivers/tty/serial/8250/8250_core.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index e5c31eadb0ac..772f1bc13598 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -81,7 +81,16 @@ static unsigned int skip_txen_test; /* force skip of txen test at init time */ #define DEBUG_INTR(fmt...) do { } while (0) #endif -#define PASS_LIMIT 512 +/* + * On -rt we can have a more delays, and legitimately + * so - so don't drop work spuriously and spam the + * syslog: + */ +#ifdef CONFIG_PREEMPT_RT_FULL +# define PASS_LIMIT 1000000 +#else +# define PASS_LIMIT 512 +#endif #define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) -- cgit v1.2.3 From 336cd9ed0de2ac1fae724f1d147f6fb5c6b9499c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 Jul 2009 08:44:27 -0500 Subject: fs: namespace preemption fix On RT we cannot loop with preemption disabled here as mnt_make_readonly() might have been preempted. We can safely enable preemption while waiting for MNT_WRITE_HOLD to be cleared. Safe on !RT as well. Signed-off-by: Thomas Gleixner --- fs/namespace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index bbde14719655..3d7e151c081e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -344,8 +344,11 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + preempt_enable(); cpu_relax(); + preempt_disable(); + } /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. 
So we must not load that until -- cgit v1.2.3 From 8cb686c0962ced1661ebf0c9d958cff5c9410b34 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Tue, 15 May 2012 13:53:56 +0800 Subject: mm: Protect activate_mm() by preempt_[disable&enable]_rt() User preempt_*_rt instead of local_irq_*_rt or otherwise there will be warning on ARM like below: WARNING: at build/linux/kernel/smp.c:459 smp_call_function_many+0x98/0x264() Modules linked in: [] (unwind_backtrace+0x0/0xe4) from [] (warn_slowpath_common+0x4c/0x64) [] (warn_slowpath_common+0x4c/0x64) from [] (warn_slowpath_null+0x18/0x1c) [] (warn_slowpath_null+0x18/0x1c) from [](smp_call_function_many+0x98/0x264) [] (smp_call_function_many+0x98/0x264) from [] (smp_call_function+0x44/0x6c) [] (smp_call_function+0x44/0x6c) from [] (__new_context+0xbc/0x124) [] (__new_context+0xbc/0x124) from [] (flush_old_exec+0x460/0x5e4) [] (flush_old_exec+0x460/0x5e4) from [] (load_elf_binary+0x2e0/0x11ac) [] (load_elf_binary+0x2e0/0x11ac) from [] (search_binary_handler+0x94/0x2a4) [] (search_binary_handler+0x94/0x2a4) from [] (do_execve+0x254/0x364) [] (do_execve+0x254/0x364) from [] (sys_execve+0x34/0x54) [] (sys_execve+0x34/0x54) from [] (ret_fast_syscall+0x0/0x30) ---[ end trace 0000000000000002 ]--- The reason is that ARM need irq enabled when doing activate_mm(). According to mm-protect-activate-switch-mm.patch, actually preempt_[disable|enable]_rt() is sufficient. Inspired-by: Steven Rostedt Signed-off-by: Yong Zhang Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1337061236-1766-1-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner --- fs/exec.c | 2 ++ mm/mmu_context.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/fs/exec.c b/fs/exec.c index 7302b75a9820..8b93bef9cfe4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -841,12 +841,14 @@ static int exec_mmap(struct mm_struct *mm) } } task_lock(tsk); + preempt_disable_rt(); active_mm = tsk->active_mm; tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); tsk->mm->vmacache_seqnum = 0; vmacache_flush(tsk); + preempt_enable_rt(); task_unlock(tsk); if (old_mm) { up_read(&old_mm->mmap_sem); diff --git a/mm/mmu_context.c b/mm/mmu_context.c index f802c2d216a7..b1b6f238e42d 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); + preempt_disable_rt(); active_mm = tsk->active_mm; if (active_mm != mm) { atomic_inc(&mm->mm_count); @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm) } tsk->mm = mm; switch_mm(active_mm, mm, tsk); + preempt_enable_rt(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); -- cgit v1.2.3 From 116708be4e07c6838ceea9dc9c9fb40a5183feca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Jun 2011 17:05:09 +0200 Subject: fs-block-rt-support.patch Signed-off-by: Thomas Gleixner --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 8a2074a111ea..34d4fb831ee5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -194,7 +194,7 @@ EXPORT_SYMBOL(blk_delay_queue); **/ void blk_start_queue(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); -- cgit v1.2.3 From 4283c70d1056849d784c656cd2c909abdd07f513 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 3 Jul 2009 08:44:12 -0500 Subject: fs: ntfs: disable interrupt only on !RT On Sat, 2007-10-27 at 11:44 +0200, Ingo 
Molnar wrote: > * Nick Piggin wrote: > > > > [10138.175796] [] show_trace+0x12/0x14 > > > [10138.180291] [] dump_stack+0x16/0x18 > > > [10138.184769] [] native_smp_call_function_mask+0x138/0x13d > > > [10138.191117] [] smp_call_function+0x1e/0x24 > > > [10138.196210] [] on_each_cpu+0x25/0x50 > > > [10138.200807] [] flush_tlb_all+0x1e/0x20 > > > [10138.205553] [] kmap_high+0x1b6/0x417 > > > [10138.210118] [] kmap+0x4d/0x4f > > > [10138.214102] [] ntfs_end_buffer_async_read+0x228/0x2f9 > > > [10138.220163] [] end_bio_bh_io_sync+0x26/0x3f > > > [10138.225352] [] bio_endio+0x42/0x6d > > > [10138.229769] [] __end_that_request_first+0x115/0x4ac > > > [10138.235682] [] end_that_request_chunk+0x8/0xa > > > [10138.241052] [] ide_end_request+0x55/0x10a > > > [10138.246058] [] ide_dma_intr+0x6f/0xac > > > [10138.250727] [] ide_intr+0x93/0x1e0 > > > [10138.255125] [] handle_IRQ_event+0x5c/0xc9 > > > > Looks like ntfs is kmap()ing from interrupt context. Should be using > > kmap_atomic instead, I think. > > it's not atomic interrupt context but irq thread context - and -rt > remaps kmap_atomic() to kmap() internally. Hm. Looking at the change to mm/bounce.c, perhaps I should do this instead? Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- fs/ntfs/aops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 39b08f5b8c76..f0de4b6b8bf3 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -143,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) recs = PAGE_CACHE_SIZE / rec_size; /* Should have been verified before we got here... */ BUG_ON(!recs); - local_irq_save(flags); + local_irq_save_nort(flags); kaddr = kmap_atomic(page); for (i = 0; i < recs; i++) post_read_mst_fixup((NTFS_RECORD*)(kaddr + i * rec_size), rec_size); kunmap_atomic(kaddr); - local_irq_restore(flags); + local_irq_restore_nort(flags); flush_dcache_page(page); if (likely(page_uptodate && !PageError(page))) SetPageUptodate(page); -- cgit v1.2.3 From 377786865cd73d725d4109847c3a19a56b99ca2b Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 11 Jul 2012 22:05:20 +0000 Subject: fs, jbd: pull your plug when waiting for space With an -rt kernel, and a heavy sync IO load, tasks can jam up on journal locks without unplugging, which can lead to terminal IO starvation. Unplug and schedule when waiting for space. Signed-off-by: Mike Galbraith Cc: Steven Rostedt Cc: Theodore Tso Link: http://lkml.kernel.org/r/1341812414.7370.73.camel@marge.simpson.net Signed-off-by: Thomas Gleixner --- fs/jbd/checkpoint.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 08c03044abdd..95debd71e5fa 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -129,6 +129,8 @@ void __log_wait_for_space(journal_t *journal) if (journal->j_flags & JFS_ABORT) return; spin_unlock(&journal->j_state_lock); + if (current->plug) + io_schedule(); mutex_lock(&journal->j_checkpoint_mutex); /* -- cgit v1.2.3 From da4a227449f74c1ce698c717869d39645ca6eb15 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 17 Feb 2014 17:30:03 +0100 Subject: fs: jbd2: pull your plug when waiting for space Two cps in parallel managed to stall the the ext4 fs. It seems that journal code is either waiting for locks or sleeping waiting for something to happen. 
This seems similar to what Mike observed on ext3, here is his description: |With an -rt kernel, and a heavy sync IO load, tasks can jam |up on journal locks without unplugging, which can lead to |terminal IO starvation. Unplug and schedule when waiting |for space. Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- fs/jbd2/checkpoint.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 988b32ed4c87..6f691c289c71 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -116,6 +116,8 @@ void __jbd2_log_wait_for_space(journal_t *journal) nblocks = jbd2_space_needed(journal); while (jbd2_log_space_left(journal) < nblocks) { write_unlock(&journal->j_state_lock); + if (current->plug) + io_schedule(); mutex_lock(&journal->j_checkpoint_mutex); /* -- cgit v1.2.3 From ca52dcf6f9e58141816d39c329258839b3b4ed74 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 16 Feb 2015 18:49:10 +0100 Subject: fs/aio: simple simple work |BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:768 |in_atomic(): 1, irqs_disabled(): 0, pid: 26, name: rcuos/2 |2 locks held by rcuos/2/26: | #0: (rcu_callback){.+.+..}, at: [] rcu_nocb_kthread+0x1e2/0x380 | #1: (rcu_read_lock_sched){.+.+..}, at: [] percpu_ref_kill_rcu+0xa6/0x1c0 |Preemption disabled at:[] rcu_nocb_kthread+0x263/0x380 |Call Trace: | [] dump_stack+0x4e/0x9c | [] __might_sleep+0xfb/0x170 | [] rt_spin_lock+0x24/0x70 | [] free_ioctx_users+0x30/0x130 | [] percpu_ref_kill_rcu+0x1b4/0x1c0 | [] rcu_nocb_kthread+0x263/0x380 | [] kthread+0xd6/0xf0 | [] ret_from_fork+0x7c/0xb0 replace this preempt_disable() friendly swork. Reported-By: Mike Galbraith Suggested-by: Benjamin LaHaise Signed-off-by: Sebastian Andrzej Siewior --- fs/aio.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 58caa7e5d81c..d8af0bfbb0e3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -110,7 +111,7 @@ struct kioctx { struct page **ring_pages; long nr_pages; - struct work_struct free_work; + struct swork_event free_work; /* * signals when all in-flight requests are done @@ -226,6 +227,7 @@ static int __init aio_setup(void) .mount = aio_mount, .kill_sb = kill_anon_super, }; + BUG_ON(swork_get()); aio_mnt = kern_mount(&aio_fs); if (IS_ERR(aio_mnt)) panic("Failed to create aio fs mount."); @@ -505,9 +507,9 @@ static int kiocb_cancel(struct kiocb *kiocb) return cancel(kiocb); } -static void free_ioctx(struct work_struct *work) +static void free_ioctx(struct swork_event *sev) { - struct kioctx *ctx = container_of(work, struct kioctx, free_work); + struct kioctx *ctx = container_of(sev, struct kioctx, free_work); pr_debug("freeing %p\n", ctx); @@ -526,8 +528,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref) if (ctx->requests_done) complete(ctx->requests_done); - INIT_WORK(&ctx->free_work, free_ioctx); - schedule_work(&ctx->free_work); + INIT_SWORK(&ctx->free_work, free_ioctx); + swork_queue(&ctx->free_work); } /* @@ -535,9 +537,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref) * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. 
*/ -static void free_ioctx_users(struct percpu_ref *ref) +static void free_ioctx_users_work(struct swork_event *sev) { - struct kioctx *ctx = container_of(ref, struct kioctx, users); + struct kioctx *ctx = container_of(sev, struct kioctx, free_work); struct kiocb *req; spin_lock_irq(&ctx->ctx_lock); @@ -556,6 +558,14 @@ static void free_ioctx_users(struct percpu_ref *ref) percpu_ref_put(&ctx->reqs); } +static void free_ioctx_users(struct percpu_ref *ref) +{ + struct kioctx *ctx = container_of(ref, struct kioctx, users); + + INIT_SWORK(&ctx->free_work, free_ioctx_users_work); + swork_queue(&ctx->free_work); +} + static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) { unsigned i, new_nr; -- cgit v1.2.3 From 7738a41efb23ff13bbd3d88c8dcfc291ba8f95fa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 May 2015 11:36:32 -0400 Subject: xfs: Disable percpu SB on PREEMPT_RT_FULL Running a test on a large CPU count box with xfs, I hit a live lock with the following backtraces on several CPUs: Call Trace: [] __const_udelay+0x28/0x30 [] xfs_icsb_lock_cntr+0x2a/0x40 [xfs] [] xfs_icsb_modify_counters+0x71/0x280 [xfs] [] xfs_trans_reserve+0x171/0x210 [xfs] [] xfs_create+0x24d/0x6f0 [xfs] [] ? avc_has_perm_flags+0xfb/0x1e0 [] xfs_vn_mknod+0xbb/0x1e0 [xfs] [] xfs_vn_create+0x13/0x20 [xfs] [] vfs_create+0xcd/0x130 [] do_last+0xb8f/0x1240 [] path_openat+0xc2/0x490 Looking at the code I see it was stuck at: STATIC void xfs_icsb_lock_cntr( xfs_icsb_cnts_t *icsbp) { while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) { ndelay(1000); } } In xfs_icsb_modify_counters() the code is fine. There's a preempt_disable() called when taking this bit spinlock and a preempt_enable() after it is released. The issue is that not all locations are protected by preempt_disable() when PREEMPT_RT is set. Namely the places that grab all CPU cntr locks. STATIC void xfs_icsb_lock_all_counters( xfs_mount_t *mp) { xfs_icsb_cnts_t *cntp; int i; for_each_online_cpu(i) { cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); xfs_icsb_lock_cntr(cntp); } } STATIC void xfs_icsb_disable_counter() { [...] xfs_icsb_lock_all_counters(mp); [...] xfs_icsb_unlock_all_counters(mp); } STATIC void xfs_icsb_balance_counter_locked() { [...] xfs_icsb_disable_counter(); [...] } STATIC void xfs_icsb_balance_counter( xfs_mount_t *mp, xfs_sb_field_t fields, int min_per_cpu) { spin_lock(&mp->m_sb_lock); xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu); spin_unlock(&mp->m_sb_lock); } Now, when PREEMPT_RT is not enabled, that spin_lock() disables preemption. But for PREEMPT_RT, it does not. Although with my test box I was not able to produce a task state of all tasks, but I'm assuming that some task called the xfs_icsb_lock_all_counters() and was preempted by an RT task and could not finish, causing all callers of that lock to block indefinitely. Dave Chinner has stated that the scalability of that code will probably be negated by PREEMPT_RT, and that it is probably best to just disable the code in question. Also, this code has been rewritten in newer kernels. 
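As a summary of the livelock described above, the pattern reduces to a bit spinlock whose holder must not be preemptible. The following annotated paraphrase of the code quoted in this message (not new XFS code) marks where that assumption breaks on PREEMPT_RT:

/* Waiter: spins until the holder clears the bit. */
static void xfs_icsb_lock_cntr(xfs_icsb_cnts_t *icsbp)
{
	while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags))
		ndelay(1000);
}

/*
 * Path A: xfs_icsb_modify_counters() wraps the bit lock in
 * preempt_disable()/preempt_enable(), so the holder cannot be preempted
 * and the waiters' ndelay() loop is bounded.  Safe on RT and mainline.
 *
 * Path B: xfs_icsb_balance_counter() reaches the same bit lock under
 * spin_lock(&mp->m_sb_lock).  On mainline that also disables preemption;
 * on PREEMPT_RT_FULL a spinlock is a sleeping lock, so the bit-lock
 * holder can be preempted by an RT task while holding the bit, and every
 * other CPU then spins forever in xfs_icsb_lock_cntr().  Hence the patch
 * simply drops HAVE_PERCPU_SB on RT instead of reworking the locking.
 */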
Link: http://lkml.kernel.org/r/20150504004844.GA21261@dastard Suggested-by: Dave Chinner Signed-off-by: Steven Rostedt --- fs/xfs/xfs_linux.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 6a51619d8690..430e7987d6ad 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -119,7 +119,7 @@ typedef __uint64_t __psunsigned_t; /* * Feature macros (disable/enable) */ -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT_FULL) #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ #else #undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ -- cgit v1.2.3 From b0b2d8b9fe336db30fad40960cdc60fdacdf7deb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Dec 2010 16:33:39 +0100 Subject: x86: Convert mce timer to hrtimer mce_timer is started in atomic contexts of cpu bringup. This results in might_sleep() warnings on RT. Convert mce_timer to a hrtimer to avoid this. Signed-off-by: Thomas Gleixner fold in: |From: Mike Galbraith |Date: Wed, 29 May 2013 13:52:13 +0200 |Subject: [PATCH] x86/mce: fix mce timer interval | |Seems mce timer fire at the wrong frequency in -rt kernels since roughly |forever due to 32 bit overflow. 3.8-rt is also missing a multiplier. | |Add missing us -> ns conversion and 32 bit overflow prevention. | |Signed-off-by: Mike Galbraith |[bigeasy: use ULL instead of u64 cast] |Signed-off-by: Sebastian Andrzej Siewior --- arch/x86/kernel/cpu/mcheck/mce.c | 58 +++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 61a9668cebfd..5776fdc95b9a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -1266,7 +1267,7 @@ void mce_log_therm_throt_event(__u64 status) static unsigned long check_interval = 5 * 60; /* 5 minutes */ static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ -static DEFINE_PER_CPU(struct timer_list, mce_timer); +static DEFINE_PER_CPU(struct hrtimer, mce_timer); static unsigned long mce_adjust_timer_default(unsigned long interval) { @@ -1283,14 +1284,11 @@ static int cmc_error_seen(void) return test_and_clear_bit(0, v); } -static void mce_timer_fn(unsigned long data) +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer) { - struct timer_list *t = this_cpu_ptr(&mce_timer); unsigned long iv; int notify; - WARN_ON(smp_processor_id() != data); - if (mce_available(this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks)); @@ -1313,9 +1311,11 @@ static void mce_timer_fn(unsigned long data) __this_cpu_write(mce_next_interval, iv); /* Might have become 0 after CMCI storm subsided */ if (iv) { - t->expires = jiffies + iv; - add_timer_on(t, smp_processor_id()); + hrtimer_forward_now(timer, ns_to_ktime( + jiffies_to_usecs(iv) * 1000ULL)); + return HRTIMER_RESTART; } + return HRTIMER_NORESTART; } /* @@ -1323,28 +1323,37 @@ static void mce_timer_fn(unsigned long data) */ void mce_timer_kick(unsigned long interval) { - struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned long when = jiffies + interval; + struct hrtimer *t = this_cpu_ptr(&mce_timer); unsigned long iv = __this_cpu_read(mce_next_interval); - if (timer_pending(t)) { - if (time_before(when, t->expires)) - mod_timer_pinned(t, when); + if (hrtimer_active(t)) { + s64 exp; + s64 intv_us; + + 
intv_us = jiffies_to_usecs(interval); + exp = ktime_to_us(hrtimer_expires_remaining(t)); + if (intv_us < exp) { + hrtimer_cancel(t); + hrtimer_start_range_ns(t, + ns_to_ktime(intv_us * 1000), + 0, HRTIMER_MODE_REL_PINNED); + } } else { - t->expires = round_jiffies(when); - add_timer_on(t, smp_processor_id()); + hrtimer_start_range_ns(t, + ns_to_ktime(jiffies_to_usecs(interval) * 1000ULL), + 0, HRTIMER_MODE_REL_PINNED); } if (interval < iv) __this_cpu_write(mce_next_interval, interval); } -/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +/* Must not be called in IRQ context where hrtimer_cancel() can deadlock */ static void mce_timer_delete_all(void) { int cpu; for_each_online_cpu(cpu) - del_timer_sync(&per_cpu(mce_timer, cpu)); + hrtimer_cancel(&per_cpu(mce_timer, cpu)); } static void mce_do_trigger(struct work_struct *work) @@ -1644,7 +1653,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) } } -static void mce_start_timer(unsigned int cpu, struct timer_list *t) +static void mce_start_timer(unsigned int cpu, struct hrtimer *t) { unsigned long iv = check_interval * HZ; @@ -1653,16 +1662,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t) per_cpu(mce_next_interval, cpu) = iv; - t->expires = round_jiffies(jiffies + iv); - add_timer_on(t, cpu); + hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL), + 0, HRTIMER_MODE_REL_PINNED); } static void __mcheck_cpu_init_timer(void) { - struct timer_list *t = this_cpu_ptr(&mce_timer); + struct hrtimer *t = this_cpu_ptr(&mce_timer); unsigned int cpu = smp_processor_id(); - setup_timer(t, mce_timer_fn, cpu); + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + t->function = mce_timer_fn; mce_start_timer(cpu, t); } @@ -2339,6 +2349,8 @@ static void mce_disable_cpu(void *h) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; + hrtimer_cancel(this_cpu_ptr(&mce_timer)); + if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < mca_cfg.banks; i++) { @@ -2365,6 +2377,7 @@ static void mce_reenable_cpu(void *h) if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } + __mcheck_cpu_init_timer(); } /* Get notified when a cpu comes on/off. Be hotplug friendly. 
*/ @@ -2372,7 +2385,6 @@ static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: @@ -2392,11 +2404,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DOWN_PREPARE: smp_call_function_single(cpu, mce_disable_cpu, &action, 1); - del_timer_sync(t); break; case CPU_DOWN_FAILED: smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); - mce_start_timer(cpu, t); break; } -- cgit v1.2.3 From 0606eb0f2cd229fd2ebfcbbee7ee1506485c457d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2015 15:20:37 +0100 Subject: x86/mce: use swait queue for mce wakeups We had a customer report a lockup on a 3.0-rt kernel that had the following backtrace: [ffff88107fca3e80] rt_spin_lock_slowlock at ffffffff81499113 [ffff88107fca3f40] rt_spin_lock at ffffffff81499a56 [ffff88107fca3f50] __wake_up at ffffffff81043379 [ffff88107fca3f80] mce_notify_irq at ffffffff81017328 [ffff88107fca3f90] intel_threshold_interrupt at ffffffff81019508 [ffff88107fca3fa0] smp_threshold_interrupt at ffffffff81019fc1 [ffff88107fca3fb0] threshold_interrupt at ffffffff814a1853 It actually bugged because the lock was taken by the same owner that already had that lock. What happened was the thread that was setting itself on a wait queue had the lock when an MCE triggered. The MCE interrupt does a wake up on its wait list and grabs the same lock. NOTE: THIS IS NOT A BUG ON MAINLINE Sorry for yelling, but as I Cc'd mainline maintainers I want them to know that this is an PREEMPT_RT bug only. I only Cc'd them for advice. On PREEMPT_RT the wait queue locks are converted from normal "spin_locks" into an rt_mutex (see the rt_spin_lock_slowlock above). These are not to be taken by hard interrupt context. This usually isn't a problem as most all interrupts in PREEMPT_RT are converted into schedulable threads. Unfortunately that's not the case with the MCE irq. As wait queue locks are notorious for long hold times, we can not convert them to raw_spin_locks without causing issues with -rt. But Thomas has created a "simple-wait" structure that uses raw spin locks which may have been a good fit. Unfortunately, wait queues are not the only issue, as the mce_notify_irq also does a schedule_work(), which grabs the workqueue spin locks that have the exact same issue. Thus, this patch I'm proposing is to move the actual work of the MCE interrupt into a helper thread that gets woken up on the MCE interrupt and does the work in a schedulable context. NOTE: THIS PATCH ONLY CHANGES THE BEHAVIOR WHEN PREEMPT_RT IS SET Oops, sorry for yelling again, but I want to stress that I keep the same behavior of mainline when PREEMPT_RT is not set. Thus, this only changes the MCE behavior when PREEMPT_RT is configured. 
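The shape of the fix, pulled out of the diff below so the flattened hunks are easier to follow (a sketch of the PREEMPT_RT_FULL path only; the work-simple header name is an assumption, since the #include line is not visible in this rendering):

#include <linux/work-simple.h>	/* -rt "simple work" framework; header name assumed */

static struct swork_event notify_work;
static bool notify_work_ready __read_mostly;

/* Runs in kthread context: sleeping locks, wake_up_interruptible() and
 * schedule_work() are all legal here. */
static void __mce_notify_work(struct swork_event *event)
{
	/* ... wake /dev/mcelog pollers, kick the trigger work, ratelimit ... */
}

static int mce_notify_work_init(void)
{
	int err = swork_get();		/* bring up the shared worker thread */

	if (err)
		return err;
	INIT_SWORK(&notify_work, __mce_notify_work);
	notify_work_ready = true;
	return 0;
}

/* Called from the MCE interrupt: no sleeping locks allowed, so just queue
 * the event for the worker and return. */
static void mce_notify_work(void)
{
	if (notify_work_ready)
		swork_queue(&notify_work);
}

On !PREEMPT_RT_FULL builds the wrapper calls __mce_notify_work() directly, preserving mainline behavior.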
Signed-off-by: Steven Rostedt [bigeasy@linutronix: make mce_notify_work() a proper prototype, use kthread_run()] Signed-off-by: Sebastian Andrzej Siewior [wagi: use work-simple framework to defer work to a kthread] Signed-off-by: Daniel Wagner --- arch/x86/kernel/cpu/mcheck/mce.c | 68 +++++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5776fdc95b9a..d7f1c8316f94 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1363,6 +1364,56 @@ static void mce_do_trigger(struct work_struct *work) static DECLARE_WORK(mce_trigger_work, mce_do_trigger); +static void __mce_notify_work(struct swork_event *event) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); + + /* + * There is no risk of missing notifications because + * work_pending is always cleared before the function is + * executed. + */ + if (mce_helper[0] && !work_pending(&mce_trigger_work)) + schedule_work(&mce_trigger_work); + + if (__ratelimit(&ratelimit)) + pr_info(HW_ERR "Machine check events logged\n"); +} + +#ifdef CONFIG_PREEMPT_RT_FULL +static bool notify_work_ready __read_mostly; +static struct swork_event notify_work; + +static int mce_notify_work_init(void) +{ + int err; + + err = swork_get(); + if (err) + return err; + + INIT_SWORK(¬ify_work, __mce_notify_work); + notify_work_ready = true; + return 0; +} + +static void mce_notify_work(void) +{ + if (notify_work_ready) + swork_queue(¬ify_work); +} +#else +static void mce_notify_work(void) +{ + __mce_notify_work(NULL); +} +static inline int mce_notify_work_init(void) { return 0; } +#endif + /* * Notify the user(s) about new machine check events. * Can be called from interrupt context, but not from machine check/NMI @@ -1370,19 +1421,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger); */ int mce_notify_irq(void) { - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - if (test_and_clear_bit(0, &mce_need_notify)) { - /* wake processes polling /dev/mcelog */ - wake_up_interruptible(&mce_chrdev_wait); - - if (mce_helper[0]) - schedule_work(&mce_trigger_work); - - if (__ratelimit(&ratelimit)) - pr_info(HW_ERR "Machine check events logged\n"); - + mce_notify_work(); return 1; } return 0; @@ -2445,6 +2485,10 @@ static __init int mcheck_init_device(void) goto err_out; } + err = mce_notify_work_init(); + if (err) + goto err_out; + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { err = -ENOMEM; goto err_out; -- cgit v1.2.3 From 9d6869861047bf6c1b1380c89d045799d1cb8640 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Dec 2010 14:25:18 +0100 Subject: x86: stackprotector: Avoid random pool on rt CPU bringup calls into the random pool to initialize the stack canary. During boot that works nicely even on RT as the might sleep checks are disabled. During CPU hotplug the might sleep checks trigger. Making the locks in random raw is a major PITA, so avoid the call on RT is the only sensible solution. This is basically the same randomness which we get during boot where the random pool has no entropy and we rely on the TSC randomnness. 
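Condensed, the RT behaviour the hunk below introduces is: skip the random pool, keep the TSC mixing. A compressed view of the function (arch-specific tail elided, paraphrased rather than quoted):

static __always_inline void boot_init_stack_canary(void)
{
	u64 uninitialized_var(canary);
	u64 tsc;

#ifndef CONFIG_PREEMPT_RT_FULL
	/* Mainline: real entropy; may hit sleeping locks during RT hotplug. */
	get_random_bytes(&canary, sizeof(canary));
#endif
	/* RT (and early boot): settle for TSC-based randomness on top of
	 * whatever happens to be in the uninitialized variable. */
	tsc = __native_read_tsc();
	canary += tsc + (tsc << 32UL);

	/* ... the rest of the function stores the canary for the arch ... */
}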
Reported-by: Carsten Emde Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/stackprotector.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 6a998598f172..64fb5cbe54fa 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -57,7 +57,7 @@ */ static __always_inline void boot_init_stack_canary(void) { - u64 canary; + u64 uninitialized_var(canary); u64 tsc; #ifdef CONFIG_X86_64 @@ -68,8 +68,16 @@ static __always_inline void boot_init_stack_canary(void) * of randomness. The TSC only matters for very early init, * there it already has some randomness on most systems. Later * on during the bootup the random pool has true entropy too. + * + * For preempt-rt we need to weaken the randomness a bit, as + * we can't call into the random generator from atomic context + * due to locking constraints. We just leave canary + * uninitialized and use the TSC based randomness on top of + * it. */ +#ifndef CONFIG_PREEMPT_RT_FULL get_random_bytes(&canary, sizeof(canary)); +#endif tsc = __native_read_tsc(); canary += tsc + (tsc << 32UL); -- cgit v1.2.3 From 62dc40e8f37475172e0a5b3c50e1170190f53fb6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Jul 2009 02:21:32 +0200 Subject: x86: Use generic rwsem_spinlocks on -rt Simplifies the separation of anon_rw_semaphores and rw_semaphores for -rt. Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3635fff7b32d..6e9d2cc9dcc4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -197,8 +197,11 @@ config ARCH_MAY_HAVE_PC_FDC def_bool y depends on ISA_DMA_API +config RWSEM_GENERIC_SPINLOCK + def_bool PREEMPT_RT_FULL + config RWSEM_XCHGADD_ALGORITHM - def_bool y + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL config GENERIC_CALIBRATE_DELAY def_bool y -- cgit v1.2.3 From a59d628085d9bf7c478954a28f0f700b468ccb21 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 2 Nov 2014 08:31:37 +0100 Subject: x86: UV: raw_spinlock conversion Shrug. Lots of hobbyists have a beast in their basement, right? Cc: stable-rt@vger.kernel.org Signed-off-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior --- arch/x86/include/asm/uv/uv_bau.h | 14 +++++++------- arch/x86/include/asm/uv/uv_hub.h | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- arch/x86/platform/uv/tlb_uv.c | 26 +++++++++++++------------- arch/x86/platform/uv/uv_time.c | 21 +++++++++++++-------- 5 files changed, 35 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 2d60a7813dfe..ddc4c9e3d09d 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -615,9 +615,9 @@ struct bau_control { cycles_t send_message; cycles_t period_end; cycles_t period_time; - spinlock_t uvhub_lock; - spinlock_t queue_lock; - spinlock_t disable_lock; + raw_spinlock_t uvhub_lock; + raw_spinlock_t queue_lock; + raw_spinlock_t disable_lock; /* tunables */ int max_concurr; int max_concurr_const; @@ -776,15 +776,15 @@ static inline int atom_asr(short i, struct atomic_short *v) * to be lowered below the current 'v'. atomic_add_unless can only stop * on equal. 
*/ -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u) { - spin_lock(lock); + raw_spin_lock(lock); if (atomic_read(v) >= u) { - spin_unlock(lock); + raw_spin_unlock(lock); return 0; } atomic_inc(v); - spin_unlock(lock); + raw_spin_unlock(lock); return 1; } diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index a00ad8f2a657..c2729abe02bc 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -492,7 +492,7 @@ struct uv_blade_info { unsigned short nr_online_cpus; unsigned short pnode; short memory_nid; - spinlock_t nmi_lock; /* obsolete, see uv_hub_nmi */ + raw_spinlock_t nmi_lock; /* obsolete, see uv_hub_nmi */ unsigned long nmi_count; /* obsolete, see uv_hub_nmi */ }; extern struct uv_blade_info *uv_blade_info; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 8e9dcfd630e4..2cfedcae31b9 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -918,7 +918,7 @@ void __init uv_system_init(void) uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; - spin_lock_init(&uv_blade_info[blade].nmi_lock); + raw_spin_lock_init(&uv_blade_info[blade].nmi_lock); min_pnode = min(pnode, min_pnode); max_pnode = max(pnode, max_pnode); blade++; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3968d67d366b..7d444650bdd4 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -714,9 +714,9 @@ static void destination_plugged(struct bau_desc *bau_desc, quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); + raw_spin_lock(&hmaster->queue_lock); reset_with_ipi(&bau_desc->distribution, bcp); - spin_unlock(&hmaster->queue_lock); + raw_spin_unlock(&hmaster->queue_lock); end_uvhub_quiesce(hmaster); @@ -736,9 +736,9 @@ static void destination_timeout(struct bau_desc *bau_desc, quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); + raw_spin_lock(&hmaster->queue_lock); reset_with_ipi(&bau_desc->distribution, bcp); - spin_unlock(&hmaster->queue_lock); + raw_spin_unlock(&hmaster->queue_lock); end_uvhub_quiesce(hmaster); @@ -759,7 +759,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat) cycles_t tm1; hmaster = bcp->uvhub_master; - spin_lock(&hmaster->disable_lock); + raw_spin_lock(&hmaster->disable_lock); if (!bcp->baudisabled) { stat->s_bau_disabled++; tm1 = get_cycles(); @@ -772,7 +772,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat) } } } - spin_unlock(&hmaster->disable_lock); + raw_spin_unlock(&hmaster->disable_lock); } static void count_max_concurr(int stat, struct bau_control *bcp, @@ -835,7 +835,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2, */ static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat) { - spinlock_t *lock = &hmaster->uvhub_lock; + raw_spinlock_t *lock = &hmaster->uvhub_lock; atomic_t *v; v = &hmaster->active_descriptor_count; @@ -968,7 +968,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) struct bau_control *hmaster; hmaster = bcp->uvhub_master; - spin_lock(&hmaster->disable_lock); + raw_spin_lock(&hmaster->disable_lock); if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) { stat->s_bau_reenabled++; for_each_present_cpu(tcpu) { @@ -980,10 +980,10 @@ static int 
check_enable(struct bau_control *bcp, struct ptc_stats *stat) tbcp->period_giveups = 0; } } - spin_unlock(&hmaster->disable_lock); + raw_spin_unlock(&hmaster->disable_lock); return 0; } - spin_unlock(&hmaster->disable_lock); + raw_spin_unlock(&hmaster->disable_lock); return -1; } @@ -1899,9 +1899,9 @@ static void __init init_per_cpu_tunables(void) bcp->cong_reps = congested_reps; bcp->disabled_period = sec_2_cycles(disabled_period); bcp->giveup_limit = giveup_limit; - spin_lock_init(&bcp->queue_lock); - spin_lock_init(&bcp->uvhub_lock); - spin_lock_init(&bcp->disable_lock); + raw_spin_lock_init(&bcp->queue_lock); + raw_spin_lock_init(&bcp->uvhub_lock); + raw_spin_lock_init(&bcp->disable_lock); } } diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index a244237f3cfa..a718fe0d2e73 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); /* There is one of these allocated per node */ struct uv_rtc_timer_head { - spinlock_t lock; + raw_spinlock_t lock; /* next cpu waiting for timer, local node relative: */ int next_cpu; /* number of cpus on this node: */ @@ -178,7 +178,7 @@ static __init int uv_rtc_allocate_timers(void) uv_rtc_deallocate_timers(); return -ENOMEM; } - spin_lock_init(&head->lock); + raw_spin_lock_init(&head->lock); head->ncpus = uv_blade_nr_possible_cpus(bid); head->next_cpu = -1; blade_info[bid] = head; @@ -232,7 +232,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) unsigned long flags; int next_cpu; - spin_lock_irqsave(&head->lock, flags); + raw_spin_lock_irqsave(&head->lock, flags); next_cpu = head->next_cpu; *t = expires; @@ -244,12 +244,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires) if (uv_setup_intr(cpu, expires)) { *t = ULLONG_MAX; uv_rtc_find_next_timer(head, pnode); - spin_unlock_irqrestore(&head->lock, flags); + raw_spin_unlock_irqrestore(&head->lock, flags); return -ETIME; } } - spin_unlock_irqrestore(&head->lock, flags); + raw_spin_unlock_irqrestore(&head->lock, flags); return 0; } @@ -268,7 +268,7 @@ static int uv_rtc_unset_timer(int cpu, int force) unsigned long flags; int rc = 0; - spin_lock_irqsave(&head->lock, flags); + raw_spin_lock_irqsave(&head->lock, flags); if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force) rc = 1; @@ -280,7 +280,7 @@ static int uv_rtc_unset_timer(int cpu, int force) uv_rtc_find_next_timer(head, pnode); } - spin_unlock_irqrestore(&head->lock, flags); + raw_spin_unlock_irqrestore(&head->lock, flags); return rc; } @@ -300,13 +300,18 @@ static int uv_rtc_unset_timer(int cpu, int force) static cycle_t uv_read_rtc(struct clocksource *cs) { unsigned long offset; + cycle_t cycles; + preempt_disable(); if (uv_get_min_hub_revision_id() == 1) offset = 0; else offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; - return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); + cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset); + preempt_enable(); + + return cycles; } /* -- cgit v1.2.3 From 5a221c76296f7a39cdca6742928c8acd37f5c787 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 17 Feb 2015 09:37:44 +0100 Subject: thermal: Defer thermal wakups to threads On RT the spin lock in pkg_temp_thermal_platfrom_thermal_notify will call schedule while we run in irq context. 
[] dump_stack+0x4e/0x8f [] __schedule_bug+0xa6/0xb4 [] __schedule+0x5b4/0x700 [] schedule+0x2a/0x90 [] rt_spin_lock_slowlock+0xe5/0x2d0 [] rt_spin_lock+0x25/0x30 [] pkg_temp_thermal_platform_thermal_notify+0x45/0x134 [x86_pkg_temp_thermal] [] ? therm_throt_process+0x1b/0x160 [] intel_thermal_interrupt+0x211/0x250 [] smp_thermal_interrupt+0x21/0x40 [] thermal_interrupt+0x6d/0x80 Let's defer the work to a kthread. Signed-off-by: Daniel Wagner [bigeasy: reoder init/denit position. TODO: flush swork on exit] Signed-off-by: Sebastian Andrzej Siewior --- drivers/thermal/x86_pkg_temp_thermal.c | 50 ++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c index 9ea3d9d49ffc..e3c2663e0b1f 100644 --- a/drivers/thermal/x86_pkg_temp_thermal.c +++ b/drivers/thermal/x86_pkg_temp_thermal.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -352,7 +353,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work) } } -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val) +static void platform_thermal_notify_work(struct swork_event *event) { unsigned long flags; int cpu = smp_processor_id(); @@ -369,7 +370,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val) pkg_work_scheduled[phy_id]) { disable_pkg_thres_interrupt(); spin_unlock_irqrestore(&pkg_work_lock, flags); - return -EINVAL; + return; } pkg_work_scheduled[phy_id] = 1; spin_unlock_irqrestore(&pkg_work_lock, flags); @@ -378,9 +379,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val) schedule_delayed_work_on(cpu, &per_cpu(pkg_temp_thermal_threshold_work, cpu), msecs_to_jiffies(notify_delay_ms)); +} + +#ifdef CONFIG_PREEMPT_RT_FULL +static struct swork_event notify_work; + +static int thermal_notify_work_init(void) +{ + int err; + + err = swork_get(); + if (err) + return err; + + INIT_SWORK(¬ify_work, platform_thermal_notify_work); return 0; } +static void thermal_notify_work_cleanup(void) +{ + swork_put(); +} + +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val) +{ + swork_queue(¬ify_work); + return 0; +} + +#else /* !CONFIG_PREEMPT_RT_FULL */ + +static int thermal_notify_work_init(void) { return 0; } + +static int thermal_notify_work_cleanup(void) { } + +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val) +{ + platform_thermal_notify_work(NULL); + + return 0; +} +#endif /* CONFIG_PREEMPT_RT_FULL */ + static int find_siblings_cpu(int cpu) { int i; @@ -584,6 +624,9 @@ static int __init pkg_temp_thermal_init(void) if (!x86_match_cpu(pkg_temp_thermal_ids)) return -ENODEV; + if (!thermal_notify_work_init()) + return -ENODEV; + spin_lock_init(&pkg_work_lock); platform_thermal_package_notify = pkg_temp_thermal_platform_thermal_notify; @@ -608,7 +651,7 @@ err_ret: kfree(pkg_work_scheduled); platform_thermal_package_notify = NULL; platform_thermal_package_rate_control = NULL; - + thermal_notify_work_cleanup(); return -ENODEV; } @@ -633,6 +676,7 @@ static void __exit pkg_temp_thermal_exit(void) mutex_unlock(&phy_dev_list_mutex); platform_thermal_package_notify = NULL; platform_thermal_package_rate_control = NULL; + thermal_notify_work_cleanup(); for_each_online_cpu(i) cancel_delayed_work_sync( &per_cpu(pkg_temp_thermal_threshold_work, i)); -- cgit v1.2.3 From aa3057d90a021ce3f0be73d6716d5419e2d6b535 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Jul 2011 16:35:35 +0200 Subject: epoll.patch Signed-off-by: 
Thomas Gleixner --- fs/eventpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7bcfff900f05..3a1e81b80080 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -505,12 +505,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) */ static void ep_poll_safewake(wait_queue_head_t *wq) { - int this_cpu = get_cpu(); + int this_cpu = get_cpu_light(); ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); - put_cpu(); + put_cpu_light(); } static void ep_remove_wait_queue(struct eppoll_entry *pwq) -- cgit v1.2.3 From bab05da069daa2b547ffdd6c1d59818dd9a78bb5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Jul 2011 11:39:36 +0200 Subject: mm-vmalloc.patch Signed-off-by: Thomas Gleixner --- mm/vmalloc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 90520af7f186..43122b6c46ed 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -798,7 +798,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) struct vmap_block *vb; struct vmap_area *va; unsigned long vb_idx; - int node, err; + int node, err, cpu; node = numa_node_id(); @@ -836,11 +836,12 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) BUG_ON(err); radix_tree_preload_end(); - vbq = &get_cpu_var(vmap_block_queue); + cpu = get_cpu_light(); + vbq = &__get_cpu_var(vmap_block_queue); spin_lock(&vbq->lock); list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); - put_cpu_var(vmap_block_queue); + put_cpu_light(); return vb; } @@ -908,6 +909,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) struct vmap_block *vb; unsigned long addr = 0; unsigned int order; + int cpu = 0; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -923,7 +925,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) again: rcu_read_lock(); - vbq = &get_cpu_var(vmap_block_queue); + cpu = get_cpu_light(); + vbq = &__get_cpu_var(vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { int i; @@ -947,7 +950,7 @@ next: spin_unlock(&vb->lock); } - put_cpu_var(vmap_block_queue); + put_cpu_light(); rcu_read_unlock(); if (!addr) { -- cgit v1.2.3 From e84d684065e3d7e00b6b364b79493cd6de535538 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 9 Apr 2014 10:37:23 +0200 Subject: block: mq: use cpu_light() there is a might sleep splat because get_cpu() disables preemption and later we grab a lock. As a workaround for this we use get_cpu_light() and an additional lock to prevent taking the same ctx. There is a lock member in the ctx already but there some functions which do ++ on the member and this works with irq off but on RT we would need the extra lock. 
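As a rough sketch of the resulting pattern (illustrative only; the helper names here are made up, the real hunks follow below): get_cpu_light() pins the task to its CPU without disabling preemption on RT, and the extra per-ctx spinlock keeps two tasks off the same software queue:

	/* Sketch, not part of the patch: acquire/release a ctx on RT. */
	static struct blk_mq_ctx *ctx_acquire(struct request_queue *q)
	{
		struct blk_mq_ctx *ctx;

		ctx = per_cpu_ptr(q->queue_ctx, get_cpu_light());
		spin_lock(&ctx->cpu_lock);	/* the lock added by this patch */
		return ctx;
	}

	static void ctx_release(struct blk_mq_ctx *ctx)
	{
		spin_unlock(&ctx->cpu_lock);
		put_cpu_light();
	}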
Signed-off-by: Sebastian Andrzej Siewior --- block/blk-mq.c | 4 ++++ block/blk-mq.h | 17 ++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 849479debac3..28e16354014a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1252,7 +1252,9 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) if (list_empty(&plug->mq_list)) trace_block_plug(q); else if (request_count >= BLK_MAX_REQUEST_COUNT) { + spin_unlock(&data.ctx->cpu_lock); blk_flush_plug_list(plug, false); + spin_lock(&data.ctx->cpu_lock); trace_block_plug(q); } list_add_tail(&rq->queuelist, &plug->mq_list); @@ -1448,6 +1450,7 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); + __blk_mq_put_ctx(ctx); if (list_empty(&tmp)) return NOTIFY_OK; @@ -1657,6 +1660,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, memset(__ctx, 0, sizeof(*__ctx)); __ctx->cpu = i; spin_lock_init(&__ctx->lock); + spin_lock_init(&__ctx->cpu_lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; diff --git a/block/blk-mq.h b/block/blk-mq.h index d567d5283ffa..d0d4780ddfb6 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -9,6 +9,7 @@ struct blk_mq_ctx { struct list_head rq_list; } ____cacheline_aligned_in_smp; + spinlock_t cpu_lock; unsigned int cpu; unsigned int index_hw; @@ -73,7 +74,11 @@ struct blk_align_bitmap { static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { - return per_cpu_ptr(q->queue_ctx, cpu); + struct blk_mq_ctx *ctx; + + ctx = per_cpu_ptr(q->queue_ctx, cpu); + spin_lock(&ctx->cpu_lock); + return ctx; } /* @@ -84,12 +89,18 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, */ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { - return __blk_mq_get_ctx(q, get_cpu()); + return __blk_mq_get_ctx(q, get_cpu_light()); +} + +static void __blk_mq_put_ctx(struct blk_mq_ctx *ctx) +{ + spin_unlock(&ctx->cpu_lock); } static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) { - put_cpu(); + __blk_mq_put_ctx(ctx); + put_cpu_light(); } struct blk_mq_alloc_data { -- cgit v1.2.3 From cef8e8f3e24a6fede549816242bb5cc03620687f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 May 2015 00:23:48 +0200 Subject: block/mq: do not invoke preempt_disable() preempt_disable() and get_cpu() don't play well together with the sleeping locks it tries to allocate later. It seems to be enough to replace it with get_cpu_light() and migrate_disable(). 
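In sketch form the substitution looks like this (condensed from the hunks below): the task stays on its CPU, but preemption remains enabled so the sleeping locks taken further down do not trigger the might-sleep check:

	migrate_disable();			/* was: preempt_disable() */
	blk_mq_run_hw_queue(hctx, async);	/* may take sleeping locks on RT */
	migrate_enable();			/* was: preempt_enable() */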
Signed-off-by: Sebastian Andrzej Siewior --- block/blk-mq.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 28e16354014a..8e93eac87101 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -331,7 +331,7 @@ static void blk_mq_ipi_complete_request(struct request *rq) return; } - cpu = get_cpu(); + cpu = get_cpu_light(); if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) shared = cpus_share_cache(cpu, ctx->cpu); @@ -343,7 +343,7 @@ static void blk_mq_ipi_complete_request(struct request *rq) } else { rq->q->softirq_done_fn(rq); } - put_cpu(); + put_cpu_light(); } void __blk_mq_complete_request(struct request *rq) @@ -814,9 +814,9 @@ void blk_mq_run_queues(struct request_queue *q, bool async) test_bit(BLK_MQ_S_STOPPED, &hctx->state)) continue; - preempt_disable(); + migrate_disable(); blk_mq_run_hw_queue(hctx, async); - preempt_enable(); + migrate_enable(); } } EXPORT_SYMBOL(blk_mq_run_queues); @@ -843,9 +843,9 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - preempt_disable(); + migrate_disable(); blk_mq_run_hw_queue(hctx, false); - preempt_enable(); + migrate_enable(); } EXPORT_SYMBOL(blk_mq_start_hw_queue); @@ -870,9 +870,9 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) continue; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - preempt_disable(); + migrate_disable(); blk_mq_run_hw_queue(hctx, async); - preempt_enable(); + migrate_enable(); } } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); -- cgit v1.2.3 From db8c877fa12b8c19be536f3b3d61c47e86086f8c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 29 Jan 2015 15:10:08 +0100 Subject: block/mq: don't complete requests via IPI The IPI runs in hardirq context and there are sleeping locks. This patch moves the completion into a workqueue. 
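A condensed sketch of the deferral (the complete hunks follow): on RT the completion is queued as a work item on the request's ctx CPU instead of being invoked from the IPI in hardirq context, where softirq_done_fn() could not take sleeping locks:

	#ifdef CONFIG_PREEMPT_RT_FULL
		/* rq->work invokes rq->q->softirq_done_fn(rq) in process context */
		schedule_work_on(ctx->cpu, &rq->work);
	#else
		rq->csd.func = __blk_mq_complete_request_remote;
		rq->csd.info = rq;
		smp_call_function_single_async(ctx->cpu, &rq->csd);
	#endif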
Signed-off-by: Sebastian Andrzej Siewior --- block/blk-core.c | 3 +++ block/blk-mq.c | 20 ++++++++++++++++++++ include/linux/blk-mq.h | 1 + include/linux/blkdev.h | 1 + 4 files changed, 25 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 34d4fb831ee5..f78ea53fc77a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -100,6 +100,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq) INIT_LIST_HEAD(&rq->queuelist); INIT_LIST_HEAD(&rq->timeout_list); +#if CONFIG_PREEMPT_RT_FULL + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work); +#endif rq->cpu = -1; rq->q = q; rq->__sector = (sector_t) -1; diff --git a/block/blk-mq.c b/block/blk-mq.c index 8e93eac87101..516a0c019559 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -194,6 +194,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->resid_len = 0; rq->sense = NULL; +#ifdef CONFIG_PREEMPT_RT_FULL + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work); +#endif INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; @@ -313,6 +316,17 @@ void blk_mq_end_request(struct request *rq, int error) } EXPORT_SYMBOL(blk_mq_end_request); +#ifdef CONFIG_PREEMPT_RT_FULL + +void __blk_mq_complete_request_remote_work(struct work_struct *work) +{ + struct request *rq = container_of(work, struct request, work); + + rq->q->softirq_done_fn(rq); +} + +#else + static void __blk_mq_complete_request_remote(void *data) { struct request *rq = data; @@ -320,6 +334,8 @@ static void __blk_mq_complete_request_remote(void *data) rq->q->softirq_done_fn(rq); } +#endif + static void blk_mq_ipi_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; @@ -336,10 +352,14 @@ static void blk_mq_ipi_complete_request(struct request *rq) shared = cpus_share_cache(cpu, ctx->cpu); if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { +#ifdef CONFIG_PREEMPT_RT_FULL + schedule_work_on(ctx->cpu, &rq->work); +#else rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; smp_call_function_single_async(ctx->cpu, &rq->csd); +#endif } else { rq->q->softirq_done_fn(rq); } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c9be1589415a..81a5ea3a7823 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -169,6 +169,7 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); +void __blk_mq_complete_request_remote_work(struct work_struct *work); void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, int error); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aac0f9ea952a..c7d201802f9a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -101,6 +101,7 @@ struct request { struct list_head queuelist; union { struct call_single_data csd; + struct work_struct work; unsigned long fifo_time; }; -- cgit v1.2.3 From 84b31d6c81bbe14a143248288c992c2f917d0ea6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 Mar 2012 20:51:03 +0100 Subject: rt: Introduce cpu_chill() Retry loops on RT might loop forever when the modifying side was preempted. Add cpu_chill() to replace cpu_relax(). cpu_chill() defaults to cpu_relax() for non RT. On RT it puts the looping task to sleep for a tick so the preempted task can make progress. 
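A typical call site, shown here only as a sketch with a made-up lock name, is a trylock retry loop that previously spun on cpu_relax():

	retry:
		if (!spin_trylock(&victim_lock)) {	/* 'victim_lock' is hypothetical */
			cpu_chill();	/* sleeps one tick on RT, cpu_relax() otherwise */
			goto retry;
		}
		/* ... modify the protected data ... */
		spin_unlock(&victim_lock);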
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- include/linux/delay.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/delay.h b/include/linux/delay.h index a6ecb34cf547..e23a7c021eeb 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds) msleep(seconds * 1000); } +#ifdef CONFIG_PREEMPT_RT_FULL +# define cpu_chill() msleep(1) +#else +# define cpu_chill() cpu_relax() +#endif + #endif /* defined(_LINUX_DELAY_H) */ -- cgit v1.2.3 From 30f4a8ad8e13e838a4684cdb81bc4ff0c09292a8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Feb 2014 11:51:25 -0500 Subject: rt: Make cpu_chill() use hrtimer instead of msleep() Ulrich Obergfell pointed out that cpu_chill() calls msleep() which is woken up by the ksoftirqd running the TIMER softirq. But as the cpu_chill() is called from softirq context, it may block the ksoftirqd() from running, in which case, it may never wake up the msleep() causing the deadlock. I checked the vmcore, and irq/74-qla2xxx is stuck in the msleep() call, running on CPU 8. The one ksoftirqd that is stuck, happens to be the one that runs on CPU 8, and it is blocked on a lock held by irq/74-qla2xxx. As that ksoftirqd is the one that will wake up irq/74-qla2xxx, and it happens to be blocked on a lock that irq/74-qla2xxx holds, we have our deadlock. The solution is not to convert the cpu_chill() back to a cpu_relax() as that will re-create a possible live lock that the cpu_chill() fixed earlier, and may also leave this bug open on other softirqs. The fix is to remove the dependency on ksoftirqd from cpu_chill(). That is, instead of calling msleep() that requires ksoftirqd to wake it up, use the hrtimer_nanosleep() code that does the wakeup from hard irq context. |Looks to be the lock of the block softirq. I don't have the core dump |anymore, but from what I could tell the ksoftirqd was blocked on the |block softirq lock, where the block softirq handler did a msleep |(called by the qla2xxx interrupt handler). | |Looking at trigger_softirq() in block/blk-softirq.c, it can do a |smp_callfunction() to another cpu to run the block softirq. If that |happens to be the cpu where the qla2xx irq handler is doing the block |softirq and is in a middle of a msleep(), I believe the ksoftirqd will |try to run the softirq. If it does that, then BOOM, it's deadlocked |because the ksoftirqd will never run the timer softirq either. |I should have also stated that it was only one lock that was involved. |But the lock owner was doing a msleep() that requires a wakeup by |ksoftirqd to continue. If ksoftirqd happens to be blocked on a lock |held by the msleep() caller, then you have your deadlock. | |It's best not to have any softirqs going to sleep requiring another |softirq to wake it up. Note, if we ever require a timer softirq to do a |cpu_chill() it will most definitely hit this deadlock. 
Cc: stable-rt@vger.kernel.org Found-by: Ulrich Obergfell Signed-off-by: Steven Rostedt [bigeasy: add the 4 | chapters from email] Signed-off-by: Sebastian Andrzej Siewior --- include/linux/delay.h | 2 +- kernel/time/hrtimer.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/linux/delay.h b/include/linux/delay.h index e23a7c021eeb..37caab306336 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -53,7 +53,7 @@ static inline void ssleep(unsigned int seconds) } #ifdef CONFIG_PREEMPT_RT_FULL -# define cpu_chill() msleep(1) +extern void cpu_chill(void); #else # define cpu_chill() cpu_relax() #endif diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 20a6809b4b02..b291fd0eadd7 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1873,6 +1873,21 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * Sleep for 1 ms in hope whoever holds what we want will let it go. + */ +void cpu_chill(void) +{ + struct timespec tu = { + .tv_nsec = NSEC_PER_MSEC, + }; + + hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC); +} +EXPORT_SYMBOL(cpu_chill); +#endif + /* * Functions related to boot-time initialization: */ -- cgit v1.2.3 From 1295369c9ec15cf514ce2812d3e867c7bf3980d4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 19 Feb 2014 11:56:06 +0100 Subject: kernel/hrtimer: be non-freezeable in cpu_chill() Since we replaced msleep() by hrtimer I see now and then (rarely) this: | [....] Waiting for /dev to be fully populated... | ===================================== | [ BUG: udevd/229 still has locks held! ] | 3.12.11-rt17 #23 Not tainted | ------------------------------------- | 1 lock held by udevd/229: | #0: (&type->i_mutex_dir_key#2){+.+.+.}, at: lookup_slow+0x28/0x98 | | stack backtrace: | CPU: 0 PID: 229 Comm: udevd Not tainted 3.12.11-rt17 #23 | (unwind_backtrace+0x0/0xf8) from (show_stack+0x10/0x14) | (show_stack+0x10/0x14) from (dump_stack+0x74/0xbc) | (dump_stack+0x74/0xbc) from (do_nanosleep+0x120/0x160) | (do_nanosleep+0x120/0x160) from (hrtimer_nanosleep+0x90/0x110) | (hrtimer_nanosleep+0x90/0x110) from (cpu_chill+0x30/0x38) | (cpu_chill+0x30/0x38) from (dentry_kill+0x158/0x1ec) | (dentry_kill+0x158/0x1ec) from (dput+0x74/0x15c) | (dput+0x74/0x15c) from (lookup_real+0x4c/0x50) | (lookup_real+0x4c/0x50) from (__lookup_hash+0x34/0x44) | (__lookup_hash+0x34/0x44) from (lookup_slow+0x38/0x98) | (lookup_slow+0x38/0x98) from (path_lookupat+0x208/0x7fc) | (path_lookupat+0x208/0x7fc) from (filename_lookup+0x20/0x60) | (filename_lookup+0x20/0x60) from (user_path_at_empty+0x50/0x7c) | (user_path_at_empty+0x50/0x7c) from (user_path_at+0x14/0x1c) | (user_path_at+0x14/0x1c) from (vfs_fstatat+0x48/0x94) | (vfs_fstatat+0x48/0x94) from (SyS_stat64+0x14/0x30) | (SyS_stat64+0x14/0x30) from (ret_fast_syscall+0x0/0x48) For now I see no better way but to disable the freezer the sleep the period. 
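Sketched out (the one-hunk patch below is the authoritative version): remember whether the caller already had PF_NOFREEZE, set it for the duration of the sleep so the task cannot be frozen while still holding locks, and clear it again only if it was not set before:

	unsigned int freeze_flag = current->flags & PF_NOFREEZE;

	current->flags |= PF_NOFREEZE;
	hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
	if (!freeze_flag)
		current->flags &= ~PF_NOFREEZE;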
Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- kernel/time/hrtimer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index b291fd0eadd7..7c4d4e8e1ef1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1882,8 +1882,12 @@ void cpu_chill(void) struct timespec tu = { .tv_nsec = NSEC_PER_MSEC, }; + unsigned int freeze_flag = current->flags & PF_NOFREEZE; + current->flags |= PF_NOFREEZE; hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + if (!freeze_flag) + current->flags &= ~PF_NOFREEZE; } EXPORT_SYMBOL(cpu_chill); #endif -- cgit v1.2.3 From 9ecff9a12fc4bc6ab1095116b95d5a4758fb42ed Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 4 Mar 2014 12:28:32 -0500 Subject: cpu_chill: Add a UNINTERRUPTIBLE hrtimer_nanosleep We hit another bug that was caused by switching cpu_chill() from msleep() to hrtimer_nanosleep(). This time it is a livelock. The problem is that hrtimer_nanosleep() calls schedule with the state == TASK_INTERRUPTIBLE. But this means that if a signal is pending, the scheduler won't schedule, and will simply change the current task state back to TASK_RUNNING. This nullifies the whole point of cpu_chill() in the first place. That is, if a task is spinning on a try_lock() and it preempted the owner of the lock, if it has a signal pending, it will never give up the CPU to let the owner of the lock run. I made a static function __hrtimer_nanosleep() that takes a fifth parameter "state", which determines the task state that the nanosleep() will be in. The normal hrtimer_nanosleep() will act the same, but cpu_chill() will call the __hrtimer_nanosleep() directly with the TASK_UNINTERRUPTIBLE state. cpu_chill() only cares that the first sleep happens, and does not care about the state of the restart schedule (in hrtimer_nanosleep_restart). Cc: stable-rt@vger.kernel.org Reported-by: Ulrich Obergfell Signed-off-by: Steven Rostedt Signed-off-by: Sebastian Andrzej Siewior --- kernel/time/hrtimer.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 7c4d4e8e1ef1..50eda41e185a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1752,12 +1752,13 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode, + unsigned long state) { hrtimer_init_sleeper(t, current); do { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(state); hrtimer_start_expires(&t->timer, mode); if (!hrtimer_active(&t->timer)) t->task = NULL; @@ -1801,7 +1802,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); - if (do_nanosleep(&t, HRTIMER_MODE_ABS)) + /* cpu_chill() does not care about restart state.
*/ + if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE)) goto out; rmtp = restart->nanosleep.rmtp; @@ -1818,8 +1820,10 @@ out: return ret; } -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, - const enum hrtimer_mode mode, const clockid_t clockid) +static long +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, + const enum hrtimer_mode mode, const clockid_t clockid, + unsigned long state) { struct restart_block *restart; struct hrtimer_sleeper t; @@ -1832,7 +1836,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, hrtimer_init_on_stack(&t.timer, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); - if (do_nanosleep(&t, mode)) + if (do_nanosleep(&t, mode, state)) goto out; /* Absolute timers do not update the rmtp value and restart: */ @@ -1859,6 +1863,12 @@ out: return ret; } +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, + const enum hrtimer_mode mode, const clockid_t clockid) +{ + return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE); +} + SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, struct timespec __user *, rmtp) { @@ -1885,7 +1895,8 @@ void cpu_chill(void) unsigned int freeze_flag = current->flags & PF_NOFREEZE; current->flags |= PF_NOFREEZE; - hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC, + TASK_UNINTERRUPTIBLE); if (!freeze_flag) current->flags &= ~PF_NOFREEZE; } -- cgit v1.2.3 From 826c0b556077a989600dae96c448e6aa71568ca1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 3 May 2014 11:00:29 +0200 Subject: blk-mq: revert raw locks, post pone notifier to POST_DEAD The blk_mq_cpu_notify_lock should be raw because some CPU down levels are called with interrupts off. The notifier itself calls currently one function that is blk_mq_hctx_notify(). That function acquires the ctx->lock lock which is sleeping and I would prefer to keep it that way. That function only moves IO-requests from the CPU that is going offline to another CPU and it is currently the only one. Therefore I revert the list lock back to sleeping spinlocks and let the notifier run at POST_DEAD time. 
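The resulting notifier, in sketch form (the actual hunks follow), simply ignores every stage except CPU_POST_DEAD, which is invoked in preemptible context and can therefore take the now-sleeping locks:

	static int blk_mq_main_cpu_notify(struct notifier_block *self,
					  unsigned long action, void *hcpu)
	{
		if (action != CPU_POST_DEAD)
			return NOTIFY_OK;

		spin_lock(&blk_mq_cpu_notify_lock);
		/* ... walk blk_mq_cpu_notify_list, calling each ->notify() ... */
		spin_unlock(&blk_mq_cpu_notify_lock);
		return NOTIFY_OK;
	}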
Signed-off-by: Sebastian Andrzej Siewior --- block/blk-mq-cpu.c | 17 ++++++++++------- block/blk-mq.c | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c index bb3ed488f7b5..628c6c13c482 100644 --- a/block/blk-mq-cpu.c +++ b/block/blk-mq-cpu.c @@ -16,7 +16,7 @@ #include "blk-mq.h" static LIST_HEAD(blk_mq_cpu_notify_list); -static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock); +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock); static int blk_mq_main_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) @@ -25,7 +25,10 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self, struct blk_mq_cpu_notifier *notify; int ret = NOTIFY_OK; - raw_spin_lock(&blk_mq_cpu_notify_lock); + if (action != CPU_POST_DEAD) + return NOTIFY_OK; + + spin_lock(&blk_mq_cpu_notify_lock); list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) { ret = notify->notify(notify->data, action, cpu); @@ -33,7 +36,7 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self, break; } - raw_spin_unlock(&blk_mq_cpu_notify_lock); + spin_unlock(&blk_mq_cpu_notify_lock); return ret; } @@ -41,16 +44,16 @@ void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) { BUG_ON(!notifier->notify); - raw_spin_lock(&blk_mq_cpu_notify_lock); + spin_lock(&blk_mq_cpu_notify_lock); list_add_tail(¬ifier->list, &blk_mq_cpu_notify_list); - raw_spin_unlock(&blk_mq_cpu_notify_lock); + spin_unlock(&blk_mq_cpu_notify_lock); } void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) { - raw_spin_lock(&blk_mq_cpu_notify_lock); + spin_lock(&blk_mq_cpu_notify_lock); list_del(¬ifier->list); - raw_spin_unlock(&blk_mq_cpu_notify_lock); + spin_unlock(&blk_mq_cpu_notify_lock); } void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, diff --git a/block/blk-mq.c b/block/blk-mq.c index 516a0c019559..4b4f788da535 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1517,7 +1517,7 @@ static int blk_mq_hctx_notify(void *data, unsigned long action, { struct blk_mq_hw_ctx *hctx = data; - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) + if (action == CPU_POST_DEAD) return blk_mq_hctx_cpu_offline(hctx, cpu); else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) return blk_mq_hctx_cpu_online(hctx, cpu); -- cgit v1.2.3 From c9fb62167ad026d93040cf3b1941d3befa281f9f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 13 Feb 2015 11:01:26 +0100 Subject: block: blk-mq: use swait | BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:914 | in_atomic(): 1, irqs_disabled(): 0, pid: 255, name: kworker/u257:6 | 5 locks held by kworker/u257:6/255: | #0: ("events_unbound"){.+.+.+}, at: [] process_one_work+0x171/0x5e0 | #1: ((&entry->work)){+.+.+.}, at: [] process_one_work+0x171/0x5e0 | #2: (&shost->scan_mutex){+.+.+.}, at: [] __scsi_add_device+0xa3/0x130 [scsi_mod] | #3: (&set->tag_list_lock){+.+...}, at: [] blk_mq_init_queue+0x96a/0xa50 | #4: (rcu_read_lock_sched){......}, at: [] percpu_ref_kill_and_confirm+0x1d/0x120 | Preemption disabled at:[] blk_mq_freeze_queue_start+0x56/0x70 | | CPU: 2 PID: 255 Comm: kworker/u257:6 Not tainted 3.18.7-rt0+ #1 | Workqueue: events_unbound async_run_entry_fn | 0000000000000003 ffff8800bc29f998 ffffffff815b3a12 0000000000000000 | 0000000000000000 ffff8800bc29f9b8 ffffffff8109aa16 ffff8800bc29fa28 | ffff8800bc5d1bc8 ffff8800bc29f9e8 ffffffff815b8dd4 ffff880000000000 | Call Trace: | [] dump_stack+0x4f/0x7c | [] __might_sleep+0x116/0x190 | [] 
rt_spin_lock+0x24/0x60 | [] __wake_up+0x29/0x60 | [] blk_mq_usage_counter_release+0x1e/0x20 | [] percpu_ref_kill_and_confirm+0x106/0x120 | [] blk_mq_freeze_queue_start+0x56/0x70 | [] blk_mq_update_tag_set_depth+0x40/0xd0 | [] blk_mq_init_queue+0x98c/0xa50 | [] scsi_mq_alloc_queue+0x20/0x60 [scsi_mod] | [] scsi_alloc_sdev+0x2f5/0x370 [scsi_mod] | [] scsi_probe_and_add_lun+0x9e4/0xdd0 [scsi_mod] | [] __scsi_add_device+0x126/0x130 [scsi_mod] | [] ata_scsi_scan_host+0xaf/0x200 [libata] | [] async_port_probe+0x46/0x60 [libata] | [] async_run_entry_fn+0x3b/0xf0 | [] process_one_work+0x201/0x5e0 Signed-off-by: Sebastian Andrzej Siewior --- block/blk-core.c | 2 +- block/blk-mq.c | 8 ++++---- include/linux/blkdev.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f78ea53fc77a..e88517d92075 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -630,7 +630,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->bypass_depth = 1; __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); - init_waitqueue_head(&q->mq_freeze_wq); + init_swait_head(&q->mq_freeze_wq); if (blkcg_init_queue(q)) goto fail_bdi; diff --git a/block/blk-mq.c b/block/blk-mq.c index 4b4f788da535..fbc36885127d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -85,7 +85,7 @@ static int blk_mq_queue_enter(struct request_queue *q) if (percpu_ref_tryget_live(&q->mq_usage_counter)) return 0; - ret = wait_event_interruptible(q->mq_freeze_wq, + ret = swait_event_interruptible(q->mq_freeze_wq, !q->mq_freeze_depth || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; @@ -104,7 +104,7 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) struct request_queue *q = container_of(ref, struct request_queue, mq_usage_counter); - wake_up_all(&q->mq_freeze_wq); + swait_wake_all(&q->mq_freeze_wq); } static void blk_mq_freeze_queue_start(struct request_queue *q) @@ -123,7 +123,7 @@ static void blk_mq_freeze_queue_start(struct request_queue *q) static void blk_mq_freeze_queue_wait(struct request_queue *q) { - wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); + swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); } /* @@ -146,7 +146,7 @@ static void blk_mq_unfreeze_queue(struct request_queue *q) spin_unlock_irq(q->queue_lock); if (wake) { percpu_ref_reinit(&q->mq_usage_counter); - wake_up_all(&q->mq_freeze_wq); + swait_wake_all(&q->mq_freeze_wq); } } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c7d201802f9a..552abeecaceb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -479,7 +479,7 @@ struct request_queue { struct throtl_data *td; #endif struct rcu_head rcu_head; - wait_queue_head_t mq_freeze_wq; + struct swait_head mq_freeze_wq; struct percpu_ref mq_usage_counter; struct list_head all_q_node; -- cgit v1.2.3 From 98e7a3230eb0c88d400e5a22042987779177d3f4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 18 Feb 2015 18:37:26 +0100 Subject: block/mq: drop per ctx cpu_lock While converting the get_cpu() to get_cpu_light() I added a cpu lock to ensure the same code is not invoked twice on the same CPU. And now I run into this: | kernel BUG at kernel/locking/rtmutex.c:996! 
| invalid opcode: 0000 [#1] PREEMPT SMP | CPU0: 13 PID: 75 Comm: kworker/u258:0 Tainted: G I 3.18.7-rt1.5+ #12 | Workqueue: writeback bdi_writeback_workfn (flush-8:0) | task: ffff88023742a620 ti: ffff88023743c000 task.ti: ffff88023743c000 | RIP: 0010:[] [] rt_spin_lock_slowlock+0x280/0x2d0 | Call Trace: | [] rt_spin_lock+0x27/0x60 taking the same lock again | | [] blk_mq_insert_requests+0x51/0x130 | [] blk_mq_flush_plug_list+0x129/0x140 | [] blk_flush_plug_list+0xd1/0x250 | [] schedule+0x75/0xa0 | [] do_nanosleep+0xdd/0x180 | [] __hrtimer_nanosleep+0xd2/0x1c0 | [] cpu_chill+0x56/0x80 | [] try_to_grab_pending+0x1bd/0x390 | [] cancel_delayed_work+0x21/0x170 | [] blk_mq_stop_hw_queue+0x18/0x40 | [] scsi_queue_rq+0x7f/0x830 [scsi_mod] | [] __blk_mq_run_hw_queue+0x1ee/0x360 | [] blk_mq_map_request+0x108/0x190 take the lock ^^^ | | [] blk_sq_make_request+0x82/0x350 | [] generic_make_request+0xd0/0x120 | [] submit_bio+0x78/0x190 | [] _submit_bh+0x117/0x180 | [] __block_write_full_page.constprop.38+0x138/0x3f0 | [] block_write_full_page+0xa0/0xe0 | [] blkdev_writepage+0x13/0x20 | [] __writepage+0x15/0x40 | [] write_cache_pages+0x1fb/0x440 | [] generic_writepages+0x3e/0x60 | [] do_writepages+0x1c/0x30 | [] __writeback_single_inode+0x33/0x140 | [] writeback_sb_inodes+0x2bd/0x490 | [] __writeback_inodes_wb+0x97/0xd0 | [] wb_writeback+0x1cb/0x210 | [] bdi_writeback_workfn+0x25b/0x380 | [] process_one_work+0x1bb/0x490 | [] worker_thread+0x6b/0x4f0 | [] kthread+0xe3/0x100 | [] ret_from_fork+0x7c/0xb0 After looking at this for a while it seems that it is save if blk_mq_ctx is used multiple times, the in struct lock protects the access. Signed-off-by: Sebastian Andrzej Siewior --- block/blk-mq.c | 4 ---- block/blk-mq.h | 8 -------- 2 files changed, 12 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index fbc36885127d..067b5ad3e047 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1272,9 +1272,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) if (list_empty(&plug->mq_list)) trace_block_plug(q); else if (request_count >= BLK_MAX_REQUEST_COUNT) { - spin_unlock(&data.ctx->cpu_lock); blk_flush_plug_list(plug, false); - spin_lock(&data.ctx->cpu_lock); trace_block_plug(q); } list_add_tail(&rq->queuelist, &plug->mq_list); @@ -1470,7 +1468,6 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); - __blk_mq_put_ctx(ctx); if (list_empty(&tmp)) return NOTIFY_OK; @@ -1680,7 +1677,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, memset(__ctx, 0, sizeof(*__ctx)); __ctx->cpu = i; spin_lock_init(&__ctx->lock); - spin_lock_init(&__ctx->cpu_lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; diff --git a/block/blk-mq.h b/block/blk-mq.h index d0d4780ddfb6..d1d78dfe4123 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -9,7 +9,6 @@ struct blk_mq_ctx { struct list_head rq_list; } ____cacheline_aligned_in_smp; - spinlock_t cpu_lock; unsigned int cpu; unsigned int index_hw; @@ -77,7 +76,6 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, struct blk_mq_ctx *ctx; ctx = per_cpu_ptr(q->queue_ctx, cpu); - spin_lock(&ctx->cpu_lock); return ctx; } @@ -92,14 +90,8 @@ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) return __blk_mq_get_ctx(q, get_cpu_light()); } -static void __blk_mq_put_ctx(struct blk_mq_ctx *ctx) -{ - spin_unlock(&ctx->cpu_lock); -} - static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) { - __blk_mq_put_ctx(ctx); 
put_cpu_light(); } -- cgit v1.2.3 From 8173384c25ab648c10cb22fb7b5e34e8f110aad9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Dec 2012 18:28:26 +0100 Subject: block: Use cpu_chill() for retry loops Retry loops on RT might loop forever when the modifying side was preempted. Steven also observed a live lock when there was a concurrent priority boosting going on. Use cpu_chill() instead of cpu_relax() to let the system make progress. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- block/blk-ioc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 1a27f45ec776..28f467e636cc 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "blk.h" @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work) spin_unlock(q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); + cpu_chill(); spin_lock_irqsave_nested(&ioc->lock, flags, 1); } } @@ -187,7 +188,7 @@ retry: spin_unlock(icq->q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); + cpu_chill(); goto retry; } } -- cgit v1.2.3 From 28f3f3458a4843c79ab26ae70e8f876717d5d85d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 Mar 2012 21:00:34 +0100 Subject: fs: dcache: Use cpu_chill() in trylock loops Retry loops on RT might loop forever when the modifying side was preempted. Use cpu_chill() instead of cpu_relax() to let the system make progress. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- fs/autofs4/autofs_i.h | 1 + fs/autofs4/expire.c | 2 +- fs/dcache.c | 5 +++-- fs/namespace.c | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 8e98cf954bab..7ae1026cdc17 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index dcdec6fd33c6..d9a5777d88ec 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -151,7 +151,7 @@ again: parent = p->d_parent; if (!spin_trylock(&parent->d_lock)) { spin_unlock(&p->d_lock); - cpu_relax(); + cpu_chill(); goto relock; } spin_unlock(&p->d_lock); diff --git a/fs/dcache.c b/fs/dcache.c index 03dca3cad918..ea5a10fb258f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -552,7 +553,7 @@ static struct dentry *dentry_kill(struct dentry *dentry) failed: spin_unlock(&dentry->d_lock); - cpu_relax(); + cpu_chill(); return dentry; /* try again with same dentry */ } @@ -2285,7 +2286,7 @@ again: if (dentry->d_lockref.count == 1) { if (!spin_trylock(&inode->i_lock)) { spin_unlock(&dentry->d_lock); - cpu_relax(); + cpu_chill(); goto again; } dentry->d_flags &= ~DCACHE_CANT_MOUNT; diff --git a/fs/namespace.c b/fs/namespace.c index 3d7e151c081e..41c36d368430 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include /* init_rootfs */ @@ -346,7 +347,7 @@ int __mnt_want_write(struct vfsmount *m) smp_mb(); while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { preempt_enable(); - cpu_relax(); + cpu_chill(); preempt_disable(); } /* -- cgit v1.2.3 From b32e5bcf474ecabd1468bf54be0ee33fad039c04 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 Mar 2012 21:10:04 +0100 Subject: net: Use cpu_chill() instead of cpu_relax() Retry loops on RT 
might loop forever when the modifying side was preempted. Use cpu_chill() instead of cpu_relax() to let the system make progress. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- net/packet/af_packet.c | 5 +++-- net/rds/ib_rdma.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 07c04a841ba0..b2b1dad71160 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -692,7 +693,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data) if (BLOCK_NUM_PKTS(pbd)) { while (atomic_read(&pkc->blk_fill_in_prog)) { /* Waiting for skb_copy_bits to finish... */ - cpu_relax(); + cpu_chill(); } } @@ -943,7 +944,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, if (!(status & TP_STATUS_BLK_TMO)) { while (atomic_read(&pkc->blk_fill_in_prog)) { /* Waiting for skb_copy_bits to finish... */ - cpu_relax(); + cpu_chill(); } } prb_close_block(pkc, pbd, po, status); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 273b8bff6ba4..f64b6e83ab4c 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "rds.h" #include "ib.h" @@ -286,7 +287,7 @@ static inline void wait_clean_list_grace(void) for_each_online_cpu(cpu) { flag = &per_cpu(clean_list_grace, cpu); while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) - cpu_relax(); + cpu_chill(); } } -- cgit v1.2.3 From a0fb10dd6339de5fade73bc96462c11a9494812a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 Jul 2013 15:26:54 +0200 Subject: workqueue: Use normal rcu There is no need for sched_rcu. The undocumented reason why sched_rcu is used is to avoid a few explicit rcu_read_lock()/unlock() pairs by abusing the fact that sched_rcu reader side critical sections are also protected by preempt or irq disabled regions. Signed-off-by: Thomas Gleixner --- kernel/workqueue.c | 85 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 38 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2273f534b01a..33e4c3f71b48 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -125,11 +125,11 @@ enum { * * PL: wq_pool_mutex protected. * - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. + * PR: wq_pool_mutex protected for writes. RCU protected for reads. * * WQ: wq->mutex protected. * - * WR: wq->mutex protected for writes. Sched-RCU protected for reads. + * WR: wq->mutex protected for writes. RCU protected for reads. * * MD: wq_mayday_lock protected. */ @@ -177,7 +177,7 @@ struct worker_pool { atomic_t nr_running ____cacheline_aligned_in_smp; /* - * Destruction of pool is sched-RCU protected to allow dereferences + * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; @@ -206,7 +206,7 @@ struct pool_workqueue { /* * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also sched-RCU protected so that the first pwq can be + * itself is also RCU protected so that the first pwq can be * determined without grabbing wq->mutex. 
*/ struct work_struct unbound_release_work; @@ -329,14 +329,14 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, #include #define assert_rcu_or_pool_mutex() \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + rcu_lockdep_assert(rcu_read_lock_held() || \ lockdep_is_held(&wq_pool_mutex), \ - "sched RCU or wq_pool_mutex should be held") + "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + rcu_lockdep_assert(rcu_read_lock_held() || \ lockdep_is_held(&wq->mutex), \ - "sched RCU or wq->mutex should be held") + "RCU or wq->mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ @@ -348,7 +348,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @pool: iteration cursor * @pi: integer used for iteration * - * This must be called either with wq_pool_mutex held or sched RCU read + * This must be called either with wq_pool_mutex held or RCU read * locked. If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * @@ -380,7 +380,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @pwq: iteration cursor * @wq: the target workqueue * - * This must be called either with wq->mutex held or sched RCU read locked. + * This must be called either with wq->mutex held or RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * @@ -542,7 +542,7 @@ static int worker_pool_assign_id(struct worker_pool *pool) * @wq: the target workqueue * @node: the node ID * - * This must be called either with pwq_lock held or sched RCU read locked. + * This must be called either with pwq_lock held or RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * @@ -646,8 +646,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read - * access under sched-RCU read lock. As such, this function should be - * called under wq_pool_mutex or with preemption disabled. + * access under RCU read lock. As such, this function should be + * called under wq_pool_mutex or inside of a rcu_read_lock() region. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used @@ -1053,7 +1053,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* - * As both pwqs and pools are sched-RCU protected, the + * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ spin_lock_irq(&pwq->pool->lock); @@ -1179,6 +1179,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; + rcu_read_lock(); /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 
@@ -1217,10 +1218,12 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, set_work_pool_and_keep_pending(work, pool->id); spin_unlock(&pool->lock); + rcu_read_unlock(); return 1; } spin_unlock(&pool->lock); fail: + rcu_read_unlock(); local_irq_restore(*flags); if (work_is_canceling(work)) return -ENOENT; @@ -1301,6 +1304,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; + + rcu_read_lock(); retry: if (req_cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); @@ -1357,10 +1362,8 @@ retry: /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); - if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock(&pwq->pool->lock); - return; - } + if (WARN_ON(!list_empty(&work->entry))) + goto out; pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); @@ -1376,7 +1379,9 @@ retry: insert_work(pwq, work, worklist, work_flags); +out: spin_unlock(&pwq->pool->lock); + rcu_read_unlock(); } /** @@ -2641,14 +2646,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) might_sleep(); - local_irq_disable(); + rcu_read_lock(); pool = get_work_pool(work); if (!pool) { - local_irq_enable(); + rcu_read_unlock(); return false; } - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -2675,10 +2680,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) else lock_map_acquire_read(&pwq->wq->lockdep_map); lock_map_release(&pwq->wq->lockdep_map); - + rcu_read_unlock(); return true; already_gone: spin_unlock_irq(&pool->lock); + rcu_read_unlock(); return false; } @@ -3044,7 +3050,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, const char *delim = ""; int node, written = 0; - rcu_read_lock_sched(); + get_online_cpus(); + rcu_read_lock(); for_each_node(node) { written += scnprintf(buf + written, PAGE_SIZE - written, "%s%d:%d", delim, node, @@ -3052,7 +3059,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, delim = " "; } written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock_sched(); + rcu_read_unlock(); + put_online_cpus(); return written; } @@ -3420,7 +3428,7 @@ static void rcu_free_pool(struct rcu_head *rcu) * put_unbound_pool - put a worker_pool * @pool: worker_pool to put * - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). 
@@ -3474,8 +3482,8 @@ static void put_unbound_pool(struct worker_pool *pool) del_timer_sync(&pool->idle_timer); del_timer_sync(&pool->mayday_timer); - /* sched-RCU protected to allow dereferences from get_work_pool() */ - call_rcu_sched(&pool->rcu, rcu_free_pool); + /* RCU protected to allow dereferences from get_work_pool() */ + call_rcu(&pool->rcu, rcu_free_pool); } /** @@ -3580,7 +3588,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) put_unbound_pool(pool); mutex_unlock(&wq_pool_mutex); - call_rcu_sched(&pwq->rcu, rcu_free_pwq); + call_rcu(&pwq->rcu, rcu_free_pwq); /* * If we're the last pwq going away, @wq is already dead and no one @@ -4292,7 +4300,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) struct pool_workqueue *pwq; bool ret; - rcu_read_lock_sched(); + rcu_read_lock(); + preempt_disable(); if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); @@ -4303,7 +4312,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); ret = !list_empty(&pwq->delayed_works); - rcu_read_unlock_sched(); + preempt_enable(); + rcu_read_unlock(); return ret; } @@ -4329,16 +4339,15 @@ unsigned int work_busy(struct work_struct *work) if (work_pending(work)) ret |= WORK_BUSY_PENDING; - local_irq_save(flags); + rcu_read_lock() pool = get_work_pool(work); if (pool) { - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); } - local_irq_restore(flags); - + rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(work_busy); @@ -4767,16 +4776,16 @@ bool freeze_workqueues_busy(void) * nr_active is monotonically decreasing. It's safe * to peek without lock. */ - rcu_read_lock_sched(); + rcu_read_lock(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; - rcu_read_unlock_sched(); + rcu_read_unlock(); goto out_unlock; } } - rcu_read_unlock_sched(); + rcu_read_unlock(); } out_unlock: mutex_unlock(&wq_pool_mutex); -- cgit v1.2.3 From 115c92af18e25085e87f21fbf41d5238faaa093d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:42:26 +0200 Subject: Use local irq lock instead of irq disable regions Signed-off-by: Thomas Gleixner --- kernel/workqueue.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33e4c3f71b48..efd935f132b0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -321,6 +322,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq); struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock); + static int worker_thread(void *__worker); static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from); @@ -1056,9 +1059,9 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) * As both pwqs and pools are RCU protected, the * following lock operations are safe. 
*/ - spin_lock_irq(&pwq->pool->lock); + local_spin_lock_irq(pendingb_lock, &pwq->pool->lock); put_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock); } } @@ -1160,7 +1163,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, struct worker_pool *pool; struct pool_workqueue *pwq; - local_irq_save(*flags); + local_lock_irqsave(pendingb_lock, *flags); /* try to steal the timer if it exists */ if (is_dwork) { @@ -1224,7 +1227,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, spin_unlock(&pool->lock); fail: rcu_read_unlock(); - local_irq_restore(*flags); + local_unlock_irqrestore(pendingb_lock, *flags); if (work_is_canceling(work)) return -ENOENT; cpu_relax(); @@ -1296,7 +1299,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ - WARN_ON_ONCE(!irqs_disabled()); + WARN_ON_ONCE_NONRT(!irqs_disabled()); debug_work_activate(work); @@ -1401,14 +1404,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, bool ret = false; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(pendingb_lock,flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_work(cpu, wq, work); ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); return ret; } EXPORT_SYMBOL(queue_work_on); @@ -1475,14 +1478,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, unsigned long flags; /* read the comment in __queue_work() */ - local_irq_save(flags); + local_lock_irqsave(pendingb_lock, flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); return ret; } EXPORT_SYMBOL(queue_delayed_work_on); @@ -1517,7 +1520,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, if (likely(ret >= 0)) { __queue_delayed_work(cpu, wq, dwork, delay); - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); } /* -ENOENT from try_to_grab_pending() becomes %true */ @@ -2771,7 +2774,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) /* tell other tasks trying to grab @work to back off */ mark_work_canceling(work); - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); flush_work(work); clear_work_data(work); @@ -2826,10 +2829,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); */ bool flush_delayed_work(struct delayed_work *dwork) { - local_irq_disable(); + local_lock_irq(pendingb_lock); if (del_timer_sync(&dwork->timer)) __queue_work(dwork->cpu, dwork->wq, &dwork->work); - local_irq_enable(); + local_unlock_irq(pendingb_lock); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); @@ -2864,7 +2867,7 @@ bool cancel_delayed_work(struct delayed_work *dwork) set_work_pool_and_clear_pending(&dwork->work, get_work_pool_id(&dwork->work)); - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); return ret; } EXPORT_SYMBOL(cancel_delayed_work); @@ -4339,7 +4342,7 @@ unsigned int work_busy(struct work_struct *work) if (work_pending(work)) ret |= WORK_BUSY_PENDING; - rcu_read_lock() + rcu_read_lock(); pool = get_work_pool(work); if (pool) { spin_lock_irqsave(&pool->lock, flags); -- cgit v1.2.3 From fbfcf8cccd20b2c7f34ea647a68ee4626d3f820b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: 
Mon, 1 Jul 2013 11:02:42 +0200 Subject: workqueue vs ata-piix livelock fixup An Intel i7 system regularly detected rcu_preempt stalls after the kernel was upgraded from 3.6-rt to 3.8-rt. When the stall happened, disk I/O was no longer possible, unless the system was restarted. The kernel message was: INFO: rcu_preempt self-detected stall on CPU { 6} [..] NMI backtrace for cpu 6 CPU 6 Pid: 119, comm: irq/19-ata_piix Not tainted 3.8.13-rt13 #11 Shuttle Inc. SX58/SX58 RIP: 0010:[] [] ip_compute_csum+0x30/0x30 RSP: 0018:ffff880333303cb0 EFLAGS: 00000002 RAX: 0000000000000006 RBX: 00000000000003e9 RCX: 0000000000000034 RDX: 0000000000000000 RSI: ffffffff81aa16d0 RDI: 0000000000000001 RBP: ffff880333303ce8 R08: ffffffff81aa16d0 R09: ffffffff81c1b8cc R10: 0000000000000000 R11: 0000000000000000 R12: 000000000005161f R13: 0000000000000006 R14: ffffffff81aa16d0 R15: 0000000000000002 FS: 0000000000000000(0000) GS:ffff880333300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000003c1b2bb420 CR3: 0000000001a0f000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process irq/19-ata_piix (pid: 119, threadinfo ffff88032d88a000, task ffff88032df80000) Stack: ffffffff8124cb32 000000000005161e 00000000000003e9 0000000000001000 0000000000009022 ffffffff81aa16d0 0000000000000002 ffff880333303cf8 ffffffff8124caa9 ffff880333303d08 ffffffff8124cad2 ffff880333303d28 Call Trace: [] ? delay_tsc+0x33/0xe3 [] __delay+0xf/0x11 [] __const_udelay+0x27/0x29 [] native_safe_apic_wait_icr_idle+0x39/0x45 [] __default_send_IPI_dest_field.constprop.0+0x1e/0x58 [] default_send_IPI_mask_sequence_phys+0x49/0x7d [] physflat_send_IPI_all+0x17/0x19 [] arch_trigger_all_cpu_backtrace+0x50/0x79 [] rcu_check_callbacks+0x1cb/0x568 [] ? raise_softirq+0x2e/0x35 [] ? tick_sched_do_timer+0x38/0x38 [] update_process_times+0x44/0x55 [] tick_sched_handle+0x4a/0x59 [] tick_sched_timer+0x3c/0x5b [] __run_hrtimer+0x9b/0x158 [] hrtimer_interrupt+0x172/0x2aa [] smp_apic_timer_interrupt+0x76/0x89 [] apic_timer_interrupt+0x6d/0x80 [] ? __local_lock_irqsave+0x17/0x4a [] try_to_grab_pending+0x42/0x17e [] mod_delayed_work_on+0x32/0x88 [] mod_delayed_work+0x1c/0x1e [] blk_run_queue_async+0x37/0x39 [] flush_end_io+0xf1/0x107 [] blk_finish_request+0x21e/0x264 [] blk_end_bidi_request+0x42/0x60 [] blk_end_request+0x10/0x12 [] scsi_io_completion+0x1bf/0x492 [] ? sd_done+0x298/0x2ef [] scsi_finish_command+0xe9/0xf2 [] scsi_softirq_done+0x106/0x10f [] blk_done_softirq+0x77/0x87 [] do_current_softirqs+0x172/0x2e1 [] ? irq_thread_fn+0x3a/0x3a [] local_bh_enable+0x43/0x72 [] irq_forced_thread_fn+0x46/0x52 [] irq_thread+0x8c/0x17c [] ? irq_thread+0x17c/0x17c [] ? wake_threads_waitq+0x44/0x44 [] kthread+0x8d/0x95 [] ? __kthread_parkme+0x65/0x65 [] ret_from_fork+0x7c/0xb0 [] ? __kthread_parkme+0x65/0x65 The state of softirqd of this CPU at the time of the crash was: ksoftirqd/6 R running task 0 53 2 0x00000000 ffff88032fc39d18 0000000000000046 ffff88033330c4c0 ffff8803303f4710 ffff88032fc39fd8 ffff88032fc39fd8 0000000000000000 0000000000062500 ffff88032df88000 ffff8803303f4710 0000000000000000 ffff88032fc38000 Call Trace: [] ? __queue_work+0x27c/0x27c [] preempt_schedule+0x61/0x76 [] migrate_enable+0xe5/0x1df [] ? __queue_work+0x27c/0x27c [] run_timer_softirq+0x161/0x1d6 [] do_current_softirqs+0x172/0x2e1 [] run_ksoftirqd+0x2d/0x45 [] smpboot_thread_fn+0x2ea/0x308 [] ? test_ti_thread_flag+0xc/0xc [] ? 
test_ti_thread_flag+0xc/0xc [] kthread+0x8d/0x95 [] ? __kthread_parkme+0x65/0x65 [] ret_from_fork+0x7c/0xb0 [] ? __kthread_parkme+0x65/0x65 Apparently, the softirq daemon and the ata_piix IRQ handler were waiting for each other to finish, ending up in a livelock. After the below patch was applied, the system no longer crashed. Reported-by: Carsten Emde Proposed-by: Thomas Gleixner Tested-by: Carsten Emde Signed-off-by: Carsten Emde Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index efd935f132b0..f2b712255571 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -1230,7 +1231,7 @@ fail: local_unlock_irqrestore(pendingb_lock, *flags); if (work_is_canceling(work)) return -ENOENT; - cpu_relax(); + cpu_chill(); return -EAGAIN; } -- cgit v1.2.3 From 2e148e7d0314759a9c8e22e17b2276a3a84c150d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jun 2011 19:47:03 +0200 Subject: sched: Distangle worker accounting from rqlock The worker accounting for cpu bound workers is plugged into the core scheduler code and the wakeup code. This is not a hard requirement and can be avoided by keeping track of the state in the workqueue code itself. Keep track of the sleeping state in the worker itself and call the notifier before entering the core scheduler. There might be false positives when the task is woken between that call and actually scheduling, but that's not really different from scheduling and being woken immediately after switching away. There is also no harm from updating nr_running when the task returns from scheduling instead of accounting it in the wakeup code. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Tejun Heo Cc: Jens Axboe Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20110622174919.135236139@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 70 ++++++++++----------------------------------- kernel/workqueue.c | 55 +++++++++++++++-------------------- kernel/workqueue_internal.h | 5 ++-- 3 files changed, 41 insertions(+), 89 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 831d3ae428d1..572834045528 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1510,10 +1510,6 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); p->on_rq = TASK_ON_RQ_QUEUED; - - /* if a worker is waking up, notify workqueue */ - if (p->flags & PF_WQ_WORKER) - wq_worker_waking_up(p, cpu_of(rq)); } /* @@ -1789,42 +1785,6 @@ out: return success; } -/** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task.
- */ -static void try_to_wake_up_local(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - - if (WARN_ON_ONCE(rq != this_rq()) || - WARN_ON_ONCE(p == current)) - return; - - lockdep_assert_held(&rq->lock); - - if (!raw_spin_trylock(&p->pi_lock)) { - raw_spin_unlock(&rq->lock); - raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - } - - if (!(p->state & TASK_NORMAL)) - goto out; - - if (!task_on_rq_queued(p)) - ttwu_activate(rq, p, ENQUEUE_WAKEUP); - - ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); -out: - raw_spin_unlock(&p->pi_lock); -} - /** * wake_up_process - Wake up a specific process * @p: The process to be woken up. @@ -2990,21 +2950,6 @@ need_resched: } else { deactivate_task(rq, prev, DEQUEUE_SLEEP); prev->on_rq = 0; - - /* - * If a worker went to sleep, notify and ask workqueue - * whether it wants to wake up a task to maintain - * concurrency. - * Only call wake up if prev isn't blocked on a sleeping - * spin lock. - */ - if (prev->flags & PF_WQ_WORKER && !prev->saved_state) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev, cpu); - if (to_wakeup) - try_to_wake_up_local(to_wakeup); - } } switch_count = &prev->nvcsw; } @@ -3045,6 +2990,14 @@ static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) return; + + /* + * If a worker went to sleep, notify and ask workqueue whether + * it wants to wake up a task to maintain concurrency. + */ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. @@ -3053,12 +3006,19 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } +static inline void sched_update_worker(struct task_struct *tsk) +{ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); +} + asmlinkage __visible void __sched schedule(void) { struct task_struct *tsk = current; sched_submit_work(tsk); __schedule(); + sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f2b712255571..0fb5b91bc853 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -795,44 +795,31 @@ static void wake_up_worker(struct worker_pool *pool) } /** - * wq_worker_waking_up - a worker is waking up - * @task: task waking up - * @cpu: CPU @task is waking up to + * wq_worker_running - a worker is running again + * @task: task returning from sleep * - * This function is called during try_to_wake_up() when a worker is - * being awoken. - * - * CONTEXT: - * spin_lock_irq(rq->lock) + * This function is called when a worker returns from schedule() */ -void wq_worker_waking_up(struct task_struct *task, int cpu) +void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); - if (!(worker->flags & WORKER_NOT_RUNNING)) { - WARN_ON_ONCE(worker->pool->cpu != cpu); + if (!worker->sleeping) + return; + if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(&worker->pool->nr_running); - } + worker->sleeping = 0; } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep - * @cpu: CPU in question, must be the current CPU number - * - * This function is called during schedule() when a busy worker is - * going to sleep. Worker on the same cpu can be woken up by - * returning pointer to its task. - * - * CONTEXT: - * spin_lock_irq(rq->lock) - * - * Return: - * Worker task on @cpu to wake up, %NULL if none. 
+ * This function is called from schedule() when a busy worker is + * going to sleep. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) +void wq_worker_sleeping(struct task_struct *task) { - struct worker *worker = kthread_data(task), *to_wakeup = NULL; + struct worker *next, *worker = kthread_data(task); struct worker_pool *pool; /* @@ -841,14 +828,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) * checking NOT_RUNNING. */ if (worker->flags & WORKER_NOT_RUNNING) - return NULL; + return; pool = worker->pool; - /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu)) - return NULL; + if (WARN_ON_ONCE(worker->sleeping)) + return; + worker->sleeping = 1; + spin_lock_irq(&pool->lock); /* * The counterpart of the following dec_and_test, implied mb, * worklist not empty test sequence is in insert_work(). @@ -861,9 +849,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) * lock is safe. */ if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) - to_wakeup = first_idle_worker(pool); - return to_wakeup ? to_wakeup->task : NULL; + !list_empty(&pool->worklist)) { + next = first_idle_worker(pool); + if (next) + wake_up_process(next->task); + } + spin_unlock_irq(&pool->lock); } /** diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 45215870ac6c..f000c4d6917e 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -43,6 +43,7 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ + int sleeping; /* None */ /* * Opaque string set with work_set_desc(). Printed out with task @@ -68,7 +69,7 @@ static inline struct worker *current_wq_worker(void) * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched/core.c and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); +void wq_worker_running(struct task_struct *task); +void wq_worker_sleeping(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ -- cgit v1.2.3 From 4e478135d30cbf75b1e68e8097f8c25528819b27 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 May 2015 00:23:50 +0200 Subject: idr: Use local lock instead of preempt enable/disable We need to protect the per cpu variable and prevent migration. Signed-off-by: Thomas Gleixner --- include/linux/idr.h | 4 ++++ lib/idr.c | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index 013fd9bc4cb6..f62be0aec911 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp); * Each idr_preload() should be matched with an invocation of this * function. See idr_preload() for details. 
*/ +#ifdef CONFIG_PREEMPT_RT_FULL +void idr_preload_end(void); +#else static inline void idr_preload_end(void) { preempt_enable(); } +#endif /** * idr_find - return pointer for given id diff --git a/lib/idr.c b/lib/idr.c index e654aebd5f80..83758924cba7 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -31,6 +31,7 @@ #include #include #include +#include #define MAX_IDR_SHIFT (sizeof(int) * 8 - 1) #define MAX_IDR_BIT (1U << MAX_IDR_SHIFT) @@ -367,6 +368,35 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id, idr_mark_full(pa, id); } +#ifdef CONFIG_PREEMPT_RT_FULL +static DEFINE_LOCAL_IRQ_LOCK(idr_lock); + +static inline void idr_preload_lock(void) +{ + local_lock(idr_lock); +} + +static inline void idr_preload_unlock(void) +{ + local_unlock(idr_lock); +} + +void idr_preload_end(void) +{ + idr_preload_unlock(); +} +EXPORT_SYMBOL(idr_preload_end); +#else +static inline void idr_preload_lock(void) +{ + preempt_disable(); +} + +static inline void idr_preload_unlock(void) +{ + preempt_enable(); +} +#endif /** * idr_preload - preload for idr_alloc() @@ -402,7 +432,7 @@ void idr_preload(gfp_t gfp_mask) WARN_ON_ONCE(in_interrupt()); might_sleep_if(gfp_mask & __GFP_WAIT); - preempt_disable(); + idr_preload_lock(); /* * idr_alloc() is likely to succeed w/o full idr_layer buffer and @@ -414,9 +444,9 @@ void idr_preload(gfp_t gfp_mask) while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) { struct idr_layer *new; - preempt_enable(); + idr_preload_unlock(); new = kmem_cache_zalloc(idr_layer_cache, gfp_mask); - preempt_disable(); + idr_preload_lock(); if (!new) break; -- cgit v1.2.3 From 8c7c0b6bb2e25c0c7d043c75f23b72377c5fbe6d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 9 Apr 2014 11:58:17 +0200 Subject: percpu_ida: Use local locks The local_irq_save() + spin_lock() combination does not work that well on -RT. Signed-off-by: Sebastian Andrzej Siewior --- lib/percpu_ida.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index 93d145e5539c..32a608ddbe3b 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -29,6 +29,9 @@ #include #include #include +#include + +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock); struct percpu_ida_cpu { /* @@ -151,13 +154,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) unsigned long flags; int tag; - local_irq_save(flags); + local_lock_irqsave(irq_off_lock, flags); tags = this_cpu_ptr(pool->tag_cpu); /* Fastpath */ tag = alloc_local_tag(tags); if (likely(tag >= 0)) { - local_irq_restore(flags); + local_unlock_irqrestore(irq_off_lock, flags); return tag; } @@ -176,6 +179,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) if (!tags->nr_free) alloc_global_tags(pool, tags); + if (!tags->nr_free) steal_tags(pool, tags); @@ -187,7 +191,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) } spin_unlock(&pool->lock); - local_irq_restore(flags); + local_unlock_irqrestore(irq_off_lock, flags); if (tag >= 0 || state == TASK_RUNNING) break; @@ -199,7 +203,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) schedule(); - local_irq_save(flags); + local_lock_irqsave(irq_off_lock, flags); tags = this_cpu_ptr(pool->tag_cpu); } if (state != TASK_RUNNING) @@ -224,7 +228,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag) BUG_ON(tag >= pool->nr_tags); - local_irq_save(flags); + local_lock_irqsave(irq_off_lock, flags); tags = this_cpu_ptr(pool->tag_cpu); spin_lock(&tags->lock); @@ -256,7 +260,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
spin_unlock(&pool->lock); } - local_irq_restore(flags); + local_unlock_irqrestore(irq_off_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ida_free); @@ -348,7 +352,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, struct percpu_ida_cpu *remote; unsigned cpu, i, err = 0; - local_irq_save(flags); + local_lock_irqsave(irq_off_lock, flags); for_each_possible_cpu(cpu) { remote = per_cpu_ptr(pool->tag_cpu, cpu); spin_lock(&remote->lock); @@ -370,7 +374,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, } spin_unlock(&pool->lock); out: - local_irq_restore(flags); + local_unlock_irqrestore(irq_off_lock, flags); return err; } EXPORT_SYMBOL_GPL(percpu_ida_for_each_free); -- cgit v1.2.3 From 44a28fbbf456348a3b27ed432b0f921fd0fb44b9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 21:41:35 +0200 Subject: debugobjects-rt.patch Signed-off-by: Thomas Gleixner --- lib/debugobjects.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 547f7f923dbc..8fcdbc2fc6d0 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -309,7 +309,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack) struct debug_obj *obj; unsigned long flags; - fill_pool(); +#ifdef CONFIG_PREEMPT_RT_FULL + if (preempt_count() == 0 && !irqs_disabled()) +#endif + fill_pool(); db = get_bucket((unsigned long) addr); -- cgit v1.2.3 From 6c1d25ef4557cbebfb9709f880f79f8eb4c6e756 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jul 2011 11:03:16 +0200 Subject: jump-label-rt.patch Signed-off-by: Thomas Gleixner --- include/linux/jump_label.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 98f923b6a0ea..e17a47eae339 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -55,7 +55,8 @@ extern bool static_key_initialized; "%s used before call to jump_label_init", \ __func__) -#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) && \ + !defined(CONFIG_PREEMPT_BASE) struct static_key { atomic_t enabled; -- cgit v1.2.3 From 261026407ff9f56b6ade7b544abff109bc22c348 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Jul 2011 15:38:34 +0200 Subject: use skbufhead with raw lock Signed-off-by: Thomas Gleixner --- include/linux/netdevice.h | 1 + include/linux/skbuff.h | 7 +++++++ net/core/dev.c | 26 ++++++++++++++++++++------ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c3fd34da6c08..f36e65c5ce78 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2351,6 +2351,7 @@ struct softnet_data { unsigned int dropped; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + struct sk_buff_head tofree_queue; #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit __rcu *flow_limit; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6c8b6f604e76..301a23642f12 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -172,6 +172,7 @@ struct sk_buff_head { __u32 qlen; spinlock_t lock; + raw_spinlock_t raw_lock; }; struct sk_buff; @@ -1327,6 +1328,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) __skb_queue_head_init(list); } +static inline void skb_queue_head_init_raw(struct sk_buff_head *list) +{ + raw_spin_lock_init(&list->raw_lock); + __skb_queue_head_init(list); +} + 
static inline void skb_queue_head_init_class(struct sk_buff_head *list, struct lock_class_key *class) { diff --git a/net/core/dev.c b/net/core/dev.c index 4c9bf576d382..0b77a570b35a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -203,14 +203,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) static inline void rps_lock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); + raw_spin_lock(&sd->input_pkt_queue.raw_lock); #endif } static inline void rps_unlock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); + raw_spin_unlock(&sd->input_pkt_queue.raw_lock); #endif } @@ -3846,7 +3846,7 @@ static void flush_backlog(void *arg) skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev == dev) { __skb_unlink(skb, &sd->input_pkt_queue); - kfree_skb(skb); + __skb_queue_tail(&sd->tofree_queue, skb); input_queue_head_incr(sd); } } @@ -3855,10 +3855,13 @@ static void flush_backlog(void *arg) skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev == dev) { __skb_unlink(skb, &sd->process_queue); - kfree_skb(skb); + __skb_queue_tail(&sd->tofree_queue, skb); input_queue_head_incr(sd); } } + + if (!skb_queue_empty(&sd->tofree_queue)) + raise_softirq_irqoff(NET_RX_SOFTIRQ); } static int napi_gro_complete(struct sk_buff *skb) @@ -4514,10 +4517,17 @@ static void net_rx_action(struct softirq_action *h) struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + 2; int budget = netdev_budget; + struct sk_buff *skb; void *have; local_irq_disable(); + while ((skb = __skb_dequeue(&sd->tofree_queue))) { + local_irq_enable(); + kfree_skb(skb); + local_irq_disable(); + } + while (!list_empty(&sd->poll_list)) { struct napi_struct *n; int work, weight; @@ -7016,6 +7026,9 @@ static int dev_cpu_callback(struct notifier_block *nfb, netif_rx_internal(skb); input_queue_head_incr(oldsd); } + while ((skb = __skb_dequeue(&oldsd->tofree_queue))) { + kfree_skb(skb); + } return NOTIFY_OK; } @@ -7317,8 +7330,9 @@ static int __init net_dev_init(void) for_each_possible_cpu(i) { struct softnet_data *sd = &per_cpu(softnet_data, i); - skb_queue_head_init(&sd->input_pkt_queue); - skb_queue_head_init(&sd->process_queue); + skb_queue_head_init_raw(&sd->input_pkt_queue); + skb_queue_head_init_raw(&sd->process_queue); + skb_queue_head_init_raw(&sd->tofree_queue); INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS -- cgit v1.2.3 From 974fe616315ae654878c4636a4fa495819022f55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jul 2011 14:05:05 +0200 Subject: x86-no-perf-irq-work-rt.patch Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq_work.c | 2 ++ kernel/irq_work.c | 2 ++ kernel/time/timer.c | 6 +++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 15d741ddfeeb..cbfb33df3749 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -38,6 +38,7 @@ __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) exiting_irq(); } +#ifndef CONFIG_PREEMPT_RT_FULL void arch_irq_work_raise(void) { #ifdef CONFIG_X86_LOCAL_APIC @@ -48,3 +49,4 @@ void arch_irq_work_raise(void) apic_wait_icr_idle(); #endif } +#endif diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 3ab9048483fa..64a2ab5774f6 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -132,7 +132,9 @@ static void irq_work_run_list(struct llist_head *list) struct irq_work 
*work; struct llist_node *llnode; +#ifndef CONFIG_PREEMPT_RT_FULL BUG_ON(!irqs_disabled()); +#endif if (llist_empty(list)) return; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ab5f3e458fdf..d07d6370f361 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1450,7 +1450,7 @@ void update_process_times(int user_tick) scheduler_tick(); run_local_timers(); rcu_check_callbacks(cpu, user_tick); -#ifdef CONFIG_IRQ_WORK +#ifdef CONFIG_IRQ_WORK && !defined(CONFIG_PREEMPT_RT_FULL) if (in_irq()) irq_work_tick(); #endif @@ -1466,6 +1466,10 @@ static void run_timer_softirq(struct softirq_action *h) hrtimer_run_pending(); +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL) + irq_work_tick(); +#endif + if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } -- cgit v1.2.3 From c194b491f15201ef4cc7dd933962efa45ee27e0f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 31 Jan 2014 14:20:31 +0100 Subject: irq_work: allow certain work in hard irq context irq_work is processed in softirq context on -RT because we want to avoid long latencies which might arise from processing lots of perf events. The noHZ-full mode requires its callback to be called from real hardirq context (commit 76c24fb ("nohz: New APIs to re-evaluate the tick on full dynticks CPUs")). If it is called from a thread context we might get wrong results for checks like "is_idle_task(current)". This patch introduces a second list (hirq_work_list) which will be used if irq_work_run() has been invoked from hardirq context and will process only work items marked with IRQ_WORK_HARD_IRQ. This patch also removes arch_irq_work_raise() from sparc & powerpc like it is already done for x86. At least for powerpc it is somehow superfluous because it is called from the timer interrupt which should invoke update_process_times().
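On -RT the run path then looks roughly like this (a simplified sketch of the hunk further down; all names are the ones introduced by this patch):

| void irq_work_run(void)
| {
|	if (in_irq()) {
|		/* hard irq context: run only work marked IRQ_WORK_HARD_IRQ */
|		irq_work_run_list(this_cpu_ptr(&hirq_work_list));
|		return;
|	}
|	irq_work_run_list(this_cpu_ptr(&raised_list));
|	irq_work_run_list(this_cpu_ptr(&lazy_list));
| }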
Signed-off-by: Sebastian Andrzej Siewior --- arch/arm/kernel/smp.c | 2 ++ arch/arm64/kernel/smp.c | 2 ++ arch/powerpc/kernel/time.c | 2 +- arch/sparc/kernel/pcr.c | 2 ++ include/linux/irq_work.h | 1 + kernel/irq_work.c | 57 ++++++++++++++++++++++++++++++++++++++++------ kernel/time/tick-sched.c | 1 + kernel/time/timer.c | 2 +- 8 files changed, 60 insertions(+), 9 deletions(-) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index a8e32aaf0383..fe912bc87c33 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -506,12 +506,14 @@ void arch_send_call_function_single_ipi(int cpu) } #ifdef CONFIG_IRQ_WORK +#ifndef CONFIG_PREEMPT_RT_FULL void arch_irq_work_raise(void) { if (arch_irq_work_has_interrupt()) smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); } #endif +#endif #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST void tick_broadcast(const struct cpumask *mask) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 0ef87896e4ae..ff2a40b21ad9 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -531,12 +531,14 @@ void arch_send_call_function_single_ipi(int cpu) } #ifdef CONFIG_IRQ_WORK +#ifndef CONFIG_PREEMPT_RT_FULL void arch_irq_work_raise(void) { if (__smp_cross_call) smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); } #endif +#endif static DEFINE_RAW_SPINLOCK(stop_lock); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 7505599c2593..550a14dd313b 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -424,7 +424,7 @@ unsigned long profile_pc(struct pt_regs *regs) EXPORT_SYMBOL(profile_pc); #endif -#ifdef CONFIG_IRQ_WORK +#if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL) /* * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c index 7e967c8018c8..927d9c5e50f5 100644 --- a/arch/sparc/kernel/pcr.c +++ b/arch/sparc/kernel/pcr.c @@ -43,10 +43,12 @@ void __irq_entry deferred_pcr_work_irq(int irq, struct pt_regs *regs) set_irq_regs(old_regs); } +#ifndef CONFIG_PREEMPT_RT_FULL void arch_irq_work_raise(void) { set_softint(1 << PIL_DEFERRED_PCR_WORK); } +#endif const struct pcr_ops *pcr_ops; EXPORT_SYMBOL_GPL(pcr_ops); diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index bf3fe719c7ce..30ef6c214e6f 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -16,6 +16,7 @@ #define IRQ_WORK_BUSY 2UL #define IRQ_WORK_FLAGS 3UL #define IRQ_WORK_LAZY 4UL /* Doesn't want IPI, wait for tick */ +#define IRQ_WORK_HARD_IRQ 8UL /* Run hard IRQ context, even on RT */ struct irq_work { unsigned long flags; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 64a2ab5774f6..0c6491228b17 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -22,7 +22,9 @@ static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); - +#ifdef CONFIG_PREEMPT_RT_FULL +static DEFINE_PER_CPU(struct llist_head, hirq_work_list); +#endif /* * Claim the entry so that no one else will poke at it. 
*/ @@ -49,7 +51,11 @@ static bool irq_work_claim(struct irq_work *work) return true; } +#ifdef CONFIG_PREEMPT_RT_FULL +void arch_irq_work_raise(void) +#else void __weak arch_irq_work_raise(void) +#endif { /* * Lame architectures will get the timer tick callback @@ -65,6 +71,8 @@ void __weak arch_irq_work_raise(void) */ bool irq_work_queue_on(struct irq_work *work, int cpu) { + bool raise_irqwork; + /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(cpu)); @@ -75,7 +83,19 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (!irq_work_claim(work)) return false; - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) +#ifdef CONFIG_PREEMPT_RT_FULL + if (work->flags & IRQ_WORK_HARD_IRQ) + raise_irqwork = llist_add(&work->llnode, + &per_cpu(hirq_work_list, cpu)); + else + raise_irqwork = llist_add(&work->llnode, + &per_cpu(lazy_list, cpu)); +#else + raise_irqwork = llist_add(&work->llnode, + &per_cpu(raised_list, cpu)); +#endif + + if (raise_irqwork) arch_send_call_function_single_ipi(cpu); return true; @@ -93,7 +113,15 @@ bool irq_work_queue(struct irq_work *work) /* Queue the entry and raise the IPI if needed. */ preempt_disable(); - /* If the work is "lazy", handle it from next tick if any */ +#ifdef CONFIG_PREEMPT_RT_FULL + if (work->flags & IRQ_WORK_HARD_IRQ) { + if (llist_add(&work->llnode, this_cpu_ptr(&hirq_work_list))) + arch_irq_work_raise(); + } else { + if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list))) + arch_irq_work_raise(); + } +#else if (work->flags & IRQ_WORK_LAZY) { if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) @@ -102,6 +130,7 @@ bool irq_work_queue(struct irq_work *work) if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) arch_irq_work_raise(); } +#endif preempt_enable(); @@ -116,9 +145,10 @@ bool irq_work_needs_cpu(void) raised = this_cpu_ptr(&raised_list); lazy = this_cpu_ptr(&lazy_list); - if (llist_empty(raised) || arch_irq_work_has_interrupt()) + if (llist_empty(raised)) if (llist_empty(lazy)) - return false; + if (llist_empty(this_cpu_ptr(&hirq_work_list))) + return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); @@ -170,6 +200,12 @@ static void irq_work_run_list(struct llist_head *list) */ void irq_work_run(void) { +#ifdef CONFIG_PREEMPT_RT_FULL + if (in_irq()) { + irq_work_run_list(this_cpu_ptr(&hirq_work_list)); + return; + } +#endif irq_work_run_list(this_cpu_ptr(&raised_list)); irq_work_run_list(this_cpu_ptr(&lazy_list)); } @@ -177,9 +213,16 @@ EXPORT_SYMBOL_GPL(irq_work_run); void irq_work_tick(void) { - struct llist_head *raised = &__get_cpu_var(raised_list); + struct llist_head *raised; - if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) +#ifdef CONFIG_PREEMPT_RT_FULL + if (in_irq()) { + irq_work_run_list(this_cpu_ptr(&hirq_work_list)); + return; + } +#endif + raised = &__get_cpu_var(raised_list); + if (!llist_empty(raised)) irq_work_run_list(raised); irq_work_run_list(&__get_cpu_var(lazy_list)); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4e9e92ab8b33..7405d0105892 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -227,6 +227,7 @@ static void nohz_full_kick_work_func(struct irq_work *work) static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { .func = nohz_full_kick_work_func, + .flags = IRQ_WORK_HARD_IRQ, }; /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d07d6370f361..b617b9a9400d 100644 --- 
a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1450,7 +1450,7 @@ void update_process_times(int user_tick) scheduler_tick(); run_local_timers(); rcu_check_callbacks(cpu, user_tick); -#ifdef CONFIG_IRQ_WORK && !defined(CONFIG_PREEMPT_RT_FULL) +#ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_tick(); #endif -- cgit v1.2.3 From 6c9ec4c23b32af2685ef04fe4f4d776db2644d1d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2015 18:08:57 -0400 Subject: irq_work: Hide access to hirq_work_list in PREEMPT_RT_FULL The hirq_work_list is only defined when PREEMPT_RT_FULL is configured. Most access to it is within an #ifdef CONFIG_PREEMPT_RT_FULL, except for one. Encapsulate that location too. Signed-off-by: Steven Rostedt --- kernel/irq_work.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 0c6491228b17..4b53470bf97b 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -147,7 +147,9 @@ bool irq_work_needs_cpu(void) if (llist_empty(raised)) if (llist_empty(lazy)) +#ifdef CONFIG_PREEMPT_RT_FULL if (llist_empty(this_cpu_ptr(&hirq_work_list))) +#endif return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); -- cgit v1.2.3 From 48f9167d64df2c9fc22501fc33794248b9984d45 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 10 Apr 2015 11:50:22 +0200 Subject: kernel/irq_work: fix no_hz deadlock Invoking NO_HZ's irq_work callback from the timer irq is not working very well if the callback decides to invoke hrtimer_cancel(): |hrtimer_try_to_cancel+0x55/0x5f |hrtimer_cancel+0x16/0x28 |tick_nohz_restart+0x17/0x72 |__tick_nohz_full_check+0x8e/0x93 |nohz_full_kick_work_func+0xe/0x10 |irq_work_run_list+0x39/0x57 |irq_work_tick+0x60/0x67 |update_process_times+0x57/0x67 |tick_sched_handle+0x4a/0x59 |tick_sched_timer+0x3b/0x64 |__run_hrtimer+0x7a/0x149 |hrtimer_interrupt+0x1cc/0x2c5 and here we deadlock while waiting for the lock which we are holding. To fix this I'm doing the same thing that upstream is doing: use the dedicated irq_work IRQ only for what is marked as "hirq", which should only be the FULL_NO_HZ related work.
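The -RT queueing decision in irq_work_queue() then becomes, roughly (a simplified excerpt of the hunk below, using only names that appear in this patch):

| if (work->flags & IRQ_WORK_HARD_IRQ) {
|	if (llist_add(&work->llnode, this_cpu_ptr(&hirq_work_list)))
|		arch_irq_work_raise();
| } else {
|	/* non-"hirq" work is deferred to the timer softirq */
|	if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
|	    tick_nohz_tick_stopped())
|		raise_softirq(TIMER_SOFTIRQ);
| }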
Reported-by: Carsten Emde Signed-off-by: Sebastian Andrzej Siewior --- arch/arm/kernel/smp.c | 2 -- arch/arm64/kernel/smp.c | 2 -- arch/powerpc/kernel/time.c | 2 +- arch/sparc/kernel/pcr.c | 2 -- arch/x86/kernel/irq_work.c | 2 -- kernel/irq_work.c | 33 +++++++++++++-------------------- kernel/time/tick-sched.c | 5 +++++ kernel/time/timer.c | 6 +++--- 8 files changed, 22 insertions(+), 32 deletions(-) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index fe912bc87c33..a8e32aaf0383 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -506,14 +506,12 @@ void arch_send_call_function_single_ipi(int cpu) } #ifdef CONFIG_IRQ_WORK -#ifndef CONFIG_PREEMPT_RT_FULL void arch_irq_work_raise(void) { if (arch_irq_work_has_interrupt()) smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); } #endif -#endif #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST void tick_broadcast(const struct cpumask *mask) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index ff2a40b21ad9..0ef87896e4ae 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -531,14 +531,12 @@ void arch_send_call_function_single_ipi(int cpu) } #ifdef CONFIG_IRQ_WORK -#ifndef CONFIG_PREEMPT_RT_FULL void arch_irq_work_raise(void) { if (__smp_cross_call) smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); } #endif -#endif static DEFINE_RAW_SPINLOCK(stop_lock); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 550a14dd313b..f4de021815ae 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -424,7 +424,7 @@ unsigned long profile_pc(struct pt_regs *regs) EXPORT_SYMBOL(profile_pc); #endif -#if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL) +#if defined(CONFIG_IRQ_WORK) /* * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... 
diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c index 927d9c5e50f5..7e967c8018c8 100644 --- a/arch/sparc/kernel/pcr.c +++ b/arch/sparc/kernel/pcr.c @@ -43,12 +43,10 @@ void __irq_entry deferred_pcr_work_irq(int irq, struct pt_regs *regs) set_irq_regs(old_regs); } -#ifndef CONFIG_PREEMPT_RT_FULL void arch_irq_work_raise(void) { set_softint(1 << PIL_DEFERRED_PCR_WORK); } -#endif const struct pcr_ops *pcr_ops; EXPORT_SYMBOL_GPL(pcr_ops); diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index cbfb33df3749..15d741ddfeeb 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -38,7 +38,6 @@ __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) exiting_irq(); } -#ifndef CONFIG_PREEMPT_RT_FULL void arch_irq_work_raise(void) { #ifdef CONFIG_X86_LOCAL_APIC @@ -49,4 +48,3 @@ void arch_irq_work_raise(void) apic_wait_icr_idle(); #endif } -#endif diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 4b53470bf97b..9dda38a377bf 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -51,11 +52,7 @@ static bool irq_work_claim(struct irq_work *work) return true; } -#ifdef CONFIG_PREEMPT_RT_FULL -void arch_irq_work_raise(void) -#else void __weak arch_irq_work_raise(void) -#endif { /* * Lame architectures will get the timer tick callback @@ -118,8 +115,9 @@ bool irq_work_queue(struct irq_work *work) if (llist_add(&work->llnode, this_cpu_ptr(&hirq_work_list))) arch_irq_work_raise(); } else { - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list))) - arch_irq_work_raise(); + if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && + tick_nohz_tick_stopped()) + raise_softirq(TIMER_SOFTIRQ); } #else if (work->flags & IRQ_WORK_LAZY) { @@ -203,30 +201,25 @@ static void irq_work_run_list(struct llist_head *list) void irq_work_run(void) { #ifdef CONFIG_PREEMPT_RT_FULL - if (in_irq()) { - irq_work_run_list(this_cpu_ptr(&hirq_work_list)); - return; - } -#endif + irq_work_run_list(this_cpu_ptr(&hirq_work_list)); +#else irq_work_run_list(this_cpu_ptr(&raised_list)); irq_work_run_list(this_cpu_ptr(&lazy_list)); +#endif } EXPORT_SYMBOL_GPL(irq_work_run); void irq_work_tick(void) { - struct llist_head *raised; - #ifdef CONFIG_PREEMPT_RT_FULL - if (in_irq()) { - irq_work_run_list(this_cpu_ptr(&hirq_work_list)); - return; - } -#endif - raised = &__get_cpu_var(raised_list); - if (!llist_empty(raised)) + irq_work_run_list(this_cpu_ptr(&lazy_list)); +#else + struct llist_head *raised = &__get_cpu_var(raised_list); + + if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); irq_work_run_list(&__get_cpu_var(lazy_list)); +#endif } /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7405d0105892..3a43989f0ce0 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -181,6 +181,11 @@ static bool can_stop_full_tick(void) return false; } + if (!arch_irq_work_has_interrupt()) { + trace_tick_stop(0, "missing irq work interrupt\n"); + return false; + } + /* sched_clock_tick() needs us? 
*/ #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index b617b9a9400d..a29ab1a17023 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1450,9 +1450,9 @@ void update_process_times(int user_tick) scheduler_tick(); run_local_timers(); rcu_check_callbacks(cpu, user_tick); -#ifdef CONFIG_IRQ_WORK - if (in_irq()) - irq_work_tick(); + +#if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL) + irq_work_tick(); #endif run_posix_cpu_timers(p); } -- cgit v1.2.3 From 8415df94777693a16dd2dad4c2fde72c35156f8d Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 20 May 2015 00:23:51 +0200 Subject: irq_work: Delegate non-immediate irq work to ksoftirqd Based on a patch from Jan Kiszka. Jan reported that ftrace queueing work from arbitrary contexts can and does lead to deadlock. trace-cmd -e sched:* deadlocked in fact. Resolve the problem by delegating all non-immediate work to ksoftirqd. We need two lists to do this, one for hard irq, one for soft, so we can use the two existing lists, eliminating the -rt specific list and all of the ifdefery while we're at it. Strategy: Queue work tagged for hirq invocation to the raised_list, invoke via IPI as usual. If a work item being queued to the lazy_list, which becomes our "all others" list, is not a lazy work item, or the tick is stopped, fire an IPI to raise SOFTIRQ_TIMER immediately; otherwise let ksoftirqd find it when the tick comes along. Raising SOFTIRQ_TIMER via IPI even when queueing locally ensures delegation. Cc: stable-rt@vger.kernel.org Acked-by: Jan Kiszka Signed-off-by: Mike Galbraith --- kernel/irq_work.c | 85 +++++++++++++++++++++---------------------------------- 1 file changed, 33 insertions(+), 52 deletions(-) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 9dda38a377bf..9678fd1382a7 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -23,9 +23,7 @@ static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); -#ifdef CONFIG_PREEMPT_RT_FULL -static DEFINE_PER_CPU(struct llist_head, hirq_work_list); -#endif + /* * Claim the entry so that no one else will poke at it.
*/ @@ -68,7 +66,7 @@ void __weak arch_irq_work_raise(void) */ bool irq_work_queue_on(struct irq_work *work, int cpu) { - bool raise_irqwork; + struct llist_head *list; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(cpu)); @@ -80,19 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (!irq_work_claim(work)) return false; -#ifdef CONFIG_PREEMPT_RT_FULL - if (work->flags & IRQ_WORK_HARD_IRQ) - raise_irqwork = llist_add(&work->llnode, - &per_cpu(hirq_work_list, cpu)); + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ)) + list = &per_cpu(lazy_list, cpu); else - raise_irqwork = llist_add(&work->llnode, - &per_cpu(lazy_list, cpu)); -#else - raise_irqwork = llist_add(&work->llnode, - &per_cpu(raised_list, cpu)); -#endif + list = &per_cpu(raised_list, cpu); - if (raise_irqwork) + if (llist_add(&work->llnode, list)) arch_send_call_function_single_ipi(cpu); return true; @@ -103,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on); /* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) { + struct llist_head *list; + bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL); + /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; @@ -110,25 +104,17 @@ bool irq_work_queue(struct irq_work *work) /* Queue the entry and raise the IPI if needed. */ preempt_disable(); -#ifdef CONFIG_PREEMPT_RT_FULL - if (work->flags & IRQ_WORK_HARD_IRQ) { - if (llist_add(&work->llnode, this_cpu_ptr(&hirq_work_list))) - arch_irq_work_raise(); - } else { - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && - tick_nohz_tick_stopped()) - raise_softirq(TIMER_SOFTIRQ); - } -#else - if (work->flags & IRQ_WORK_LAZY) { - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && - tick_nohz_tick_stopped()) - arch_irq_work_raise(); - } else { - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) + lazy_work = work->flags & IRQ_WORK_LAZY; + + if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ))) + list = this_cpu_ptr(&lazy_list); + else + list = this_cpu_ptr(&raised_list); + + if (llist_add(&work->llnode, list)) { + if (!lazy_work || tick_nohz_tick_stopped()) arch_irq_work_raise(); } -#endif preempt_enable(); @@ -143,12 +129,8 @@ bool irq_work_needs_cpu(void) raised = this_cpu_ptr(&raised_list); lazy = this_cpu_ptr(&lazy_list); - if (llist_empty(raised)) - if (llist_empty(lazy)) -#ifdef CONFIG_PREEMPT_RT_FULL - if (llist_empty(this_cpu_ptr(&hirq_work_list))) -#endif - return false; + if (llist_empty(raised) && llist_empty(lazy)) + return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); @@ -162,9 +144,7 @@ static void irq_work_run_list(struct llist_head *list) struct irq_work *work; struct llist_node *llnode; -#ifndef CONFIG_PREEMPT_RT_FULL - BUG_ON(!irqs_disabled()); -#endif + BUG_ON(!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !irqs_disabled()); if (llist_empty(list)) return; @@ -200,26 +180,27 @@ static void irq_work_run_list(struct llist_head *list) */ void irq_work_run(void) { -#ifdef CONFIG_PREEMPT_RT_FULL - irq_work_run_list(this_cpu_ptr(&hirq_work_list)); -#else irq_work_run_list(this_cpu_ptr(&raised_list)); - irq_work_run_list(this_cpu_ptr(&lazy_list)); -#endif + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) { + /* + * NOTE: we raise softirq via IPI for safety, + * and execute in irq_work_tick() to move the + * overhead from hard to soft irq context. 
+ */ + if (!llist_empty(this_cpu_ptr(&lazy_list))) + raise_softirq(TIMER_SOFTIRQ); + } else + irq_work_run_list(this_cpu_ptr(&lazy_list)); } EXPORT_SYMBOL_GPL(irq_work_run); void irq_work_tick(void) { -#ifdef CONFIG_PREEMPT_RT_FULL - irq_work_run_list(this_cpu_ptr(&lazy_list)); -#else - struct llist_head *raised = &__get_cpu_var(raised_list); + struct llist_head *raised = this_cpu_ptr(&raised_list); if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); - irq_work_run_list(&__get_cpu_var(lazy_list)); -#endif + irq_work_run_list(this_cpu_ptr(&lazy_list)); } /* -- cgit v1.2.3 From ddd8396c7da35c6c8afe823f1cc7ae0683440fdd Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 18 Feb 2015 15:09:23 +0100 Subject: snd/pcm: fix snd_pcm_stream_lock*() irqs_disabled() splats Locking functions previously using read_lock_irq()/read_lock_irqsave() were changed to local_irq_disable/save(), leading to gripes. Use nort variants. |BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:915 |in_atomic(): 0, irqs_disabled(): 1, pid: 5947, name: alsa-sink-ALC88 |CPU: 5 PID: 5947 Comm: alsa-sink-ALC88 Not tainted 3.18.7-rt1 #9 |Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.404 11/06/2014 | ffff880409316240 ffff88040866fa38 ffffffff815bdeb5 0000000000000002 | 0000000000000000 ffff88040866fa58 ffffffff81073c86 ffffffffa03b2640 | ffff88040239ec00 ffff88040866fa78 ffffffff815c3d34 ffffffffa03b2640 |Call Trace: | [] dump_stack+0x4f/0x9e | [] __might_sleep+0xe6/0x150 | [] __rt_spin_lock+0x24/0x50 | [] rt_read_lock+0x34/0x40 | [] snd_pcm_stream_lock+0x29/0x70 [snd_pcm] | [] snd_pcm_playback_poll+0x5d/0x120 [snd_pcm] | [] do_sys_poll+0x322/0x5b0 | [] SyS_ppoll+0x1a8/0x1c0 | [] system_call_fastpath+0x16/0x1b Signed-off-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior --- sound/core/pcm_native.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 9c823cfdfff0..bb4297d12066 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -104,7 +104,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock); void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream) { if (!substream->pcm->nonatomic) - local_irq_disable(); + local_irq_disable_nort(); snd_pcm_stream_lock(substream); } EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq); @@ -113,7 +113,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream) { snd_pcm_stream_unlock(substream); if (!substream->pcm->nonatomic) - local_irq_enable(); + local_irq_enable_nort(); } EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq); @@ -121,7 +121,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream) { unsigned long flags = 0; if (!substream->pcm->nonatomic) - local_irq_save(flags); + local_irq_save_nort(flags); snd_pcm_stream_lock(substream); return flags; } @@ -132,7 +132,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream, { snd_pcm_stream_unlock(substream); if (!substream->pcm->nonatomic) - local_irq_restore(flags); + local_irq_restore_nort(flags); } EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore); -- cgit v1.2.3 From 822a01205cff28955b61bfdf31a7ef8fcf8ec2ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Sep 2012 14:50:37 +0200 Subject: printk-rt-aware.patch Signed-off-by: Thomas Gleixner --- kernel/printk/printk.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/kernel/printk/printk.c 
b/kernel/printk/printk.c index c2cc106455f3..647e242c2e47 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1407,6 +1407,7 @@ static void call_console_drivers(int level, const char *text, size_t len) if (!console_drivers) return; + migrate_disable(); for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; @@ -1419,6 +1420,7 @@ static void call_console_drivers(int level, const char *text, size_t len) continue; con->write(con, text, len); } + migrate_enable(); } /* @@ -1479,6 +1481,15 @@ static inline int can_use_console(unsigned int cpu) static int console_trylock_for_printk(void) { unsigned int cpu = smp_processor_id(); +#ifdef CONFIG_PREEMPT_RT_FULL + int lock = !early_boot_irqs_disabled && (preempt_count() == 0) && + !irqs_disabled(); +#else + int lock = 1; +#endif + + if (!lock) + return 0; if (!console_trylock()) return 0; @@ -1832,8 +1843,7 @@ asmlinkage int vprintk_emit(int facility, int level, * console_sem which would prevent anyone from printing to * console */ - preempt_disable(); - + migrate_disable(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up @@ -1841,7 +1851,7 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (console_trylock_for_printk()) console_unlock(); - preempt_enable(); + migrate_enable(); lockdep_on(); } @@ -2180,11 +2190,16 @@ static void console_cont_flush(char *text, size_t size) goto out; len = cont_print_text(text, size); +#ifndef CONFIG_PREEMPT_RT_FULL raw_spin_unlock(&logbuf_lock); stop_critical_timings(); call_console_drivers(cont.level, text, len); start_critical_timings(); local_irq_restore(flags); +#else + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + call_console_drivers(cont.level, text, len); +#endif return; out: raw_spin_unlock_irqrestore(&logbuf_lock, flags); @@ -2272,12 +2287,17 @@ skip: console_idx = log_next(console_idx); console_seq++; console_prev = msg->flags; +#ifdef CONFIG_PREEMPT_RT_FULL + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + call_console_drivers(level, text, len); +#else raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(level, text, len); start_critical_timings(); local_irq_restore(flags); +#endif } console_locked = 0; -- cgit v1.2.3 From 24957c039befa8c8035e7b6c758f7b8cdc0d9fb3 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Mar 2013 19:01:05 +0100 Subject: HACK: printk: drop the logbuf_lock more often The lock is held with irqs off. The latency drops by 500us+ on my ARM box with a "full" buffer after executing "dmesg" on the shell.
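The hack is, roughly, to drop and re-take the lock every few records while dumping the buffer (a simplified excerpt of the syslog_print_all() change below; num_msg and the try_again label are introduced by this patch):

| num_msg++;
| if (num_msg > 5) {
|	num_msg = 0;
|	raw_spin_unlock_irq(&logbuf_lock);
|	raw_spin_lock_irq(&logbuf_lock);
|	/* records may have been purged while the lock was dropped */
|	if (clear_seq < log_first_seq)
|		goto try_again;
| }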
Signed-off-by: Sebastian Andrzej Siewior --- kernel/printk/printk.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 647e242c2e47..4a27612b44c3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1165,6 +1165,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) { char *text; int len = 0; + int attempts = 0; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) @@ -1176,7 +1177,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 seq; u32 idx; enum log_flags prev; - + int num_msg; +try_again: + attempts++; + if (attempts > 10) { + len = -EBUSY; + goto out; + } + num_msg = 0; if (clear_seq < log_first_seq) { /* messages are gone, move to first available one */ clear_seq = log_first_seq; @@ -1197,6 +1205,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) prev = msg->flags; idx = log_next(idx); seq++; + num_msg++; + if (num_msg > 5) { + num_msg = 0; + raw_spin_unlock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); + if (clear_seq < log_first_seq) + goto try_again; + } } /* move first record forward until length fits into the buffer */ @@ -1210,6 +1226,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) prev = msg->flags; idx = log_next(idx); seq++; + num_msg++; + if (num_msg > 5) { + num_msg = 0; + raw_spin_unlock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); + if (clear_seq < log_first_seq) + goto try_again; + } } /* last message fitting into this dump */ @@ -1250,6 +1274,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) clear_seq = log_next_seq; clear_idx = log_next_idx; } +out: raw_spin_unlock_irq(&logbuf_lock); kfree(text); -- cgit v1.2.3 From 70ea0c69f759c8347fbf921e358b630d3221eaa7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 May 2015 00:23:51 +0200 Subject: Powerpc: Use generic rwsem on RT Signed-off-by: Thomas Gleixner --- arch/powerpc/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 88eace4e28c3..72ba67435953 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -60,10 +60,11 @@ config LOCKDEP_SUPPORT config RWSEM_GENERIC_SPINLOCK bool + default y if PREEMPT_RT_FULL config RWSEM_XCHGADD_ALGORITHM bool - default y + default y if !PREEMPT_RT_FULL config GENERIC_LOCKBREAK bool -- cgit v1.2.3 From 7cf617d0961058bccb292411b49b25a486f92c0a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 17:08:34 +0200 Subject: power-disable-highmem-on-rt.patch Signed-off-by: Thomas Gleixner --- arch/powerpc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 72ba67435953..8d9d1d6f16d3 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -304,7 +304,7 @@ menu "Kernel options" config HIGHMEM bool "High memory support" - depends on PPC32 + depends on PPC32 && !PREEMPT_RT_FULL source kernel/Kconfig.hz source kernel/Kconfig.preempt -- cgit v1.2.3 From c8eb2680f5866171c440c276a15246aa8cde8505 Mon Sep 17 00:00:00 2001 From: Bogdan Purcareata Date: Fri, 24 Apr 2015 15:53:13 +0000 Subject: powerpc/kvm: Disable in-kernel MPIC emulation for PREEMPT_RT_FULL While converting the openpic emulation code to use a raw_spinlock_t enables guests to run on RT, there's still a performance issue. 
For interrupts sent in directed delivery mode with a multiple CPU mask, the emulated openpic will loop through all of the VCPUs, and for each VCPU it calls IRQ_check, which will loop through all the pending interrupts for that VCPU. This is done while holding the raw_lock, meaning that in all this time the interrupts and preemption are disabled on the host Linux. A malicious user app can max out both of these numbers and cause a DoS. This temporary fix is sent for two reasons. First, so that users who want to use the in-kernel MPIC emulation are aware of the potential latencies, thus making sure that the hardware MPIC and their usage scenario do not involve interrupts sent in directed delivery mode, and that the number of possible pending interrupts is kept small. Secondly, this should incentivize the development of a proper openpic emulation that would be better suited for RT. Acked-by: Scott Wood Signed-off-by: Bogdan Purcareata Signed-off-by: Sebastian Andrzej Siewior --- arch/powerpc/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 602eb51d20bc..60fc1adab19f 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -157,6 +157,7 @@ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" depends on KVM && E500 + depends on !PREEMPT_RT_FULL select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING -- cgit v1.2.3 From 809fe22306745ca0aeac5d818f84cd6d6820f174 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 17:09:28 +0200 Subject: arm-disable-highmem-on-rt.patch Signed-off-by: Thomas Gleixner --- arch/arm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 89c4b5ccc68d..2fec345c24d6 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1667,7 +1667,7 @@ config HAVE_GENERIC_RCU_GUP config HIGHMEM bool "High Memory Support" - depends on MMU + depends on MMU && !PREEMPT_RT_FULL help The address space of ARM processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address -- cgit v1.2.3 From 1e81a3d26df12b4dee057d1184626e567d656ca9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 May 2010 18:29:35 +0200 Subject: ARM: at91: tclib: Default to tclib timer for RT RT is not too happy about the shared timer interrupt in AT91 devices. Default to tclib timer for RT. Signed-off-by: Thomas Gleixner --- drivers/misc/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index d31416bfe325..9f06e82e5aa7 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI config ATMEL_TCLIB bool "Atmel AT32/AT91 Timer/Counter Library" depends on (AVR32 || ARCH_AT91) + default y if PREEMPT_RT_FULL help Select this if you want a library to allocate the Timer/Counter blocks found on many Atmel processors. This facilitates using @@ -86,7 +87,7 @@ config ATMEL_TCB_CLKSRC_BLOCK config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK bool "TC Block use 32 KiHz clock" depends on ATMEL_TCB_CLKSRC - default y + default y if !PREEMPT_RT_FULL help Select this to use 32 KiHz base clock rate as TC block clock source for clock events.
-- cgit v1.2.3 From 4e3277d485f3d9827f91cb7ccd197053ffb61286 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 20 Sep 2013 14:31:54 +0200 Subject: arm/unwind: use a raw_spin_lock Mostly unwind is done with irqs enabled however SLUB may call it with irqs disabled while creating a new SLUB cache. I had system freeze while loading a module which called kmem_cache_create() on init. That means SLUB's __slab_alloc() disabled interrupts and then ->new_slab_objects() ->new_slab() ->setup_object() ->setup_object_debug() ->init_tracking() ->set_track() ->save_stack_trace() ->save_stack_trace_tsk() ->walk_stackframe() ->unwind_frame() ->unwind_find_idx() =>spin_lock_irqsave(&unwind_lock); Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- arch/arm/kernel/unwind.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c index cbb85c5fabf9..5184fe85d167 100644 --- a/arch/arm/kernel/unwind.c +++ b/arch/arm/kernel/unwind.c @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[]; static const struct unwind_idx *__origin_unwind_idx; extern const struct unwind_idx __stop_unwind_idx[]; -static DEFINE_SPINLOCK(unwind_lock); +static DEFINE_RAW_SPINLOCK(unwind_lock); static LIST_HEAD(unwind_tables); /* Convert a prel31 symbol to an absolute address */ @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr) /* module unwind tables */ struct unwind_table *table; - spin_lock_irqsave(&unwind_lock, flags); + raw_spin_lock_irqsave(&unwind_lock, flags); list_for_each_entry(table, &unwind_tables, list) { if (addr >= table->begin_addr && addr < table->end_addr) { @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr) break; } } - spin_unlock_irqrestore(&unwind_lock, flags); + raw_spin_unlock_irqrestore(&unwind_lock, flags); } pr_debug("%s: idx = %p\n", __func__, idx); @@ -530,9 +530,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size, tab->begin_addr = text_addr; tab->end_addr = text_addr + text_size; - spin_lock_irqsave(&unwind_lock, flags); + raw_spin_lock_irqsave(&unwind_lock, flags); list_add_tail(&tab->list, &unwind_tables); - spin_unlock_irqrestore(&unwind_lock, flags); + raw_spin_unlock_irqrestore(&unwind_lock, flags); return tab; } @@ -544,9 +544,9 @@ void unwind_table_del(struct unwind_table *tab) if (!tab) return; - spin_lock_irqsave(&unwind_lock, flags); + raw_spin_lock_irqsave(&unwind_lock, flags); list_del(&tab->list); - spin_unlock_irqrestore(&unwind_lock, flags); + raw_spin_unlock_irqrestore(&unwind_lock, flags); kfree(tab); } -- cgit v1.2.3 From bf7fce8830fef4196649740fc7e2eb3687eb2dc2 Mon Sep 17 00:00:00 2001 From: "Yadi.hu" Date: Wed, 10 Dec 2014 10:32:09 +0800 Subject: ARM: enable irq in translation/section permission fault handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Probably happens on all ARM, with CONFIG_PREEMPT_RT_FULL CONFIG_DEBUG_ATOMIC_SLEEP This simple program.... int main() { *((char*)0xc0001000) = 0; }; [ 512.742724] BUG: sleeping function called from invalid context at kernel/rtmutex.c:658 [ 512.743000] in_atomic(): 0, irqs_disabled(): 128, pid: 994, name: a [ 512.743217] INFO: lockdep is turned off. 
[ 512.743360] irq event stamp: 0 [ 512.743482] hardirqs last enabled at (0): [< (null)>] (null) [ 512.743714] hardirqs last disabled at (0): [] copy_process+0x3b0/0x11c0 [ 512.744013] softirqs last enabled at (0): [] copy_process+0x3b0/0x11c0 [ 512.744303] softirqs last disabled at (0): [< (null)>] (null) [ 512.744631] [] (unwind_backtrace+0x0/0x104) [ 512.745001] [] (dump_stack+0x20/0x24) [ 512.745355] [] (__might_sleep+0x1dc/0x1e0) [ 512.745717] [] (rt_spin_lock+0x34/0x6c) [ 512.746073] [] (do_force_sig_info+0x34/0xf0) [ 512.746457] [] (force_sig_info+0x18/0x1c) [ 512.746829] [] (__do_user_fault+0x9c/0xd8) [ 512.747185] [] (do_bad_area+0x7c/0x94) [ 512.747536] [] (do_sect_fault+0x40/0x48) [ 512.747898] [] (do_DataAbort+0x40/0xa0) [ 512.748181] Exception stack(0xecaa1fb0 to 0xecaa1ff8) Oxc0000000 belongs to kernel address space, user task can not be allowed to access it. For above condition, correct result is that test case should receive a “segment fault” and exits but not stacks. the root cause is commit 02fe2845d6a8 ("avoid enabling interrupts in prefetch/data abort handlers"),it deletes irq enable block in Data abort assemble code and move them into page/breakpiont/alignment fault handlers instead. But author does not enable irq in translation/section permission fault handlers. ARM disables irq when it enters exception/ interrupt mode, if kernel doesn't enable irq, it would be still disabled during translation/section permission fault. We see the above splat because do_force_sig_info is still called with IRQs off, and that code eventually does a: spin_lock_irqsave(&t->sighand->siglock, flags); As this is architecture independent code, and we've not seen any other need for other arch to have the siglock converted to raw lock, we can conclude that we should enable irq for ARM translation/section permission exception. Cc: stable-rt@vger.kernel.org Signed-off-by: Yadi.hu Signed-off-by: Sebastian Andrzej Siewior --- arch/arm/mm/fault.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index b40d4bab8e07..c15d2a0826c6 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -431,6 +431,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); + if (interrupts_enabled(regs)) + local_irq_enable(); + if (user_mode(regs)) goto bad_area; @@ -498,6 +501,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { + if (interrupts_enabled(regs)) + local_irq_enable(); + do_bad_area(addr, fsr, regs); return 0; } -- cgit v1.2.3 From 889d5bfb180b48ddb054745df759b5f98b8aa3e7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 18 Feb 2015 14:07:21 +0100 Subject: arm/futex: disable preemption during futex_atomic_cmpxchg_inatomic() The ARM UP implementation of futex_atomic_cmpxchg_inatomic() assumes that pagefault_disable() inherits a preempt disabled section. This assumtion is true for mainline but -RT reverts this and allows preemption in pagefault disabled regions. The code sequence of futex_atomic_cmpxchg_inatomic(): | x = *futex; | if (x == oldval) | *futex = newval; The problem occurs if the code is preempted after reading the futex value or after comparing it with x. While preempted, the futex owner has to be scheduled which then releases the lock (in userland because it has no waiter yet). 
Once the code is back on the CPU, it overwrites the futex value with the old PID and the waiter bit set. The workaround is to explicitly disable preemption to avoid the described race window. Debugged-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- arch/arm/include/asm/futex.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 53e69dae796f..9d861f9f215b 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -93,6 +93,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; + preempt_disable_rt(); + __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" "1: " TUSER(ldr) " %1, [%4]\n" " teq %1, %2\n" @@ -104,6 +106,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "cc", "memory"); *uval = val; + + preempt_enable_rt(); return ret; } -- cgit v1.2.3 From 5523024fdd29fe704158c15a6f630c70ddcbfb1e Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Thu, 29 Jan 2015 12:56:18 -0600 Subject: ARM: cmpxchg: define __HAVE_ARCH_CMPXCHG for armv6 and later Both pi_stress and sigwaittest in rt-test show performance gain with __HAVE_ARCH_CMPXCHG. Testing result on coretile_express_a9x4: pi_stress -p 99 --duration=300 (on linux-3.4-rc5; bigger is better) vanilla: Total inversion performed: 5493381 patched: Total inversion performed: 5621746 sigwaittest -p 99 -l 100000 (on linux-3.4-rc5-rt6; less is better) 3.4-rc5-rt6: Min 24, Cur 27, Avg 30, Max 98 patched: Min 19, Cur 21, Avg 23, Max 96 Signed-off-by: Yong Zhang Cc: Russell King Cc: Nicolas Pitre Cc: Will Deacon Cc: Catalin Marinas Cc: Thomas Gleixner Cc: linux-arm-kernel at lists.infradead.org Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- arch/arm/include/asm/cmpxchg.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/include/asm/cmpxchg.h b/arch/arm/include/asm/cmpxchg.h index abb2c3769b01..2386e9745ba4 100644 --- a/arch/arm/include/asm/cmpxchg.h +++ b/arch/arm/include/asm/cmpxchg.h @@ -129,6 +129,8 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size #else /* min ARCH >= ARMv6 */ +#define __HAVE_ARCH_CMPXCHG 1 + extern void __bad_cmpxchg(volatile void *ptr, int size); /* -- cgit v1.2.3 From ffe3ae78dd952fafea8373d0ee1151d28a3166f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jul 2011 17:10:12 +0200 Subject: mips-disable-highmem-on-rt.patch Signed-off-by: Thomas Gleixner --- arch/mips/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 9536ef912f59..d196a00b89f5 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2196,7 +2196,7 @@ config CPU_R4400_WORKAROUNDS # config HIGHMEM bool "High Memory Support" - depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA + depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL config CPU_SUPPORTS_HIGHMEM bool -- cgit v1.2.3 From fa1e1f14ec5af65c68ac86c17d0ed3cc284c5bc3 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 17 Feb 2015 14:18:24 -0800 Subject: mips: rt: Replace pagefault_* to raw version In k{un}map_coherent, pagefault_disable and pagefault_enable are called respectively, but k{un}map_coherent needs preempt disabled according to commit f8829caee311207afbc882794bdc5aa0db5caf33 ("[MIPS] Fix aliasing bug in copy_to_user_page /
copy_from_user_page") to avoid dcache alias on COW. k{un}map_coherent are just called when cpu_has_dc_aliases == 1 with VIPT cache. However, actually, the most modern MIPS processors have PIPT dcache without dcache alias issue. In such case, k{un}map_atomic will be called with preempt enabled. To fix this, we replace pagefault_* to raw version in k{un}map_coherent, which disables preempt, otherwise the following kernel panic may be caught: CPU 0 Unable to handle kernel paging request at virtual address fffffffffffd5000, epc == ffffffff80122c00, ra == ffffffff8011fbcc Oops[#1]: CPU: 0 PID: 409 Comm: runltp Not tainted 3.14.17-rt5 #1 task: 980000000fa936f0 ti: 980000000eed0000 task.ti: 980000000eed0000 $ 0 : 0000000000000000 000000001400a4e1 fffffffffffd5000 0000000000000001 $ 4 : 980000000cded000 fffffffffffd5000 980000000cdedf00 ffffffffffff00fe $ 8 : 0000000000000000 ffffffffffffff00 000000000000000d 0000000000000004 $12 : 980000000eed3fe0 000000000000a400 ffffffffa00ae278 0000000000000000 $16 : 980000000cded000 000000726eb855c8 98000000012ccfe8 ffffffff8095e0c0 $20 : ffffffff80ad0000 ffffffff8095e0c0 98000000012d0bd8 980000000fb92000 $24 : 0000000000000000 ffffffff80177fb0 $28 : 980000000eed0000 980000000eed3b60 980000000fb92060 ffffffff8011fbcc Hi : 000000000002cb02 Lo : 000000000000ee56 epc : ffffffff80122c00 copy_page+0x38/0x548 Not tainted ra : ffffffff8011fbcc copy_user_highpage+0x16c/0x180 Status: 1400a4e3 KX SX UX KERNEL EXL IE Cause : 10800408 BadVA : fffffffffffd5000 PrId : 00010000 (MIPS64R2-generic) Modules linked in: i2c_piix4 i2c_core uhci_hcd Process runltp (pid: 409, threadinfo=980000000eed0000, task=980000000fa936f0, tls=000000fff7756700) Stack : 98000000012ccfe8 980000000eeb7ba8 980000000ecc7508 000000000666da5b 000000726eb855c8 ffffffff802156e0 000000726ea4a000 98000000010007e0 980000000fb92060 0000000000000000 0000000000000000 6db6db6db6db6db7 0000000000000080 000000726eb855c8 980000000fb92000 980000000eeeec28 980000000ecc7508 980000000fb92060 0000000000000001 00000000000000a9 ffffffff80995e60 ffffffff80218910 000000001400a4e0 ffffffff804efd24 980000000ee25b90 ffffffff8079cec4 ffffffff8079d49c ffffffff80979658 000000000666da5b 980000000eeb7ba8 000000726eb855c8 00000000000000a9 980000000fb92000 980000000fa936f0 980000000eed3eb0 0000000000000001 980000000fb92088 0000000000030002 980000000ecc7508 ffffffff8011ecd0 ... Call Trace: [] copy_page+0x38/0x548 [] copy_user_highpage+0x16c/0x180 [] do_wp_page+0x658/0xcd8 [] handle_mm_fault+0x7d8/0x1070 [] __do_page_fault+0x1a0/0x508 [] resume_userspace_check+0x0/0x10 Or there may be random segmentation fault happened. Cc: stable-rt@vger.kernel.org Signed-off-by: Yang Shi Signed-off-by: Sebastian Andrzej Siewior --- arch/mips/mm/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index f42e35e42790..e09dae6ce80d 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -90,7 +90,7 @@ static void *__kmap_pgprot(struct page *page, unsigned long addr, pgprot_t prot) BUG_ON(Page_dcache_dirty(page)); - pagefault_disable(); + raw_pagefault_disable(); idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1); idx += in_interrupt() ? 
FIX_N_COLOURS : 0; vaddr = __fix_to_virt(FIX_CMAP_END - idx); @@ -146,7 +146,7 @@ void kunmap_coherent(void) tlbw_use_hazard(); write_c0_entryhi(old_ctx); local_irq_restore(flags); - pagefault_enable(); + raw_pagefault_enable(); } void copy_user_highpage(struct page *to, struct page *from, -- cgit v1.2.3 From 4bc52f6cc962d26b1b759fac19a42b7320a33f1f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 6 Oct 2011 10:48:39 -0400 Subject: net: Avoid livelock in net_tx_action() on RT qdisc_lock is taken w/o disabling interrupts or bottom halfs. So code holding a qdisc_lock() can be interrupted and softirqs can run on the return of interrupt in !RT. The spin_trylock() in net_tx_action() makes sure, that the softirq does not deadlock. When the lock can't be acquired q is requeued and the NET_TX softirq is raised. That causes the softirq to run over and over. That works in mainline as do_softirq() has a retry loop limit and leaves the softirq processing in the interrupt return path and schedules ksoftirqd. The task which holds qdisc_lock cannot be preempted, so the lock is released and either ksoftirqd or the next softirq in the return from interrupt path can proceed. Though it's a bit strange to actually run MAX_SOFTIRQ_RESTART (10) loops before it decides to bail out even if it's clear in the first iteration :) On RT all softirq processing is done in a FIFO thread and we don't have a loop limit, so ksoftirqd preempts the lock holder forever and unqueues and requeues until the reset button is hit. Due to the forced threading of ksoftirqd on RT we actually cannot deadlock on qdisc_lock because it's a "sleeping lock". So it's safe to replace the spin_trylock() with a spin_lock(). When contended, ksoftirqd is scheduled out and the lock holder can proceed. [ tglx: Massaged changelog and code comments ] Solved-by: Thomas Gleixner Signed-off-by: Steven Rostedt Tested-by: Carsten Emde Cc: Clark Williams Cc: John Kacur Cc: Luis Claudio R. Goncalves Signed-off-by: Thomas Gleixner --- net/core/dev.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 0b77a570b35a..a98e98bc85cf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3412,6 +3412,36 @@ int netif_rx_ni(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx_ni); +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * RT runs ksoftirqd as a real time thread and the root_lock is a + * "sleeping spinlock". If the trylock fails then we can go into an + * infinite loop when ksoftirqd preempted the task which actually + * holds the lock, because we requeue q and raise NET_TX softirq + * causing ksoftirqd to loop forever. + * + * It's safe to use spin_lock on RT here as softirqs run in thread + * context and cannot deadlock against the thread which is holding + * root_lock. + * + * On !RT the trylock might fail, but there we bail out from the + * softirq loop after 10 attempts which we can't do on RT. 
And the + * task holding root_lock cannot be preempted, so the only downside of + * that trylock is that we need 10 loops to decide that we should have + * given up in the first one :) + */ +static inline int take_root_lock(spinlock_t *lock) +{ + spin_lock(lock); + return 1; +} +#else +static inline int take_root_lock(spinlock_t *lock) +{ + return spin_trylock(lock); +} +#endif + static void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -3453,7 +3483,7 @@ static void net_tx_action(struct softirq_action *h) head = head->next_sched; root_lock = qdisc_lock(q); - if (spin_trylock(root_lock)) { + if (take_root_lock(root_lock)) { smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); -- cgit v1.2.3 From e9c3001ff6e6fead71801574a6277913b6353765 Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Tue, 19 Jul 2011 13:51:17 +0100 Subject: net: sysrq via icmp There are (probably rare) situations when a system crashed and the system console becomes unresponsive but the network icmp layer still is alive. Wouldn't it be wonderful, if we then could submit a sysreq command via ping? This patch provides this facility. Please consult the updated documentation Documentation/sysrq.txt for details. Signed-off-by: Carsten Emde --- Documentation/sysrq.txt | 11 +++++++++-- include/net/netns/ipv4.h | 1 + net/ipv4/icmp.c | 30 ++++++++++++++++++++++++++++++ net/ipv4/sysctl_net_ipv4.c | 7 +++++++ 4 files changed, 47 insertions(+), 2 deletions(-) diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 0e307c94809a..6964d0f80ae7 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - , On other - If you know of the key combos for other architectures, please let me know so I can add them to this section. -On all - write a character to /proc/sysrq-trigger. e.g.: - +On all - write a character to /proc/sysrq-trigger, e.g.: echo t > /proc/sysrq-trigger +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g. + echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq + Send an ICMP echo request with this pattern plus the particular + SysRq command key. Example: + # ping -c1 -s57 -p0102030468 + will trigger the SysRq-H (help) command. + + * What are the 'command' keys? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 'b' - Will immediately reboot the system without syncing or unmounting diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 0ffef1a38efc..dddc3a717bcb 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -67,6 +67,7 @@ struct netns_ipv4 { int sysctl_icmp_echo_ignore_all; int sysctl_icmp_echo_ignore_broadcasts; + int sysctl_icmp_echo_sysrq; int sysctl_icmp_ignore_bogus_error_responses; int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5882f584910e..3c71d5fc54ea 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -863,6 +864,30 @@ static void icmp_redirect(struct sk_buff *skb) icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); } +/* + * 32bit and 64bit have different timestamp length, so we check for + * the cookie at offset 20 and verify it is repeated at offset 50 + */ +#define CO_POS0 20 +#define CO_POS1 50 +#define CO_SIZE sizeof(int) +#define ICMP_SYSRQ_SIZE 57 + +/* + * We got a ICMP_SYSRQ_SIZE sized ping request. 
Check for the cookie + * pattern and if it matches send the next byte as a trigger to sysrq. + */ +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb) +{ + int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq); + char *p = skb->data; + + if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) && + !memcmp(&cookie, p + CO_POS1, CO_SIZE) && + p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE]) + handle_sysrq(p[CO_POS0 + CO_SIZE]); +} + /* * Handle ICMP_ECHO ("ping") requests. * @@ -890,6 +915,11 @@ static void icmp_echo(struct sk_buff *skb) icmp_param.data_len = skb->len; icmp_param.head_len = sizeof(struct icmphdr); icmp_reply(&icmp_param, skb); + + if (skb->len == ICMP_SYSRQ_SIZE && + net->ipv4.sysctl_icmp_echo_sysrq) { + icmp_check_sysrq(net, skb); + } } } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b3c53c8b331e..9096dbb067d1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -778,6 +778,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "icmp_echo_sysrq", + .data = &init_net.ipv4.sysctl_icmp_echo_sysrq, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "icmp_ignore_bogus_error_responses", .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, -- cgit v1.2.3 From 2a1bfe591907424446f109106dde72a20139ec3e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jul 2011 12:42:23 -0500 Subject: kgdb/serial: Short term workaround On 07/27/2011 04:37 PM, Thomas Gleixner wrote: > - KGDB (not yet disabled) is reportedly unusable on -rt right now due > to missing hacks in the console locking which I dropped on purpose. > To work around this in the short term you can use this patch, in addition to the clocksource watchdog patch that Thomas brewed up. Comments are welcome of course. Ultimately the right solution is to change separation between the console and the HW to have a polled mode + work queue so as not to introduce any kind of latency. Thanks, Jason. --- drivers/tty/serial/8250/8250_core.c | 3 ++- include/linux/kdb.h | 3 ++- kernel/debug/kdb/kdb_io.c | 6 ++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 772f1bc13598..dbd63ae2d079 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_SPARC @@ -3206,7 +3207,7 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count) serial8250_rpm_get(up); - if (port->sysrq || oops_in_progress) + if (port->sysrq || oops_in_progress || in_kdb_printk()) locked = spin_trylock_irqsave(&port->lock, flags); else spin_lock_irqsave(&port->lock, flags); diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 290db1269c4c..d9ae75ee77cd 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -116,7 +116,7 @@ extern int kdb_trap_printk; extern __printf(1, 0) int vkdb_printf(const char *fmt, va_list args); extern __printf(1, 2) int kdb_printf(const char *, ...); typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...); - +#define in_kdb_printk() (kdb_trap_printk) extern void kdb_init(int level); /* Access to kdb specific polling devices */ @@ -151,6 +151,7 @@ extern int kdb_register_repeat(char *, kdb_func_t, char *, char *, extern int kdb_unregister(char *); #else /* ! 
CONFIG_KGDB_KDB */ static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; } +#define in_kdb_printk() (0) static inline void kdb_init(int level) {} static inline int kdb_register(char *cmd, kdb_func_t func, char *usage, char *help, short minlen) { return 0; } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 7c70812caea5..3f8e05ff4cfd 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -554,7 +554,6 @@ int vkdb_printf(const char *fmt, va_list ap) int linecount; int colcount; int logging, saved_loglevel = 0; - int saved_trap_printk; int got_printf_lock = 0; int retlen = 0; int fnd, len; @@ -565,8 +564,6 @@ int vkdb_printf(const char *fmt, va_list ap) unsigned long uninitialized_var(flags); preempt_disable(); - saved_trap_printk = kdb_trap_printk; - kdb_trap_printk = 0; /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, @@ -833,7 +830,6 @@ kdb_print_out: } else { __release(kdb_printf_lock); } - kdb_trap_printk = saved_trap_printk; preempt_enable(); return retlen; } @@ -843,9 +839,11 @@ int kdb_printf(const char *fmt, ...) va_list ap; int r; + kdb_trap_printk++; va_start(ap, fmt); r = vkdb_printf(fmt, ap); va_end(ap); + kdb_trap_printk--; return r; } -- cgit v1.2.3 From 2abbe48330e093d132f5002d902b48a8c6ed05b6 Mon Sep 17 00:00:00 2001 From: Clark Williams Date: Sat, 30 Jul 2011 21:55:53 -0500 Subject: add /sys/kernel/realtime entry Add a /sys/kernel entry to indicate that the kernel is a realtime kernel. Clark says that he needs this for udev rules, udev needs to evaluate if its a PREEMPT_RT kernel a few thousand times and parsing uname output is too slow or so. Are there better solutions? Should it exist and return 0 on !-rt? Signed-off-by: Clark Williams Signed-off-by: Peter Zijlstra --- kernel/ksysfs.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6683ccef9fff..d6fc8eeaab8f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_KEXEC */ +#if defined(CONFIG_PREEMPT_RT_FULL) +static ssize_t realtime_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", 1); +} +KERNEL_ATTR_RO(realtime); +#endif + /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -203,6 +212,9 @@ static struct attribute * kernel_attrs[] = { &vmcoreinfo_attr.attr, #endif &rcu_expedited_attr.attr, +#ifdef CONFIG_PREEMPT_RT_FULL + &realtime_attr.attr, +#endif NULL }; -- cgit v1.2.3 From 500275ad83e5bf644611279dee1efb8b7f05ca90 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Jul 2011 10:43:51 +0200 Subject: mm, rt: kmap_atomic scheduling In fact, with migrate_disable() existing one could play games with kmap_atomic. You could save/restore the kmap_atomic slots on context switch (if there are any in use of course), this should be esp easy now that we have a kmap_atomic stack. Something like the below.. it wants replacing all the preempt_disable() stuff with pagefault_disable() && migrate_disable() of course, but then you can flip kmaps around like below. Signed-off-by: Peter Zijlstra [dvhart@linux.intel.com: build fix] Link: http://lkml.kernel.org/r/1311842631.5890.208.camel@twins [tglx@linutronix.de: Get rid of the per cpu variable and store the idx and the pte content right away in the task struct. 
Shortens the context switch code. ] --- arch/x86/kernel/process_32.c | 32 ++++++++++++++++++++++++++++++++ arch/x86/mm/highmem_32.c | 9 ++++++++- arch/x86/mm/iomap_32.c | 9 ++++++++- include/linux/highmem.h | 27 +++++++++++++++++++++++---- include/linux/sched.h | 7 +++++++ mm/highmem.c | 6 ++++-- mm/memory.c | 2 ++ 7 files changed, 84 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8f3ebfe710d0..3531e8fdfdc1 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -214,6 +215,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) } EXPORT_SYMBOL_GPL(start_thread); +#ifdef CONFIG_PREEMPT_RT_FULL +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) +{ + int i; + + /* + * Clear @prev's kmap_atomic mappings + */ + for (i = 0; i < prev_p->kmap_idx; i++) { + int idx = i + KM_TYPE_NR * smp_processor_id(); + pte_t *ptep = kmap_pte - idx; + + kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx)); + } + /* + * Restore @next_p's kmap_atomic mappings + */ + for (i = 0; i < next_p->kmap_idx; i++) { + int idx = i + KM_TYPE_NR * smp_processor_id(); + + if (!pte_none(next_p->kmap_pte[i])) + set_pte(kmap_pte - idx, next_p->kmap_pte[i]); + } +} +#else +static inline void +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } +#endif + /* * switch_to(x,y) should switch tasks from x to y. @@ -301,6 +331,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); + switch_kmaps(prev_p, next_p); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 4500142bc4aa..7f96844472bb 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -32,6 +32,7 @@ EXPORT_SYMBOL(kunmap); */ void *kmap_atomic_prot(struct page *page, pgprot_t prot) { + pte_t pte = mk_pte(page, prot); unsigned long vaddr; int idx, type; @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); - set_pte(kmap_pte-idx, mk_pte(page, prot)); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = pte; +#endif + set_pte(kmap_pte-idx, pte); arch_flush_lazy_mmu_mode(); return (void *)vaddr; @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr) * is a bad idea also, in case the page changes cacheability * attributes or becomes a protected page in a hypervisor. 
*/ +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = __pte(0); +#endif kpte_clear_flush(kmap_pte-idx, vaddr); kmap_atomic_idx_pop(); arch_flush_lazy_mmu_mode(); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 7b179b499fa3..0c953e399b96 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { + pte_t pte = pfn_pte(pfn, prot); unsigned long vaddr; int idx, type; @@ -64,7 +65,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = pte; +#endif + set_pte(kmap_pte - idx, pte); arch_flush_lazy_mmu_mode(); return (void *)vaddr; @@ -110,6 +114,9 @@ iounmap_atomic(void __iomem *kvaddr) * is a bad idea also, in case the page changes cacheability * attributes or becomes a protected page in a hypervisor. */ +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = __pte(0); +#endif kpte_clear_flush(kmap_pte-idx, vaddr); kmap_atomic_idx_pop(); } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 9286a46b7d69..75862dfd4144 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -85,32 +85,51 @@ static inline void __kunmap_atomic(void *addr) #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +#ifndef CONFIG_PREEMPT_RT_FULL DECLARE_PER_CPU(int, __kmap_atomic_idx); +#endif static inline int kmap_atomic_idx_push(void) { +#ifndef CONFIG_PREEMPT_RT_FULL int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1; -#ifdef CONFIG_DEBUG_HIGHMEM +# ifdef CONFIG_DEBUG_HIGHMEM WARN_ON_ONCE(in_irq() && !irqs_disabled()); BUG_ON(idx >= KM_TYPE_NR); -#endif +# endif return idx; +#else + current->kmap_idx++; + BUG_ON(current->kmap_idx > KM_TYPE_NR); + return current->kmap_idx - 1; +#endif } static inline int kmap_atomic_idx(void) { +#ifndef CONFIG_PREEMPT_RT_FULL return __this_cpu_read(__kmap_atomic_idx) - 1; +#else + return current->kmap_idx - 1; +#endif } static inline void kmap_atomic_idx_pop(void) { -#ifdef CONFIG_DEBUG_HIGHMEM +#ifndef CONFIG_PREEMPT_RT_FULL +# ifdef CONFIG_DEBUG_HIGHMEM int idx = __this_cpu_dec_return(__kmap_atomic_idx); BUG_ON(idx < 0); -#else +# else __this_cpu_dec(__kmap_atomic_idx); +# endif +#else + current->kmap_idx--; +# ifdef CONFIG_DEBUG_HIGHMEM + BUG_ON(current->kmap_idx < 0); +# endif #endif } diff --git a/include/linux/sched.h b/include/linux/sched.h index c50d2681a943..6762444228cc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -26,6 +26,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -1688,6 +1689,12 @@ struct task_struct { struct rcu_head put_rcu; int softirq_nestcnt; #endif +#ifdef CONFIG_PREEMPT_RT_FULL +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32 + int kmap_idx; + pte_t kmap_pte[KM_TYPE_NR]; +# endif +#endif }; #define TNF_MIGRATED 0x01 diff --git a/mm/highmem.c b/mm/highmem.c index 123bcd3ed4f2..16e8cf26d38a 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -29,10 +29,11 @@ #include #include - +#ifndef CONFIG_PREEMPT_RT_FULL #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) DEFINE_PER_CPU(int, __kmap_atomic_idx); #endif +#endif /* * Virtual_count is not a pure "count". 
@@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); - +#ifndef CONFIG_PREEMPT_RT_FULL EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); +#endif unsigned int nr_free_highpages (void) { diff --git a/mm/memory.c b/mm/memory.c index e38f53b792d9..8b6028fc342e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3247,6 +3247,7 @@ unlock: #ifdef CONFIG_PREEMPT_RT_FULL void pagefault_disable(void) { + migrate_disable(); current->pagefault_disabled++; /* * make sure to have issued the store before a pagefault @@ -3264,6 +3265,7 @@ void pagefault_enable(void) */ barrier(); current->pagefault_disabled--; + migrate_enable(); } EXPORT_SYMBOL(pagefault_enable); #endif -- cgit v1.2.3 From c53ef374bb35411d6be42acf87b33c1c749297ef Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 11 Mar 2013 17:09:55 +0100 Subject: x86/highmem: add a "already used pte" check This is a copy from kmap_atomic_prot(). Signed-off-by: Sebastian Andrzej Siewior --- arch/x86/mm/iomap_32.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 0c953e399b96..62377d67ab07 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -65,6 +65,8 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + WARN_ON(!pte_none(*(kmap_pte - idx))); + #ifdef CONFIG_PREEMPT_RT_FULL current->kmap_pte[type] = pte; #endif -- cgit v1.2.3 From cab66684fba621b6ba3666e2559bd483705e9162 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 11 Mar 2013 21:37:27 +0100 Subject: arm/highmem: flush tlb on unmap The tlb should be flushed on unmap and thus make the mapping entry invalid. This is only done in the non-debug case which does not look right. 
Signed-off-by: Sebastian Andrzej Siewior --- arch/arm/mm/highmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index e17ed00828d7..574f45b3bbb4 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -110,10 +110,10 @@ void __kunmap_atomic(void *kvaddr) __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(vaddr != __fix_to_virt(idx)); - set_fixmap_pte(idx, __pte(0)); #else (void) idx; /* to kill a warning */ #endif + set_fixmap_pte(idx, __pte(0)); kmap_atomic_idx_pop(); } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { /* this address was obtained through kmap_high_get() */ -- cgit v1.2.3 From 755c2d7c0201f830d630851aedc9ae32069ce6bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Feb 2013 11:03:11 +0100 Subject: arm-enable-highmem-for-rt.patch Signed-off-by: Thomas Gleixner --- arch/arm/Kconfig | 2 +- arch/arm/include/asm/switch_to.h | 8 ++++++++ arch/arm/mm/highmem.c | 40 ++++++++++++++++++++++++++++++++++++++-- include/linux/highmem.h | 1 + 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 2fec345c24d6..89c4b5ccc68d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1667,7 +1667,7 @@ config HAVE_GENERIC_RCU_GUP config HIGHMEM bool "High Memory Support" - depends on MMU && !PREEMPT_RT_FULL + depends on MMU help The address space of ARM processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h index c99e259469f7..f3e3d800c407 100644 --- a/arch/arm/include/asm/switch_to.h +++ b/arch/arm/include/asm/switch_to.h @@ -3,6 +3,13 @@ #include +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p); +#else +static inline void +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } +#endif + /* * For v7 SMP cores running a preemptible kernel we may be pre-empted * during a TLB maintenance operation, so execute an inner-shareable dsb @@ -22,6 +29,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info #define switch_to(prev,next,last) \ do { \ + switch_kmaps(prev, next); \ last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \ } while (0) diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 574f45b3bbb4..f8b4c3bd38ff 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -53,6 +53,7 @@ EXPORT_SYMBOL(kunmap); void *kmap_atomic(struct page *page) { + pte_t pte = mk_pte(page, kmap_prot); unsigned int idx; unsigned long vaddr; void *kmap; @@ -91,7 +92,10 @@ void *kmap_atomic(struct page *page) * in place, so the contained TLB flush ensures the TLB is updated * with the new mapping. 
*/ - set_fixmap_pte(idx, mk_pte(page, kmap_prot)); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = pte; +#endif + set_fixmap_pte(idx, pte); return (void *)vaddr; } @@ -108,6 +112,9 @@ void __kunmap_atomic(void *kvaddr) if (cache_is_vivt()) __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = __pte(0); +#endif #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(vaddr != __fix_to_virt(idx)); #else @@ -125,6 +132,7 @@ EXPORT_SYMBOL(__kunmap_atomic); void *kmap_atomic_pfn(unsigned long pfn) { + pte_t pte = pfn_pte(pfn, kmap_prot); unsigned long vaddr; int idx, type; struct page *page = pfn_to_page(pfn); @@ -139,7 +147,10 @@ void *kmap_atomic_pfn(unsigned long pfn) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(fixmap_page_table + idx))); #endif - set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot)); +#ifdef CONFIG_PREEMPT_RT_FULL + current->kmap_pte[type] = pte; +#endif + set_fixmap_pte(idx, pte); return (void *)vaddr; } @@ -153,3 +164,28 @@ struct page *kmap_atomic_to_page(const void *ptr) return pte_page(get_fixmap_pte(vaddr)); } + +#if defined CONFIG_PREEMPT_RT_FULL +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) +{ + int i; + + /* + * Clear @prev's kmap_atomic mappings + */ + for (i = 0; i < prev_p->kmap_idx; i++) { + int idx = i + KM_TYPE_NR * smp_processor_id(); + + set_fixmap_pte(idx, __pte(0)); + } + /* + * Restore @next_p's kmap_atomic mappings + */ + for (i = 0; i < next_p->kmap_idx; i++) { + int idx = i + KM_TYPE_NR * smp_processor_id(); + + if (!pte_none(next_p->kmap_pte[i])) + set_fixmap_pte(idx, next_p->kmap_pte[i]); + } +} +#endif diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 75862dfd4144..f443283294f8 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -7,6 +7,7 @@ #include #include #include +#include #include -- cgit v1.2.3 From ac9989c2e5d5262188b84cac6fbf06305bf654d3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Sep 2011 11:57:04 +0200 Subject: ipc/sem: Rework semaphore wakeups Current sysv sems have a weird ass wakeup scheme that involves keeping preemption disabled over a potential O(n^2) loop and busy waiting on that on other CPUs. Kill this and simply wake the task directly from under the sem_lock. 
This was discovered by a migrate_disable() debug feature that disallows: spin_lock(); preempt_disable(); spin_unlock() preempt_enable(); Cc: Manfred Spraul Suggested-by: Thomas Gleixner Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Manfred Spraul Link: http://lkml.kernel.org/r/1315994224.5040.1.camel@twins Signed-off-by: Thomas Gleixner --- ipc/sem.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ipc/sem.c b/ipc/sem.c index 53c3310f41c6..2ef9824e9a1a 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -673,6 +673,13 @@ undo: static void wake_up_sem_queue_prepare(struct list_head *pt, struct sem_queue *q, int error) { +#ifdef CONFIG_PREEMPT_RT_BASE + struct task_struct *p = q->sleeper; + get_task_struct(p); + q->status = error; + wake_up_process(p); + put_task_struct(p); +#else if (list_empty(pt)) { /* * Hold preempt off so that we don't get preempted and have the @@ -684,6 +691,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt, q->pid = error; list_add_tail(&q->list, pt); +#endif } /** @@ -697,6 +705,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt, */ static void wake_up_sem_queue_do(struct list_head *pt) { +#ifndef CONFIG_PREEMPT_RT_BASE struct sem_queue *q, *t; int did_something; @@ -709,6 +718,7 @@ static void wake_up_sem_queue_do(struct list_head *pt) } if (did_something) preempt_enable(); +#endif } static void unlink_queue(struct sem_array *sma, struct sem_queue *q) -- cgit v1.2.3 From ce32ea68ab44ed38294861ba34caa2def38bd2c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Nov 2011 12:26:18 +0100 Subject: x86-kvm-require-const-tsc-for-rt.patch Signed-off-by: Thomas Gleixner --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a38dd816015b..13c8f5f144ef 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5772,6 +5772,13 @@ int kvm_arch_init(void *opaque) goto out; } +#ifdef CONFIG_PREEMPT_RT_FULL + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n"); + return -EOPNOTSUPP; + } +#endif + r = kvm_mmu_module_init(); if (r) goto out_free_percpu; -- cgit v1.2.3 From c9bce3d8490881e5ac7b74e12054d1ddc5eeac2e Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 8 Apr 2015 20:33:25 -0300 Subject: KVM: lapic: mark LAPIC timer handler as irqsafe Since lapic timer handler only wakes up a simple waitqueue, it can be executed from hardirq context. Also handle the case where hrtimer_start_expires fails due to -ETIME, by injecting the interrupt to the guest immediately. Reduces average cyclictest latency by 3us. 
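[Editor's note] A rough sketch of the pattern applied by the diff below (illustrative only, not the exact KVM code). It relies on the behaviour visible in the hunk that this -rt series' hrtimer_start()/hrtimer_start_expires() return -ETIME when the requested absolute expiry already lies in the past:

```c
#include <linux/errno.h>
#include <linux/hrtimer.h>

/*
 * Hypothetical helper, not the actual KVM code: arm an absolute hrtimer
 * whose handler runs in hard interrupt context.  If the deadline has
 * already passed (-ETIME in this -rt series), handle the expiry
 * synchronously instead of silently dropping the event.
 */
static void arm_or_fire(struct hrtimer *timer, ktime_t expiry,
			void (*expired)(struct hrtimer *timer))
{
	if (hrtimer_start(timer, expiry, HRTIMER_MODE_ABS) == -ETIME)
		expired(timer);	/* deadline already in the past: act now */
}
```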
Signed-off-by: Marcelo Tosatti Signed-off-by: Sebastian Andrzej Siewior --- arch/x86/kvm/lapic.c | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b8345dd41b25..95ea2d044f68 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1034,8 +1034,38 @@ static void update_divide_count(struct kvm_lapic *apic) apic->divide_count); } + +static enum hrtimer_restart apic_timer_fn(struct hrtimer *data); + +static void apic_timer_expired(struct hrtimer *data) +{ + int ret, i = 0; + enum hrtimer_restart r; + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + + r = apic_timer_fn(data); + + if (r == HRTIMER_RESTART) { + do { + ret = hrtimer_start_expires(data, HRTIMER_MODE_ABS); + if (ret == -ETIME) + hrtimer_add_expires_ns(&ktimer->timer, + ktimer->period); + i++; + } while (ret == -ETIME && i < 10); + + if (ret == -ETIME) { + printk_once(KERN_ERR "%s: failed to reprogram timer\n", + __func__); + WARN_ON_ONCE(1); + } + } +} + + static void start_apic_timer(struct kvm_lapic *apic) { + int ret; ktime_t now; atomic_set(&apic->lapic_timer.pending, 0); @@ -1065,9 +1095,11 @@ static void start_apic_timer(struct kvm_lapic *apic) } } - hrtimer_start(&apic->lapic_timer.timer, + ret = hrtimer_start(&apic->lapic_timer.timer, ktime_add_ns(now, apic->lapic_timer.period), HRTIMER_MODE_ABS); + if (ret == -ETIME) + apic_timer_expired(&apic->lapic_timer.timer); apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " @@ -1097,8 +1129,10 @@ static void start_apic_timer(struct kvm_lapic *apic) ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); } - hrtimer_start(&apic->lapic_timer.timer, + ret = hrtimer_start(&apic->lapic_timer.timer, ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + if (ret == -ETIME) + apic_timer_expired(&apic->lapic_timer.timer); local_irq_restore(flags); } @@ -1587,6 +1621,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); apic->lapic_timer.timer.function = apic_timer_fn; + apic->lapic_timer.timer.irqsafe = 1; /* * APIC is created enabled. This will prevent kvm_lapic_set_base from @@ -1707,7 +1742,8 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + if (hrtimer_start_expires(timer, HRTIMER_MODE_ABS) == -ETIME) + apic_timer_expired(timer); } /* -- cgit v1.2.3 From 128b4c5031e45ce30a397c7c96dafcfd61f8a665 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 8 Apr 2015 20:33:24 -0300 Subject: KVM: use simple waitqueue for vcpu->wq The problem: On -RT, an emulated LAPIC timer instances has the following path: 1) hard interrupt 2) ksoftirqd is scheduled 3) ksoftirqd wakes up vcpu thread 4) vcpu thread is scheduled This extra context switch introduces unnecessary latency in the LAPIC path for a KVM guest. The solution: Allow waking up vcpu thread from hardirq context, thus avoiding the need for ksoftirqd to be scheduled. Normal waitqueues make use of spinlocks, which on -RT are sleepable locks. Therefore, waking up a waitqueue waiter involves locking a sleeping lock, which is not allowed from hard interrupt context. cyclictest command line: # cyclictest -m -n -q -p99 -l 1000000 -h60 -D 1m This patch reduces the average latency in my tests from 14us to 11us. 
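[Editor's note] The conversion below is mostly a rename from the regular waitqueue API to the simple-waitqueue one. As a rough illustration of why that helps (hypothetical names, not the actual swait implementation): a simple waitqueue is just a list of waiting tasks protected by a raw_spinlock_t, which stays a true spinning lock on -RT, so the wake path takes no sleeping lock and is legal from hard interrupt context.

```c
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/* Illustrative sketch only -- names do not match the real swait API. */
struct simple_wq_head {
	raw_spinlock_t		lock;
	struct list_head	waiters;
};

struct simple_wq_waiter {
	struct task_struct	*task;
	struct list_head	node;
};

static void simple_wq_wake_one(struct simple_wq_head *q)
{
	struct simple_wq_waiter *w;
	unsigned long flags;

	raw_spin_lock_irqsave(&q->lock, flags);	/* fine in hardirq on -RT */
	w = list_first_entry_or_null(&q->waiters,
				     struct simple_wq_waiter, node);
	if (w)
		wake_up_process(w->task);	/* no sleeping lock involved */
	raw_spin_unlock_irqrestore(&q->lock, flags);
}
```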
Signed-off-by: Marcelo Tosatti Signed-off-by: Sebastian Andrzej Siewior --- arch/arm/kvm/arm.c | 4 ++-- arch/arm/kvm/psci.c | 4 ++-- arch/powerpc/include/asm/kvm_host.h | 4 ++-- arch/powerpc/kvm/book3s_hv.c | 20 ++++++++++---------- arch/s390/include/asm/kvm_host.h | 2 +- arch/s390/kvm/interrupt.c | 8 ++++---- arch/x86/kvm/lapic.c | 6 +++--- include/linux/kvm_host.h | 4 ++-- virt/kvm/async_pf.c | 4 ++-- virt/kvm/kvm_main.c | 16 ++++++++-------- 10 files changed, 36 insertions(+), 36 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9e193c8a959e..e71a43720a21 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -441,9 +441,9 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) static void vcpu_pause(struct kvm_vcpu *vcpu) { - wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); + struct swait_head *wq = kvm_arch_vcpu_wq(vcpu); - wait_event_interruptible(*wq, !vcpu->arch.pause); + swait_event_interruptible(*wq, !vcpu->arch.pause); } static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index 09cf37737ee2..b26f4cb51b1e 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -66,7 +66,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct kvm *kvm = source_vcpu->kvm; struct kvm_vcpu *vcpu = NULL, *tmp; - wait_queue_head_t *wq; + struct swait_head *wq; unsigned long cpu_id; unsigned long context_id; unsigned long mpidr; @@ -123,7 +123,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) smp_mb(); /* Make sure the above is visible */ wq = kvm_arch_vcpu_wq(vcpu); - wake_up_interruptible(wq); + swait_wake_interruptible(wq); return PSCI_RET_SUCCESS; } diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 047855619cc4..addfd9b99cef 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -296,7 +296,7 @@ struct kvmppc_vcore { u8 in_guest; struct list_head runnable_threads; spinlock_t lock; - wait_queue_head_t wq; + struct swait_head wq; u64 stolen_tb; u64 preempt_tb; struct kvm_vcpu *runner; @@ -618,7 +618,7 @@ struct kvm_vcpu_arch { u8 prodded; u32 last_inst; - wait_queue_head_t *wqp; + struct swait_head *wqp; struct kvmppc_vcore *vcore; int ret; int trap; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e63587d30b70..499bc8303456 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -84,11 +84,11 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) { int me; int cpu = vcpu->cpu; - wait_queue_head_t *wqp; + struct swait_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); - if (waitqueue_active(wqp)) { - wake_up_interruptible(wqp); + if (swaitqueue_active(wqp)) { + swait_wake_interruptible(wqp); ++vcpu->stat.halt_wakeup; } @@ -639,8 +639,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) tvcpu->arch.prodded = 1; smp_mb(); if (vcpu->arch.ceded) { - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); + if (swaitqueue_active(&vcpu->wq)) { + swait_wake_interruptible(&vcpu->wq); vcpu->stat.halt_wakeup++; } } @@ -1357,7 +1357,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); - init_waitqueue_head(&vcore->wq); + init_swait_head(&vcore->wq); vcore->preempt_tb = TB_NIL; vcore->lpcr = kvm->arch.lpcr; vcore->first_vcpuid = core * threads_per_subcore; @@ -1826,13 +1826,13 @@ static void kvmppc_wait_for_exec(struct 
kvm_vcpu *vcpu, int wait_state) */ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { - DEFINE_WAIT(wait); + DEFINE_SWAITER(wait); - prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); + swait_prepare(&vc->wq, &wait, TASK_INTERRUPTIBLE); vc->vcore_state = VCORE_SLEEPING; spin_unlock(&vc->lock); schedule(); - finish_wait(&vc->wq, &wait); + swait_finish(&vc->wq, &wait); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; } @@ -1873,7 +1873,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu); } else if (vc->vcore_state == VCORE_SLEEPING) { - wake_up(&vc->wq); + swait_wake(&vc->wq); } } diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 2175f911a73a..884ec9b27b3c 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -311,7 +311,7 @@ struct kvm_s390_local_interrupt { struct list_head list; atomic_t active; struct kvm_s390_float_interrupt *float_int; - wait_queue_head_t *wq; + struct swait_head *wq; atomic_t *cpuflags; unsigned int action_bits; }; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 29e2e5aa2111..e6c1cd7043df 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -619,13 +619,13 @@ no_timer: void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) { - if (waitqueue_active(&vcpu->wq)) { + if (swaitqueue_active(&vcpu->wq)) { /* * The vcpu gave up the cpu voluntarily, mark it as a good * yield-candidate. */ vcpu->preempted = true; - wake_up_interruptible(&vcpu->wq); + swait_wake_interruptible(&vcpu->wq); vcpu->stat.halt_wakeup++; } } @@ -746,7 +746,7 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) spin_lock(&li->lock); list_add(&inti->list, &li->list); atomic_set(&li->active, 1); - BUG_ON(waitqueue_active(li->wq)); + BUG_ON(swaitqueue_active(li->wq)); spin_unlock(&li->lock); return 0; } @@ -771,7 +771,7 @@ int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, spin_lock(&li->lock); list_add(&inti->list, &li->list); atomic_set(&li->active, 1); - BUG_ON(waitqueue_active(li->wq)); + BUG_ON(swaitqueue_active(li->wq)); spin_unlock(&li->lock); return 0; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 95ea2d044f68..ca7b11e8e2d3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1573,7 +1573,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer); struct kvm_vcpu *vcpu = apic->vcpu; - wait_queue_head_t *q = &vcpu->wq; + struct swait_head *q = &vcpu->wq; /* * There is a race window between reading and incrementing, but we do @@ -1587,8 +1587,8 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); } - if (waitqueue_active(q)) - wake_up_interruptible(q); + if (swaitqueue_active(q)) + swait_wake_interruptible(q); if (lapic_is_periodic(apic)) { hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a6059bdf7b03..9527595fdb2d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -244,7 +244,7 @@ struct kvm_vcpu { int fpu_active; int guest_fpu_loaded, guest_xcr0_loaded; - wait_queue_head_t wq; + struct swait_head wq; struct pid *pid; int sigset_active; sigset_t sigset; @@ -687,7 +687,7 @@ static inline bool kvm_arch_has_noncoherent_dma(struct 
kvm *kvm) } #endif -static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) +static inline struct swait_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_WQP return vcpu->arch.wqp; diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 5ff7f7f2689a..1e545966ddd9 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -94,8 +94,8 @@ static void async_pf_execute(struct work_struct *work) trace_kvm_async_pf_completed(addr, gva); - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); + if (swaitqueue_active(&vcpu->wq)) + swait_wake_interruptible(&vcpu->wq); mmput(mm); kvm_put_kvm(vcpu->kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 272fee82f89e..ab3a7261db3a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -221,7 +221,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; - init_waitqueue_head(&vcpu->wq); + init_swait_head(&vcpu->wq); kvm_async_pf_vcpu_init(vcpu); page = alloc_page(GFP_KERNEL | __GFP_ZERO); @@ -1741,10 +1741,10 @@ EXPORT_SYMBOL_GPL(mark_page_dirty); */ void kvm_vcpu_block(struct kvm_vcpu *vcpu) { - DEFINE_WAIT(wait); + DEFINE_SWAITER(wait); for (;;) { - prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + swait_prepare(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); if (kvm_arch_vcpu_runnable(vcpu)) { kvm_make_request(KVM_REQ_UNHALT, vcpu); @@ -1758,7 +1758,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) schedule(); } - finish_wait(&vcpu->wq, &wait); + swait_finish(&vcpu->wq, &wait); } EXPORT_SYMBOL_GPL(kvm_vcpu_block); @@ -1770,11 +1770,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { int me; int cpu = vcpu->cpu; - wait_queue_head_t *wqp; + struct swait_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); - if (waitqueue_active(wqp)) { - wake_up_interruptible(wqp); + if (swaitqueue_active(wqp)) { + swait_wake_interruptible(wqp); ++vcpu->stat.halt_wakeup; } @@ -1879,7 +1879,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) continue; if (vcpu == me) continue; - if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) + if (swaitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) continue; -- cgit v1.2.3 From ccf316e777f64b157a467e1ef3cd6ce77d47ffde Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Nov 2011 14:00:48 +0100 Subject: scsi-fcoe-rt-aware.patch Signed-off-by: Thomas Gleixner --- drivers/scsi/fcoe/fcoe.c | 18 +++++++++--------- drivers/scsi/fcoe/fcoe_ctlr.c | 4 ++-- drivers/scsi/libfc/fc_exch.c | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 4a8ac7d8c76b..c1f2bd0bcdb7 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1286,7 +1286,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu) struct sk_buff *skb; #ifdef CONFIG_SMP struct fcoe_percpu_s *p0; - unsigned targ_cpu = get_cpu(); + unsigned targ_cpu = get_cpu_light(); #endif /* CONFIG_SMP */ FCOE_DBG("Destroying receive thread for CPU %d\n", cpu); @@ -1342,7 +1342,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu) kfree_skb(skb); spin_unlock_bh(&p->fcoe_rx_list.lock); } - put_cpu(); + put_cpu_light(); #else /* * This a non-SMP scenario where the singular Rx thread is @@ -1566,11 +1566,11 @@ err2: static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen) { struct fcoe_percpu_s *fps; - int rc; + int rc, cpu = get_cpu_light(); - fps = 
&get_cpu_var(fcoe_percpu); + fps = &per_cpu(fcoe_percpu, cpu); rc = fcoe_get_paged_crc_eof(skb, tlen, fps); - put_cpu_var(fcoe_percpu); + put_cpu_light(); return rc; } @@ -1768,11 +1768,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport, return 0; } - stats = per_cpu_ptr(lport->stats, get_cpu()); + stats = per_cpu_ptr(lport->stats, get_cpu_light()); stats->InvalidCRCCount++; if (stats->InvalidCRCCount < 5) printk(KERN_WARNING "fcoe: dropping frame with CRC error\n"); - put_cpu(); + put_cpu_light(); return -EINVAL; } @@ -1848,13 +1848,13 @@ static void fcoe_recv_frame(struct sk_buff *skb) goto drop; if (!fcoe_filter_frames(lport, fp)) { - put_cpu(); + put_cpu_light(); fc_exch_recv(lport, fp); return; } drop: stats->ErrorFrames++; - put_cpu(); + put_cpu_light(); kfree_skb(skb); } diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c index 34a1b1f333b4..d91131210695 100644 --- a/drivers/scsi/fcoe/fcoe_ctlr.c +++ b/drivers/scsi/fcoe/fcoe_ctlr.c @@ -831,7 +831,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) INIT_LIST_HEAD(&del_list); - stats = per_cpu_ptr(fip->lp->stats, get_cpu()); + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light()); list_for_each_entry_safe(fcf, next, &fip->fcfs, list) { deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2; @@ -867,7 +867,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) sel_time = fcf->time; } } - put_cpu(); + put_cpu_light(); list_for_each_entry_safe(fcf, next, &del_list, list) { /* Removes fcf from current list */ diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index 1b3a09473452..50af66a00273 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -816,10 +816,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, } memset(ep, 0, sizeof(*ep)); - cpu = get_cpu(); + cpu = get_cpu_light(); pool = per_cpu_ptr(mp->pool, cpu); spin_lock_bh(&pool->lock); - put_cpu(); + put_cpu_light(); /* peek cache of free slot */ if (pool->left != FC_XID_UNKNOWN) { -- cgit v1.2.3 From 023139056969069adb615d64c838a154f67c938b Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 14 Feb 2015 11:01:16 -0500 Subject: sas-ata/isci: dont't disable interrupts in qc_issue handler On 3.14-rt we see the following trace on Canoe Pass for SCSI_ISCI "Intel(R) C600 Series Chipset SAS Controller" when the sas qc_issue handler is run: BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:905 in_atomic(): 0, irqs_disabled(): 1, pid: 432, name: udevd CPU: 11 PID: 432 Comm: udevd Not tainted 3.14.28-rt22 #2 Hardware name: Intel Corporation S2600CP/S2600CP, BIOS SE5C600.86B.02.01.0002.082220131453 08/22/2013 ffff880fab500000 ffff880fa9f239c0 ffffffff81a2d273 0000000000000000 ffff880fa9f239d8 ffffffff8107f023 ffff880faac23dc0 ffff880fa9f239f0 ffffffff81a33cc0 ffff880faaeb1400 ffff880fa9f23a40 ffffffff815de891 Call Trace: [] dump_stack+0x4e/0x7a [] __might_sleep+0xe3/0x160 [] rt_spin_lock+0x20/0x50 [] isci_task_execute_task+0x171/0x2f0 <----- [] sas_ata_qc_issue+0x25b/0x2a0 [] ata_qc_issue+0x1f3/0x370 [] ? ata_scsi_invalid_field+0x40/0x40 [] ata_scsi_translate+0xa5/0x1b0 [] ata_sas_queuecmd+0x86/0x280 [] sas_queuecommand+0x196/0x230 [] ? get_parent_ip+0xd/0x50 [] scsi_dispatch_cmd+0xb4/0x210 [] scsi_request_fn+0x314/0x530 and gdb shows: (gdb) list * isci_task_execute_task+0x171 0xffffffff815ddfb1 is in isci_task_execute_task (drivers/scsi/isci/task.c:138). 
133 dev_dbg(&ihost->pdev->dev, "%s: num=%d\n", __func__, num); 134 135 for_each_sas_task(num, task) { 136 enum sci_status status = SCI_FAILURE; 137 138 spin_lock_irqsave(&ihost->scic_lock, flags); <----- 139 idev = isci_lookup_device(task->dev); 140 io_ready = isci_device_io_ready(idev, task); 141 tag = isci_alloc_tag(ihost); 142 spin_unlock_irqrestore(&ihost->scic_lock, flags); (gdb) In addition to the scic_lock, the function also contains locking of the task_state_lock -- which is clearly not a candidate for raw lock conversion. As can be seen by the comment nearby, we really should be running the qc_issue code with interrupts enabled anyway. Cc: stable-rt@vger.kernel.org Signed-off-by: Paul Gortmaker Signed-off-by: Sebastian Andrzej Siewior --- drivers/scsi/libsas/sas_ata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 766098af4eb7..d79daca3b00b 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -191,7 +191,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc) /* TODO: audit callers to ensure they are ready for qc_issue to * unconditionally re-enable interrupts */ - local_irq_save(flags); + local_irq_save_nort(flags); spin_unlock(ap->lock); /* If the device fell off, no sense in issuing commands */ @@ -261,7 +261,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc) out: spin_lock(ap->lock); - local_irq_restore(flags); + local_irq_restore_nort(flags); return ret; } -- cgit v1.2.3 From 5a582e5a3fca2bc18c12716e7e7234e170e99c3a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Nov 2011 18:19:27 +0100 Subject: x86: crypto: Reduce preempt disabled regions Restrict the preempt disabled regions to the actual floating point operations and enable preemption for the administrative actions. This is necessary on RT to avoid that kfree and other operations are called with preemption disabled. 
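[ Illustrative sketch, not part of the patch: the function below is a made-up condensation of the conversion the hunks that follow apply. It only shows the shape of the change -- kernel_fpu_begin()/kernel_fpu_end() kept tight around the SIMD work on each chunk, so that blkcipher_walk_done(), which may allocate/free and therefore sleep on RT, always runs with preemption enabled. ]

    static int ecb_crypt_sketch(struct blkcipher_desc *desc,
                                struct blkcipher_walk *walk)
    {
            unsigned int nbytes;
            int err = blkcipher_walk_virt(desc, walk);

            while ((nbytes = walk->nbytes)) {
                    kernel_fpu_begin();   /* FPU section covers only the SIMD work */
                    /* aesni_ecb_enc()/aesni_ecb_dec() on this chunk */
                    kernel_fpu_end();     /* dropped before anything that can sleep */
                    nbytes &= AES_BLOCK_SIZE - 1;
                    err = blkcipher_walk_done(desc, walk, nbytes);
            }
            return err;
    }
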
Reported-and-tested-by: Carsten Emde Signed-off-by: Peter Zijlstra Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/crypto/aesni-intel_glue.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 5a93783a8a0d..d828fd40d0d0 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -381,14 +381,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK); + nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } - kernel_fpu_end(); return err; } @@ -405,14 +405,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } - kernel_fpu_end(); return err; } @@ -429,14 +429,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } - kernel_fpu_end(); return err; } @@ -453,14 +453,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt(desc, &walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } - kernel_fpu_end(); return err; } @@ -512,18 +512,20 @@ static int ctr_crypt(struct blkcipher_desc *desc, err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { + kernel_fpu_begin(); aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } if (walk.nbytes) { + kernel_fpu_begin(); ctr_crypt_final(ctx, &walk); + kernel_fpu_end(); err = blkcipher_walk_done(desc, &walk, 0); } - kernel_fpu_end(); return err; } -- cgit v1.2.3 From c9dd101c1eeeb00eded18de3401586403577f3dd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 21 Feb 2014 17:24:04 +0100 Subject: crypto: Reduce preempt disabled regions, more algos Don Estabrook reported | kernel: WARNING: CPU: 2 PID: 858 at kernel/sched/core.c:2428 migrate_disable+0xed/0x100() | kernel: WARNING: CPU: 2 PID: 858 at kernel/sched/core.c:2462 migrate_enable+0x17b/0x200() | kernel: WARNING: CPU: 3 PID: 865 at kernel/sched/core.c:2428 migrate_disable+0xed/0x100() and his backtrace showed some crypto functions which looked fine. 
The problem is the following sequence: glue_xts_crypt_128bit() { blkcipher_walk_virt(); /* normal migrate_disable() */ glue_fpu_begin(); /* get atomic */ while (nbytes) { __glue_xts_crypt_128bit(); blkcipher_walk_done(); /* with nbytes = 0, migrate_enable() * while we are atomic */ }; glue_fpu_end() /* no longer atomic */ } and this is why the counter get out of sync and the warning is printed. The other problem is that we are non-preemptible between glue_fpu_begin() and glue_fpu_end() and the latency grows. To fix this, I shorten the FPU off region and ensure blkcipher_walk_done() is called with preemption enabled. This might hurt the performance because we now enable/disable the FPU state more often but we gain lower latency and the bug is gone. Cc: stable-rt@vger.kernel.org Reported-by: Don Estabrook Signed-off-by: Sebastian Andrzej Siewior --- arch/x86/crypto/cast5_avx_glue.c | 21 +++++++++------------ arch/x86/crypto/glue_helper.c | 31 +++++++++++++++---------------- 2 files changed, 24 insertions(+), 28 deletions(-) diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 60ada677a928..7344a356e7a0 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -60,7 +60,7 @@ static inline void cast5_fpu_end(bool fpu_enabled) static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, bool enc) { - bool fpu_enabled = false; + bool fpu_enabled; struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); const unsigned int bsize = CAST5_BLOCK_SIZE; unsigned int nbytes; @@ -76,7 +76,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, u8 *wsrc = walk->src.virt.addr; u8 *wdst = walk->dst.virt.addr; - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); + fpu_enabled = cast5_fpu_begin(false, nbytes); /* Process multi-block batch */ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { @@ -104,10 +104,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, } while (nbytes >= bsize); done: + cast5_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, walk, nbytes); } - - cast5_fpu_end(fpu_enabled); return err; } @@ -228,7 +227,7 @@ done: static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - bool fpu_enabled = false; + bool fpu_enabled; struct blkcipher_walk walk; int err; @@ -237,12 +236,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; while ((nbytes = walk.nbytes)) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); + fpu_enabled = cast5_fpu_begin(false, nbytes); nbytes = __cbc_decrypt(desc, &walk); + cast5_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); } - - cast5_fpu_end(fpu_enabled); return err; } @@ -312,7 +310,7 @@ done: static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - bool fpu_enabled = false; + bool fpu_enabled; struct blkcipher_walk walk; int err; @@ -321,13 +319,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); + fpu_enabled = cast5_fpu_begin(false, nbytes); nbytes = __ctr_crypt(desc, &walk); + cast5_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); } - cast5_fpu_end(fpu_enabled); - if (walk.nbytes) { 
ctr_crypt_final(desc, &walk); err = blkcipher_walk_done(desc, &walk, 0); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 432f1d76ceb8..4a2bd21c2137 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, void *ctx = crypto_blkcipher_ctx(desc->tfm); const unsigned int bsize = 128 / 8; unsigned int nbytes, i, func_bytes; - bool fpu_enabled = false; + bool fpu_enabled; int err; err = blkcipher_walk_virt(desc, walk); @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, u8 *wdst = walk->dst.virt.addr; fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, nbytes); + desc, false, nbytes); for (i = 0; i < gctx->num_funcs; i++) { func_bytes = bsize * gctx->funcs[i].num_blocks; @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, } done: + glue_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, walk, nbytes); } - glue_fpu_end(fpu_enabled); return err; } @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, struct scatterlist *src, unsigned int nbytes) { const unsigned int bsize = 128 / 8; - bool fpu_enabled = false; + bool fpu_enabled; struct blkcipher_walk walk; int err; @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, while ((nbytes = walk.nbytes)) { fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, nbytes); + desc, false, nbytes); nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk); + glue_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); } - glue_fpu_end(fpu_enabled); return err; } EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit); @@ -278,7 +278,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, struct scatterlist *src, unsigned int nbytes) { const unsigned int bsize = 128 / 8; - bool fpu_enabled = false; + bool fpu_enabled; struct blkcipher_walk walk; int err; @@ -287,13 +287,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, while ((nbytes = walk.nbytes) >= bsize) { fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, nbytes); + desc, false, nbytes); nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk); + glue_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); } - glue_fpu_end(fpu_enabled); - if (walk.nbytes) { glue_ctr_crypt_final_128bit( gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk); @@ -348,7 +347,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, void *tweak_ctx, void *crypt_ctx) { const unsigned int bsize = 128 / 8; - bool fpu_enabled = false; + bool fpu_enabled; struct blkcipher_walk walk; int err; @@ -361,21 +360,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, /* set minimum length to bsize, for tweak_fn */ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, + desc, false, nbytes < bsize ? 
bsize : nbytes); - /* calculate first value of T */ tweak_fn(tweak_ctx, walk.iv, walk.iv); + glue_fpu_end(fpu_enabled); while (nbytes) { + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, false, nbytes); nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk); + glue_fpu_end(fpu_enabled); err = blkcipher_walk_done(desc, &walk, nbytes); nbytes = walk.nbytes; } - - glue_fpu_end(fpu_enabled); - return err; } EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit); -- cgit v1.2.3 From 65537a388bdf5e9db38da8365a3b8ee2e01073ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Nov 2011 23:06:09 +0100 Subject: dm: Make rt aware Use the BUG_ON_NORT variant for the irq_disabled() checks. RT has interrupts legitimately enabled here as we cant deadlock against the irq thread due to the "sleeping spinlocks" conversion. Reported-by: Luis Claudio R. Goncalves Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner --- drivers/md/dm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 042114fbd200..f47ba4dfdf26 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1898,14 +1898,14 @@ static void dm_request_fn(struct request_queue *q) if (map_request(ti, clone, md)) goto requeued; - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); spin_lock(q->queue_lock); } goto out; requeued: - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); spin_lock(q->queue_lock); delay_and_out: -- cgit v1.2.3 From d83e68897c6f88b5788261efdefab4599a9f6068 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Feb 2013 09:26:05 -0500 Subject: acpi/rt: Convert acpi_gbl_hardware lock back to a raw_spinlock_t We hit the following bug with 3.6-rt: [ 5.898990] BUG: scheduling while atomic: swapper/3/0/0x00000002 [ 5.898991] no locks held by swapper/3/0. [ 5.898993] Modules linked in: [ 5.898996] Pid: 0, comm: swapper/3 Not tainted 3.6.11-rt28.19.el6rt.x86_64.debug #1 [ 5.898997] Call Trace: [ 5.899011] [] __schedule_bug+0x67/0x90 [ 5.899028] [] __schedule+0x793/0x7a0 [ 5.899032] [] ? debug_rt_mutex_print_deadlock+0x50/0x200 [ 5.899034] [] schedule+0x29/0x70 [ 5.899036] BUG: scheduling while atomic: swapper/7/0/0x00000002 [ 5.899037] no locks held by swapper/7/0. [ 5.899039] [] rt_spin_lock_slowlock+0xe5/0x2f0 [ 5.899040] Modules linked in: [ 5.899041] [ 5.899045] [] ? _raw_spin_unlock_irqrestore+0x38/0x90 [ 5.899046] Pid: 0, comm: swapper/7 Not tainted 3.6.11-rt28.19.el6rt.x86_64.debug #1 [ 5.899047] Call Trace: [ 5.899049] [] rt_spin_lock+0x16/0x40 [ 5.899052] [] __schedule_bug+0x67/0x90 [ 5.899054] [] ? notifier_call_chain+0x80/0x80 [ 5.899056] [] __schedule+0x793/0x7a0 [ 5.899059] [] acpi_os_acquire_lock+0x1f/0x23 [ 5.899062] [] ? debug_rt_mutex_print_deadlock+0x50/0x200 [ 5.899068] [] acpi_write_bit_register+0x33/0xb0 [ 5.899071] [] schedule+0x29/0x70 [ 5.899072] [] ? acpi_read_bit_register+0x33/0x51 [ 5.899074] [] rt_spin_lock_slowlock+0xe5/0x2f0 [ 5.899077] [] acpi_idle_enter_bm+0x8a/0x28e [ 5.899079] [] ? _raw_spin_unlock_irqrestore+0x38/0x90 [ 5.899081] [] ? this_cpu_load+0x1a/0x30 [ 5.899083] [] rt_spin_lock+0x16/0x40 [ 5.899087] [] cpuidle_enter+0x19/0x20 [ 5.899088] [] ? notifier_call_chain+0x80/0x80 [ 5.899090] [] cpuidle_enter_state+0x17/0x50 [ 5.899092] [] acpi_os_acquire_lock+0x1f/0x23 [ 5.899094] [] cpuidle899101] [] ? As the acpi code disables interrupts in acpi_idle_enter_bm, and calls code that grabs the acpi lock, it causes issues as the lock is currently in RT a sleeping lock. 
The lock was converted from a raw to a sleeping lock due to some previous issues, and tests that showed it didn't seem to matter. Unfortunately, it did matter for one of our boxes. This patch converts the lock back to a raw lock. I've run this code on a few of my own machines, one being my laptop that uses the acpi quite extensively. I've been able to suspend and resume without issues. [ tglx: Made the change exclusive for acpi_gbl_hardware_lock ] Signed-off-by: Steven Rostedt Cc: John Kacur Cc: Clark Williams Link: http://lkml.kernel.org/r/1360765565.23152.5.camel@gandalf.local.home Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior --- drivers/acpi/acpica/acglobal.h | 2 +- drivers/acpi/acpica/hwregs.c | 4 ++-- drivers/acpi/acpica/hwxface.c | 4 ++-- drivers/acpi/acpica/utmutex.c | 4 ++-- include/acpi/platform/aclinux.h | 15 +++++++++++++++ 5 files changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h index ebf02cc10a43..f15ef3dc468c 100644 --- a/drivers/acpi/acpica/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -112,7 +112,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending); * interrupt level */ ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */ -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */ +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */ ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock); /* Mutex for _OSI support */ diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c index a4c34d2c556b..b40826b27624 100644 --- a/drivers/acpi/acpica/hwregs.c +++ b/drivers/acpi/acpica/hwregs.c @@ -269,14 +269,14 @@ acpi_status acpi_hw_clear_acpi_status(void) ACPI_BITMASK_ALL_FIXED_STATUS, ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address))); - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); /* Clear the fixed events in PM1 A/B */ status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, ACPI_BITMASK_ALL_FIXED_STATUS); - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); if (ACPI_FAILURE(status)) { goto exit; diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c index 96d007df65ec..e57a03ed379c 100644 --- a/drivers/acpi/acpica/hwxface.c +++ b/drivers/acpi/acpica/hwxface.c @@ -374,7 +374,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value) return_ACPI_STATUS(AE_BAD_PARAMETER); } - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); /* * At this point, we know that the parent register is one of the @@ -435,7 +435,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value) unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c index 82717fff9ffc..508c5770ec71 100644 --- a/drivers/acpi/acpica/utmutex.c +++ b/drivers/acpi/acpica/utmutex.c @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void) return_ACPI_STATUS (status); } - status = acpi_os_create_lock (&acpi_gbl_hardware_lock); + status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock); if (ACPI_FAILURE (status)) { return_ACPI_STATUS 
(status); } @@ -141,7 +141,7 @@ void acpi_ut_mutex_terminate(void) /* Delete the spinlocks */ acpi_os_delete_lock(acpi_gbl_gpe_lock); - acpi_os_delete_lock(acpi_gbl_hardware_lock); + acpi_os_delete_raw_lock(acpi_gbl_hardware_lock); acpi_os_delete_lock(acpi_gbl_reference_count_lock); /* Delete the reader/writer lock */ diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h index 1ba7c190c2cc..21653ac47b4e 100644 --- a/include/acpi/platform/aclinux.h +++ b/include/acpi/platform/aclinux.h @@ -123,6 +123,7 @@ #define acpi_cache_t struct kmem_cache #define acpi_spinlock spinlock_t * +#define acpi_raw_spinlock raw_spinlock_t * #define acpi_cpu_flags unsigned long /* Use native linux version of acpi_os_allocate_zeroed */ @@ -141,6 +142,20 @@ #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock +#define acpi_os_create_raw_lock(__handle) \ +({ \ + raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock)); \ + \ + if (lock) { \ + *(__handle) = lock; \ + raw_spin_lock_init(*(__handle)); \ + } \ + lock ? AE_OK : AE_NO_MEMORY; \ + }) + +#define acpi_os_delete_raw_lock(__handle) kfree(__handle) + + /* * OSL interfaces used by debugger/disassembler */ -- cgit v1.2.3 From ffbe4b238b56dcf7a661a98cd7aeadc51c06e736 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Dec 2011 01:03:49 +0100 Subject: cpumask: Disable CONFIG_CPUMASK_OFFSTACK for RT We can't deal with the cpumask allocations which happen in atomic context (see arch/x86/kernel/apic/io_apic.c) on RT right now. Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 2 +- lib/Kconfig | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6e9d2cc9dcc4..ea9e6637574c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -814,7 +814,7 @@ config IOMMU_HELPER config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL - select CPUMASK_OFFSTACK + select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL ---help--- Enable maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. diff --git a/lib/Kconfig b/lib/Kconfig index 54cf309a92a5..b06d5444fb11 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -383,6 +383,7 @@ config CHECK_SIGNATURE config CPUMASK_OFFSTACK bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS + depends on !PREEMPT_RT_FULL help Use dynamic allocation for cpumask_var_t, instead of putting them on the stack. This is a bit more expensive, but avoids -- cgit v1.2.3 From 30a435240d9215816f0de50d6cc76155dbd3cb4e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Aug 2012 20:38:50 +0200 Subject: random: Make it work on rt Delegate the random insertion to the forced threaded interrupt handler. Store the return IP of the hard interrupt handler in the irq descriptor and feed it into the random generator as a source of entropy. 
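[ Illustrative sketch, condensed from the hunks below and not itself part of the patch: the resulting split between hard interrupt and interrupt thread. The hard interrupt path only records the return IP; the forced-threaded handler later feeds it to add_interrupt_randomness() with migration disabled, so the per-CPU fast_pool is never touched from atomic context on RT. ]

    /* hard interrupt path, handle_irq_event_percpu() */
    struct pt_regs *regs = get_irq_regs();
    u64 ip = regs ? instruction_pointer(regs) : 0;
    ...
    #ifndef CONFIG_PREEMPT_RT_FULL
            add_interrupt_randomness(irq, flags, ip);
    #else
            desc->random_ip = ip;
    #endif

    /* interrupt thread path, irq_thread() */
    #ifdef CONFIG_PREEMPT_RT_FULL
            migrate_disable();
            add_interrupt_randomness(action->irq, 0,
                                     desc->random_ip ^ (unsigned long) action);
            migrate_enable();
    #endif
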
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- drivers/char/random.c | 11 +++++------ include/linux/irqdesc.h | 1 + include/linux/random.h | 2 +- kernel/irq/handle.c | 8 +++++++- kernel/irq/manage.c | 6 ++++++ 5 files changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 9a07ef22fab1..eb47efec2506 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -868,28 +868,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs) return *(ptr + f->reg_idx++); } -void add_interrupt_randomness(int irq, int irq_flags) +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) { struct entropy_store *r; struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness); - struct pt_regs *regs = get_irq_regs(); unsigned long now = jiffies; cycles_t cycles = random_get_entropy(); __u32 c_high, j_high; - __u64 ip; unsigned long seed; int credit = 0; if (cycles == 0) - cycles = get_reg(fast_pool, regs); + cycles = get_reg(fast_pool, NULL); c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0; j_high = (sizeof(now) > 4) ? now >> 32 : 0; fast_pool->pool[0] ^= cycles ^ j_high ^ irq; fast_pool->pool[1] ^= now ^ c_high; - ip = regs ? instruction_pointer(regs) : _RET_IP_; + if (!ip) + ip = _RET_IP_; fast_pool->pool[2] ^= ip; fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 : - get_reg(fast_pool, regs); + get_reg(fast_pool, NULL); fast_mix(fast_pool); add_interrupt_bench(cycles); diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index faf433af425e..c51a2e5ec9ac 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -63,6 +63,7 @@ struct irq_desc { unsigned int irqs_unhandled; atomic_t threads_handled; int threads_handled_last; + u64 random_ip; raw_spinlock_t lock; struct cpumask *percpu_enabled; #ifdef CONFIG_SMP diff --git a/include/linux/random.h b/include/linux/random.h index b05856e16b75..4a64ad52dcb7 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -11,7 +11,7 @@ extern void add_device_randomness(const void *, unsigned int); extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); -extern void add_interrupt_randomness(int irq, int irq_flags); +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip); extern void get_random_bytes(void *buf, int nbytes); extern void get_random_bytes_arch(void *buf, int nbytes); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 635480270858..26a63672c263 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -133,6 +133,8 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { + struct pt_regs *regs = get_irq_regs(); + u64 ip = regs ? 
instruction_pointer(regs) : 0; irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; @@ -173,7 +175,11 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) action = action->next; } while (action); - add_interrupt_randomness(irq, flags); +#ifndef CONFIG_PREEMPT_RT_FULL + add_interrupt_randomness(irq, flags, ip); +#else + desc->random_ip = ip; +#endif if (!noirqdebug) note_interrupt(irq, desc, retval); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3691d0e778bf..382cbe57abf3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -954,6 +954,12 @@ static int irq_thread(void *data) if (action_ret == IRQ_HANDLED) atomic_inc(&desc->threads_handled); +#ifdef CONFIG_PREEMPT_RT_FULL + migrate_disable(); + add_interrupt_randomness(action->irq, 0, + desc->random_ip ^ (unsigned long) action); + migrate_enable(); +#endif wake_threads_waitq(desc); } -- cgit v1.2.3 From d219a3fbf9b2bd94834c5f3f05c33736cfc6ee11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Feb 2012 12:03:30 +0100 Subject: seqlock: Prevent rt starvation If a low prio writer gets preempted while holding the seqlock write locked, a high prio reader spins forever on RT. To prevent this let the reader grab the spinlock, so it blocks and eventually boosts the writer. This way the writer can proceed and endless spinning is prevented. For seqcount writers we disable preemption over the update code path. Thaanks to Al Viro for distangling some VFS code to make that possible. Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- include/linux/seqlock.h | 57 ++++++++++++++++++++++++++++++++++++++----------- include/net/dst.h | 2 +- include/net/neighbour.h | 4 ++-- 3 files changed, 48 insertions(+), 15 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index f5df8f687b4d..de343cda4680 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -219,20 +219,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) return __read_seqcount_retry(s, start); } - - -static inline void raw_write_seqcount_begin(seqcount_t *s) +static inline void __raw_write_seqcount_begin(seqcount_t *s) { s->sequence++; smp_wmb(); } -static inline void raw_write_seqcount_end(seqcount_t *s) +static inline void raw_write_seqcount_begin(seqcount_t *s) +{ + preempt_disable_rt(); + __raw_write_seqcount_begin(s); +} + +static inline void __raw_write_seqcount_end(seqcount_t *s) { smp_wmb(); s->sequence++; } +static inline void raw_write_seqcount_end(seqcount_t *s) +{ + __raw_write_seqcount_end(s); + preempt_enable_rt(); +} + /* * raw_write_seqcount_latch - redirect readers to even/odd copy * @s: pointer to seqcount_t @@ -305,10 +315,33 @@ typedef struct { /* * Read side functions for starting and finalizing a read side section. */ +#ifndef CONFIG_PREEMPT_RT_FULL static inline unsigned read_seqbegin(const seqlock_t *sl) { return read_seqcount_begin(&sl->seqcount); } +#else +/* + * Starvation safe read side for RT + */ +static inline unsigned read_seqbegin(seqlock_t *sl) +{ + unsigned ret; + +repeat: + ret = ACCESS_ONCE(sl->seqcount.sequence); + if (unlikely(ret & 1)) { + /* + * Take the lock and let the writer proceed (i.e. evtl + * boost it), otherwise we could loop here forever. 
+ */ + spin_lock(&sl->lock); + spin_unlock(&sl->lock); + goto repeat; + } + return ret; +} +#endif static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) { @@ -323,36 +356,36 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); } static inline void write_sequnlock(seqlock_t *sl) { - write_seqcount_end(&sl->seqcount); + __raw_write_seqcount_end(&sl->seqcount); spin_unlock(&sl->lock); } static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); } static inline void write_sequnlock_bh(seqlock_t *sl) { - write_seqcount_end(&sl->seqcount); + __raw_write_seqcount_end(&sl->seqcount); spin_unlock_bh(&sl->lock); } static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); } static inline void write_sequnlock_irq(seqlock_t *sl) { - write_seqcount_end(&sl->seqcount); + __raw_write_seqcount_end(&sl->seqcount); spin_unlock_irq(&sl->lock); } @@ -361,7 +394,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) unsigned long flags; spin_lock_irqsave(&sl->lock, flags); - write_seqcount_begin(&sl->seqcount); + __write_seqcount_begin(&sl->seqcount); return flags; } @@ -371,7 +404,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) { - write_seqcount_end(&sl->seqcount); + __raw_write_seqcount_end(&sl->seqcount); spin_unlock_irqrestore(&sl->lock, flags); } diff --git a/include/net/dst.h b/include/net/dst.h index a8ae4e760778..9b670af2814b 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -403,7 +403,7 @@ static inline void dst_confirm(struct dst_entry *dst) static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n, struct sk_buff *skb) { - const struct hh_cache *hh; + struct hh_cache *hh; if (dst->pending_confirm) { unsigned long now = jiffies; diff --git a/include/net/neighbour.h b/include/net/neighbour.h index f60558d0254c..58f0c09c9a05 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -387,7 +387,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) } #endif -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq; int hh_len; @@ -442,7 +442,7 @@ struct neighbour_cb { #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n, const struct net_device *dev) { unsigned int seq; -- cgit v1.2.3 From 4777e9e6ac7affccdbb74da1d31efeee7c744e64 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Sun, 1 Dec 2013 23:03:52 -0500 Subject: seqlock: consolidate spin_lock/unlock waiting with spin_unlock_wait since c2f21ce ("locking: Implement new raw_spinlock") include/linux/spinlock.h includes spin_unlock_wait() to wait for a concurren holder of a lock. this patch just moves over to that API. spin_unlock_wait covers both raw_spinlock_t and spinlock_t so it should be safe here as well. 
the added rt-variant of read_seqbegin in include/linux/seqlock.h that is being modified, was introduced by patch: seqlock-prevent-rt-starvation.patch behavior should be unchanged. Signed-off-by: Nicholas Mc Guire Signed-off-by: Sebastian Andrzej Siewior --- include/linux/seqlock.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index de343cda4680..4acd0e2fb5cb 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -335,8 +335,7 @@ repeat: * Take the lock and let the writer proceed (i.e. evtl * boost it), otherwise we could loop here forever. */ - spin_lock(&sl->lock); - spin_unlock(&sl->lock); + spin_unlock_wait(&sl->lock); goto repeat; } return ret; @@ -356,7 +355,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); - __write_seqcount_begin(&sl->seqcount); + __raw_write_seqcount_begin(&sl->seqcount); } static inline void write_sequnlock(seqlock_t *sl) @@ -368,7 +367,7 @@ static inline void write_sequnlock(seqlock_t *sl) static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); - __write_seqcount_begin(&sl->seqcount); + __raw_write_seqcount_begin(&sl->seqcount); } static inline void write_sequnlock_bh(seqlock_t *sl) @@ -380,7 +379,7 @@ static inline void write_sequnlock_bh(seqlock_t *sl) static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); - __write_seqcount_begin(&sl->seqcount); + __raw_write_seqcount_begin(&sl->seqcount); } static inline void write_sequnlock_irq(seqlock_t *sl) @@ -394,7 +393,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) unsigned long flags; spin_lock_irqsave(&sl->lock, flags); - __write_seqcount_begin(&sl->seqcount); + __raw_write_seqcount_begin(&sl->seqcount); return flags; } -- cgit v1.2.3 From e6d0162e6f0ba10e95b938c6fbf51385abddbcdd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Mar 2012 10:36:57 -0500 Subject: cpu: Make hotplug.lock a "sleeping" spinlock on RT Tasks can block on hotplug.lock in pin_current_cpu(), but their state might be != RUNNING. So the mutex wakeup will set the state unconditionally to RUNNING. That might cause spurious unexpected wakeups. We could provide a state preserving mutex_lock() function, but this is semantically backwards. So instead we convert the hotplug.lock() to a spinlock for RT, which has the state preserving semantics already. Signed-off-by: Steven Rostedt Cc: Carsten Emde Cc: John Kacur Cc: Peter Zijlstra Cc: Clark Williams Cc: stable-rt@vger.kernel.org Link: http://lkml.kernel.org/r/1330702617.25686.265.camel@gandalf.stny.rr.com Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 1d6ad09c0592..317caa37fa62 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -58,7 +58,12 @@ static int cpu_hotplug_disabled; static struct { struct task_struct *active_writer; +#ifdef CONFIG_PREEMPT_RT_FULL + /* Makes the lock keep the task's state */ + spinlock_t lock; +#else struct mutex lock; /* Synchronizes accesses to refcount, */ +#endif /* * Also blocks the new readers during * an ongoing cpu hotplug operation. 
@@ -72,13 +77,27 @@ static struct { #endif } cpu_hotplug = { .active_writer = NULL, +#ifdef CONFIG_PREEMPT_RT_FULL + .lock = __SPIN_LOCK_UNLOCKED(cpu_hotplug.lock), +#else .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), +#endif .refcount = 0, #ifdef CONFIG_DEBUG_LOCK_ALLOC .dep_map = {.name = "cpu_hotplug.lock" }, #endif }; +#ifdef CONFIG_PREEMPT_RT_FULL +# define hotplug_lock() rt_spin_lock(&cpu_hotplug.lock) +# define hotplug_trylock() rt_spin_trylock(&cpu_hotplug.lock) +# define hotplug_unlock() rt_spin_unlock(&cpu_hotplug.lock) +#else +# define hotplug_lock() mutex_lock(&cpu_hotplug.lock) +# define hotplug_trylock() mutex_trylock(&cpu_hotplug.lock) +# define hotplug_unlock() mutex_unlock(&cpu_hotplug.lock) +#endif + /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) #define cpuhp_lock_acquire_tryread() \ @@ -115,8 +134,8 @@ retry: return; } preempt_enable(); - mutex_lock(&cpu_hotplug.lock); - mutex_unlock(&cpu_hotplug.lock); + hotplug_lock(); + hotplug_unlock(); preempt_disable(); goto retry; } @@ -189,9 +208,9 @@ void get_online_cpus(void) if (cpu_hotplug.active_writer == current) return; cpuhp_lock_acquire_read(); - mutex_lock(&cpu_hotplug.lock); + hotplug_lock(); cpu_hotplug.refcount++; - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); } EXPORT_SYMBOL_GPL(get_online_cpus); @@ -199,11 +218,12 @@ bool try_get_online_cpus(void) { if (cpu_hotplug.active_writer == current) return true; - if (!mutex_trylock(&cpu_hotplug.lock)) + + if (!hotplug_trylock()) { return false; cpuhp_lock_acquire_tryread(); cpu_hotplug.refcount++; - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); return true; } EXPORT_SYMBOL_GPL(try_get_online_cpus); @@ -212,7 +232,7 @@ void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; - if (!mutex_trylock(&cpu_hotplug.lock)) { + if (!hotplug_trylock()) { atomic_inc(&cpu_hotplug.puts_pending); cpuhp_lock_release(); return; @@ -223,7 +243,7 @@ void put_online_cpus(void) if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); cpuhp_lock_release(); } @@ -257,7 +277,7 @@ void cpu_hotplug_begin(void) cpuhp_lock_acquire(); for (;;) { - mutex_lock(&cpu_hotplug.lock); + hotplug_lock(); if (atomic_read(&cpu_hotplug.puts_pending)) { int delta; @@ -267,7 +287,7 @@ void cpu_hotplug_begin(void) if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); schedule(); } } @@ -275,7 +295,7 @@ void cpu_hotplug_begin(void) void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; - mutex_unlock(&cpu_hotplug.lock); + hotplug_unlock(); cpuhp_lock_release(); } -- cgit v1.2.3 From 99a10b7f52d944e1ee07f1fd556260982c62dc18 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Jul 2012 08:07:43 +0000 Subject: cpu/rt: Rework cpu down for PREEMPT_RT Bringing a CPU down is a pain with the PREEMPT_RT kernel because tasks can be preempted in many more places than in non-RT. In order to handle per_cpu variables, tasks may be pinned to a CPU for a while, and even sleep. But these tasks need to be off the CPU if that CPU is going down. Several synchronization methods have been tried, but when stressed they failed. This is a new approach. 
A sync_tsk thread is still created and tasks may still block on a lock when the CPU is going down, but how that works is a bit different. When cpu_down() starts, it will create the sync_tsk and wait on it to inform that current tasks that are pinned on the CPU are no longer pinned. But new tasks that are about to be pinned will still be allowed to do so at this time. Then the notifiers are called. Several notifiers will bring down tasks that will enter these locations. Some of these tasks will take locks of other tasks that are on the CPU. If we don't let those other tasks continue, but make them block until CPU down is done, the tasks that the notifiers are waiting on will never complete as they are waiting for the locks held by the tasks that are blocked. Thus we still let the task pin the CPU until the notifiers are done. After the notifiers run, we then make new tasks entering the pinned CPU sections grab a mutex and wait. This mutex is now a per CPU mutex in the hotplug_pcp descriptor. To help things along, a new function in the scheduler code is created called migrate_me(). This function will try to migrate the current task off the CPU this is going down if possible. When the sync_tsk is created, all tasks will then try to migrate off the CPU going down. There are several cases that this wont work, but it helps in most cases. After the notifiers are called and if a task can't migrate off but enters the pin CPU sections, it will be forced to wait on the hotplug_pcp mutex until the CPU down is complete. Then the scheduler will force the migration anyway. Also, I found that THREAD_BOUND need to also be accounted for in the pinned CPU, and the migrate_disable no longer treats them special. This helps fix issues with ksoftirqd and workqueue that unbind on CPU down. Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 7 ++ kernel/cpu.c | 245 ++++++++++++++++++++++++++++++++++++++++---------- kernel/sched/core.c | 82 ++++++++++++++++- 3 files changed, 286 insertions(+), 48 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6762444228cc..01c28573dc65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2110,6 +2110,10 @@ extern void do_set_cpus_allowed(struct task_struct *p, extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +int migrate_me(void); +void tell_sched_cpu_down_begin(int cpu); +void tell_sched_cpu_down_done(int cpu); + #else static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) @@ -2122,6 +2126,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, return -EINVAL; return 0; } +static inline int migrate_me(void) { return 0; } +static inline void tell_sched_cpu_down_begin(int cpu) { } +static inline void tell_sched_cpu_down_done(int cpu) { } #endif #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/cpu.c b/kernel/cpu.c index 317caa37fa62..239e10fa54e9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -58,12 +58,7 @@ static int cpu_hotplug_disabled; static struct { struct task_struct *active_writer; -#ifdef CONFIG_PREEMPT_RT_FULL - /* Makes the lock keep the task's state */ - spinlock_t lock; -#else struct mutex lock; /* Synchronizes accesses to refcount, */ -#endif /* * Also blocks the new readers during * an ongoing cpu hotplug operation. 
@@ -77,27 +72,13 @@ static struct { #endif } cpu_hotplug = { .active_writer = NULL, -#ifdef CONFIG_PREEMPT_RT_FULL - .lock = __SPIN_LOCK_UNLOCKED(cpu_hotplug.lock), -#else .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), -#endif .refcount = 0, #ifdef CONFIG_DEBUG_LOCK_ALLOC .dep_map = {.name = "cpu_hotplug.lock" }, #endif }; -#ifdef CONFIG_PREEMPT_RT_FULL -# define hotplug_lock() rt_spin_lock(&cpu_hotplug.lock) -# define hotplug_trylock() rt_spin_trylock(&cpu_hotplug.lock) -# define hotplug_unlock() rt_spin_unlock(&cpu_hotplug.lock) -#else -# define hotplug_lock() mutex_lock(&cpu_hotplug.lock) -# define hotplug_trylock() mutex_trylock(&cpu_hotplug.lock) -# define hotplug_unlock() mutex_unlock(&cpu_hotplug.lock) -#endif - /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) #define cpuhp_lock_acquire_tryread() \ @@ -105,12 +86,42 @@ static struct { #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) +/** + * hotplug_pcp - per cpu hotplug descriptor + * @unplug: set when pin_current_cpu() needs to sync tasks + * @sync_tsk: the task that waits for tasks to finish pinned sections + * @refcount: counter of tasks in pinned sections + * @grab_lock: set when the tasks entering pinned sections should wait + * @synced: notifier for @sync_tsk to tell cpu_down it's finished + * @mutex: the mutex to make tasks wait (used when @grab_lock is true) + * @mutex_init: zero if the mutex hasn't been initialized yet. + * + * Although @unplug and @sync_tsk may point to the same task, the @unplug + * is used as a flag and still exists after @sync_tsk has exited and + * @sync_tsk set to NULL. + */ struct hotplug_pcp { struct task_struct *unplug; + struct task_struct *sync_tsk; int refcount; + int grab_lock; struct completion synced; +#ifdef CONFIG_PREEMPT_RT_FULL + spinlock_t lock; +#else + struct mutex mutex; +#endif + int mutex_init; }; +#ifdef CONFIG_PREEMPT_RT_FULL +# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock) +# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock) +#else +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex) +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex) +#endif + static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); /** @@ -124,18 +135,39 @@ static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); void pin_current_cpu(void) { struct hotplug_pcp *hp; + int force = 0; retry: hp = &__get_cpu_var(hotplug_pcp); - if (!hp->unplug || hp->refcount || preempt_count() > 1 || + if (!hp->unplug || hp->refcount || force || preempt_count() > 1 || hp->unplug == current) { hp->refcount++; return; } - preempt_enable(); - hotplug_lock(); - hotplug_unlock(); + if (hp->grab_lock) { + preempt_enable(); + hotplug_lock(hp); + hotplug_unlock(hp); + } else { + preempt_enable(); + /* + * Try to push this task off of this CPU. + */ + if (!migrate_me()) { + preempt_disable(); + hp = &__get_cpu_var(hotplug_pcp); + if (!hp->grab_lock) { + /* + * Just let it continue it's already pinned + * or about to sleep. + */ + force = 1; + goto retry; + } + preempt_enable(); + } + } preempt_disable(); goto retry; } @@ -156,26 +188,84 @@ void unpin_current_cpu(void) wake_up_process(hp->unplug); } -/* - * FIXME: Is this really correct under all circumstances ? 
- */ +static void wait_for_pinned_cpus(struct hotplug_pcp *hp) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + while (hp->refcount) { + schedule_preempt_disabled(); + set_current_state(TASK_UNINTERRUPTIBLE); + } +} + static int sync_unplug_thread(void *data) { struct hotplug_pcp *hp = data; preempt_disable(); hp->unplug = current; + wait_for_pinned_cpus(hp); + + /* + * This thread will synchronize the cpu_down() with threads + * that have pinned the CPU. When the pinned CPU count reaches + * zero, we inform the cpu_down code to continue to the next step. + */ set_current_state(TASK_UNINTERRUPTIBLE); - while (hp->refcount) { - schedule_preempt_disabled(); + preempt_enable(); + complete(&hp->synced); + + /* + * If all succeeds, the next step will need tasks to wait till + * the CPU is offline before continuing. To do this, the grab_lock + * is set and tasks going into pin_current_cpu() will block on the + * mutex. But we still need to wait for those that are already in + * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop() + * will kick this thread out. + */ + while (!hp->grab_lock && !kthread_should_stop()) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + /* Make sure grab_lock is seen before we see a stale completion */ + smp_mb(); + + /* + * Now just before cpu_down() enters stop machine, we need to make + * sure all tasks that are in pinned CPU sections are out, and new + * tasks will now grab the lock, keeping them from entering pinned + * CPU sections. + */ + if (!kthread_should_stop()) { + preempt_disable(); + wait_for_pinned_cpus(hp); + preempt_enable(); + complete(&hp->synced); + } + + set_current_state(TASK_UNINTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); set_current_state(TASK_UNINTERRUPTIBLE); } set_current_state(TASK_RUNNING); - preempt_enable(); - complete(&hp->synced); + + /* + * Force this thread off this CPU as it's going down and + * we don't want any more work on this CPU. + */ + current->flags &= ~PF_NO_SETAFFINITY; + do_set_cpus_allowed(current, cpu_present_mask); + migrate_me(); return 0; } +static void __cpu_unplug_sync(struct hotplug_pcp *hp) +{ + wake_up_process(hp->sync_tsk); + wait_for_completion(&hp->synced); +} + /* * Start the sync_unplug_thread on the target cpu and wait for it to * complete. @@ -183,23 +273,83 @@ static int sync_unplug_thread(void *data) static int cpu_unplug_begin(unsigned int cpu) { struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); - struct task_struct *tsk; + int err; + + /* Protected by cpu_hotplug.lock */ + if (!hp->mutex_init) { +#ifdef CONFIG_PREEMPT_RT_FULL + spin_lock_init(&hp->lock); +#else + mutex_init(&hp->mutex); +#endif + hp->mutex_init = 1; + } + + /* Inform the scheduler to migrate tasks off this CPU */ + tell_sched_cpu_down_begin(cpu); init_completion(&hp->synced); - tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); - if (IS_ERR(tsk)) - return (PTR_ERR(tsk)); - kthread_bind(tsk, cpu); - wake_up_process(tsk); - wait_for_completion(&hp->synced); + + hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); + if (IS_ERR(hp->sync_tsk)) { + err = PTR_ERR(hp->sync_tsk); + hp->sync_tsk = NULL; + return err; + } + kthread_bind(hp->sync_tsk, cpu); + + /* + * Wait for tasks to get out of the pinned sections, + * it's still OK if new tasks enter. Some CPU notifiers will + * wait for tasks that are going to enter these sections and + * we must not have them block. 
+ */ + __cpu_unplug_sync(hp); + return 0; } +static void cpu_unplug_sync(unsigned int cpu) +{ + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); + + init_completion(&hp->synced); + /* The completion needs to be initialzied before setting grab_lock */ + smp_wmb(); + + /* Grab the mutex before setting grab_lock */ + hotplug_lock(hp); + hp->grab_lock = 1; + + /* + * The CPU notifiers have been completed. + * Wait for tasks to get out of pinned CPU sections and have new + * tasks block until the CPU is completely down. + */ + __cpu_unplug_sync(hp); + + /* All done with the sync thread */ + kthread_stop(hp->sync_tsk); + hp->sync_tsk = NULL; +} + static void cpu_unplug_done(unsigned int cpu) { struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); hp->unplug = NULL; + /* Let all tasks know cpu unplug is finished before cleaning up */ + smp_wmb(); + + if (hp->sync_tsk) + kthread_stop(hp->sync_tsk); + + if (hp->grab_lock) { + hotplug_unlock(hp); + /* protected by cpu_hotplug.lock */ + hp->grab_lock = 0; + } + tell_sched_cpu_down_done(cpu); } void get_online_cpus(void) @@ -208,9 +358,9 @@ void get_online_cpus(void) if (cpu_hotplug.active_writer == current) return; cpuhp_lock_acquire_read(); - hotplug_lock(); + mutex_lock(&cpu_hotplug.lock); cpu_hotplug.refcount++; - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); } EXPORT_SYMBOL_GPL(get_online_cpus); @@ -219,11 +369,11 @@ bool try_get_online_cpus(void) if (cpu_hotplug.active_writer == current) return true; - if (!hotplug_trylock()) { + if (!mutex_trylock(&cpu_hotplug.lock)) return false; cpuhp_lock_acquire_tryread(); cpu_hotplug.refcount++; - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); return true; } EXPORT_SYMBOL_GPL(try_get_online_cpus); @@ -232,7 +382,7 @@ void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; - if (!hotplug_trylock()) { + if (!mutex_trylock(&cpu_hotplug.lock)) { atomic_inc(&cpu_hotplug.puts_pending); cpuhp_lock_release(); return; @@ -243,7 +393,7 @@ void put_online_cpus(void) if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); cpuhp_lock_release(); } @@ -277,7 +427,7 @@ void cpu_hotplug_begin(void) cpuhp_lock_acquire(); for (;;) { - hotplug_lock(); + mutex_lock(&cpu_hotplug.lock); if (atomic_read(&cpu_hotplug.puts_pending)) { int delta; @@ -287,7 +437,7 @@ void cpu_hotplug_begin(void) if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); schedule(); } } @@ -295,7 +445,7 @@ void cpu_hotplug_begin(void) void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); cpuhp_lock_release(); } @@ -528,6 +678,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) smpboot_park_threads(cpu); + /* Notifiers are done. Don't let any more tasks pin this CPU. */ + cpu_unplug_sync(cpu); + /* * So now all preempt/rcu users must observe !cpu_active(). 
*/ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 572834045528..8bf0dddd935b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2760,7 +2760,7 @@ void migrate_disable(void) { struct task_struct *p = current; - if (in_atomic() || p->flags & PF_NO_SETAFFINITY) { + if (in_atomic()) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic++; #endif @@ -2790,7 +2790,7 @@ void migrate_enable(void) unsigned long flags; struct rq *rq; - if (in_atomic() || p->flags & PF_NO_SETAFFINITY) { + if (in_atomic()) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic--; #endif @@ -4868,6 +4868,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) cpumask_copy(&p->cpus_allowed, new_mask); } +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks); +static DEFINE_MUTEX(sched_down_mutex); +static cpumask_t sched_down_cpumask; + +void tell_sched_cpu_down_begin(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_set_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} + +void tell_sched_cpu_down_done(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_clear_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} + +/** + * migrate_me - try to move the current task off this cpu + * + * Used by the pin_current_cpu() code to try to get tasks + * to move off the current CPU as it is going down. + * It will only move the task if the task isn't pinned to + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY) + * and the task has to be in a RUNNING state. Otherwise the + * movement of the task will wake it up (change its state + * to running) when the task did not expect it. + * + * Returns 1 if it succeeded in moving the current task + * 0 otherwise. + */ +int migrate_me(void) +{ + struct task_struct *p = current; + struct migration_arg arg; + struct cpumask *cpumask; + struct cpumask *mask; + unsigned long flags; + unsigned int dest_cpu; + struct rq *rq; + + /* + * We can not migrate tasks bounded to a CPU or tasks not + * running. The movement of the task will wake it up. + */ + if (p->flags & PF_NO_SETAFFINITY || p->state) + return 0; + + mutex_lock(&sched_down_mutex); + rq = task_rq_lock(p, &flags); + + cpumask = &__get_cpu_var(sched_cpumasks); + mask = &p->cpus_allowed; + + cpumask_andnot(cpumask, mask, &sched_down_cpumask); + + if (!cpumask_weight(cpumask)) { + /* It's only on this CPU? */ + task_rq_unlock(rq, p, &flags); + mutex_unlock(&sched_down_mutex); + return 0; + } + + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask); + + arg.task = p; + arg.dest_cpu = dest_cpu; + + task_rq_unlock(rq, p, &flags); + + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + mutex_unlock(&sched_down_mutex); + + return 1; +} + /* * This is how migration works: * -- cgit v1.2.3 From 53a1e4ea8fa0ae98e3aa56a9af688b29b334f432 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Dec 2013 09:16:52 -0500 Subject: cpu hotplug: Document why PREEMPT_RT uses a spinlock The patch: cpu: Make hotplug.lock a "sleeping" spinlock on RT Tasks can block on hotplug.lock in pin_current_cpu(), but their state might be != RUNNING. So the mutex wakeup will set the state unconditionally to RUNNING. That might cause spurious unexpected wakeups. We could provide a state preserving mutex_lock() function, but this is semantically backwards. So instead we convert the hotplug.lock() to a spinlock for RT, which has the state preserving semantics already. 
That patch fixed a bug where the hotplug lock on PREEMPT_RT could be taken after a task had set its state to TASK_UNINTERRUPTIBLE and before it called schedule(). If the hotplug lock used a mutex, and there was contention, the current task's state would be set back to TASK_RUNNING and the schedule() call would not sleep. This caused unexpected results. Although the patch had a description of the change, the code had no comments about it. This causes confusion to those who review the code, and as PREEMPT_RT is held in a quilt queue and not git, it's not as easy to see why a change was made. Even if it were in git, the code should still have a comment for something as subtle as this. Document the rationale for using a spinlock on PREEMPT_RT in the hotplug lock code. Reported-by: Nicholas Mc Guire Signed-off-by: Steven Rostedt Signed-off-by: Sebastian Andrzej Siewior --- kernel/cpu.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 239e10fa54e9..095cca1297e0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -107,6 +107,14 @@ struct hotplug_pcp { int grab_lock; struct completion synced; #ifdef CONFIG_PREEMPT_RT_FULL + /* + * Note, on PREEMPT_RT, the hotplug lock must save the state of + * the task, otherwise the mutex will cause the task to fail + * to sleep when required. (Because it's called from migrate_disable()) + * + * The spinlock_t on PREEMPT_RT is a mutex that saves the task's + * state. + */ spinlock_t lock; #else struct mutex mutex; -- cgit v1.2.3 From 606290c1c81dc6bcc96ac0c7b2bb3a0351d392fd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 7 Jun 2013 22:37:06 +0200 Subject: kernel/cpu: fix cpu down problem if kthread's cpu is going down If a kthread is pinned to CPUx and CPUx is going down, then we get into trouble: - first the unplug thread is created - it will set itself to hp->unplug. As a result, every task that is going to take a lock has to leave the CPU. - the CPU_DOWN_PREPARE notifiers are started. The worker thread will start a new process for the "high priority worker". Now the kthread would like to take a lock, but since it can't leave the CPU it will never complete its task. We could fire the unplug thread after the notifiers, but then the CPU is no longer marked "online" and the unplug thread would run on CPU0, which was fixed before :) So instead the unplug thread is started and kept waiting until the notifiers complete their work. Signed-off-by: Sebastian Andrzej Siewior --- kernel/cpu.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 095cca1297e0..c3ce0b9dc8fe 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -106,6 +106,7 @@ struct hotplug_pcp { int refcount; int grab_lock; struct completion synced; + struct completion unplug_wait; #ifdef CONFIG_PREEMPT_RT_FULL /* * Note, on PREEMPT_RT, the hotplug lock must save the state of @@ -209,6 +210,7 @@ static int sync_unplug_thread(void *data) { struct hotplug_pcp *hp = data; + wait_for_completion(&hp->unplug_wait); preempt_disable(); hp->unplug = current; wait_for_pinned_cpus(hp); @@ -274,6 +276,14 @@ static void __cpu_unplug_sync(struct hotplug_pcp *hp) wait_for_completion(&hp->synced); } +static void __cpu_unplug_wait(unsigned int cpu) +{ + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); + + complete(&hp->unplug_wait); + wait_for_completion(&hp->synced); +} + /* * Start the sync_unplug_thread on the target cpu and wait for it to * complete.
@@ -297,6 +307,7 @@ static int cpu_unplug_begin(unsigned int cpu) tell_sched_cpu_down_begin(cpu); init_completion(&hp->synced); + init_completion(&hp->unplug_wait); hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); if (IS_ERR(hp->sync_tsk)) { @@ -312,8 +323,7 @@ static int cpu_unplug_begin(unsigned int cpu) * wait for tasks that are going to enter these sections and * we must not have them block. */ - __cpu_unplug_sync(hp); - + wake_up_process(hp->sync_tsk); return 0; } @@ -684,6 +694,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) #endif synchronize_rcu(); + __cpu_unplug_wait(cpu); smpboot_park_threads(cpu); /* Notifiers are done. Don't let any more tasks pin this CPU. */ -- cgit v1.2.3 From 86f23591bc171c62fa180a512f35990fcb3f9301 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 14 Jun 2013 17:16:35 +0200 Subject: kernel/hotplug: restore original cpu mask on cpu down If a task which is allowed to run only on CPU X puts CPU Y down, then afterwards it will be allowed to run on all CPUs except CPU Y once it comes back from the kernel. This patch ensures that we don't lose the initial setting unless the CPU the task is running on is going down. Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- kernel/cpu.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index c3ce0b9dc8fe..4a4160fcb44f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -642,6 +642,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) .hcpu = hcpu, }; cpumask_var_t cpumask; + cpumask_var_t cpumask_org; if (num_online_cpus() == 1) return -EBUSY; @@ -652,6 +653,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) /* Move the downtaker off the unplug cpu */ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) return -ENOMEM; + if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL)) { + free_cpumask_var(cpumask); + return -ENOMEM; + } + + cpumask_copy(cpumask_org, tsk_cpus_allowed(current)); cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu)); set_cpus_allowed_ptr(current, cpumask); free_cpumask_var(cpumask); @@ -660,7 +667,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (mycpu == cpu) { printk(KERN_ERR "Yuck! Still on unplug CPU\n!"); migrate_enable(); - return -EBUSY; + err = -EBUSY; + goto restore_cpus; } cpu_hotplug_begin(); @@ -738,6 +746,9 @@ out_cancel: cpu_hotplug_done(); if (!err) cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); +restore_cpus: + set_cpus_allowed_ptr(current, cpumask_org); + free_cpumask_var(cpumask_org); return err; } -- cgit v1.2.3 From d9c026bce1bf668aa6996a03693ad389f0939746 Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Thu, 7 Nov 2013 10:06:07 +0800 Subject: cpu_down: move migrate_enable() back Commit 08c1ab68, "hotplug-use-migrate-disable.patch", intends to use migrate_enable()/migrate_disable() to replace the combination of preempt_enable() and preempt_disable(), but in the !CONFIG_PREEMPT_RT_FULL case, migrate_enable()/migrate_disable() are still equal to preempt_enable()/preempt_disable(). So the cpu_hotplug_begin()/cpu_unplug_begin(cpu) that follows would call schedule() and trigger schedule_debug() like this: _cpu_down() | + migrate_disable() = preempt_disable() | + cpu_hotplug_begin() or cpu_unplug_begin() | + schedule() | + __schedule() | + preempt_disable(); | + __schedule_bug() is true! So we should move migrate_enable() back, as in the original scheme.
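The resulting ordering in _cpu_down(), condensed from the diff below (illustrative only):

	migrate_enable();		/* equal to preempt_enable() on !RT; leave the atomic section first */
	cpu_hotplug_begin();		/* may now schedule() without hitting __schedule_bug() */
	err = cpu_unplug_begin(cpu);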
Cc: stable-rt@vger.kernel.org Signed-off-by: Tiejun Chen --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 4a4160fcb44f..af677166c2a8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -670,6 +670,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = -EBUSY; goto restore_cpus; } + migrate_enable(); cpu_hotplug_begin(); err = cpu_unplug_begin(cpu); @@ -742,7 +743,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) out_release: cpu_unplug_done(cpu); out_cancel: - migrate_enable(); cpu_hotplug_done(); if (!err) cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); -- cgit v1.2.3 From 43c7ff23f2fa2b9b2fd8bbe42ab322eb2bcfbedf Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 24 Mar 2015 08:14:49 +0100 Subject: hotplug: Use set_cpus_allowed_ptr() in sync_unplug_thread() do_set_cpus_allowed() is not safe vs ->sched_class change. crash> bt PID: 11676 TASK: ffff88026f979da0 CPU: 22 COMMAND: "sync_unplug/22" #0 [ffff880274d25bc8] machine_kexec at ffffffff8103b41c #1 [ffff880274d25c18] crash_kexec at ffffffff810d881a #2 [ffff880274d25cd8] oops_end at ffffffff81525818 #3 [ffff880274d25cf8] do_invalid_op at ffffffff81003096 #4 [ffff880274d25d90] invalid_op at ffffffff8152d3de [exception RIP: set_cpus_allowed_rt+18] RIP: ffffffff8109e012 RSP: ffff880274d25e48 RFLAGS: 00010202 RAX: ffffffff8109e000 RBX: ffff88026f979da0 RCX: ffff8802770cb6e8 RDX: 0000000000000000 RSI: ffffffff81add700 RDI: ffff88026f979da0 RBP: ffff880274d25e78 R8: ffffffff816112e0 R9: 0000000000000001 R10: 0000000000000001 R11: 0000000000011940 R12: ffff88026f979da0 R13: ffff8802770cb6d0 R14: ffff880274d25fd8 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #5 [ffff880274d25e60] do_set_cpus_allowed at ffffffff8108e65f #6 [ffff880274d25e80] sync_unplug_thread at ffffffff81058c08 #7 [ffff880274d25ed8] kthread at ffffffff8107cad6 #8 [ffff880274d25f50] ret_from_fork at ffffffff8152bbbc crash> task_struct ffff88026f979da0 | grep class sched_class = 0xffffffff816111e0 , Signed-off-by: Mike Galbraith Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index af677166c2a8..27f44cd27d9d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -265,7 +265,7 @@ static int sync_unplug_thread(void *data) * we don't want any more work on this CPU. */ current->flags &= ~PF_NO_SETAFFINITY; - do_set_cpus_allowed(current, cpu_present_mask); + set_cpus_allowed_ptr(current, cpu_present_mask); migrate_me(); return 0; } -- cgit v1.2.3 From 11a65c05e164deb2fccdc8d90de3b461a61044df Mon Sep 17 00:00:00 2001 From: John Kacur Date: Fri, 27 Apr 2012 12:48:46 +0200 Subject: scsi: qla2xxx: Use local_irq_save_nort() in qla2x00_poll RT triggers the following: [ 11.307652] [] __might_sleep+0xe7/0x110 [ 11.307663] [] rt_spin_lock+0x24/0x60 [ 11.307670] [] ? rt_spin_lock_slowunlock+0x78/0x90 [ 11.307703] [] qla24xx_intr_handler+0x63/0x2d0 [qla2xxx] [ 11.307736] [] qla2x00_poll+0x67/0x90 [qla2xxx] Function qla2x00_poll does local_irq_save() before calling qla24xx_intr_handler which has a spinlock. Since spinlocks are sleepable on rt, it is not allowed to call them with interrupts disabled. Therefore we use local_irq_save_nort() instead which saves flags without disabling interrupts. 
This fix needs to be applied to v3.0-rt, v3.2-rt and v3.4-rt Suggested-by: Thomas Gleixner Signed-off-by: John Kacur Cc: Steven Rostedt Cc: David Sommerseth Link: http://lkml.kernel.org/r/1335523726-10024-1-git-send-email-jkacur@redhat.com Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner --- drivers/scsi/qla2xxx/qla_inline.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h index fee9eb7c8a60..b42d4adc42dc 100644 --- a/drivers/scsi/qla2xxx/qla_inline.h +++ b/drivers/scsi/qla2xxx/qla_inline.h @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp) { unsigned long flags; struct qla_hw_data *ha = rsp->hw; - local_irq_save(flags); + local_irq_save_nort(flags); if (IS_P3P_TYPE(ha)) qla82xx_poll(0, rsp); else ha->isp_ops->intr_handler(0, rsp); - local_irq_restore(flags); + local_irq_restore_nort(flags); } static inline uint8_t * -- cgit v1.2.3 From 2e25e00ac9eecd304fb25a58e25024785e430a94 Mon Sep 17 00:00:00 2001 From: Priyanka Jain Date: Thu, 17 May 2012 09:35:11 +0530 Subject: net,RT: Remove preemption disabling in netif_rx() 1) enqueue_to_backlog() (called from netif_rx) should be bound to a particular CPU. This can be achieved by disabling migration. No need to disable preemption. 2) Fixes crash "BUG: scheduling while atomic: ksoftirqd" in case of RT. If preemption is disabled, enqueue_to_backlog() is called in atomic context. And if the backlog exceeds its count, kfree_skb() is called. But in RT, kfree_skb() might get scheduled out, so it expects a non-atomic context. 3) When CONFIG_PREEMPT_RT_FULL is not defined, migrate_enable() and migrate_disable() map to preempt_enable() and preempt_disable(), so there is no change in functionality in the non-RT case. - Replace preempt_enable(), preempt_disable() with migrate_enable(), migrate_disable() respectively - Replace get_cpu(), put_cpu() with get_cpu_light(), put_cpu_light() respectively Signed-off-by: Priyanka Jain Acked-by: Rajan Srivastava Cc: Link: http://lkml.kernel.org/r/1337227511-2271-1-git-send-email-Priyanka.Jain@freescale.com Cc: stable-rt@vger.kernel.org Signed-off-by: Thomas Gleixner --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index a98e98bc85cf..da919e2b0bbe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3352,7 +3352,7 @@ static int netif_rx_internal(struct sk_buff *skb) struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); + migrate_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -3362,13 +3362,13 @@ static int netif_rx_internal(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); + migrate_enable(); } else #endif { unsigned int qtail; - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); + ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail); + put_cpu_light(); } return ret; } -- cgit v1.2.3 From 422beb0be6f9871814600ba8872c91539dab3e5a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Sep 2012 16:21:08 +0200 Subject: net: Another local_irq_disable/kmalloc headache Replace it by a local lock.
Though that's pretty inefficient :( Signed-off-by: Thomas Gleixner --- net/core/skbuff.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 17fd8dca921e..e4e3b13cc729 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -336,6 +337,7 @@ struct netdev_alloc_cache { unsigned int pagecnt_bias; }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock); static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { @@ -344,7 +346,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) int order; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(netdev_alloc_lock, flags); nc = this_cpu_ptr(&netdev_alloc_cache); if (unlikely(!nc->frag.page)) { refill: @@ -389,7 +391,7 @@ refill: nc->frag.offset += fragsz; nc->pagecnt_bias--; end: - local_irq_restore(flags); + local_unlock_irqrestore(netdev_alloc_lock, flags); return data; } -- cgit v1.2.3 From dae83545bd6039aa47e094a5af9e5320c63e5124 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Oct 2012 11:18:08 +0100 Subject: net: netfilter: Serialize xt_write_recseq sections on RT The netfilter code relies only on the implicit semantics of local_bh_disable() for serializing xt_write_recseq sections. RT breaks that and needs explicit serialization here. Reported-by: Peter LaDow Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- include/linux/netfilter/x_tables.h | 7 +++++++ net/netfilter/core.c | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index a3e215bb0241..f434fc08c325 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -3,6 +3,7 @@ #include +#include #include /** @@ -282,6 +283,8 @@ void xt_free_table_info(struct xt_table_info *info); */ DECLARE_PER_CPU(seqcount_t, xt_recseq); +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock); + /** * xt_write_recseq_begin - start of a write section * @@ -296,6 +299,9 @@ static inline unsigned int xt_write_recseq_begin(void) { unsigned int addend; + /* RT protection */ + local_lock(xt_write_lock); + /* * Low order bit of sequence is set if we already * called xt_write_recseq_begin(). @@ -326,6 +332,7 @@ static inline void xt_write_recseq_end(unsigned int addend) /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */ smp_wmb(); __this_cpu_add(xt_recseq.sequence, addend); + local_unlock(xt_write_lock); } /* diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 024a2e25c8a4..656b5ebb0eff 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -21,11 +21,17 @@ #include #include #include +#include #include #include #include "nf_internals.h" +#ifdef CONFIG_PREEMPT_RT_BASE +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock); +EXPORT_PER_CPU_SYMBOL(xt_write_lock); +#endif + static DEFINE_MUTEX(afinfo_mutex); const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; -- cgit v1.2.3 From 9630f73f2df43e44f35310a98759fcd2075c7eaf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Oct 2012 15:12:49 +0000 Subject: net: Use local_bh_disable in netif_rx_ni() This code triggers the new WARN in __raise_softirq_irqoff() though it actually looks at the softirq pending bit and calls into the softirq code, but that does not fit well with the context-related softirq model of RT.
It's correct on mainline though, but going through local_bh_disable/enable here is not going to hurt badly. Signed-off-by: Thomas Gleixner --- net/core/dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index da919e2b0bbe..cdb78dc75a0b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3402,11 +3402,9 @@ int netif_rx_ni(struct sk_buff *skb) trace_netif_rx_ni_entry(skb); - migrate_disable(); + local_bh_disable(); err = netif_rx_internal(skb); - if (local_softirq_pending()) - thread_do_softirq(); - migrate_enable(); + local_bh_enable(); return err; } -- cgit v1.2.3 From acae1faf8edc2638ce8baae1d66a0cba2b964d56 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 Mar 2013 18:06:20 +0100 Subject: net: Add a mutex around devnet_rename_seq On RT write_seqcount_begin() disables preemption and device_rename() allocates memory with GFP_KERNEL and grabs later the sysfs_mutex mutex. Serialize with a mutex and add use the non preemption disabling __write_seqcount_begin(). To avoid writer starvation, let the reader grab the mutex and release it when it detects a writer in progress. This keeps the normal case (no reader on the fly) fast. [ tglx: Instead of replacing the seqcount by a mutex, add the mutex ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner --- net/core/dev.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index cdb78dc75a0b..5ab3842898ba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -182,6 +182,7 @@ static unsigned int napi_gen_id; static DEFINE_HASHTABLE(napi_hash, 8); static seqcount_t devnet_rename_seq; +static DEFINE_MUTEX(devnet_rename_mutex); static inline void dev_base_seq_inc(struct net *net) { @@ -832,7 +833,8 @@ retry: strcpy(name, dev->name); rcu_read_unlock(); if (read_seqcount_retry(&devnet_rename_seq, seq)) { - cond_resched(); + mutex_lock(&devnet_rename_mutex); + mutex_unlock(&devnet_rename_mutex); goto retry; } @@ -1101,20 +1103,17 @@ int dev_change_name(struct net_device *dev, const char *newname) if (dev->flags & IFF_UP) return -EBUSY; - write_seqcount_begin(&devnet_rename_seq); + mutex_lock(&devnet_rename_mutex); + __raw_write_seqcount_begin(&devnet_rename_seq); - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { - write_seqcount_end(&devnet_rename_seq); - return 0; - } + if (strncmp(newname, dev->name, IFNAMSIZ) == 0) + goto outunlock; memcpy(oldname, dev->name, IFNAMSIZ); err = dev_get_valid_name(net, dev, newname); - if (err < 0) { - write_seqcount_end(&devnet_rename_seq); - return err; - } + if (err < 0) + goto outunlock; if (oldname[0] && !strchr(oldname, '%')) netdev_info(dev, "renamed from %s\n", oldname); @@ -1127,11 +1126,12 @@ rollback: if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); dev->name_assign_type = old_assign_type; - write_seqcount_end(&devnet_rename_seq); - return ret; + err = ret; + goto outunlock; } - write_seqcount_end(&devnet_rename_seq); + __raw_write_seqcount_end(&devnet_rename_seq); + mutex_unlock(&devnet_rename_mutex); netdev_adjacent_rename_links(dev, oldname); @@ -1152,7 +1152,8 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; - write_seqcount_begin(&devnet_rename_seq); + mutex_lock(&devnet_rename_mutex); + __raw_write_seqcount_begin(&devnet_rename_seq); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); dev->name_assign_type = old_assign_type; @@ -1165,6 +1166,11 @@ 
rollback: } return err; + +outunlock: + __raw_write_seqcount_end(&devnet_rename_seq); + mutex_unlock(&devnet_rename_mutex); + return err; } /** -- cgit v1.2.3 From 81c556b14c035c899fbd0941667c964228dc8002 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Oct 2012 09:03:24 +0100 Subject: crypto: Convert crypto notifier chain to SRCU The crypto notifier deadlocks on RT. Though this can be a real deadlock on mainline as well due to fifo fair rwsems. The involved parties here are: [ 82.172678] swapper/0 S 0000000000000001 0 1 0 0x00000000 [ 82.172682] ffff88042f18fcf0 0000000000000046 ffff88042f18fc80 ffffffff81491238 [ 82.172685] 0000000000011cc0 0000000000011cc0 ffff88042f18c040 ffff88042f18ffd8 [ 82.172688] 0000000000011cc0 0000000000011cc0 ffff88042f18ffd8 0000000000011cc0 [ 82.172689] Call Trace: [ 82.172697] [] ? _raw_spin_unlock_irqrestore+0x6c/0x7a [ 82.172701] [] schedule+0x64/0x66 [ 82.172704] [] schedule_timeout+0x27/0xd0 [ 82.172708] [] ? unpin_current_cpu+0x1a/0x6c [ 82.172713] [] ? migrate_enable+0x12f/0x141 [ 82.172716] [] wait_for_common+0xbb/0x11f [ 82.172719] [] ? try_to_wake_up+0x182/0x182 [ 82.172722] [] wait_for_completion_interruptible+0x1d/0x2e [ 82.172726] [] crypto_wait_for_test+0x49/0x6b [ 82.172728] [] crypto_register_alg+0x53/0x5a [ 82.172730] [] crypto_register_algs+0x33/0x72 [ 82.172734] [] ? aes_init+0x12/0x12 [ 82.172737] [] aesni_init+0x64/0x66 [ 82.172741] [] do_one_initcall+0x7f/0x13b [ 82.172744] [] kernel_init+0x199/0x22c [ 82.172747] [] ? loglevel+0x31/0x31 [ 82.172752] [] kernel_thread_helper+0x4/0x10 [ 82.172755] [] ? retint_restore_args+0x13/0x13 [ 82.172759] [] ? start_kernel+0x3ca/0x3ca [ 82.172761] [] ? gs_change+0x13/0x13 [ 82.174186] cryptomgr_test S 0000000000000001 0 41 2 0x00000000 [ 82.174189] ffff88042c971980 0000000000000046 ffffffff81d74830 0000000000000292 [ 82.174192] 0000000000011cc0 0000000000011cc0 ffff88042c96eb80 ffff88042c971fd8 [ 82.174195] 0000000000011cc0 0000000000011cc0 ffff88042c971fd8 0000000000011cc0 [ 82.174195] Call Trace: [ 82.174198] [] schedule+0x64/0x66 [ 82.174201] [] schedule_timeout+0x27/0xd0 [ 82.174204] [] ? unpin_current_cpu+0x1a/0x6c [ 82.174206] [] ? migrate_enable+0x12f/0x141 [ 82.174209] [] wait_for_common+0xbb/0x11f [ 82.174212] [] ? try_to_wake_up+0x182/0x182 [ 82.174215] [] wait_for_completion_interruptible+0x1d/0x2e [ 82.174218] [] cryptomgr_notify+0x280/0x385 [ 82.174221] [] notifier_call_chain+0x6b/0x98 [ 82.174224] [] ? rt_down_read+0x10/0x12 [ 82.174227] [] __blocking_notifier_call_chain+0x70/0x8d [ 82.174230] [] blocking_notifier_call_chain+0x14/0x16 [ 82.174234] [] crypto_probing_notify+0x24/0x50 [ 82.174236] [] crypto_alg_mod_lookup+0x3e/0x74 [ 82.174238] [] crypto_alloc_base+0x36/0x8f [ 82.174241] [] cryptd_alloc_ablkcipher+0x6e/0xb5 [ 82.174243] [] ? kzalloc.clone.5+0xe/0x10 [ 82.174246] [] ablk_init_common+0x1d/0x38 [ 82.174249] [] ablk_ecb_init+0x15/0x17 [ 82.174251] [] __crypto_alloc_tfm+0xc7/0x114 [ 82.174254] [] ? crypto_lookup_skcipher+0x1f/0xe4 [ 82.174256] [] crypto_alloc_ablkcipher+0x60/0xa5 [ 82.174258] [] alg_test_skcipher+0x24/0x9b [ 82.174261] [] ? finish_task_switch+0x3f/0xfa [ 82.174263] [] alg_test+0x16f/0x1d7 [ 82.174267] [] ? cryptomgr_probe+0xac/0xac [ 82.174269] [] cryptomgr_test+0x2c/0x47 [ 82.174272] [] kthread+0x7e/0x86 [ 82.174275] [] ? finish_task_switch+0xaf/0xfa [ 82.174278] [] kernel_thread_helper+0x4/0x10 [ 82.174281] [] ? retint_restore_args+0x13/0x13 [ 82.174284] [] ? __init_kthread_worker+0x8c/0x8c [ 82.174287] [] ? 
gs_change+0x13/0x13 [ 82.174329] cryptomgr_probe D 0000000000000002 0 47 2 0x00000000 [ 82.174332] ffff88042c991b70 0000000000000046 ffff88042c991bb0 0000000000000006 [ 82.174335] 0000000000011cc0 0000000000011cc0 ffff88042c98ed00 ffff88042c991fd8 [ 82.174338] 0000000000011cc0 0000000000011cc0 ffff88042c991fd8 0000000000011cc0 [ 82.174338] Call Trace: [ 82.174342] [] schedule+0x64/0x66 [ 82.174344] [] __rt_mutex_slowlock+0x85/0xbe [ 82.174347] [] rt_mutex_slowlock+0xec/0x159 [ 82.174351] [] rt_mutex_fastlock.clone.8+0x29/0x2f [ 82.174353] [] rt_mutex_lock+0x33/0x37 [ 82.174356] [] __rt_down_read+0x50/0x5a [ 82.174358] [] ? rt_down_read+0x10/0x12 [ 82.174360] [] rt_down_read+0x10/0x12 [ 82.174363] [] __blocking_notifier_call_chain+0x58/0x8d [ 82.174366] [] blocking_notifier_call_chain+0x14/0x16 [ 82.174369] [] crypto_probing_notify+0x24/0x50 [ 82.174372] [] crypto_wait_for_test+0x22/0x6b [ 82.174374] [] crypto_register_instance+0xb4/0xc0 [ 82.174377] [] cryptd_create+0x378/0x3b6 [ 82.174379] [] ? __crypto_lookup_template+0x5b/0x63 [ 82.174382] [] cryptomgr_probe+0x45/0xac [ 82.174385] [] ? crypto_alloc_pcomp+0x1b/0x1b [ 82.174388] [] kthread+0x7e/0x86 [ 82.174391] [] ? finish_task_switch+0xaf/0xfa [ 82.174394] [] kernel_thread_helper+0x4/0x10 [ 82.174398] [] ? retint_restore_args+0x13/0x13 [ 82.174401] [] ? __init_kthread_worker+0x8c/0x8c [ 82.174403] [] ? gs_change+0x13/0x13 cryptomgr_test spawns the cryptomgr_probe thread from the notifier call. The probe thread fires the same notifier as the test thread and deadlocks on the rwsem on RT. Now this is a potential deadlock in mainline as well, because we have fifo fair rwsems. If another thread blocks with a down_write() on the notifier chain before the probe thread issues the down_read() it will block the probe thread and the whole party is dead locked. 
Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- crypto/algapi.c | 4 ++-- crypto/api.c | 6 +++--- crypto/internal.h | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index 71a8143e23b1..f88b72a74645 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -698,13 +698,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2); int crypto_register_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_register(&crypto_chain, nb); + return srcu_notifier_chain_register(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_register_notifier); int crypto_unregister_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_unregister(&crypto_chain, nb); + return srcu_notifier_chain_unregister(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_unregister_notifier); diff --git a/crypto/api.c b/crypto/api.c index 2a81e98a0021..6d536b8ac4f0 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list); DECLARE_RWSEM(crypto_alg_sem); EXPORT_SYMBOL_GPL(crypto_alg_sem); -BLOCKING_NOTIFIER_HEAD(crypto_chain); +SRCU_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg); @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v) { int ok; - ok = blocking_notifier_call_chain(&crypto_chain, val, v); + ok = srcu_notifier_call_chain(&crypto_chain, val, v); if (ok == NOTIFY_DONE) { request_module("cryptomgr"); - ok = blocking_notifier_call_chain(&crypto_chain, val, v); + ok = srcu_notifier_call_chain(&crypto_chain, val, v); } return ok; diff --git a/crypto/internal.h b/crypto/internal.h index bd39bfc92eab..a5db167cba84 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -48,7 +48,7 @@ struct crypto_larval { extern struct list_head crypto_alg_list; extern struct rw_semaphore crypto_alg_sem; -extern struct blocking_notifier_head crypto_chain; +extern struct srcu_notifier_head crypto_chain; #ifdef CONFIG_PROC_FS void __init crypto_init_proc(void); @@ -142,7 +142,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg) static inline void crypto_notify(unsigned long val, void *v) { - blocking_notifier_call_chain(&crypto_chain, val, v); + srcu_notifier_call_chain(&crypto_chain, val, v); } #endif /* _CRYPTO_INTERNAL_H */ -- cgit v1.2.3 From 0aedb63e35fa89d069e7cac53234a3427c5c4876 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 16 Apr 2012 15:01:56 +0800 Subject: lockdep: Selftest: Only do hardirq context test for raw spinlock On -rt there is no softirq context any more and rwlock is sleepable, disable softirq context test and rwlock+irq test. 
Signed-off-by: Yong Zhang Cc: Yong Zhang Link: http://lkml.kernel.org/r/1334559716-18447-3-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner --- lib/locking-selftest.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 872a15a2a637..19ae2366a7a9 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -1858,6 +1858,7 @@ void locking_selftest(void) printk(" --------------------------------------------------------------------------\n"); +#ifndef CONFIG_PREEMPT_RT_FULL /* * irq-context testcases: */ @@ -1870,6 +1871,28 @@ void locking_selftest(void) DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2); +#else + /* On -rt, we only do hardirq context test for raw spinlock */ + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12); + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21); + + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12); + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21); + + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321); + + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321); +#endif ww_tests(); -- cgit v1.2.3 From 4398003586064fbfb1bacb21a9fc8e655d23021f Mon Sep 17 00:00:00 2001 From: Josh Cartwright Date: Wed, 28 Jan 2015 13:08:45 -0600 Subject: lockdep: selftest: fix warnings due to missing PREEMPT_RT conditionals "lockdep: Selftest: Only do hardirq context test for raw spinlock" disabled the execution of certain tests with PREEMPT_RT_FULL, but did not prevent the tests from still being defined. This leads to warnings like: ./linux/lib/locking-selftest.c:574:1: warning: 'irqsafe1_hard_rlock_12' defined but not used [-Wunused-function] ./linux/lib/locking-selftest.c:574:1: warning: 'irqsafe1_hard_rlock_21' defined but not used [-Wunused-function] ./linux/lib/locking-selftest.c:577:1: warning: 'irqsafe1_hard_wlock_12' defined but not used [-Wunused-function] ./linux/lib/locking-selftest.c:577:1: warning: 'irqsafe1_hard_wlock_21' defined but not used [-Wunused-function] ./linux/lib/locking-selftest.c:580:1: warning: 'irqsafe1_soft_spin_12' defined but not used [-Wunused-function] ... Fixed by wrapping the test definitions in #ifndef CONFIG_PREEMPT_RT_FULL conditionals. 
Cc: stable-rt@vger.kernel.org Signed-off-by: Josh Cartwright Signed-off-by: Xander Huff Acked-by: Gratian Crisan Signed-off-by: Sebastian Andrzej Siewior --- lib/locking-selftest.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 19ae2366a7a9..b93a6103fa4d 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem) #include "locking-selftest-spin-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin) +#ifndef CONFIG_PREEMPT_RT_FULL + #include "locking-selftest-rlock-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock) @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock) +#endif + #undef E1 #undef E2 +#ifndef CONFIG_PREEMPT_RT_FULL /* * Enabling hardirqs with a softirq-safe lock held: */ @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) #undef E1 #undef E2 +#endif + /* * Enabling irqs with an irq-safe lock held: */ @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) #include "locking-selftest-spin-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin) +#ifndef CONFIG_PREEMPT_RT_FULL + #include "locking-selftest-rlock-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock) @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) +#endif + #undef E1 #undef E2 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) #include "locking-selftest-spin-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin) +#ifndef CONFIG_PREEMPT_RT_FULL + #include "locking-selftest-rlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock) @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) +#endif + #undef E1 #undef E2 #undef E3 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) #include "locking-selftest-spin-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin) +#ifndef CONFIG_PREEMPT_RT_FULL + #include "locking-selftest-rlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock) @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock) +#endif + #undef E1 #undef E2 #undef E3 +#ifndef CONFIG_PREEMPT_RT_FULL + /* * read-lock / write-lock irq inversion. * @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock) #undef E2 #undef E3 +#endif + +#ifndef CONFIG_PREEMPT_RT_FULL + /* * read-lock / write-lock recursion that is actually safe. */ @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) #undef E2 #undef E3 +#endif + /* * read-lock / write-lock recursion that is unsafe. */ -- cgit v1.2.3 From 610f1b2f3dc1904a8ad87a67f0799951f1133a17 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Sep 2012 10:49:42 +0100 Subject: rt: rwsem/rwlock: lockdep annotations rwlocks and rwsems on RT do not allow multiple readers. Annotate the lockdep acquire functions accordingly. 
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- kernel/locking/rt.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c index 2e7efaf1d8b2..73c55089fb93 100644 --- a/kernel/locking/rt.c +++ b/kernel/locking/rt.c @@ -216,17 +216,17 @@ int __lockfunc rt_read_trylock(rwlock_t *rwlock) if (rt_mutex_owner(lock) != current) { migrate_disable(); ret = rt_mutex_trylock(lock); - if (!ret) + if (ret) + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + else migrate_enable(); } else if (!rwlock->read_depth) { ret = 0; } - if (ret) { + if (ret) rwlock->read_depth++; - rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); - } return ret; } @@ -244,13 +244,13 @@ void __lockfunc rt_read_lock(rwlock_t *rwlock) { struct rt_mutex *lock = &rwlock->lock; - rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); /* * recursive read locks succeed when current owns the lock */ if (rt_mutex_owner(lock) != current) { migrate_disable(); + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); __rt_spin_lock(lock); } rwlock->read_depth++; @@ -269,10 +269,9 @@ EXPORT_SYMBOL(rt_write_unlock); void __lockfunc rt_read_unlock(rwlock_t *rwlock) { - rwlock_release(&rwlock->dep_map, 1, _RET_IP_); - /* Release the lock only when read_depth is down to 0 */ if (--rwlock->read_depth == 0) { + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); __rt_spin_unlock(&rwlock->lock); migrate_enable(); } @@ -363,6 +362,14 @@ void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) } EXPORT_SYMBOL(rt_down_write_nested); +void rt_down_write_nested_lock(struct rw_semaphore *rwsem, + struct lockdep_map *nest) +{ + rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write_nested_lock); + int rt_down_read_trylock(struct rw_semaphore *rwsem) { struct rt_mutex *lock = &rwsem->lock; -- cgit v1.2.3 From 5bf93222c96634ab633513ee5a51c1d0addaab46 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Wed, 11 Jul 2012 22:05:21 +0000 Subject: perf: Make swevent hrtimer run in irq instead of softirq Otherwise we get a deadlock like below: [ 1044.042749] BUG: scheduling while atomic: ksoftirqd/21/141/0x00010003 [ 1044.042752] INFO: lockdep is turned off. [ 1044.042754] Modules linked in: [ 1044.042757] Pid: 141, comm: ksoftirqd/21 Tainted: G W 3.4.0-rc2-rt3-23676-ga723175-dirty #29 [ 1044.042759] Call Trace: [ 1044.042761] [] __schedule_bug+0x65/0x80 [ 1044.042770] [] __schedule+0x83c/0xa70 [ 1044.042775] [] ? prepare_to_wait+0x32/0xb0 [ 1044.042779] [] schedule+0x2e/0xa0 [ 1044.042782] [] hrtimer_wait_for_timer+0x6d/0xb0 [ 1044.042786] [] ? wake_up_bit+0x40/0x40 [ 1044.042790] [] hrtimer_cancel+0x20/0x40 [ 1044.042794] [] perf_swevent_cancel_hrtimer+0x3c/0x50 [ 1044.042798] [] task_clock_event_stop+0x11/0x40 [ 1044.042802] [] task_clock_event_del+0xe/0x10 [ 1044.042805] [] event_sched_out+0x118/0x1d0 [ 1044.042809] [] group_sched_out+0x29/0x90 [ 1044.042813] [] __perf_event_disable+0x18e/0x200 [ 1044.042817] [] remote_function+0x63/0x70 [ 1044.042821] [] generic_smp_call_function_single_interrupt+0xce/0x120 [ 1044.042826] [] smp_call_function_single_interrupt+0x27/0x40 [ 1044.042831] [] call_function_single_interrupt+0x6c/0x80 [ 1044.042833] [] ? perf_event_overflow+0x20/0x20 [ 1044.042840] [] ? _raw_spin_unlock_irq+0x30/0x70 [ 1044.042844] [] ? _raw_spin_unlock_irq+0x36/0x70 [ 1044.042848] [] run_hrtimer_softirq+0xc2/0x200 [ 1044.042853] [] ? 
perf_event_overflow+0x20/0x20 [ 1044.042857] [] __do_softirq_common+0xf5/0x3a0 [ 1044.042862] [] __thread_do_softirq+0x15d/0x200 [ 1044.042865] [] run_ksoftirqd+0xfa/0x210 [ 1044.042869] [] ? __thread_do_softirq+0x200/0x200 [ 1044.042873] [] ? __thread_do_softirq+0x200/0x200 [ 1044.042877] [] kthread+0xb6/0xc0 [ 1044.042881] [] ? _raw_spin_unlock_irq+0x3b/0x70 [ 1044.042886] [] kernel_thread_helper+0x4/0x10 [ 1044.042889] [] ? finish_task_switch+0x8c/0x110 [ 1044.042894] [] ? _raw_spin_unlock_irq+0x3b/0x70 [ 1044.042897] [] ? retint_restore_args+0xe/0xe [ 1044.042900] [] ? kthreadd+0x1e0/0x1e0 [ 1044.042902] [] ? gs_change+0xb/0xb Signed-off-by: Yong Zhang Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1341476476-5666-1-git-send-email-yong.zhang0@gmail.com Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index e631dacdb165..8c681d1f480f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6346,6 +6346,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swevent_hrtimer; + hwc->hrtimer.irqsafe = 1; /* * Since hrtimers have a fixed rate, we can do a static freq->period -- cgit v1.2.3 From 1501099cdb2f965eda920beeb4ebed5188439fcf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 13 Nov 2011 17:17:09 +0100 Subject: softirq: Check preemption after reenabling interrupts raise_softirq_irqoff() disables interrupts and wakes the softirq daemon, but after reenabling interrupts there is no preemption check, so the execution of the softirq thread might be delayed arbitrarily. In principle we could add that check to local_irq_enable/restore, but that's overkill as the rasie_softirq_irqoff() sections are the only ones which show this behaviour. 
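The usage pattern this introduces, condensed from the net/core/dev.c hunk below (illustrative only):

	local_irq_save(flags);
	/* queue the work */
	raise_softirq_irqoff(NET_TX_SOFTIRQ);	/* may have woken the softirq thread */
	local_irq_restore(flags);
	preempt_check_resched_rt();		/* barrier() without PREEMPT_RT_BASE, preemption check with it */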
Reported-by: Carsten Emde Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- block/blk-iopoll.c | 3 +++ block/blk-softirq.c | 3 +++ include/linux/preempt.h | 3 +++ net/core/dev.c | 6 ++++++ 4 files changed, 15 insertions(+) diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index 0736729d6494..3e21e31d0d7e 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -35,6 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop) list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(blk_iopoll_sched); @@ -132,6 +133,7 @@ static void blk_iopoll_softirq(struct softirq_action *h) __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); } /** @@ -201,6 +203,7 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self, this_cpu_ptr(&blk_cpu_iopoll)); __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); } return NOTIFY_OK; diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 53b1737e978d..81c3c0a62edf 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -51,6 +51,7 @@ static void trigger_softirq(void *data) raise_softirq_irqoff(BLOCK_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } /* @@ -93,6 +94,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action, this_cpu_ptr(&blk_cpu_done)); raise_softirq_irqoff(BLOCK_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); } return NOTIFY_OK; @@ -150,6 +152,7 @@ do_local: goto do_local; local_irq_restore(flags); + preempt_check_resched_rt(); } /** diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 9f33663ff29b..933912ec8268 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -49,8 +49,10 @@ do { \ #ifdef CONFIG_PREEMPT_RT_BASE # define preempt_enable_no_resched() sched_preempt_enable_no_resched() +# define preempt_check_resched_rt() preempt_check_resched() #else # define preempt_enable_no_resched() preempt_enable() +# define preempt_check_resched_rt() barrier(); #endif #ifdef CONFIG_PREEMPT @@ -125,6 +127,7 @@ do { \ #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() +#define preempt_check_resched_rt() barrier() #endif /* CONFIG_PREEMPT_COUNT */ diff --git a/net/core/dev.c b/net/core/dev.c index 5ab3842898ba..9c18ef86044d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2166,6 +2166,7 @@ static inline void __netif_reschedule(struct Qdisc *q) sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } void __netif_schedule(struct Qdisc *q) @@ -2247,6 +2248,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(__dev_kfree_skb_irq); @@ -3340,6 +3342,7 @@ enqueue: rps_unlock(sd); local_irq_restore(flags); + preempt_check_resched_rt(); atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); @@ -4358,6 +4361,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) } else #endif local_irq_enable(); + preempt_check_resched_rt(); } static int process_backlog(struct napi_struct *napi, int quota) @@ -4429,6 +4433,7 @@ void __napi_schedule(struct napi_struct *n) 
local_irq_save(flags); ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(__napi_schedule); @@ -7050,6 +7055,7 @@ static int dev_cpu_callback(struct notifier_block *nfb, raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { -- cgit v1.2.3 From 0c3189235e372893f5cfaaeb81418e081762c9d6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Oct 2012 11:02:04 -0400 Subject: softirq: Init softirq local lock after per cpu section is set up I discovered this bug when booting 3.4-rt on my powerpc box. It crashed with the following report: ------------[ cut here ]------------ kernel BUG at /work/rt/stable-rt.git/kernel/rtmutex_common.h:75! Oops: Exception in kernel mode, sig: 5 [#1] PREEMPT SMP NR_CPUS=64 NUMA PA Semi PWRficient Modules linked in: NIP: c0000000004aa03c LR: c0000000004aa01c CTR: c00000000009b2ac REGS: c00000003e8d7950 TRAP: 0700 Not tainted (3.4.11-test-rt19) MSR: 9000000000029032 CR: 24000082 XER: 20000000 SOFTE: 0 TASK = c00000003e8fdcd0[11] 'ksoftirqd/1' THREAD: c00000003e8d4000 CPU: 1 GPR00: 0000000000000001 c00000003e8d7bd0 c000000000d6cbb0 0000000000000000 GPR04: c00000003e8fdcd0 0000000000000000 0000000024004082 c000000000011454 GPR08: 0000000000000000 0000000080000001 c00000003e8fdcd1 0000000000000000 GPR12: 0000000024000084 c00000000fff0280 ffffffffffffffff 000000003ffffad8 GPR16: ffffffffffffffff 000000000072c798 0000000000000060 0000000000000000 GPR20: 0000000000642741 000000000072c858 000000003ffffaf0 0000000000000417 GPR24: 000000000072dcd0 c00000003e7ff990 0000000000000000 0000000000000001 GPR28: 0000000000000000 c000000000792340 c000000000ccec78 c000000001182338 NIP [c0000000004aa03c] .wakeup_next_waiter+0x44/0xb8 LR [c0000000004aa01c] .wakeup_next_waiter+0x24/0xb8 Call Trace: [c00000003e8d7bd0] [c0000000004aa01c] .wakeup_next_waiter+0x24/0xb8 (unreliable) [c00000003e8d7c60] [c0000000004a0320] .rt_spin_lock_slowunlock+0x8c/0xe4 [c00000003e8d7ce0] [c0000000004a07cc] .rt_spin_unlock+0x54/0x64 [c00000003e8d7d60] [c0000000000636bc] .__thread_do_softirq+0x130/0x174 [c00000003e8d7df0] [c00000000006379c] .run_ksoftirqd+0x9c/0x1a4 [c00000003e8d7ea0] [c000000000080b68] .kthread+0xa8/0xb4 [c00000003e8d7f90] [c00000000001c2f8] .kernel_thread+0x54/0x70 Instruction dump: 60000000 e86d01c8 38630730 4bff7061 60000000 ebbf0008 7c7c1b78 e81d0040 7fe00278 7c000074 7800d182 68000001 <0b000000> e88d01c8 387d0010 38840738 The rtmutex_common.h:75 is: rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, list_entry); BUG_ON(w->lock != lock); return w; } Where the waiter->lock is corrupted. I saw various other random bugs that all had to with the softirq lock and plist. As plist needs to be initialized before it is used I investigated how this lock is initialized. It's initialized with: void __init softirq_early_init(void) { local_irq_lock_init(local_softirq_lock); } Where: #define local_irq_lock_init(lvar) \ do { \ int __cpu; \ for_each_possible_cpu(__cpu) \ spin_lock_init(&per_cpu(lvar, __cpu).lock); \ } while (0) As the softirq lock is a local_irq_lock, which is a per_cpu lock, the initialization is done to all per_cpu versions of the lock. But lets look at where the softirq_early_init() is called from. In init/main.c: start_kernel() /* * Interrupts are still disabled. 
Do necessary setups, then * enable them */ softirq_early_init(); tick_init(); boot_cpu_init(); page_address_init(); printk(KERN_NOTICE "%s", linux_banner); setup_arch(&command_line); mm_init_owner(&init_mm, &init_task); mm_init_cpumask(&init_mm); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ One of the first things that is called is the initialization of the softirq lock. But if you look further down, we see the per_cpu areas have not been set up yet. Thus initializing a local_irq_lock() before the per_cpu section is set up may not work, as it initializes the per-cpu locks before the per-cpu areas exist. By moving the softirq_early_init() right after setup_per_cpu_areas(), the kernel boots fine. Signed-off-by: Steven Rostedt Cc: Clark Williams Cc: John Kacur Cc: Carsten Emde Cc: vomlehn@texas.net Link: http://lkml.kernel.org/r/1349362924.6755.18.camel@gandalf.local.home Signed-off-by: Thomas Gleixner --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index d70175c70223..e7cb76c079d4 100644 --- a/init/main.c +++ b/init/main.c @@ -525,7 +525,6 @@ asmlinkage __visible void __init start_kernel(void) * Interrupts are still disabled. Do necessary setups, then * enable them */ - softirq_early_init(); boot_cpu_init(); page_address_init(); pr_notice("%s", linux_banner); @@ -534,6 +533,7 @@ asmlinkage __visible void __init start_kernel(void) setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); + softirq_early_init(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ build_all_zonelists(NULL, NULL); -- cgit v1.2.3 From 074227ad2f4994a8c34a57c5df7a17f48f844dd3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Oct 2012 14:30:25 +0100 Subject: softirq: Make serving softirqs a task flag Avoid the percpu softirq_runner pointer magic by using a task flag.
Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 1 + kernel/softirq.c | 20 +++----------------- 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 01c28573dc65..2b5fa4a77386 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1950,6 +1950,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, /* * Per process flags */ +#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 27e4ad293d67..3cd9c72ada60 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -399,7 +399,6 @@ static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } * On RT we serialize softirq execution with a cpu local lock */ static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); -static DEFINE_PER_CPU(struct task_struct *, local_softirq_runner); static void __do_softirq_common(int need_rcu_bh_qs); @@ -478,22 +477,9 @@ void _local_bh_enable(void) } EXPORT_SYMBOL(_local_bh_enable); -/* For tracing */ -int notrace __in_softirq(void) -{ - if (__get_cpu_var(local_softirq_lock).owner == current) - return __get_cpu_var(local_softirq_lock).nestcnt; - return 0; -} - int in_serving_softirq(void) { - int res; - - preempt_disable(); - res = __get_cpu_var(local_softirq_runner) == current; - preempt_enable(); - return res; + return current->flags & PF_IN_SOFTIRQ; } EXPORT_SYMBOL(in_serving_softirq); @@ -511,7 +497,7 @@ asmlinkage void __do_softirq(void) /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - __get_cpu_var(local_softirq_runner) = current; + current->flags |= PF_IN_SOFTIRQ; lockdep_softirq_enter(); @@ -522,7 +508,7 @@ asmlinkage void __do_softirq(void) wakeup_softirqd(); lockdep_softirq_exit(); - __get_cpu_var(local_softirq_runner) = NULL; + current->flags &= ~PF_IN_SOFTIRQ; current->softirq_nestcnt--; } -- cgit v1.2.3 From 6051aebcdf20fc8eba6bcc5ed0d9e2aa2638adae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Oct 2012 15:33:53 +0100 Subject: softirq: Split handling function Split out the inner handling function, so RT can reuse it. 
Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index 3cd9c72ada60..c8326b09c1e9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -142,6 +142,26 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } +static void handle_softirq(unsigned int vec_nr) +{ + struct softirq_action *h = softirq_vec + vec_nr; + int prev_count; + + prev_count = preempt_count(); + + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); + h->action(h); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", + vec_nr, softirq_to_name[vec_nr], h->action, + prev_count, preempt_count()); + preempt_count_set(prev_count); + } +} + static void handle_pending_softirqs(u32 pending, int need_rcu_bh_qs) { struct softirq_action *h = softirq_vec; @@ -153,24 +173,11 @@ static void handle_pending_softirqs(u32 pending, int need_rcu_bh_qs) while ((softirq_bit = ffs(pending))) { unsigned int vec_nr; - int prev_count; h += softirq_bit - 1; - vec_nr = h - softirq_vec; - prev_count = preempt_count(); - - kstat_incr_softirqs_this_cpu(vec_nr); - - trace_softirq_entry(vec_nr); - h->action(h); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", - vec_nr, softirq_to_name[vec_nr], h->action, - prev_count, preempt_count()); - preempt_count_set(prev_count); - } + handle_softirq(vec_nr); + h++; pending >>= softirq_bit; } -- cgit v1.2.3 From e6ffacd48195656be3e03b912f199f4b84f624c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Oct 2012 14:20:47 +0100 Subject: softirq: Split softirq locks The 3.x RT series removed the split softirq implementation in favour of pushing softirq processing into the context of the thread which raised it. Though this prevents us from handling the various softirqs at different priorities. Now instead of reintroducing the split softirq threads we split the locks which serialize the softirq processing. If a softirq is raised in context of a thread, then the softirq is noted on a per thread field, if the thread is in a bh disabled region. If the softirq is raised from hard interrupt context, then the bit is set in the flag field of ksoftirqd and ksoftirqd is invoked. When a thread leaves a bh disabled region, then it tries to execute the softirqs which have been raised in its own context. It acquires the per softirq / per cpu lock for the softirq and then checks, whether the softirq is still pending in the per cpu local_softirq_pending() field. If yes, it runs the softirq. If no, then some other task executed it already. This allows for zero config softirq elevation in the context of user space tasks or interrupt threads. 
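Condensed from the kernel/softirq.c diff below, the raise path implementing this policy is:

	void __raise_softirq_irqoff(unsigned int nr)
	{
		trace_softirq_raise(nr);
		or_softirq_pending(1UL << nr);

		/* In a bh disabled region (and not in hard interrupt context):
		 * note the softirq on current, local_bh_enable() will run it. */
		if (!in_irq() && current->softirq_nestcnt)
			current->softirqs_raised |= (1U << nr);
		/* Otherwise delegate it to ksoftirqd. */
		else if (__this_cpu_read(ksoftirqd))
			__this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr);
	}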
Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 1 + kernel/softirq.c | 310 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 188 insertions(+), 123 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b5fa4a77386..46ae98a12142 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1688,6 +1688,7 @@ struct task_struct { #ifdef CONFIG_PREEMPT_RT_BASE struct rcu_head put_rcu; int softirq_nestcnt; + unsigned int softirqs_raised; #endif #ifdef CONFIG_PREEMPT_RT_FULL # if defined CONFIG_HIGHMEM || defined CONFIG_X86_32 diff --git a/kernel/softirq.c b/kernel/softirq.c index c8326b09c1e9..7c68e6702147 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -162,6 +162,12 @@ static void handle_softirq(unsigned int vec_nr) } } +#ifndef CONFIG_PREEMPT_RT_FULL +static inline int ksoftirqd_softirq_pending(void) +{ + return local_softirq_pending(); +} + static void handle_pending_softirqs(u32 pending, int need_rcu_bh_qs) { struct softirq_action *h = softirq_vec; @@ -187,7 +193,19 @@ static void handle_pending_softirqs(u32 pending, int need_rcu_bh_qs) local_irq_disable(); } -#ifndef CONFIG_PREEMPT_RT_FULL +static void run_ksoftirqd(unsigned int cpu) +{ + local_irq_disable(); + if (ksoftirqd_softirq_pending()) { + __do_softirq(); + rcu_note_context_switch(cpu); + local_irq_enable(); + cond_resched(); + return; + } + local_irq_enable(); +} + /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -395,6 +413,32 @@ asmlinkage __visible void do_softirq(void) local_irq_restore(flags); } +/* + * This function must run with irqs disabled! + */ +void raise_softirq_irqoff(unsigned int nr) +{ + __raise_softirq_irqoff(nr); + + /* + * If we're in an interrupt or softirq, we're done + * (this also catches softirq-disabled code). We will + * actually run the softirq once we return from + * the irq or softirq. + * + * Otherwise we wake up ksoftirqd to make sure we + * schedule the softirq soon. 
+ */ + if (!in_interrupt()) + wakeup_softirqd(); +} + +void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); +} + static inline void local_bh_disable_nort(void) { local_bh_disable(); } static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } static void ksoftirqd_set_sched_params(unsigned int cpu) { } @@ -403,20 +447,78 @@ static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { } #else /* !PREEMPT_RT_FULL */ /* - * On RT we serialize softirq execution with a cpu local lock + * On RT we serialize softirq execution with a cpu local lock per softirq */ -static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks); + +void __init softirq_early_init(void) +{ + int i; -static void __do_softirq_common(int need_rcu_bh_qs); + for (i = 0; i < NR_SOFTIRQS; i++) + local_irq_lock_init(local_softirq_locks[i]); +} -asmlinkage __do_softirq(void) +static void lock_softirq(int which) { - __do_softirq_common(0); + __local_lock(&__get_cpu_var(local_softirq_locks[which])); } -void __init softirq_early_init(void) +static void unlock_softirq(int which) { - local_irq_lock_init(local_softirq_lock); + __local_unlock(&__get_cpu_var(local_softirq_locks[which])); +} + +static void do_single_softirq(int which, int need_rcu_bh_qs) +{ + unsigned long old_flags = current->flags; + + current->flags &= ~PF_MEMALLOC; + vtime_account_irq_enter(current); + current->flags |= PF_IN_SOFTIRQ; + lockdep_softirq_enter(); + local_irq_enable(); + handle_softirq(which); + local_irq_disable(); + lockdep_softirq_exit(); + current->flags &= ~PF_IN_SOFTIRQ; + vtime_account_irq_enter(current); + tsk_restore_flags(current, old_flags, PF_MEMALLOC); +} + +/* + * Called with interrupts disabled. Process softirqs which were raised + * in current context (or on behalf of ksoftirqd). + */ +static void do_current_softirqs(int need_rcu_bh_qs) +{ + while (current->softirqs_raised) { + int i = __ffs(current->softirqs_raised); + unsigned int pending, mask = (1U << i); + + current->softirqs_raised &= ~mask; + local_irq_enable(); + + /* + * If the lock is contended, we boost the owner to + * process the softirq or leave the critical section + * now. + */ + lock_softirq(i); + local_irq_disable(); + /* + * Check with the local_softirq_pending() bits, + * whether we need to process this still or if someone + * else took care of it. + */ + pending = local_softirq_pending(); + if (pending & mask) { + set_softirq_pending(pending & ~mask); + do_single_softirq(i, need_rcu_bh_qs); + } + unlock_softirq(i); + WARN_ON(current->softirq_nestcnt != 1); + } } static void __local_bh_disable(void) @@ -443,17 +545,11 @@ static void __local_bh_enable(void) if (WARN_ON(current->softirq_nestcnt == 0)) return; - if ((current->softirq_nestcnt == 1) && - local_softirq_pending() && - local_trylock(local_softirq_lock)) { + local_irq_disable(); + if (current->softirq_nestcnt == 1 && current->softirqs_raised) + do_current_softirqs(1); + local_irq_enable(); - local_irq_disable(); - if (local_softirq_pending()) - __do_softirq(); - local_irq_enable(); - local_unlock(local_softirq_lock); - WARN_ON(current->softirq_nestcnt != 1); - } current->softirq_nestcnt--; migrate_enable(); } @@ -490,81 +586,82 @@ int in_serving_softirq(void) } EXPORT_SYMBOL(in_serving_softirq); -/* - * Called with bh and local interrupts disabled. For full RT cpu must - * be pinned. 
- */ -asmlinkage void __do_softirq(void) +/* Called with preemption disabled */ +static void run_ksoftirqd(unsigned int cpu) { - u32 pending = local_softirq_pending(); - int cpu = smp_processor_id(); - + local_irq_disable(); current->softirq_nestcnt++; - /* Reset the pending bitmask before enabling irqs */ - set_softirq_pending(0); - - current->flags |= PF_IN_SOFTIRQ; - - lockdep_softirq_enter(); - - handle_pending_softirqs(pending, need_rcu_bh_qs); - - pending = local_softirq_pending(); - if (pending) - wakeup_softirqd(); - - lockdep_softirq_exit(); - current->flags &= ~PF_IN_SOFTIRQ; - + do_current_softirqs(1); current->softirq_nestcnt--; + rcu_note_context_switch(cpu); + local_irq_enable(); +} + +/* + * Called from netif_rx_ni(). Preemption enabled, but migration + * disabled. So the cpu can't go away under us. + */ +void thread_do_softirq(void) +{ + if (!in_serving_softirq() && current->softirqs_raised) { + current->softirq_nestcnt++; + do_current_softirqs(0); + current->softirq_nestcnt--; + } } -static int __thread_do_softirq(int cpu) +void __raise_softirq_irqoff(unsigned int nr) { + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); + /* - * Prevent the current cpu from going offline. - * pin_current_cpu() can reenable preemption and block on the - * hotplug mutex. When it returns, the current cpu is - * pinned. It might be the wrong one, but the offline check - * below catches that. + * If we are not in a hard interrupt and inside a bh disabled + * region, we simply raise the flag on current. local_bh_enable() + * will make sure that the softirq is executed. Otherwise we + * delegate it to ksoftirqd. */ - pin_current_cpu(); + if (!in_irq() && current->softirq_nestcnt) + current->softirqs_raised |= (1U << nr); + else if (__this_cpu_read(ksoftirqd)) + __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr); +} + +/* + * This function must run with irqs disabled! + */ +void raise_softirq_irqoff(unsigned int nr) +{ + __raise_softirq_irqoff(nr); + /* - * If called from ksoftirqd (cpu >= 0) we need to check - * whether we are on the wrong cpu due to cpu offlining. If - * called via thread_do_softirq() no action required. + * If we're in an hard interrupt we let irq return code deal + * with the wakeup of ksoftirqd. */ - if (cpu >= 0 && cpu_is_offline(cpu)) { - unpin_current_cpu(); - return -1; - } - preempt_enable(); - local_lock(local_softirq_lock); - local_irq_disable(); + if (in_irq()) + return; /* - * We cannot switch stacks on RT as we want to be able to - * schedule! + * If we are in thread context but outside of a bh disabled + * region, we need to wake ksoftirqd as well. + * + * CHECKME: Some of the places which do that could be wrapped + * into local_bh_disable/enable pairs. Though it's unclear + * whether this is worth the effort. To find those places just + * raise a WARN() if the condition is met. */ - if (local_softirq_pending()) - __do_softirq_common(cpu >= 0); - local_unlock(local_softirq_lock); - unpin_current_cpu(); - preempt_disable(); - local_irq_enable(); - return 0; + if (!current->softirq_nestcnt) + wakeup_softirqd(); } -/* - * Called from netif_rx_ni(). Preemption enabled. 
- */ -void thread_do_softirq(void) +void do_raise_softirq_irqoff(void) { - if (!in_serving_softirq()) { - preempt_disable(); - __thread_do_softirq(-1); - preempt_enable(); - } + raise_softirq_irqoff(nr); +} + +static inline int ksoftirqd_softirq_pending(void) +{ + return current->softirqs_raised; } static inline void local_bh_disable_nort(void) { } @@ -575,6 +672,10 @@ static inline void ksoftirqd_set_sched_params(unsigned int cpu) struct sched_param param = { .sched_priority = 1 }; sched_setscheduler(current, SCHED_FIFO, ¶m); + /* Take over all pending softirqs when starting */ + local_irq_disable(); + current->softirqs_raised = local_softirq_pending(); + local_irq_enable(); } static inline void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) @@ -626,8 +727,14 @@ static inline void invoke_softirq(void) } else { wakeup_softirqd(); } -#else - wakeup_softirqd(); +#else /* PREEMPT_RT_FULL */ + unsigned long flags; + + local_irq_save(flags); + if (__this_cpu_read(ksoftirqd) && + __this_cpu_read(ksoftirqd)->softirqs_raised) + wakeup_softirqd(); + local_irq_restore(flags); #endif } @@ -665,26 +772,6 @@ void irq_exit(void) trace_hardirq_exit(); /* must be last! */ } -/* - * This function must run with irqs disabled! - */ -inline void raise_softirq_irqoff(unsigned int nr) -{ - __raise_softirq_irqoff(nr); - - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. - * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. - */ - if (!in_interrupt()) - wakeup_softirqd(); -} - void raise_softirq(unsigned int nr) { unsigned long flags; @@ -694,12 +781,6 @@ void raise_softirq(unsigned int nr) local_irq_restore(flags); } -void __raise_softirq_irqoff(unsigned int nr) -{ - trace_softirq_raise(nr); - or_softirq_pending(1UL << nr); -} - void open_softirq(int nr, void (*action)(struct softirq_action *)) { softirq_vec[nr].action = action; @@ -1007,24 +1088,7 @@ EXPORT_SYMBOL(tasklet_unlock_wait); static int ksoftirqd_should_run(unsigned int cpu) { - return local_softirq_pending(); -} - -static void run_ksoftirqd(unsigned int cpu) -{ - local_irq_disable(); - if (local_softirq_pending()) { - /* - * We can safely run softirq on inline stack, as we are not deep - * in the task stack here. - */ - __do_softirq(); - rcu_note_context_switch(cpu); - local_irq_enable(); - cond_resched(); - return; - } - local_irq_enable(); + return ksoftirqd_softirq_pending(); } #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From 8b2d3fc79eb2f0b0a779437106526f221bfada43 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Fri, 17 Jan 2014 20:44:03 +0100 Subject: API cleanup - use local_lock not __local_lock for soft trivial API cleanup - kernel/softirq.c was mimiking local_lock. 
No change of functional behavior Signed-off-by: Nicholas Mc Guire Signed-off-by: Sebastian Andrzej Siewior --- kernel/softirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index 7c68e6702147..7b2f1e9534cf 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -461,12 +461,12 @@ void __init softirq_early_init(void) static void lock_softirq(int which) { - __local_lock(&__get_cpu_var(local_softirq_locks[which])); + local_lock(local_softirq_locks[which]); } static void unlock_softirq(int which) { - __local_unlock(&__get_cpu_var(local_softirq_locks[which])); + local_unlock(local_softirq_locks[which]); } static void do_single_softirq(int which, int need_rcu_bh_qs) -- cgit v1.2.3 From 6d613c0649393c5472daff9328fd350683f7f1ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Oct 2012 13:46:16 +0000 Subject: softirq: Adapt NOHZ softirq pending check to new RT scheme We can't rely on ksoftirqd anymore and we need to check the tasks which run a particular softirq and if such a task is pi blocked ignore the other pending bits of that task as well. Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 83 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 25 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index 7b2f1e9534cf..2a47ad9c26f5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -66,46 +66,71 @@ const char * const softirq_to_name[NR_SOFTIRQS] = { #ifdef CONFIG_NO_HZ_COMMON # ifdef CONFIG_PREEMPT_RT_FULL + +struct softirq_runner { + struct task_struct *runner[NR_SOFTIRQS]; +}; + +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners); + +static inline void softirq_set_runner(unsigned int sirq) +{ + struct softirq_runner *sr = &__get_cpu_var(softirq_runners); + + sr->runner[sirq] = current; +} + +static inline void softirq_clr_runner(unsigned int sirq) +{ + struct softirq_runner *sr = &__get_cpu_var(softirq_runners); + + sr->runner[sirq] = NULL; +} + /* - * On preempt-rt a softirq might be blocked on a lock. There might be - * no other runnable task on this CPU because the lock owner runs on - * some other CPU. So we have to go into idle with the pending bit - * set. Therefor we need to check this otherwise we warn about false - * positives which confuses users and defeats the whole purpose of - * this test. + * On preempt-rt a softirq running context might be blocked on a + * lock. There might be no other runnable task on this CPU because the + * lock owner runs on some other CPU. So we have to go into idle with + * the pending bit set. Therefor we need to check this otherwise we + * warn about false positives which confuses users and defeats the + * whole purpose of this test. * * This code is called with interrupts disabled. */ void softirq_check_pending_idle(void) { static int rate_limit; - u32 warnpending = 0, pending; + struct softirq_runner *sr = &__get_cpu_var(softirq_runners); + u32 warnpending; + int i; if (rate_limit >= 10) return; - pending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK; - if (pending) { - struct task_struct *tsk; + warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK; + for (i = 0; i < NR_SOFTIRQS; i++) { + struct task_struct *tsk = sr->runner[i]; - tsk = __get_cpu_var(ksoftirqd); /* * The wakeup code in rtmutex.c wakes up the task * _before_ it sets pi_blocked_on to NULL under * tsk->pi_lock. So we need to check for both: state * and pi_blocked_on. 
*/ - raw_spin_lock(&tsk->pi_lock); - - if (!tsk->pi_blocked_on && !(tsk->state == TASK_RUNNING)) - warnpending = 1; - - raw_spin_unlock(&tsk->pi_lock); + if (tsk) { + raw_spin_lock(&tsk->pi_lock); + if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) { + /* Clear all bits pending in that task */ + warnpending &= ~(tsk->softirqs_raised); + warnpending &= ~(1 << i); + } + raw_spin_unlock(&tsk->pi_lock); + } } if (warnpending) { printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - pending); + warnpending); rate_limit++; } } @@ -125,6 +150,10 @@ void softirq_check_pending_idle(void) } } # endif + +#else /* !CONFIG_NO_HZ_COMMON */ +static inline void softirq_set_runner(unsigned int sirq) { } +static inline void softirq_clr_runner(unsigned int sirq) { } #endif /* @@ -506,6 +535,7 @@ static void do_current_softirqs(int need_rcu_bh_qs) */ lock_softirq(i); local_irq_disable(); + softirq_set_runner(i); /* * Check with the local_softirq_pending() bits, * whether we need to process this still or if someone @@ -516,6 +546,7 @@ static void do_current_softirqs(int need_rcu_bh_qs) set_softirq_pending(pending & ~mask); do_single_softirq(i, need_rcu_bh_qs); } + softirq_clr_runner(i); unlock_softirq(i); WARN_ON(current->softirq_nestcnt != 1); } @@ -611,7 +642,7 @@ void thread_do_softirq(void) } } -void __raise_softirq_irqoff(unsigned int nr) +static void do_raise_softirq_irqoff(unsigned int nr) { trace_softirq_raise(nr); or_softirq_pending(1UL << nr); @@ -628,12 +659,19 @@ void __raise_softirq_irqoff(unsigned int nr) __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr); } +void __raise_softirq_irqoff(unsigned int nr) +{ + do_raise_softirq_irqoff(nr); + if (!in_irq() && !current->softirq_nestcnt) + wakeup_softirqd(); +} + /* * This function must run with irqs disabled! */ void raise_softirq_irqoff(unsigned int nr) { - __raise_softirq_irqoff(nr); + do_raise_softirq_irqoff(nr); /* * If we're in an hard interrupt we let irq return code deal @@ -654,11 +692,6 @@ void raise_softirq_irqoff(unsigned int nr) wakeup_softirqd(); } -void do_raise_softirq_irqoff(void) -{ - raise_softirq_irqoff(nr); -} - static inline int ksoftirqd_softirq_pending(void) { return current->softirqs_raised; -- cgit v1.2.3 From 7aa2c415d216b1d2150afb9b8441a2d20fc346e0 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Fri, 6 Dec 2013 00:42:22 +0100 Subject: softirq: make migrate disable/enable conditioned on softirq_nestcnt transition This patch removes the recursive calls to migrate_disable/enable in local_bh_disable/enable the softirq-local-lock.patch introduces local_bh_disable/enable wich decrements/increments the current->softirq_nestcnt and disable/enables migration as well. as softirq_nestcnt (include/linux/sched.h conditioned on CONFIG_PREEMPT_RT_BASE) already is tracking the nesting level of the recursive calls to local_bh_disable/enable (all in kernel/softirq.c) - no need to do it twice. migrate_disable/enable thus can be conditionsed on softirq_nestcnt making a transition from 0-1 to disable migration and 1-0 to re-enable it. 
No change of functional behavior, this does noticably reduce the observed nesting level of migrate_disable/enable Signed-off-by: Nicholas Mc Guire Reviewed-by: Steven Rostedt Signed-off-by: Sebastian Andrzej Siewior --- kernel/softirq.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index 2a47ad9c26f5..2ae52d6b5a07 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -554,8 +554,8 @@ static void do_current_softirqs(int need_rcu_bh_qs) static void __local_bh_disable(void) { - migrate_disable(); - current->softirq_nestcnt++; + if (++current->softirq_nestcnt == 1) + migrate_disable(); } void local_bh_disable(void) @@ -581,8 +581,8 @@ static void __local_bh_enable(void) do_current_softirqs(1); local_irq_enable(); - current->softirq_nestcnt--; - migrate_enable(); + if (--current->softirq_nestcnt == 0) + migrate_enable(); } void local_bh_enable(void) @@ -606,8 +606,10 @@ EXPORT_SYMBOL(local_bh_enable_ip); void _local_bh_enable(void) { - current->softirq_nestcnt--; - migrate_enable(); + if (WARN_ON(current->softirq_nestcnt == 0)) + return; + if (--current->softirq_nestcnt == 0) + migrate_enable(); } EXPORT_SYMBOL(_local_bh_enable); -- cgit v1.2.3 From e8ea17cbf7e4d53009969bc71528090540f4cff3 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sat, 11 Apr 2015 15:15:59 +0200 Subject: rt, nohz_full: fix nohz_full for PREEMPT_RT_FULL A task being ticked and trying to shut the tick down will fail due to having just awakened ksoftirqd, subtract it from nr_running. Signed-off-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior --- kernel/sched/core.c | 17 ++++++++++++++++- kernel/time/tick-sched.c | 5 +++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8bf0dddd935b..2e886b83169f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -754,14 +754,29 @@ static inline bool got_nohz_idle_kick(void) #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL + +static int ksoftirqd_running(void) +{ + struct task_struct *softirqd; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) + return 0; + softirqd = this_cpu_ksoftirqd(); + if (softirqd && softirqd->on_rq) + return 1; + return 0; +} + bool sched_can_stop_tick(void) { /* * More than one running task need preemption. * nr_running update is assumed to be visible * after IPI is sent from wakers. + * + * NOTE, RT: if ksoftirqd is awake, subtract it. */ - if (this_rq()->nr_running > 1) + if (this_rq()->nr_running - ksoftirqd_running() > 1) return false; return true; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3a43989f0ce0..d7cef36a443c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -227,7 +227,12 @@ void __tick_nohz_full_check(void) static void nohz_full_kick_work_func(struct irq_work *work) { + unsigned long flags; + + /* ksoftirqd processes sirqs with interrupts enabled */ + local_irq_save(flags); __tick_nohz_full_check(); + local_irq_restore(flags); } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { -- cgit v1.2.3 From 9af3b3ad9e63809df25148a23a6204eecebcd50d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Oct 2012 13:26:09 +0000 Subject: rcu: Disable RCU_FAST_NO_HZ on RT This uses a timer_list timer from the irq disabled guts of the idle code. Disable it for now to prevent wreckage. 
Signed-off-by: Thomas Gleixner Cc: stable-rt@vger.kernel.org --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 1e747ee4f780..aa3cc057213d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -635,7 +635,7 @@ config RCU_FANOUT_EXACT config RCU_FAST_NO_HZ bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ_COMMON && SMP + depends on NO_HZ_COMMON && SMP && !PREEMPT_RT_FULL default n help This option permits CPUs to enter dynticks-idle state even if -- cgit v1.2.3 From 3c8f0fd6771c02b3d1bb7362a761977bcd34067b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Nov 2013 13:21:10 -0800 Subject: rcu: Eliminate softirq processing from rcutree Running RCU out of softirq is a problem for some workloads that would like to manage RCU core processing independently of other softirq work, for example, setting kthread priority. This commit therefore moves the RCU core work from softirq to a per-CPU/per-flavor SCHED_OTHER kthread named rcuc. The SCHED_OTHER approach avoids the scalability problems that appeared with the earlier attempt to move RCU core processing to from softirq to kthreads. That said, kernels built with RCU_BOOST=y will run the rcuc kthreads at the RCU-boosting priority. Reported-by: Thomas Gleixner Tested-by: Mike Galbraith Signed-off-by: Paul E. McKenney Signed-off-by: Sebastian Andrzej Siewior --- kernel/rcu/tree.c | 113 ++++++++++++++++++++++++++++++++++----- kernel/rcu/tree.h | 3 +- kernel/rcu/tree_plugin.h | 134 +++++------------------------------------------ 3 files changed, 113 insertions(+), 137 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 140274512c0a..e73df2108a54 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -56,6 +56,11 @@ #include #include #include +#include +#include +#include +#include +#include "../time/tick-internal.h" #include "tree.h" #include "rcu.h" @@ -152,8 +157,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; -#ifdef CONFIG_RCU_BOOST - /* * Control variables for per-CPU and per-rcu_node kthreads. These * handle all flavors of RCU. @@ -163,8 +166,6 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DEFINE_PER_CPU(char, rcu_cpu_has_work); -#endif /* #ifdef CONFIG_RCU_BOOST */ - static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -2587,16 +2588,14 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* * Do RCU core processing for the current CPU. 
*/ -static void rcu_process_callbacks(struct softirq_action *unused) +static void rcu_process_callbacks(void) { struct rcu_state *rsp; if (cpu_is_offline(smp_processor_id())) return; - trace_rcu_utilization(TPS("Start RCU core")); for_each_rcu_flavor(rsp) __rcu_process_callbacks(rsp); - trace_rcu_utilization(TPS("End RCU core")); } /* @@ -2610,18 +2609,105 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) return; - if (likely(!rsp->boost)) { - rcu_do_batch(rsp, rdp); + rcu_do_batch(rsp, rdp); +} + +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current))) + wake_up_process(t); +} + +/* + * Wake up this CPU's rcuc kthread to do RCU core processing. + */ +static void invoke_rcu_core(void) +{ + unsigned long flags; + struct task_struct *t; + + if (!cpu_online(smp_processor_id())) return; + local_irq_save(flags); + __this_cpu_write(rcu_cpu_has_work, 1); + t = __this_cpu_read(rcu_cpu_kthread_task); + if (t != NULL && current != t) + rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status)); + local_irq_restore(flags); +} + +static void rcu_cpu_kthread_park(unsigned int cpu) +{ + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; +} + +static int rcu_cpu_kthread_should_run(unsigned int cpu) +{ + return __this_cpu_read(rcu_cpu_has_work); +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the + * RCU softirq used in flavors and configurations of RCU that do not + * support RCU priority boosting. + */ +static void rcu_cpu_kthread(unsigned int cpu) +{ + unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); + char work, *workp = &__get_cpu_var(rcu_cpu_has_work); + int spincnt; + + for (spincnt = 0; spincnt < 10; spincnt++) { + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); + local_bh_disable(); + *statusp = RCU_KTHREAD_RUNNING; + this_cpu_inc(rcu_cpu_kthread_loops); + local_irq_disable(); + work = *workp; + *workp = 0; + local_irq_enable(); + if (work) + rcu_process_callbacks(); + local_bh_enable(); + if (*workp == 0) { + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); + *statusp = RCU_KTHREAD_WAITING; + return; + } } - invoke_rcu_callbacks_kthread(); + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); + schedule_timeout_interruptible(2); + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); + *statusp = RCU_KTHREAD_WAITING; } -static void invoke_rcu_core(void) +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; + +/* + * Spawn per-CPU RCU core processing kthreads. + */ +static int __init rcu_spawn_core_kthreads(void) { - if (cpu_online(smp_processor_id())) - raise_softirq(RCU_SOFTIRQ); + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(rcu_cpu_has_work, cpu) = 0; + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); + return 0; } +early_initcall(rcu_spawn_core_kthreads); /* * Handle any core-RCU processing required by a call_rcu() invocation. 
@@ -3783,7 +3869,6 @@ void __init rcu_init(void) rcu_init_one(&rcu_bh_state, &rcu_bh_data); rcu_init_one(&rcu_sched_state, &rcu_sched_data); __rcu_init_preempt(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index dddb2987171c..5e54e2870e8a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -565,10 +565,9 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); +static void rcu_cpu_kthread_setup(unsigned int cpu); #ifdef CONFIG_RCU_BOOST -static void rcu_preempt_do_callbacks(void); static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 36d044a001a2..5dd80727a6fc 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -24,12 +24,6 @@ * Paul E. McKenney */ -#include -#include -#include -#include -#include "../time/tick-internal.h" - #define RCU_KTHREAD_PRIO 1 #ifdef CONFIG_RCU_BOOST @@ -633,15 +627,6 @@ static void rcu_preempt_check_callbacks(int cpu) t->rcu_read_unlock_special.b.need_qs = true; } -#ifdef CONFIG_RCU_BOOST - -static void rcu_preempt_do_callbacks(void) -{ - rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - /* * Queue a preemptible-RCU callback for invocation after a grace period. */ @@ -1070,6 +1055,19 @@ void exit_rcu(void) #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ +/* + * If boosting, set rcuc kthreads to realtime priority. + */ +static void rcu_cpu_kthread_setup(unsigned int cpu) +{ +#ifdef CONFIG_RCU_BOOST + struct sched_param sp; + + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); +#endif /* #ifdef CONFIG_RCU_BOOST */ +} + #ifdef CONFIG_RCU_BOOST #include "../locking/rtmutex_common.h" @@ -1101,16 +1099,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -static void rcu_wake_cond(struct task_struct *t, int status) -{ - /* - * If the thread is yielding, only wake it when this - * is invoked from idle - */ - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) - wake_up_process(t); -} - /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1254,23 +1242,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) } } -/* - * Wake up the per-CPU kthread to invoke RCU callbacks. - */ -static void invoke_rcu_callbacks_kthread(void) -{ - unsigned long flags; - - local_irq_save(flags); - __this_cpu_write(rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_cpu_kthread_task)) { - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), - __this_cpu_read(rcu_cpu_kthread_status)); - } - local_irq_restore(flags); -} - /* * Is the current CPU running the RCU-callbacks kthread? * Caller must have preemption disabled. 
@@ -1326,67 +1297,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } -static void rcu_kthread_do_work(void) -{ - rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); - rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); - rcu_preempt_do_callbacks(); -} - -static void rcu_cpu_kthread_setup(unsigned int cpu) -{ - struct sched_param sp; - - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); -} - -static void rcu_cpu_kthread_park(unsigned int cpu) -{ - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; -} - -static int rcu_cpu_kthread_should_run(unsigned int cpu) -{ - return __this_cpu_read(rcu_cpu_has_work); -} - -/* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the - * RCU softirq used in flavors and configurations of RCU that do not - * support RCU priority boosting. - */ -static void rcu_cpu_kthread(unsigned int cpu) -{ - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); - int spincnt; - - for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); - local_bh_disable(); - *statusp = RCU_KTHREAD_RUNNING; - this_cpu_inc(rcu_cpu_kthread_loops); - local_irq_disable(); - work = *workp; - *workp = 0; - local_irq_enable(); - if (work) - rcu_kthread_do_work(); - local_bh_enable(); - if (*workp == 0) { - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); - *statusp = RCU_KTHREAD_WAITING; - return; - } - } - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); - *statusp = RCU_KTHREAD_WAITING; -} - /* * Set the per-rcu_node kthread's affinity to cover all CPUs that are * served by the rcu_node in question. The CPU hotplug lock is still @@ -1420,26 +1330,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) free_cpumask_var(cm); } -static struct smp_hotplug_thread rcu_cpu_thread_spec = { - .store = &rcu_cpu_kthread_task, - .thread_should_run = rcu_cpu_kthread_should_run, - .thread_fn = rcu_cpu_kthread, - .thread_comm = "rcuc/%u", - .setup = rcu_cpu_kthread_setup, - .park = rcu_cpu_kthread_park, -}; - /* * Spawn boost kthreads -- called as soon as the scheduler is running. */ static void __init rcu_spawn_boost_kthreads(void) { struct rcu_node *rnp; - int cpu; - for_each_possible_cpu(cpu) - per_cpu(rcu_cpu_has_work, cpu) = 0; - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); rnp = rcu_get_root(rcu_state_p); (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); if (NUM_RCU_NODES > 1) { @@ -1466,11 +1363,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore(&rnp->lock, flags); } -static void invoke_rcu_callbacks_kthread(void) -{ - WARN_ON_ONCE(1); -} - static bool rcu_is_callbacks_kthread(void) { return false; -- cgit v1.2.3 From 8f63c81f4c95624bd077092cb4e9fb4e76a89b0c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 21 Mar 2014 20:19:05 +0100 Subject: rcu: make RCU_BOOST default on RT Since it is no longer invoked from the softirq people run into OOM more often if the priority of the RCU thread is too low. Making boosting default on RT should help in those case and it can be switched off if someone knows better. 
Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index aa3cc057213d..c164babf84b0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -662,7 +662,7 @@ config TREE_RCU_TRACE config RCU_BOOST bool "Enable RCU priority boosting" depends on RT_MUTEXES && PREEMPT_RCU - default n + default y if PREEMPT_RT_FULL help This option boosts the priority of preempted RCU readers that block the current preemptible RCU grace period for too long. -- cgit v1.2.3 From 92b1a46ac63a98d1fce78d88f2c5ceea7d1b27a5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Oct 2012 18:50:54 +0100 Subject: sched: Add support for lazy preemption It has become an obsession to mitigate the determinism vs. throughput loss of RT. Looking at the mainline semantics of preemption points gives a hint why RT sucks throughput wise for ordinary SCHED_OTHER tasks. One major issue is the wakeup of tasks which are right away preempting the waking task while the waking task holds a lock on which the woken task will block right after having preempted the wakee. In mainline this is prevented due to the implicit preemption disable of spin/rw_lock held regions. On RT this is not possible due to the fully preemptible nature of sleeping spinlocks. Though for a SCHED_OTHER task preempting another SCHED_OTHER task this is really not a correctness issue. RT folks are concerned about SCHED_FIFO/RR tasks preemption and not about the purely fairness driven SCHED_OTHER preemption latencies. So I introduced a lazy preemption mechanism which only applies to SCHED_OTHER tasks preempting another SCHED_OTHER task. Aside of the existing preempt_count each tasks sports now a preempt_lazy_count which is manipulated on lock acquiry and release. This is slightly incorrect as for lazyness reasons I coupled this on migrate_disable/enable so some other mechanisms get the same treatment (e.g. get_cpu_light). Now on the scheduler side instead of setting NEED_RESCHED this sets NEED_RESCHED_LAZY in case of a SCHED_OTHER/SCHED_OTHER preemption and therefor allows to exit the waking task the lock held region before the woken task preempts. That also works better for cross CPU wakeups as the other side can stay in the adaptive spinning loop. For RT class preemption there is no change. This simply sets NEED_RESCHED and forgoes the lazy preemption counter. Initial test do not expose any observable latency increasement, but history shows that I've been proven wrong before :) The lazy preemption mode is per default on, but with CONFIG_SCHED_DEBUG enabled it can be disabled via: # echo NO_PREEMPT_LAZY >/sys/kernel/debug/sched_features and reenabled via # echo PREEMPT_LAZY >/sys/kernel/debug/sched_features The test results so far are very machine and workload dependent, but there is a clear trend that it enhances the non RT workload performance. 
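[Illustration, not part of the patch: the resched decision described above can be reduced to a small user-space sketch. This is not the scheduler code; the structure and function names below are invented for this example. RT class preemption still sets NEED_RESCHED immediately, while SCHED_OTHER preempting SCHED_OTHER only sets the lazy flag, so the current task can leave its lock-held region before being preempted.]

#include <stdbool.h>
#include <stdio.h>

/* Invented stand-ins for the task state touched by the real patch. */
struct fake_task {
	const char *comm;
	bool        rt_class;          /* SCHED_FIFO / SCHED_RR */
	bool        need_resched;      /* TIF_NEED_RESCHED */
	bool        need_resched_lazy; /* TIF_NEED_RESCHED_LAZY */
};

/*
 * Sketch of the decision: an RT wakeup (or an RT current task) forces an
 * immediate reschedule; a fair-class vs. fair-class preemption is deferred
 * to the next preemption point via the lazy flag.
 */
static void resched_curr_sketch(struct fake_task *curr, bool waker_is_rt)
{
	if (waker_is_rt || curr->rt_class) {
		curr->need_resched = true;          /* preempt as soon as possible */
		return;
	}
	if (!curr->need_resched && !curr->need_resched_lazy)
		curr->need_resched_lazy = true;     /* defer until a preemption point */
}

int main(void)
{
	struct fake_task t = { .comm = "worker", .rt_class = false };

	resched_curr_sketch(&t, false);
	printf("%s: NEED_RESCHED=%d NEED_RESCHED_LAZY=%d\n",
	       t.comm, t.need_resched, t.need_resched_lazy);

	resched_curr_sketch(&t, true);
	printf("%s: NEED_RESCHED=%d NEED_RESCHED_LAZY=%d\n",
	       t.comm, t.need_resched, t.need_resched_lazy);
	return 0;
}

[End of illustration; the actual flag handling, preempt_lazy_count accounting and entry-code checks follow in the diff below.]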
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/preempt.h | 18 ++++++++++++++- include/linux/ftrace_event.h | 1 + include/linux/preempt.h | 29 +++++++++++++++++++++++- include/linux/sched.h | 37 +++++++++++++++++++++++++++++++ include/linux/thread_info.h | 12 +++++++++- kernel/Kconfig.preempt | 6 +++++ kernel/sched/core.c | 50 +++++++++++++++++++++++++++++++++++++++++- kernel/sched/fair.c | 16 +++++++------- kernel/sched/features.h | 3 +++ kernel/sched/sched.h | 9 ++++++++ kernel/trace/trace.c | 41 ++++++++++++++++++++-------------- kernel/trace/trace.h | 2 ++ kernel/trace/trace_output.c | 13 +++++++++-- 13 files changed, 206 insertions(+), 31 deletions(-) diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 400873450e33..060d1b4e475d 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -85,17 +85,33 @@ static __always_inline void __preempt_count_sub(int val) * a decrement which hits zero means we have no preempt_count and should * reschedule. */ -static __always_inline bool __preempt_count_dec_and_test(void) +static __always_inline bool ____preempt_count_dec_and_test(void) { GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e"); } +static __always_inline bool __preempt_count_dec_and_test(void) +{ + if (____preempt_count_dec_and_test()) + return true; +#ifdef CONFIG_PREEMPT_LAZY + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else + return false; +#endif +} + /* * Returns true when we need to resched and can (barring IRQ state). */ static __always_inline bool should_resched(void) { +#ifdef CONFIG_PREEMPT_LAZY + return unlikely(!raw_cpu_read_4(__preempt_count) || \ + test_thread_flag(TIF_NEED_RESCHED_LAZY)); +#else return unlikely(!raw_cpu_read_4(__preempt_count)); +#endif } #ifdef CONFIG_PREEMPT diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d4db37ee5f2d..004fac22def7 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -63,6 +63,7 @@ struct trace_entry { int pid; unsigned short migrate_disable; unsigned short padding; + unsigned char preempt_lazy_count; }; #define FTRACE_MAX_EVENT \ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 933912ec8268..66587bf4563a 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -33,6 +33,20 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) +#ifdef CONFIG_PREEMPT_LAZY +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) +#define inc_preempt_lazy_count() add_preempt_lazy_count(1) +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count) +#else +#define add_preempt_lazy_count(val) do { } while (0) +#define sub_preempt_lazy_count(val) do { } while (0) +#define inc_preempt_lazy_count() do { } while (0) +#define dec_preempt_lazy_count() do { } while (0) +#define preempt_lazy_count() (0) +#endif + #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ @@ -41,6 +55,12 @@ do { \ barrier(); \ } while (0) +#define preempt_lazy_disable() \ +do { \ + inc_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ @@ -69,6 +89,13 @@ do { \ __preempt_schedule(); \ } while (0) +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ 
+ preempt_check_resched(); \ +} while (0) + #else #define preempt_enable() \ do { \ @@ -147,7 +174,7 @@ do { \ } while (0) #define preempt_fold_need_resched() \ do { \ - if (tif_need_resched()) \ + if (tif_need_resched_now()) \ set_preempt_need_resched(); \ } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 46ae98a12142..05353a40a462 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2793,6 +2793,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +#ifdef CONFIG_PREEMPT_LAZY +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); +} + +static inline int need_resched_lazy(void) +{ + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +} + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +#else +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } +static inline int need_resched_lazy(void) { return 0; } + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +#endif + static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ff307b548ed3..be9f9dc6a4e1 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -102,7 +102,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#ifdef CONFIG_PREEMPT_LAZY +#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \ + test_thread_flag(TIF_NEED_RESCHED_LAZY)) +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED)) +#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY)) + +#else +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED) +#define tif_need_resched_lazy() 0 +#endif #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK /* diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index f8a2982bdbde..11dbe26a8279 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -6,6 +6,12 @@ config PREEMPT_RT_BASE bool select PREEMPT +config HAVE_PREEMPT_LAZY + bool + +config PREEMPT_LAZY + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL + choice prompt "Preemption Model" default PREEMPT_NONE diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e886b83169f..033eb92d053e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -632,6 +632,38 @@ void resched_curr(struct rq *rq) trace_sched_wake_idle_without_ipi(cpu); } +#ifdef CONFIG_PREEMPT_LAZY +void resched_curr_lazy(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + int cpu; + + if (!sched_feat(PREEMPT_LAZY)) { + resched_curr(rq); + return; + } + + lockdep_assert_held(&rq->lock); + + if (test_tsk_need_resched(curr)) + return; + + if (test_tsk_need_resched_lazy(curr)) + return; + + set_tsk_need_resched_lazy(curr); + + cpu = cpu_of(rq); + if (cpu == smp_processor_id()) + return; 
+ + /* NEED_RESCHED_LAZY must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(curr)) + smp_send_reschedule(cpu); +} +#endif + void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -2021,6 +2053,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->on_cpu = 0; #endif init_task_preempt_count(p); +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(p)->preempt_lazy_count = 0; +#endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); @@ -2792,6 +2827,7 @@ void migrate_disable(void) } preempt_disable(); + preempt_lazy_disable(); pin_current_cpu(); p->migrate_disable = 1; preempt_enable(); @@ -2846,6 +2882,7 @@ void migrate_enable(void) unpin_current_cpu(); preempt_enable(); + preempt_lazy_enable(); } EXPORT_SYMBOL(migrate_enable); #else @@ -2974,6 +3011,7 @@ need_resched: next = pick_next_task(rq, prev); clear_tsk_need_resched(prev); + clear_tsk_need_resched_lazy(prev); clear_preempt_need_resched(); rq->skip_clock_update = 0; @@ -3083,6 +3121,14 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) if (likely(!preemptible())) return; +#ifdef CONFIG_PREEMPT_LAZY + /* + * Check for lazy preemption + */ + if (current_thread_info()->preempt_lazy_count && + !test_thread_flag(TIF_NEED_RESCHED)) + return; +#endif do { __preempt_count_add(PREEMPT_ACTIVE); /* @@ -4832,7 +4878,9 @@ void init_idle(struct task_struct *idle, int cpu) /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); - +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(idle)->preempt_lazy_count = 0; +#endif /* * The idle tasks have their own, simple scheduling class: */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef2b104b254c..b51aad9b0464 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2951,7 +2951,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -2975,7 +2975,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static void @@ -3115,7 +3115,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } /* @@ -3306,7 +3306,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static __always_inline @@ -3925,7 +3925,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (rq->curr == p) - resched_curr(rq); + resched_curr_lazy(rq); return; } hrtick_start(rq, delta); @@ -4792,7 +4792,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); /* * Only set the backward buddy when the current task is still * on the rq. 
This can happen when a wakeup gets interleaved @@ -7576,7 +7576,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); - resched_curr(rq); + resched_curr_lazy(rq); } se->vruntime -= cfs_rq->min_vruntime; @@ -7601,7 +7601,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (rq->curr == p) { if (p->prio > oldprio) - resched_curr(rq); + resched_curr_lazy(rq); } else check_preempt_curr(rq, p, 0); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 10552a509306..fd6d61f1ee17 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -52,6 +52,9 @@ SCHED_FEAT(NONTASK_CAPACITY, true) #ifdef CONFIG_PREEMPT_RT_FULL SCHED_FEAT(TTWU_QUEUE, false) +# ifdef CONFIG_PREEMPT_LAZY +SCHED_FEAT(PREEMPT_LAZY, true) +# endif #else /* * Queue remote wakeups on the target CPU and process them diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bb191c584970..9e0deba1566e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1211,6 +1211,15 @@ extern void init_sched_dl_class(void); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); +#ifdef CONFIG_PREEMPT_LAZY +extern void resched_curr_lazy(struct rq *rq); +#else +static inline void resched_curr_lazy(struct rq *rq) +{ + resched_curr(rq); +} +#endif + extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9d1a3f265049..7e8e95698307 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1579,6 +1579,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, struct task_struct *tsk = current; entry->preempt_count = pc & 0xff; + entry->preempt_lazy_count = preempt_lazy_count(); entry->pid = (tsk) ? tsk->pid : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT @@ -1588,7 +1589,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #endif ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | - (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | + (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) | + (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) | (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); entry->migrate_disable = (tsk) ? 
__migrate_disabled(tsk) & 0xFF : 0; @@ -2511,15 +2513,17 @@ get_total_entries(struct trace_buffer *buf, static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n"); - seq_puts(m, "# / _-----=> irqs-off \n"); - seq_puts(m, "# | / _----=> need-resched \n"); - seq_puts(m, "# || / _---=> hardirq/softirq \n"); - seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / _--=> migrate-disable\n"); - seq_puts(m, "# ||||| / delay \n"); - seq_puts(m, "# cmd pid |||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _--------=> CPU# \n"); + seq_puts(m, "# / _-------=> irqs-off \n"); + seq_puts(m, "# | / _------=> need-resched \n"); + seq_puts(m, "# || / _-----=> need-resched_lazy \n"); + seq_puts(m, "# ||| / _----=> hardirq/softirq \n"); + seq_puts(m, "# |||| / _---=> preempt-depth \n"); + seq_puts(m, "# ||||| / _--=> preempt-lazy-depth\n"); + seq_puts(m, "# |||||| / _-=> migrate-disable \n"); + seq_puts(m, "# ||||||| / delay \n"); + seq_puts(m, "# cmd pid |||||||| time | caller \n"); + seq_puts(m, "# \\ / |||||||| \\ | / \n"); } static void print_event_info(struct trace_buffer *buf, struct seq_file *m) @@ -2543,13 +2547,16 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) { print_event_info(buf, m); - seq_puts(m, "# _-----=> irqs-off\n"); - seq_puts(m, "# / _----=> need-resched\n"); - seq_puts(m, "# | / _---=> hardirq/softirq\n"); - seq_puts(m, "# || / _--=> preempt-depth\n"); - seq_puts(m, "# ||| / delay\n"); - seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | |||| | |\n"); + seq_puts(m, "# _-------=> irqs-off \n"); + seq_puts(m, "# / _------=> need-resched \n"); + seq_puts(m, "# |/ _-----=> need-resched_lazy \n"); + seq_puts(m, "# ||/ _----=> hardirq/softirq \n"); + seq_puts(m, "# |||/ _---=> preempt-depth \n"); + seq_puts(m, "# ||||/ _--=> preempt-lazy-depth\n"); + seq_puts(m, "# ||||| / _-=> migrate-disable \n"); + seq_puts(m, "# |||||| / delay\n"); + seq_puts(m, "# TASK-PID CPU# |||||| TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | |||||| | |\n"); } void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 385391fb1d3b..33ec7b68c3a6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -119,6 +119,7 @@ struct kretprobe_trace_entry_head { * NEED_RESCHED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler + * NEED_RESCHED_LAZY - lazy reschedule is requested */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, @@ -127,6 +128,7 @@ enum trace_flag_type { TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, + TRACE_FLAG_NEED_RESCHED_LAZY = 0x40, }; #define TRACE_BUF_SIZE 1024 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8a0b28f796b8..d72684443868 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -410,6 +410,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { char hardsoft_irq; char need_resched; + char need_resched_lazy; char irqs_off; int hardirq; int softirq; @@ -438,6 +439,8 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) need_resched = '.'; break; } + need_resched_lazy = + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; hardsoft_irq = (hardirq && softirq) ? 
'H' : @@ -445,8 +448,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) softirq ? 's' : '.'; - if (!trace_seq_printf(s, "%c%c%c", - irqs_off, need_resched, hardsoft_irq)) + if (!trace_seq_printf(s, "%c%c%c%c", + irqs_off, need_resched, need_resched_lazy, + hardsoft_irq)) return 0; if (entry->preempt_count) @@ -454,6 +458,11 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); + if (entry->preempt_lazy_count) + ret = trace_seq_printf(s, "%x", entry->preempt_lazy_count); + else + ret = trace_seq_putc(s, '.'); + if (entry->migrate_disable) ret = trace_seq_printf(s, "%x", entry->migrate_disable); else -- cgit v1.2.3 From 79b5abe15c3b0d4e39545283694d1d16f3d2c120 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Nov 2012 11:03:47 +0100 Subject: x86-preempt-lazy.patch Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 + arch/x86/include/asm/thread_info.h | 6 ++++++ arch/x86/kernel/asm-offsets.c | 2 ++ arch/x86/kernel/entry_32.S | 20 ++++++++++++++++++-- arch/x86/kernel/entry_64.S | 29 ++++++++++++++++++++++------- 5 files changed, 49 insertions(+), 9 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ea9e6637574c..25d692ddae4e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -21,6 +21,7 @@ config X86_64 ### Arch settings config X86 def_bool y + select HAVE_PREEMPT_LAZY select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_FAST_MULTIPLIER diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 547e344a6dc6..223b9f0d463f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -30,6 +30,8 @@ struct thread_info { __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ int saved_preempt_count; + int preempt_lazy_count; /* 0 => lazy preemptable + <0 => BUG */ mm_segment_t addr_limit; struct restart_block restart_block; void __user *sysenter_return; @@ -75,6 +77,7 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ @@ -100,6 +103,7 @@ struct thread_info { #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) @@ -150,6 +154,8 @@ struct thread_info { #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) + #define STACK_WARN (THREAD_SIZE/8) #define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8)) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 9f6b9341950f..5701b507510b 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -32,6 +32,7 @@ void common(void) { OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); OFFSET(TI_addr_limit, thread_info, addr_limit); + 
OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count); BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); @@ -71,4 +72,5 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED); } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 344b63f18d14..5af2338b30c4 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -359,8 +359,24 @@ END(ret_from_exception) ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) need_resched: + # preempt count == 0 + NEED_RS set? cmpl $0,PER_CPU_VAR(__preempt_count) +#ifndef CONFIG_PREEMPT_LAZY jnz restore_all +#else + jz test_int_off + + # atleast preempt count == 0 ? + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count) + jne restore_all + + cmpl $0,TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ? + jnz restore_all + + testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp) + jz restore_all +test_int_off: +#endif testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all call preempt_schedule_irq @@ -591,7 +607,7 @@ ENDPROC(system_call) ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $_TIF_NEED_RESCHED_MASK, %ecx jz work_notifysig work_resched: call schedule @@ -604,7 +620,7 @@ work_resched: andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $_TIF_NEED_RESCHED_MASK, %ecx jnz work_resched work_notifysig: # deal with pending signals and diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 544b24771f6d..abaa7cb9c188 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -454,8 +454,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $_TIF_NEED_RESCHED_MASK,%edx + jz sysret_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi @@ -554,8 +554,8 @@ GLOBAL(int_with_check) /* First do a reschedule test. */ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $_TIF_NEED_RESCHED_MASK,%edx + jz int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi @@ -870,8 +870,8 @@ native_irq_return_ldt: /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $_TIF_NEED_RESCHED_MASK,%edx + jz retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi @@ -903,7 +903,22 @@ retint_signal: /* rcx: threadinfo. interrupts off. */ ENTRY(retint_kernel) cmpl $0,PER_CPU_VAR(__preempt_count) +#ifndef CONFIG_PREEMPT_LAZY jnz retint_restore_args +#else + jz check_int_off + + # atleast preempt count == 0 ? + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count) + jnz retint_restore_args + + cmpl $0, TI_preempt_lazy_count(%rcx) + jnz retint_restore_args + + bt $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx) + jnc retint_restore_args +check_int_off: +#endif bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? 
*/ jnc retint_restore_args call preempt_schedule_irq @@ -1304,7 +1319,7 @@ paranoid_userspace: movq %rsp,%rdi /* &pt_regs */ call sync_regs movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx + testl $_TIF_NEED_RESCHED_MASK,%ebx jnz paranoid_schedule movl %ebx,%edx /* arg3: thread flags */ TRACE_IRQS_ON -- cgit v1.2.3 From 0144e09aa6af2595d66c68a563318610effc02e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Oct 2012 12:04:11 +0100 Subject: arm-preempt-lazy-support.patch Signed-off-by: Thomas Gleixner --- arch/arm/Kconfig | 1 + arch/arm/include/asm/thread_info.h | 3 +++ arch/arm/kernel/asm-offsets.c | 1 + arch/arm/kernel/entry-armv.S | 13 +++++++++++-- arch/arm/kernel/signal.c | 3 ++- 5 files changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 89c4b5ccc68d..d635abf51063 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -62,6 +62,7 @@ config ARM select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE) select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_SYSCALL_TRACEPOINTS diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index ce73ab635414..483c4e7ede2b 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -51,6 +51,7 @@ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ @@ -149,6 +150,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define TIF_SIGPENDING 0 #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ +#define TIF_NEED_RESCHED_LAZY 3 #define TIF_UPROBE 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -162,6 +164,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 2d2d6087b9b1..32caa773fe0e 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -64,6 +64,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_EXEC_DOMAIN, offsetof(struct thread_info, exec_domain)); diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 2f5555d307b3..61af605ae614 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -207,11 +207,18 @@ __irq_svc: #ifdef CONFIG_PREEMPT get_thread_info tsk ldr r8, [tsk, #TI_PREEMPT] @ get preempt count - ldr r0, [tsk, #TI_FLAGS] @ get flags teq r8, #0 @ if preempt count != 0 + bne 1f @ return from exeption + ldr r0, [tsk, #TI_FLAGS] @ 
get flags + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set + blne svc_preempt @ preempt! + + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r8, #0 @ if preempt lazy count != 0 movne r0, #0 @ force flags to 0 - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED_LAZY blne svc_preempt +1: #endif svc_exit r5, irq = 1 @ return from exception @@ -226,6 +233,8 @@ svc_preempt: 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED + bne 1b + tst r0, #_TIF_NEED_RESCHED_LAZY reteq r8 @ go again b 1b #endif diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index bd1983437205..55f92a9f584d 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -574,7 +574,8 @@ asmlinkage int do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) { do { - if (likely(thread_flags & _TIF_NEED_RESCHED)) { + if (likely(thread_flags & (_TIF_NEED_RESCHED | + _TIF_NEED_RESCHED_LAZY))) { schedule(); } else { if (unlikely(!user_mode(regs))) -- cgit v1.2.3 From 64ecba68355ad2cd93bb1fe6da0346754ff9e267 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Nov 2012 10:14:11 +0100 Subject: powerpc-preempt-lazy-support.patch Signed-off-by: Thomas Gleixner --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/thread_info.h | 11 ++++++++--- arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/entry_32.S | 17 ++++++++++++----- arch/powerpc/kernel/entry_64.S | 14 +++++++++++--- 5 files changed, 33 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8d9d1d6f16d3..5e0337b11d52 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,6 +137,7 @@ config PPC select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER + select HAVE_PREEMPT_LAZY select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select CLONE_BACKWARDS diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index b034ecdb7c74..16ff509e28df 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -43,6 +43,8 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_lazy_count; /* 0 => preemptable, + <0 => BUG */ struct restart_block restart_block; unsigned long local_flags; /* private flags for thread */ @@ -88,8 +90,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling - TIF_NEED_RESCHED */ +#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling necessary */ #define TIF_32BIT 4 /* 32 bit binary */ #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ @@ -107,6 +108,8 @@ static inline struct thread_info *current_thread_info(void) #if defined(CONFIG_PPC64) #define TIF_ELF2ABI 18 /* function descriptors must die! */ #endif +#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling + TIF_NEED_RESCHED */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1< Date: Thu, 14 May 2015 17:52:17 +0200 Subject: arch/arm64: Add lazy preempt support arm64 is missing support for PREEMPT_RT. 
The main feature which is lacking is support for lazy preemption. The arch-specific entry code, thread information structure definitions, and associated data tables have to be extended to provide this support. Then the Kconfig file has to be extended to indicate the support is available, and also to indicate that support for full RT preemption is now available. Signed-off-by: Anders Roxell --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/thread_info.h | 3 +++ arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/entry.S | 13 ++++++++++--- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b8882e5afe7e..54ec128acf2d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -59,6 +59,7 @@ config ARM64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE + select HAVE_PREEMPT_LAZY select HAVE_SYSCALL_TRACEPOINTS select IRQ_DOMAIN select IRQ_FORCED_THREADING diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 459bf8e53208..3e24ea15c05c 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -50,6 +50,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ struct restart_block restart_block; int preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ int cpu; /* cpu */ }; @@ -108,6 +109,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ +#define TIF_NEED_RESCHED_LAZY 4 #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -124,6 +126,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_NOHZ (1 << TIF_NOHZ) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 9a9fce090d58..f77413646dba 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -36,6 +36,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_EXEC_DOMAIN, offsetof(struct thread_info, exec_domain)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2b0f3d5e11c7..0c7c7245b97c 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -367,11 +367,16 @@ el1_irq: #ifdef CONFIG_PREEMPT get_thread_info tsk ldr w24, [tsk, #TI_PREEMPT] // get preempt count - cbnz w24, 1f // preempt count != 0 + cbnz w24, 2f // preempt count != 0 ldr x0, [tsk, #TI_FLAGS] // get flags - tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling? - bl el1_preempt + tbnz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling? + + ldr w24, [tsk, #TI_PREEMPT_LAZY] // get preempt lazy count + cbnz w24, 2f // preempt lazy count != 0 + tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling? 
1: + bl el1_preempt +2: #endif #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_on @@ -385,6 +390,7 @@ el1_preempt: 1: bl preempt_schedule_irq // irq en/disable is done inside ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? + tbnz x0, #TIF_NEED_RESCHED_LAZY, 1b // needs rescheduling? ret x24 #endif @@ -621,6 +627,7 @@ fast_work_pending: str x0, [sp, #S_X0] // returned x0 work_pending: tbnz x1, #TIF_NEED_RESCHED, work_resched + tbnz x1, #TIF_NEED_RESCHED_LAZY, work_resched /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */ ldr x2, [sp, #S_PSTATE] mov x0, sp // 'regs' -- cgit v1.2.3 From 43816fda090ec36c9a2334393737f008e38b1535 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 23 Jan 2014 14:45:59 +0100 Subject: leds: trigger: disable CPU trigger on -RT as it triggers: |CPU: 0 PID: 0 Comm: swapper Not tainted 3.12.8-rt10 #141 |[] (unwind_backtrace+0x0/0xf8) from [] (show_stack+0x1c/0x20) |[] (show_stack+0x1c/0x20) from [] (dump_stack+0x20/0x2c) |[] (dump_stack+0x20/0x2c) from [] (__might_sleep+0x13c/0x170) |[] (__might_sleep+0x13c/0x170) from [] (__rt_spin_lock+0x28/0x38) |[] (__rt_spin_lock+0x28/0x38) from [] (rt_read_lock+0x68/0x7c) |[] (rt_read_lock+0x68/0x7c) from [] (led_trigger_event+0x2c/0x5c) |[] (led_trigger_event+0x2c/0x5c) from [] (ledtrig_cpu+0x54/0x5c) |[] (ledtrig_cpu+0x54/0x5c) from [] (arch_cpu_idle_exit+0x18/0x1c) |[] (arch_cpu_idle_exit+0x18/0x1c) from [] (cpu_startup_entry+0xa8/0x234) |[] (cpu_startup_entry+0xa8/0x234) from [] (rest_init+0xb8/0xe0) |[] (rest_init+0xb8/0xe0) from [] (start_kernel+0x2c4/0x380) Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- drivers/leds/trigger/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig index 49794b47b51c..3d7245d6b2f8 100644 --- a/drivers/leds/trigger/Kconfig +++ b/drivers/leds/trigger/Kconfig @@ -61,7 +61,7 @@ config LEDS_TRIGGER_BACKLIGHT config LEDS_TRIGGER_CPU bool "LED CPU Trigger" - depends on LEDS_TRIGGERS + depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE help This allows LEDs to be controlled by active CPUs. This shows the active CPUs across an array of LEDs so you can see which -- cgit v1.2.3 From 0dc6edd28c15b9bab7626b1a00613aaf7058ecc2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 21 Mar 2013 11:35:49 +0100 Subject: i2c/omap: drop the lock hard irq context The lock is taken while reading two registers. On RT the first lock is taken in hard irq where it might sleep and in the threaded irq. The threaded irq runs in oneshot mode so the hard irq does not run until the thread the completes so there is no reason to grab the lock. 
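For reference, the guarantee relied on here comes from the oneshot threaded-IRQ setup: with IRQF_ONESHOT the interrupt line stays masked until the thread handler returns, so the primary and the threaded handler of one interrupt can never run concurrently. A minimal sketch of that pattern (the demo_* names and the probe wrapper are illustrative, not the actual omap driver code):

#include <linux/interrupt.h>

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	/* hard irq context: no sleeping locks on RT, only decide whether
	 * the thread needs to run */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	/* task context; the line is kept masked (IRQF_ONESHOT) until we
	 * return, so this cannot race with demo_hardirq() */
	return IRQ_HANDLED;
}

static int demo_probe(int irq, void *dev)
{
	return request_threaded_irq(irq, demo_hardirq, demo_thread_fn,
				    IRQF_ONESHOT, "demo", dev);
}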
Signed-off-by: Sebastian Andrzej Siewior --- drivers/i2c/busses/i2c-omap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c index 277a2288d4a8..6d1c2960f9fa 100644 --- a/drivers/i2c/busses/i2c-omap.c +++ b/drivers/i2c/busses/i2c-omap.c @@ -875,15 +875,12 @@ omap_i2c_isr(int irq, void *dev_id) u16 mask; u16 stat; - spin_lock(&dev->lock); - mask = omap_i2c_read_reg(dev, OMAP_I2C_IE_REG); stat = omap_i2c_read_reg(dev, OMAP_I2C_STAT_REG); + mask = omap_i2c_read_reg(dev, OMAP_I2C_IE_REG); if (stat & mask) ret = IRQ_WAKE_THREAD; - spin_unlock(&dev->lock); - return ret; } -- cgit v1.2.3 From 0554ffaead46f035509e5b77141092abd614e02c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jan 2013 12:11:12 +0100 Subject: mmci: Remove bogus local_irq_save() On !RT interrupt runs with interrupts disabled. On RT it's in a thread, so no need to disable interrupts at all. Signed-off-by: Thomas Gleixner --- drivers/mmc/host/mmci.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index 43af791e2e45..863a9e6f2400 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -1153,15 +1153,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id) struct sg_mapping_iter *sg_miter = &host->sg_miter; struct variant_data *variant = host->variant; void __iomem *base = host->base; - unsigned long flags; u32 status; status = readl(base + MMCISTATUS); dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status); - local_irq_save(flags); - do { unsigned int remain, len; char *buffer; @@ -1201,8 +1198,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id) sg_miter_stop(sg_miter); - local_irq_restore(flags); - /* * If we have less than the fifo 'half-full' threshold to transfer, * trigger a PIO interrupt as soon as any data is available. -- cgit v1.2.3 From d3d76100509077e0dfc0c0b1008809c094a616d4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 26 Feb 2015 12:13:36 +0100 Subject: mmc: sdhci: don't provide hard irq handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the sdhci code provides both irq handlers: the primary and the thread handler. Initially it was meant for the primary handler to be very short. The result is not that on -RT we have the primrary handler grabing locks and this isn't really working. As a hack for now I just push both handler into the threaded mode. Cc: stable-rt@vger.kernel.org Reported-By: Michal Šmucr Signed-off-by: Sebastian Andrzej Siewior --- drivers/mmc/host/sdhci.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 023c2010cd75..bcde53774bc9 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2565,6 +2565,31 @@ static irqreturn_t sdhci_thread_irq(int irq, void *dev_id) return isr ? 
IRQ_HANDLED : IRQ_NONE; } +#ifdef CONFIG_PREEMPT_RT_BASE +static irqreturn_t sdhci_rt_irq(int irq, void *dev_id) +{ + irqreturn_t ret; + + local_bh_disable(); + ret = sdhci_irq(irq, dev_id); + local_bh_enable(); + if (ret == IRQ_WAKE_THREAD) + ret = sdhci_thread_irq(irq, dev_id); + return ret; +} +#endif + +static int sdhci_req_irq(struct sdhci_host *host) +{ +#ifdef CONFIG_PREEMPT_RT_BASE + return request_threaded_irq(host->irq, NULL, sdhci_rt_irq, + IRQF_SHARED, mmc_hostname(host->mmc), host); +#else + return request_threaded_irq(host->irq, sdhci_irq, sdhci_thread_irq, + IRQF_SHARED, mmc_hostname(host->mmc), host); +#endif +} + /*****************************************************************************\ * * * Suspend/resume * @@ -2632,9 +2657,7 @@ int sdhci_resume_host(struct sdhci_host *host) } if (!device_may_wakeup(mmc_dev(host->mmc))) { - ret = request_threaded_irq(host->irq, sdhci_irq, - sdhci_thread_irq, IRQF_SHARED, - mmc_hostname(host->mmc), host); + ret = sdhci_req_irq(host); if (ret) return ret; } else { @@ -3253,8 +3276,7 @@ int sdhci_add_host(struct sdhci_host *host) sdhci_init(host, 0); - ret = request_threaded_irq(host->irq, sdhci_irq, sdhci_thread_irq, - IRQF_SHARED, mmc_hostname(mmc), host); + ret = sdhci_req_irq(host); if (ret) { pr_err("%s: Failed to request IRQ %d: %d\n", mmc_hostname(mmc), host->irq, ret); -- cgit v1.2.3 From e64823d6159f9b2bb12dc0ef86f67ea295009fcd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 9 Apr 2015 15:23:01 +0200 Subject: cpufreq: drop K8's driver from beeing selected Ralf posted a picture of a backtrace from | powernowk8_target_fn() -> transition_frequency_fidvid() and then at the | end: | 932 policy = cpufreq_cpu_get(smp_processor_id()); | 933 cpufreq_cpu_put(policy); crashing the system on -RT. I assumed that policy was a NULL pointer but was rulled out. Since Ralf can't do any more investigations on this and I have no machine with this, I simply switch it off. Reported-by: Ralf Mardorf Signed-off-by: Sebastian Andrzej Siewior --- drivers/cpufreq/Kconfig.x86 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index 89ae88f91895..e3108a67e671 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -113,7 +113,7 @@ config X86_POWERNOW_K7_ACPI config X86_POWERNOW_K8 tristate "AMD Opteron/Athlon64 PowerNow!" - depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ + depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE help This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors. Support for K10 and newer processors is now in acpi-cpufreq. -- cgit v1.2.3 From 655ffa83061ab48d58efc910a0df853ee93baf40 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 May 2015 00:23:59 +0200 Subject: gpu/i915: don't open code these things The opencode part is gone in 1f83fee0 ("drm/i915: clear up wedged transitions") the owner check is still there. 
Signed-off-by: Sebastian Andrzej Siewior --- drivers/gpu/drm/i915/i915_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d88dbedeaa77..68560152b721 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -5144,7 +5144,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task) if (!mutex_is_locked(mutex)) return false; -#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && !defined(CONFIG_PREEMPT_RT_BASE) return mutex->owner == task; #else /* Since UP may be pre-empted, we cannot assume that we own the lock */ -- cgit v1.2.3 From a3f5a9c7ca304f479f06a88b35eb1d4b71fd6225 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 25 Apr 2013 18:12:52 +0200 Subject: drm/i915: drop trace_i915_gem_ring_dispatch on rt This tracepoint is responsible for: |[<814cc358>] __schedule_bug+0x4d/0x59 |[<814d24cc>] __schedule+0x88c/0x930 |[<814d3b90>] ? _raw_spin_unlock_irqrestore+0x40/0x50 |[<814d3b95>] ? _raw_spin_unlock_irqrestore+0x45/0x50 |[<810b57b5>] ? task_blocks_on_rt_mutex+0x1f5/0x250 |[<814d27d9>] schedule+0x29/0x70 |[<814d3423>] rt_spin_lock_slowlock+0x15b/0x278 |[<814d3786>] rt_spin_lock+0x26/0x30 |[] gen6_gt_force_wake_get+0x29/0x60 [i915] |[] gen6_ring_get_irq+0x5f/0x100 [i915] |[] ftrace_raw_event_i915_gem_ring_dispatch+0xe3/0x100 [i915] |[] i915_gem_do_execbuffer.isra.13+0xbd3/0x1430 [i915] |[<810f8943>] ? trace_buffer_unlock_commit+0x43/0x60 |[<8113e8d2>] ? ftrace_raw_event_kmem_alloc+0xd2/0x180 |[<8101d063>] ? native_sched_clock+0x13/0x80 |[] i915_gem_execbuffer2+0x99/0x280 [i915] |[] drm_ioctl+0x4c3/0x570 [drm] |[<8101d0d9>] ? sched_clock+0x9/0x10 |[] ? i915_gem_execbuffer+0x480/0x480 [i915] |[<810f1c18>] ? rb_commit+0x68/0xa0 |[<810f1c6c>] ? ring_buffer_unlock_commit+0x1c/0xa0 |[<81197467>] do_vfs_ioctl+0x97/0x540 |[<81021318>] ? ftrace_raw_event_sys_enter+0xd8/0x130 |[<811979a1>] sys_ioctl+0x91/0xb0 |[<814db931>] tracesys+0xe1/0xe6 Chris Wilson does not like to move i915_trace_irq_get() out of the macro |No. This enables the IRQ, as well as making a number of |very expensively serialised read, unconditionally. so it is gone now on RT. 
Cc: stable-rt@vger.kernel.org Reported-by: Joakim Hernberg Signed-off-by: Sebastian Andrzej Siewior --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 1a0611bb576b..8ea32c1616c6 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1170,7 +1170,9 @@ i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file, return ret; } +#ifndef CONFIG_PREEMPT_RT_BASE trace_i915_gem_ring_dispatch(ring, intel_ring_get_seqno(ring), flags); +#endif i915_gem_execbuffer_move_to_active(vmas, ring); i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj); -- cgit v1.2.3 From b886ae10d09692ae19ae8624e2afd0aa77398c6c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 28 Oct 2013 12:19:57 +0100 Subject: wait.h: include atomic.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit | CC init/main.o |In file included from include/linux/mmzone.h:9:0, | from include/linux/gfp.h:4, | from include/linux/kmod.h:22, | from include/linux/module.h:13, | from init/main.c:15: |include/linux/wait.h: In function ‘wait_on_atomic_t’: |include/linux/wait.h:982:2: error: implicit declaration of function ‘atomic_read’ [-Werror=implicit-function-declaration] | if (atomic_read(val) == 0) | ^ Signed-off-by: Sebastian Andrzej Siewior --- include/linux/wait.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/wait.h b/include/linux/wait.h index fc0e99395fbb..5e04bf44762d 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -8,6 +8,7 @@ #include #include #include +#include typedef struct __wait_queue wait_queue_t; typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key); -- cgit v1.2.3 From 6d9d2277b090ffb1c862119aa4f91fd4c9367aaf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 Dec 2011 12:29:04 +0100 Subject: wait-simple: Simple waitqueue implementation wait_queue is a swiss army knife and in most of the cases the complexity is not needed. For RT waitqueues are a constant source of trouble as we can't convert the head lock to a raw spinlock due to fancy and long lasting callbacks. Provide a slim version, which allows RT to replace wait queues. This should go mainline as well, as it lowers memory consumption and runtime overhead. Signed-off-by: Thomas Gleixner smp_mb() added by Steven Rostedt to fix a race condition with swait wakeups vs adding items to the list. 
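Typical use of the new primitives looks roughly like the sketch below; the demo_* names are made up for illustration, the real users follow in later patches.

#include <linux/wait-simple.h>

static DEFINE_SWAIT_HEAD(demo_wq);
static bool demo_done;

/* waiter: sleeps in TASK_UNINTERRUPTIBLE until demo_done becomes true */
static void demo_wait(void)
{
	swait_event(demo_wq, demo_done);
}

/* waker: usable from hard irq context since the head lock is a raw spinlock */
static void demo_signal(void)
{
	demo_done = true;
	swait_wake(&demo_wq);
}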
--- include/linux/wait-simple.h | 235 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/Makefile | 2 +- kernel/sched/wait-simple.c | 68 +++++++++++++ 3 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 include/linux/wait-simple.h create mode 100644 kernel/sched/wait-simple.c diff --git a/include/linux/wait-simple.h b/include/linux/wait-simple.h new file mode 100644 index 000000000000..a9f24692cbe1 --- /dev/null +++ b/include/linux/wait-simple.h @@ -0,0 +1,235 @@ +#ifndef _LINUX_WAIT_SIMPLE_H +#define _LINUX_WAIT_SIMPLE_H + +#include +#include + +#include + +struct swaiter { + struct task_struct *task; + struct list_head node; +}; + +#define DEFINE_SWAITER(name) \ + struct swaiter name = { \ + .task = current, \ + .node = LIST_HEAD_INIT((name).node), \ + } + +struct swait_head { + raw_spinlock_t lock; + struct list_head list; +}; + +#define DEFINE_SWAIT_HEAD(name) \ + struct swait_head name = { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ + .list = LIST_HEAD_INIT((name).list), \ + } + +extern void __init_swait_head(struct swait_head *h, struct lock_class_key *key); + +#define init_swait_head(swh) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_swait_head((swh), &__key); \ + } while (0) + +/* + * Waiter functions + */ +static inline bool swaiter_enqueued(struct swaiter *w) +{ + return w->task != NULL; +} + +extern void swait_prepare(struct swait_head *head, struct swaiter *w, int state); +extern void swait_finish(struct swait_head *head, struct swaiter *w); + +/* + * Adds w to head->list. Must be called with head->lock locked. + */ +static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w) +{ + list_add(&w->node, &head->list); + /* We can't let the condition leak before the setting of head */ + smp_mb(); +} + +/* + * Removes w from head->list. Must be called with head->lock locked. + */ +static inline void __swait_dequeue(struct swaiter *w) +{ + list_del_init(&w->node); +} + +/* + * Check whether a head has waiters enqueued + */ +static inline bool swait_head_has_waiters(struct swait_head *h) +{ + /* Make sure the condition is visible before checking list_empty() */ + smp_mb(); + return !list_empty(&h->list); +} + +/* + * Wakeup functions + */ +extern int __swait_wake(struct swait_head *head, unsigned int state); + +static inline int swait_wake(struct swait_head *head) +{ + return swait_head_has_waiters(head) ? + __swait_wake(head, TASK_NORMAL) : 0; +} + +static inline int swait_wake_interruptible(struct swait_head *head) +{ + return swait_head_has_waiters(head) ? + __swait_wake(head, TASK_INTERRUPTIBLE) : 0; +} + +/* + * Event API + */ + +#define __swait_event(wq, condition) \ +do { \ + DEFINE_SWAITER(__wait); \ + \ + for (;;) { \ + swait_prepare(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + schedule(); \ + } \ + swait_finish(&wq, &__wait); \ +} while (0) + +/** + * swait_event - sleep until a condition gets true + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. 
+ */ +#define swait_event(wq, condition) \ +do { \ + if (condition) \ + break; \ + __swait_event(wq, condition); \ +} while (0) + +#define __swait_event_interruptible(wq, condition, ret) \ +do { \ + DEFINE_SWAITER(__wait); \ + \ + for (;;) { \ + swait_prepare(&wq, &__wait, TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ + } \ + schedule(); \ + } \ + swait_finish(&wq, &__wait); \ +} while (0) + +#define __swait_event_interruptible_timeout(wq, condition, ret) \ +do { \ + DEFINE_SWAITER(__wait); \ + \ + for (;;) { \ + swait_prepare(&wq, &__wait, TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ + } \ + ret = schedule_timeout(ret); \ + if (!ret) \ + break; \ + } \ + swait_finish(&wq, &__wait); \ +} while (0) + +/** + * swait_event_interruptible - sleep until a condition gets true + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_INTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + */ +#define swait_event_interruptible(wq, condition) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __swait_event_interruptible(wq, condition, __ret); \ + __ret; \ +}) + +#define swait_event_interruptible_timeout(wq, condition, timeout) \ +({ \ + int __ret = timeout; \ + if (!(condition)) \ + __swait_event_interruptible_timeout(wq, condition, __ret); \ + __ret; \ +}) + +#define __swait_event_timeout(wq, condition, ret) \ +do { \ + DEFINE_SWAITER(__wait); \ + \ + for (;;) { \ + swait_prepare(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + ret = schedule_timeout(ret); \ + if (!ret) \ + break; \ + } \ + swait_finish(&wq, &__wait); \ +} while (0) + +/** + * swait_event_timeout - sleep until a condition gets true or a timeout elapses + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * The function returns 0 if the @timeout elapsed, and the remaining + * jiffies if the condition evaluated to true before the timeout elapsed. 
+ */ +#define swait_event_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!(condition)) \ + __swait_event_timeout(wq, condition, __ret); \ + __ret; \ +}) + +#endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index ab32b7b0db5c..39b3e81361aa 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -13,7 +13,7 @@ endif obj-y += core.o proc.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o idle.o +obj-y += wait.o wait-simple.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/wait-simple.c b/kernel/sched/wait-simple.c new file mode 100644 index 000000000000..040d7146e4df --- /dev/null +++ b/kernel/sched/wait-simple.c @@ -0,0 +1,68 @@ +/* + * Simple waitqueues without fancy flags and callbacks + * + * (C) 2011 Thomas Gleixner + * + * Based on kernel/wait.c + * + * For licencing details see kernel-base/COPYING + */ +#include +#include +#include +#include + +void __init_swait_head(struct swait_head *head, struct lock_class_key *key) +{ + raw_spin_lock_init(&head->lock); + lockdep_set_class(&head->lock, key); + INIT_LIST_HEAD(&head->list); +} +EXPORT_SYMBOL(__init_swait_head); + +void swait_prepare(struct swait_head *head, struct swaiter *w, int state) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&head->lock, flags); + w->task = current; + if (list_empty(&w->node)) + __swait_enqueue(head, w); + set_current_state(state); + raw_spin_unlock_irqrestore(&head->lock, flags); +} +EXPORT_SYMBOL(swait_prepare); + +void swait_finish(struct swait_head *head, struct swaiter *w) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + if (w->task) { + raw_spin_lock_irqsave(&head->lock, flags); + __swait_dequeue(w); + raw_spin_unlock_irqrestore(&head->lock, flags); + } +} +EXPORT_SYMBOL(swait_finish); + +int __swait_wake(struct swait_head *head, unsigned int state) +{ + struct swaiter *curr, *next; + unsigned long flags; + int woken = 0; + + raw_spin_lock_irqsave(&head->lock, flags); + + list_for_each_entry_safe(curr, next, &head->list, node) { + if (wake_up_state(curr->task, state)) { + __swait_dequeue(curr); + curr->task = NULL; + woken++; + } + } + + raw_spin_unlock_irqrestore(&head->lock, flags); + return woken; +} +EXPORT_SYMBOL(__swait_wake); -- cgit v1.2.3 From 918bf065885d3b2c2bd82a379c7b226856ca9d92 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Jan 2013 11:47:35 +0100 Subject: wait-simple: Rework for use with completions Signed-off-by: Thomas Gleixner --- include/linux/wait-simple.h | 60 ++++++++----------------------------- kernel/sched/wait-simple.c | 73 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 76 insertions(+), 57 deletions(-) diff --git a/include/linux/wait-simple.h b/include/linux/wait-simple.h index a9f24692cbe1..4efba4d28c48 100644 --- a/include/linux/wait-simple.h +++ b/include/linux/wait-simple.h @@ -22,12 +22,14 @@ struct swait_head { struct list_head list; }; -#define DEFINE_SWAIT_HEAD(name) \ - struct swait_head name = { \ +#define SWAIT_HEAD_INITIALIZER(name) { \ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ .list = LIST_HEAD_INIT((name).list), \ } +#define DEFINE_SWAIT_HEAD(name) \ + struct swait_head name = SWAIT_HEAD_INITIALIZER(name) + extern void __init_swait_head(struct swait_head *h, struct lock_class_key *key); #define init_swait_head(swh) \ @@ -40,63 +42,25 @@ extern void __init_swait_head(struct 
swait_head *h, struct lock_class_key *key); /* * Waiter functions */ -static inline bool swaiter_enqueued(struct swaiter *w) -{ - return w->task != NULL; -} - +extern void swait_prepare_locked(struct swait_head *head, struct swaiter *w); extern void swait_prepare(struct swait_head *head, struct swaiter *w, int state); +extern void swait_finish_locked(struct swait_head *head, struct swaiter *w); extern void swait_finish(struct swait_head *head, struct swaiter *w); -/* - * Adds w to head->list. Must be called with head->lock locked. - */ -static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w) -{ - list_add(&w->node, &head->list); - /* We can't let the condition leak before the setting of head */ - smp_mb(); -} - -/* - * Removes w from head->list. Must be called with head->lock locked. - */ -static inline void __swait_dequeue(struct swaiter *w) -{ - list_del_init(&w->node); -} - -/* - * Check whether a head has waiters enqueued - */ -static inline bool swait_head_has_waiters(struct swait_head *h) -{ - /* Make sure the condition is visible before checking list_empty() */ - smp_mb(); - return !list_empty(&h->list); -} - /* * Wakeup functions */ -extern int __swait_wake(struct swait_head *head, unsigned int state); - -static inline int swait_wake(struct swait_head *head) -{ - return swait_head_has_waiters(head) ? - __swait_wake(head, TASK_NORMAL) : 0; -} +extern unsigned int __swait_wake(struct swait_head *head, unsigned int state, unsigned int num); +extern unsigned int __swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num); -static inline int swait_wake_interruptible(struct swait_head *head) -{ - return swait_head_has_waiters(head) ? - __swait_wake(head, TASK_INTERRUPTIBLE) : 0; -} +#define swait_wake(head) __swait_wake(head, TASK_NORMAL, 1) +#define swait_wake_interruptible(head) __swait_wake(head, TASK_INTERRUPTIBLE, 1) +#define swait_wake_all(head) __swait_wake(head, TASK_NORMAL, 0) +#define swait_wake_all_interruptible(head) __swait_wake(head, TASK_INTERRUPTIBLE, 0) /* * Event API */ - #define __swait_event(wq, condition) \ do { \ DEFINE_SWAITER(__wait); \ diff --git a/kernel/sched/wait-simple.c b/kernel/sched/wait-simple.c index 040d7146e4df..2c856267445b 100644 --- a/kernel/sched/wait-simple.c +++ b/kernel/sched/wait-simple.c @@ -12,6 +12,28 @@ #include #include +/* Adds w to head->list. Must be called with head->lock locked. */ +static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w) +{ + list_add(&w->node, &head->list); + /* We can't let the condition leak before the setting of head */ + smp_mb(); +} + +/* Removes w from head->list. Must be called with head->lock locked. 
*/ +static inline void __swait_dequeue(struct swaiter *w) +{ + list_del_init(&w->node); +} + +/* Check whether a head has waiters enqueued */ +static inline bool swait_head_has_waiters(struct swait_head *h) +{ + /* Make sure the condition is visible before checking list_empty() */ + smp_mb(); + return !list_empty(&h->list); +} + void __init_swait_head(struct swait_head *head, struct lock_class_key *key) { raw_spin_lock_init(&head->lock); @@ -20,19 +42,31 @@ void __init_swait_head(struct swait_head *head, struct lock_class_key *key) } EXPORT_SYMBOL(__init_swait_head); +void swait_prepare_locked(struct swait_head *head, struct swaiter *w) +{ + w->task = current; + if (list_empty(&w->node)) + __swait_enqueue(head, w); +} + void swait_prepare(struct swait_head *head, struct swaiter *w, int state) { unsigned long flags; raw_spin_lock_irqsave(&head->lock, flags); - w->task = current; - if (list_empty(&w->node)) - __swait_enqueue(head, w); - set_current_state(state); + swait_prepare_locked(head, w); + __set_current_state(state); raw_spin_unlock_irqrestore(&head->lock, flags); } EXPORT_SYMBOL(swait_prepare); +void swait_finish_locked(struct swait_head *head, struct swaiter *w) +{ + __set_current_state(TASK_RUNNING); + if (w->task) + __swait_dequeue(w); +} + void swait_finish(struct swait_head *head, struct swaiter *w) { unsigned long flags; @@ -46,22 +80,43 @@ void swait_finish(struct swait_head *head, struct swaiter *w) } EXPORT_SYMBOL(swait_finish); -int __swait_wake(struct swait_head *head, unsigned int state) +unsigned int +__swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num) { struct swaiter *curr, *next; - unsigned long flags; int woken = 0; - raw_spin_lock_irqsave(&head->lock, flags); - list_for_each_entry_safe(curr, next, &head->list, node) { if (wake_up_state(curr->task, state)) { __swait_dequeue(curr); + /* + * The waiting task can free the waiter as + * soon as curr->task = NULL is written, + * without taking any locks. A memory barrier + * is required here to prevent the following + * store to curr->task from getting ahead of + * the dequeue operation. + */ + smp_wmb(); curr->task = NULL; - woken++; + if (++woken == num) + break; } } + return woken; +} + +unsigned int +__swait_wake(struct swait_head *head, unsigned int state, unsigned int num) +{ + unsigned long flags; + int woken; + if (!swait_head_has_waiters(head)) + return 0; + + raw_spin_lock_irqsave(&head->lock, flags); + woken = __swait_wake_locked(head, state, num); raw_spin_unlock_irqrestore(&head->lock, flags); return woken; } -- cgit v1.2.3 From 568782bac7cb3258f4b7e1c89463aad322b65b98 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 27 Aug 2013 14:20:26 -0400 Subject: simple-wait: rename and export the equivalent of waitqueue_active() The function "swait_head_has_waiters()" was internalized into wait-simple.c but it parallels the waitqueue_active of normal waitqueue support. Given that there are over 150 waitqueue_active users in drivers/ fs/ kernel/ and the like, lets make it globally visible, and rename it to parallel the waitqueue_active accordingly. We'll need to do this if we expect to expand its usage beyond RT. 
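The intended use mirrors the existing waitqueue_active() idiom: skip the wakeup, and with it the head lock, when nothing is queued. A rough sketch (struct demo_dev is a made-up example structure):

struct demo_dev {
	struct swait_head	wq;
	bool			data_ready;
};

static void demo_kick(struct demo_dev *dev)
{
	dev->data_ready = true;
	/*
	 * swaitqueue_active() issues smp_mb(), pairing with the barrier in
	 * __swait_enqueue(), so the idea is that a concurrent waiter either
	 * observes data_ready or is already on the list and gets woken.
	 */
	if (swaitqueue_active(&dev->wq))
		swait_wake(&dev->wq);
}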
Signed-off-by: Paul Gortmaker Signed-off-by: Sebastian Andrzej Siewior --- include/linux/wait-simple.h | 8 ++++++++ kernel/sched/wait-simple.c | 10 +--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/wait-simple.h b/include/linux/wait-simple.h index 4efba4d28c48..f86bca2c41d5 100644 --- a/include/linux/wait-simple.h +++ b/include/linux/wait-simple.h @@ -47,6 +47,14 @@ extern void swait_prepare(struct swait_head *head, struct swaiter *w, int state) extern void swait_finish_locked(struct swait_head *head, struct swaiter *w); extern void swait_finish(struct swait_head *head, struct swaiter *w); +/* Check whether a head has waiters enqueued */ +static inline bool swaitqueue_active(struct swait_head *h) +{ + /* Make sure the condition is visible before checking list_empty() */ + smp_mb(); + return !list_empty(&h->list); +} + /* * Wakeup functions */ diff --git a/kernel/sched/wait-simple.c b/kernel/sched/wait-simple.c index 2c856267445b..7dfa86d1f654 100644 --- a/kernel/sched/wait-simple.c +++ b/kernel/sched/wait-simple.c @@ -26,14 +26,6 @@ static inline void __swait_dequeue(struct swaiter *w) list_del_init(&w->node); } -/* Check whether a head has waiters enqueued */ -static inline bool swait_head_has_waiters(struct swait_head *h) -{ - /* Make sure the condition is visible before checking list_empty() */ - smp_mb(); - return !list_empty(&h->list); -} - void __init_swait_head(struct swait_head *head, struct lock_class_key *key) { raw_spin_lock_init(&head->lock); @@ -112,7 +104,7 @@ __swait_wake(struct swait_head *head, unsigned int state, unsigned int num) unsigned long flags; int woken; - if (!swait_head_has_waiters(head)) + if (!swaitqueue_active(head)) return 0; raw_spin_lock_irqsave(&head->lock, flags); -- cgit v1.2.3 From 61582c38fc9a3b23418a92e55968ce2a79d1fdb2 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Fri, 11 Jul 2014 15:26:11 +0200 Subject: work-simple: Simple work queue implemenation Provides a framework for enqueuing callbacks from irq context PREEMPT_RT_FULL safe. The callbacks are executed in kthread context. Bases on wait-simple. 
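Expected usage is roughly the following (all demo_* names are placeholders): take a reference on the worker at init time, queue events from hard irq context, drop the reference on teardown.

#include <linux/interrupt.h>
#include <linux/work-simple.h>

static struct swork_event demo_event;

/* runs in the kswork kthread: fully preemptible, sleeping locks are fine */
static void demo_event_fn(struct swork_event *sev)
{
}

static int demo_init(void)
{
	int ret;

	ret = swork_get();		/* creates the worker on first use */
	if (ret)
		return ret;
	INIT_SWORK(&demo_event, demo_event_fn);
	return 0;
}

static irqreturn_t demo_irq(int irq, void *dev_id)
{
	swork_queue(&demo_event);	/* only a raw lock is taken, RT safe */
	return IRQ_HANDLED;
}

static void demo_exit(void)
{
	swork_put();
}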
Signed-off-by: Daniel Wagner Cc: Sebastian Andrzej Siewior --- include/linux/work-simple.h | 24 +++++++ kernel/sched/Makefile | 2 +- kernel/sched/work-simple.c | 172 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 include/linux/work-simple.h create mode 100644 kernel/sched/work-simple.c diff --git a/include/linux/work-simple.h b/include/linux/work-simple.h new file mode 100644 index 000000000000..f175fa9a6016 --- /dev/null +++ b/include/linux/work-simple.h @@ -0,0 +1,24 @@ +#ifndef _LINUX_SWORK_H +#define _LINUX_SWORK_H + +#include + +struct swork_event { + struct list_head item; + unsigned long flags; + void (*func)(struct swork_event *); +}; + +static inline void INIT_SWORK(struct swork_event *event, + void (*func)(struct swork_event *)) +{ + event->flags = 0; + event->func = func; +} + +bool swork_queue(struct swork_event *sev); + +int swork_get(void); +void swork_put(void); + +#endif /* _LINUX_SWORK_H */ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 39b3e81361aa..33d7bfc30a9b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -13,7 +13,7 @@ endif obj-y += core.o proc.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o wait-simple.o completion.o idle.o +obj-y += wait.o wait-simple.o work-simple.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/work-simple.c b/kernel/sched/work-simple.c new file mode 100644 index 000000000000..c996f755dba6 --- /dev/null +++ b/kernel/sched/work-simple.c @@ -0,0 +1,172 @@ +/* + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de + * + * Provides a framework for enqueuing callbacks from irq context + * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context. 
+ */ + +#include +#include +#include +#include +#include + +#define SWORK_EVENT_PENDING (1 << 0) + +static DEFINE_MUTEX(worker_mutex); +static struct sworker *glob_worker; + +struct sworker { + struct list_head events; + struct swait_head wq; + + raw_spinlock_t lock; + + struct task_struct *task; + int refs; +}; + +static bool swork_readable(struct sworker *worker) +{ + bool r; + + if (kthread_should_stop()) + return true; + + raw_spin_lock_irq(&worker->lock); + r = !list_empty(&worker->events); + raw_spin_unlock_irq(&worker->lock); + + return r; +} + +static int swork_kthread(void *arg) +{ + struct sworker *worker = arg; + + for (;;) { + swait_event_interruptible(worker->wq, + swork_readable(worker)); + if (kthread_should_stop()) + break; + + raw_spin_lock_irq(&worker->lock); + while (!list_empty(&worker->events)) { + struct swork_event *sev; + + sev = list_first_entry(&worker->events, + struct swork_event, item); + list_del(&sev->item); + raw_spin_unlock_irq(&worker->lock); + + WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING, + &sev->flags)); + sev->func(sev); + raw_spin_lock_irq(&worker->lock); + } + raw_spin_unlock_irq(&worker->lock); + } + return 0; +} + +static struct sworker *swork_create(void) +{ + struct sworker *worker; + + worker = kzalloc(sizeof(*worker), GFP_KERNEL); + if (!worker) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&worker->events); + raw_spin_lock_init(&worker->lock); + init_swait_head(&worker->wq); + + worker->task = kthread_run(swork_kthread, worker, "kswork"); + if (IS_ERR(worker->task)) { + kfree(worker); + return ERR_PTR(-ENOMEM); + } + + return worker; +} + +static void swork_destroy(struct sworker *worker) +{ + kthread_stop(worker->task); + + WARN_ON(!list_empty(&worker->events)); + kfree(worker); +} + +/** + * swork_queue - queue swork + * + * Returns %false if @work was already on a queue, %true otherwise. + * + * The work is queued and processed on a random CPU + */ +bool swork_queue(struct swork_event *sev) +{ + unsigned long flags; + + if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags)) + return false; + + raw_spin_lock_irqsave(&glob_worker->lock, flags); + list_add_tail(&sev->item, &glob_worker->events); + raw_spin_unlock_irqrestore(&glob_worker->lock, flags); + + swait_wake(&glob_worker->wq); + return true; +} +EXPORT_SYMBOL_GPL(swork_queue); + +/** + * swork_get - get an instance of the sworker + * + * Returns an negative error code if the initialization if the worker did not + * work, %0 otherwise. + * + */ +int swork_get(void) +{ + struct sworker *worker; + + mutex_lock(&worker_mutex); + if (!glob_worker) { + worker = swork_create(); + if (IS_ERR(worker)) { + mutex_unlock(&worker_mutex); + return -ENOMEM; + } + + glob_worker = worker; + } + + glob_worker->refs++; + mutex_unlock(&worker_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(swork_get); + +/** + * swork_put - puts an instance of the sworker + * + * Will destroy the sworker thread. This function must not be called until all + * queued events have been completed. 
+ */ +void swork_put(void) +{ + mutex_lock(&worker_mutex); + + glob_worker->refs--; + if (glob_worker->refs > 0) + goto out; + + swork_destroy(glob_worker); + glob_worker = NULL; +out: + mutex_unlock(&worker_mutex); +} +EXPORT_SYMBOL_GPL(swork_put); -- cgit v1.2.3 From 1ccb68ca2a7a5350feecfe49e2afaafd8d06550d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 8 Apr 2013 16:09:57 +0200 Subject: kernel/treercu: use a simple waitqueue Signed-off-by: Sebastian Andrzej Siewior --- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e73df2108a54..6aae258f9f62 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1434,7 +1434,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) !ACCESS_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - wake_up(&rsp->gp_wq); + swait_wake(&rsp->gp_wq); } /* @@ -1816,7 +1816,7 @@ static int __noreturn rcu_gp_kthread(void *arg) ACCESS_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - wait_event_interruptible(rsp->gp_wq, + swait_event_interruptible(rsp->gp_wq, ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); /* Locking provides needed memory barrier. */ @@ -1844,7 +1844,7 @@ static int __noreturn rcu_gp_kthread(void *arg) ACCESS_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = wait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_interruptible_timeout(rsp->gp_wq, ((gf = ACCESS_ONCE(rsp->gp_flags)) & RCU_GP_FLAG_FQS) || (!ACCESS_ONCE(rnp->qsmask) && @@ -3772,7 +3772,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; - init_waitqueue_head(&rsp->gp_wq); + init_swait_head(&rsp->gp_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5e54e2870e8a..fe31ff16bc57 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -434,7 +434,7 @@ struct rcu_state { unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ - wait_queue_head_t gp_wq; /* Where GP task waits. */ + struct swait_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ -- cgit v1.2.3 From 39f143c66348e9de5975031e0a0faf5c6ac7c7a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Jul 2013 19:00:35 +0200 Subject: rcu-more-swait-conversions.patch Signed-off-by: Thomas Gleixner Merged Steven's static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - swait_wake(&rnp->nocb_gp_wq[rnp->completed & 0x1]); + wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); } Signed-off-by: Steven Rostedt Signed-off-by: Sebastian Andrzej Siewior --- kernel/rcu/tree.h | 5 +++-- kernel/rcu/tree_plugin.h | 18 +++++++++--------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fe31ff16bc57..e8508b4febc5 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -28,6 +28,7 @@ #include #include #include +#include /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -203,7 +204,7 @@ struct rcu_node { /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU - wait_queue_head_t nocb_gp_wq[2]; + struct swait_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. 
*/ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int need_future_gp[2]; @@ -343,7 +344,7 @@ struct rcu_data { atomic_long_t nocb_follower_count_lazy; /* (approximate). */ int nocb_p_count; /* # CBs being invoked by kthread */ int nocb_p_count_lazy; /* (approximate). */ - wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ + struct swait_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5dd80727a6fc..ecd967ecc7ce 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1891,7 +1891,7 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); */ static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); + swait_wake_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); } /* @@ -1909,8 +1909,8 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) static void rcu_init_one_nocb(struct rcu_node *rnp) { - init_waitqueue_head(&rnp->nocb_gp_wq[0]); - init_waitqueue_head(&rnp->nocb_gp_wq[1]); + init_swait_head(&rnp->nocb_gp_wq[0]); + init_swait_head(&rnp->nocb_gp_wq[1]); } #ifndef CONFIG_RCU_NOCB_CPU_ALL @@ -1935,7 +1935,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; - wake_up(&rdp_leader->nocb_wq); + swait_wake(&rdp_leader->nocb_wq); } } @@ -2128,7 +2128,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - wait_event_interruptible( + swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); if (likely(d)) @@ -2156,7 +2156,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); - wait_event_interruptible(my_rdp->nocb_wq, + swait_event_interruptible(my_rdp->nocb_wq, !ACCESS_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { @@ -2237,7 +2237,7 @@ wait_again: * List was empty, wake up the follower. * Memory barriers supplied by atomic_long_add(). */ - wake_up(&rdp->nocb_wq); + swait_wake(&rdp->nocb_wq); } } @@ -2258,7 +2258,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) if (!rcu_nocb_poll) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); - wait_event_interruptible(rdp->nocb_wq, + swait_event_interruptible(rdp->nocb_wq, ACCESS_ONCE(rdp->nocb_follower_head)); } else if (firsttime) { /* Don't drown trace log with "Poll"! */ @@ -2429,7 +2429,7 @@ void __init rcu_init_nohz(void) static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; - init_waitqueue_head(&rdp->nocb_wq); + init_swait_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; } -- cgit v1.2.3 From f25c79f0f8601e848ac16f8b2b1b2fd0a459d548 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Jan 2013 11:23:51 +0100 Subject: completion: Use simple wait queues Completions have no long lasting callbacks and therefor do not need the complex waitqueue variant. Use simple waitqueues which reduces the contention on the waitqueue lock. 
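Note that the caller-visible completion API does not change, only the wait queue behind it does. A typical user (illustrative names) is unaffected:

static DECLARE_COMPLETION(demo_done);

/* waiter: unchanged, now sleeps on the simple waitqueue inside */
static void demo_wait(void)
{
	wait_for_completion(&demo_done);
}

/* waker: complete() now ends up in __swait_wake_locked() under a raw lock */
static void demo_signal(void)
{
	complete(&demo_done);
}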
Signed-off-by: Thomas Gleixner --- include/linux/completion.h | 9 ++++----- include/linux/uprobes.h | 1 + kernel/sched/completion.c | 34 +++++++++++++++++----------------- kernel/sched/core.c | 10 ++++++++-- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/include/linux/completion.h b/include/linux/completion.h index 5d5aaae3af43..3fe8d14c98c0 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -7,8 +7,7 @@ * Atomic wait-for-completion handler data structures. * See kernel/sched/completion.c for details. */ - -#include +#include /* * struct completion - structure used to maintain state for a "completion" @@ -24,11 +23,11 @@ */ struct completion { unsigned int done; - wait_queue_head_t wait; + struct swait_head wait; }; #define COMPLETION_INITIALIZER(work) \ - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } + { 0, SWAIT_HEAD_INITIALIZER((work).wait) } #define COMPLETION_INITIALIZER_ONSTACK(work) \ ({ init_completion(&work); work; }) @@ -73,7 +72,7 @@ struct completion { static inline void init_completion(struct completion *x) { x->done = 0; - init_waitqueue_head(&x->wait); + init_swait_head(&x->wait); } /** diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 60beb5dc7977..f5a644c649b4 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -27,6 +27,7 @@ #include #include #include +#include struct vm_area_struct; struct mm_struct; diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a63f4dc27909..e529e5fcedfb 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -30,10 +30,10 @@ void complete(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); x->done++; - __wake_up_locked(&x->wait, TASK_NORMAL, 1); - spin_unlock_irqrestore(&x->wait.lock, flags); + __swait_wake_locked(&x->wait, TASK_NORMAL, 1); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -50,10 +50,10 @@ void complete_all(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; - __wake_up_locked(&x->wait, TASK_NORMAL, 0); - spin_unlock_irqrestore(&x->wait.lock, flags); + __swait_wake_locked(&x->wait, TASK_NORMAL, 0); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x, long (*action)(long), long timeout, int state) { if (!x->done) { - DECLARE_WAITQUEUE(wait, current); + DEFINE_SWAITER(wait); - __add_wait_queue_tail_exclusive(&x->wait, &wait); + swait_prepare_locked(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } __set_current_state(state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); timeout = action(timeout); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); + swait_finish_locked(&x->wait, &wait); if (!x->done) return timeout; } @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x, { might_sleep(); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); return timeout; } @@ -267,12 +267,12 @@ bool try_wait_for_completion(struct completion *x) unsigned long flags; int ret = 1; - spin_lock_irqsave(&x->wait.lock, 
flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; else x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -290,10 +290,10 @@ bool completion_done(struct completion *x) unsigned long flags; int ret = 1; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 033eb92d053e..301e1068cf65 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2818,7 +2818,10 @@ void migrate_disable(void) } #ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(p->migrate_disable_atomic); + if (unlikely(p->migrate_disable_atomic)) { + tracing_off(); + WARN_ON_ONCE(1); + } #endif if (p->migrate_disable) { @@ -2849,7 +2852,10 @@ void migrate_enable(void) } #ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(p->migrate_disable_atomic); + if (unlikely(p->migrate_disable_atomic)) { + tracing_off(); + WARN_ON_ONCE(1); + } #endif WARN_ON_ONCE(p->migrate_disable <= 0); -- cgit v1.2.3 From 3216ebbedb577e1f28672d5976de92bc91a87a77 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 28 Oct 2013 11:50:06 +0100 Subject: a few open coded completions Signed-off-by: Sebastian Andrzej Siewior --- drivers/net/wireless/orinoco/orinoco_usb.c | 2 +- drivers/usb/gadget/function/f_fs.c | 2 +- drivers/usb/gadget/legacy/inode.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/orinoco/orinoco_usb.c b/drivers/net/wireless/orinoco/orinoco_usb.c index 995846422dc0..dfda85005038 100644 --- a/drivers/net/wireless/orinoco/orinoco_usb.c +++ b/drivers/net/wireless/orinoco/orinoco_usb.c @@ -699,7 +699,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv, while (!ctx->done.done && msecs--) udelay(1000); } else { - wait_event_interruptible(ctx->done.wait, + swait_event_interruptible(ctx->done.wait, ctx->done.done); } break; diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 63314ede7ba6..0ff3631b5224 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1428,7 +1428,7 @@ static void ffs_data_put(struct ffs_data *ffs) pr_info("%s(): freeing\n", __func__); ffs_data_clear(ffs); BUG_ON(waitqueue_active(&ffs->ev.waitq) || - waitqueue_active(&ffs->ep0req_completion.wait)); + swaitqueue_active(&ffs->ep0req_completion.wait)); kfree(ffs->dev_name); kfree(ffs); } diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index db2becd31a51..d74f46063d3a 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -339,7 +339,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len) spin_unlock_irq (&epdata->dev->lock); if (likely (value == 0)) { - value = wait_event_interruptible (done.wait, done.done); + value = swait_event_interruptible (done.wait, done.done); if (value != 0) { spin_lock_irq (&epdata->dev->lock); if (likely (epdata->ep != NULL)) { @@ -348,7 +348,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len) usb_ep_dequeue (epdata->ep, epdata->req); spin_unlock_irq (&epdata->dev->lock); - wait_event (done.wait, done.done); + swait_event (done.wait, done.done); if (epdata->status == -ECONNRESET) epdata->status = -EINTR; } else { -- cgit v1.2.3 From 
49aa9bd883ccea431a87dccfd06090fcc2166e70 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 13 Feb 2015 15:52:24 +0100 Subject: cgroups: use simple wait in css_release() To avoid: |BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:914 |in_atomic(): 1, irqs_disabled(): 0, pid: 92, name: rcuc/11 |2 locks held by rcuc/11/92: | #0: (rcu_callback){......}, at: [] rcu_cpu_kthread+0x3de/0x940 | #1: (rcu_read_lock_sched){......}, at: [] percpu_ref_call_confirm_rcu+0x0/0xd0 |Preemption disabled at:[] percpu_ref_switch_to_atomic_rcu+0x82/0xc0 |CPU: 11 PID: 92 Comm: rcuc/11 Not tainted 3.18.7-rt0+ #1 | ffff8802398cdf80 ffff880235f0bc28 ffffffff815b3a12 0000000000000000 | 0000000000000000 ffff880235f0bc48 ffffffff8109aa16 0000000000000000 | ffff8802398cdf80 ffff880235f0bc78 ffffffff815b8dd4 000000000000df80 |Call Trace: | [] dump_stack+0x4f/0x7c | [] __might_sleep+0x116/0x190 | [] rt_spin_lock+0x24/0x60 | [] queue_work_on+0x6d/0x1d0 | [] css_release+0x81/0x90 | [] percpu_ref_call_confirm_rcu+0xbe/0xd0 | [] percpu_ref_switch_to_atomic_rcu+0x82/0xc0 | [] rcu_cpu_kthread+0x445/0x940 | [] smpboot_thread_fn+0x18d/0x2d0 | [] kthread+0xe8/0x100 | [] ret_from_fork+0x7c/0xb0 Signed-off-by: Sebastian Andrzej Siewior --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 9 +++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1d5196889048..650dd9f04047 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -22,6 +22,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -91,6 +92,7 @@ struct cgroup_subsys_state { /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; + struct swork_event destroy_swork; }; /* bits in struct cgroup_subsys_state flags field */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 136eceadeed1..a32345c255da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4355,10 +4355,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void css_release_work_fn(struct work_struct *work) +static void css_release_work_fn(struct swork_event *sev) { struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, destroy_work); + container_of(sev, struct cgroup_subsys_state, destroy_swork); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; @@ -4395,8 +4395,8 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + INIT_SWORK(&css->destroy_swork, css_release_work_fn); + swork_queue(&css->destroy_swork); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -4997,6 +4997,7 @@ static int __init cgroup_wq_init(void) */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); + BUG_ON(swork_get()); /* * Used to destroy pidlists and separate to serve as flush domain. 
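The deferral pattern above generalizes to any release path that runs in atomic context on PREEMPT_RT: instead of queue_work(), whose locks may sleep there, the object hands its real work to the simple-work (swork) helper thread. A hedged sketch of that pattern, reusing the swork_* API names visible in the hunk above; the object, its functions and the header path are assumptions for illustration, not code from this series:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/work-simple.h>  /* assumed header name for the RT-only swork API */

struct foo_obj {
        struct swork_event release_swork;
        /* ... payload ... */
};

/* Runs in the schedulable swork helper thread; sleeping is fine here. */
static void foo_obj_release_fn(struct swork_event *sev)
{
        struct foo_obj *obj = container_of(sev, struct foo_obj, release_swork);

        kfree(obj);
}

/* Called from atomic context, e.g. an RCU callback; must not sleep. */
static void foo_obj_release(struct foo_obj *obj)
{
        INIT_SWORK(&obj->release_swork, foo_obj_release_fn);
        swork_queue(&obj->release_swork);
}

As in the cgroup change, a one-time swork_get() during initialization is what guarantees the helper thread exists before the first swork_queue() call.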
-- cgit v1.2.3 From 84a76820d197d9dcc1d78a10f7cb641beb1405ea Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sat, 21 Jun 2014 10:09:48 +0200 Subject: scheduling while atomic in cgroup code mm, memcg: make refill_stock() use get_cpu_light() Nikita reported the following memcg scheduling while atomic bug: Call Trace: [e22d5a90] [c0007ea8] show_stack+0x4c/0x168 (unreliable) [e22d5ad0] [c0618c04] __schedule_bug+0x94/0xb0 [e22d5ae0] [c060b9ec] __schedule+0x530/0x550 [e22d5bf0] [c060bacc] schedule+0x30/0xbc [e22d5c00] [c060ca24] rt_spin_lock_slowlock+0x180/0x27c [e22d5c70] [c00b39dc] res_counter_uncharge_until+0x40/0xc4 [e22d5ca0] [c013ca88] drain_stock.isra.20+0x54/0x98 [e22d5cc0] [c01402ac] __mem_cgroup_try_charge+0x2e8/0xbac [e22d5d70] [c01410d4] mem_cgroup_charge_common+0x3c/0x70 [e22d5d90] [c0117284] __do_fault+0x38c/0x510 [e22d5df0] [c011a5f4] handle_pte_fault+0x98/0x858 [e22d5e50] [c060ed08] do_page_fault+0x42c/0x6fc [e22d5f40] [c000f5b4] handle_page_fault+0xc/0x80 What happens: refill_stock() get_cpu_var() drain_stock() res_counter_uncharge() res_counter_uncharge_until() spin_lock() <== boom Fix it by replacing get/put_cpu_var() with get/put_cpu_light(). Cc: stable-rt@vger.kernel.org Reported-by: Nikita Yushchenko Signed-off-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior --- mm/memcontrol.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 498e8844a142..0c4538811188 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2379,14 +2379,17 @@ static void __init memcg_stock_init(void) */ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { - struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); + struct memcg_stock_pcp *stock; + int cpu = get_cpu_light(); + + stock = &per_cpu(memcg_stock, cpu); if (stock->cached != memcg) { /* reset if necessary */ drain_stock(stock); stock->cached = memcg; } stock->nr_pages += nr_pages; - put_cpu_var(memcg_stock); + put_cpu_light(); } /* -- cgit v1.2.3 From 956f9551130ed6bf9985852a6007d3f8d13600ee Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Jun 2013 15:28:11 -0400 Subject: rt,ntp: Move call to schedule_delayed_work() to helper thread The ntp code for notify_cmos_timer() is called from a hard interrupt context. schedule_delayed_work() under PREEMPT_RT_FULL calls spinlocks that have been converted to mutexes, thus calling schedule_delayed_work() from interrupt is not safe. Add a helper thread that does the call to schedule_delayed_work and wake up that thread instead of calling schedule_delayed_work() directly. This is only for CONFIG_PREEMPT_RT_FULL, otherwise the code still calls schedule_delayed_work() directly in irq context. Note: There's a few places in the kernel that do this. Perhaps the RT code should have a dedicated thread that does the checks. Just register a notifier on boot up for your check and wake up the thread when needed. This will be a todo. Signed-off-by: Steven Rostedt --- kernel/time/ntp.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 85fb3d632bd8..02897eb749d3 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -519,10 +520,52 @@ static void sync_cmos_clock(struct work_struct *work) &sync_cmos_work, timespec_to_jiffies(&next)); } +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * RT can not call schedule_delayed_work from real interrupt context. 
+ * Need to make a thread to do the real work. + */ +static struct task_struct *cmos_delay_thread; +static bool do_cmos_delay; + +static int run_cmos_delay(void *ignore) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (do_cmos_delay) { + do_cmos_delay = false; + queue_delayed_work(system_power_efficient_wq, + &sync_cmos_work, 0); + } + schedule(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +void ntp_notify_cmos_timer(void) +{ + do_cmos_delay = true; + /* Make visible before waking up process */ + smp_wmb(); + wake_up_process(cmos_delay_thread); +} + +static __init int create_cmos_delay_thread(void) +{ + cmos_delay_thread = kthread_run(run_cmos_delay, NULL, "kcmosdelayd"); + BUG_ON(!cmos_delay_thread); + return 0; +} +early_initcall(create_cmos_delay_thread); + +#else + void ntp_notify_cmos_timer(void) { queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); } +#endif /* CONFIG_PREEMPT_RT_FULL */ #else void ntp_notify_cmos_timer(void) { } -- cgit v1.2.3 From b426b264af42d4d3fb4964ea1872e6dd905b4f03 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 29 Aug 2013 11:48:57 +0200 Subject: md: disable bcache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It uses anon semaphores |drivers/md/bcache/request.c: In function ‘cached_dev_write_complete’: |drivers/md/bcache/request.c:1007:2: error: implicit declaration of function ‘up_read_non_owner’ [-Werror=implicit-function-declaration] | up_read_non_owner(&dc->writeback_lock); | ^ |drivers/md/bcache/request.c: In function ‘request_write’: |drivers/md/bcache/request.c:1033:2: error: implicit declaration of function ‘down_read_non_owner’ [-Werror=implicit-function-declaration] | down_read_non_owner(&dc->writeback_lock); | ^ either we get rid of those or we have to introduce them… Signed-off-by: Sebastian Andrzej Siewior --- drivers/md/bcache/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 4d200883c505..98b64ed5cb81 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -1,6 +1,7 @@ config BCACHE tristate "Block device as cache" + depends on !PREEMPT_RT_FULL ---help--- Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. -- cgit v1.2.3 From 33514dcd3f07b62988422057ca780f51c66e072e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 1 Jul 2014 11:14:44 -0400 Subject: sched: Do not clear PF_NO_SETAFFINITY flag in select_fallback_rq() I talked with Peter Zijlstra about this, and he told me that the clearing of the PF_NO_SETAFFINITY flag was to deal with the optimization of migrate_disable/enable() that ignores tasks that have that flag set. But that optimization was removed when I did a rework of the cpu hotplug code. I found that ignoring tasks that had that flag set would cause those tasks to not sync with the hotplug code and cause the kernel to crash. Thus it needed to not treat them special and those tasks had to go though the same work as tasks without that flag set. Now that those tasks are not treated special, there's no reason to clear the flag. May still need to be tested as the migrate_me() code does not ignore those flags. 
Cc: stable-rt@vger.kernel.org Signed-off-by: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: Clark Williams Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140701111444.0cfebaa1@gandalf.local.home Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 301e1068cf65..7ebe5182f3e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1472,12 +1472,6 @@ out: } } - /* - * Clear PF_NO_SETAFFINITY, otherwise we wreckage - * migrate_disable/enable. See optimization for - * PF_NO_SETAFFINITY tasks there. - */ - p->flags &= ~PF_NO_SETAFFINITY; return dest_cpu; } -- cgit v1.2.3 From 1b6ed66e7448ac9be49a2bcadfc7e34b1889a5e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 27 Jun 2014 16:24:52 +0200 Subject: workqueue: Prevent deadlock/stall on RT Austin reported a XFS deadlock/stall on RT where scheduled work gets never exececuted and tasks are waiting for each other for ever. The underlying problem is the modification of the RT code to the handling of workers which are about to go to sleep. In mainline a worker thread which goes to sleep wakes an idle worker if there is more work to do. This happens from the guts of the schedule() function. On RT this must be outside and the accessed data structures are not protected against scheduling due to the spinlock to rtmutex conversion. So the naive solution to this was to move the code outside of the scheduler and protect the data structures by the pool lock. That approach turned out to be a little naive as we cannot call into that code when the thread blocks on a lock, as it is not allowed to block on two locks in parallel. So we dont call into the worker wakeup magic when the worker is blocked on a lock, which causes the deadlock/stall observed by Austin and Mike. Looking deeper into that worker code it turns out that the only relevant data structure which needs to be protected is the list of idle workers which can be woken up. So the solution is to protect the list manipulation operations with preempt_enable/disable pairs on RT and call unconditionally into the worker code even when the worker is blocked on a lock. The preemption protection is safe as there is nothing which can fiddle with the list outside of thread context. Reported-and_tested-by: Austin Schuh Reported-and_tested-by: Mike Galbraith Signed-off-by: Thomas Gleixner Link: http://vger.kernel.org/r/alpine.DEB.2.10.1406271249510.5170@nanos Cc: Richard Weinberger Cc: Steven Rostedt Cc: stable-rt@vger.kernel.org --- kernel/sched/core.c | 7 ++++-- kernel/workqueue.c | 61 +++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 53 insertions(+), 15 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7ebe5182f3e6..f093b200d8ad 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3041,9 +3041,8 @@ need_resched: static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state || tsk_is_pi_blocked(tsk)) + if (!tsk->state) return; - /* * If a worker went to sleep, notify and ask workqueue whether * it wants to wake up a task to maintain concurrency. @@ -3051,6 +3050,10 @@ static inline void sched_submit_work(struct task_struct *tsk) if (tsk->flags & PF_WQ_WORKER) wq_worker_sleeping(tsk); + + if (tsk_is_pi_blocked(tsk)) + return; + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0fb5b91bc853..416a3f67e7fd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -123,6 +123,11 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * + * On RT we need the extra protection via rt_lock_idle_list() for + * the list manipulations against read access from + * wq_worker_sleeping(). All other places are nicely serialized via + * pool->lock. + * * A: pool->attach_mutex protected. * * PL: wq_pool_mutex protected. @@ -396,6 +401,31 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ else +#ifdef CONFIG_PREEMPT_RT_BASE +static inline void rt_lock_idle_list(struct worker_pool *pool) +{ + preempt_disable(); +} +static inline void rt_unlock_idle_list(struct worker_pool *pool) +{ + preempt_enable(); +} +static inline void sched_lock_idle_list(struct worker_pool *pool) { } +static inline void sched_unlock_idle_list(struct worker_pool *pool) { } +#else +static inline void rt_lock_idle_list(struct worker_pool *pool) { } +static inline void rt_unlock_idle_list(struct worker_pool *pool) { } +static inline void sched_lock_idle_list(struct worker_pool *pool) +{ + spin_lock_irq(&pool->lock); +} +static inline void sched_unlock_idle_list(struct worker_pool *pool) +{ + spin_unlock_irq(&pool->lock); +} +#endif + + #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; @@ -788,10 +818,16 @@ static struct worker *first_idle_worker(struct worker_pool *pool) */ static void wake_up_worker(struct worker_pool *pool) { - struct worker *worker = first_idle_worker(pool); + struct worker *worker; + + rt_lock_idle_list(pool); + + worker = first_idle_worker(pool); if (likely(worker)) wake_up_process(worker->task); + + rt_unlock_idle_list(pool); } /** @@ -819,7 +855,7 @@ void wq_worker_running(struct task_struct *task) */ void wq_worker_sleeping(struct task_struct *task) { - struct worker *next, *worker = kthread_data(task); + struct worker *worker = kthread_data(task); struct worker_pool *pool; /* @@ -836,25 +872,18 @@ void wq_worker_sleeping(struct task_struct *task) return; worker->sleeping = 1; - spin_lock_irq(&pool->lock); + /* * The counterpart of the following dec_and_test, implied mb, * worklist not empty test sequence is in insert_work(). * Please read comment there. - * - * NOT_RUNNING is clear. This means that we're bound to and - * running on the local cpu w/ rq lock held and preemption - * disabled, which in turn means that none else could be - * manipulating idle_list, so dereferencing idle_list without pool - * lock is safe. 
*/ if (atomic_dec_and_test(&pool->nr_running) && !list_empty(&pool->worklist)) { - next = first_idle_worker(pool); - if (next) - wake_up_process(next->task); + sched_lock_idle_list(pool); + wake_up_worker(pool); + sched_unlock_idle_list(pool); } - spin_unlock_irq(&pool->lock); } /** @@ -1545,7 +1574,9 @@ static void worker_enter_idle(struct worker *worker) worker->last_active = jiffies; /* idle_list is LIFO */ + rt_lock_idle_list(pool); list_add(&worker->entry, &pool->idle_list); + rt_unlock_idle_list(pool); if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); @@ -1578,7 +1609,9 @@ static void worker_leave_idle(struct worker *worker) return; worker_clr_flags(worker, WORKER_IDLE); pool->nr_idle--; + rt_lock_idle_list(pool); list_del_init(&worker->entry); + rt_unlock_idle_list(pool); } static struct worker *alloc_worker(int node) @@ -1746,7 +1779,9 @@ static void destroy_worker(struct worker *worker) pool->nr_workers--; pool->nr_idle--; + rt_lock_idle_list(pool); list_del_init(&worker->entry); + rt_unlock_idle_list(pool); worker->flags |= WORKER_DIE; wake_up_process(worker->task); } -- cgit v1.2.3 From 28a06b1ebb1c6a4dd214b9032c6b9da229f015b7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Jul 2011 20:25:16 +0200 Subject: localversion.patch Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-8vdw4bfcsds27cvox6rpb334@git.kernel.org --- localversion-rt | 1 + 1 file changed, 1 insertion(+) create mode 100644 localversion-rt diff --git a/localversion-rt b/localversion-rt new file mode 100644 index 000000000000..d79dde624aaa --- /dev/null +++ b/localversion-rt @@ -0,0 +1 @@ +-rt10 -- cgit v1.2.3