-rw-r--r--  arch/Kconfig | 5
-rw-r--r--  arch/arm/Kconfig | 13
-rw-r--r--  arch/arm/include/asm/system.h | 2
-rw-r--r--  arch/arm/include/asm/tlb.h | 13
-rw-r--r--  arch/arm/kernel/entry-common.S | 3
-rw-r--r--  arch/arm/kernel/process.c | 6
-rw-r--r--  arch/arm/kernel/signal.c | 8
-rw-r--r--  arch/arm/mm/mmu.c | 2
-rw-r--r--  arch/powerpc/Kconfig | 15
-rw-r--r--  arch/powerpc/include/asm/pgtable-ppc64.h | 9
-rw-r--r--  arch/powerpc/include/asm/rwsem.h | 3
-rw-r--r--  arch/powerpc/include/asm/tlb.h | 20
-rw-r--r--  arch/powerpc/include/asm/tlbflush.h | 17
-rw-r--r--  arch/powerpc/kernel/entry_64.S | 50
-rw-r--r--  arch/powerpc/kernel/idle.c | 6
-rw-r--r--  arch/powerpc/kernel/process.c | 22
-rw-r--r--  arch/powerpc/mm/init_32.c | 2
-rw-r--r--  arch/powerpc/mm/pgtable.c | 24
-rw-r--r--  arch/powerpc/mm/tlb_hash64.c | 23
-rw-r--r--  arch/powerpc/platforms/chrp/time.c | 5
-rw-r--r--  arch/powerpc/platforms/pseries/iommu.c | 9
-rw-r--r--  arch/powerpc/platforms/pseries/rtasd.c | 8
-rw-r--r--  arch/powerpc/xmon/xmon.c | 2
-rw-r--r--  arch/x86/Kconfig | 14
-rw-r--r--  arch/x86/Kconfig.debug | 2
-rw-r--r--  arch/x86/include/asm/highmem.h | 35
-rw-r--r--  arch/x86/include/asm/page_64_types.h | 21
-rw-r--r--  arch/x86/include/asm/paravirt.h | 9
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable_32.h | 6
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 6
-rw-r--r--  arch/x86/include/asm/rwsem.h | 3
-rw-r--r--  arch/x86/include/asm/spinlock.h | 6
-rw-r--r--  arch/x86/include/asm/timer.h | 4
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 24
-rw-r--r--  arch/x86/include/asm/xor_32.h | 19
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 13
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 2
-rw-r--r--  arch/x86/kernel/cpu/common.c | 2
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 6
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 4
-rw-r--r--  arch/x86/kernel/early_printk.c | 6
-rw-r--r--  arch/x86/kernel/entry_32.S | 13
-rw-r--r--  arch/x86/kernel/head64.c | 6
-rw-r--r--  arch/x86/kernel/head_32.S | 1
-rw-r--r--  arch/x86/kernel/i8259.c | 10
-rw-r--r--  arch/x86/kernel/paravirt.c | 17
-rw-r--r--  arch/x86/kernel/process_32.c | 13
-rw-r--r--  arch/x86/kernel/process_64.c | 6
-rw-r--r--  arch/x86/kernel/signal.c | 7
-rw-r--r--  arch/x86/kernel/smp.c | 10
-rw-r--r--  arch/x86/kernel/traps.c | 24
-rw-r--r--  arch/x86/kernel/vm86_32.c | 1
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 54
-rw-r--r--  arch/x86/mm/fault.c | 1
-rw-r--r--  arch/x86/mm/gup.c | 6
-rw-r--r--  arch/x86/mm/highmem_32.c | 48
-rw-r--r--  arch/x86/mm/init.c | 2
-rw-r--r--  arch/x86/mm/pageattr.c | 2
-rw-r--r--  arch/x86/mm/pgtable.c | 2
-rw-r--r--  arch/x86/pci/direct.c | 29
-rw-r--r--  block/blk-core.c | 6
-rw-r--r--  drivers/acpi/acpica/acglobal.h | 7
-rw-r--r--  drivers/acpi/acpica/hwregs.c | 4
-rw-r--r--  drivers/acpi/acpica/hwxface.c | 4
-rw-r--r--  drivers/acpi/acpica/utmutex.c | 7
-rw-r--r--  drivers/acpi/ec.c | 16
-rw-r--r--  drivers/block/paride/pseudo.h | 2
-rw-r--r--  drivers/char/rtc.c | 2
-rw-r--r--  drivers/char/tty_buffer.c | 4
-rw-r--r--  drivers/char/vt.c | 2
-rw-r--r--  drivers/net/3c59x.c | 16
-rw-r--r--  drivers/net/bnx2.c | 2
-rw-r--r--  drivers/net/bnx2x_main.c | 2
-rw-r--r--  drivers/net/loopback.c | 2
-rw-r--r--  drivers/net/mv643xx_eth.c | 6
-rw-r--r--  drivers/net/netxen/netxen_nic_init.c | 2
-rw-r--r--  drivers/net/niu.c | 2
-rw-r--r--  drivers/serial/8250.c | 22
-rw-r--r--  drivers/usb/core/devio.c | 5
-rw-r--r--  drivers/usb/core/hcd.c | 4
-rw-r--r--  drivers/usb/core/message.c | 13
-rw-r--r--  drivers/video/console/fbcon.c | 5
-rw-r--r--  fs/aio.c | 4
-rw-r--r--  fs/btrfs/locking.c | 2
-rw-r--r--  fs/buffer.c | 23
-rw-r--r--  fs/dcache.c | 5
-rw-r--r--  fs/exec.c | 15
-rw-r--r--  fs/file.c | 5
-rw-r--r--  fs/jbd/transaction.c | 6
-rw-r--r--  fs/namespace.c | 17
-rw-r--r--  fs/ntfs/aops.c | 14
-rw-r--r--  fs/pipe.c | 12
-rw-r--r--  fs/proc/array.c | 27
-rw-r--r--  fs/proc/stat.c | 23
-rw-r--r--  fs/proc/task_mmu.c | 4
-rw-r--r--  include/acpi/acpiosxf.h | 2
-rw-r--r--  include/asm-generic/tlb.h | 51
-rw-r--r--  include/linux/bottom_half.h | 8
-rw-r--r--  include/linux/buffer_head.h | 6
-rw-r--r--  include/linux/console.h | 12
-rw-r--r--  include/linux/hardirq.h | 19
-rw-r--r--  include/linux/hrtimer.h | 12
-rw-r--r--  include/linux/init_task.h | 1
-rw-r--r--  include/linux/interrupt.h | 56
-rw-r--r--  include/linux/irq.h | 1
-rw-r--r--  include/linux/jbd.h | 21
-rw-r--r--  include/linux/kernel.h | 9
-rw-r--r--  include/linux/kernel_stat.h | 2
-rw-r--r--  include/linux/mm.h | 106
-rw-r--r--  include/linux/mm_types.h | 7
-rw-r--r--  include/linux/mutex.h | 83
-rw-r--r--  include/linux/netdevice.h | 36
-rw-r--r--  include/linux/netfilter/x_tables.h | 4
-rw-r--r--  include/linux/netpoll.h | 2
-rw-r--r--  include/linux/pagevec.h | 2
-rw-r--r--  include/linux/plist.h | 12
-rw-r--r--  include/linux/profile.h | 11
-rw-r--r--  include/linux/radix-tree.h | 13
-rw-r--r--  include/linux/rt_lock.h | 214
-rw-r--r--  include/linux/rtmutex.h | 2
-rw-r--r--  include/linux/rwlock.h | 55
-rw-r--r--  include/linux/rwsem-spinlock.h | 2
-rw-r--r--  include/linux/rwsem.h | 55
-rw-r--r--  include/linux/sched.h | 144
-rw-r--r--  include/linux/semaphore.h | 41
-rw-r--r--  include/linux/seqlock.h | 36
-rw-r--r--  include/linux/skbuff.h | 3
-rw-r--r--  include/linux/smp.h | 11
-rw-r--r--  include/linux/smp_lock.h | 2
-rw-r--r--  include/linux/spinlock.h | 111
-rw-r--r--  include/linux/spinlock_api_smp.h | 3
-rw-r--r--  include/linux/spinlock_types.h | 24
-rw-r--r--  include/linux/timer.h | 4
-rw-r--r--  include/linux/workqueue.h | 3
-rw-r--r--  init/Kconfig | 1
-rw-r--r--  init/Makefile | 3
-rw-r--r--  init/main.c | 58
-rw-r--r--  kernel/Kconfig.preempt | 92
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/exit.c | 22
-rw-r--r--  kernel/fork.c | 178
-rw-r--r--  kernel/hrtimer.c | 262
-rw-r--r--  kernel/irq/chip.c | 4
-rw-r--r--  kernel/irq/handle.c | 10
-rw-r--r--  kernel/irq/internals.h | 10
-rw-r--r--  kernel/irq/manage.c | 159
-rw-r--r--  kernel/irq/migration.c | 15
-rw-r--r--  kernel/irq/spurious.c | 14
-rw-r--r--  kernel/itimer.c | 1
-rw-r--r--  kernel/lockdep.c | 29
-rw-r--r--  kernel/mutex.c | 7
-rw-r--r--  kernel/notifier.c | 4
-rw-r--r--  kernel/posix-cpu-timers.c | 184
-rw-r--r--  kernel/posix-timers.c | 3
-rw-r--r--  kernel/printk.c | 69
-rw-r--r--  kernel/rcutorture.c | 2
-rw-r--r--  kernel/relay.c | 14
-rw-r--r--  kernel/rt.c | 543
-rw-r--r--  kernel/rtmutex-debug.c | 107
-rw-r--r--  kernel/rtmutex.c | 620
-rw-r--r--  kernel/rtmutex_common.h | 20
-rw-r--r--  kernel/rwlock.c | 3
-rw-r--r--  kernel/sched.c | 525
-rw-r--r--  kernel/sched_debug.c | 13
-rw-r--r--  kernel/sched_rt.c | 61
-rw-r--r--  kernel/signal.c | 12
-rw-r--r--  kernel/softirq.c | 677
-rw-r--r--  kernel/sys.c | 10
-rw-r--r--  kernel/time/tick-sched.c | 9
-rw-r--r--  kernel/timer.c | 109
-rw-r--r--  kernel/trace/Kconfig | 3
-rw-r--r--  kernel/trace/ftrace.c | 3
-rw-r--r--  kernel/trace/trace.c | 4
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/workqueue.c | 56
-rw-r--r--  lib/Kconfig | 1
-rw-r--r--  lib/Kconfig.debug | 4
-rw-r--r--  lib/Makefile | 3
-rw-r--r--  lib/kernel_lock.c | 20
-rw-r--r--  lib/locking-selftest.c | 29
-rw-r--r--  lib/radix-tree.c | 6
-rw-r--r--  lib/scatterlist.c | 7
-rw-r--r--  lib/spinlock_debug.c | 6
-rw-r--r--  mm/bounce.c | 5
-rw-r--r--  mm/filemap.c | 2
-rw-r--r--  mm/highmem.c | 535
-rw-r--r--  mm/memory.c | 27
-rw-r--r--  mm/mmap.c | 26
-rw-r--r--  net/core/dev.c | 98
-rw-r--r--  net/core/flow.c | 22
-rw-r--r--  net/core/netpoll.c | 62
-rw-r--r--  net/core/skbuff.c | 2
-rw-r--r--  net/core/sock.c | 7
-rw-r--r--  net/ipv4/icmp.c | 5
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 9
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 9
-rw-r--r--  net/ipv4/route.c | 4
-rw-r--r--  net/ipv6/netfilter/ip6_tables.c | 13
-rw-r--r--  net/netfilter/core.c | 24
-rw-r--r--  net/netlink/af_netlink.c | 2
-rw-r--r--  net/sched/sch_generic.c | 15
-rwxr-xr-x  scripts/mkcompile_h | 4
203 files changed, 5723 insertions, 1329 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 99193b160232..1b28306abf74 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -33,6 +33,11 @@ config OPROFILE_IBS
config HAVE_OPROFILE
bool
+config PROFILE_NMI
+ bool
+ depends on OPROFILE
+ default y
+
config KPROBES
bool "Kprobes"
depends on KALLSYMS && MODULES
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index aef63c8e3d2d..eac1a927ca6e 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -962,18 +962,7 @@ config LOCAL_TIMERS
accounting to be spread across the timer interval, preventing a
"thundering herd" at every timer tick.
-config PREEMPT
- bool "Preemptible Kernel (EXPERIMENTAL)"
- depends on EXPERIMENTAL
- help
- This option reduces the latency of the kernel when reacting to
- real-time or interactive events by allowing a low priority process to
- be preempted even if it is in kernel mode executing a system call.
- This allows applications to run more reliably even when the system is
- under load.
-
- Say Y here if you are building a kernel for a desktop, embedded
- or real-time system. Say N if you are unsure.
+source kernel/Kconfig.preempt
config HZ
int
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
index d65b2f5bf41f..e849ed9c7cf7 100644
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -60,6 +60,8 @@
#include <linux/linkage.h>
#include <linux/irqflags.h>
+#include <asm/memory.h>
+
#define __exception __attribute__((section(".exception.text")))
struct thread_info;
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index f41a6f57cd12..dd667f26cc01 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -40,17 +40,12 @@ struct mmu_gather {
unsigned long range_end;
};
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
-
-static inline struct mmu_gather *
-tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
+static inline void
+tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned int full_mm_flush)
{
- struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
-
tlb->mm = mm;
tlb->fullmm = full_mm_flush;
-
- return tlb;
}
static inline void
@@ -61,8 +56,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
/* keep the page table cache within bounds */
check_pgt_cache();
-
- put_cpu_var(mmu_gathers);
}
/*
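The ARM conversion above shows the pattern used across the series: the mmu_gather that used to live in a per-CPU variable, pinning the task through get_cpu_var() for the whole unmap, is now supplied by the caller, typically on its stack. A freestanding C sketch of that refactoring shape, with purely illustrative names (nothing here is taken from the patch):

#include <stdio.h>

/* caller-owned gather context: nothing stays pinned while it is in use */
struct gather {
	void *mm;
	int fullmm;
	int pending;
};

static void gather_begin(struct gather *g, void *mm, int fullmm)
{
	g->mm = mm;
	g->fullmm = fullmm;
	g->pending = 0;
}

static void gather_page(struct gather *g, int page)
{
	(void)page;
	g->pending++;		/* batch the page for one flush at the end */
}

static void gather_finish(struct gather *g)
{
	printf("flushing %d batched pages\n", g->pending);
	g->pending = 0;
}

int main(void)
{
	struct gather tlb;	/* was: &get_cpu_var(mmu_gathers) */
	int fake_mm;

	gather_begin(&tlb, &fake_mm, 0);
	gather_page(&tlb, 1);
	gather_page(&tlb, 2);
	gather_finish(&tlb);	/* no put_cpu_var() counterpart needed any more */
	return 0;
}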
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index 8c3de1a350b5..aa5df73fe409 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -59,7 +59,8 @@ work_pending:
b ret_slow_syscall @ Check work again
work_resched:
- bl schedule
+ bl __schedule
+
/*
* "slow" syscall return path. "why" tells us if this was a real syscall.
*/
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 39196dff478c..fcecbb2e3ca2 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -174,9 +174,11 @@ void cpu_idle(void)
}
leds_event(led_idle_end);
tick_nohz_restart_sched_tick();
- preempt_enable_no_resched();
- schedule();
+ local_irq_disable();
+ __preempt_enable_no_resched();
+ __schedule();
preempt_disable();
+ local_irq_enable();
}
}
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index f6bc5d442782..95eb91121c9a 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -627,6 +627,14 @@ static int do_signal(sigset_t *oldset, struct pt_regs *regs, int syscall)
siginfo_t info;
int signr;
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * Fully-preemptible kernel does not need interrupts disabled:
+ */
+ local_irq_enable();
+ preempt_check_resched();
+#endif
+
/*
* We want the common case to go fast, which
* is why we may in certain cases get here from
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 4722582b17b8..8f1e18212780 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -29,8 +29,6 @@
#include "mm.h"
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
/*
* empty_zero_page is a special page that is used for
* zero-initialized data and COW.
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d00131ca0835..0b46b68dad85 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -68,13 +68,6 @@ config LOCKDEP_SUPPORT
bool
default y
-config RWSEM_GENERIC_SPINLOCK
- bool
-
-config RWSEM_XCHGADD_ALGORITHM
- bool
- default y
-
config GENERIC_LOCKBREAK
bool
default y
@@ -252,6 +245,14 @@ config HIGHMEM
source kernel/time/Kconfig
source kernel/Kconfig.hz
source kernel/Kconfig.preempt
+
+config RWSEM_GENERIC_SPINLOCK
+ bool
+ default y
+
+config RWSEM_XCHGADD_ALGORITHM
+ bool
+
source "fs/Kconfig.binfmt"
config HUGETLB_PAGE_SIZE_VARIABLE
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 8cd083c61503..a1ddb07a2b90 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -202,8 +202,15 @@ static inline unsigned long pte_update(struct mm_struct *mm,
assert_pte_locked(mm, addr);
#ifdef CONFIG_PPC_STD_MMU_64
- if (old & _PAGE_HASHPTE)
+ if (old & _PAGE_HASHPTE) {
+#ifdef CONFIG_PREEMPT_RT
+ preempt_disable();
+#endif
hpte_need_flush(mm, addr, ptep, old, huge);
+#ifdef CONFIG_PREEMPT_RT
+ preempt_enable();
+#endif
+ }
#endif
return old;
diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h
index 564c3731cda8..c2494d42ca59 100644
--- a/arch/powerpc/include/asm/rwsem.h
+++ b/arch/powerpc/include/asm/rwsem.h
@@ -173,6 +173,8 @@ static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem)
return (sem->count != 0);
}
+#ifndef CONFIG_PREEMPT_RT
+
struct rw_semaphore {
/* XXX this should be able to be an atomic_t -- paulus */
signed int count;
@@ -213,6 +215,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
{
return (sem->count != 0);
}
+#endif
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_RWSEM_H */
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index e20ff7541f36..3ddc8f6986dd 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -30,26 +30,38 @@ struct mmu_gather;
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
+#define HAVE_ARCH_MMU_GATHER 1
+
+struct pte_freelist_batch;
+
+struct arch_mmu_gather {
+ struct pte_freelist_batch *batch;
+};
+
+#define ARCH_MMU_GATHER_INIT (struct arch_mmu_gather){ .batch = NULL, }
+
#if !defined(CONFIG_PPC_STD_MMU)
#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
#elif defined(__powerpc64__)
-extern void pte_free_finish(void);
+extern void pte_free_finish(struct mmu_gather *tlb);
static inline void tlb_flush(struct mmu_gather *tlb)
{
- struct ppc64_tlb_batch *tlbbatch = &__get_cpu_var(ppc64_tlb_batch);
+ struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
/* If there's a TLB batch pending, then we must flush it because the
* pages are going to be freed and we really don't want to have a CPU
* access a freed page because it has a stale TLB
*/
- if (tlbbatch->index)
+ if (tlbbatch->index) {
__flush_tlb_pending(tlbbatch);
+ }
- pte_free_finish();
+ put_cpu_var(ppc64_tlb_batch);
+ pte_free_finish(tlb);
}
#else
diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h
index abbe3419d1dd..3f675968de70 100644
--- a/arch/powerpc/include/asm/tlbflush.h
+++ b/arch/powerpc/include/asm/tlbflush.h
@@ -101,18 +101,25 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
static inline void arch_enter_lazy_mmu_mode(void)
{
- struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+ struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
batch->active = 1;
+
+ put_cpu_var(ppc64_tlb_batch);
}
static inline void arch_leave_lazy_mmu_mode(void)
{
- struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+ struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
+
+ if (batch->active) {
+ if (batch->index) {
+ __flush_tlb_pending(batch);
+ }
+ batch->active = 0;
+ }
- if (batch->index)
- __flush_tlb_pending(batch);
- batch->active = 0;
+ put_cpu_var(ppc64_tlb_batch);
}
#define arch_flush_lazy_mmu_mode() do {} while (0)
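The __get_cpu_var() to get_cpu_var()/put_cpu_var() conversions above recur throughout the powerpc changes: the raw accessor assumed preemption was already disabled, while the paired form disables it for exactly the duration of the access. A rough user-space analogue of the paired accessor, with one lock per slot standing in for preemption disabling (all names are illustrative):

#include <pthread.h>
#include <stdio.h>

#define NR_SLOTS 2

struct batch { int index; };

static struct batch slot[NR_SLOTS];
static pthread_mutex_t slot_lock[NR_SLOTS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* like get_cpu_var(): the slot cannot be touched by anyone else until put */
static struct batch *get_slot_var(int cpu)
{
	pthread_mutex_lock(&slot_lock[cpu]);
	return &slot[cpu];
}

/* like put_cpu_var(): end of the critical section */
static void put_slot_var(int cpu)
{
	pthread_mutex_unlock(&slot_lock[cpu]);
}

int main(void)
{
	struct batch *b = get_slot_var(0);

	b->index++;			/* safe: slot 0 is ours for the moment */
	put_slot_var(0);

	printf("slot 0 index = %d\n", slot[0].index);
	return 0;
}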
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 43e073477c34..4bb9ce492123 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -625,44 +625,52 @@ do_work:
bne restore
/* here we are preempting the current task */
1:
+ /*
+ * preempt_schedule_irq() expects interrupts disabled and returns
+ * with interrupts disabled. No need to check preemption again,
+ * preempt_schedule_irq just did that for us.
+ */
+ bl .preempt_schedule_irq
#ifdef CONFIG_TRACE_IRQFLAGS
bl .trace_hardirqs_on
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
/* Note: we just clobbered r10 which used to contain the previous
* MSR before the hard-disabling done by the caller of do_work.
* We don't have that value anymore, but it doesn't matter as
* we will hard-enable unconditionally, we can just reload the
* current MSR into r10
*/
+ bl .preempt_schedule_irq
mfmsr r10
-#endif /* CONFIG_TRACE_IRQFLAGS */
- li r0,1
- stb r0,PACASOFTIRQEN(r13)
- stb r0,PACAHARDIRQEN(r13)
- ori r10,r10,MSR_EE
- mtmsrd r10,1 /* reenable interrupts */
- bl .preempt_schedule
- mfmsr r10
- clrrdi r9,r1,THREAD_SHIFT
- rldicl r10,r10,48,1 /* disable interrupts again */
- rotldi r10,r10,16
- mtmsrd r10,1
- ld r4,TI_FLAGS(r9)
- andi. r0,r4,_TIF_NEED_RESCHED
- bne 1b
+ clrrdi r9,r1,THREAD_SHIFT
+ rldicl r10,r10,48,1 /* disable interrupts again */
+ rotldi r10,r10,16
+ mtmsrd r10,1
+ ld r4,TI_FLAGS(r9)
+ andi. r0,r4,(_TIF_NEED_RESCHED)
+ bne 1b
b restore
user_work:
#endif
- /* Enable interrupts */
- ori r10,r10,MSR_EE
- mtmsrd r10,1
-
andi. r0,r4,_TIF_NEED_RESCHED
beq 1f
- bl .schedule
+
+ /* preempt_schedule_irq() expects interrupts disabled. */
+ bl .preempt_schedule_irq
b .ret_from_except_lite
-1: bl .save_nvgprs
+ /* here we are preempting the current task */
+1: li r0,1
+ stb r0,PACASOFTIRQEN(r13)
+ stb r0,PACAHARDIRQEN(r13)
+
+ /* Enable interrupts */
+ ori r10,r10,MSR_EE
+ mtmsrd r10,1
+
+ bl .save_nvgprs
addi r3,r1,STACK_FRAME_OVERHEAD
bl .do_signal
b .ret_from_except
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 88d9c1d5e5fb..1a82d480a833 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -96,9 +96,11 @@ void cpu_idle(void)
tick_nohz_restart_sched_tick();
if (cpu_should_die())
cpu_die();
- preempt_enable_no_resched();
- schedule();
+ local_irq_disable();
+ __preempt_enable_no_resched();
+ __schedule();
preempt_disable();
+ local_irq_enable();
}
}
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 892a9f2e6d76..ef9d506c518e 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -305,6 +305,10 @@ struct task_struct *__switch_to(struct task_struct *prev,
struct thread_struct *new_thread, *old_thread;
unsigned long flags;
struct task_struct *last;
+#if defined(CONFIG_PPC64) && defined (CONFIG_PREEMPT_RT)
+ struct ppc64_tlb_batch *batch;
+ int hadbatch = 0;
+#endif
#ifdef CONFIG_SMP
/* avoid complexity of lazy save/restore of fpu
@@ -396,6 +400,17 @@ struct task_struct *__switch_to(struct task_struct *prev,
old_thread->accum_tb += (current_tb - start_tb);
new_thread->start_tb = current_tb;
}
+
+#ifdef CONFIG_PREEMPT_RT
+ batch = &__get_cpu_var(ppc64_tlb_batch);
+ if (batch->active) {
+ hadbatch = 1;
+ if (batch->index) {
+ __flush_tlb_pending(batch);
+ }
+ batch->active = 0;
+ }
+#endif /* #ifdef CONFIG_PREEMPT_RT */
#endif
local_irq_save(flags);
@@ -414,6 +429,13 @@ struct task_struct *__switch_to(struct task_struct *prev,
local_irq_restore(flags);
+#if defined(CONFIG_PPC64) && defined(CONFIG_PREEMPT_RT)
+ if (hadbatch) {
+ batch = &__get_cpu_var(ppc64_tlb_batch);
+ batch->active = 1;
+ }
+#endif
+
return last;
}
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 3de6a0d93824..3ef5084b90ca 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -54,8 +54,6 @@
#endif
#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
phys_addr_t total_memory;
phys_addr_t total_lowmem;
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 627767d6169b..c8a29044cb6c 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -30,7 +30,6 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
-static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
static unsigned long pte_freelist_forced_free;
struct pte_freelist_batch
@@ -81,11 +80,11 @@ static void pte_free_submit(struct pte_freelist_batch *batch)
void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
{
- /* This is safe since tlb_gather_mmu has disabled preemption */
- struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+ struct pte_freelist_batch **batchp;
- if (atomic_read(&tlb->mm->mm_users) < 2 ||
- cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){
+ batchp = &tlb->arch.batch;
+
+ if (atomic_read(&tlb->mm->mm_users) < 2) {
pgtable_free(pgf);
return;
}
@@ -105,15 +104,16 @@ void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
}
}
-void pte_free_finish(void)
+void pte_free_finish(struct mmu_gather *tlb)
{
- /* This is safe since tlb_gather_mmu has disabled preemption */
- struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+ struct pte_freelist_batch **batchp;
- if (*batchp == NULL)
- return;
- pte_free_submit(*batchp);
- *batchp = NULL;
+ batchp = &tlb->arch.batch;
+
+ if (*batchp) {
+ pte_free_submit(*batchp);
+ *batchp = NULL;
+ }
}
/*
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 937eb90677d9..33c458fc16f1 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -30,14 +30,10 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/bug.h>
+#include <asm/machdep.h>
DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
-/* This is declared as we are using the more or less generic
- * arch/powerpc/include/asm/tlb.h file -- tgall
- */
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
/*
* A linux PTE was changed and the corresponding hash table entry
* needs to be flushed. This function will either perform the flush
@@ -49,7 +45,7 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long pte, int huge)
{
- struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+ struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
unsigned long vsid, vaddr;
unsigned int psize;
int ssize;
@@ -100,6 +96,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
*/
if (!batch->active) {
flush_hash_page(vaddr, rpte, psize, ssize, 0);
+ put_cpu_var(ppc64_tlb_batch);
return;
}
@@ -126,8 +123,22 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
batch->pte[i] = rpte;
batch->vaddr[i] = vaddr;
batch->index = ++i;
+
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * Since flushing tlb needs expensive hypervisor call(s) on celleb,
+ * always flush it on RT to reduce scheduling latency.
+ */
+ if (machine_is(celleb)) {
+ __flush_tlb_pending(batch);
+ put_cpu_var(ppc64_tlb_batch);
+ return;
+ }
+#endif /* CONFIG_PREEMPT_RT */
+
if (i >= PPC64_TLB_BATCH_NR)
__flush_tlb_pending(batch);
+ put_cpu_var(ppc64_tlb_batch);
}
/*
diff --git a/arch/powerpc/platforms/chrp/time.c b/arch/powerpc/platforms/chrp/time.c
index 054dfe5b8e77..8f1d8cd617ab 100644
--- a/arch/powerpc/platforms/chrp/time.c
+++ b/arch/powerpc/platforms/chrp/time.c
@@ -83,7 +83,12 @@ int chrp_set_rtc_time(struct rtc_time *tmarg)
unsigned char save_control, save_freq_select;
struct rtc_time tm = *tmarg;
+#ifdef CONFIG_PREEMPT_RT
+ if (!spin_trylock(&rtc_lock))
+ return -1;
+#else
spin_lock(&rtc_lock);
+#endif
save_control = chrp_cmos_clock_read(RTC_CONTROL); /* tell the clock it's being set */
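The trylock above is the usual PREEMPT_RT compromise: rtc_lock turns into a sleeping lock, so a caller that must not sleep either takes it immediately or gives up and reports failure. A small self-contained illustration of that fallback shape (the -1 convention mirrors the hunk, everything else is made up for the example):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rtc_lock = PTHREAD_MUTEX_INITIALIZER;

static int set_rtc_time(int seconds)
{
	/* must not block here: give up if the lock is contended */
	if (pthread_mutex_trylock(&rtc_lock) != 0)
		return -1;

	printf("programming RTC to %d seconds\n", seconds);

	pthread_mutex_unlock(&rtc_lock);
	return 0;
}

int main(void)
{
	if (set_rtc_time(42) < 0)
		printf("RTC busy, try again later\n");
	return 0;
}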
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 661c8e02bcba..bcdce0afadba 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -140,7 +140,7 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
return ret;
}
-static DEFINE_PER_CPU(u64 *, tce_page) = NULL;
+static DEFINE_PER_CPU_LOCKED(u64 *, tce_page) = NULL;
static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
long npages, unsigned long uaddr,
@@ -154,13 +154,14 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
long l, limit;
long tcenum_start = tcenum, npages_start = npages;
int ret = 0;
+ int cpu;
if (npages == 1) {
return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
direction, attrs);
}
- tcep = __get_cpu_var(tce_page);
+ tcep = get_cpu_var_locked(tce_page, &cpu);
/* This is safe to do since interrupts are off when we're called
* from iommu_alloc{,_sg}()
@@ -169,10 +170,11 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
tcep = (u64 *)__get_free_page(GFP_ATOMIC);
/* If allocation fails, fall back to the loop implementation */
if (!tcep) {
+ put_cpu_var_locked(tce_page, cpu);
return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
direction, attrs);
}
- __get_cpu_var(tce_page) = tcep;
+ per_cpu_var_locked(tce_page, cpu) = tcep;
}
rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
@@ -216,6 +218,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
printk("\ttce[0] val = 0x%llx\n", tcep[0]);
show_stack(current, (unsigned long *)__get_SP());
}
+ put_cpu_var_locked(tce_page, cpu);
return ret;
}
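get_cpu_var_locked() hands back both the data and the slot (cpu) it locked, and every exit path, including the allocation-failure fallback added above, has to pair it with put_cpu_var_locked(). One compact way to keep such multi-exit functions balanced is goto-based cleanup; a standalone sketch under that assumption (illustrative names, a pthread mutex standing in for the per-CPU lock):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t tce_lock = PTHREAD_MUTEX_INITIALIZER;
static void *tce_page;			/* lazily allocated scratch page */

static int build_tces(int npages)
{
	int ret = 0;

	pthread_mutex_lock(&tce_lock);		/* get_cpu_var_locked() */

	if (!tce_page) {
		tce_page = malloc(4096);
		if (!tce_page) {
			ret = -1;		/* caller falls back to the slow path */
			goto out_unlock;
		}
	}

	printf("mapping %d pages through %p\n", npages, tce_page);

out_unlock:
	pthread_mutex_unlock(&tce_lock);	/* put_cpu_var_locked() on every exit */
	return ret;
}

int main(void)
{
	return build_tces(8) ? 1 : 0;
}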
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
index b3cbac855924..493d8deea635 100644
--- a/arch/powerpc/platforms/pseries/rtasd.c
+++ b/arch/powerpc/platforms/pseries/rtasd.c
@@ -208,7 +208,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
break;
case ERR_TYPE_KERNEL_PANIC:
default:
- WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+ WARN_ON_ONCE_NONRT(!irqs_disabled()); /* @@@ DEBUG @@@ */
spin_unlock_irqrestore(&rtasd_log_lock, s);
return;
}
@@ -228,7 +228,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
/* Check to see if we need to or have stopped logging */
if (fatal || !logging_enabled) {
logging_enabled = 0;
- WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+ WARN_ON_ONCE_NONRT(!irqs_disabled()); /* @@@ DEBUG @@@ */
spin_unlock_irqrestore(&rtasd_log_lock, s);
return;
}
@@ -251,13 +251,13 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
else
rtas_log_start += 1;
- WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+ WARN_ON_ONCE_NONRT(!irqs_disabled()); /* @@@ DEBUG @@@ */
spin_unlock_irqrestore(&rtasd_log_lock, s);
wake_up_interruptible(&rtas_log_wait);
break;
case ERR_TYPE_KERNEL_PANIC:
default:
- WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+ WARN_ON_ONCE_NONRT(!irqs_disabled()); /* @@@ DEBUG @@@ */
spin_unlock_irqrestore(&rtasd_log_lock, s);
return;
}
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index e1f33a81e5e1..db7cf0d5042a 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -348,6 +348,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
unsigned long timeout;
#endif
+ preempt_disable();
local_irq_save(flags);
bp = in_breakpoint_table(regs->nip, &offset);
@@ -524,6 +525,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
insert_cpu_bpts();
local_irq_restore(flags);
+ preempt_enable();
return cmd != 'X' && cmd != EOF;
}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 13ffa5df37d7..6efd082d51d3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -123,10 +123,18 @@ config ARCH_MAY_HAVE_PC_FDC
def_bool y
config RWSEM_GENERIC_SPINLOCK
- def_bool !X86_XADD
+ bool
+ depends on !X86_XADD || PREEMPT_RT
+ default y
+
+config ASM_SEMAPHORES
+ bool
+ default y
config RWSEM_XCHGADD_ALGORITHM
- def_bool X86_XADD
+ bool
+ depends on X86_XADD && !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT
+ default y
config ARCH_HAS_CPU_IDLE_WAIT
def_bool y
@@ -672,7 +680,7 @@ config IOMMU_API
config MAXSMP
bool "Configure Maximum number of SMP Processors and NUMA Nodes"
- depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
+ depends on 0 && X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
select CPUMASK_OFFSTACK
default n
---help---
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d105f29bb6bb..33470295c8f4 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -76,6 +76,7 @@ config DEBUG_PER_CPU_MAPS
bool "Debug access to per_cpu maps"
depends on DEBUG_KERNEL
depends on SMP
+ depends on !PREEMPT_RT
default n
---help---
Say Y to verify that the per_cpu map being accessed has
@@ -126,6 +127,7 @@ config DEBUG_NX_TEST
config 4KSTACKS
bool "Use 4Kb for kernel stacks instead of 8Kb"
depends on X86_32
+ default y
---help---
If you say Y here the kernel will use a 4Kb stacksize for the
kernel stack attached to each process/thread. This facilitates
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 014c2b85ae45..3a53e85427b5 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -58,6 +58,17 @@ extern void *kmap_high(struct page *page);
extern void kunmap_high(struct page *page);
void *kmap(struct page *page);
+extern void kunmap_virt(void *ptr);
+extern struct page *kmap_to_page(void *ptr);
+void kunmap(struct page *page);
+
+void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
+void *__kmap_atomic(struct page *page, enum km_type type);
+void *__kmap_atomic_direct(struct page *page, enum km_type type);
+void __kunmap_atomic(void *kvaddr, enum km_type type);
+void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+struct page *__kmap_atomic_to_page(void *ptr);
+
void kunmap(struct page *page);
void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
void *kmap_atomic(struct page *page, enum km_type type);
@@ -67,7 +78,8 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
struct page *kmap_atomic_to_page(void *ptr);
#ifndef CONFIG_PARAVIRT
-#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
+#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
+#define kmap_atomic_pte_direct(page, type) kmap_atomic_direct(page, type)
#endif
#define flush_cache_kmaps() do { } while (0)
@@ -75,6 +87,27 @@ struct page *kmap_atomic_to_page(void *ptr);
extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
unsigned long end_pfn);
+/*
+ * on PREEMPT_RT kmap_atomic() is a wrapper that uses kmap():
+ */
+#ifdef CONFIG_PREEMPT_RT
+# define kmap_atomic_prot(page, type, prot) ({ pagefault_disable(); kmap(page); })
+# define kmap_atomic(page, type) ({ pagefault_disable(); kmap(page); })
+# define kmap_atomic_pfn(pfn, type) kmap(pfn_to_page(pfn))
+# define kunmap_atomic(kvaddr, type) do { pagefault_enable(); kunmap_virt(kvaddr); } while(0)
+# define kmap_atomic_to_page(kvaddr) kmap_to_page(kvaddr)
+# define kmap_atomic_direct(page, type) __kmap_atomic_direct(page, type)
+# define kunmap_atomic_direct(kvaddr, type) __kunmap_atomic(kvaddr, type)
+#else
+# define kmap_atomic_prot(page, type, prot) __kmap_atomic_prot(page, type, prot)
+# define kmap_atomic(page, type) __kmap_atomic(page, type)
+# define kmap_atomic_pfn(pfn, type) __kmap_atomic_pfn(pfn, type)
+# define kunmap_atomic(kvaddr, type) __kunmap_atomic(kvaddr, type)
+# define kmap_atomic_to_page(kvaddr) __kmap_atomic_to_page(kvaddr)
+# define kmap_atomic_direct(page, type) __kmap_atomic(page, type)
+# define kunmap_atomic_direct(kvaddr, type) __kunmap_atomic(kvaddr, type)
+#endif
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_HIGHMEM_H */
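On PREEMPT_RT the header above routes kmap_atomic() to kmap() plus pagefault_disable(), and kunmap_atomic() has to recover the page from the virtual address (hence kunmap_virt() and kmap_to_page()). The key property is that call sites stay unchanged while the back-end is swapped at compile time; a minimal sketch of that selection pattern (the names and the USE_SLEEPING_MAP switch are illustrative):

#include <stdio.h>

static char window[4096];	/* stand-in for a kernel mapping window */

/* fixmap-style back-end: fast, must not sleep */
void *map_fast(int page)
{
	printf("fixmap slot for page %d\n", page);
	return window;
}

/* pkmap-style back-end: may sleep, fine on a fully preemptible kernel */
void *map_sleeping(int page)
{
	printf("pkmap mapping (may sleep) for page %d\n", page);
	return window;
}

#ifdef USE_SLEEPING_MAP		/* think: CONFIG_PREEMPT_RT */
# define map_page(p)	map_sleeping(p)
#else
# define map_page(p)	map_fast(p)
#endif

int main(void)
{
	char *va = map_page(7);

	va[0] = 1;		/* use the mapping as usual */
	return 0;
}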
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 7639dbf5d223..0ec050a154be 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -14,12 +14,21 @@
#define IRQ_STACK_ORDER 2
#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
-#define STACKFAULT_STACK 1
-#define DOUBLEFAULT_STACK 2
-#define NMI_STACK 3
-#define DEBUG_STACK 4
-#define MCE_STACK 5
-#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
+#ifdef CONFIG_PREEMPT_RT
+# define STACKFAULT_STACK 0
+# define DOUBLEFAULT_STACK 1
+# define NMI_STACK 2
+# define DEBUG_STACK 0
+# define MCE_STACK 3
+# define N_EXCEPTION_STACKS 3 /* hw limit: 7 */
+#else
+# define STACKFAULT_STACK 1
+# define DOUBLEFAULT_STACK 2
+# define NMI_STACK 3
+# define DEBUG_STACK 4
+# define MCE_STACK 5
+# define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
+#endif
#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 4fb37c8a0832..513b09e3582c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -340,6 +340,7 @@ struct pv_mmu_ops {
#ifdef CONFIG_HIGHPTE
void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
+ void *(*kmap_atomic_pte_direct)(struct page *page, enum km_type type);
#endif
struct pv_lazy_ops lazy_mode;
@@ -1136,6 +1137,14 @@ static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
return (void *)ret;
}
+
+static inline void *kmap_atomic_pte_direct(struct page *page, enum km_type type)
+{
+ unsigned long ret;
+ ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte_direct,
+ page, type);
+ return (void *)ret;
+}
#endif
static inline void pte_update(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b0165ea01..0e989a1df2fb 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -71,6 +71,7 @@ static inline void pud_clear(pud_t *pudp)
{
unsigned long pgd;
+ preempt_disable();
set_pud(pudp, __pud(0));
/*
@@ -86,6 +87,7 @@ static inline void pud_clear(pud_t *pudp)
if (__pa(pudp) >= pgd && __pa(pudp) <
(pgd + sizeof(pgd_t)*PTRS_PER_PGD))
write_cr3(pgd);
+ preempt_enable();
}
#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 01fd9461d323..b323c3df7d36 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -59,14 +59,20 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
#define pte_offset_map_nested(dir, address) \
((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
pte_index((address)))
+#define pte_offset_map_direct(dir, address) \
+ ((pte_t *)kmap_atomic_pte_direct(pmd_page(*(dir)), __KM_PTE) + \
+ pte_index((address)))
#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
+#define pte_unmap_direct(pte) kunmap_atomic_direct((pte), __KM_PTE)
#else
#define pte_offset_map(dir, address) \
((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
+#define pte_offset_map_direct(dir, address) pte_offset_map((dir), (address))
#define pte_unmap(pte) do { } while (0)
#define pte_unmap_nested(pte) do { } while (0)
+#define pte_unmap_direct(pte) do { } while (0)
#endif
/* Clear a kernel PTE and flush it from the TLB */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index c57a30117149..efc01aef51c5 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -126,8 +126,10 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
/* x86-64 always has all page tables mapped. */
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
-#define pte_unmap(pte) /* NOP */
-#define pte_unmap_nested(pte) /* NOP */
+#define pte_offset_map_direct(dir, address) pte_offset_kernel((dir), (address))
+#define pte_unmap(pte) do { } while (0)
+#define pte_unmap_nested(pte) do { } while (0)
+#define pte_unmap_direct(pte) do { } while (0)
#define update_mmu_cache(vma, address, pte) do { } while (0)
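Turning the empty pte_unmap() macros from bare comments into do { } while (0) is not cosmetic: only the latter behaves like a single statement, insists on its trailing semicolon, and keeps -Wempty-body quiet. A self-contained illustration:

#include <stdio.h>

/* empty expansion, like the old pte_unmap() that expanded to a bare comment */
#define unmap_empty(p)

/* statement-shaped expansion, like the new definition */
#define unmap_stmt(p)	do { } while (0)

int main(void)
{
	int pte = 0;

	unmap_empty(pte)	/* missing ';' compiles silently */
	unmap_stmt(pte);	/* remove this ';' and the build fails, as it should */

	/*
	 * gcc -Wempty-body also flags "if (pte) unmap_empty(pte);" because it
	 * expands to an empty if body; the do/while form stays quiet.
	 */
	if (pte)
		unmap_stmt(pte);

	printf("done\n");
	return 0;
}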
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index a8646cb5f0c2..92d67a6e07ee 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -261,6 +261,8 @@ static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem)
return (sem->count != 0);
}
+#ifndef CONFIG_PREEMPT_RT
+
struct rw_semaphore {
signed long count;
spinlock_t wait_lock;
@@ -301,6 +303,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
{
return (sem->count != 0);
}
+#endif
#endif /* __KERNEL__ */
#endif /* _ASM_X86_RWSEM_H */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 4e77853321db..135097cb540f 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -298,9 +298,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
-#define _raw_spin_relax(lock) cpu_relax()
-#define _raw_read_relax(lock) cpu_relax()
-#define _raw_write_relax(lock) cpu_relax()
+#define __raw_spin_relax(lock) cpu_relax()
+#define __raw_read_relax(lock) cpu_relax()
+#define __raw_write_relax(lock) cpu_relax()
/* The {read|write|spin}_lock() on x86 are full memory barriers. */
static inline void smp_mb__after_lock(void) { }
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 20ca9c4d4686..1c4277a06750 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -62,9 +62,9 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
unsigned long long ns;
unsigned long flags;
- local_irq_save(flags);
+ raw_local_irq_save(flags);
ns = __cycles_2_ns(cyc);
- local_irq_restore(flags);
+ raw_local_irq_restore(flags);
return ns;
}
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 7f3eba08e7de..1c77e8192f62 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -7,6 +7,21 @@
#include <asm/processor.h>
#include <asm/system.h>
+/*
+ * TLB-flush needs to be nonpreemptible on PREEMPT_RT due to the
+ * following complex race scenario:
+ *
+ * if the current task is lazy-TLB and does a TLB flush and
* gets preempted after the movl %%cr3, %0 but before the
+ * movl %0, %%cr3 then its ->active_mm might change and it will
+ * install the wrong cr3 when it switches back. This is not a
+ * problem for the lazy-TLB task itself, but if the next task it
+ * switches to has an ->mm that is also the lazy-TLB task's
+ * new ->active_mm, then the scheduler will assume that cr3 is
+ * the new one, while we overwrote it with the old one. The result
+ * is the wrong cr3 in the new (non-lazy-TLB) task, which typically
+ * causes an infinite pagefault upon the next userspace access.
+ */
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
@@ -17,7 +32,9 @@
static inline void __native_flush_tlb(void)
{
+ preempt_disable();
native_write_cr3(native_read_cr3());
+ preempt_enable();
}
static inline void __native_flush_tlb_global(void)
@@ -95,6 +112,13 @@ static inline void __flush_tlb_one(unsigned long addr)
static inline void flush_tlb_mm(struct mm_struct *mm)
{
+ /*
+ * This is safe on PREEMPT_RT because if we preempt
+ * right after the check but before the __flush_tlb(),
+ * and if ->active_mm changes, then we might miss a
+ * TLB flush, but that TLB flush happened already when
+ * ->active_mm was changed:
+ */
if (mm == current->active_mm)
__flush_tlb();
}
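The comment block above describes why the cr3 reload must not be preempted: the value read can go stale if another task installs its own page tables in between. A freestanding sketch of that read-modify-write window, with the "preemption" simulated deterministically (all names are illustrative):

#include <stdio.h>

static unsigned long cr3 = 0x1000;	/* pretend page-table base */
static int preempt_count;

static void preempt_disable(void) { preempt_count++; }
static void preempt_enable(void)  { preempt_count--; }

/* stands in for "another task runs and installs its own page tables" */
static void simulated_preemption(void)
{
	if (preempt_count == 0)
		cr3 = 0x2000;
}

static void flush_tlb(void)
{
	unsigned long tmp = cr3;	/* movl %cr3, %0 */

	simulated_preemption();		/* the window the comment describes */

	cr3 = tmp;			/* movl %0, %cr3: may write back a stale value */
}

int main(void)
{
	flush_tlb();
	printf("unprotected:          cr3 = %#lx (stale)\n", cr3);

	cr3 = 0x1000;
	preempt_disable();
	flush_tlb();
	preempt_enable();
	printf("with preempt_disable: cr3 = %#lx\n", cr3);
	return 0;
}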
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a0f495..7a6aa6852678 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -865,7 +865,21 @@ static struct xor_block_template xor_block_pIII_sse = {
#include <asm-generic/xor.h>
#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
+/*
+ * MMX/SSE ops disable preemption for long periods of time,
+ * so on PREEMPT_RT use the register-based ops only:
+ */
+#ifdef CONFIG_PREEMPT_RT
+# define XOR_TRY_TEMPLATES \
+ do { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
+ } while (0)
+# define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST)
+#else
+# define XOR_TRY_TEMPLATES \
do { \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_8regs_p); \
@@ -882,7 +896,8 @@ do { \
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
+# define XOR_SELECT_TEMPLATE(FASTEST) \
(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+#endif /* CONFIG_PREEMPT_RT */
#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 59145f18efa5..a3bf1db8d782 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2600,6 +2600,15 @@ static void ack_apic_level(unsigned int irq)
move_masked_irq(irq);
unmask_IO_APIC_irq_desc(desc);
}
+#if (defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)) && \
+ defined(CONFIG_PREEMPT_HARDIRQS)
+ /*
+ * With threaded interrupts, we always have IRQ_INPROGRESS
+ * when acking.
+ */
+ else if (unlikely(desc->status & IRQ_MOVE_PENDING))
+ move_masked_irq(irq);
+#endif
#ifdef CONFIG_X86_32
if (!(v & (1 << (i & 0x1f)))) {
@@ -3189,7 +3198,6 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
if (irq_want < nr_irqs_gsi)
irq_want = nr_irqs_gsi;
- atomic_spin_lock_irqsave(&vector_lock, flags);
for (new = irq_want; new < nr_irqs; new++) {
desc_new = irq_to_desc_alloc_node(new, node);
if (!desc_new) {
@@ -3201,13 +3209,14 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
if (cfg_new->vector != 0)
continue;
+ atomic_spin_lock_irqsave(&vector_lock, flags);
desc_new = move_irq_desc(desc_new, node);
if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
irq = new;
+ atomic_spin_unlock_irqrestore(&vector_lock, flags);
break;
}
- atomic_spin_unlock_irqrestore(&vector_lock, flags);
if (irq > 0) {
dynamic_irq_init(irq);
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index fa15da6d6884..f6755e50c05d 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -90,7 +90,9 @@ static inline unsigned int get_timer_irqs(int cpu)
*/
static __init void nmi_cpu_busy(void *data)
{
+#ifndef CONFIG_PREEMPT_RT
local_irq_enable_in_hardirq();
+#endif
/*
* Intentionally don't use cpu_relax here. This is
* to make sure that the performance counter really ticks,
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5ce60a88027b..6acfc1102e8a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1004,7 +1004,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) = -1;
*/
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+#if DEBUG_STACK > 0
[DEBUG_STACK - 1] = DEBUG_STKSZ
+#endif
};
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
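Because PREEMPT_RT defines DEBUG_STACK as 0 (no dedicated IST stack), the designated initializer [DEBUG_STACK - 1] would index entry -1, hence the preprocessor guard above. The same guarded-initializer shape reduced to a standalone program (sizes and names are illustrative):

#include <stdio.h>

#define NMI_STACK	2
#define DEBUG_STACK	0	/* 0 means "no dedicated stack", as on PREEMPT_RT */
#define N_STACKS	3

static const unsigned int stack_size[N_STACKS] = {
	[0 ... N_STACKS - 1] = 4096,	/* GNU range initializer, as in the kernel */
#if DEBUG_STACK > 0
	[DEBUG_STACK - 1] = 8192,	/* only legal when the index actually exists */
#endif
};

int main(void)
{
	printf("NMI stack size: %u\n", stack_size[NMI_STACK - 1]);
	return 0;
}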
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bca5fba91c9e..ffb9886d3a37 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -99,6 +99,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
}
+#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_EVENT_TRACE)
+extern unsigned long worst_stack_left;
+#else
+# define worst_stack_left -1L
+#endif
+
void show_registers(struct pt_regs *regs)
{
int i;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 54b0a3276766..5a11d2a30633 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -21,10 +21,14 @@
static char x86_stack_ids[][8] = {
+#if DEBUG_STACK > 0
[DEBUG_STACK - 1] = "#DB",
+#endif
[NMI_STACK - 1] = "NMI",
[DOUBLEFAULT_STACK - 1] = "#DF",
+#if STACKFAULT_STACK > 0
[STACKFAULT_STACK - 1] = "#SS",
+#endif
[MCE_STACK - 1] = "#MC",
#if DEBUG_STKSZ > EXCEPTION_STKSZ
[N_EXCEPTION_STACKS ...
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 335f049d110f..8b600808c79a 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -59,7 +59,7 @@ static void early_vga_write(struct console *con, const char *str, unsigned n)
static struct console early_vga_console = {
.name = "earlyvga",
.write = early_vga_write,
- .flags = CON_PRINTBUFFER,
+ .flags = CON_PRINTBUFFER | CON_ATOMIC,
.index = -1,
};
@@ -156,7 +156,7 @@ static __init void early_serial_init(char *s)
static struct console early_serial_console = {
.name = "earlyser",
.write = early_serial_write,
- .flags = CON_PRINTBUFFER,
+ .flags = CON_PRINTBUFFER | CON_ATOMIC,
.index = -1,
};
@@ -881,7 +881,7 @@ static int __initdata early_console_initialized;
asmlinkage void early_printk(const char *fmt, ...)
{
- char buf[512];
+ static char buf[512];
int n;
va_list ap;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..f86fc3b56bfc 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -371,13 +371,13 @@ END(ret_from_exception)
ENTRY(resume_kernel)
DISABLE_INTERRUPTS(CLBR_ANY)
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
- jnz restore_all
+ jnz restore_nocheck
need_resched:
movl TI_flags(%ebp), %ecx # need_resched set ?
testb $_TIF_NEED_RESCHED, %cl
- jz restore_all
+ jz restore_nocheck
testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz restore_all
+ jz restore_nocheck
call preempt_schedule_irq
jmp need_resched
END(resume_kernel)
@@ -627,12 +627,9 @@ work_pending:
testb $_TIF_NEED_RESCHED, %cl
jz work_notifysig
work_resched:
- call schedule
+ call __schedule
LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
+
movl TI_flags(%ebp), %ecx
andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
# than syscall tracing?
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 70eaa852c732..4d7255c6b1d7 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -30,7 +30,11 @@ static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
pgd_clear(pgd);
- __flush_tlb_all();
+ /*
+ * preempt_disable/enable does not work this early in the
+ * bootup yet:
+ */
+ write_cr3(read_cr3());
}
/* Don't add a printk in there. printk relies on the PDA which is not initialized
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 0d98a01cbdb2..8df48369090c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -594,6 +594,7 @@ ignore_int:
call dump_stack
addl $(5*4),%esp
+ call dump_stack
popl %ds
popl %es
popl %edx
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index eadb89137d26..6fbe6698818d 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -168,6 +168,8 @@ static void mask_and_ack_8259A(unsigned int irq)
*/
if (cached_irq_mask & irqmask)
goto spurious_8259A_irq;
+ if (irq & 8)
+ outb(0x60+(irq&7), PIC_SLAVE_CMD); /* 'Specific EOI' to slave */
cached_irq_mask |= irqmask;
handle_real_irq:
@@ -328,10 +330,10 @@ void init_8259A(int auto_eoi)
/* 8259A-1 (the master) has a slave on IR2 */
outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
- if (auto_eoi) /* master does Auto EOI */
- outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
- else /* master expects normal EOI */
- outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
+ if (!auto_eoi) /* master expects normal EOI */
+ outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
+ else /* master does Auto EOI */
+ outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 70ec9b951d76..5611ed65690c 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -422,6 +422,20 @@ struct pv_apic_ops pv_apic_ops = {
#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
#endif
+#ifdef CONFIG_HIGHPTE
+/*
+ * kmap_atomic() might be an inline or a macro:
+ */
+static void *kmap_atomic_func(struct page *page, enum km_type idx)
+{
+ return kmap_atomic(page, idx);
+}
+static void *kmap_atomic_direct_func(struct page *page, enum km_type idx)
+{
+ return kmap_atomic_direct(page, idx);
+}
+#endif
+
struct pv_mmu_ops pv_mmu_ops = {
#ifndef CONFIG_X86_64
.pagetable_setup_start = native_pagetable_setup_start,
@@ -462,7 +476,8 @@ struct pv_mmu_ops pv_mmu_ops = {
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
#ifdef CONFIG_HIGHPTE
- .kmap_atomic_pte = kmap_atomic,
+ .kmap_atomic_pte = kmap_atomic_func,
+ .kmap_atomic_pte_direct = kmap_atomic_direct_func,
#endif
#if PAGETABLE_LEVELS >= 3
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 59f4524984af..b9e7a3f40dc1 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -105,7 +105,6 @@ void cpu_idle(void)
tick_nohz_stop_sched_tick(1);
while (!need_resched()) {
- check_pgt_cache();
rmb();
if (cpu_is_offline(cpu))
@@ -117,10 +116,12 @@ void cpu_idle(void)
pm_idle();
start_critical_timings();
}
+ local_irq_disable();
tick_nohz_restart_sched_tick();
- preempt_enable_no_resched();
- schedule();
+ __preempt_enable_no_resched();
+ __schedule();
preempt_disable();
+ local_irq_enable();
}
}
@@ -162,8 +163,10 @@ void __show_regs(struct pt_regs *regs, int all)
regs->ax, regs->bx, regs->cx, regs->dx);
printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
regs->si, regs->di, regs->bp, sp);
- printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
- (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
+ printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x"
+ " preempt:%08x\n",
+ (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss,
+ preempt_count());
if (!all)
return;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ebefb5407b9d..c8d0ece8eb6c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -152,9 +152,11 @@ void cpu_idle(void)
}
tick_nohz_restart_sched_tick();
- preempt_enable_no_resched();
- schedule();
+ local_irq_disable();
+ __preempt_enable_no_resched();
+ __schedule();
preempt_disable();
+ local_irq_enable();
}
}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4c578751e94e..5777895b8bde 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -782,6 +782,13 @@ static void do_signal(struct pt_regs *regs)
int signr;
sigset_t *oldset;
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * Fully-preemptible kernel does not need interrupts disabled:
+ */
+ local_irq_enable();
+ preempt_check_resched();
+#endif
/*
* We want the common case to go fast, which is why we may in certain
* cases get here from kernel mode. Just return without doing anything
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index ec1de97600e7..a83e38d464f2 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -120,6 +120,16 @@ static void native_smp_send_reschedule(int cpu)
apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
}
+/*
+ * this function sends a 'reschedule' IPI to all other CPUs.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them:
+ */
+void smp_send_reschedule_allbutself(void)
+{
+ apic->send_IPI_allbutself(RESCHEDULE_VECTOR);
+}
+
void native_send_call_func_single_ipi(int cpu)
{
apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5204332f475d..20ee0b1d7b56 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -91,9 +91,10 @@ static inline void conditional_sti(struct pt_regs *regs)
local_irq_enable();
}
-static inline void preempt_conditional_sti(struct pt_regs *regs)
+static inline void preempt_conditional_sti(struct pt_regs *regs, int stack)
{
- inc_preempt_count();
+ if (stack)
+ inc_preempt_count();
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
@@ -104,11 +105,12 @@ static inline void conditional_cli(struct pt_regs *regs)
local_irq_disable();
}
-static inline void preempt_conditional_cli(struct pt_regs *regs)
+static inline void preempt_conditional_cli(struct pt_regs *regs, int stack)
{
if (regs->flags & X86_EFLAGS_IF)
local_irq_disable();
- dec_preempt_count();
+ if (stack)
+ dec_preempt_count();
}
#ifdef CONFIG_X86_32
@@ -235,9 +237,9 @@ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
12, SIGBUS) == NOTIFY_STOP)
return;
- preempt_conditional_sti(regs);
+ preempt_conditional_sti(regs, STACKFAULT_STACK);
do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
- preempt_conditional_cli(regs);
+ preempt_conditional_cli(regs, STACKFAULT_STACK);
}
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
@@ -473,9 +475,9 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
return;
#endif
- preempt_conditional_sti(regs);
+ preempt_conditional_sti(regs, DEBUG_STACK);
do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
- preempt_conditional_cli(regs);
+ preempt_conditional_cli(regs, DEBUG_STACK);
}
#ifdef CONFIG_X86_64
@@ -552,7 +554,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
return;
/* It's safe to allow irq's after DR6 has been saved */
- preempt_conditional_sti(regs);
+ preempt_conditional_sti(regs, DEBUG_STACK);
/* Mask out spurious debug traps due to lazy DR7 setting */
if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
@@ -587,7 +589,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
*/
clear_dr7:
set_debugreg(0, 7);
- preempt_conditional_cli(regs);
+ preempt_conditional_cli(regs, DEBUG_STACK);
return;
#ifdef CONFIG_X86_32
@@ -602,7 +604,7 @@ debug_vm86:
clear_TF_reenable:
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
- preempt_conditional_cli(regs);
+ preempt_conditional_cli(regs, DEBUG_STACK);
return;
}
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9c4e62539058..cdaeef2de873 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -137,6 +137,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
local_irq_enable();
if (!current->thread.vm86_info) {
+ local_irq_disable();
printk("no vm86_info: BAD\n");
do_exit(SIGSEGV);
}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 18d0d5526f7c..a6c55256404d 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -78,14 +78,41 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
unsigned long flags;
write_atomic_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+
+ if (likely(vsyscall_gtod_data.sysctl_enabled == 2)) {
+ struct timespec tmp = *(wall_time);
+ cycle_t (*vread)(void);
+ cycle_t now;
+
+ vread = vsyscall_gtod_data.clock.vread;
+ if (likely(vread))
+ now = vread();
+ else
+ now = clock->read(clock);
+
+ /* calculate interval: */
+ now = (now - clock->cycle_last) & clock->mask;
+ /* convert to nsecs: */
+ tmp.tv_nsec += ( now * clock->mult) >> clock->shift;
+
+ while (tmp.tv_nsec >= NSEC_PER_SEC) {
+ tmp.tv_sec += 1;
+ tmp.tv_nsec -= NSEC_PER_SEC;
+ }
+
+ vsyscall_gtod_data.wall_time_sec = tmp.tv_sec;
+ vsyscall_gtod_data.wall_time_nsec = tmp.tv_nsec;
+ } else {
+ vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
+ vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
+ }
+
/* copy vsyscall data */
vsyscall_gtod_data.clock.vread = clock->vread;
vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
vsyscall_gtod_data.clock.mask = clock->mask;
vsyscall_gtod_data.clock.mult = clock->mult;
vsyscall_gtod_data.clock.shift = clock->shift;
- vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
- vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
write_atomic_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
@@ -123,6 +150,26 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
unsigned seq;
unsigned long mult, shift, nsec;
cycle_t (*vread)(void);
+
+ if (likely(__vsyscall_gtod_data.sysctl_enabled == 2)) {
+ struct timeval tmp;
+
+ do {
+ barrier();
+ tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
+ tv->tv_usec = __vsyscall_gtod_data.wall_time_nsec;
+ barrier();
+ tmp.tv_sec = __vsyscall_gtod_data.wall_time_sec;
+ tmp.tv_usec = __vsyscall_gtod_data.wall_time_nsec;
+
+ } while (tmp.tv_usec != tv->tv_usec ||
+ tmp.tv_sec != tv->tv_sec);
+
+ tv->tv_usec /= NSEC_PER_MSEC;
+ tv->tv_usec *= USEC_PER_MSEC;
+ return;
+ }
+
do {
seq = read_atomic_seqbegin(&__vsyscall_gtod_data.lock);
@@ -133,6 +180,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
}
now = vread();
+
base = __vsyscall_gtod_data.clock.cycle_last;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;
@@ -142,6 +190,8 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
nsec = __vsyscall_gtod_data.wall_time_nsec;
} while (read_atomic_seqretry(&__vsyscall_gtod_data.lock, seq));
+ now = vread();
+
/* calculate interval: */
cycle_delta = (now - base) & mask;
/* convert to nsecs: */
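The sysctl_enabled == 2 fast path added above publishes wall_time_sec/nsec directly and lets the reader retry until two back-to-back reads agree, instead of taking the seqlock. A user-space sketch that mirrors the shape of that retry loop (like the original it is only a heuristic consistency check, not a full seqlock; all names are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* published time, updated by one writer, read lock-free by readers */
static _Atomic long wall_sec;
static _Atomic long wall_nsec;

static void *writer(void *arg)
{
	for (long t = 1; t <= 100000; t++) {
		atomic_store(&wall_sec, t);
		atomic_store(&wall_nsec, t * 1000);
	}
	return arg;
}

/* read both fields, read them again, retry until the two snapshots agree */
static void read_time(long *sec, long *nsec)
{
	long s2, n2;

	do {
		*sec  = atomic_load(&wall_sec);
		*nsec = atomic_load(&wall_nsec);
		s2    = atomic_load(&wall_sec);
		n2    = atomic_load(&wall_nsec);
	} while (s2 != *sec || n2 != *nsec);
}

int main(void)
{
	pthread_t tid;
	long sec, nsec;

	pthread_create(&tid, NULL, writer, NULL);
	read_time(&sec, &nsec);
	pthread_join(tid, NULL);

	printf("snapshot: %ld.%09ld\n", sec, nsec);
	return 0;
}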
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index cf326e65a945..07df26454924 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -561,6 +561,7 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
nr = (address - idt_descr.address) >> 3;
if (nr == 6) {
+ zap_rt_locks();
do_invalid_op(regs, 0);
return 1;
}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 71da1bca13cb..71871c8d5c19 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -77,13 +77,13 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
if (write)
mask |= _PAGE_RW;
- ptep = pte_offset_map(&pmd, addr);
+ ptep = pte_offset_map_direct(&pmd, addr);
do {
pte_t pte = gup_get_pte(ptep);
struct page *page;
if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
- pte_unmap(ptep);
+ pte_unmap_direct(ptep);
return 0;
}
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -93,7 +93,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
(*nr)++;
} while (ptep++, addr += PAGE_SIZE, addr != end);
- pte_unmap(ptep - 1);
+ pte_unmap_direct(ptep - 1);
return 1;
}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 8d001db5eb2d..e33ec9fd20b5 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -4,9 +4,9 @@
void *kmap(struct page *page)
{
- might_sleep();
if (!PageHighMem(page))
return page_address(page);
+ might_sleep();
return kmap_high(page);
}
@@ -19,6 +19,27 @@ void kunmap(struct page *page)
kunmap_high(page);
}
+void kunmap_virt(void *ptr)
+{
+ struct page *page;
+
+ if ((unsigned long)ptr < PKMAP_ADDR(0))
+ return;
+ page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]);
+ kunmap(page);
+}
+
+struct page *kmap_to_page(void *ptr)
+{
+ struct page *page;
+
+ if ((unsigned long)ptr < PKMAP_ADDR(0))
+ return virt_to_page(ptr);
+ page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]);
+ return page;
+}
+EXPORT_SYMBOL_GPL(kmap_to_page); /* PREEMPT_RT converts some modules to use this */
+
/*
* kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
* no global lock is needed and because the kmap code must perform a global TLB
@@ -27,7 +48,7 @@ void kunmap(struct page *page)
* However when holding an atomic kmap it is not legal to sleep, so atomic
* kmaps are appropriate for short, tight code paths only.
*/
-void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
+void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
@@ -49,12 +70,17 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
return (void *)vaddr;
}
-void *kmap_atomic(struct page *page, enum km_type type)
+void *__kmap_atomic_direct(struct page *page, enum km_type type)
+{
+ return __kmap_atomic_prot(page, type, kmap_prot);
+}
+
+void *__kmap_atomic(struct page *page, enum km_type type)
{
return kmap_atomic_prot(page, type, kmap_prot);
}
-void kunmap_atomic(void *kvaddr, enum km_type type)
+void __kunmap_atomic(void *kvaddr, enum km_type type)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
@@ -82,13 +108,14 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
* This is the same as kmap_atomic() but can map memory that doesn't
* have a struct page associated with it.
*/
-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
+void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type)
{
+ preempt_disable();
return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
}
-EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
+EXPORT_SYMBOL_GPL(__kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
-struct page *kmap_atomic_to_page(void *ptr)
+struct page *__kmap_atomic_to_page(void *ptr)
{
unsigned long idx, vaddr = (unsigned long)ptr;
pte_t *pte;
@@ -103,9 +130,10 @@ struct page *kmap_atomic_to_page(void *ptr)
EXPORT_SYMBOL(kmap);
EXPORT_SYMBOL(kunmap);
-EXPORT_SYMBOL(kmap_atomic);
-EXPORT_SYMBOL(kunmap_atomic);
-EXPORT_SYMBOL(kmap_atomic_prot);
+EXPORT_SYMBOL(kunmap_virt);
+EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(__kmap_atomic_prot);
void __init set_highmem_pages_init(void)
{
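
The kunmap_virt() and kmap_to_page() helpers added above let code that only kept the kmap()'ed pointer recover the backing struct page, or drop a highmem mapping by address; both fall back gracefully for lowmem pointers. A hypothetical caller (not part of the patch) might look like:

/* Hypothetical user of the new helpers: copy out of a previously
 * kmap()'ed page when only the virtual address was passed around. */
static struct page *copy_and_unmap(void *kvaddr, void *dst, size_t len)
{
	struct page *page = kmap_to_page(kvaddr); /* lowmem: virt_to_page() */

	memcpy(dst, kvaddr, len);
	kunmap_virt(kvaddr);	/* no-op for addresses below PKMAP_ADDR(0) */
	return page;		/* caller may still need the struct page */
}
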
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 0607119cef94..eab22a708794 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -14,8 +14,6 @@
#include <asm/tlb.h>
#include <asm/proto.h>
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
unsigned long __initdata e820_table_start;
unsigned long __meminitdata e820_table_end;
unsigned long __meminitdata e820_table_top;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7e600c1962db..704cbd29be9f 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -855,8 +855,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
}
}
+#if 0
/* Must avoid aliasing mappings in the highmem code */
kmap_flush_unused();
+#endif
vm_unmap_aliases();
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ed34f5e35999..c2ea747bfa3c 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -132,6 +132,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
reserved at the pmd (PDPT) level. */
set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+ preempt_disable();
/*
* According to Intel App note "TLBs, Paging-Structure Caches,
* and Their Invalidation", April 2007, document 317080-001,
@@ -140,6 +141,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
*/
if (mm == current->active_mm)
write_cr3(read_cr3());
+ preempt_enable();
}
#else /* !CONFIG_X86_PAE */
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 286ca0121e0b..e76cff3698eb 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -223,16 +223,23 @@ static int __init pci_check_type1(void)
unsigned int tmp;
int works = 0;
- local_irq_save(flags);
+ atomic_spin_lock_irqsave(&pci_config_lock, flags);
outb(0x01, 0xCFB);
tmp = inl(0xCF8);
outl(0x80000000, 0xCF8);
- if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) {
- works = 1;
+
+ if (inl(0xCF8) == 0x80000000) {
+ atomic_spin_unlock_irqrestore(&pci_config_lock, flags);
+
+ if (pci_sanity_check(&pci_direct_conf1))
+ works = 1;
+
+ atomic_spin_lock_irqsave(&pci_config_lock, flags);
}
outl(tmp, 0xCF8);
- local_irq_restore(flags);
+
+ atomic_spin_unlock_irqrestore(&pci_config_lock, flags);
return works;
}
@@ -242,17 +249,19 @@ static int __init pci_check_type2(void)
unsigned long flags;
int works = 0;
- local_irq_save(flags);
+ atomic_spin_lock_irqsave(&pci_config_lock, flags);
outb(0x00, 0xCFB);
outb(0x00, 0xCF8);
outb(0x00, 0xCFA);
- if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 &&
- pci_sanity_check(&pci_direct_conf2)) {
- works = 1;
- }
- local_irq_restore(flags);
+ if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) {
+ atomic_spin_unlock_irqrestore(&pci_config_lock, flags);
+
+ if (pci_sanity_check(&pci_direct_conf2))
+ works = 1;
+ } else
+ atomic_spin_unlock_irqrestore(&pci_config_lock, flags);
return works;
}
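
Both probe functions now follow the same shape: the raw port I/O stays under pci_config_lock, while pci_sanity_check(), which can reach code that sleeps on PREEMPT_RT, runs with the lock dropped. The type1 variant retakes the lock to restore the probe registers; type2 simply unlocks on both paths. A sketch of the type1 shape, with the helper names invented for illustration:

/* Illustrative shape of the pci_check_type1() change. */
static int __init probe_with_unlocked_check(void)
{
	unsigned long flags;
	int works = 0;

	atomic_spin_lock_irqsave(&pci_config_lock, flags);
	program_probe_registers();		/* outb()/outl() sequence */
	if (probe_reads_back_ok()) {		/* inl()/inb() check */
		atomic_spin_unlock_irqrestore(&pci_config_lock, flags);
		works = run_sanity_check();	/* may sleep on RT */
		atomic_spin_lock_irqsave(&pci_config_lock, flags);
	}
	restore_probe_registers();
	atomic_spin_unlock_irqrestore(&pci_config_lock, flags);
	return works;
}
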
diff --git a/block/blk-core.c b/block/blk-core.c
index e3299a77a0d8..620579e0ff7b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -201,7 +201,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags);
*/
void blk_plug_device(struct request_queue *q)
{
- WARN_ON(!irqs_disabled());
+ WARN_ON_NONRT(!irqs_disabled());
/*
* don't plug a stopped queue, it must be paired with blk_start_queue()
@@ -241,7 +241,7 @@ EXPORT_SYMBOL(blk_plug_device_unlocked);
*/
int blk_remove_plug(struct request_queue *q)
{
- WARN_ON(!irqs_disabled());
+ WARN_ON_NONRT(!irqs_disabled());
if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q))
return 0;
@@ -333,7 +333,7 @@ EXPORT_SYMBOL(blk_unplug);
**/
void blk_start_queue(struct request_queue *q)
{
- WARN_ON(!irqs_disabled());
+ WARN_ON_NONRT(!irqs_disabled());
queue_flag_clear(QUEUE_FLAG_STOPPED, q);
__blk_run_queue(q);
diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
index 3d87362d17ed..feee0178b1cb 100644
--- a/drivers/acpi/acpica/acglobal.h
+++ b/drivers/acpi/acpica/acglobal.h
@@ -197,7 +197,12 @@ ACPI_EXTERN u8 acpi_gbl_global_lock_present;
* interrupt level
*/
ACPI_EXTERN spinlock_t _acpi_gbl_gpe_lock; /* For GPE data structs and registers */
-ACPI_EXTERN spinlock_t _acpi_gbl_hardware_lock; /* For ACPI H/W except GPE registers */
+
+/*
+ * Needs to be raw because it might be used in acpi_processor_idle():
+ */
+ACPI_EXTERN atomic_spinlock_t _acpi_gbl_hardware_lock; /* For ACPI H/W except GPE registers */
+
#define acpi_gbl_gpe_lock &_acpi_gbl_gpe_lock
#define acpi_gbl_hardware_lock &_acpi_gbl_hardware_lock
diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
index 23d5505cb1f7..756fe9e18760 100644
--- a/drivers/acpi/acpica/hwregs.c
+++ b/drivers/acpi/acpica/hwregs.c
@@ -85,7 +85,7 @@ acpi_status acpi_hw_clear_acpi_status(void)
ACPI_BITMASK_ALL_FIXED_STATUS,
ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
- lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
+ atomic_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
/* Clear the fixed events in PM1 A/B */
@@ -100,7 +100,7 @@ acpi_status acpi_hw_clear_acpi_status(void)
status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block, NULL);
unlock_and_exit:
- acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
+ atomic_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
return_ACPI_STATUS(status);
}
diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
index 9829979f2bdd..3898b185c89f 100644
--- a/drivers/acpi/acpica/hwxface.c
+++ b/drivers/acpi/acpica/hwxface.c
@@ -341,7 +341,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
return_ACPI_STATUS(AE_BAD_PARAMETER);
}
- lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
+ atomic_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
/*
* At this point, we know that the parent register is one of the
@@ -402,7 +402,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
unlock_and_exit:
- acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
+ atomic_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
return_ACPI_STATUS(status);
}
diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
index 80bb65154117..477de019b7e9 100644
--- a/drivers/acpi/acpica/utmutex.c
+++ b/drivers/acpi/acpica/utmutex.c
@@ -84,7 +84,7 @@ acpi_status acpi_ut_mutex_initialize(void)
/* Create the spinlocks for use at interrupt level */
spin_lock_init(acpi_gbl_gpe_lock);
- spin_lock_init(acpi_gbl_hardware_lock);
+ atomic_spin_lock_init(acpi_gbl_hardware_lock);
/* Create the reader/writer lock for namespace access */
@@ -117,11 +117,6 @@ void acpi_ut_mutex_terminate(void)
(void)acpi_ut_delete_mutex(i);
}
- /* Delete the spinlocks */
-
- acpi_os_delete_lock(acpi_gbl_gpe_lock);
- acpi_os_delete_lock(acpi_gbl_hardware_lock);
-
/* Delete the reader/writer lock */
acpi_ut_delete_rw_lock(&acpi_gbl_namespace_rw_lock);
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 391f331674c7..fe086ca0a183 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -573,8 +573,22 @@ static u32 acpi_ec_gpe_handler(void *data)
if (test_bit(EC_FLAGS_GPE_MODE, &ec->flags)) {
gpe_transaction(ec, status);
if (ec_transaction_done(ec) &&
- (status & ACPI_EC_FLAG_IBF) == 0)
+ (status & ACPI_EC_FLAG_IBF) == 0) {
+
+#ifndef CONFIG_PREEMPT_RT
wake_up(&ec->wait);
+#else
+ // hack ...
+ if (waitqueue_active(&ec->wait)) {
+ struct task_struct *task;
+
+ task = list_entry(ec->wait.task_list.next,
+ wait_queue_t, task_list)->private;
+ if (task)
+ wake_up_process(task);
+ }
+#endif
+ }
}
ec_check_sci(ec, status);
diff --git a/drivers/block/paride/pseudo.h b/drivers/block/paride/pseudo.h
index bc3703294143..0fbc78c7bfec 100644
--- a/drivers/block/paride/pseudo.h
+++ b/drivers/block/paride/pseudo.h
@@ -43,7 +43,7 @@ static unsigned long ps_timeout;
static int ps_tq_active = 0;
static int ps_nice = 0;
-static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused)));
+static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock);
static DECLARE_DELAYED_WORK(ps_tq, ps_tq_int);
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index e0d0f8b2696b..d809c4d8ea96 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -1197,10 +1197,12 @@ static void rtc_dropped_irq(unsigned long data)
spin_unlock_irq(&rtc_lock);
+#ifndef CONFIG_PREEMPT_RT
if (printk_ratelimit()) {
printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
freq);
}
+#endif
/* Now we have new data */
wake_up_interruptible(&rtc_wait);
diff --git a/drivers/char/tty_buffer.c b/drivers/char/tty_buffer.c
index 3108991c5c8b..71002066576d 100644
--- a/drivers/char/tty_buffer.c
+++ b/drivers/char/tty_buffer.c
@@ -495,10 +495,14 @@ void tty_flip_buffer_push(struct tty_struct *tty)
tty->buf.tail->commit = tty->buf.tail->used;
spin_unlock_irqrestore(&tty->buf.lock, flags);
+#ifndef CONFIG_PREEMPT_RT
if (tty->low_latency)
flush_to_ldisc(&tty->buf.work.work);
else
schedule_delayed_work(&tty->buf.work, 1);
+#else
+ flush_to_ldisc(&tty->buf.work.work);
+#endif
}
EXPORT_SYMBOL(tty_flip_buffer_push);
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 404f4c1ee431..dee3f64fdc33 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -2537,7 +2537,7 @@ static struct console vt_console_driver = {
.write = vt_console_print,
.device = vt_console_device,
.unblank = unblank_screen,
- .flags = CON_PRINTBUFFER,
+ .flags = CON_PRINTBUFFER | CON_ATOMIC,
.index = -1,
};
#endif
diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c
index c20416850948..a58ada82a54d 100644
--- a/drivers/net/3c59x.c
+++ b/drivers/net/3c59x.c
@@ -791,9 +791,9 @@ static void poll_vortex(struct net_device *dev)
{
struct vortex_private *vp = netdev_priv(dev);
unsigned long flags;
- local_irq_save(flags);
+ local_irq_save_nort(flags);
(vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
- local_irq_restore(flags);
+ local_irq_restore_nort(flags);
}
#endif
@@ -1762,6 +1762,7 @@ vortex_timer(unsigned long data)
int next_tick = 60*HZ;
int ok = 0;
int media_status, old_window;
+ unsigned long flags;
if (vortex_debug > 2) {
pr_debug("%s: Media selection timer tick happened, %s.\n",
@@ -1769,7 +1770,7 @@ vortex_timer(unsigned long data)
pr_debug("dev->watchdog_timeo=%d\n", dev->watchdog_timeo);
}
- disable_irq_lockdep(dev->irq);
+ spin_lock_irqsave(&vp->lock, flags);
old_window = ioread16(ioaddr + EL3_CMD) >> 13;
EL3WINDOW(4);
media_status = ioread16(ioaddr + Wn4_Media);
@@ -1792,10 +1793,7 @@ vortex_timer(unsigned long data)
case XCVR_MII: case XCVR_NWAY:
{
ok = 1;
- /* Interrupts are already disabled */
- spin_lock(&vp->lock);
vortex_check_media(dev, 0);
- spin_unlock(&vp->lock);
}
break;
default: /* Other media types handled by Tx timeouts. */
@@ -1849,7 +1847,7 @@ leave_media_alone:
dev->name, media_tbl[dev->if_port].name);
EL3WINDOW(old_window);
- enable_irq_lockdep(dev->irq);
+ spin_unlock_irqrestore(&vp->lock, flags);
mod_timer(&vp->timer, RUN_AT(next_tick));
if (vp->deferred)
iowrite16(FakeIntr, ioaddr + EL3_CMD);
@@ -1883,12 +1881,12 @@ static void vortex_tx_timeout(struct net_device *dev)
* Block interrupts because vortex_interrupt does a bare spin_lock()
*/
unsigned long flags;
- local_irq_save(flags);
+ local_irq_save_nort(flags);
if (vp->full_bus_master_tx)
boomerang_interrupt(dev->irq, dev);
else
vortex_interrupt(dev->irq, dev);
- local_irq_restore(flags);
+ local_irq_restore_nort(flags);
}
}
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index b70cc99962fc..e7979a630b4c 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -2836,7 +2836,7 @@ bnx2_tx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget)
if (unlikely(netif_tx_queue_stopped(txq)) &&
(bnx2_tx_avail(bp, txr) > bp->tx_wake_thresh)) {
- __netif_tx_lock(txq, smp_processor_id());
+ __netif_tx_lock(txq);
if ((netif_tx_queue_stopped(txq)) &&
(bnx2_tx_avail(bp, txr) > bp->tx_wake_thresh))
netif_tx_wake_queue(txq);
diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index c36a5f33739f..2922e002e7f0 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -926,7 +926,7 @@ static void bnx2x_tx_int(struct bnx2x_fastpath *fp)
/* TBD need a thresh? */
if (unlikely(netif_tx_queue_stopped(txq))) {
- __netif_tx_lock(txq, smp_processor_id());
+ __netif_tx_lock(txq);
/* Need to make the tx_bd_cons update visible to start_xmit()
* before checking for netif_tx_queue_stopped(). Without the
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 9415a97ad5b2..15c6599e9775 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -86,7 +86,7 @@ static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
skb->protocol = eth_type_trans(skb, dev);
len = skb->len;
- res = netif_rx(skb);
+ res = netif_rx_ni(skb);
pcpu_lstats = dev->ml_priv;
lb_stats = per_cpu_ptr(pcpu_lstats, xmit_get_cpu());
diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index 0f32db3e92ad..ee2c56d99636 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -508,7 +508,7 @@ static void txq_maybe_wake(struct tx_queue *txq)
struct netdev_queue *nq = netdev_get_tx_queue(mp->dev, txq->index);
if (netif_tx_queue_stopped(nq)) {
- __netif_tx_lock(nq, smp_processor_id());
+ __netif_tx_lock(nq);
if (txq->tx_ring_size - txq->tx_desc_count >= MAX_SKB_FRAGS + 1)
netif_tx_wake_queue(nq);
__netif_tx_unlock(nq);
@@ -899,7 +899,7 @@ static void txq_kick(struct tx_queue *txq)
u32 hw_desc_ptr;
u32 expected_ptr;
- __netif_tx_lock(nq, smp_processor_id());
+ __netif_tx_lock(nq);
if (rdlp(mp, TXQ_COMMAND) & (1 << txq->index))
goto out;
@@ -923,7 +923,7 @@ static int txq_reclaim(struct tx_queue *txq, int budget, int force)
struct netdev_queue *nq = netdev_get_tx_queue(mp->dev, txq->index);
int reclaimed;
- __netif_tx_lock(nq, smp_processor_id());
+ __netif_tx_lock(nq);
reclaimed = 0;
while (reclaimed < budget && txq->tx_desc_count > 0) {
diff --git a/drivers/net/netxen/netxen_nic_init.c b/drivers/net/netxen/netxen_nic_init.c
index 7acf204e38c9..f12abcceae5b 100644
--- a/drivers/net/netxen/netxen_nic_init.c
+++ b/drivers/net/netxen/netxen_nic_init.c
@@ -1408,7 +1408,7 @@ int netxen_process_cmd_ring(struct netxen_adapter *adapter)
smp_mb();
if (netif_queue_stopped(netdev) && netif_carrier_ok(netdev)) {
- __netif_tx_lock(tx_ring->txq, smp_processor_id());
+ __netif_tx_lock(tx_ring->txq);
if (netxen_tx_avail(tx_ring) > TX_STOP_THRESH)
netif_wake_queue(netdev);
__netif_tx_unlock(tx_ring->txq);
diff --git a/drivers/net/niu.c b/drivers/net/niu.c
index d2146d4a10f3..4b6d8cec986c 100644
--- a/drivers/net/niu.c
+++ b/drivers/net/niu.c
@@ -3681,7 +3681,7 @@ static void niu_tx_work(struct niu *np, struct tx_ring_info *rp)
out:
if (unlikely(netif_tx_queue_stopped(txq) &&
(niu_tx_avail(rp) > NIU_TX_WAKEUP_THRESH(rp)))) {
- __netif_tx_lock(txq, smp_processor_id());
+ __netif_tx_lock(txq);
if (netif_tx_queue_stopped(txq) &&
(niu_tx_avail(rp) > NIU_TX_WAKEUP_THRESH(rp)))
netif_tx_wake_queue(txq);
diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index fb867a9f55e9..d40633379b61 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -1595,7 +1595,12 @@ static irqreturn_t serial8250_interrupt(int irq, void *dev_id)
l = l->next;
- if (l == i->head && pass_counter++ > PASS_LIMIT) {
+ /*
+ * On preempt-rt we can be preempted and run in our
+ * own thread.
+ */
+ if (!preempt_rt() && l == i->head &&
+ pass_counter++ > PASS_LIMIT) {
/* If we hit this, we're dead. */
printk(KERN_ERR "serial8250: too much work for "
"irq%d\n", irq);
@@ -2729,14 +2734,10 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count)
touch_nmi_watchdog();
- local_irq_save(flags);
- if (up->port.sysrq) {
- /* serial8250_handle_port() already took the lock */
- locked = 0;
- } else if (oops_in_progress) {
- locked = spin_trylock(&up->port.lock);
- } else
- spin_lock(&up->port.lock);
+ if (up->port.sysrq || oops_in_progress || preempt_rt())
+ locked = spin_trylock_irqsave(&up->port.lock, flags);
+ else
+ spin_lock_irqsave(&up->port.lock, flags);
/*
* First save the IER then disable the interrupts
@@ -2768,8 +2769,7 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count)
check_modem_status(up);
if (locked)
- spin_unlock(&up->port.lock);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&up->port.lock, flags);
}
static int __init serial8250_console_setup(struct console *co, char *options)
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 4247eccf858c..57ba3b1677f3 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -330,8 +330,9 @@ static void async_completed(struct urb *urb)
uid_t euid = 0;
u32 secid = 0;
int signr;
+ unsigned long flags;
- spin_lock(&ps->lock);
+ spin_lock_irqsave(&ps->lock, flags);
list_move_tail(&as->asynclist, &ps->async_completed);
as->status = urb->status;
signr = as->signr;
@@ -347,7 +348,7 @@ static void async_completed(struct urb *urb)
}
snoop(&urb->dev->dev, "urb complete\n");
snoop_urb(urb, as->userurb);
- spin_unlock(&ps->lock);
+ spin_unlock_irqrestore(&ps->lock, flags);
if (signr)
kill_pid_info_as_uid(sinfo.si_signo, &sinfo, pid, uid,
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 95ccfa0b9fc5..167548a0a270 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1880,7 +1880,7 @@ irqreturn_t usb_hcd_irq (int irq, void *__hcd)
* when the first handler doesn't use it. So let's just
* assume it's never used.
*/
- local_irq_save(flags);
+ local_irq_save_nort(flags);
if (unlikely(hcd->state == HC_STATE_HALT ||
!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))) {
@@ -1895,7 +1895,7 @@ irqreturn_t usb_hcd_irq (int irq, void *__hcd)
rc = IRQ_HANDLED;
}
- local_irq_restore(flags);
+ local_irq_restore_nort(flags);
return rc;
}
diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index 9720e699f472..b529a76ab6de 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -269,8 +269,9 @@ static void sg_complete(struct urb *urb)
{
struct usb_sg_request *io = urb->context;
int status = urb->status;
+ unsigned long flags;
- spin_lock(&io->lock);
+ spin_lock_irqsave(&io->lock, flags);
/* In 2.5 we require hcds' endpoint queues not to progress after fault
* reports, until the completion callback (this!) returns. That lets
@@ -304,7 +305,7 @@ static void sg_complete(struct urb *urb)
* unlink pending urbs so they won't rx/tx bad data.
* careful: unlink can sometimes be synchronous...
*/
- spin_unlock(&io->lock);
+ spin_unlock_irqrestore(&io->lock, flags);
for (i = 0, found = 0; i < io->entries; i++) {
if (!io->urbs [i] || !io->urbs [i]->dev)
continue;
@@ -319,7 +320,7 @@ static void sg_complete(struct urb *urb)
} else if (urb == io->urbs [i])
found = 1;
}
- spin_lock(&io->lock);
+ spin_lock_irqsave(&io->lock, flags);
}
urb->dev = NULL;
@@ -329,7 +330,7 @@ static void sg_complete(struct urb *urb)
if (!io->count)
complete(&io->complete);
- spin_unlock(&io->lock);
+ spin_unlock_irqrestore(&io->lock, flags);
}
@@ -643,7 +644,7 @@ void usb_sg_cancel(struct usb_sg_request *io)
int i;
io->status = -ECONNRESET;
- spin_unlock(&io->lock);
+ spin_unlock_irqrestore(&io->lock, flags);
for (i = 0; i < io->entries; i++) {
int retval;
@@ -654,7 +655,7 @@ void usb_sg_cancel(struct usb_sg_request *io)
dev_warn(&io->dev->dev, "%s, unlink --> %d\n",
__func__, retval);
}
- spin_lock(&io->lock);
+ spin_lock_irqsave(&io->lock, flags);
}
spin_unlock_irqrestore(&io->lock, flags);
}
diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index 3a44695b9c09..da8c9e315311 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -1203,7 +1203,6 @@ static void fbcon_clear(struct vc_data *vc, int sy, int sx, int height,
{
struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
struct fbcon_ops *ops = info->fbcon_par;
-
struct display *p = &fb_display[vc->vc_num];
u_int y_break;
@@ -1235,10 +1234,11 @@ static void fbcon_putcs(struct vc_data *vc, const unsigned short *s,
struct display *p = &fb_display[vc->vc_num];
struct fbcon_ops *ops = info->fbcon_par;
- if (!fbcon_is_inactive(vc, info))
+ if (!fbcon_is_inactive(vc, info)) {
ops->putcs(vc, info, s, count, real_y(p, ypos), xpos,
get_color(vc, info, scr_readw(s), 1),
get_color(vc, info, scr_readw(s), 0));
+ }
}
static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos)
@@ -3225,6 +3225,7 @@ static const struct consw fb_con = {
.con_screen_pos = fbcon_screen_pos,
.con_getxy = fbcon_getxy,
.con_resize = fbcon_resize,
+ .con_preemptible = 1,
};
static struct notifier_block fbcon_event_notifier = {
diff --git a/fs/aio.c b/fs/aio.c
index d065b2c3273e..05e61f79519b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -612,9 +612,11 @@ static void use_mm(struct mm_struct *mm)
task_lock(tsk);
active_mm = tsk->active_mm;
atomic_inc(&mm->mm_count);
+ local_irq_disable(); // FIXME
+ switch_mm(active_mm, mm, tsk);
tsk->mm = mm;
tsk->active_mm = mm;
- switch_mm(active_mm, mm, tsk);
+ local_irq_enable();
task_unlock(tsk);
mmdrop(active_mm);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..4b3d18a22e75 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -94,6 +94,7 @@ static int btrfs_spin_on_block(struct extent_buffer *eb)
*/
int btrfs_try_spin_lock(struct extent_buffer *eb)
{
+#ifndef CONFIG_PREEMPT_RT
int i;
if (btrfs_spin_on_block(eb)) {
@@ -113,6 +114,7 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
return 1;
spin_unlock(&eb->lock);
}
+#endif
return 0;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index a3ef091a45bd..02f758dce973 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -40,7 +40,6 @@
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
-#include <linux/bit_spinlock.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -324,8 +323,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
* decide that the page is now completely done.
*/
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
@@ -338,8 +336,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
}
tmp = tmp->b_this_page;
} while (tmp != bh);
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
/*
* If none of the buffers had errors and they are all
@@ -351,8 +348,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
@@ -387,8 +383,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
}
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_write(bh);
unlock_buffer(bh);
@@ -400,14 +395,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
}
tmp = tmp->b_this_page;
}
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
end_page_writeback(page);
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
@@ -3248,6 +3241,8 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
if (ret) {
INIT_LIST_HEAD(&ret->b_assoc_buffers);
+ spin_lock_init(&ret->b_uptodate_lock);
+ spin_lock_init(&ret->b_state_lock);
get_cpu_var(bh_accounting).nr++;
recalc_bh_state();
put_cpu_var(bh_accounting);
@@ -3259,6 +3254,8 @@ EXPORT_SYMBOL(alloc_buffer_head);
void free_buffer_head(struct buffer_head *bh)
{
BUG_ON(!list_empty(&bh->b_assoc_buffers));
+ BUG_ON(spin_is_locked(&bh->b_uptodate_lock));
+ BUG_ON(spin_is_locked(&bh->b_state_lock));
kmem_cache_free(bh_cachep, bh);
get_cpu_var(bh_accounting).nr--;
recalc_bh_state();
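
The buffer-head changes replace the BH_Uptodate_Lock bit spinlock, which had to be wrapped in local_irq_save() by hand, with a real per-buffer_head spinlock_t that can become a sleeping lock on PREEMPT_RT. Condensed from the hunks above, the end_io serialisation then takes roughly this shape (illustrative only):

/* Per-page end_io serialisation after the conversion: the first
 * buffer_head of the page carries the lock for all buffers on it. */
static void end_io_update_page(struct buffer_head *bh)
{
	struct buffer_head *first = page_buffers(bh->b_page);
	unsigned long flags;

	spin_lock_irqsave(&first->b_uptodate_lock, flags);
	/* ... clear async bits, walk the tmp = tmp->b_this_page ring ... */
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
}
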
diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6ba..ce004558f23e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -726,8 +726,9 @@ void shrink_dcache_for_umount(struct super_block *sb)
{
struct dentry *dentry;
- if (down_read_trylock(&sb->s_umount))
- BUG();
+// -rt: this might succeed there ...
+// if (down_read_trylock(&sb->s_umount))
+// BUG();
dentry = sb->s_root;
sb->s_root = NULL;
diff --git a/fs/exec.c b/fs/exec.c
index 4a8849e45b21..ddaebef9879d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -48,6 +48,7 @@
#include <linux/security.h>
#include <linux/ima.h>
#include <linux/syscalls.h>
+#include <linux/delay.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
@@ -501,7 +502,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
unsigned long length = old_end - old_start;
unsigned long new_start = old_start - shift;
unsigned long new_end = old_end - shift;
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
BUG_ON(new_start > new_end);
@@ -526,12 +527,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
return -ENOMEM;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, 0);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
*/
- free_pgd_range(tlb, new_end, old_end, new_end,
+ free_pgd_range(&tlb, new_end, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : 0);
} else {
/*
@@ -540,10 +541,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* have constraints on va-space that make this illegal (IA64) -
* for the others its just a little faster.
*/
- free_pgd_range(tlb, old_start, old_end, new_end,
+ free_pgd_range(&tlb, old_start, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : 0);
}
- tlb_finish_mmu(tlb, new_end, old_end);
+ tlb_finish_mmu(&tlb, new_end, old_end);
/*
* shrink the vma to just the new range.
@@ -719,10 +720,12 @@ static int exec_mmap(struct mm_struct *mm)
}
}
task_lock(tsk);
+ local_irq_disable();
active_mm = tsk->active_mm;
+ activate_mm(active_mm, mm);
tsk->mm = mm;
tsk->active_mm = mm;
- activate_mm(active_mm, mm);
+ local_irq_enable();
task_unlock(tsk);
arch_pick_mmap_layout(mm);
if (old_mm) {
diff --git a/fs/file.c b/fs/file.c
index f313314f996f..710e9b0ca698 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -102,14 +102,15 @@ void free_fdtable_rcu(struct rcu_head *rcu)
kfree(fdt->open_fds);
kfree(fdt);
} else {
- fddef = &get_cpu_var(fdtable_defer_list);
+
+ fddef = &per_cpu(fdtable_defer_list, raw_smp_processor_id());
+
spin_lock(&fddef->lock);
fdt->next = fddef->next;
fddef->next = fdt;
/* vmallocs are handled from the workqueue context */
schedule_work(&fddef->wq);
spin_unlock(&fddef->lock);
- put_cpu_var(fdtable_defer_list);
}
}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c03ac11f74be..e7348c292329 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1576,7 +1576,7 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh)
transaction_t *transaction;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh));
transaction = jh->b_transaction;
if (transaction)
assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -2028,7 +2028,7 @@ void __journal_file_buffer(struct journal_head *jh,
int was_dirty = 0;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh));
assert_spin_locked(&transaction->t_journal->j_list_lock);
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
@@ -2122,7 +2122,7 @@ void __journal_refile_buffer(struct journal_head *jh)
int was_dirty;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh));
if (jh->b_transaction)
assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
diff --git a/fs/namespace.c b/fs/namespace.c
index 7230787d18b0..2a27e33b725e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -264,8 +264,16 @@ int mnt_want_write(struct vfsmount *mnt)
* incremented count after it has set MNT_WRITE_HOLD.
*/
smp_mb();
- while (mnt->mnt_flags & MNT_WRITE_HOLD)
- cpu_relax();
+ preempt_enable();
+ /*
+ * HACK ALERT. On RT we cannot spin here with cpu_relax() and
+ * preemption disabled, so we block on the vfsmount lock, which is
+ * held by mnt_make_readonly(). Works on !RT as well.
+ */
+ while (mnt->mnt_flags & MNT_WRITE_HOLD) {
+ spin_lock(&vfsmount_lock);
+ spin_unlock(&vfsmount_lock);
+ }
/*
* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
* be set to match its requirements. So we must not load that until
@@ -273,12 +281,11 @@ int mnt_want_write(struct vfsmount *mnt)
*/
smp_rmb();
if (__mnt_is_readonly(mnt)) {
+ preempt_disable();
dec_mnt_writers(mnt);
+ preempt_enable();
ret = -EROFS;
- goto out;
}
-out:
- preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
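
The rewritten wait loop trades the original cpu_relax() spin for a lock/unlock pair on vfsmount_lock, so a task that preempted mnt_make_readonly() blocks on the lock and lets the holder finish instead of live-locking. The same "wait by briefly taking the owner's lock" idiom in isolation (a simplified sketch; the real code additionally orders the flag reads with memory barriers):

/* Wait for a flag bit that is cleared by whoever holds owner_lock,
 * without spinning with preemption disabled. */
static void wait_for_bit_clear(const unsigned long *word,
			       unsigned long bit, spinlock_t *owner_lock)
{
	while (*word & bit) {
		spin_lock(owner_lock);	/* blocks (sleeps on RT) until dropped */
		spin_unlock(owner_lock);
	}
}
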
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index b38f944f0667..4391e00b067a 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -29,6 +29,7 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/bit_spinlock.h>
+#include <linux/interrupt.h>
#include "aops.h"
#include "attrib.h"
@@ -107,8 +108,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
"0x%llx.", (unsigned long long)bh->b_blocknr);
}
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
@@ -123,8 +123,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
}
tmp = tmp->b_this_page;
} while (tmp != bh);
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
/*
* If none of the buffers had errors then we can set the page uptodate,
* but we first have to perform the post read mst fixups, if the
@@ -145,13 +144,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
recs = PAGE_CACHE_SIZE / rec_size;
/* Should have been verified before we got here... */
BUG_ON(!recs);
- local_irq_save(flags);
+ local_irq_save_nort(flags);
kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ);
for (i = 0; i < recs; i++)
post_read_mst_fixup((NTFS_RECORD*)(kaddr +
i * rec_size), rec_size);
kunmap_atomic(kaddr, KM_BIO_SRC_IRQ);
- local_irq_restore(flags);
+ local_irq_restore_nort(flags);
flush_dcache_page(page);
if (likely(page_uptodate && !PageError(page)))
SetPageUptodate(page);
@@ -159,8 +158,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
unlock_page(page);
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 52c415114838..7f30ed2a148e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -434,8 +434,14 @@ redo:
wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
+ /*
+ * Hack: we turn off atime updates for -RT kernels.
+ * Who uses them on pipes anyway?
+ */
+#ifndef CONFIG_PREEMPT_RT
if (ret > 0)
file_accessed(filp);
+#endif
return ret;
}
@@ -607,8 +613,14 @@ out:
wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
+ /*
+ * Hack: we turn off atime updates for -RT kernels.
+ * Who uses them on pipes anyway?
+ */
+#ifndef CONFIG_PREEMPT_RT
if (ret > 0)
file_update_time(filp);
+#endif
return ret;
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 725a650bbbb8..d470f094dd0b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -134,12 +134,13 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
*/
static const char *task_state_array[] = {
"R (running)", /* 0 */
- "S (sleeping)", /* 1 */
- "D (disk sleep)", /* 2 */
- "T (stopped)", /* 4 */
- "T (tracing stop)", /* 8 */
- "Z (zombie)", /* 16 */
- "X (dead)" /* 32 */
+ "M (running-mutex)", /* 1 */
+ "S (sleeping)", /* 2 */
+ "D (disk sleep)", /* 4 */
+ "T (stopped)", /* 8 */
+ "T (tracing stop)", /* 16 */
+ "Z (zombie)", /* 32 */
+ "X (dead)" /* 64 */
};
static inline const char *get_task_state(struct task_struct *tsk)
@@ -321,6 +322,19 @@ static inline void task_context_switch_counts(struct seq_file *m,
p->nivcsw);
}
+#define get_blocked_on(t) (-1)
+
+static inline void show_blocked_on(struct seq_file *m, struct task_struct *p)
+{
+ pid_t pid = get_blocked_on(p);
+
+ if (pid < 0)
+ return;
+
+ seq_printf(m, "BlckOn: %d\n", pid);
+}
+
+
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -340,6 +354,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
task_show_regs(m, task);
#endif
task_context_switch_counts(m, task);
+ show_blocked_on(m, task);
return 0;
}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7cc726c6d70a..ded18411b591 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -27,14 +27,14 @@ static int show_stat(struct seq_file *p, void *v)
int i, j;
unsigned long jif;
cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
- cputime64_t guest;
+ cputime64_t guest, user_rt, system_rt;
u64 sum = 0;
u64 sum_softirq = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
struct timespec boottime;
unsigned int per_irq_sum;
- user = nice = system = idle = iowait =
+ user_rt = system_rt = user = nice = system = idle = iowait =
irq = softirq = steal = cputime64_zero;
guest = cputime64_zero;
getboottime(&boottime);
@@ -50,6 +50,8 @@ static int show_stat(struct seq_file *p, void *v)
irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
+ user_rt = cputime64_add(user_rt, kstat_cpu(i).cpustat.user_rt);
+ system_rt = cputime64_add(system_rt, kstat_cpu(i).cpustat.system_rt);
guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
for_each_irq_nr(j) {
sum += kstat_irqs_cpu(j, i);
@@ -65,7 +67,10 @@ static int show_stat(struct seq_file *p, void *v)
}
sum += arch_irq_stat();
- seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ user = cputime64_add(user_rt, user);
+ system = cputime64_add(system_rt, system);
+
+ seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
(unsigned long long)cputime64_to_clock_t(user),
(unsigned long long)cputime64_to_clock_t(nice),
(unsigned long long)cputime64_to_clock_t(system),
@@ -74,13 +79,17 @@ static int show_stat(struct seq_file *p, void *v)
(unsigned long long)cputime64_to_clock_t(irq),
(unsigned long long)cputime64_to_clock_t(softirq),
(unsigned long long)cputime64_to_clock_t(steal),
+ (unsigned long long)cputime64_to_clock_t(user_rt),
+ (unsigned long long)cputime64_to_clock_t(system_rt),
(unsigned long long)cputime64_to_clock_t(guest));
for_each_online_cpu(i) {
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kstat_cpu(i).cpustat.user;
+ user_rt = kstat_cpu(i).cpustat.user_rt;
+ system_rt = kstat_cpu(i).cpustat.system_rt;
+ user = cputime64_add(user_rt, kstat_cpu(i).cpustat.user);
nice = kstat_cpu(i).cpustat.nice;
- system = kstat_cpu(i).cpustat.system;
+ system = cputime64_add(system_rt, kstat_cpu(i).cpustat.system);
idle = kstat_cpu(i).cpustat.idle;
idle = cputime64_add(idle, arch_idle_time(i));
iowait = kstat_cpu(i).cpustat.iowait;
@@ -89,7 +98,7 @@ static int show_stat(struct seq_file *p, void *v)
steal = kstat_cpu(i).cpustat.steal;
guest = kstat_cpu(i).cpustat.guest;
seq_printf(p,
- "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
i,
(unsigned long long)cputime64_to_clock_t(user),
(unsigned long long)cputime64_to_clock_t(nice),
@@ -99,6 +108,8 @@ static int show_stat(struct seq_file *p, void *v)
(unsigned long long)cputime64_to_clock_t(irq),
(unsigned long long)cputime64_to_clock_t(softirq),
(unsigned long long)cputime64_to_clock_t(steal),
+ (unsigned long long)cputime64_to_clock_t(user_rt),
+ (unsigned long long)cputime64_to_clock_t(system_rt),
(unsigned long long)cputime64_to_clock_t(guest));
}
seq_printf(p, "intr %llu", (unsigned long long)sum);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9bd8be1d235c..9d5e1b138855 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -138,8 +138,10 @@ static void *m_start(struct seq_file *m, loff_t *pos)
vma = NULL;
if ((unsigned long)l < mm->map_count) {
vma = mm->mmap;
- while (l-- && vma)
+ while (l-- && vma) {
vma = vma->vm_next;
+ cond_resched();
+ }
goto out;
}
diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h
index ab0b85cf21f3..e3094c6b87c4 100644
--- a/include/acpi/acpiosxf.h
+++ b/include/acpi/acpiosxf.h
@@ -61,7 +61,7 @@ typedef enum {
OSL_EC_BURST_HANDLER
} acpi_execute_type;
-#define ACPI_NO_UNIT_LIMIT ((u32) -1)
+#define ACPI_NO_UNIT_LIMIT (INT_MAX/2)
#define ACPI_MUTEX_SEM 1
/* Functions for acpi_os_signal */
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index e43f9766259f..30f998d8c5c9 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -22,14 +22,8 @@
* and page free order so much..
*/
#ifdef CONFIG_SMP
- #ifdef ARCH_FREE_PTR_NR
- #define FREE_PTR_NR ARCH_FREE_PTR_NR
- #else
- #define FREE_PTE_NR 506
- #endif
#define tlb_fast_mode(tlb) ((tlb)->nr == ~0U)
#else
- #define FREE_PTE_NR 1
#define tlb_fast_mode(tlb) 1
#endif
@@ -39,30 +33,48 @@
struct mmu_gather {
struct mm_struct *mm;
unsigned int nr; /* set to ~0U means fast mode */
+ unsigned int max; /* nr < max */
unsigned int need_flush;/* Really unmapped some ptes? */
unsigned int fullmm; /* non-zero means full mm flush */
- struct page * pages[FREE_PTE_NR];
+#ifdef HAVE_ARCH_MMU_GATHER
+ struct arch_mmu_gather arch;
+#endif
+ struct page ** pages;
+ struct page * local[8];
};
-/* Users of the generic TLB shootdown code must declare this storage space. */
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+static inline void __tlb_alloc_pages(struct mmu_gather *tlb)
+{
+ unsigned long addr = __get_free_pages(GFP_ATOMIC, 0);
+
+ if (addr) {
+ tlb->pages = (void *)addr;
+ tlb->max = PAGE_SIZE / sizeof(struct page *);
+ }
+}
/* tlb_gather_mmu
* Return a pointer to an initialized struct mmu_gather.
*/
-static inline struct mmu_gather *
-tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
+static inline void
+tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush)
{
- struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
-
tlb->mm = mm;
- /* Use fast mode if only one CPU is online */
- tlb->nr = num_online_cpus() > 1 ? 0U : ~0U;
+ tlb->max = ARRAY_SIZE(tlb->local);
+ tlb->pages = tlb->local;
+
+ if (num_online_cpus() > 1) {
+ tlb->nr = 0;
+ __tlb_alloc_pages(tlb);
+ } else /* Use fast mode if only one CPU is online */
+ tlb->nr = ~0U;
tlb->fullmm = full_mm_flush;
- return tlb;
+#ifdef HAVE_ARCH_MMU_GATHER
+ tlb->arch = ARCH_MMU_GATHER_INIT;
+#endif
}
static inline void
@@ -75,6 +87,8 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
if (!tlb_fast_mode(tlb)) {
free_pages_and_swap_cache(tlb->pages, tlb->nr);
tlb->nr = 0;
+ if (tlb->pages == tlb->local)
+ __tlb_alloc_pages(tlb);
}
}
@@ -90,7 +104,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
/* keep the page table cache within bounds */
check_pgt_cache();
- put_cpu_var(mmu_gathers);
+ if (tlb->pages != tlb->local)
+ free_pages((unsigned long)tlb->pages, 0);
}
/* tlb_remove_page
@@ -106,7 +121,7 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
return;
}
tlb->pages[tlb->nr++] = page;
- if (tlb->nr >= FREE_PTE_NR)
+ if (tlb->nr >= tlb->max)
tlb_flush_mmu(tlb, 0, 0);
}
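
After this rework the mmu_gather lives on the caller's stack instead of in the per-CPU mmu_gathers array, which is what removes the need to keep preemption disabled across the teardown; the fs/exec.c hunk earlier shows the converted call sites. The minimal calling pattern for the new API (illustrative):

/* Illustrative caller: the gather state is local, so there is no
 * get_cpu_var()/put_cpu_var() pairing and no preemption-off region. */
static void unmap_range(struct mm_struct *mm,
			unsigned long start, unsigned long end)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm, 0);
	/* ... tlb_remove_page(&tlb, page) while tearing down the range ... */
	tlb_finish_mmu(&tlb, start, end);
}
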
diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index 27b1bcffe408..2e0e96172318 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -1,9 +1,17 @@
#ifndef _LINUX_BH_H
#define _LINUX_BH_H
+#ifdef CONFIG_PREEMPT_RT
+# define local_bh_disable() do { } while (0)
+# define __local_bh_disable(ip) do { } while (0)
+# define _local_bh_enable() do { } while (0)
+# define local_bh_enable() do { } while (0)
+# define local_bh_enable_ip(ip) do { } while (0)
+#else
extern void local_bh_disable(void);
extern void _local_bh_enable(void);
extern void local_bh_enable(void);
extern void local_bh_enable_ip(unsigned long ip);
+#endif
#endif /* _LINUX_BH_H */
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 16ed0284d780..a7a7491d014a 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -21,10 +21,6 @@ enum bh_state_bits {
BH_Dirty, /* Is dirty */
BH_Lock, /* Is locked */
BH_Req, /* Has been submitted for I/O */
- BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
- * IO completion of other buffers in the page
- */
-
BH_Mapped, /* Has a disk mapping */
BH_New, /* Disk mapping was newly created by get_block */
BH_Async_Read, /* Is under end_buffer_async_read I/O */
@@ -74,6 +70,8 @@ struct buffer_head {
struct address_space *b_assoc_map; /* mapping this buffer is
associated with */
atomic_t b_count; /* users using this buffer_head */
+ spinlock_t b_uptodate_lock;
+ spinlock_t b_state_lock;
};
/*
diff --git a/include/linux/console.h b/include/linux/console.h
index dcca5339ceb3..81651ad6d326 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -55,6 +55,7 @@ struct consw {
void (*con_invert_region)(struct vc_data *, u16 *, int);
u16 *(*con_screen_pos)(struct vc_data *, int);
unsigned long (*con_getxy)(struct vc_data *, unsigned long, int *, int *);
+ int con_preemptible; // can it reschedule from within printk?
};
extern const struct consw *conswitchp;
@@ -92,6 +93,17 @@ void give_up_console(const struct consw *sw);
#define CON_BOOT (8)
#define CON_ANYTIME (16) /* Safe to call when cpu is offline */
#define CON_BRL (32) /* Used for a braille device */
+#define CON_ATOMIC (64) /* Safe to call in PREEMPT_RT atomic */
+
+#ifdef CONFIG_PREEMPT_RT
+# define console_atomic_safe(con) \
+ (((con)->flags & CON_ATOMIC) || \
+ (!in_atomic() && !irqs_disabled()) || \
+ (system_state != SYSTEM_RUNNING) || \
+ oops_in_progress)
+#else
+# define console_atomic_safe(con) (1)
+#endif
struct console {
char name[16];
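
console_atomic_safe() is intended to be consulted by the printk console loop: on PREEMPT_RT a console without CON_ATOMIC is skipped when printk runs in atomic context, unless the system is still booting or oopsing. A hypothetical use site (not part of this hunk) would be:

/* Hypothetical printk-side check: skip consoles that must not be
 * called from the current (possibly atomic) context on PREEMPT_RT. */
for (con = console_drivers; con; con = con->next) {
	if (!(con->flags & CON_ENABLED) || !con->write)
		continue;
	if (!console_atomic_safe(con))
		continue;	/* leave the output for a preemptible context */
	con->write(con, text, len);
}
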
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 1518625411a4..16966fbab185 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -77,9 +77,9 @@
* Are we doing bottom half or hardware interrupt processing?
* Are we in a softirq context? Interrupt context?
*/
-#define in_irq() (hardirq_count())
-#define in_softirq() (softirq_count())
-#define in_interrupt() (irq_count())
+#define in_irq() (hardirq_count() || (current->flags & PF_HARDIRQ))
+#define in_softirq() (softirq_count() || (current->flags & PF_SOFTIRQ))
+#define in_interrupt() (irq_count())
/*
* Are we in NMI context?
@@ -96,19 +96,6 @@
#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
#ifdef CONFIG_PREEMPT
-# define PREEMPT_CHECK_OFFSET 1
-#else
-# define PREEMPT_CHECK_OFFSET 0
-#endif
-
-/*
- * Check whether we were atomic before we did preempt_disable():
- * (used by the scheduler)
- */
-#define in_atomic_preempt_off() \
- ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
-
-#ifdef CONFIG_PREEMPT
# define preemptible() (preempt_count() == 0 && !irqs_disabled())
# define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
#else
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index f6177e6c7c9d..6bc3e2833858 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -109,6 +109,7 @@ struct hrtimer {
struct hrtimer_clock_base *base;
unsigned long state;
struct list_head cb_entry;
+ int irqsafe;
#ifdef CONFIG_TIMER_STATS
int start_pid;
void *start_site;
@@ -144,6 +145,7 @@ struct hrtimer_clock_base {
struct hrtimer_cpu_base *cpu_base;
clockid_t index;
struct rb_root active;
+ struct list_head expired;
struct rb_node *first;
ktime_t resolution;
ktime_t (*get_time)(void);
@@ -177,6 +179,9 @@ struct hrtimer_cpu_base {
int hres_active;
unsigned long nr_events;
#endif
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+ wait_queue_head_t wait;
+#endif
};
static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
@@ -364,6 +369,13 @@ static inline int hrtimer_restart(struct hrtimer *timer)
return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
+/* Softirq preemption could deadlock timer removal */
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+ extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
+#else
+# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
+#endif
+
/* Query timers: */
extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 91665bd3d5e1..0d2e6070a7d9 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -161,6 +161,7 @@ extern struct cred init_cred;
.fs_excl = ATOMIC_INIT(0), \
.pi_lock = __ATOMIC_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
.timer_slack_ns = 50000, /* 50 usec default slack */ \
+ .posix_timer_list = NULL, \
.pids = { \
[PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c65d2ed54970..874eb071b2a1 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -93,6 +93,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
* @thread_fn: interrupt handler function for threaded interrupts
* @thread: thread pointer for threaded interrupts
* @thread_flags: flags related to @thread
+ * @thread_mask: bit mask to account for forced threads
*/
struct irqaction {
irq_handler_t handler;
@@ -106,6 +107,7 @@ struct irqaction {
irq_handler_t thread_fn;
struct task_struct *thread;
unsigned long thread_flags;
+ unsigned long thread_mask;
};
extern irqreturn_t no_action(int cpl, void *dev_id);
@@ -322,6 +324,7 @@ static inline int disable_irq_wake(unsigned int irq)
#ifndef __ARCH_SET_SOFTIRQ_PENDING
#define set_softirq_pending(x) (local_softirq_pending() = (x))
+// FIXME: PREEMPT_RT: set_bit()?
#define or_softirq_pending(x) (local_softirq_pending() |= (x))
#endif
@@ -352,7 +355,6 @@ enum
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
-
NR_SOFTIRQS
};
@@ -370,13 +372,23 @@ struct softirq_action
void (*action)(struct softirq_action *);
};
+#ifdef CONFIG_PREEMPT_HARDIRQS
+# define __raise_softirq_irqoff(nr) raise_softirq_irqoff(nr)
+# define __do_raise_softirq_irqoff(nr) \
+ do { or_softirq_pending(1UL << (nr)); } while (0)
+#else
+# define __raise_softirq_irqoff(nr) \
+ do { or_softirq_pending(1UL << (nr)); } while (0)
+# define __do_raise_softirq_irqoff(nr) __raise_softirq_irqoff(nr)
+#endif
+
asmlinkage void do_softirq(void);
asmlinkage void __do_softirq(void);
extern void open_softirq(int nr, void (*action)(struct softirq_action *));
extern void softirq_init(void);
-#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
extern void raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq(unsigned int nr);
+extern void softirq_check_pending_idle(void);
/* This is the worklist that queues up per-cpu softirq work.
*
@@ -411,8 +423,9 @@ extern void __send_remote_softirq(struct call_single_data *cp, int cpu,
to be executed on some cpu at least once after this.
* If the tasklet is already scheduled, but its execution is still not
started, it will be executed only once.
- * If this tasklet is already running on another CPU (or schedule is called
- from tasklet itself), it is rescheduled for later.
+ * If this tasklet is already running on another CPU, it is rescheduled
+ for later.
+ * Schedule must not be called from the tasklet itself (a lockup occurs)
* Tasklet is strictly serialized wrt itself, but not
wrt another tasklets. If client needs some intertask synchronization,
he makes it with spinlocks.
@@ -437,27 +450,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
enum
{
TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */
- TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
+ TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */
+ TASKLET_STATE_PENDING /* Tasklet is pending */
};
-#ifdef CONFIG_SMP
+#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED)
+#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN)
+#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
+
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
static inline int tasklet_trylock(struct tasklet_struct *t)
{
return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
}
+static inline int tasklet_tryunlock(struct tasklet_struct *t)
+{
+ return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
+}
+
static inline void tasklet_unlock(struct tasklet_struct *t)
{
smp_mb__before_clear_bit();
clear_bit(TASKLET_STATE_RUN, &(t)->state);
}
-static inline void tasklet_unlock_wait(struct tasklet_struct *t)
-{
- while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
-}
+extern void tasklet_unlock_wait(struct tasklet_struct *t);
+
#else
#define tasklet_trylock(t) 1
+#define tasklet_tryunlock(t) 1
#define tasklet_unlock_wait(t) do { } while (0)
#define tasklet_unlock(t) do { } while (0)
#endif
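
tasklet_tryunlock() drops the RUN bit with a single cmpxchg, and only if the state word is exactly TASKLET_STATEF_RUN, i.e. no SCHED or PENDING bit was set while the handler ran; otherwise the now-preemptible runner knows it has to process the tasklet again. A user-space analogue of just that operation (the bit layout mirrors the enum above):

#include <stdatomic.h>

enum { STATEF_SCHED = 1 << 0, STATEF_RUN = 1 << 1, STATEF_PENDING = 1 << 2 };

/* Succeeds only when nothing but RUN is set, matching
 * cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN. */
static int tryunlock(atomic_ulong *state)
{
	unsigned long expected = STATEF_RUN;

	return atomic_compare_exchange_strong(state, &expected, 0UL);
}
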
@@ -506,22 +528,14 @@ static inline void tasklet_disable(struct tasklet_struct *t)
smp_mb();
}
-static inline void tasklet_enable(struct tasklet_struct *t)
-{
- smp_mb__before_atomic_dec();
- atomic_dec(&t->count);
-}
-
-static inline void tasklet_hi_enable(struct tasklet_struct *t)
-{
- smp_mb__before_atomic_dec();
- atomic_dec(&t->count);
-}
+extern void tasklet_enable(struct tasklet_struct *t);
+extern void tasklet_hi_enable(struct tasklet_struct *t);
extern void tasklet_kill(struct tasklet_struct *t);
extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
extern void tasklet_init(struct tasklet_struct *t,
void (*func)(unsigned long), unsigned long data);
+extern void takeover_tasklets(unsigned int cpu);
struct tasklet_hrtimer {
struct hrtimer timer;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 5e59d7a0de7a..c168a2fd8377 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -193,6 +193,7 @@ struct irq_desc {
#endif
#endif
atomic_t threads_active;
+ unsigned long forced_threads_active;
wait_queue_head_t wait_for_threads;
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *dir;
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index c2049a04fa0b..ce07056dffdd 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -260,6 +260,15 @@ void buffer_assertion_failure(struct buffer_head *bh);
#define J_ASSERT_JH(jh, expr) J_ASSERT(expr)
#endif
+/*
+ * For assertions that are only valid on SMP (e.g. spin_is_locked()):
+ */
+#ifdef CONFIG_SMP
+# define J_ASSERT_JH_SMP(jh, expr) J_ASSERT_JH(jh, expr)
+#else
+# define J_ASSERT_JH_SMP(jh, assert) do { } while (0)
+#endif
+
#if defined(JBD_PARANOID_IOFAIL)
#define J_EXPECT(expr, why...) J_ASSERT(expr)
#define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr)
@@ -315,32 +324,32 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
static inline void jbd_lock_bh_state(struct buffer_head *bh)
{
- bit_spin_lock(BH_State, &bh->b_state);
+ spin_lock(&bh->b_state_lock);
}
static inline int jbd_trylock_bh_state(struct buffer_head *bh)
{
- return bit_spin_trylock(BH_State, &bh->b_state);
+ return spin_trylock(&bh->b_state_lock);
}
static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
{
- return bit_spin_is_locked(BH_State, &bh->b_state);
+ return spin_is_locked(&bh->b_state_lock);
}
static inline void jbd_unlock_bh_state(struct buffer_head *bh)
{
- bit_spin_unlock(BH_State, &bh->b_state);
+ spin_unlock(&bh->b_state_lock);
}
static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
{
- bit_spin_lock(BH_JournalHead, &bh->b_state);
+ spin_lock_irq(&bh->b_uptodate_lock);
}
static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
{
- bit_spin_unlock(BH_JournalHead, &bh->b_state);
+ spin_unlock_irq(&bh->b_uptodate_lock);
}
struct jbd_revoke_table_s;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d6320a3e8def..4651e0971d75 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -124,7 +124,7 @@ extern int _cond_resched(void);
# define might_resched() do { } while (0)
#endif
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
void __might_sleep(char *file, int line);
/**
* might_sleep - annotation for functions that can sleep
@@ -284,6 +284,12 @@ extern void printk_tick(void);
extern void asmlinkage __attribute__((format(printf, 1, 2)))
early_printk(const char *fmt, ...);
+#ifdef CONFIG_PREEMPT_RT
+extern void zap_rt_locks(void);
+#else
+# define zap_rt_locks() do { } while (0)
+#endif
+
unsigned long int_sqrt(unsigned long);
static inline void console_silent(void)
@@ -313,6 +319,7 @@ extern int root_mountflags;
/* Values used for system_state */
extern enum system_states {
SYSTEM_BOOTING,
+ SYSTEM_BOOTING_SCHEDULER_OK,
SYSTEM_RUNNING,
SYSTEM_HALT,
SYSTEM_POWER_OFF,
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 348fa8874b52..91958d33eb90 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -24,6 +24,8 @@ struct cpu_usage_stat {
cputime64_t idle;
cputime64_t iowait;
cputime64_t steal;
+ cputime64_t user_rt;
+ cputime64_t system_rt;
cputime64_t guest;
};
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ba3a7cb1eaa0..1f631fce9ea8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -602,23 +602,39 @@ static __always_inline void *lowmem_page_address(struct page *page)
#endif
#if defined(WANT_PAGE_VIRTUAL)
-#define page_address(page) ((page)->virtual)
-#define set_page_address(page, address) \
- do { \
- (page)->virtual = (address); \
- } while(0)
-#define page_address_init() do { } while(0)
+/*
+ * wrap page->virtual so it is safe to set/read locklessly
+ */
+#define page_address(page) \
+ ({ typeof((page)->virtual) v = (page)->virtual; \
+ smp_read_barrier_depends(); \
+ v; })
+
+static inline int set_page_address(struct page *page, void *address)
+{
+ if (address)
+ return cmpxchg(&page->virtual, NULL, address) == NULL;
+ else {
+ /*
+ * cmpxchg is a bit abused because it is not guaranteed
+ * safe wrt direct assignment on all platforms.
+ */
+ void *virt = page->virtual;
+ return cmpxchg(&page->virtual, virt, NULL) == virt;
+ }
+}
+void page_address_init(void);
#endif
#if defined(HASHED_PAGE_VIRTUAL)
void *page_address(struct page *page);
-void set_page_address(struct page *page, void *virtual);
+int set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif
#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
-#define set_page_address(page, address) do { } while(0)
+#define set_page_address(page, address) (0)
#define page_address_init() do { } while(0)
#endif
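
Since set_page_address() now installs the mapping with cmpxchg() and reports whether it won the race, WANT_PAGE_VIRTUAL callers can cooperate without a lock. A hypothetical helper sketching the intended contract (install_page_virtual() and its vaddr argument are illustrative, not kernel API):

#include <linux/mm.h>

static void *install_page_virtual(struct page *page, void *vaddr)
{
	/*
	 * set_page_address() succeeds only if page->virtual was NULL;
	 * if another context won the race, reuse its mapping instead.
	 */
	if (!set_page_address(page, vaddr))
		vaddr = page_address(page);	/* paired with smp_read_barrier_depends() */
	return vaddr;
}
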
@@ -759,7 +775,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather **tlb,
+unsigned long unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
@@ -938,27 +954,85 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
* overflow into the next struct page (as it might with DEBUG_SPINLOCK).
* When freeing, reset page->mapping so free_pages_check won't complain.
*/
+#ifndef CONFIG_PREEMPT_RT
+
#define __pte_lockptr(page) &((page)->ptl)
-#define pte_lock_init(_page) do { \
- spin_lock_init(__pte_lockptr(_page)); \
-} while (0)
+
+static inline struct page *pte_lock_init(struct page *page)
+{
+ spin_lock_init(__pte_lockptr(page));
+ return page;
+}
+
#define pte_lock_deinit(page) ((page)->mapping = NULL)
+
+#else /* PREEMPT_RT */
+
+/*
+ * On PREEMPT_RT the spinlock_t's are too large to embed in the
+ * page frame, hence it only has a pointer and we need to dynamically
+ * allocate the lock when we allocate PTE-pages.
+ *
+ * This is an overall win, since only a small fraction of the pages
+ * will be PTE pages under normal circumstances.
+ */
+
+#define __pte_lockptr(page) ((page)->ptl)
+
+/*
+ * Heinous hack, relies on the caller doing something like:
+ *
+ * pte = alloc_pages(PGALLOC_GFP, 0);
+ * if (pte)
+ * pgtable_page_ctor(pte);
+ * return pte;
+ *
+ * This ensures we release the page and return NULL when the
+ * lock allocation fails.
+ */
+static inline struct page *pte_lock_init(struct page *page)
+{
+ page->ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
+ if (page->ptl) {
+ spin_lock_init(__pte_lockptr(page));
+ } else {
+ __free_page(page);
+ page = NULL;
+ }
+ return page;
+}
+
+static inline void pte_lock_deinit(struct page *page)
+{
+ kfree(page->ptl);
+ page->mapping = NULL;
+}
+
+#endif /* PREEMPT_RT */
+
#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
#else /* !USE_SPLIT_PTLOCKS */
/*
* We use mm->page_table_lock to guard all pagetable pages of the mm.
*/
-#define pte_lock_init(page) do {} while (0)
+static inline struct page *pte_lock_init(struct page *page) { return page; }
#define pte_lock_deinit(page) do {} while (0)
#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
#endif /* USE_SPLIT_PTLOCKS */
-static inline void pgtable_page_ctor(struct page *page)
+static inline struct page *__pgtable_page_ctor(struct page *page)
{
- pte_lock_init(page);
- inc_zone_page_state(page, NR_PAGETABLE);
+ page = pte_lock_init(page);
+ if (page)
+ inc_zone_page_state(page, NR_PAGETABLE);
+ return page;
}
+#define pgtable_page_ctor(page) \
+do { \
+ page = __pgtable_page_ctor(page); \
+} while (0)
+
static inline void pgtable_page_dtor(struct page *page)
{
pte_lock_deinit(page);
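
Because pgtable_page_ctor() can now free the page and NULL the local variable when the PREEMPT_RT lock allocation fails, per-arch pte_alloc_one() implementations must follow the shape quoted in the comment above. A sketch of that shape, using a plain GFP mask in place of whatever the architecture normally passes:

#include <linux/mm.h>
#include <linux/gfp.h>

static struct page *sketch_pte_alloc_one(struct mm_struct *mm, unsigned long addr)
{
	struct page *pte;

	pte = alloc_pages(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO, 0);
	if (pte)
		pgtable_page_ctor(pte);	/* may free the page and set pte to NULL on PREEMPT_RT */
	return pte;			/* NULL covers both failure modes */
}
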
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7acc8439d9b3..2b208da518cb 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -69,7 +69,11 @@ struct page {
*/
};
#if USE_SPLIT_PTLOCKS
+#ifndef CONFIG_PREEMPT_RT
spinlock_t ptl;
+#else
+ spinlock_t *ptl;
+#endif
#endif
struct kmem_cache *slab; /* SLUB: Pointer to slab */
struct page *first_page; /* Compound tail pages */
@@ -247,6 +251,9 @@ struct mm_struct {
/* Architecture-specific MM context */
mm_context_t context;
+ /* realtime bits */
+ struct list_head delayed_drop;
+
/* Swap token stuff */
/*
* Last value of global fault stamp as seen by this process.
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 878cab4f5fcc..f98509b26876 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -12,11 +12,85 @@
#include <linux/list.h>
#include <linux/spinlock_types.h>
+#include <linux/rt_lock.h>
#include <linux/linkage.h>
#include <linux/lockdep.h>
#include <asm/atomic.h>
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
+ , .dep_map = { .name = #lockname }
+#else
+# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+
+#include <linux/rtmutex.h>
+
+struct mutex {
+ struct rt_mutex lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+
+#define __MUTEX_INITIALIZER(mutexname) \
+ { \
+ .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \
+ __DEP_MAP_MUTEX_INITIALIZER(mutexname) \
+ }
+
+#define DEFINE_MUTEX(mutexname) \
+ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
+
+extern void
+__mutex_init(struct mutex *lock, char *name, struct lock_class_key *key);
+
+extern void __lockfunc _mutex_lock(struct mutex *lock);
+extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
+extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
+extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
+extern int __lockfunc
+_mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
+extern int __lockfunc
+_mutex_lock_killable_nested(struct mutex *lock, int subclass);
+extern int __lockfunc _mutex_trylock(struct mutex *lock);
+extern void __lockfunc _mutex_unlock(struct mutex *lock);
+
+#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock)
+#define mutex_lock(l) _mutex_lock(l)
+#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l)
+#define mutex_lock_killable(l) _mutex_lock_killable(l)
+#define mutex_trylock(l) _mutex_trylock(l)
+#define mutex_unlock(l) _mutex_unlock(l)
+#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock)
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s)
+# define mutex_lock_interruptible_nested(l, s) \
+ _mutex_lock_interruptible_nested(l, s)
+# define mutex_lock_killable_nested(l, s) \
+ _mutex_lock_killable_nested(l, s)
+#else
+# define mutex_lock_nested(l, s) _mutex_lock(l)
+# define mutex_lock_interruptible_nested(l, s) \
+ _mutex_lock_interruptible(l)
+# define mutex_lock_killable_nested(l, s) \
+ _mutex_lock_killable(l)
+#endif
+
+# define mutex_init(mutex) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __mutex_init((mutex), #mutex, &__key); \
+} while (0)
+
+#else /* PREEMPT_RT */
+
/*
* Simple, straightforward mutexes with strict semantics:
*
@@ -87,13 +161,6 @@ do { \
# define mutex_destroy(mutex) do { } while (0)
#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
- , .dep_map = { .name = #lockname }
-#else
-# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
-#endif
-
#define __MUTEX_INITIALIZER(lockname) \
{ .count = ATOMIC_INIT(1) \
, .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
@@ -150,6 +217,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
*/
extern int mutex_trylock(struct mutex *lock);
extern void mutex_unlock(struct mutex *lock);
+#endif /* !PREEMPT_RT */
+
extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
#endif
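
The PREEMPT_RT branch keeps the whole mutex API (same initializers and lock/unlock/trylock names) and only swaps the implementation to rt_mutex, so ordinary users need no source changes. A minimal sketch:

#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(sketch_lock);	/* rt_mutex-backed on PREEMPT_RT, classic mutex otherwise */
static int sketch_count;

static int sketch_inc(void)
{
	if (mutex_lock_interruptible(&sketch_lock))
		return -EINTR;
	sketch_count++;		/* may be preempted here on PREEMPT_RT; PI bounds the latency */
	mutex_unlock(&sketch_lock);
	return 0;
}
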
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d4a4d9867794..14aa9d92e344 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -477,7 +477,7 @@ struct netdev_queue {
* write mostly part
*/
spinlock_t _xmit_lock ____cacheline_aligned_in_smp;
- int xmit_lock_owner;
+ void *xmit_lock_owner;
/*
* please use this field instead of dev->trans_start
*/
@@ -1665,41 +1665,49 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
return (1 << debug_value) - 1;
}
-static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
+static inline void __netif_tx_lock(struct netdev_queue *txq)
{
spin_lock(&txq->_xmit_lock);
- txq->xmit_lock_owner = cpu;
+ txq->xmit_lock_owner = (void *)current;
+}
+
+/*
+ * Do we hold the xmit_lock already?
+ */
+static inline int netif_tx_lock_recursion(struct netdev_queue *txq)
+{
+ return txq->xmit_lock_owner == (void *)current;
}
static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
{
spin_lock_bh(&txq->_xmit_lock);
- txq->xmit_lock_owner = smp_processor_id();
+ txq->xmit_lock_owner = (void *)current;
}
static inline int __netif_tx_trylock(struct netdev_queue *txq)
{
int ok = spin_trylock(&txq->_xmit_lock);
if (likely(ok))
- txq->xmit_lock_owner = smp_processor_id();
+ txq->xmit_lock_owner = (void *)current;
return ok;
}
static inline void __netif_tx_unlock(struct netdev_queue *txq)
{
- txq->xmit_lock_owner = -1;
+ txq->xmit_lock_owner = (void *)-1;
spin_unlock(&txq->_xmit_lock);
}
static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
{
- txq->xmit_lock_owner = -1;
+ txq->xmit_lock_owner = (void *)-1;
spin_unlock_bh(&txq->_xmit_lock);
}
static inline void txq_trans_update(struct netdev_queue *txq)
{
- if (txq->xmit_lock_owner != -1)
+ if (txq->xmit_lock_owner != (void *)-1)
txq->trans_start = jiffies;
}
@@ -1712,10 +1720,8 @@ static inline void txq_trans_update(struct netdev_queue *txq)
static inline void netif_tx_lock(struct net_device *dev)
{
unsigned int i;
- int cpu;
spin_lock(&dev->tx_global_lock);
- cpu = smp_processor_id();
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
@@ -1725,7 +1731,7 @@ static inline void netif_tx_lock(struct net_device *dev)
* the ->hard_start_xmit() handler and already
* checked the frozen bit.
*/
- __netif_tx_lock(txq, cpu);
+ __netif_tx_lock(txq);
set_bit(__QUEUE_STATE_FROZEN, &txq->state);
__netif_tx_unlock(txq);
}
@@ -1761,9 +1767,9 @@ static inline void netif_tx_unlock_bh(struct net_device *dev)
local_bh_enable();
}
-#define HARD_TX_LOCK(dev, txq, cpu) { \
+#define HARD_TX_LOCK(dev, txq) { \
if ((dev->features & NETIF_F_LLTX) == 0) { \
- __netif_tx_lock(txq, cpu); \
+ __netif_tx_lock(txq); \
} \
}
@@ -1776,14 +1782,12 @@ static inline void netif_tx_unlock_bh(struct net_device *dev)
static inline void netif_tx_disable(struct net_device *dev)
{
unsigned int i;
- int cpu;
local_bh_disable();
- cpu = smp_processor_id();
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
- __netif_tx_lock(txq, cpu);
+ __netif_tx_lock(txq);
netif_tx_stop_queue(txq);
__netif_tx_unlock(txq);
}
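
With xmit_lock_owner holding a task pointer instead of a CPU number, lock recursion can be detected even though the transmitting task may now sleep and migrate while holding the lock. A hedged sketch of how a transmit path might use netif_tx_lock_recursion(); it is loosely modelled on dev_queue_xmit(), not copied from it:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int sketch_xmit_one(struct net_device *dev, struct sk_buff *skb)
{
	struct netdev_queue *txq =
		netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
	int ret = NETDEV_TX_BUSY;

	if (netif_tx_lock_recursion(txq))
		return ret;		/* current task already holds _xmit_lock: do not deadlock */

	HARD_TX_LOCK(dev, txq);		/* note: no cpu argument any more */
	if (!netif_tx_queue_stopped(txq))
		ret = dev->netdev_ops->ndo_start_xmit(skb, dev);
	HARD_TX_UNLOCK(dev, txq);

	return ret;
}
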
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 1030b7593898..f7ab3f917048 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -473,14 +473,14 @@ static inline void xt_info_rdlock_bh(void)
struct xt_info_lock *lock;
local_bh_disable();
- lock = &__get_cpu_var(xt_info_locks);
+ lock = &__raw_get_cpu_var(xt_info_locks);
if (likely(!lock->readers++))
spin_lock(&lock->lock);
}
static inline void xt_info_rdunlock_bh(void)
{
- struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
+ struct xt_info_lock *lock = &__raw_get_cpu_var(xt_info_locks);
if (likely(!--lock->readers))
spin_unlock(&lock->lock);
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 2524267210d3..838405cb0117 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -84,7 +84,7 @@ static inline void *netpoll_poll_lock(struct napi_struct *napi)
rcu_read_lock(); /* deal with race on ->npinfo */
if (dev && dev->npinfo) {
spin_lock(&napi->poll_lock);
- napi->poll_owner = smp_processor_id();
+ napi->poll_owner = raw_smp_processor_id();
return napi;
}
return NULL;
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index bab82f4c571c..0af5218b93a8 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -9,7 +9,7 @@
#define _LINUX_PAGEVEC_H
/* 14 pointers + two long's align the pagevec structure to a power of two */
-#define PAGEVEC_SIZE 14
+#define PAGEVEC_SIZE 8
struct page;
struct address_space;
diff --git a/include/linux/plist.h b/include/linux/plist.h
index 1eef0565f744..a2d201009c49 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -75,14 +75,16 @@
#include <linux/kernel.h>
#include <linux/list.h>
-#include <linux/spinlock_types.h>
+
+struct spinlock;
+struct atomic_spinlock;
struct plist_head {
struct list_head prio_list;
struct list_head node_list;
#ifdef CONFIG_DEBUG_PI_LIST
- atomic_spinlock_t *alock;
- spinlock_t *slock;
+ struct atomic_spinlock *alock;
+ struct spinlock *slock;
#endif
};
@@ -142,7 +144,7 @@ struct plist_node {
* @lock: list spinlock, remembered for debugging
*/
static inline void
-plist_head_init(struct plist_head *head, spinlock_t *lock)
+plist_head_init(struct plist_head *head, struct spinlock *lock)
{
INIT_LIST_HEAD(&head->prio_list);
INIT_LIST_HEAD(&head->node_list);
@@ -158,7 +160,7 @@ plist_head_init(struct plist_head *head, spinlock_t *lock)
* @lock: list atomic_spinlock, remembered for debugging
*/
static inline void
-plist_head_init_atomic(struct plist_head *head, atomic_spinlock_t *lock)
+plist_head_init_atomic(struct plist_head *head, struct atomic_spinlock *lock)
{
INIT_LIST_HEAD(&head->prio_list);
INIT_LIST_HEAD(&head->node_list);
diff --git a/include/linux/profile.h b/include/linux/profile.h
index a0fc32279fc0..5b72082c273e 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -8,10 +8,11 @@
#include <asm/errno.h>
-#define CPU_PROFILING 1
-#define SCHED_PROFILING 2
-#define SLEEP_PROFILING 3
-#define KVM_PROFILING 4
+#define CPU_PROFILING 1
+#define SCHED_PROFILING 2
+#define SLEEP_PROFILING 3
+#define KVM_PROFILING 4
+#define PREEMPT_PROFILING 5
struct proc_dir_entry;
struct pt_regs;
@@ -36,6 +37,8 @@ enum profile_type {
PROFILE_MUNMAP
};
+extern int prof_pid;
+
#ifdef CONFIG_PROFILING
extern int prof_on __read_mostly;
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index c5da74918096..9eb17f95857b 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -169,7 +169,18 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root,
unsigned long index, unsigned long max_scan);
unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
unsigned long index, unsigned long max_scan);
+/*
+ * On a mutex based kernel we can freely schedule within the radix code:
+ */
+#ifdef CONFIG_PREEMPT_RT
+static inline int radix_tree_preload(gfp_t gfp_mask)
+{
+ return 0;
+}
+#else
int radix_tree_preload(gfp_t gfp_mask);
+#endif
+
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *root,
unsigned long index, unsigned int tag);
@@ -189,7 +200,9 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
static inline void radix_tree_preload_end(void)
{
+#ifndef CONFIG_PREEMPT_RT
preempt_enable();
+#endif
}
#endif /* _LINUX_RADIX_TREE_H */
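
With radix_tree_preload() compiled to a no-op on PREEMPT_RT and radix_tree_preload_end() no longer re-enabling preemption, the usual caller pattern is unchanged and stays correct in both configurations. A minimal sketch, assuming an address_space-style tree protected by its own lock:

#include <linux/radix-tree.h>
#include <linux/gfp.h>
#include <linux/fs.h>

static int sketch_add_to_tree(struct address_space *mapping,
			      struct page *page, pgoff_t index)
{
	int err;

	err = radix_tree_preload(GFP_KERNEL);	/* returns 0 immediately on PREEMPT_RT */
	if (err)
		return err;

	spin_lock_irq(&mapping->tree_lock);
	err = radix_tree_insert(&mapping->page_tree, index, page);
	spin_unlock_irq(&mapping->tree_lock);

	radix_tree_preload_end();		/* no preempt_enable() on PREEMPT_RT */
	return err;
}
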
diff --git a/include/linux/rt_lock.h b/include/linux/rt_lock.h
new file mode 100644
index 000000000000..5c74bade94e9
--- /dev/null
+++ b/include/linux/rt_lock.h
@@ -0,0 +1,214 @@
+#ifndef __LINUX_RT_LOCK_H
+#define __LINUX_RT_LOCK_H
+
+/*
+ * Real-Time Preemption Support
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains the main data structure definitions.
+ */
+#include <linux/rtmutex.h>
+#include <asm/atomic.h>
+#include <linux/spinlock_types.h>
+
+#ifdef CONFIG_PREEMPT_RT
+
+static inline int preempt_rt(void) { return 1; }
+
+/*
+ * spinlocks - an RT mutex plus lock-break field:
+ */
+typedef struct spinlock {
+ struct rt_mutex lock;
+ unsigned int break_lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+} spinlock_t;
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# define __RT_SPIN_INITIALIZER(name) \
+ { \
+ .wait_lock = __ATOMIC_SPIN_LOCK_UNLOCKED(name), \
+ .save_state = 1, \
+ .file = __FILE__, \
+ .line = __LINE__ , \
+ }
+#else
+# define __RT_SPIN_INITIALIZER(name) \
+ { .wait_lock = __ATOMIC_SPIN_LOCK_UNLOCKED(name) }
+#endif
+
+#define __SPIN_LOCK_UNLOCKED(name) \
+ { .lock = __RT_SPIN_INITIALIZER(name), \
+ SPIN_DEP_MAP_INIT(name) }
+
+#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(spin_old_style)
+
+#define __DEFINE_SPINLOCK(name) \
+ spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
+
+#define DEFINE_SPINLOCK(name) \
+ spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name)
+
+extern void
+__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
+
+#define spin_lock_init(lock) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __rt_spin_lock_init(lock, #lock, &__key); \
+} while (0)
+
+extern void __lockfunc rt_spin_lock(spinlock_t *lock);
+extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
+extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
+extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
+extern int __lockfunc
+rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
+extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
+extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
+
+/*
+ * lockdep-less calls, for derived types like rwlock:
+ * (for trylock they can use rt_mutex_trylock() directly).
+ */
+extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
+extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
+
+/*
+ * rwlocks - an RW semaphore plus lock-break field:
+ */
+typedef struct {
+ struct rt_mutex lock;
+ int read_depth;
+ unsigned int break_lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+} rwlock_t;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
+#else
+# define RW_DEP_MAP_INIT(lockname)
+#endif
+
+#define __RW_LOCK_UNLOCKED(name) \
+ { .lock = __RT_SPIN_INITIALIZER(name), \
+ RW_DEP_MAP_INIT(name) }
+
+#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(rw_old_style)
+
+#define DEFINE_RWLOCK(name) \
+ rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
+
+extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
+extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
+extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
+extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock,
+ unsigned long *flags);
+extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
+extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
+extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
+extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
+extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
+extern void
+__rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
+
+#define rwlock_init(rwl) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __rt_rwlock_init(rwl, #rwl, &__key); \
+} while (0)
+
+/*
+ * RW-semaphores are a spinlock plus a reader-depth count.
+ *
+ * Note that the semantics are different from the usual
+ * Linux rw-sems, in PREEMPT_RT mode we do not allow
+ * multiple readers to hold the lock at once, we only allow
+ * a read-lock owner to read-lock recursively. This is
+ * better for latency, makes the implementation inherently
+ * fair and makes it simpler as well:
+ */
+struct rw_semaphore {
+ struct rt_mutex lock;
+ int read_depth;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+#define __RWSEM_INITIALIZER(name) \
+ { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
+ RW_DEP_MAP_INIT(name) }
+
+#define DECLARE_RWSEM(lockname) \
+ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
+
+extern void __rt_rwsem_init(struct rw_semaphore *rwsem, char *name,
+ struct lock_class_key *key);
+
+# define rt_init_rwsem(sem) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __rt_rwsem_init((sem), #sem, &__key); \
+} while (0)
+
+extern void rt_down_write(struct rw_semaphore *rwsem);
+extern void
+rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
+extern void
+rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
+extern void rt_down_read(struct rw_semaphore *rwsem);
+extern int rt_down_write_trylock(struct rw_semaphore *rwsem);
+extern int rt_down_read_trylock(struct rw_semaphore *rwsem);
+extern void rt_up_read(struct rw_semaphore *rwsem);
+extern void rt_up_write(struct rw_semaphore *rwsem);
+extern void rt_downgrade_write(struct rw_semaphore *rwsem);
+
+/*
+ * Semaphores - a spinlock plus the semaphore count:
+ */
+struct semaphore {
+ atomic_t count;
+ struct rt_mutex lock;
+};
+
+#define DEFINE_SEMAPHORE(name) \
+struct semaphore name = \
+ { .count = { 1 }, .lock = __RT_MUTEX_INITIALIZER(name.lock) }
+
+extern void
+__sema_init(struct semaphore *sem, int val, char *name, char *file, int line);
+
+#define rt_sema_init(sem, val) \
+ __sema_init(sem, val, #sem, __FILE__, __LINE__)
+
+/*
+ * No locked initialization for RT semaphores
+ */
+extern void rt_down(struct semaphore *sem);
+extern int rt_down_interruptible(struct semaphore *sem);
+extern int rt_down_timeout(struct semaphore *sem, long jiffies);
+extern int rt_down_trylock(struct semaphore *sem);
+extern void rt_up(struct semaphore *sem);
+
+#define rt_sem_is_locked(s) rt_mutex_is_locked(&(s)->lock)
+#define rt_sema_count(s) atomic_read(&(s)->count)
+
+#else
+
+static inline int preempt_rt(void) { return 0; }
+
+#endif /* CONFIG_PREEMPT_RT */
+
+#endif
+
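
The rw-semaphore comment above changes concurrency, not the API: only one task may own the read side, but that owner may re-enter it. A small sketch of what that permits under PREEMPT_RT; it is not a recommended pattern for !PREEMPT_RT kernels, where a writer queued between the two down_read() calls could deadlock the reader:

#include <linux/rwsem.h>

static DECLARE_RWSEM(sketch_sem);

static void sketch_reader(void)
{
	down_read(&sketch_sem);
	/* A second reader task would block here on PREEMPT_RT... */
	down_read(&sketch_sem);		/* ...but the owning task may recurse */
	up_read(&sketch_sem);
	up_read(&sketch_sem);
}
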
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index dd9612d436e2..4e062908ad65 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -88,6 +88,8 @@ extern void rt_mutex_destroy(struct rt_mutex *lock);
extern void rt_mutex_lock(struct rt_mutex *lock);
extern int rt_mutex_lock_interruptible(struct rt_mutex *lock,
int detect_deadlock);
+extern int rt_mutex_lock_killable(struct rt_mutex *lock,
+ int detect_deadlock);
extern int rt_mutex_timed_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *timeout,
int detect_deadlock);
diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index b4458c6fdd3a..a51ec97eb526 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -5,6 +5,60 @@
# error "please don't include this file directly"
#endif
+#ifdef CONFIG_PREEMPT_RT
+
+#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock))
+#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock))
+
+#define write_trylock_irqsave(lock, flags) \
+ __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
+
+#define write_lock(lock) rt_write_lock(lock)
+#define read_lock(lock) rt_read_lock(lock)
+
+#define read_lock_irqsave(lock, flags) \
+ do { \
+ typecheck(unsigned long, flags); \
+ flags = rt_read_lock_irqsave(lock); \
+ } while (0)
+
+#define write_lock_irqsave(lock, flags) \
+ do { \
+ typecheck(unsigned long, flags); \
+ flags = rt_write_lock_irqsave(lock); \
+ } while (0)
+
+#define read_lock_irq(lock) rt_read_lock(lock)
+#define read_lock_bh(lock) rt_read_lock(lock)
+
+#define write_lock_irq(lock) rt_write_lock(lock)
+#define write_lock_bh(lock) rt_write_lock(lock)
+
+#define read_unlock(lock) rt_read_unlock(lock)
+#define write_unlock(lock) rt_write_unlock(lock)
+#define read_unlock_irq(lock) rt_read_unlock(lock)
+#define write_unlock_irq(lock) rt_write_unlock(lock)
+
+#define read_unlock_irqrestore(lock, flags) \
+ do { \
+ typecheck(unsigned long, flags); \
+ (void) flags; \
+ rt_read_unlock(lock); \
+ } while (0)
+
+#define read_unlock_bh(lock) rt_read_unlock(lock)
+
+#define write_unlock_irqrestore(lock, flags) \
+ do { \
+ typecheck(unsigned long, flags); \
+ (void) flags; \
+ rt_write_unlock(lock); \
+ } while (0)
+
+#define write_unlock_bh(lock) rt_write_unlock(lock)
+
+#else
+
/*
* rwlock related methods
*
@@ -147,5 +201,6 @@ do { \
write_trylock(lock) ? \
1 : ({ local_irq_restore(flags); 0; }); \
})
+#endif
#endif /* __LINUX_RWLOCK_H */
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index ec7620975284..d9af794ba6d4 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -77,6 +77,7 @@ static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem)
return (sem->activity != 0);
}
+#ifndef CONFIG_PREEMPT_RT
/*
* Non preempt-rt implementation of rw_semaphore. Same as above, but
* restricted vs. ownership. i.e. ownerless locked state and non owner
@@ -129,6 +130,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
{
return (sem->activity != 0);
}
+#endif
#endif /* __KERNEL__ */
#endif /* _LINUX_RWSEM_SPINLOCK_H */
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 70e2f4b53854..e516c81daf39 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -11,6 +11,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/rt_lock.h>
#include <asm/system.h>
#include <asm/atomic.h>
@@ -89,6 +90,59 @@ extern void anon_up_read_non_owner(struct rw_anon_semaphore *sem);
# define anon_up_read_non_owner(sem) anon_up_read(sem)
#endif
+#ifdef CONFIG_PREEMPT_RT
+
+#include <linux/rt_lock.h>
+
+#define init_rwsem(sem) rt_init_rwsem(sem)
+#define rwsem_is_locked(s) rt_mutex_is_locked(&(s)->lock)
+
+static inline void down_read(struct rw_semaphore *sem)
+{
+ rt_down_read(sem);
+}
+
+static inline int down_read_trylock(struct rw_semaphore *sem)
+{
+ return rt_down_read_trylock(sem);
+}
+
+static inline void down_write(struct rw_semaphore *sem)
+{
+ rt_down_write(sem);
+}
+
+static inline int down_write_trylock(struct rw_semaphore *sem)
+{
+ return rt_down_write_trylock(sem);
+}
+
+static inline void up_read(struct rw_semaphore *sem)
+{
+ rt_up_read(sem);
+}
+
+static inline void up_write(struct rw_semaphore *sem)
+{
+ rt_up_write(sem);
+}
+
+static inline void downgrade_write(struct rw_semaphore *sem)
+{
+ rt_downgrade_write(sem);
+}
+
+static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
+{
+ return rt_down_read_nested(sem, subclass);
+}
+
+static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
+{
+ rt_down_write_nested(sem, subclass);
+}
+
+#else
/*
* Non preempt-rt implementations
*/
@@ -136,5 +190,6 @@ static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
{
anon_down_write_nested((struct rw_anon_semaphore *)sem, subclass);
}
+#endif
#endif /* _LINUX_RWSEM_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5dd3fbd1e09b..5163160e4486 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -100,6 +100,23 @@ struct fs_struct;
struct bts_context;
struct perf_counter_context;
+#ifdef CONFIG_PREEMPT
+extern int kernel_preemption;
+#else
+# define kernel_preemption 0
+#endif
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+extern int voluntary_preemption;
+#else
+# define voluntary_preemption 0
+#endif
+
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+extern int softirq_preemption;
+#else
+# define softirq_preemption 0
+#endif
+
/*
* List of flags we want to share for kernel threads,
* if only because they are not used by them anyway.
@@ -166,6 +183,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
#endif
extern unsigned long long time_sync_thresh;
+extern struct semaphore kernel_sem;
/*
* Task state bitmask. NOTE! These bits are also
@@ -178,16 +196,17 @@ extern unsigned long long time_sync_thresh;
* mistake.
*/
#define TASK_RUNNING 0
-#define TASK_INTERRUPTIBLE 1
-#define TASK_UNINTERRUPTIBLE 2
-#define __TASK_STOPPED 4
-#define __TASK_TRACED 8
+#define TASK_RUNNING_MUTEX 1
+#define TASK_INTERRUPTIBLE 2
+#define TASK_UNINTERRUPTIBLE 4
+#define __TASK_STOPPED 8
+#define __TASK_TRACED 16
/* in tsk->exit_state */
-#define EXIT_ZOMBIE 16
-#define EXIT_DEAD 32
+#define EXIT_ZOMBIE 32
+#define EXIT_DEAD 64
/* in tsk->state again */
-#define TASK_DEAD 64
-#define TASK_WAKEKILL 128
+#define TASK_DEAD 128
+#define TASK_WAKEKILL 256
/* Convenience macros for the sake of set_task_state */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@ -199,7 +218,8 @@ extern unsigned long long time_sync_thresh;
#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
/* get_task_state() */
-#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
+#define TASK_REPORT (TASK_RUNNING | TASK_RUNNING_MUTEX | \
+ TASK_INTERRUPTIBLE | \
TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
__TASK_TRACED)
@@ -216,6 +236,28 @@ extern unsigned long long time_sync_thresh;
#define set_task_state(tsk, state_value) \
set_mb((tsk)->state, (state_value))
+// #define PREEMPT_DIRECT
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern void nmi_show_all_regs(void);
+#else
+# define nmi_show_all_regs() do { } while (0)
+#endif
+
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/signal.h>
+#include <linux/securebits.h>
+#include <linux/fs_struct.h>
+#include <linux/compiler.h>
+#include <linux/completion.h>
+#include <linux/pid.h>
+#include <linux/percpu.h>
+#include <linux/topology.h>
+#include <linux/seccomp.h>
+
+struct exec_domain;
+
/*
* set_current_state() includes a barrier so that the write of current->state
* is correctly serialised wrt the caller's subsequent test of whether to
@@ -345,6 +387,11 @@ extern signed long schedule_timeout_uninterruptible(signed long timeout);
asmlinkage void __schedule(void);
asmlinkage void schedule(void);
extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
+/*
+ * This one can be called with interrupts disabled, only
+ * to be used by lowlevel arch code!
+ */
+asmlinkage void __sched __schedule(void);
struct nsproxy;
struct user_namespace;
@@ -1173,10 +1220,8 @@ struct task_struct {
int lock_depth; /* BKL lock depth */
#ifdef CONFIG_SMP
-#ifdef __ARCH_WANT_UNLOCKED_CTXSW
int oncpu;
#endif
-#endif
int prio, static_prio, normal_prio;
unsigned int rt_priority;
@@ -1283,6 +1328,8 @@ struct task_struct {
struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
+ struct task_struct* posix_timer_list;
+
/* process credentials */
const struct cred *real_cred; /* objective and real subjective task
* credentials (COW) */
@@ -1386,6 +1433,26 @@ struct task_struct {
gfp_t lockdep_reclaim_gfp;
#endif
+/* realtime bits */
+
+#define MAX_PREEMPT_TRACE 25
+#define MAX_LOCK_STACK MAX_PREEMPT_TRACE
+#ifdef CONFIG_DEBUG_PREEMPT
+ atomic_t lock_count;
+# ifdef CONFIG_PREEMPT_RT
+ struct rt_mutex *owned_lock[MAX_LOCK_STACK];
+# endif
+#endif
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+ unsigned long softlockup_count; /* Count to keep track how long the
+ * thread is in the kernel without
+ * sleeping.
+ */
+#endif
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ void *last_kernel_lock;
+#endif
+
/* journalling filesystem info */
void *journal_info;
@@ -1482,11 +1549,24 @@ struct task_struct {
/* bitmask of trace recursion */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * Temporary hack, until we find a solution to
+ * handle printk in atomic operations.
+ */
+ int in_printk;
+#endif
};
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpumask(tsk) (&(tsk)->cpus_allowed)
+#ifdef CONFIG_PREEMPT_RT
+# define set_printk_might_sleep(x) do { current->in_printk = x; } while(0)
+#else
+# define set_printk_might_sleep(x) do { } while(0)
+#endif
+
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -1655,6 +1735,15 @@ extern struct pid *cad_pid;
extern void free_task(struct task_struct *tsk);
#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
+#ifdef CONFIG_PREEMPT_RT
+extern void __put_task_struct_cb(struct rcu_head *rhp);
+
+static inline void put_task_struct(struct task_struct *t)
+{
+ if (atomic_dec_and_test(&t->usage))
+ call_rcu(&t->rcu, __put_task_struct_cb);
+}
+#else
extern void __put_task_struct(struct task_struct *t);
static inline void put_task_struct(struct task_struct *t)
@@ -1662,6 +1751,7 @@ static inline void put_task_struct(struct task_struct *t)
if (atomic_dec_and_test(&t->usage))
__put_task_struct(t);
}
+#endif
extern cputime_t task_utime(struct task_struct *p);
extern cputime_t task_stime(struct task_struct *p);
@@ -1676,7 +1766,9 @@ extern cputime_t task_gtime(struct task_struct *p);
#define PF_EXITING 0x00000004 /* getting shut down */
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
+#define PF_HARDIRQ 0x00000020 /* hardirq thread */
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
+#define PF_KMAP 0x00000080 /* this context has a kmap */
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* dumped core */
#define PF_SIGNALED 0x00000400 /* killed by a signal */
@@ -1696,6 +1788,7 @@ extern cputime_t task_gtime(struct task_struct *p);
#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */
+#define PF_SOFTIRQ 0x08000000 /* softirq context */
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
@@ -1877,6 +1970,7 @@ extern struct task_struct *curr_task(int cpu);
extern void set_curr_task(int cpu, struct task_struct *p);
void yield(void);
+void __yield(void);
/*
* The default (Linux) execution domain.
@@ -1938,6 +2032,9 @@ extern void do_timer(unsigned long ticks);
extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
+extern int wake_up_process_mutex(struct task_struct *tsk);
+extern int wake_up_process_sync(struct task_struct *tsk);
+extern int wake_up_process_mutex_sync(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk,
unsigned long clone_flags);
#ifdef CONFIG_SMP
@@ -2026,12 +2123,20 @@ extern struct mm_struct * mm_alloc(void);
/* mmdrop drops the mm and the page tables */
extern void __mmdrop(struct mm_struct *);
+extern void __mmdrop_delayed(struct mm_struct *);
+
static inline void mmdrop(struct mm_struct * mm)
{
if (unlikely(atomic_dec_and_test(&mm->mm_count)))
__mmdrop(mm);
}
+static inline void mmdrop_delayed(struct mm_struct * mm)
+{
+ if (atomic_dec_and_test(&mm->mm_count))
+ __mmdrop_delayed(mm);
+}
+
/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
/* Grab a reference to a task's mm, if it is not already going away */
@@ -2305,6 +2410,7 @@ static inline int cond_resched_bkl(void)
{
return _cond_resched();
}
+extern int cond_resched_softirq_context(void);
/*
* Does a critical section need to be broken due to another
@@ -2337,6 +2443,13 @@ static inline void thread_group_cputime_free(struct signal_struct *sig)
{
}
+static inline int softirq_need_resched(void)
+{
+ if (softirq_preemption && (current->flags & PF_SOFTIRQ))
+ return need_resched();
+ return 0;
+}
+
/*
* Reevaluate whether the task has signals pending delivery.
* Wake the task if so.
@@ -2483,7 +2596,14 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
}
#endif /* CONFIG_MM_OWNER */
-#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+#define TASK_STATE_TO_CHAR_STR "RMSDTtZX"
+
+#ifdef CONFIG_SMP
+static inline int task_is_current(struct task_struct *task)
+{
+ return task->oncpu;
+}
+#endif
#endif /* __KERNEL__ */
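
The state bits were renumbered to make room for TASK_RUNNING_MUTEX, which is why TASK_STATE_TO_CHAR_STR grows an 'M' in second position. A sketch of how the one-letter state is derived from the bitmask, in the style of the tracing/proc helpers rather than a copy of them:

#include <linux/sched.h>
#include <linux/bitops.h>

/* "RMSDTtZX": TASK_RUNNING (0) maps to index 0, each single-bit state to bit+1 */
static inline char sketch_task_state_char(unsigned long state)
{
	return TASK_STATE_TO_CHAR_STR[state ? __ffs(state) + 1 : 0];
}

/*
 * sketch_task_state_char(TASK_RUNNING)         == 'R'
 * sketch_task_state_char(TASK_RUNNING_MUTEX)   == 'M'
 * sketch_task_state_char(TASK_UNINTERRUPTIBLE) == 'D'
 * sketch_task_state_char(EXIT_ZOMBIE)          == 'Z'
 */
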
diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h
index b61f2290f8f3..7ba0d2db5e3b 100644
--- a/include/linux/semaphore.h
+++ b/include/linux/semaphore.h
@@ -58,6 +58,46 @@ extern int __must_check anon_down_trylock(struct anon_semaphore *sem);
extern int __must_check anon_down_timeout(struct anon_semaphore *sem, long jiffies);
extern void anon_up(struct anon_semaphore *sem);
+#ifdef CONFIG_PREEMPT_RT
+
+static inline void sema_init(struct semaphore *sem, int val)
+{
+ rt_sema_init(sem, val);
+}
+
+static inline void semaphore_init(struct semaphore *sem)
+{
+ sema_init(sem, 1);
+}
+
+static inline void down(struct semaphore *sem)
+{
+ rt_down(sem);
+}
+
+static inline int __must_check down_interruptible(struct semaphore *sem)
+{
+ return rt_down_interruptible(sem);
+}
+
+static inline int __must_check down_trylock(struct semaphore *sem)
+{
+ return rt_down_trylock(sem);
+}
+
+static inline int __must_check
+down_timeout(struct semaphore *sem, long jiffies)
+{
+ return rt_down_timeout(sem, jiffies);
+}
+
+static inline void up(struct semaphore *sem)
+{
+ rt_up(sem);
+}
+
+
+#else
/*
* Non preempt-rt maps semaphores to anon semaphores
*/
@@ -125,5 +165,6 @@ static inline void up(struct semaphore *sem)
{
anon_up((struct anon_semaphore *)sem);
}
+#endif
#endif /* __LINUX_SEMAPHORE_H */
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index dc64d96aaab8..e4a3f95c3bb7 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -3,9 +3,11 @@
/*
* Reader/writer consistent mechanism without starving writers. This type of
* lock for data where the reader wants a consistent set of information
- * and is willing to retry if the information changes. Readers never
- * block but they may have to retry if a writer is in
- * progress. Writers do not wait for readers.
+ * and is willing to retry if the information changes. Readers block
+ * on write contention (and where applicable, pi-boost the writer).
+ * Readers without contention on entry acquire the critical section
+ * without any atomic operations, but they may have to retry if a writer
+ * enters before the critical section ends. Writers do not wait for readers.
*
* This is not as cache friendly as brlock. Also, this will not work
* for data that contains pointers, because any writer could
@@ -24,6 +26,8 @@
*
* Based on x86_64 vsyscall gettimeofday
* by Keith Owens and Andrea Arcangeli
+ *
+ * Priority inheritance and live-lock avoidance by Gregory Haskins
*/
#include <linux/spinlock.h>
@@ -36,7 +40,7 @@ typedef struct {
typedef struct {
unsigned sequence;
- spinlock_t lock;
+ rwlock_t lock;
} seqlock_t;
/*
@@ -56,7 +60,7 @@ typedef struct {
atomic_seqlock_t x = __ATOMIC_SEQLOCK_UNLOCKED(x)
#define __SEQLOCK_UNLOCKED(lockname) \
- { 0, __SPIN_LOCK_UNLOCKED(lockname) }
+ { 0, __RW_LOCK_UNLOCKED(lockname) }
#define SEQLOCK_UNLOCKED \
__SEQLOCK_UNLOCKED(old_style_seqlock_init)
@@ -64,7 +68,7 @@ typedef struct {
#define seqlock_init(x) \
do { \
(x)->sequence = 0; \
- spin_lock_init(&(x)->lock); \
+ rwlock_init(&(x)->lock); \
} while (0)
#define DEFINE_SEQLOCK(x) \
@@ -83,7 +87,7 @@ static inline void write_atomic_seqlock(atomic_seqlock_t *sl)
static inline void write_seqlock(seqlock_t *sl)
{
- spin_lock(&sl->lock);
+ write_lock(&sl->lock);
++sl->sequence;
smp_wmb();
}
@@ -99,12 +103,12 @@ static inline void write_sequnlock(seqlock_t *sl)
{
smp_wmb();
sl->sequence++;
- spin_unlock(&sl->lock);
+ write_unlock(&sl->lock);
}
static inline int write_tryseqlock(seqlock_t *sl)
{
- int ret = spin_trylock(&sl->lock);
+ int ret = write_trylock(&sl->lock);
if (ret) {
++sl->sequence;
@@ -129,18 +133,26 @@ repeat:
return ret;
}
-static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
+static __always_inline unsigned read_seqbegin(seqlock_t *sl)
{
unsigned ret;
-repeat:
ret = sl->sequence;
smp_rmb();
if (unlikely(ret & 1)) {
cpu_relax();
- goto repeat;
+ /*
+ * Serialize with the writer, which ensures the writer is
+ * pi-boosted if necessary and prevents us from starving
+ * it.
+ */
+ read_lock(&sl->lock);
+ ret = sl->sequence;
+ read_unlock(&sl->lock);
}
+ BUG_ON(ret & 1);
+
return ret;
}
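
The reader-side API is unchanged: the usual retry loop still works, and on contention read_seqbegin() now briefly takes the underlying rwlock so a preempted writer is pi-boosted instead of being spun on. A minimal sketch of the unchanged caller pattern:

#include <linux/seqlock.h>
#include <linux/types.h>

static DEFINE_SEQLOCK(sketch_seq);
static u64 sketch_value;

static void sketch_writer(u64 v)
{
	write_seqlock(&sketch_seq);		/* now the rwlock write side */
	sketch_value = v;
	write_sequnlock(&sketch_seq);
}

static u64 sketch_reader(void)
{
	unsigned seq;
	u64 v;

	do {
		seq = read_seqbegin(&sketch_seq); /* may read_lock() briefly if a write is in flight */
		v = sketch_value;
	} while (read_seqretry(&sketch_seq, seq));

	return v;
}
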
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f2c69a2cca17..313f09d4e357 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -98,6 +98,9 @@ struct pipe_inode_info;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack {
atomic_t use;
+#ifdef CONFIG_PREEMPT_RT
+ struct rcu_head rcu;
+#endif
};
#endif
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 9e3d8af09207..378005121d60 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -50,6 +50,16 @@ extern void smp_send_stop(void);
*/
extern void smp_send_reschedule(int cpu);
+/*
+ * trigger a reschedule on all other CPUs:
+ */
+extern void smp_send_reschedule_allbutself(void);
+
/*
* Prepare machine for booting other CPUs.
@@ -142,6 +152,7 @@ static inline int up_smp_call_function(void (*func)(void *), void *info)
0; \
})
static inline void smp_send_reschedule(int cpu) { }
+static inline void smp_send_reschedule_allbutself(void) { }
#define num_booting_cpus() 1
#define smp_prepare_boot_cpu() do {} while (0)
#define smp_call_function_mask(mask, func, info, wait) \
diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h
index 813be59bf345..0cb3cf9a68ef 100644
--- a/include/linux/smp_lock.h
+++ b/include/linux/smp_lock.h
@@ -45,7 +45,7 @@ static inline void cycle_kernel_lock(void)
#define unlock_kernel() do { } while(0)
#define release_kernel_lock(task) do { } while(0)
#define cycle_kernel_lock() do { } while(0)
-#define reacquire_kernel_lock(task) 0
+#define reacquire_kernel_lock(task) do { } while(0)
#define kernel_locked() 1
#endif /* CONFIG_LOCK_KERNEL */
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 4e84c621713e..b2eb1c9c725b 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -58,23 +58,6 @@
#include <asm/system.h>
/*
- * Must define these before including other files, inline functions need them
- */
-#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME
-
-#define LOCK_SECTION_START(extra) \
- ".subsection 1\n\t" \
- extra \
- ".ifndef " LOCK_SECTION_NAME "\n\t" \
- LOCK_SECTION_NAME ":\n\t" \
- ".endif\n"
-
-#define LOCK_SECTION_END \
- ".previous\n\t"
-
-#define __lockfunc __attribute__((section(".spinlock.text")))
-
-/*
* Pull the raw_spinlock_t and raw_rwlock_t definitions:
*/
#include <linux/spinlock_types.h>
@@ -283,6 +266,98 @@ _atomic_dec_and_atomic_lock(atomic_t *atomic, atomic_spinlock_t *lock);
#define atomic_dec_and_atomic_lock(atomic, lock) \
__cond_lock(lock, _atomic_dec_and_atomic_lock(atomic, lock))
+#ifdef CONFIG_PREEMPT_RT
+
+#include <linux/rt_lock.h>
+
+#define spin_lock(lock) rt_spin_lock(lock)
+#define spin_lock_bh(lock) rt_spin_lock(lock)
+
+#define spin_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock))
+
+#ifdef CONFIG_LOCKDEP
+# define spin_lock_nested(lock, subclass) \
+ rt_spin_lock_nested(lock, subclass)
+
+# define spin_lock_irqsave_nested(lock, flags, subclass) \
+do { \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ rt_spin_lock_nested(lock, subclass); \
+} while (0)
+#else
+# define spin_lock_nested(lock, subclass) \
+ rt_spin_lock(lock)
+
+# define spin_lock_irqsave_nested(lock, flags, subclass) \
+do { \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ rt_spin_lock(lock); \
+} while (0)
+#endif
+
+#define spin_lock_irq(lock) rt_spin_lock(lock)
+
+#define spin_lock_irqsave(lock, flags) \
+do { \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ rt_spin_lock(lock); \
+} while (0)
+
+/* FIXME: we need rt_spin_lock_nested */
+#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
+
+#define spin_unlock(lock) rt_spin_unlock(lock)
+#define spin_unlock_bh(lock) rt_spin_unlock(lock)
+#define spin_unlock_irq(lock) rt_spin_unlock(lock)
+
+#define spin_unlock_irqrestore(lock, flags) \
+do { \
+ typecheck(unsigned long, flags); \
+ (void) flags; \
+ rt_spin_unlock(lock); \
+} while (0)
+
+#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock(lock))
+#define spin_trylock_irq(lock) __cond_lock(lock, rt_spin_trylock(lock))
+
+#define spin_trylock_irqsave(lock, flags) \
+({ \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ __cond_lock(lock, rt_spin_trylock(lock)); \
+})
+
+#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock)
+
+#ifdef CONFIG_GENERIC_LOCKBREAK
+# define spin_is_contended(lock) ((lock)->break_lock)
+#else
+# define spin_is_contended(lock) (((void)(lock), 0))
+#endif
+
+static inline int spin_can_lock(spinlock_t *lock)
+{
+ return !rt_mutex_is_locked(&lock->lock);
+}
+
+static inline int spin_is_locked(spinlock_t *lock)
+{
+ return rt_mutex_is_locked(&lock->lock);
+}
+
+static inline void assert_spin_locked(spinlock_t *lock)
+{
+ BUG_ON(!spin_is_locked(lock));
+}
+
+#define atomic_dec_and_lock(atomic, lock) \
+ atomic_dec_and_spin_lock(atomic, lock)
+
+#else
+
/*
* Map spin* to atomic_spin* for PREEMPT_RT=n
*/
@@ -420,6 +495,8 @@ do { \
atomic_dec_and_atomic_lock(atomic, (atomic_spinlock_t *)lock); \
})
+#endif /* !PREEMPT_RT */
+
/*
* Get the rwlock part
*/
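
One behavioural consequence worth spelling out: on PREEMPT_RT, spin_lock_irqsave() neither disables interrupts nor saves anything (flags is just zeroed), and the critical section may sleep and be preempted. Code that genuinely needs interrupts off has to use the atomic_spinlock_t API this series introduces; the atomic_spin_lock_irqsave()/DEFINE_ATOMIC_SPINLOCK() names below follow that API as it appears elsewhere in the series. A hedged sketch of the difference:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(sketch_lock);		/* rt_mutex-backed on PREEMPT_RT */
static DEFINE_ATOMIC_SPINLOCK(sketch_hw_lock);	/* remains a spinning, irq-disabling lock */

static void sketch_update(void)
{
	unsigned long flags;

	spin_lock_irqsave(&sketch_lock, flags);		/* PREEMPT_RT: flags = 0, IRQs stay on */
	/* may sleep/preempt here; do not touch hardware that requires IRQs off */
	spin_unlock_irqrestore(&sketch_lock, flags);

	atomic_spin_lock_irqsave(&sketch_hw_lock, flags);	/* really disables interrupts */
	/* genuinely atomic section */
	atomic_spin_unlock_irqrestore(&sketch_hw_lock, flags);
}
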
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 52df959a5687..4a9f6e4d31a6 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -58,7 +58,7 @@ void __lockfunc
_atomic_spin_unlock_irqrestore(atomic_spinlock_t *lock, unsigned long flags)
__releases(lock);
-
+#ifndef CONFIG_PREEMPT_RT
void __lockfunc _read_lock(rwlock_t *lock) __acquires(lock);
void __lockfunc _write_lock(rwlock_t *lock) __acquires(lock);
void __lockfunc _read_lock_bh(rwlock_t *lock) __acquires(lock);
@@ -85,5 +85,6 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
__releases(lock);
void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
__releases(lock);
+#endif
#endif /* __LINUX_SPINLOCK_API_SMP_H */
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
index b30d84b5d6a8..a9278a9b5c75 100644
--- a/include/linux/spinlock_types.h
+++ b/include/linux/spinlock_types.h
@@ -9,6 +9,23 @@
* Released under the General Public License (GPL).
*/
+/*
+ * Must define these before including other files, inline functions need them
+ */
+#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME
+
+#define LOCK_SECTION_START(extra) \
+ ".subsection 1\n\t" \
+ extra \
+ ".ifndef " LOCK_SECTION_NAME "\n\t" \
+ LOCK_SECTION_NAME ":\n\t" \
+ ".endif\n"
+
+#define LOCK_SECTION_END \
+ ".previous\n\t"
+
+#define __lockfunc __attribute__((section(".spinlock.text")))
+
#if defined(CONFIG_SMP)
# include <asm/spinlock_types.h>
#else
@@ -17,7 +34,7 @@
#include <linux/lockdep.h>
-typedef struct {
+typedef struct atomic_spinlock {
raw_spinlock_t raw_lock;
#ifdef CONFIG_GENERIC_LOCKBREAK
unsigned int break_lock;
@@ -64,11 +81,12 @@ typedef struct {
#define DEFINE_ATOMIC_SPINLOCK(x) \
atomic_spinlock_t x = __ATOMIC_SPIN_LOCK_UNLOCKED(x)
+#ifndef CONFIG_PREEMPT_RT
/*
* For PREEMPT_RT=n we use the same data structures and the spinlock
* functions are mapped to the atomic_spinlock functions
*/
-typedef struct {
+typedef struct spinlock {
raw_spinlock_t raw_lock;
#ifdef CONFIG_GENERIC_LOCKBREAK
unsigned int break_lock;
@@ -109,4 +127,6 @@ typedef struct {
#include <linux/rwlock_types.h>
+#endif
+
#endif /* __LINUX_SPINLOCK_TYPES_H */
diff --git a/include/linux/timer.h b/include/linux/timer.h
index be62ec2ebea5..0f3c593209a0 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -230,10 +230,12 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
extern void add_timer(struct timer_list *timer);
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS)
+ extern int timer_pending_sync(struct timer_list *timer);
extern int try_to_del_timer_sync(struct timer_list *timer);
extern int del_timer_sync(struct timer_list *timer);
#else
+# define timer_pending_sync(t) timer_pending(t)
# define try_to_del_timer_sync(t) del_timer(t)
# define del_timer_sync(t) del_timer(t)
#endif
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 13e1adf55c4c..3f363b7168c4 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -195,6 +195,9 @@ __create_workqueue_key(const char *name, int singlethread,
#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1, 0)
#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0, 0)
+extern void set_workqueue_prio(struct workqueue_struct *wq, int policy,
+ int rt_priority, int nice);
+
extern void destroy_workqueue(struct workqueue_struct *wq);
extern int queue_work(struct workqueue_struct *wq, struct work_struct *work);
diff --git a/init/Kconfig b/init/Kconfig
index 06712e2359b7..71ac0ea0679b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1041,6 +1041,7 @@ config SLAB
config SLUB
bool "SLUB (Unqueued Allocator)"
+ depends on !PREEMPT_RT
help
SLUB is a slab allocator that minimizes cache line usage
instead of managing queues of cached objects (SLAB approach).
diff --git a/init/Makefile b/init/Makefile
index 4a243df426f7..3f6e894183d0 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -33,4 +33,5 @@ silent_chk_compile.h = :
include/linux/compile.h: FORCE
@$($(quiet)chk_compile.h)
$(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
- "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
+ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT)" \
+ "$(CC) $(KBUILD_CFLAGS)"
diff --git a/init/main.c b/init/main.c
index 252f448d8b9d..a04e1023e97d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -37,6 +37,7 @@
#include <linux/workqueue.h>
#include <linux/profile.h>
#include <linux/rcupdate.h>
+#include <linux/posix-timers.h>
#include <linux/moduleparam.h>
#include <linux/kallsyms.h>
#include <linux/writeback.h>
@@ -451,6 +452,8 @@ static noinline void __init_refok rest_init(void)
{
int pid;
+ system_state = SYSTEM_BOOTING_SCHEDULER_OK;
+
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
@@ -714,6 +717,9 @@ asmlinkage void __init start_kernel(void)
ftrace_init();
+#ifdef CONFIG_PREEMPT_RT
+ WARN_ON(irqs_disabled());
+#endif
/* Do the rest non-__init'ed, we're now alive */
rest_init();
}
@@ -816,9 +822,11 @@ static void __init do_basic_setup(void)
static void __init do_pre_smp_initcalls(void)
{
initcall_t *call;
+ extern int spawn_desched_task(void);
for (call = __initcall_start; call < __early_initcall_end; call++)
do_one_initcall(*call);
+ spawn_desched_task();
}
static void run_init_process(char *init_filename)
@@ -854,6 +862,9 @@ static noinline int init_post(void)
printk(KERN_WARNING "Failed to execute %s\n",
ramdisk_execute_command);
}
+#ifdef CONFIG_PREEMPT_RT
+ WARN_ON(irqs_disabled());
+#endif
/*
* We try each of these until one succeeds.
@@ -920,7 +931,54 @@ static int __init kernel_init(void * unused)
ramdisk_execute_command = NULL;
prepare_namespace();
}
+#ifdef CONFIG_PREEMPT_RT
+ WARN_ON(irqs_disabled());
+#endif
+
+#define DEBUG_COUNT (defined(CONFIG_DEBUG_RT_MUTEXES) + defined(CONFIG_IRQSOFF_TRACER) + defined(CONFIG_PREEMPT_TRACER) + defined(CONFIG_STACK_TRACER) + defined(CONFIG_WAKEUP_LATENCY_HIST) + defined(CONFIG_DEBUG_SLAB) + defined(CONFIG_DEBUG_PAGEALLOC) + defined(CONFIG_LOCKDEP) + (defined(CONFIG_FTRACE) - defined(CONFIG_FTRACE_MCOUNT_RECORD)))
+#if DEBUG_COUNT > 0
+ printk(KERN_ERR "*****************************************************************************\n");
+ printk(KERN_ERR "* *\n");
+#if DEBUG_COUNT == 1
+ printk(KERN_ERR "* REMINDER, the following debugging option is turned on in your .config: *\n");
+#else
+ printk(KERN_ERR "* REMINDER, the following debugging options are turned on in your .config: *\n");
+#endif
+ printk(KERN_ERR "* *\n");
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ printk(KERN_ERR "* CONFIG_DEBUG_RT_MUTEXES *\n");
+#endif
+#ifdef CONFIG_IRQSOFF_TRACER
+ printk(KERN_ERR "* CONFIG_IRQSOFF_TRACER *\n");
+#endif
+#ifdef CONFIG_PREEMPT_TRACER
+ printk(KERN_ERR "* CONFIG_PREEMPT_TRACER *\n");
+#endif
+#ifdef CONFIG_FTRACE
+ printk(KERN_ERR "* CONFIG_FTRACE *\n");
+#endif
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+ printk(KERN_ERR "* CONFIG_WAKEUP_LATENCY_HIST *\n");
+#endif
+#ifdef CONFIG_DEBUG_SLAB
+ printk(KERN_ERR "* CONFIG_DEBUG_SLAB *\n");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk(KERN_ERR "* CONFIG_DEBUG_PAGEALLOC *\n");
+#endif
+#ifdef CONFIG_LOCKDEP
+ printk(KERN_ERR "* CONFIG_LOCKDEP *\n");
+#endif
+ printk(KERN_ERR "* *\n");
+#if DEBUG_COUNT == 1
+ printk(KERN_ERR "* it may increase runtime overhead and latencies. *\n");
+#else
+ printk(KERN_ERR "* they may increase runtime overhead and latencies. *\n");
+#endif
+ printk(KERN_ERR "* *\n");
+ printk(KERN_ERR "*****************************************************************************\n");
+#endif
/*
* Ok, we have completed the initial bootup, and
* we're essentially up and running. Get rid of the
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b356..f4602f8f35d4 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,14 +1,13 @@
-
choice
- prompt "Preemption Model"
- default PREEMPT_NONE
+ prompt "Preemption Mode"
+ default PREEMPT_RT
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
help
- This is the traditional Linux preemption model, geared towards
+ This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
- time, but there are no guarantees and occasional longer delays
+ time, but there are no guarantees and occasional long delays
are possible.
Select this option if you are building a kernel for a server or
@@ -21,7 +20,7 @@ config PREEMPT_VOLUNTARY
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
- preemption points have been selected to reduce the maximum
+ preemption points have been selected to minimize the maximum
latency of rescheduling, providing faster application reactions,
at the cost of slightly lower throughput.
@@ -33,22 +32,91 @@ config PREEMPT_VOLUNTARY
Select this if you are building a kernel for a desktop system.
-config PREEMPT
+config PREEMPT_DESKTOP
bool "Preemptible Kernel (Low-Latency Desktop)"
help
This option reduces the latency of the kernel by making
- all kernel code (that is not executing in a critical section)
+ all kernel code that is not executing in a critical section
preemptible. This allows reaction to interactive events by
permitting a low priority process to be preempted involuntarily
even if it is in kernel mode executing a system call and would
- otherwise not be about to reach a natural preemption point.
- This allows applications to run more 'smoothly' even when the
- system is under load, at the cost of slightly lower throughput
- and a slight runtime overhead to kernel code.
+ otherwise not be about to reach a preemption point. This allows
+ applications to run more 'smoothly' even when the system is
+ under load, at the cost of slightly lower throughput and a
+ slight runtime overhead to kernel code.
+
+ (According to profiles, when this mode is selected, even
+ during kernel-intense workloads the system is in an immediately
+ preemptible state more than 50% of the time.)
Select this if you are building a kernel for a desktop or
embedded system with latency requirements in the milliseconds
range.
+config PREEMPT_RT
+ bool "Complete Preemption (Real-Time)"
+ select PREEMPT_SOFTIRQS
+ select PREEMPT_HARDIRQS
+ select PREEMPT_RCU
+ select RT_MUTEXES
+ help
+ This option further reduces the scheduling latency of the
+ kernel by replacing almost every spinlock used by the kernel
+ with preemptible mutexes and thus making all but the most
+ critical kernel code involuntarily preemptible. The remaining
+ handful of lowlevel non-preemptible codepaths are short and
+ have a deterministic latency of a couple of tens of
+ microseconds (depending on the hardware). This also allows
+ applications to run more 'smoothly' even when the system is
+ under load, at the cost of lower throughput and runtime
+ overhead to kernel code.
+
+ (According to profiles, when this mode is selected, even
+ during kernel-intense workloads the system is in an immediately
+ preemptible state more than 95% of the time.)
+
+ Select this if you are building a kernel for a desktop,
+ embedded or real-time system with guaranteed latency
+ requirements of 100 usecs or lower.
+
endchoice
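One practical consequence of the model described above, as a hedged sketch (the per-CPU counter is hypothetical): with spinlocks turned into sleeping locks, taking a spinlock no longer disables preemption, so per-CPU data must be protected explicitly. This is exactly what the kernel/exit.c and kernel/fork.c hunks further down do for process_counts.

    DEFINE_PER_CPU(unsigned long, example_count);   /* hypothetical per-CPU counter */

    static void bump_example_count(void)
    {
            preempt_disable();                      /* pin to this CPU explicitly */
            __get_cpu_var(example_count)++;
            preempt_enable();
    }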
+config PREEMPT
+ bool
+ default y
+ depends on PREEMPT_DESKTOP || PREEMPT_RT
+
+config PREEMPT_SOFTIRQS
+ bool "Thread Softirqs"
+ default n
+# depends on PREEMPT
+ help
+ This option reduces the latency of the kernel by 'threading'
+ soft interrupts. This means that all softirqs will execute
+ in softirqd's context. While this helps latency, it can also
+ reduce performance.
+
+ The threading of softirqs can also be controlled via the
+ /proc/sys/kernel/softirq_preemption runtime flag and the
+ softirq-preempt=0/1 boot-time option.
+
+ Say N if you are unsure.
+
+config PREEMPT_HARDIRQS
+ bool "Thread Hardirqs"
+ default n
+ depends on GENERIC_HARDIRQS_NO__DO_IRQ
+ select PREEMPT_SOFTIRQS
+ help
+ This option reduces the latency of the kernel by 'threading'
+ hardirqs. This means that all (or selected) hardirqs will run
+ in their own kernel thread context. While this helps latency,
+ this feature can also reduce performance.
+
+ The threading of hardirqs can also be controlled via the
+ /proc/sys/kernel/hardirq_preemption runtime flag and the
+ hardirq-preempt=0/1 boot-time option. Per-irq threading can
+ be enabled/disabled via the /proc/irq/<IRQ>/<handler>/threaded
+ runtime flags.
+
+ Say N if you are unsure.
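A hedged user-space sketch of flipping the runtime flags described above, assuming the sysctl files accept "0"/"1" as stated in the help text (the hardirq_preemption flag works the same way):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/kernel/softirq_preemption", "w");

            if (!f) {
                    perror("softirq_preemption");
                    return 1;
            }
            fputs("1\n", f);        /* 1 = run softirqs in thread context */
            fclose(f);
            return 0;
    }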
diff --git a/kernel/Makefile b/kernel/Makefile
index cbd298929475..1ed7510a4c1a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
sysctl.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
- kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
async.o
@@ -28,7 +28,10 @@ obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
+ifneq ($(CONFIG_PREEMPT_RT),y)
+obj-y += mutex.o
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+endif
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
@@ -40,6 +43,7 @@ endif
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
+obj-$(CONFIG_PREEMPT_RT) += rt.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
ifneq ($(CONFIG_SMP),y)
diff --git a/kernel/exit.c b/kernel/exit.c
index a27d47d3d0c8..4441e623e671 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -67,7 +67,9 @@ static void __unhash_process(struct task_struct *p)
detach_pid(p, PIDTYPE_SID);
list_del_rcu(&p->tasks);
+ preempt_disable();
__get_cpu_var(process_counts)--;
+ preempt_enable();
}
list_del_rcu(&p->thread_group);
list_del_init(&p->sibling);
@@ -685,9 +687,11 @@ static void exit_mm(struct task_struct * tsk)
task_lock(tsk);
tsk->mm = NULL;
up_read(&mm->mmap_sem);
+ preempt_disable(); // FIXME
enter_lazy_tlb(mm, current);
/* We don't want this task to be frozen prematurely */
clear_freeze_flag(tsk);
+ preempt_enable();
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm);
@@ -1009,14 +1013,17 @@ NORET_TYPE void do_exit(long code)
if (tsk->splice_pipe)
__free_pipe_info(tsk->splice_pipe);
- preempt_disable();
+again:
+ local_irq_disable();
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
- schedule();
- BUG();
- /* Avoid "noreturn function does return". */
- for (;;)
- cpu_relax(); /* For when BUG is null */
+ __schedule();
+ printk(KERN_ERR "BUG: dead task %s:%d back from the grave!\n",
+ current->comm, current->pid);
+ printk(KERN_ERR ".... flags: %08x, count: %d, state: %08lx\n",
+ current->flags, atomic_read(&current->usage), current->state);
+ printk(KERN_ERR ".... trying again ...\n");
+ goto again;
}
EXPORT_SYMBOL_GPL(do_exit);
@@ -1476,6 +1483,9 @@ static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
int ptrace, struct task_struct *p)
{
int ret = eligible_child(wo, p);
+
+ BUG_ON(!atomic_read(&p->usage));
+
if (!ret)
return ret;
diff --git a/kernel/fork.c b/kernel/fork.c
index eed403c582e1..ca4486cb3973 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -38,6 +38,7 @@
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/tracehook.h>
+#include <linux/interrupt.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/task_io_accounting_ops.h>
@@ -48,6 +49,8 @@
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/profile.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
#include <linux/rmap.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
@@ -82,7 +85,19 @@ int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
+#ifdef CONFIG_PREEMPT_RT
+DEFINE_RWLOCK(tasklist_lock); /* outer */
+#else
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
+#endif
+
+/*
+ * Delayed mmdrop. In the PREEMPT_RT case we
+ * don't want to do this from the scheduling
+ * context.
+ */
+static DEFINE_PER_CPU(struct task_struct *, desched_task);
+static DEFINE_PER_CPU(struct list_head, delayed_drop_list);
int nr_processes(void)
{
@@ -160,6 +175,16 @@ void __put_task_struct(struct task_struct *tsk)
free_task(tsk);
}
+#ifdef CONFIG_PREEMPT_RT
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+ struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+ __put_task_struct(tsk);
+}
+#endif
+
/*
* macro override instead of weak attribute alias, to workaround
* gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
@@ -170,6 +195,8 @@ void __put_task_struct(struct task_struct *tsk)
void __init fork_init(unsigned long mempages)
{
+ int i;
+
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
@@ -200,6 +227,9 @@ void __init fork_init(unsigned long mempages)
init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
+
+ for (i = 0; i < NR_CPUS; i++)
+ INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i));
}
int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
@@ -281,6 +311,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mm->locked_vm = 0;
mm->mmap = NULL;
mm->mmap_cache = NULL;
+ INIT_LIST_HEAD(&mm->delayed_drop);
mm->free_area_cache = oldmm->mmap_base;
mm->cached_hole_size = ~0UL;
mm->map_count = 0;
@@ -912,6 +943,9 @@ static void rt_mutex_init_task(struct task_struct *p)
#ifdef CONFIG_RT_MUTEXES
plist_head_init_atomic(&p->pi_waiters, &p->pi_lock);
p->pi_blocked_on = NULL;
+# ifdef CONFIG_DEBUG_RT_MUTEXES
+ p->last_kernel_lock = NULL;
+# endif
#endif
}
@@ -1046,7 +1080,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
acct_clear_integrals(p);
posix_cpu_timers_init(p);
-
+ p->posix_timer_list = NULL;
p->lock_depth = -1; /* -1 = no lock */
do_posix_clock_monotonic_gettime(&p->start_time);
p->real_start_time = p->start_time;
@@ -1124,6 +1158,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_io;
+#ifdef CONFIG_DEBUG_PREEMPT
+ atomic_set(&p->lock_count, 0);
+#endif
if (pid != &init_struct_pid) {
retval = -ENOMEM;
@@ -1209,11 +1246,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* to ensure it is on a valid CPU (and if not, just force it back to
* parent's CPU). This avoids alot of nasty races.
*/
+ preempt_disable();
p->cpus_allowed = current->cpus_allowed;
p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
!cpu_online(task_cpu(p))))
set_task_cpu(p, smp_processor_id());
+ preempt_enable();
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
@@ -1261,7 +1300,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
attach_pid(p, PIDTYPE_SID, task_session(current));
list_add_tail_rcu(&p->tasks, &init_task.tasks);
+ preempt_disable();
__get_cpu_var(process_counts)++;
+ preempt_enable();
}
attach_pid(p, PIDTYPE_PID, pid);
nr_threads++;
@@ -1737,3 +1778,138 @@ int unshare_files(struct files_struct **displaced)
task_unlock(task);
return 0;
}
+
+static int mmdrop_complete(void)
+{
+ struct list_head *head;
+ int ret = 0;
+
+ head = &get_cpu_var(delayed_drop_list);
+ while (!list_empty(head)) {
+ struct mm_struct *mm = list_entry(head->next,
+ struct mm_struct, delayed_drop);
+ list_del(&mm->delayed_drop);
+ put_cpu_var(delayed_drop_list);
+
+ __mmdrop(mm);
+ ret = 1;
+
+ head = &get_cpu_var(delayed_drop_list);
+ }
+ put_cpu_var(delayed_drop_list);
+
+ return ret;
+}
+
+/*
+ * We don't want to do complex work from the scheduler, thus
+ * we delay the work to a per-CPU worker thread:
+ */
+void __mmdrop_delayed(struct mm_struct *mm)
+{
+ struct task_struct *desched_task;
+ struct list_head *head;
+
+ head = &get_cpu_var(delayed_drop_list);
+ list_add_tail(&mm->delayed_drop, head);
+ desched_task = __get_cpu_var(desched_task);
+ if (desched_task)
+ wake_up_process(desched_task);
+ put_cpu_var(delayed_drop_list);
+}
+
+static void takeover_delayed_drop(int hotcpu)
+{
+ struct list_head *head = &per_cpu(delayed_drop_list, hotcpu);
+
+ while (!list_empty(head)) {
+ struct mm_struct *mm = list_entry(head->next,
+ struct mm_struct, delayed_drop);
+
+ list_del(&mm->delayed_drop);
+ __mmdrop_delayed(mm);
+ }
+}
+
+static int desched_thread(void * __bind_cpu)
+{
+ set_user_nice(current, -10);
+ current->flags |= PF_NOFREEZE | PF_SOFTIRQ;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+
+ if (mmdrop_complete())
+ continue;
+ schedule();
+
+ /*
+ * This must be called from time to time on ia64, and is a
+ * no-op on other archs. Used to be in cpu_idle(), but with
+ * the new -rt semantics it can't stay there.
+ */
+ check_pgt_cache();
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+
+static int __devinit cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int hotcpu = (unsigned long)hcpu;
+ struct task_struct *p;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+
+ BUG_ON(per_cpu(desched_task, hotcpu));
+ INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu));
+ p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu);
+ if (IS_ERR(p)) {
+ printk("desched_thread for %i failed\n", hotcpu);
+ return NOTIFY_BAD;
+ }
+ per_cpu(desched_task, hotcpu) = p;
+ kthread_bind(p, hotcpu);
+ break;
+ case CPU_ONLINE:
+
+ wake_up_process(per_cpu(desched_task, hotcpu));
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+
+ /* Unbind so it can run. Fall thru. */
+ kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id());
+ case CPU_DEAD:
+
+ p = per_cpu(desched_task, hotcpu);
+ per_cpu(desched_task, hotcpu) = NULL;
+ kthread_stop(p);
+ takeover_delayed_drop(hotcpu);
+ takeover_tasklets(hotcpu);
+ break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+ .notifier_call = cpu_callback
+};
+
+__init int spawn_desched_task(void)
+{
+ void *cpu = (void *)(long)smp_processor_id();
+
+ cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+ register_cpu_notifier(&cpu_nfb);
+ return 0;
+}
+
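A minimal sketch of how the delayed-drop machinery above is meant to be driven, assuming the usual mm_count refcounting; the mmdrop_delayed() wrapper name is not shown in this hunk and is used here only for illustration:

    /* Drop an mm reference without doing the heavy teardown in atomic context. */
    static inline void mmdrop_delayed(struct mm_struct *mm)
    {
            if (atomic_dec_and_test(&mm->mm_count))
                    __mmdrop_delayed(mm);   /* queue it for the per-CPU desched thread */
    }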
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 82636dfd71cf..875327a43ad4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -477,9 +477,9 @@ static inline int hrtimer_is_hres_enabled(void)
/*
* Is the high resolution mode active ?
*/
-static inline int hrtimer_hres_active(void)
+static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
- return __get_cpu_var(hrtimer_bases).hres_active;
+ return cpu_base->hres_active;
}
/*
@@ -539,8 +539,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
* When the callback is running, we do not reprogram the clock event
* device. The timer callback is either running on a different CPU or
* the callback is executed in the hrtimer_interrupt context. The
- * reprogramming is handled either by the softirq, which called the
- * callback or at the end of the hrtimer_interrupt.
+ * reprogramming is handled at the end of the hrtimer_interrupt.
*/
if (hrtimer_callback_running(timer))
return 0;
@@ -574,11 +573,11 @@ static int hrtimer_reprogram(struct hrtimer *timer,
*/
static void retrigger_next_event(void *arg)
{
- struct hrtimer_cpu_base *base;
+ struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
struct timespec realtime_offset;
unsigned long seq;
- if (!hrtimer_hres_active())
+ if (!hrtimer_hres_active(base))
return;
do {
@@ -588,8 +587,6 @@ static void retrigger_next_event(void *arg)
-wall_to_monotonic.tv_nsec);
} while (read_atomic_seqretry(&xtime_lock, seq));
- base = &__get_cpu_var(hrtimer_bases);
-
/* Adjust CLOCK_REALTIME offset */
atomic_spin_lock(&base->lock);
base->clock_base[CLOCK_REALTIME].offset =
@@ -644,6 +641,8 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
{
}
+static void __run_hrtimer(struct hrtimer *timer);
+static int hrtimer_rt_defer(struct hrtimer *timer);
/*
* When High resolution timers are active, try to reprogram. Note, that in case
@@ -655,7 +654,27 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
struct hrtimer_clock_base *base,
int wakeup)
{
+#ifdef CONFIG_PREEMPT_RT
+again:
+#endif
if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * Move softirq based timers away from the rbtree in
+ * case it expired already. Otherwise we would have a
+ * stale base->first entry until the softirq runs.
+ */
+ if (!hrtimer_rt_defer(timer)) {
+ __run_hrtimer(timer);
+ /*
+ * __run_hrtimer might have requeued timer and
+ * it could be base->first again.
+ */
+ if (base->first == &timer->node)
+ goto again;
+ return 1;
+ }
+#endif
if (wakeup) {
atomic_spin_unlock(&base->cpu_base->lock);
raise_softirq_irqoff(HRTIMER_SOFTIRQ);
@@ -672,10 +691,8 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
/*
* Switch to high resolution mode
*/
-static int hrtimer_switch_to_hres(void)
+static int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base)
{
- int cpu = smp_processor_id();
- struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
unsigned long flags;
if (base->hres_active)
@@ -686,7 +703,7 @@ static int hrtimer_switch_to_hres(void)
if (tick_init_highres()) {
local_irq_restore(flags);
printk(KERN_WARNING "Could not switch to high resolution "
- "mode on CPU %d\n", cpu);
+ "mode on CPU %d\n", raw_smp_processor_id());
return 0;
}
base->hres_active = 1;
@@ -698,16 +715,20 @@ static int hrtimer_switch_to_hres(void)
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
local_irq_restore(flags);
- printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n",
- smp_processor_id());
return 1;
}
#else
-static inline int hrtimer_hres_active(void) { return 0; }
+static inline int hrtimer_hres_active(struct hrtimer_cpu_base *base)
+{
+ return 0;
+}
static inline int hrtimer_is_hres_enabled(void) { return 0; }
-static inline int hrtimer_switch_to_hres(void) { return 0; }
+static inline int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base)
+{
+ return 0;
+}
static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
struct hrtimer_clock_base *base,
@@ -715,6 +736,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
{
return 0;
}
+
+static inline int hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ return 0;
+}
+
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
@@ -837,6 +865,32 @@ static int enqueue_hrtimer(struct hrtimer *timer,
return leftmost;
}
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+# define wake_up_timer_waiters(b) wake_up(&(b)->wait)
+
+/**
+ * hrtimer_wait_for_timer - Wait for a running timer
+ *
+ * @timer: timer to wait for
+ *
+ * If the timer's callback function is currently executing, the
+ * function waits on the waitqueue of the timer base. The
+ * waitqueue is woken up after the timer callback function has
+ * finished execution.
+ */
+void hrtimer_wait_for_timer(const struct hrtimer *timer)
+{
+ struct hrtimer_clock_base *base = timer->base;
+
+ if (base && base->cpu_base && !timer->irqsafe)
+ wait_event(base->cpu_base->wait,
+ !(timer->state & HRTIMER_STATE_CALLBACK));
+}
+
+#else
+# define wake_up_timer_waiters(b) do { } while (0)
+#endif
+
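The wait helper above replaces busy-waiting in cancellation paths; a minimal sketch of the resulting pattern, mirroring the hrtimer_cancel() change further down:

    for (;;) {
            int ret = hrtimer_try_to_cancel(timer);

            if (ret >= 0)
                    break;                          /* timer inactive or removed */
            hrtimer_wait_for_timer(timer);          /* callback running: sleep, don't spin */
    }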
/*
* __remove_hrtimer - internal function to remove a timer
*
@@ -852,6 +906,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
unsigned long newstate, int reprogram)
{
if (timer->state & HRTIMER_STATE_ENQUEUED) {
+
+ if (unlikely(!list_empty(&timer->cb_entry))) {
+ list_del_init(&timer->cb_entry);
+ goto out;
+ }
/*
* Remove the timer from the rbtree and replace the
* first entry pointer if necessary.
@@ -859,11 +918,12 @@ static void __remove_hrtimer(struct hrtimer *timer,
if (base->first == &timer->node) {
base->first = rb_next(&timer->node);
/* Reprogram the clock event device. if enabled */
- if (reprogram && hrtimer_hres_active())
+ if (reprogram && hrtimer_hres_active(base->cpu_base))
hrtimer_force_reprogram(base->cpu_base);
}
rb_erase(&timer->node, &base->active);
}
+out:
timer->state = newstate;
}
@@ -1023,7 +1083,7 @@ int hrtimer_cancel(struct hrtimer *timer)
if (ret >= 0)
return ret;
- cpu_relax();
+ hrtimer_wait_for_timer(timer);
}
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);
@@ -1063,7 +1123,7 @@ ktime_t hrtimer_get_next_event(void)
atomic_spin_lock_irqsave(&cpu_base->lock, flags);
- if (!hrtimer_hres_active()) {
+ if (!hrtimer_hres_active(cpu_base)) {
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
struct hrtimer *timer;
@@ -1177,6 +1237,115 @@ static void __run_hrtimer(struct hrtimer *timer)
timer->state &= ~HRTIMER_STATE_CALLBACK;
}
+#ifdef CONFIG_PREEMPT_RT
+
+static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ /*
+ * Note, we clear the callback flag before we requeue the
+ * timer; otherwise we trigger the callback_running() check
+ * in hrtimer_reprogram().
+ */
+ timer->state &= ~HRTIMER_STATE_CALLBACK;
+
+ if (restart != HRTIMER_NORESTART) {
+ BUG_ON(hrtimer_active(timer));
+ /*
+ * Enqueue the timer, if it's the leftmost timer then
+ * we need to reprogram it.
+ */
+ if (!enqueue_hrtimer(timer, base))
+ return;
+
+ if (hrtimer_reprogram(timer, base))
+ goto requeue;
+
+ } else if (hrtimer_active(timer)) {
+ /*
+ * If the timer was rearmed on another CPU, reprogram
+ * the event device.
+ */
+ if (base->first == &timer->node &&
+ hrtimer_reprogram(timer, base))
+ goto requeue;
+ }
+ return;
+
+requeue:
+ /*
+ * Timer is expired. Thus move it from tree to pending list
+ * again.
+ */
+ __remove_hrtimer(timer, base, timer->state, 0);
+ list_add_tail(&timer->cb_entry, &base->expired);
+}
+
+/*
+ * The changes in mainline which removed the callback modes from
+ * hrtimer are not yet working with -rt. Callbacks that are not
+ * wakeup_process() based and that involve sleeping locks need to
+ * be treated separately.
+ */
+static void hrtimer_rt_run_pending(void)
+{
+ enum hrtimer_restart (*fn)(struct hrtimer *);
+ struct hrtimer_cpu_base *cpu_base;
+ struct hrtimer_clock_base *base;
+ struct hrtimer *timer;
+ int index, restart;
+
+ local_irq_disable();
+ cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
+
+ atomic_spin_lock(&cpu_base->lock);
+
+ for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
+ base = &cpu_base->clock_base[index];
+
+ while (!list_empty(&base->expired)) {
+ timer = list_first_entry(&base->expired,
+ struct hrtimer, cb_entry);
+
+ /*
+ * Same as the __run_hrtimer function above,
+ * except that we run with interrupts enabled.
+ */
+ debug_hrtimer_deactivate(timer);
+ __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ timer_stats_account_hrtimer(timer);
+ fn = timer->function;
+
+ atomic_spin_unlock_irq(&cpu_base->lock);
+ restart = fn(timer);
+ atomic_spin_lock_irq(&cpu_base->lock);
+
+ hrtimer_rt_reprogram(restart, timer, base);
+ }
+ }
+
+ atomic_spin_unlock_irq(&cpu_base->lock);
+
+ wake_up_timer_waiters(cpu_base);
+}
+
+static int hrtimer_rt_defer(struct hrtimer *timer)
+{
+ if (timer->irqsafe)
+ return 0;
+
+ __remove_hrtimer(timer, timer->base, timer->state, 0);
+ list_add_tail(&timer->cb_entry, &timer->base->expired);
+ return 1;
+}
+
+#else
+
+static inline void hrtimer_rt_run_pending(void) { }
+static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
+
+#endif
+
#ifdef CONFIG_HIGH_RES_TIMERS
static int force_clock_reprogram;
@@ -1212,7 +1381,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
struct hrtimer_clock_base *base;
ktime_t expires_next, now;
int nr_retries = 0;
- int i;
+ int i, raise = 0;
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
@@ -1273,7 +1442,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
break;
}
- __run_hrtimer(timer);
+ if (!hrtimer_rt_defer(timer))
+ __run_hrtimer(timer);
+ else
+ raise = 1;
}
base++;
}
@@ -1290,6 +1462,9 @@ void hrtimer_interrupt(struct clock_event_device *dev)
if (tick_program_event(expires_next, force_clock_reprogram))
goto retry;
}
+
+ if (raise)
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
}
/*
@@ -1298,9 +1473,11 @@ void hrtimer_interrupt(struct clock_event_device *dev)
*/
static void __hrtimer_peek_ahead_timers(void)
{
+ struct hrtimer_cpu_base *cpu_base;
struct tick_device *td;
- if (!hrtimer_hres_active())
+ cpu_base = &__get_cpu_var(hrtimer_bases);
+ if (!hrtimer_hres_active(cpu_base))
return;
td = &__get_cpu_var(tick_cpu_device);
@@ -1326,17 +1503,17 @@ void hrtimer_peek_ahead_timers(void)
local_irq_restore(flags);
}
-static void run_hrtimer_softirq(struct softirq_action *h)
-{
- hrtimer_peek_ahead_timers();
-}
-
#else /* CONFIG_HIGH_RES_TIMERS */
static inline void __hrtimer_peek_ahead_timers(void) { }
#endif /* !CONFIG_HIGH_RES_TIMERS */
+static void run_hrtimer_softirq(struct softirq_action *h)
+{
+ hrtimer_rt_run_pending();
+}
+
/*
* Called from timer softirq every jiffy, expire hrtimers:
*
@@ -1346,7 +1523,9 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
*/
void hrtimer_run_pending(void)
{
- if (hrtimer_hres_active())
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+
+ if (hrtimer_hres_active(cpu_base))
return;
/*
@@ -1358,7 +1537,7 @@ void hrtimer_run_pending(void)
* deadlock vs. xtime_lock.
*/
if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
- hrtimer_switch_to_hres();
+ hrtimer_switch_to_hres(cpu_base);
}
/*
@@ -1367,11 +1546,12 @@ void hrtimer_run_pending(void)
void hrtimer_run_queues(void)
{
struct rb_node *node;
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_cpu_base *cpu_base;
struct hrtimer_clock_base *base;
- int index, gettime = 1;
+ int index, gettime = 1, raise = 0;
- if (hrtimer_hres_active())
+ cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id());
+ if (hrtimer_hres_active(cpu_base))
return;
for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
@@ -1395,10 +1575,16 @@ void hrtimer_run_queues(void)
hrtimer_get_expires_tv64(timer))
break;
- __run_hrtimer(timer);
+ if (!hrtimer_rt_defer(timer))
+ __run_hrtimer(timer);
+ else
+ raise = 1;
}
atomic_spin_unlock(&cpu_base->lock);
}
+
+ if (raise)
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
}
/*
@@ -1420,6 +1606,7 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
{
sl->timer.function = hrtimer_wakeup;
+ sl->timer.irqsafe = 1;
sl->task = task;
}
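A hedged usage sketch of the irqsafe flag set above (timer and callback names are illustrative): a callback that takes no sleeping locks can opt out of the softirq deferral and keep running directly from hrtimer_interrupt() even on -rt.

    static struct hrtimer example_timer;

    static enum hrtimer_restart example_fn(struct hrtimer *t)
    {
            /* must not take sleeping locks: may run in hard irq context */
            return HRTIMER_NORESTART;
    }

    static void example_arm(void)
    {
            hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            example_timer.function = example_fn;
            example_timer.irqsafe = 1;              /* skip hrtimer_rt_defer() */
            hrtimer_start(&example_timer, ns_to_ktime(100 * NSEC_PER_USEC),
                          HRTIMER_MODE_REL);
    }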
@@ -1554,10 +1741,15 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
atomic_spin_lock_init(&cpu_base->lock);
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
cpu_base->clock_base[i].cpu_base = cpu_base;
+ INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
+ }
hrtimer_init_hres(cpu_base);
+#ifdef CONFIG_PREEMPT_RT
+ init_waitqueue_head(&cpu_base->wait);
+#endif
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1670,9 +1862,7 @@ void __init hrtimers_init(void)
hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
register_cpu_notifier(&hrtimers_nb);
-#ifdef CONFIG_HIGH_RES_TIMERS
open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
-#endif
}
/**
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 474657a806d7..3f8f04f4eb8e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -247,6 +247,7 @@ static unsigned int default_startup(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
+ desc->status &= ~IRQ_MASKED;
desc->chip->enable(irq);
return 0;
}
@@ -382,7 +383,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
atomic_spin_lock(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
- if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+ if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask &&
+ !desc->forced_threads_active)
desc->chip->unmask(irq);
out_unlock:
atomic_spin_unlock(&desc->lock);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3d5e31a45650..f05aa88e6702 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -373,7 +373,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
do {
trace_irq_handler_entry(irq, action);
- ret = action->handler(irq, action->dev_id);
+ ret = handle_irq_action(irq, action);
trace_irq_handler_exit(irq, action, ret);
switch (ret) {
@@ -420,8 +420,11 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
action = action->next;
} while (action);
+#ifndef CONFIG_PREEMPT_RT
+ /* FIXME: Can we unbreak that? */
if (status & IRQF_SAMPLE_RANDOM)
add_interrupt_randomness(irq);
+#endif
local_irq_disable();
return retval;
@@ -450,6 +453,11 @@ unsigned int __do_IRQ(unsigned int irq)
struct irqaction *action;
unsigned int status;
+#ifdef CONFIG_PREEMPT_RT
+ printk(KERN_WARNING "__do_IRQ called for irq %d. "
+ "PREEMPT_RT will crash your system soon\n", irq);
+ printk(KERN_WARNING "I hope you have a fire-extinguisher handy!\n");
+#endif
kstat_incr_irqs_this_cpu(irq, desc);
if (CHECK_IRQ_PER_CPU(desc->status)) {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index d53fa526665b..7370290d2dae 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -44,6 +44,16 @@ extern int irq_select_affinity_usr(unsigned int irq);
extern void irq_set_thread_affinity(struct irq_desc *desc);
+#ifdef CONFIG_PREEMPT_HARDIRQS
+extern irqreturn_t handle_irq_action(unsigned int irq,struct irqaction *action);
+#else
+static inline irqreturn_t
+handle_irq_action(unsigned int irq, struct irqaction *action)
+{
+ return action->handler(irq, action->dev_id);
+}
+#endif
+
/*
* Debugging printout:
*/
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 99a598e850c9..e4a6ba9786bf 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -278,7 +278,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
goto err_out;
/* Prevent probing on this irq: */
desc->status = status | IRQ_NOPROBE;
- check_irq_resend(desc, irq);
+ if (!desc->forced_threads_active)
+ check_irq_resend(desc, irq);
/* fall-through */
}
default:
@@ -436,7 +437,127 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
return ret;
}
-static int irq_wait_for_interrupt(struct irqaction *action)
+#ifdef CONFIG_PREEMPT_HARDIRQS
+/*
+ * handler function for forced irq threading. Dummy code. See
+ * handle_irq_action() below.
+ */
+static irqreturn_t preempt_hardirq_handler(int irq, void *dev_id)
+{
+ return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Momentary workaround until I have a brighter idea of how to handle the
+ * accounting of forced thread handlers.
+ */
+irqreturn_t handle_irq_action(unsigned int irq, struct irqaction *action)
+{
+ if (action->handler == preempt_hardirq_handler) {
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ atomic_spin_lock_irqsave(&desc->lock, flags);
+
+ /* FIXME: use some flag to do that */
+ if (desc->handle_irq == handle_fasteoi_irq) {
+ if (desc->chip->mask)
+ desc->chip->mask(irq);
+ }
+ desc->forced_threads_active |= action->thread_mask;
+ atomic_spin_unlock_irqrestore(&desc->lock, flags);
+ return IRQ_WAKE_THREAD;
+ }
+ return action->handler(irq, action->dev_id);
+}
+
+/*
+ * forced threaded interrupts need to unmask the interrupt line
+ */
+static int preempt_hardirq_thread_done(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ unsigned long masked;
+
+ if (action->handler != preempt_hardirq_handler)
+ return 0;
+again:
+ atomic_spin_lock_irq(&desc->lock);
+ /*
+ * Be careful. The hardirq handler might be running on the
+ * other CPU.
+ */
+ if (desc->status & IRQ_INPROGRESS) {
+ atomic_spin_unlock_irq(&desc->lock);
+ cpu_relax();
+ goto again;
+ }
+
+ /*
+ * Now check again whether the thread should run. Otherwise
+ * we would clear the forced_threads_active bit which was just
+ * set.
+ */
+ if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) {
+ atomic_spin_unlock_irq(&desc->lock);
+ return 1;
+ }
+
+ masked = desc->forced_threads_active;
+ desc->forced_threads_active &= ~action->thread_mask;
+
+ /*
+ * Unmask the interrupt line when this is the last active
+ * thread and the interrupt is not disabled.
+ */
+ if (masked && !desc->forced_threads_active &&
+ !(desc->status & IRQ_DISABLED)) {
+ if (desc->chip->unmask)
+ desc->chip->unmask(action->irq);
+ /*
+ * Do we need to call check_irq_resend() here?
+ * No. check_irq_resend() only needs to be called when
+ * we go from the IRQ_DISABLED to the IRQ_ENABLED state.
+ */
+ }
+ atomic_spin_unlock_irq(&desc->lock);
+ return 0;
+}
+
+/*
+ * If the caller does not request irq threading then the handler
+ * becomes the thread function and we use the above handler as the
+ * primary hardirq context handler.
+ */
+static void preempt_hardirq_setup(struct irqaction *new)
+{
+ if (new->thread_fn || (new->flags & IRQF_NODELAY))
+ return;
+
+ new->thread_fn = new->handler;
+ new->handler = preempt_hardirq_handler;
+}
+
+static inline void
+preempt_hardirq_cleanup(struct irq_desc *desc, struct irqaction *action)
+{
+ clear_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+ preempt_hardirq_thread_done(desc, action);
+}
+
+#else
+static inline void preempt_hardirq_setup(struct irqaction *new) { }
+static inline int
+preempt_hardirq_thread_done(struct irq_desc *d, struct irqaction *a)
+{
+ return 0;
+}
+static inline void
+preempt_hardirq_cleanup(struct irq_desc *d, struct irqaction *a) { }
+#endif
+
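What the setup helper above means for an ordinary driver, as a hedged sketch (driver names are made up): a plain request_irq() handler becomes the thread function, while preempt_hardirq_handler() stands in as the hard-irq handler.

    static irqreturn_t foo_isr(int irq, void *dev_id)
    {
            /* with forced threading this runs in the per-IRQ kernel thread */
            return IRQ_HANDLED;
    }

    static int foo_setup_irq(int irq, void *foo_dev)
    {
            /*
             * Plain request: no thread_fn, no IRQF_NODELAY. After
             * preempt_hardirq_setup() the registered irqaction has
             *   action->handler   == preempt_hardirq_handler
             *   action->thread_fn == foo_isr
             */
            return request_irq(irq, foo_isr, IRQF_SHARED, "foo", foo_dev);
    }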
+static int
+irq_wait_for_interrupt(struct irq_desc *desc, struct irqaction *action)
{
while (!kthread_should_stop()) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -446,7 +567,8 @@ static int irq_wait_for_interrupt(struct irqaction *action)
__set_current_state(TASK_RUNNING);
return 0;
}
- schedule();
+ if (!preempt_hardirq_thread_done(desc, action))
+ schedule();
}
return -1;
}
@@ -495,9 +617,10 @@ static int irq_thread(void *data)
int wake;
sched_setscheduler(current, SCHED_FIFO, &param);
+ current->flags |= PF_HARDIRQ;
current->irqaction = action;
- while (!irq_wait_for_interrupt(action)) {
+ while (!irq_wait_for_interrupt(desc, action)) {
irq_thread_check_affinity(desc, action);
@@ -526,6 +649,8 @@ static int irq_thread(void *data)
wake_up(&desc->wait_for_threads);
}
+ preempt_hardirq_cleanup(desc, action);
+
/*
* Clear irqaction. Otherwise exit_irq_thread() would make
* fuzz about an active irq thread going into nirvana.
@@ -564,7 +689,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
struct irqaction *old, **old_ptr;
const char *old_name = NULL;
- unsigned long flags;
+ unsigned long flags, thread_mask = 0;
int shared = 0;
int ret;
@@ -590,6 +715,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
rand_initialize_irq(irq);
}
+ /* Preempt-RT setup for forced threading */
+ preempt_hardirq_setup(new);
+
/*
* Threaded handler ?
*/
@@ -607,7 +735,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
get_task_struct(t);
new->thread = t;
- wake_up_process(t);
}
/*
@@ -638,12 +765,20 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
/* add new interrupt at end of irq queue */
do {
+ thread_mask |= old->thread_mask;
old_ptr = &old->next;
old = *old_ptr;
} while (old);
shared = 1;
}
+ /*
+ * Set up the thread mask for this irqaction. No risk that ffz
+ * will fail. If we have 32 resp. 64 devices sharing one irq
+ * then .....
+ */
+ new->thread_mask = 1 << ffz(thread_mask);
+
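A hedged worked example of the bit allocation above, as a user-space toy where __builtin_ctzl() stands in for the kernel's ffz():

    #include <stdio.h>

    static unsigned long ffz_demo(unsigned long x)
    {
            return __builtin_ctzl(~x);      /* position of the first zero bit */
    }

    int main(void)
    {
            unsigned long thread_mask = 0;
            int i;

            for (i = 0; i < 3; i++) {
                    unsigned long bit = 1UL << ffz_demo(thread_mask);

                    printf("shared action %d gets thread_mask %#lx\n", i, bit);
                    thread_mask |= bit;     /* prints 0x1, 0x2, 0x4 */
            }
            return 0;
    }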
if (!shared) {
irq_chip_set_defaults(desc->chip);
@@ -712,6 +847,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->dir = NULL;
register_handler_proc(irq, new);
+ if (new->thread)
+ wake_up_process(new->thread);
+
return 0;
mismatch:
@@ -975,7 +1113,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
kfree(action);
#ifdef CONFIG_DEBUG_SHIRQ
- if (irqflags & IRQF_SHARED) {
+ if (!retval && (irqflags & IRQF_SHARED)) {
/*
* It's a shared IRQ -- the driver ought to be prepared for it
* to happen immediately, so let's make sure....
@@ -983,13 +1121,18 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
* run in parallel with our fake.
*/
unsigned long flags;
+ irqreturn_t ret;
disable_irq(irq);
local_irq_save(flags);
- handler(irq, dev_id);
+ ret = action->handler(irq, dev_id);
local_irq_restore(flags);
+
+ if (ret == IRQ_WAKE_THREAD)
+ action->thread_fn(irq, dev_id);
+
enable_irq(irq);
}
#endif
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index b1de750f91f2..1d9ff654c5bf 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -54,6 +54,7 @@ void move_masked_irq(int irq)
void move_native_irq(int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
+ int mask = 1;
if (likely(!(desc->status & IRQ_MOVE_PENDING)))
return;
@@ -61,8 +62,18 @@ void move_native_irq(int irq)
if (unlikely(desc->status & IRQ_DISABLED))
return;
- desc->chip->mask(irq);
+ /*
+ * If the irq is already in progress, it should be masked.
+ * If we unmask it, we might cause an interrupt storm on RT.
+ */
+ if (unlikely(desc->status & IRQ_INPROGRESS ||
+ desc->forced_threads_active))
+ mask = 0;
+
+ if (mask)
+ desc->chip->mask(irq);
move_masked_irq(irq);
- desc->chip->unmask(irq);
+ if (mask)
+ desc->chip->unmask(irq);
}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 4ee89c9c031c..d8b2df06e95b 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -54,9 +54,9 @@ static int try_one_irq(int irq, struct irq_desc *desc)
}
action = action->next;
}
- local_irq_disable();
+
/* Now clean up the flags */
- atomic_spin_lock(&desc->lock);
+ atomic_spin_lock_irq(&desc->lock);
action = desc->action;
/*
@@ -288,6 +288,11 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
static int __init irqfixup_setup(char *str)
{
+#ifdef CONFIG_PREEMPT_RT
+ printk(KERN_WARNING "irqfixup boot option not supported "
+ "w/ CONFIG_PREEMPT_RT\n");
+ return 1;
+#endif
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
@@ -301,6 +306,11 @@ MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
static int __init irqpoll_setup(char *str)
{
+#ifdef CONFIG_PREEMPT_RT
+ printk(KERN_WARNING "irqpoll boot option not supported "
+ "w/ CONFIG_PREEMPT_RT\n");
+ return 1;
+#endif
irqfixup = 2;
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
"enabled\n");
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 58762f7077ec..b4d399859e13 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -161,6 +161,7 @@ again:
/* We are sharing ->siglock with it_real_fn() */
if (hrtimer_try_to_cancel(timer) < 0) {
spin_unlock_irq(&tsk->sighand->siglock);
+ hrtimer_wait_for_timer(&tsk->signal->real_timer);
goto again;
}
expires = timeval_to_ktime(value->it_value);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8bbeef996c76..a53d4fbb721e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -846,6 +846,21 @@ out_unlock_set:
return class;
}
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_TRACE_IRQFLAGS)
+
+#define RECURSION_LIMIT 40
+
+static int noinline print_infinite_recursion_bug(void)
+{
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ WARN_ON(1);
+
+ return 0;
+}
+#endif /* CONFIG_PROVE_LOCKING || CONFIG_TRACE_IRQFLAGS */
+
#ifdef CONFIG_PROVE_LOCKING
/*
* Allocate a lockdep entry. (assumes the graph_lock held, returns
@@ -977,18 +992,6 @@ static noinline int print_circular_bug_tail(void)
return 0;
}
-#define RECURSION_LIMIT 40
-
-static int noinline print_infinite_recursion_bug(void)
-{
- if (!debug_locks_off_graph_unlock())
- return 0;
-
- WARN_ON(1);
-
- return 0;
-}
-
unsigned long __lockdep_count_forward_deps(struct lock_class *class,
unsigned int depth)
{
@@ -1181,6 +1184,7 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
return 1;
}
+#ifdef CONFIG_PROVE_LOCKING
static int
print_bad_irq_dependency(struct task_struct *curr,
struct held_lock *prev,
@@ -1241,6 +1245,7 @@ print_bad_irq_dependency(struct task_struct *curr,
return 0;
}
+#endif /* CONFIG_PROVE_LOCKING */
static int
check_usage(struct task_struct *curr, struct held_lock *prev,
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 3714ee5bc638..73ad8a627e36 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -249,8 +249,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
/* didnt get the lock, go to sleep: */
spin_unlock_mutex(&lock->wait_lock, flags);
- preempt_enable_and_schedule();
+
+ local_irq_disable();
+ __preempt_enable_no_resched();
+ __schedule();
preempt_disable();
+ local_irq_enable();
+
spin_lock_mutex(&lock->wait_lock, flags);
}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..cf40c2d9817f 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -71,7 +71,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
* @returns: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
-static int __kprobes notifier_call_chain(struct notifier_block **nl,
+static int __kprobes notrace notifier_call_chain(struct notifier_block **nl,
unsigned long val, void *v,
int nr_to_call, int *nr_calls)
{
@@ -217,7 +217,7 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
* not yet working and interrupts must remain disabled. At
* such times we must not call down_write().
*/
- if (unlikely(system_state == SYSTEM_BOOTING))
+ if (unlikely(system_state < SYSTEM_RUNNING))
return notifier_chain_register(&nh->head, n);
down_write(&nh->rwsem);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 61fbc7afb558..ca750c7967ee 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -559,7 +559,7 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
p->cpu_timers : p->signal->cpu_timers);
head += CPUCLOCK_WHICH(timer->it_clock);
- BUG_ON(!irqs_disabled());
+ BUG_ON_NONRT(!irqs_disabled());
spin_lock(&p->sighand->siglock);
listpos = head;
@@ -747,7 +747,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
/*
* Disarm any old timer after extracting its expiry time.
*/
- BUG_ON(!irqs_disabled());
+ BUG_ON_NONRT(!irqs_disabled());
ret = 0;
spin_lock(&p->sighand->siglock);
@@ -1381,12 +1381,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
* already updated our counts. We need to check if any timers fire now.
* Interrupts are disabled.
*/
-void run_posix_cpu_timers(struct task_struct *tsk)
+void __run_posix_cpu_timers(struct task_struct *tsk)
{
LIST_HEAD(firing);
struct k_itimer *timer, *next;
- BUG_ON(!irqs_disabled());
/*
* The fast path checks that there are no expired thread or thread
@@ -1438,6 +1437,177 @@ void run_posix_cpu_timers(struct task_struct *tsk)
}
}
+#include <linux/kthread.h>
+#include <linux/cpu.h>
+DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
+DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
+
+static int posix_cpu_timers_thread(void *data)
+{
+ int cpu = (long)data;
+
+ BUG_ON(per_cpu(posix_timer_task,cpu) != current);
+
+ while (!kthread_should_stop()) {
+ struct task_struct *tsk = NULL;
+ struct task_struct *next = NULL;
+
+ if (cpu_is_offline(cpu))
+ goto wait_to_die;
+
+ /* grab task list */
+ raw_local_irq_disable();
+ tsk = per_cpu(posix_timer_tasklist, cpu);
+ per_cpu(posix_timer_tasklist, cpu) = NULL;
+ raw_local_irq_enable();
+
+ /* it's possible the list is empty, just return */
+ if (!tsk) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ continue;
+ }
+
+ /* Process task list */
+ while (1) {
+ /* save next */
+ next = tsk->posix_timer_list;
+
+ /* run the task timers, clear its ptr and
+ * unreference it
+ */
+ __run_posix_cpu_timers(tsk);
+ tsk->posix_timer_list = NULL;
+ put_task_struct(tsk);
+
+ /* check if this is the last on the list */
+ if (next == tsk)
+ break;
+ tsk = next;
+ }
+ }
+ return 0;
+
+wait_to_die:
+ /* Wait for kthread_stop */
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+
+static inline int __fastpath_timer_check(struct task_struct *tsk)
+{
+ /* tsk == current, ensure it is safe to use ->signal/sighand */
+ if (unlikely(tsk->exit_state))
+ return 0;
+
+ if (!task_cputime_zero(&tsk->cputime_expires))
+ return 1;
+
+ if (!task_cputime_zero(&tsk->signal->cputime_expires))
+ return 1;
+
+ return 0;
+}
+
+void run_posix_cpu_timers(struct task_struct *tsk)
+{
+ unsigned long cpu = smp_processor_id();
+ struct task_struct *tasklist;
+
+ BUG_ON(!irqs_disabled());
+ if(!per_cpu(posix_timer_task, cpu))
+ return;
+ /* get per-cpu references */
+ tasklist = per_cpu(posix_timer_tasklist, cpu);
+
+ /* check to see if we're already queued */
+ if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
+ get_task_struct(tsk);
+ if (tasklist) {
+ tsk->posix_timer_list = tasklist;
+ } else {
+ /*
+ * The list is terminated by a self-pointing
+ * task_struct
+ */
+ tsk->posix_timer_list = tsk;
+ }
+ per_cpu(posix_timer_tasklist, cpu) = tsk;
+
+ wake_up_process(per_cpu(posix_timer_task, cpu));
+ }
+}
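The queueing above chains tasks through ->posix_timer_list and marks the end of the list with a self-pointer; a minimal user-space sketch of the same idea (names are illustrative):

    #include <stddef.h>

    struct item { struct item *next; };

    static struct item *queue;                      /* stands in for posix_timer_tasklist */

    static void enqueue(struct item *it)
    {
            it->next = queue ? queue : it;          /* first element points to itself */
            queue = it;
    }

    static void drain(void)
    {
            struct item *it = queue, *next;

            queue = NULL;                           /* detach the whole list at once */
            while (it) {
                    next = it->next;
                    /* ... process 'it' here ... */
                    if (next == it)                 /* self-pointer marks the last element */
                            break;
                    it = next;
            }
    }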
+
+/*
+ * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
+ * Here we can start up the necessary posix CPU timer thread for the new CPU.
+ */
+static int posix_cpu_thread_call(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+ struct task_struct *p;
+ struct sched_param param;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ p = kthread_create(posix_cpu_timers_thread, hcpu,
+ "posixcputmr/%d",cpu);
+ if (IS_ERR(p))
+ return NOTIFY_BAD;
+ p->flags |= PF_NOFREEZE;
+ kthread_bind(p, cpu);
+ /* Must be high prio to avoid getting starved */
+ param.sched_priority = MAX_RT_PRIO-1;
+ sched_setscheduler(p, SCHED_FIFO, &param);
+ per_cpu(posix_timer_task,cpu) = p;
+ break;
+ case CPU_ONLINE:
+ /* Strictly unnecessary, as the first user will wake it. */
+ wake_up_process(per_cpu(posix_timer_task,cpu));
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+ /* Unbind it from offline cpu so it can run. Fall thru. */
+ kthread_bind(per_cpu(posix_timer_task,cpu),
+ any_online_cpu(cpu_online_map));
+ kthread_stop(per_cpu(posix_timer_task,cpu));
+ per_cpu(posix_timer_task,cpu) = NULL;
+ break;
+ case CPU_DEAD:
+ kthread_stop(per_cpu(posix_timer_task,cpu));
+ per_cpu(posix_timer_task,cpu) = NULL;
+ break;
+#endif
+ }
+ return NOTIFY_OK;
+}
+
+/* Register at highest priority so that task migration (migrate_all_tasks)
+ * happens before everything else.
+ */
+static struct notifier_block __devinitdata posix_cpu_thread_notifier = {
+ .notifier_call = posix_cpu_thread_call,
+ .priority = 10
+};
+
+static int __init posix_cpu_thread_init(void)
+{
+ void *cpu = (void *)(long)smp_processor_id();
+ /* Start one for boot CPU. */
+ posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, cpu);
+ posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, cpu);
+ register_cpu_notifier(&posix_cpu_thread_notifier);
+ return 0;
+}
+early_initcall(posix_cpu_thread_init);
+
/*
* Set one of the process-wide special case CPU timers.
* The tsk->sighand->siglock must be held by the caller.
@@ -1703,6 +1873,12 @@ static __init int init_posix_cpu_timers(void)
.nsleep = thread_cpu_nsleep,
.nsleep_restart = thread_cpu_nsleep_restart,
};
+ unsigned long cpu;
+
+ /* init the per-cpu posix_timer_tasklets */
+ for_each_cpu_mask(cpu, cpu_possible_map) {
+ per_cpu(posix_timer_tasklist, cpu) = NULL;
+ }
register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 1bd47d224422..2817dd37f04d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -796,6 +796,7 @@ retry:
unlock_timer(timr, flag);
if (error == TIMER_RETRY) {
+ hrtimer_wait_for_timer(&timr->it.real.timer);
rtn = NULL; // We already got the old time...
goto retry;
}
@@ -834,6 +835,7 @@ retry_delete:
if (timer_delete_hook(timer) == TIMER_RETRY) {
unlock_timer(timer, flags);
+ hrtimer_wait_for_timer(&timer->it.real.timer);
goto retry_delete;
}
@@ -863,6 +865,7 @@ retry_delete:
if (timer_delete_hook(timer) == TIMER_RETRY) {
unlock_timer(timer, flags);
+ hrtimer_wait_for_timer(&timer->it.real.timer);
goto retry_delete;
}
list_del(&timer->list);
diff --git a/kernel/printk.c b/kernel/printk.c
index 5ed0c9f103b8..1fa5c42eb6ab 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,7 @@
#include <linux/bootmem.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
+#include <linux/semaphore.h>
#include <asm/uaccess.h>
@@ -414,9 +415,13 @@ static void __call_console_drivers(unsigned start, unsigned end)
for (con = console_drivers; con; con = con->next) {
if ((con->flags & CON_ENABLED) && con->write &&
- (cpu_online(smp_processor_id()) ||
- (con->flags & CON_ANYTIME)))
+ console_atomic_safe(con) &&
+ (cpu_online(raw_smp_processor_id()) ||
+ (con->flags & CON_ANYTIME))) {
+ set_printk_might_sleep(1);
con->write(con, &LOG_BUF(start), end - start);
+ set_printk_might_sleep(0);
+ }
}
}
@@ -530,6 +535,7 @@ static void zap_locks(void)
atomic_spin_lock_init(&logbuf_lock);
/* And make sure that we print immediately */
semaphore_init(&console_sem);
+ zap_rt_locks();
}
#if defined(CONFIG_PRINTK_TIME)
@@ -611,7 +617,8 @@ static inline int can_use_console(unsigned int cpu)
* interrupts disabled. It should return with 'lockbuf_lock'
* released but interrupts still disabled.
*/
-static int acquire_console_semaphore_for_printk(unsigned int cpu)
+static int acquire_console_semaphore_for_printk(unsigned int cpu,
+ unsigned long flags)
{
int retval = 0;
@@ -632,6 +639,8 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
}
printk_cpu = UINT_MAX;
atomic_spin_unlock(&logbuf_lock);
+ lockdep_on();
+ local_irq_restore(flags);
return retval;
}
static const char recursion_bug_msg [] =
@@ -653,7 +662,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
preempt_disable();
/* This stops the holder of console_sem just where we want him */
raw_local_irq_save(flags);
- this_cpu = smp_processor_id();
+ this_cpu = raw_smp_processor_id();
/*
* Ouch, printk recursed into itself!
@@ -668,7 +677,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
*/
if (!oops_in_progress) {
recursion_bug = 1;
- goto out_restore_irqs;
+ raw_local_irq_restore(flags);
+ goto out;
}
zap_locks();
}
@@ -676,6 +686,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
lockdep_off();
atomic_spin_lock(&logbuf_lock);
printk_cpu = this_cpu;
+ preempt_enable();
if (recursion_bug) {
recursion_bug = 0;
@@ -760,14 +771,10 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* will release 'logbuf_lock' regardless of whether it
* actually gets the semaphore or not.
*/
- if (acquire_console_semaphore_for_printk(this_cpu))
+ if (acquire_console_semaphore_for_printk(this_cpu, flags))
release_console_sem();
- lockdep_on();
-out_restore_irqs:
- raw_local_irq_restore(flags);
-
- preempt_enable();
+out:
return printed_len;
}
EXPORT_SYMBOL(printk);
@@ -1030,15 +1037,36 @@ void release_console_sem(void)
_con_start = con_start;
_log_end = log_end;
con_start = log_end; /* Flush */
+
+ /*
+ * on PREEMPT_RT, call console drivers with
+ * interrupts enabled (if printk was called
+ * with interrupts disabled):
+ */
+#ifdef CONFIG_PREEMPT_RT
+ atomic_spin_unlock_irqrestore(&logbuf_lock, flags);
+#else
atomic_spin_unlock(&logbuf_lock);
stop_critical_timings(); /* don't trace print latency */
+#endif
call_console_drivers(_con_start, _log_end);
start_critical_timings();
+#ifndef CONFIG_PREEMPT_RT
local_irq_restore(flags);
+#endif
}
console_locked = 0;
- up(&console_sem);
atomic_spin_unlock_irqrestore(&logbuf_lock, flags);
+ up(&console_sem);
+ /*
+ * On PREEMPT_RT kernels __wake_up may sleep, so wake syslogd
+ * up only if we are in a preemptible section. We normally don't
+ * printk from non-preemptible sections so this is for the emergency
+ * case only.
+ */
+#ifdef CONFIG_PREEMPT_RT
+ if (!in_atomic() && !irqs_disabled())
+#endif
if (wake_klogd)
wake_up_klogd();
}
@@ -1314,6 +1342,23 @@ int printk_ratelimit(void)
}
EXPORT_SYMBOL(printk_ratelimit);
+static DEFINE_ATOMIC_SPINLOCK(warn_lock);
+
+void __WARN_ON(const char *func, const char *file, const int line)
+{
+ unsigned long flags;
+
+ atomic_spin_lock_irqsave(&warn_lock, flags);
+ printk("%s/%d[CPU#%d]: BUG in %s at %s:%d\n",
+ current->comm, current->pid, raw_smp_processor_id(),
+ func, file, line);
+ dump_stack();
+ atomic_spin_unlock_irqrestore(&warn_lock, flags);
+}
+
+EXPORT_SYMBOL(__WARN_ON);
+
+
/**
* printk_timed_ratelimit - caller-controlled printk ratelimiting
* @caller_jiffies: pointer to caller's state
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b4a975a4b4a..7a4e9123f7e5 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -752,7 +752,7 @@ rcu_torture_reader(void *arg)
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
cur_ops->readunlock(idx);
- schedule_timeout_interruptible(HZ);
+ schedule_timeout_interruptible(round_jiffies_relative(HZ));
continue;
}
if (p->rtort_mbtest == 0)
diff --git a/kernel/relay.c b/kernel/relay.c
index bc188549788f..05fd6d5a52b5 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -343,6 +343,10 @@ static void wakeup_readers(unsigned long data)
{
struct rchan_buf *buf = (struct rchan_buf *)data;
wake_up_interruptible(&buf->read_wait);
+ /*
+ * Stupid polling for now:
+ */
+ mod_timer(&buf->timer, jiffies + 1);
}
/**
@@ -360,6 +364,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
init_waitqueue_head(&buf->read_wait);
kref_init(&buf->kref);
setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
+ mod_timer(&buf->timer, jiffies + 1);
} else
del_timer_sync(&buf->timer);
@@ -740,15 +745,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
else
buf->early_bytes += buf->chan->subbuf_size -
buf->padding[old_subbuf];
- smp_mb();
- if (waitqueue_active(&buf->read_wait))
- /*
- * Calling wake_up_interruptible() from here
- * will deadlock if we happen to be logging
- * from the scheduler (trying to re-grab
- * rq->lock), so defer it.
- */
- mod_timer(&buf->timer, jiffies + 1);
}
old = buf->data;
diff --git a/kernel/rt.c b/kernel/rt.c
new file mode 100644
index 000000000000..59ca169229ac
--- /dev/null
+++ b/kernel/rt.c
@@ -0,0 +1,543 @@
+/*
+ * kernel/rt.c
+ *
+ * Real-Time Preemption Support
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * historic credit for proving that Linux spinlocks can be implemented via
+ * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
+ * and others) who prototyped it on 2.4 and did lots of comparative
+ * research and analysis; TimeSys, for proving that you can implement a
+ * fully preemptible kernel via the use of IRQ threading and mutexes;
+ * Bill Huey for persuasively arguing on lkml that the mutex model is the
+ * right one; and to MontaVista, who ported pmutexes to 2.6.
+ *
+ * This code is a from-scratch implementation and is not based on pmutexes,
+ * but the idea of converting spinlocks to mutexes is used here too.
+ *
+ * lock debugging, locking tree, deadlock detection:
+ *
+ * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ * Released under the General Public License (GPL).
+ *
+ * Includes portions of the generic R/W semaphore implementation from:
+ *
+ * Copyright (c) 2001 David Howells (dhowells@redhat.com).
+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
+ * - Derived also from comments by Linus
+ *
+ * Pending ownership of locks and ownership stealing:
+ *
+ * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
+ *
+ * (also by Steven Rostedt)
+ * - Converted single pi_lock to individual task locks.
+ *
+ * By Esben Nielsen:
+ * Doing priority inheritance with help of the scheduler.
+ *
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ * - major rework based on Esben Nielsen's initial patch
+ * - replaced thread_info references by task_struct refs
+ * - removed task->pending_owner dependency
+ * - BKL drop/reacquire for semaphore style locks to avoid deadlocks
+ * in the scheduler return path as discussed with Steven Rostedt
+ *
+ * Copyright (C) 2006, Kihon Technologies Inc.
+ * Steven Rostedt <rostedt@goodmis.org>
+ * - debugged and patched Thomas Gleixner's rework.
+ * - added back the cmpxchg to the rework.
+ * - turned atomic require back on for SMP.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/plist.h>
+#include <linux/fs.h>
+#include <linux/futex.h>
+#include <linux/hrtimer.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Unlock these on crash:
+ */
+void zap_rt_locks(void)
+{
+ //trace_lock_init();
+}
+#endif
+
+/*
+ * struct mutex functions
+ */
+void __mutex_init(struct mutex *lock, char *name, struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+ __rt_mutex_init(&lock->lock, name);
+}
+EXPORT_SYMBOL(__mutex_init);
+
+void __lockfunc _mutex_lock(struct mutex *lock)
+{
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ rt_mutex_lock(&lock->lock);
+}
+EXPORT_SYMBOL(_mutex_lock);
+
+int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = rt_mutex_lock_interruptible(&lock->lock, 0);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_interruptible);
+
+int __lockfunc _mutex_lock_killable(struct mutex *lock)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = rt_mutex_lock_killable(&lock->lock, 0);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_killable);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
+{
+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ rt_mutex_lock(&lock->lock);
+}
+EXPORT_SYMBOL(_mutex_lock_nested);
+
+int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ ret = rt_mutex_lock_interruptible(&lock->lock, 0);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
+
+int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ ret = rt_mutex_lock_killable(&lock->lock, 0);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_killable_nested);
+#endif
+
+int __lockfunc _mutex_trylock(struct mutex *lock)
+{
+ int ret = rt_mutex_trylock(&lock->lock);
+
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_trylock);
+
+void __lockfunc _mutex_unlock(struct mutex *lock)
+{
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ rt_mutex_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(_mutex_unlock);
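A brief usage sketch of the wrappers above (illustration only; the mutex instance and function name are invented, and it is assumed that a PREEMPT_RT build routes the regular mutex API to these _mutex_* functions):

#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);	/* hypothetical lock */

static int example_locked_op(void)
{
	int ret;

	/* Blocks on an rtmutex; returns -EINTR if a signal arrives while waiting */
	ret = mutex_lock_interruptible(&example_mutex);
	if (ret)
		return ret;

	/* ... critical section, fully preemptible under PREEMPT_RT ... */

	mutex_unlock(&example_mutex);
	return 0;
}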
+
+/*
+ * rwlock_t functions
+ */
+int __lockfunc rt_write_trylock(rwlock_t *rwlock)
+{
+ int ret = rt_mutex_trylock(&rwlock->lock);
+
+ if (ret)
+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock);
+
+int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
+{
+ *flags = 0;
+ return rt_write_trylock(rwlock);
+}
+EXPORT_SYMBOL(rt_write_trylock_irqsave);
+
+int __lockfunc rt_read_trylock(rwlock_t *rwlock)
+{
+ struct rt_mutex *lock = &rwlock->lock;
+ int ret = 1;
+
+ /*
+ * recursive read locks succeed when current owns the lock
+ */
+ if (rt_mutex_real_owner(lock) != current || !rwlock->read_depth)
+ ret = rt_mutex_trylock(lock);
+
+ if (ret) {
+ rwlock->read_depth++;
+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(rt_read_trylock);
+
+void __lockfunc rt_write_lock(rwlock_t *rwlock)
+{
+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+ __rt_spin_lock(&rwlock->lock);
+}
+EXPORT_SYMBOL(rt_write_lock);
+
+void __lockfunc rt_read_lock(rwlock_t *rwlock)
+{
+ struct rt_mutex *lock = &rwlock->lock;
+
+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+
+ /*
+ * recursive read locks succeed when current owns the lock
+ */
+ if (rt_mutex_real_owner(lock) != current || !rwlock->read_depth)
+ __rt_spin_lock(lock);
+ rwlock->read_depth++;
+}
+
+EXPORT_SYMBOL(rt_read_lock);
+
+void __lockfunc rt_write_unlock(rwlock_t *rwlock)
+{
+ /* NOTE: we always pass in '1' for nested, for simplicity */
+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
+ __rt_spin_unlock(&rwlock->lock);
+}
+EXPORT_SYMBOL(rt_write_unlock);
+
+void __lockfunc rt_read_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
+
+ BUG_ON(rwlock->read_depth <= 0);
+
+ /* Release the lock only when read_depth is down to 0 */
+ if (--rwlock->read_depth == 0)
+ __rt_spin_unlock(&rwlock->lock);
+}
+EXPORT_SYMBOL(rt_read_unlock);
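A minimal sketch of the reader recursion that read_depth permits for a single task (illustration only; the lock instance and helper are hypothetical, and a PREEMPT_RT build is assumed to route read_lock()/read_unlock() to the functions above):

#include <linux/spinlock.h>

static DEFINE_RWLOCK(example_rwlock);	/* hypothetical rwlock */

static void example_nested_read(void)
{
	rt_read_lock(&example_rwlock);		/* read_depth 0 -> 1, takes the rtmutex */
	rt_read_lock(&example_rwlock);		/* same owner: read_depth 1 -> 2, no blocking */
	/* ... read-side work ... */
	rt_read_unlock(&example_rwlock);	/* read_depth 2 -> 1, lock still held */
	rt_read_unlock(&example_rwlock);	/* read_depth 1 -> 0, rtmutex released */
}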
+
+unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
+{
+ rt_write_lock(rwlock);
+
+ return 0;
+}
+EXPORT_SYMBOL(rt_write_lock_irqsave);
+
+unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
+{
+ rt_read_lock(rwlock);
+
+ return 0;
+}
+EXPORT_SYMBOL(rt_read_lock_irqsave);
+
+void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
+ lockdep_init_map(&rwlock->dep_map, name, key, 0);
+#endif
+ __rt_mutex_init(&rwlock->lock, name);
+ rwlock->read_depth = 0;
+}
+EXPORT_SYMBOL(__rt_rwlock_init);
+
+/*
+ * rw_semaphores
+ */
+
+void rt_up_write(struct rw_semaphore *rwsem)
+{
+ rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
+ rt_mutex_unlock(&rwsem->lock);
+}
+EXPORT_SYMBOL(rt_up_write);
+
+void rt_up_read(struct rw_semaphore *rwsem)
+{
+ rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
+ rt_mutex_unlock(&rwsem->lock);
+}
+EXPORT_SYMBOL(rt_up_read);
+
+/*
+ * downgrade a write lock into a read lock
+ * - in this rtmutex-based implementation readers are serialized on the
+ *   same lock, so there are no queued readers to wake; just sanity-check
+ *   that we own the lock
+ */
+void rt_downgrade_write(struct rw_semaphore *rwsem)
+{
+ BUG_ON(rt_mutex_real_owner(&rwsem->lock) != current);
+}
+EXPORT_SYMBOL(rt_downgrade_write);
+
+int rt_down_write_trylock(struct rw_semaphore *rwsem)
+{
+ int ret = rt_mutex_trylock(&rwsem->lock);
+
+ if (ret)
+ rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(rt_down_write_trylock);
+
+void rt_down_write(struct rw_semaphore *rwsem)
+{
+ rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
+ rt_mutex_lock(&rwsem->lock);
+}
+EXPORT_SYMBOL(rt_down_write);
+
+void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
+{
+ rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
+ rt_mutex_lock(&rwsem->lock);
+}
+EXPORT_SYMBOL(rt_down_write_nested);
+
+int rt_down_read_trylock(struct rw_semaphore *rwsem)
+{
+ int ret = rt_mutex_trylock(&rwsem->lock);
+
+ if (ret)
+ rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(rt_down_read_trylock);
+
+static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
+{
+ rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
+ rt_mutex_lock(&rwsem->lock);
+}
+
+void rt_down_read(struct rw_semaphore *rwsem)
+{
+ __rt_down_read(rwsem, 0);
+}
+EXPORT_SYMBOL(rt_down_read);
+
+void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
+{
+ __rt_down_read(rwsem, subclass);
+}
+EXPORT_SYMBOL(rt_down_read_nested);
+
+void __rt_rwsem_init(struct rw_semaphore *rwsem, char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
+ lockdep_init_map(&rwsem->dep_map, name, key, 0);
+#endif
+ __rt_mutex_init(&rwsem->lock, name);
+}
+EXPORT_SYMBOL(__rt_rwsem_init);
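Since the rw_semaphore above is backed by a single rtmutex, readers are serialized and write/downgrade/read-unlock all act on that one lock. A hedged sketch (the rwsem instance is made up, and down_write()/downgrade_write()/up_read() are assumed to map onto the rt_* functions above in a PREEMPT_RT build):

#include <linux/rwsem.h>

static DECLARE_RWSEM(example_rwsem);	/* hypothetical rwsem */

static void example_downgrade(void)
{
	down_write(&example_rwsem);		/* rt_down_write(): locks rwsem->lock */
	/* ... modify the data ... */
	downgrade_write(&example_rwsem);	/* rt_downgrade_write(): only the ownership check */
	/* ... keep reading; other readers still block on the same rtmutex ... */
	up_read(&example_rwsem);		/* rt_up_read(): releases the same rtmutex */
}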
+
+/*
+ * Semaphores
+ */
+/*
+ * Linux Semaphores implemented via RT-mutexes.
+ *
+ * In the down() variants we use the mutex as the semaphore blocking
+ * object: we always acquire it, decrease the counter and keep the lock
+ * locked if we did the 1->0 transition. The next down() will then block.
+ *
+ * In the up() path we atomically increase the counter and do the
+ * unlock if we were the one doing the 0->1 transition.
+ */
+
+static inline void __down_complete(struct semaphore *sem)
+{
+ int count = atomic_dec_return(&sem->count);
+
+ if (unlikely(count > 0))
+ rt_mutex_unlock(&sem->lock);
+}
+
+void rt_down(struct semaphore *sem)
+{
+ rt_mutex_lock(&sem->lock);
+ __down_complete(sem);
+}
+EXPORT_SYMBOL(rt_down);
+
+int rt_down_interruptible(struct semaphore *sem)
+{
+ int ret;
+
+ ret = rt_mutex_lock_interruptible(&sem->lock, 0);
+ if (ret)
+ return ret;
+ __down_complete(sem);
+ return 0;
+}
+EXPORT_SYMBOL(rt_down_interruptible);
+
+int rt_down_timeout(struct semaphore *sem, long jiff)
+{
+ struct hrtimer_sleeper t;
+ struct timespec ts;
+ unsigned long expires = jiffies + jiff + 1;
+ int ret;
+
+ /*
+ * rt_mutex_timed_lock() can only sleep in TASK_INTERRUPTIBLE, but
+ * down_timeout() is supposed to behave as TASK_UNINTERRUPTIBLE.
+ * To handle this we loop when a signal caused the early return and
+ * recalculate the remaining timeout.
+ * Yes Thomas, this is a hack! But we can fix it properly later.
+ */
+ do {
+ jiffies_to_timespec(jiff, &ts);
+ hrtimer_init_on_stack(&t.timer, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ t.timer._expires = timespec_to_ktime(ts);
+
+ ret = rt_mutex_timed_lock(&sem->lock, &t, 0);
+ if (ret != -EINTR)
+ break;
+
+ /* a signal occurred, but down_timeout() does not handle signals */
+ jiff = expires - jiffies;
+
+ } while (jiff > 0);
+
+ if (!ret)
+ __down_complete(sem);
+ else
+ ret = -ETIME;
+
+ return ret;
+}
+EXPORT_SYMBOL(rt_down_timeout);
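A worked illustration of the timeout bookkeeping above (hypothetical numbers, assuming HZ == 1000):

/*
 *	jiff    = 100;			caller asks for a 100-jiffy timeout
 *	expires = jiffies + 100 + 1;	absolute deadline
 *
 *	-EINTR after 40 jiffies:
 *	jiff    = expires - jiffies;	== 61, retry with the remainder
 *
 * Repeated signals therefore shrink the per-iteration timeout instead of
 * restarting the full wait, so the overall deadline is preserved.
 */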
+
+/*
+ * Try to down the semaphore: returns 0 on success and 1 on failure
+ * (note: inverted relative to mutex_trylock()/spin_trylock()).
+ */
+int rt_down_trylock(struct semaphore *sem)
+{
+ /*
+ * Here we are a tiny bit different from ordinary Linux semaphores,
+ * because we can get 'transient' locking failures when, say, a
+ * process decreases the count from 9 to 8 and locks/releases the
+ * embedded mutex internally. It would be quite complex to remove
+ * these transient failures, so let's try it the simple way first:
+ */
+ if (rt_mutex_trylock(&sem->lock)) {
+ __down_complete(sem);
+ return 0;
+ }
+ return 1;
+}
+EXPORT_SYMBOL(rt_down_trylock);
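A short usage sketch of the inverted return convention (the semaphore instance is hypothetical; down_trylock() is assumed to resolve to rt_down_trylock() under PREEMPT_RT):

#include <linux/semaphore.h>

static struct semaphore example_sem;	/* hypothetical, sema_init()ed to 1 elsewhere */

static void example_try(void)
{
	if (!rt_down_trylock(&example_sem)) {
		/* 0 means we got it - inverted vs. mutex_trylock()/spin_trylock() */
		/* ... */
		rt_up(&example_sem);
	}
	/* non-zero: contended, or one of the transient failures described above */
}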
+
+void rt_up(struct semaphore *sem)
+{
+ int count;
+
+ /*
+ * Disable preemption to make sure a high-prio trylocker cannot
+ * preempt us here and get into an infinite loop:
+ */
+ preempt_disable();
+ count = atomic_inc_return(&sem->count);
+ /*
+ * If we did the 0 -> 1 transition then we are the ones to unlock it:
+ */
+ if (likely(count == 1))
+ rt_mutex_unlock(&sem->lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(rt_up);
+
+void __sema_init(struct semaphore *sem, int val,
+ char *name, char *file, int line)
+{
+ atomic_set(&sem->count, val);
+ switch (val) {
+ case 0:
+ __rt_mutex_init(&sem->lock, name);
+ rt_mutex_lock(&sem->lock);
+ break;
+ default:
+ __rt_mutex_init(&sem->lock, name);
+ break;
+ }
+}
+EXPORT_SYMBOL(__sema_init);
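To make the counter/mutex protocol above concrete, a hedged sketch with a hypothetical two-slot semaphore (names invented for illustration):

#include <linux/semaphore.h>

static struct semaphore example_sem;

static void example_counting(void)
{
	sema_init(&example_sem, 2);

	rt_down(&example_sem);	/* count 2 -> 1: rtmutex acquired, then released again */
	rt_down(&example_sem);	/* count 1 -> 0: rtmutex stays locked */
	/* a third rt_down() would now block on the rtmutex */

	rt_up(&example_sem);	/* count 0 -> 1: we did the 0->1 transition, unlock */
	rt_up(&example_sem);	/* count 1 -> 2: no unlock needed */
}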
+
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic counter we are to decrement
+ * @lock: the mutex to return holding if we decremented to 0
+ *
+ * Returns true and holds the lock if the count dropped to 0, false otherwise.
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+ /* dec if we can't possibly hit 0 */
+ if (atomic_add_unless(cnt, -1, 1))
+ return 0;
+ /* we might hit 0, so take the lock */
+ mutex_lock(lock);
+ if (!atomic_dec_and_test(cnt)) {
+ /* when we actually did the dec, we didn't hit 0 */
+ mutex_unlock(lock);
+ return 0;
+ }
+ /* we hit 0, and we hold the lock */
+ return 1;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
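A typical use of atomic_dec_and_mutex_lock(), sketched with invented names: drop a reference and take the mutex only when the last reference goes away.

#include <linux/mutex.h>
#include <asm/atomic.h>

static DEFINE_MUTEX(example_list_lock);	/* protects a hypothetical object list */

struct example_obj {
	atomic_t refcount;
	/* ... */
};

static void example_put(struct example_obj *obj)
{
	/* Only the final reference drop takes (and returns holding) the mutex */
	if (atomic_dec_and_mutex_lock(&obj->refcount, &example_list_lock)) {
		/* ... unlink obj from the list and free it ... */
		mutex_unlock(&example_list_lock);
	}
}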
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index c97ffa45317a..e7e6314221ca 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -29,61 +29,6 @@
#include "rtmutex_common.h"
-# define TRACE_WARN_ON(x) WARN_ON(x)
-# define TRACE_BUG_ON(x) BUG_ON(x)
-
-# define TRACE_OFF() \
-do { \
- if (rt_trace_on) { \
- rt_trace_on = 0; \
- console_verbose(); \
- if (atomic_spin_is_locked(&current->pi_lock)) \
- atomic_spin_unlock(&current->pi_lock); \
- } \
-} while (0)
-
-# define TRACE_OFF_NOLOCK() \
-do { \
- if (rt_trace_on) { \
- rt_trace_on = 0; \
- console_verbose(); \
- } \
-} while (0)
-
-# define TRACE_BUG_LOCKED() \
-do { \
- TRACE_OFF(); \
- BUG(); \
-} while (0)
-
-# define TRACE_WARN_ON_LOCKED(c) \
-do { \
- if (unlikely(c)) { \
- TRACE_OFF(); \
- WARN_ON(1); \
- } \
-} while (0)
-
-# define TRACE_BUG_ON_LOCKED(c) \
-do { \
- if (unlikely(c)) \
- TRACE_BUG_LOCKED(); \
-} while (0)
-
-#ifdef CONFIG_SMP
-# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
-#else
-# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
-#endif
-
-/*
- * deadlock detection flag. We turn it off when we detect
- * the first problem because we dont want to recurse back
- * into the tracing code when doing error printk or
- * executing a BUG():
- */
-static int rt_trace_on = 1;
-
static void printk_task(struct task_struct *p)
{
if (p)
@@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
void rt_mutex_debug_task_free(struct task_struct *task)
{
- WARN_ON(!plist_head_empty(&task->pi_waiters));
- WARN_ON(task->pi_blocked_on);
+ DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
+ DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
}
/*
@@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
{
struct task_struct *task;
- if (!rt_trace_on || detect || !act_waiter)
+ if (!debug_locks || detect || !act_waiter)
return;
task = rt_mutex_owner(act_waiter->lock);
@@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
{
struct task_struct *task;
- if (!waiter->deadlock_lock || !rt_trace_on)
+ if (!waiter->deadlock_lock || !debug_locks)
return;
rcu_read_lock();
@@ -149,7 +94,8 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
return;
}
- TRACE_OFF_NOLOCK();
+ if (!debug_locks_off())
+ return;
printk("\n============================================\n");
printk( "[ BUG: circular locking deadlock detected! ]\n");
@@ -180,7 +126,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
printk("[ turning off deadlock detection."
"Please report this trace. ]\n\n");
- local_irq_disable();
}
void debug_rt_mutex_lock(struct rt_mutex *lock)
@@ -189,7 +134,8 @@ void debug_rt_mutex_lock(struct rt_mutex *lock)
void debug_rt_mutex_unlock(struct rt_mutex *lock)
{
- TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
+ if (debug_locks)
+ DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
}
void
@@ -199,7 +145,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
{
- TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock));
+ DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
}
void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
@@ -213,9 +159,9 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
{
put_pid(waiter->deadlock_task_pid);
- TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
- TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
- TRACE_WARN_ON(waiter->task);
+ DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
+ DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+ DEBUG_LOCKS_WARN_ON(waiter->task);
memset(waiter, 0x22, sizeof(*waiter));
}
@@ -231,9 +177,36 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
void
rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
{
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (atomic_read(&task->lock_count) >= MAX_LOCK_STACK) {
+ if (!debug_locks_off())
+ return;
+ printk("BUG: %s/%d: lock count overflow!\n",
+ task->comm, task->pid);
+ dump_stack();
+ return;
+ }
+#ifdef CONFIG_PREEMPT_RT
+ task->owned_lock[atomic_read(&task->lock_count)] = lock;
+#endif
+ atomic_inc(&task->lock_count);
+#endif
}
void rt_mutex_deadlock_account_unlock(struct task_struct *task)
{
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (!atomic_read(&task->lock_count)) {
+ if (!debug_locks_off())
+ return;
+ printk("BUG: %s/%d: lock count underflow!\n",
+ task->comm, task->pid);
+ dump_stack();
+ return;
+ }
+ atomic_dec(&task->lock_count);
+#ifdef CONFIG_PREEMPT_RT
+ task->owned_lock[atomic_read(&task->lock_count)] = NULL;
+#endif
+#endif
}
-
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 7b1c7946f0ef..f66f98de4567 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -8,12 +8,20 @@
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
*
+ * Adaptive Spinlocks:
+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
+ * and Peter Morreale,
+ * Adaptive Spinlocks simplification:
+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
+ *
* See Documentation/rt-mutex-design.txt for details.
*/
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/timer.h>
+#include <linux/hardirq.h>
+#include <linux/semaphore.h>
#include "rtmutex_common.h"
@@ -97,6 +105,22 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
}
#endif
+int pi_initialized;
+
+/*
+ * We initialize the wait_list at runtime. (Could be done at build time
+ * and/or boot time.)
+ */
+static inline void init_lists(struct rt_mutex *lock)
+{
+ if (unlikely(!lock->wait_list.prio_list.prev)) {
+ plist_head_init_atomic(&lock->wait_list, &lock->wait_lock);
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ pi_initialized++;
+#endif
+ }
+}
+
/*
* Calculate task priority from the waiter list priority
*
@@ -253,13 +277,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
plist_add(&waiter->list_entry, &lock->wait_list);
/* Release the task */
- atomic_spin_unlock_irqrestore(&task->pi_lock, flags);
+ atomic_spin_unlock(&task->pi_lock);
put_task_struct(task);
/* Grab the next task */
task = rt_mutex_owner(lock);
get_task_struct(task);
- atomic_spin_lock_irqsave(&task->pi_lock, flags);
+ atomic_spin_lock(&task->pi_lock);
if (waiter == rt_mutex_top_waiter(lock)) {
/* Boost the owner */
@@ -277,10 +301,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
__rt_mutex_adjust_prio(task);
}
- atomic_spin_unlock_irqrestore(&task->pi_lock, flags);
+ atomic_spin_unlock(&task->pi_lock);
top_waiter = rt_mutex_top_waiter(lock);
- atomic_spin_unlock(&lock->wait_lock);
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
if (!detect_deadlock && waiter != top_waiter)
goto out_put_task;
@@ -301,11 +325,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* lock yet]:
*/
static inline int try_to_steal_lock(struct rt_mutex *lock,
- struct task_struct *task)
+ struct task_struct *task, int mode)
{
struct task_struct *pendowner = rt_mutex_owner(lock);
struct rt_mutex_waiter *next;
- unsigned long flags;
if (!rt_mutex_owner_pending(lock))
return 0;
@@ -313,9 +336,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
if (pendowner == task)
return 1;
- atomic_spin_lock_irqsave(&pendowner->pi_lock, flags);
- if (task->prio >= pendowner->prio) {
- atomic_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+ atomic_spin_lock(&pendowner->pi_lock);
+ if (!lock_is_stealable(task, pendowner, mode)) {
+ atomic_spin_unlock(&pendowner->pi_lock);
return 0;
}
@@ -325,7 +348,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
* priority.
*/
if (likely(!rt_mutex_has_waiters(lock))) {
- atomic_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+ atomic_spin_unlock(&pendowner->pi_lock);
return 1;
}
@@ -333,7 +356,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
next = rt_mutex_top_waiter(lock);
plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
__rt_mutex_adjust_prio(pendowner);
- atomic_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+ atomic_spin_unlock(&pendowner->pi_lock);
/*
* We are going to steal the lock and a waiter was
@@ -350,10 +373,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
* might be task:
*/
if (likely(next->task != task)) {
- atomic_spin_lock_irqsave(&task->pi_lock, flags);
+ atomic_spin_lock(&task->pi_lock);
plist_add(&next->pi_list_entry, &task->pi_waiters);
__rt_mutex_adjust_prio(task);
- atomic_spin_unlock_irqrestore(&task->pi_lock, flags);
+ atomic_spin_unlock(&task->pi_lock);
}
return 1;
}
@@ -367,7 +390,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
*
* Must be called with lock->wait_lock held.
*/
-static int try_to_take_rt_mutex(struct rt_mutex *lock)
+static int do_try_to_take_rt_mutex(struct rt_mutex *lock, int mode)
{
/*
* We have to be careful here if the atomic speedups are
@@ -390,7 +413,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
*/
mark_rt_mutex_waiters(lock);
- if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
+ if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current, mode))
return 0;
/* We got the lock. */
@@ -403,6 +426,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
return 1;
}
+static inline int try_to_take_rt_mutex(struct rt_mutex *lock)
+{
+ return do_try_to_take_rt_mutex(lock, STEAL_NORMAL);
+}
+
/*
* Task blocks on lock.
*
@@ -413,14 +441,13 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task,
- int detect_deadlock)
+ int detect_deadlock, unsigned long flags)
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
- unsigned long flags;
int chain_walk = 0, res;
- atomic_spin_lock_irqsave(&task->pi_lock, flags);
+ atomic_spin_lock(&task->pi_lock);
__rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
@@ -434,17 +461,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
task->pi_blocked_on = waiter;
- atomic_spin_unlock_irqrestore(&task->pi_lock, flags);
+ atomic_spin_unlock(&task->pi_lock);
if (waiter == rt_mutex_top_waiter(lock)) {
- atomic_spin_lock_irqsave(&owner->pi_lock, flags);
+ atomic_spin_lock(&owner->pi_lock);
plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
__rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
- atomic_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ atomic_spin_unlock(&owner->pi_lock);
}
else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
chain_walk = 1;
@@ -459,12 +486,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*/
get_task_struct(owner);
- atomic_spin_unlock(&lock->wait_lock);
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
task);
- atomic_spin_lock(&lock->wait_lock);
+ atomic_spin_lock_irq(&lock->wait_lock);
return res;
}
@@ -477,13 +504,13 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*
* Called with lock->wait_lock held.
*/
-static void wakeup_next_waiter(struct rt_mutex *lock)
+static void wakeup_next_waiter(struct rt_mutex *lock, int savestate)
{
struct rt_mutex_waiter *waiter;
struct task_struct *pendowner;
- unsigned long flags;
+ struct rt_mutex_waiter *next;
- atomic_spin_lock_irqsave(&current->pi_lock, flags);
+ atomic_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
plist_del(&waiter->list_entry, &lock->wait_list);
@@ -498,9 +525,44 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
pendowner = waiter->task;
waiter->task = NULL;
+ /*
+ * Do the wakeup before the ownership change to give any spinning
+ * waiter that is granted the lock a head start over the other
+ * threads that will trigger once the owner changes.
+ */
+ if (!savestate)
+ wake_up_process(pendowner);
+ else {
+ /*
+ * We can skip the actual (expensive) wakeup if the
+ * waiter is already running, but we have to be careful
+ * of race conditions because it may be about to sleep.
+ *
+ * The waiter-side protocol has the following pattern:
+ * 1: Set state != RUNNING
+ * 2: Conditionally sleep if waiter->task != NULL;
+ *
+ * And the owner-side has the following:
+ * A: Set waiter->task = NULL
+ * B: Conditionally wake if the state != RUNNING
+ *
+ * As long as we ensure 1->2 order, and A->B order, we
+ * will never miss a wakeup.
+ *
+ * Therefore, this barrier ensures that waiter->task = NULL
+ * is visible before we test the pendowner->state. The
+ * corresponding barrier is in the sleep logic.
+ */
+ smp_mb();
+
+ /* If !RUNNING && !RUNNING_MUTEX */
+ if (pendowner->state & ~TASK_RUNNING_MUTEX)
+ wake_up_process_mutex(pendowner);
+ }
+
rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
- atomic_spin_unlock_irqrestore(&current->pi_lock, flags);
+ atomic_spin_unlock(&current->pi_lock);
/*
* Clear the pi_blocked_on variable and enqueue a possible
@@ -509,7 +571,13 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
* waiter with higher priority than pending-owner->normal_prio
* is blocked on the unboosted (pending) owner.
*/
- atomic_spin_lock_irqsave(&pendowner->pi_lock, flags);
+
+ if (rt_mutex_has_waiters(lock))
+ next = rt_mutex_top_waiter(lock);
+ else
+ next = NULL;
+
+ atomic_spin_lock(&pendowner->pi_lock);
WARN_ON(!pendowner->pi_blocked_on);
WARN_ON(pendowner->pi_blocked_on != waiter);
@@ -517,15 +585,10 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
pendowner->pi_blocked_on = NULL;
- if (rt_mutex_has_waiters(lock)) {
- struct rt_mutex_waiter *next;
-
- next = rt_mutex_top_waiter(lock);
+ if (next)
plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
- }
- atomic_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
- wake_up_process(pendowner);
+ atomic_spin_unlock(&pendowner->pi_lock);
}
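Condensed from the protocol comment above and the spinlock slow path further down, the ordering that prevents a lost wakeup can be sketched as follows (illustration only):

/*
 *	waiter (rt_spin_lock_slowlock)		owner (wakeup_next_waiter, savestate)
 *	------------------------------		-------------------------------------
 *	1: set state != TASK_RUNNING		A: waiter->task = NULL;
 *	   (xchg() in				   smp_mb();
 *	    rt_set_current_blocked_state)	B: if (state & ~TASK_RUNNING_MUTEX)
 *	2: if (waiter.task)				wake_up_process_mutex(pendowner);
 *		schedule_rt_mutex(lock);
 *
 * With a full barrier on each side between its store and its load (the
 * xchg() is relied on as the waiter-side barrier, per the comment above),
 * at least one side must observe the other's store: either the waiter sees
 * task == NULL and skips the sleep, or the owner sees the non-running state
 * and issues the wakeup.
 */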
/*
@@ -534,22 +597,22 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
* Must be called with lock->wait_lock held
*/
static void remove_waiter(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
+ struct rt_mutex_waiter *waiter,
+ unsigned long flags)
{
int first = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
- unsigned long flags;
int chain_walk = 0;
- atomic_spin_lock_irqsave(&current->pi_lock, flags);
+ atomic_spin_lock(&current->pi_lock);
plist_del(&waiter->list_entry, &lock->wait_list);
waiter->task = NULL;
current->pi_blocked_on = NULL;
- atomic_spin_unlock_irqrestore(&current->pi_lock, flags);
+ atomic_spin_unlock(&current->pi_lock);
if (first && owner != current) {
- atomic_spin_lock_irqsave(&owner->pi_lock, flags);
+ atomic_spin_lock(&owner->pi_lock);
plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
@@ -564,7 +627,7 @@ static void remove_waiter(struct rt_mutex *lock,
if (owner->pi_blocked_on)
chain_walk = 1;
- atomic_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ atomic_spin_unlock(&owner->pi_lock);
}
WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
@@ -575,11 +638,11 @@ static void remove_waiter(struct rt_mutex *lock,
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(owner);
- atomic_spin_unlock(&lock->wait_lock);
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
- atomic_spin_lock(&lock->wait_lock);
+ atomic_spin_lock_irq(&lock->wait_lock);
}
/*
@@ -600,18 +663,391 @@ void rt_mutex_adjust_pi(struct task_struct *task)
return;
}
- atomic_spin_unlock_irqrestore(&task->pi_lock, flags);
-
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(task);
+ atomic_spin_unlock_irqrestore(&task->pi_lock, flags);
rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
}
+/*
+ * preemptible spin_lock functions:
+ */
+
+#ifdef CONFIG_PREEMPT_RT
+
+static inline void
+rt_spin_lock_fastlock(struct rt_mutex *lock,
+ void (*slowfn)(struct rt_mutex *lock))
+{
+ /* Temporary HACK! */
+ if (likely(!current->in_printk))
+ might_sleep();
+ else if (in_atomic() || irqs_disabled())
+ /* don't grab locks for printk in atomic */
+ return;
+
+ if (likely(rt_mutex_cmpxchg(lock, NULL, current)))
+ rt_mutex_deadlock_account_lock(lock, current);
+ else
+ slowfn(lock);
+}
+
+static inline void
+rt_spin_lock_fastunlock(struct rt_mutex *lock,
+ void (*slowfn)(struct rt_mutex *lock))
+{
+ /* Temporary HACK! */
+ if (unlikely(rt_mutex_owner(lock) != current) && current->in_printk)
+ /* don't grab locks for printk in atomic */
+ return;
+
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+ rt_mutex_deadlock_account_unlock(current);
+ else
+ slowfn(lock);
+}
+
+
+#ifdef CONFIG_SMP
+static int adaptive_wait(struct rt_mutex_waiter *waiter,
+ struct task_struct *orig_owner)
+{
+ for (;;) {
+
+ /* Were we woken by the owner (waiter->task cleared)? */
+ if (!waiter->task)
+ return 0;
+
+ /* Owner changed? Then let's refresh the original owner */
+ if (orig_owner != rt_mutex_owner(waiter->lock))
+ return 0;
+
+ /* Owner went to bed, so should we */
+ if (!task_is_current(orig_owner))
+ return 1;
+
+ cpu_relax();
+ }
+}
+#else
+static int adaptive_wait(struct rt_mutex_waiter *waiter,
+ struct task_struct *orig_owner)
+{
+ return 1;
+}
+#endif
+
+/*
+ * The state setting needs to preserve the original state and needs to
+ * take care of non-rtmutex wakeups.
+ *
+ * Called with rtmutex->wait_lock held to serialize against rtmutex
+ * wakeups.
+ */
+static inline unsigned long
+rt_set_current_blocked_state(unsigned long saved_state)
+{
+ unsigned long state, block_state;
+
+ /*
+ * If state is TASK_INTERRUPTIBLE, then we set the state for
+ * blocking to TASK_INTERRUPTIBLE as well, otherwise we would
+ * miss real wakeups via wake_up_interruptible(). If such a
+ * wakeup happens we see the running state and preserve it in
+ * saved_state. Now we can ignore further wakeups as we will
+ * return in state running from our "spin" sleep.
+ */
+ if (saved_state == TASK_INTERRUPTIBLE)
+ block_state = TASK_INTERRUPTIBLE;
+ else
+ block_state = TASK_UNINTERRUPTIBLE;
+
+ state = xchg(&current->state, block_state);
+ /*
+ * Take care of non-rtmutex wakeups: a real wakeup sets the state
+ * to TASK_RUNNING, while rtmutex wakeups merely OR
+ * TASK_RUNNING_MUTEX into the (UN)INTERRUPTIBLE state.
+ */
+ if (state == TASK_RUNNING)
+ saved_state = TASK_RUNNING;
+
+ return saved_state;
+}
+
+static inline void rt_restore_current_state(unsigned long saved_state)
+{
+ unsigned long state = xchg(&current->state, saved_state);
+
+ if (state == TASK_RUNNING)
+ current->state = TASK_RUNNING;
+}
+
+/*
+ * Slow path lock function spin_lock style: this variant is very
+ * careful not to miss any non-lock wakeups.
+ *
+ * The wakeup side uses wake_up_process_mutex, which, combined with
+ * the xchg code of this function is a transparent sleep/wakeup
+ * mechanism nested within any existing sleep/wakeup mechanism. This
+ * enables the seamless use of arbitrary (blocking) spinlocks within
+ * sleep/wakeup event loops.
+ */
+static void noinline __sched
+rt_spin_lock_slowlock(struct rt_mutex *lock)
+{
+ struct rt_mutex_waiter waiter;
+ unsigned long saved_state, flags;
+ struct task_struct *orig_owner;
+
+ debug_rt_mutex_init_waiter(&waiter);
+ waiter.task = NULL;
+
+ atomic_spin_lock_irqsave(&lock->wait_lock, flags);
+ init_lists(lock);
+
+ BUG_ON(rt_mutex_owner(lock) == current);
+
+ /*
+ * Here we save whatever state the task was in originally,
+ * we'll restore it at the end of the function and we'll take
+ * any intermediate wakeup into account as well, independently
+ * of the lock sleep/wakeup mechanism. When we get a real
+ * wakeup the task->state is TASK_RUNNING and we change
+ * saved_state accordingly. If we did not get a real wakeup
+ * then we return with the saved state. We need to be careful
+ * about original state TASK_INTERRUPTIBLE as well, as we
+ * could miss a wake_up_interruptible().
+ */
+ saved_state = rt_set_current_blocked_state(current->state);
+
+ for (;;) {
+ int saved_lock_depth = current->lock_depth;
+
+ /* Try to acquire the lock */
+ if (do_try_to_take_rt_mutex(lock, STEAL_LATERAL))
+ break;
+
+ /*
+ * waiter.task is NULL the first time we come here and
+ * when we have been woken up by the previous owner
+ * but the lock got stolen by a higher-prio task.
+ */
+ if (!waiter.task) {
+ task_blocks_on_rt_mutex(lock, &waiter, current, 0,
+ flags);
+ /* Wakeup during boost ? */
+ if (unlikely(!waiter.task))
+ continue;
+ }
+
+ /*
+ * Prevent schedule() from dropping the BKL while waiting for
+ * the lock! We restore lock_depth when we come back.
+ */
+ current->lock_depth = -1;
+ orig_owner = rt_mutex_owner(lock);
+ get_task_struct(orig_owner);
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ debug_rt_mutex_print_deadlock(&waiter);
+
+ if (adaptive_wait(&waiter, orig_owner)) {
+ put_task_struct(orig_owner);
+
+ if (waiter.task)
+ schedule_rt_mutex(lock);
+ } else
+ put_task_struct(orig_owner);
+
+ atomic_spin_lock_irqsave(&lock->wait_lock, flags);
+ current->lock_depth = saved_lock_depth;
+ saved_state = rt_set_current_blocked_state(saved_state);
+ }
+
+ rt_restore_current_state(saved_state);
+
+ /*
+ * Extremely rare case: if we got woken up by a non-mutex wakeup
+ * and we managed to steal the lock despite not being the
+ * highest-prio waiter (due to SCHED_OTHER changing prio), then we
+ * can end up with a non-NULL waiter.task:
+ */
+ if (unlikely(waiter.task))
+ remove_waiter(lock, &waiter, flags);
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up:
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ debug_rt_mutex_free_waiter(&waiter);
+}
+
+/*
+ * Slow path to release a rt_mutex spin_lock style
+ */
+static void noinline __sched
+rt_spin_lock_slowunlock(struct rt_mutex *lock)
+{
+ unsigned long flags;
+
+ atomic_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ debug_rt_mutex_unlock(lock);
+
+ rt_mutex_deadlock_account_unlock(current);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ return;
+ }
+
+ wakeup_next_waiter(lock, 1);
+
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ /* Undo PI boosting when necessary */
+ rt_mutex_adjust_prio(current);
+}
+
+void __lockfunc rt_spin_lock(spinlock_t *lock)
+{
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+}
+EXPORT_SYMBOL(rt_spin_lock);
+
+void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
+{
+ rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
+}
+EXPORT_SYMBOL(__rt_spin_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+}
+EXPORT_SYMBOL(rt_spin_lock_nested);
+
+#endif
+
+void __lockfunc rt_spin_unlock(spinlock_t *lock)
+{
+ /* NOTE: we always pass in '1' for nested, for simplicity */
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
+}
+EXPORT_SYMBOL(rt_spin_unlock);
+
+void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
+{
+ rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
+}
+EXPORT_SYMBOL(__rt_spin_unlock);
+
+/*
+ * Wait for the lock to get unlocked: instead of polling for an unlock
+ * (like raw spinlocks do), we lock and unlock, to force the kernel to
+ * schedule if there's contention:
+ */
+void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
+{
+ spin_lock(lock);
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL(rt_spin_unlock_wait);
+
+int __lockfunc rt_spin_trylock(spinlock_t *lock)
+{
+ int ret = rt_mutex_trylock(&lock->lock);
+
+ if (ret)
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock);
+
+int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
+{
+ int ret;
+
+ *flags = 0;
+ ret = rt_mutex_trylock(&lock->lock);
+ if (ret)
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_irqsave);
+
+int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
+{
+ /* Subtract 1 from the counter unless that drops it to 0 (i.e. it was 1) */
+ if (atomic_add_unless(atomic, -1, 1))
+ return 0;
+ rt_spin_lock(lock);
+ if (atomic_dec_and_test(atomic))
+ return 1;
+ rt_spin_unlock(lock);
+ return 0;
+}
+EXPORT_SYMBOL(atomic_dec_and_spin_lock);
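For illustration (names invented): the same last-reference pattern as atomic_dec_and_mutex_lock() earlier in this patch, but with a sleeping spinlock.

#include <linux/spinlock.h>
#include <asm/atomic.h>

static DEFINE_SPINLOCK(example_table_lock);	/* protects a hypothetical object table */

struct example_ref {
	atomic_t count;
	/* ... */
};

static void example_ref_put(struct example_ref *ref)
{
	/* Returns 1 holding the lock only when the count really dropped to 0 */
	if (atomic_dec_and_spin_lock(&ref->count, &example_table_lock)) {
		/* ... remove ref from the table and free it ... */
		rt_spin_unlock(&example_table_lock);
	}
}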
+
+void
+__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+ __rt_mutex_init(&lock->lock, name);
+}
+EXPORT_SYMBOL(__rt_spin_lock_init);
+
+#endif
+
+static inline int rt_release_bkl(struct rt_mutex *lock, unsigned long flags)
+{
+ int saved_lock_depth = current->lock_depth;
+
+#ifdef CONFIG_LOCK_KERNEL
+ current->lock_depth = -1;
+ /*
+ * try_to_take_rt_mutex() set the waiters bit; make sure it's
+ * still correct.
+ */
+ fixup_rt_mutex_waiters(lock);
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ up(&kernel_sem);
+
+ atomic_spin_lock_irq(&lock->wait_lock);
+#endif
+ return saved_lock_depth;
+}
+
+static inline void rt_reacquire_bkl(int saved_lock_depth)
+{
+#ifdef CONFIG_LOCK_KERNEL
+ down(&kernel_sem);
+ current->lock_depth = saved_lock_depth;
+#endif
+}
+
/**
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
* @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
+ * or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
* @detect_deadlock: passed to task_blocks_on_rt_mutex
@@ -622,7 +1058,7 @@ static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
struct rt_mutex_waiter *waiter,
- int detect_deadlock)
+ int detect_deadlock, unsigned long flags)
{
int ret = 0;
@@ -652,7 +1088,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
if (!waiter->task) {
ret = task_blocks_on_rt_mutex(lock, waiter, current,
- detect_deadlock);
+ detect_deadlock, flags);
/*
* If we got woken up by the owner then start loop
* all over without going into schedule to try
@@ -672,14 +1108,15 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
break;
}
- atomic_spin_unlock(&lock->wait_lock);
+ atomic_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
if (waiter->task)
schedule_rt_mutex(lock);
- atomic_spin_lock(&lock->wait_lock);
+ atomic_spin_lock_irq(&lock->wait_lock);
+
set_current_state(state);
}
@@ -694,20 +1131,29 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
int detect_deadlock)
{
+ int ret = 0, saved_lock_depth = -1;
struct rt_mutex_waiter waiter;
- int ret = 0;
+ unsigned long flags;
debug_rt_mutex_init_waiter(&waiter);
waiter.task = NULL;
- atomic_spin_lock(&lock->wait_lock);
+ atomic_spin_lock_irqsave(&lock->wait_lock, flags);
+ init_lists(lock);
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock)) {
- atomic_spin_unlock(&lock->wait_lock);
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
return 0;
}
+ /*
+ * We drop the BKL here before we go into the wait loop to avoid a
+ * possible deadlock in the scheduler.
+ */
+ if (unlikely(current->lock_depth >= 0))
+ saved_lock_depth = rt_release_bkl(lock, flags);
+
set_current_state(state);
/* Setup the timer, when timeout != NULL */
@@ -718,12 +1164,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
}
ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
- detect_deadlock);
+ detect_deadlock, flags);
set_current_state(TASK_RUNNING);
if (unlikely(waiter.task))
- remove_waiter(lock, &waiter);
+ remove_waiter(lock, &waiter, flags);
/*
* try_to_take_rt_mutex() sets the waiter bit
@@ -731,7 +1177,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
fixup_rt_mutex_waiters(lock);
- atomic_spin_unlock(&lock->wait_lock);
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* Remove pending timer: */
if (unlikely(timeout))
@@ -745,6 +1191,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
if (unlikely(ret))
rt_mutex_adjust_prio(current);
+ /* Must we reacquire the BKL? */
+ if (unlikely(saved_lock_depth >= 0))
+ rt_reacquire_bkl(saved_lock_depth);
+
debug_rt_mutex_free_waiter(&waiter);
return ret;
@@ -756,12 +1206,15 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
static inline int
rt_mutex_slowtrylock(struct rt_mutex *lock)
{
+ unsigned long flags;
int ret = 0;
- atomic_spin_lock(&lock->wait_lock);
+ atomic_spin_lock_irqsave(&lock->wait_lock, flags);
if (likely(rt_mutex_owner(lock) != current)) {
+ init_lists(lock);
+
ret = try_to_take_rt_mutex(lock);
/*
* try_to_take_rt_mutex() sets the lock waiters
@@ -770,7 +1223,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
fixup_rt_mutex_waiters(lock);
}
- atomic_spin_unlock(&lock->wait_lock);
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
return ret;
}
@@ -781,7 +1234,9 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
static void __sched
rt_mutex_slowunlock(struct rt_mutex *lock)
{
- atomic_spin_lock(&lock->wait_lock);
+ unsigned long flags;
+
+ atomic_spin_lock_irqsave(&lock->wait_lock, flags);
debug_rt_mutex_unlock(lock);
@@ -789,13 +1244,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
if (!rt_mutex_has_waiters(lock)) {
lock->owner = NULL;
- atomic_spin_unlock(&lock->wait_lock);
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
return;
}
- wakeup_next_waiter(lock);
+ wakeup_next_waiter(lock, 0);
- atomic_spin_unlock(&lock->wait_lock);
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* Undo pi boosting if necessary: */
rt_mutex_adjust_prio(current);
@@ -857,6 +1312,27 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
}
/**
+ * rt_mutex_lock_killable - lock a rt_mutex killable
+ *
+ * @lock: the rt_mutex to be locked
+ * @detect_deadlock: deadlock detection on/off
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ */
+int __sched rt_mutex_lock_killable(struct rt_mutex *lock,
+ int detect_deadlock)
+{
+ might_sleep();
+
+ return rt_mutex_fastlock(lock, TASK_KILLABLE,
+ detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
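A hedged usage sketch of rt_mutex_lock_killable() (lock instance and caller invented): only a fatal signal interrupts the wait, so the caller has to cope with -EINTR.

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(example_rtm);	/* hypothetical lock */

static int example_killable_section(void)
{
	int ret;

	/* deadlock detection off; -EINTR if the task is being killed */
	ret = rt_mutex_lock_killable(&example_rtm, 0);
	if (ret)
		return ret;

	/* ... critical section ... */

	rt_mutex_unlock(&example_rtm);
	return 0;
}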
+
+/**
* rt_mutex_lock - lock a rt_mutex
*
* @lock: the rt_mutex to be locked
@@ -1030,13 +1506,15 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task, int detect_deadlock)
{
+ unsigned long flags;
int ret;
- atomic_spin_lock(&lock->wait_lock);
+ atomic_spin_lock_irqsave(&lock->wait_lock, flags);
mark_rt_mutex_waiters(lock);
- if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
+ if (!rt_mutex_owner(lock) ||
+ try_to_steal_lock(lock, task, STEAL_NORMAL)) {
/* We got the lock for task. */
debug_rt_mutex_lock(lock);
rt_mutex_set_owner(lock, task, 0);
@@ -1045,7 +1523,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
return 1;
}
- ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+ ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock,
+ flags);
if (ret && !waiter->task) {
/*
@@ -1056,7 +1535,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
*/
ret = 0;
}
- atomic_spin_unlock(&lock->wait_lock);
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
debug_rt_mutex_print_deadlock(waiter);
@@ -1104,19 +1583,20 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
int detect_deadlock)
{
+ unsigned long flags;
int ret;
- atomic_spin_lock(&lock->wait_lock);
+ atomic_spin_lock_irqsave(&lock->wait_lock, flags);
set_current_state(TASK_INTERRUPTIBLE);
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
- detect_deadlock);
+ detect_deadlock, flags);
set_current_state(TASK_RUNNING);
if (unlikely(waiter->task))
- remove_waiter(lock, waiter);
+ remove_waiter(lock, waiter, flags);
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
@@ -1124,7 +1604,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
*/
fixup_rt_mutex_waiters(lock);
- atomic_spin_unlock(&lock->wait_lock);
+ atomic_spin_unlock_irqrestore(&lock->wait_lock, flags);
/*
* Readjust priority, when we did not get the lock. We might have been
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866af..4df690ca5038 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -129,6 +129,26 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
int detect_deadlock);
+
+#define STEAL_LATERAL 1
+#define STEAL_NORMAL 0
+
+/*
+ * Note that RT tasks are excluded from lateral-steals to prevent the
+ * introduction of an unbounded latency.
+ */
+static inline int lock_is_stealable(struct task_struct *task,
+ struct task_struct *pendowner, int mode)
+{
+ if (mode == STEAL_NORMAL || rt_task(task)) {
+ if (task->prio >= pendowner->prio)
+ return 0;
+ } else if (task->prio > pendowner->prio)
+ return 0;
+
+ return 1;
+}
+
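Concretely, a worked illustration of lock_is_stealable() (numbers invented; a lower prio value means higher priority):

/*
 *	task->prio  pendowner->prio  mode           rt_task(task)  stealable?
 *	    120          120         STEAL_NORMAL        no            no
 *	    120          120         STEAL_LATERAL       no            yes
 *	    119          120         STEAL_NORMAL        no            yes
 *	     40           40         STEAL_LATERAL       yes           no
 *
 * i.e. lateral (equal-priority) steals are only allowed for non-RT tasks.
 */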
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
#else
diff --git a/kernel/rwlock.c b/kernel/rwlock.c
index e063dc52b749..20a357d86c52 100644
--- a/kernel/rwlock.c
+++ b/kernel/rwlock.c
@@ -14,6 +14,8 @@
* frame contact the architecture maintainers.
*/
+#ifndef CONFIG_PREEMPT_RT
+
#include <linux/linkage.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
@@ -221,3 +223,4 @@ void __lockfunc _write_unlock_bh(rwlock_t *lock)
}
EXPORT_SYMBOL(_write_unlock_bh);
+#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index dd4902242a36..9e0ba5928330 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4,6 +4,7 @@
* Kernel scheduler and related syscalls
*
* Copyright (C) 1991-2002 Linus Torvalds
+ * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
* make semaphores SMP safe
@@ -16,6 +17,7 @@
* by Davide Libenzi, preemptible kernel bits by Robert Love.
* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
+ * 2004-10-13 Real-Time Preemption support by Ingo Molnar
* 2007-04-15 Work begun on replacing all interactivity tuning with a
* fair scheduling design by Con Kolivas.
* 2007-05-05 Load balancing (smp-nice) and other improvements
@@ -61,6 +63,7 @@
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
+#include <linux/kallsyms.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
@@ -107,6 +110,20 @@
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT
+#if (BITS_PER_LONG < 64)
+#define JIFFIES_TO_NS64(TIME) \
+ ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ)))
+
+#define NS64_TO_JIFFIES(TIME) \
+ ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \
+ (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME)))
+#else /* BITS_PER_LONG < 64 */
+
+#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME)
+#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME)
+
+#endif /* BITS_PER_LONG < 64 */
+
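A quick worked check of the 32-bit variants above, assuming HZ == 1000 (one jiffy == 1,000,000 ns):

/*
 *	JIFFIES_TO_NS64(250) == 250ULL * (1000000000 / HZ)
 *	                     == 250ULL * 1000000 == 250,000,000 ns
 *
 * NS64_TO_JIFFIES() converts the high BITS_PER_LONG bits separately
 * (scaled by 1 + NS_TO_JIFFIES(~0UL), roughly the jiffies per
 * 2^BITS_PER_LONG ns) and adds NS_TO_JIFFIES() of the low word, so
 * 64-bit nanosecond values stay convertible on a 32-bit kernel.
 */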
/*
* These are the 'tuning knobs' of the scheduler:
*
@@ -144,6 +161,32 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
}
#endif
+#define TASK_PREEMPTS_CURR(p, rq) \
+ ((p)->prio < (rq)->curr->prio)
+
+/*
+ * Tweaks for current
+ */
+
+#ifdef CURRENT_PTR
+struct task_struct * const ___current = &init_task;
+struct task_struct ** const current_ptr = (struct task_struct ** const)&___current;
+struct thread_info * const current_ti = &init_thread_union.thread_info;
+struct thread_info ** const current_ti_ptr = (struct thread_info ** const)&current_ti;
+
+EXPORT_SYMBOL(___current);
+EXPORT_SYMBOL(current_ti);
+
+/*
+ * The scheduler itself doesn't want 'current' to be cached
+ * during context-switches:
+ */
+# undef current
+# define current __current()
+# undef current_thread_info
+# define current_thread_info() __current_thread_info()
+#endif
+
static inline int rt_policy(int policy)
{
if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -207,6 +250,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rt_b->rt_period_timer.irqsafe = 1;
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
@@ -497,6 +541,7 @@ struct rt_rq {
int overloaded;
struct plist_head pushable_tasks;
#endif
+ unsigned long rt_nr_uninterruptible;
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
@@ -602,6 +647,8 @@ struct rq {
*/
unsigned long nr_uninterruptible;
+ unsigned long switch_timestamp;
+ unsigned long slice_avg;
struct task_struct *curr, *idle;
unsigned long next_balance;
struct mm_struct *prev_mm;
@@ -660,6 +707,13 @@ struct rq {
/* BKL stats */
unsigned int bkl_count;
+
+ /* RT-overload stats: */
+ unsigned long rto_schedule;
+ unsigned long rto_schedule_tail;
+ unsigned long rto_wakeup;
+ unsigned long rto_pulled;
+ unsigned long rto_pushed;
#endif
};
@@ -699,6 +753,13 @@ inline void update_rq_clock(struct rq *rq)
rq->clock = sched_clock_cpu(cpu_of(rq));
}
+#ifndef CONFIG_SMP
+int task_is_current(struct task_struct *task)
+{
+ return task_rq(task)->curr == task;
+}
+#endif
+
/*
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
@@ -887,11 +948,23 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
+/*
+ * We really don't want to do anything complex within switch_to()
+ * on PREEMPT_RT - this check enforces that.
+ */
+#ifdef prepare_arch_switch
+# ifdef CONFIG_PREEMPT_RT
+# error FIXME
+# else
+# define _finish_arch_switch finish_arch_switch
+# endif
+#endif
+
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_switch
-# define finish_arch_switch(prev) do { } while (0)
+# define _finish_arch_switch(prev) do { } while (0)
#endif
static inline int task_current(struct rq *rq, struct task_struct *p)
@@ -899,18 +972,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
return rq->curr == p;
}
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
+#ifdef CONFIG_SMP
+ return p->oncpu;
+#else
return task_current(rq, p);
+#endif
}
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
+#ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->oncpu = 1;
+#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_SMP
+ /*
+ * After ->oncpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->oncpu = 0;
+#endif
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
@@ -922,18 +1016,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
- atomic_spin_unlock_irq(&rq->lock);
+ atomic_spin_unlock(&rq->lock);
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
- return p->oncpu;
-#else
- return task_current(rq, p);
-#endif
-}
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
@@ -963,8 +1049,8 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
smp_wmb();
prev->oncpu = 0;
#endif
-#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- local_irq_enable();
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ local_irq_disable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -1176,6 +1262,7 @@ static void init_rq_hrtick(struct rq *rq)
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
+ rq->hrtick_timer.irqsafe = 1;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
@@ -1251,7 +1338,7 @@ void wake_up_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- if (cpu == smp_processor_id())
+ if (cpu == raw_smp_processor_id())
return;
/*
@@ -1832,6 +1919,8 @@ static inline int normal_prio(struct task_struct *p)
prio = MAX_RT_PRIO-1 - p->rt_priority;
else
prio = __normal_prio(p);
+
+// trace_special_pid(p->pid, PRIO(p), __PRIO(prio));
return prio;
}
@@ -2415,7 +2504,8 @@ void task_oncpu_function_call(struct task_struct *p,
*
* returns failure only if the task is already active.
*/
-static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int sync, int mutex)
{
int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
@@ -2441,6 +2531,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
}
#endif
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * sync wakeups can increase wakeup latencies:
+ */
+ if (rt_task(p))
+ sync = 0;
+#endif
smp_wmb();
rq = task_rq_lock(p, &flags);
update_rq_clock(rq);
@@ -2524,7 +2621,18 @@ out_running:
trace_sched_wakeup(rq, p, success);
check_preempt_curr(rq, p, sync);
- p->state = TASK_RUNNING;
+ /*
+ * For a mutex wakeup we OR TASK_RUNNING_MUTEX into the task
+ * state to preserve the original state, so a real wakeup
+ * can still see the (UN)INTERRUPTIBLE bits in the state check
+ * above. We don't have to worry about the | TASK_RUNNING_MUTEX
+ * here: the waiter is serialized by the mutex lock and nobody
+ * else can fiddle with p->state as we hold the rq lock.
+ */
+ if (mutex)
+ p->state |= TASK_RUNNING_MUTEX;
+ else
+ p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
p->sched_class->task_wake_up(rq, p);
@@ -2548,13 +2656,31 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_ALL, 0);
+ return try_to_wake_up(p, TASK_ALL, 0, 0);
}
EXPORT_SYMBOL(wake_up_process);
+int wake_up_process_sync(struct task_struct * p)
+{
+ return try_to_wake_up(p, TASK_ALL, 1, 0);
+}
+EXPORT_SYMBOL(wake_up_process_sync);
+
+int wake_up_process_mutex(struct task_struct * p)
+{
+ return try_to_wake_up(p, TASK_ALL, 0, 1);
+}
+EXPORT_SYMBOL(wake_up_process_mutex);
+
+int wake_up_process_mutex_sync(struct task_struct * p)
+{
+ return try_to_wake_up(p, TASK_ALL, 1, 1);
+}
+EXPORT_SYMBOL(wake_up_process_mutex_sync);
+
int wake_up_state(struct task_struct *p, unsigned int state)
{
- return try_to_wake_up(p, state, 0);
+ return try_to_wake_up(p, state, 0, 0);
}
/*
@@ -2650,7 +2776,7 @@ void sched_fork(struct task_struct *p, int clone_flags)
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
p->oncpu = 0;
#endif
#ifdef CONFIG_PREEMPT
@@ -2728,8 +2854,17 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
struct preempt_notifier *notifier;
struct hlist_node *node;
+ if (hlist_empty(&curr->preempt_notifiers))
+ return;
+
+ /*
+ * The KVM sched in notifier expects to be called with
+ * interrupts enabled.
+ */
+ local_irq_enable();
hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
+ local_irq_disable();
}
static void
@@ -2820,7 +2955,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
* Manfred Spraul <manfred@colorfullife.com>
*/
prev_state = prev->state;
- finish_arch_switch(prev);
+ _finish_arch_switch(prev);
perf_counter_task_sched_in(current, cpu_of(rq));
finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
@@ -2829,8 +2964,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
#endif
fire_sched_in_preempt_notifiers(current);
+ /*
+ * Delay the final freeing of the mm or task, so that we don't have
+ * to do complex work from within the scheduler:
+ */
if (mm)
- mmdrop(mm);
+ mmdrop_delayed(mm);
if (unlikely(prev_state == TASK_DEAD)) {
/*
* Remove function-return probe instances associated with this
@@ -2848,12 +2987,15 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
asmlinkage void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- struct rq *rq = this_rq();
-
- finish_task_switch(rq, prev);
+ preempt_disable();
+ finish_task_switch(this_rq(), prev);
+ __preempt_enable_no_resched();
+ local_irq_enable();
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
/* In this case, finish_task_switch does not reenable preemption */
preempt_enable();
+#else
+ preempt_check_resched();
#endif
if (current->set_child_tid)
put_user(task_pid_vnr(current), current->set_child_tid);
@@ -2901,6 +3043,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif
+#ifdef CURRENT_PTR
+ barrier();
+ *current_ptr = next;
+ *current_ti_ptr = next->thread_info;
+#endif
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
@@ -2947,6 +3094,11 @@ unsigned long nr_uninterruptible(void)
return sum;
}
+unsigned long nr_uninterruptible_cpu(int cpu)
+{
+ return cpu_rq(cpu)->nr_uninterruptible;
+}
+
unsigned long long nr_context_switches(void)
{
int i;
@@ -2965,6 +3117,13 @@ unsigned long nr_iowait(void)
for_each_possible_cpu(i)
sum += atomic_read(&cpu_rq(i)->nr_iowait);
+ /*
+ * Since we read the counters locklessly, the sum might be slightly
+ * inaccurate. Do not allow it to go below zero though:
+ */
+ if (unlikely((long)sum < 0))
+ sum = 0;
+
return sum;
}
@@ -4718,7 +4877,7 @@ out:
*/
static void run_rebalance_domains(struct softirq_action *h)
{
- int this_cpu = smp_processor_id();
+ int this_cpu = raw_smp_processor_id();
struct rq *this_rq = cpu_rq(this_cpu);
enum cpu_idle_type idle = this_rq->idle_at_tick ?
CPU_IDLE : CPU_NOT_IDLE;
@@ -4927,7 +5086,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
/* Add user time to cpustat. */
tmp = cputime_to_cputime64(cputime);
- if (TASK_NICE(p) > 0)
+ if (rt_task(p))
+ cpustat->user_rt = cputime64_add(cpustat->user_rt, tmp);
+ else if (TASK_NICE(p) > 0)
cpustat->nice = cputime64_add(cpustat->nice, tmp);
else
cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4989,8 +5150,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
cpustat->irq = cputime64_add(cpustat->irq, tmp);
- else if (softirq_count())
+ else if (softirq_count() || (p->flags & PF_SOFTIRQ))
cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ else if (rt_task(p))
+ cpustat->system_rt = cputime64_add(cpustat->system_rt, tmp);
else
cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -5145,10 +5308,13 @@ void scheduler_tick(void)
sched_clock_tick();
+ BUG_ON(!irqs_disabled());
+
atomic_spin_lock(&rq->lock);
update_rq_clock(rq);
update_cpu_load(rq);
- curr->sched_class->task_tick(rq, curr, 0);
+ if (curr != rq->idle && curr->se.on_rq)
+ curr->sched_class->task_tick(rq, curr, 0);
atomic_spin_unlock(&rq->lock);
perf_counter_task_tick(curr, cpu);
@@ -5238,8 +5404,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
{
struct pt_regs *regs = get_irq_regs();
- printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
- prev->comm, prev->pid, preempt_count());
+ printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n",
+ prev->comm, preempt_count(), prev->pid, smp_processor_id());
debug_show_held_locks(prev);
print_modules();
@@ -5257,12 +5423,14 @@ static noinline void __schedule_bug(struct task_struct *prev)
*/
static inline void schedule_debug(struct task_struct *prev)
{
+// WARN_ON(system_state == SYSTEM_BOOTING);
+
/*
* Test if we are atomic. Since do_exit() needs to call into
* schedule() atomically, we ignore that path for now.
* Otherwise, whine if we are scheduling when we should not be.
*/
- if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+ if (unlikely(in_atomic() && !prev->exit_state))
__schedule_bug(prev);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -5333,15 +5501,13 @@ pick_next_task(struct rq *rq)
/*
* schedule() is the main scheduler function.
*/
-asmlinkage void __sched schedule(void)
+asmlinkage void __sched __schedule(void)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq *rq;
int cpu;
-need_resched:
- preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_qsctr_inc(cpu);
@@ -5349,10 +5515,11 @@ need_resched:
switch_count = &prev->nivcsw;
release_kernel_lock(prev);
-need_resched_nonpreemptible:
schedule_debug(prev);
+ preempt_disable();
+
if (sched_feat(HRTICK))
hrtick_clear(rq);
@@ -5360,14 +5527,20 @@ need_resched_nonpreemptible:
update_rq_clock(rq);
clear_tsk_need_resched(prev);
- if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+ if (!(prev->state & TASK_RUNNING_MUTEX) && prev->state &&
+ !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev)))
prev->state = TASK_RUNNING;
- else
+ else {
+ touch_softlockup_watchdog();
deactivate_task(rq, prev, 1);
+ }
switch_count = &prev->nvcsw;
}
+ if (preempt_count() & PREEMPT_ACTIVE)
+ sub_preempt_count(PREEMPT_ACTIVE);
+
#ifdef CONFIG_SMP
if (prev->sched_class->pre_schedule)
prev->sched_class->pre_schedule(rq, prev);
@@ -5394,19 +5567,28 @@ need_resched_nonpreemptible:
*/
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- } else
- atomic_spin_unlock_irq(&rq->lock);
+ __preempt_enable_no_resched();
+ } else {
+ __preempt_enable_no_resched();
+ atomic_spin_unlock(&rq->lock);
+ }
- if (unlikely(reacquire_kernel_lock(current) < 0))
- goto need_resched_nonpreemptible;
+ reacquire_kernel_lock(current);
+}
+
+asmlinkage void __sched schedule(void)
+{
+need_resched:
+ local_irq_disable();
+ __schedule();
+ local_irq_enable();
- __preempt_enable_no_resched();
if (need_resched())
goto need_resched;
}
EXPORT_SYMBOL(schedule);
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)
/*
* Look out! "owner" is an entirely speculative pointer
* access and not reliable.
@@ -5468,6 +5650,35 @@ out:
#endif
#ifdef CONFIG_PREEMPT
+
+/*
+ * Global flag to turn preemption off on a CONFIG_PREEMPT kernel:
+ */
+int kernel_preemption = 1;
+
+static int __init preempt_setup (char *str)
+{
+ if (!strncmp(str, "off", 3)) {
+ if (kernel_preemption) {
+ printk(KERN_INFO "turning off kernel preemption!\n");
+ kernel_preemption = 0;
+ }
+ return 1;
+ }
+ if (!strncmp(str, "on", 2)) {
+ if (!kernel_preemption) {
+ printk(KERN_INFO "turning on kernel preemption!\n");
+ kernel_preemption = 1;
+ }
+ return 1;
+ }
+ get_option(&str, &kernel_preemption);
+
+ return 1;
+}
+
+__setup("preempt=", preempt_setup);
+
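
A hedged user-space sketch of the "preempt=" parsing added above (parse_preempt() and the sample strings are invented; the kernel's get_option() is replaced by strtol()). It only illustrates how "off", "on" and a numeric value end up in the global flag, e.g. booting with preempt=off leaves kernel_preemption at 0.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int kernel_preemption = 1;

    static void parse_preempt(const char *str)
    {
        if (!strncmp(str, "off", 3))
            kernel_preemption = 0;
        else if (!strncmp(str, "on", 2))
            kernel_preemption = 1;
        else
            /* stand-in for the kernel's get_option() */
            kernel_preemption = (int)strtol(str, NULL, 0);
    }

    int main(void)
    {
        const char *samples[] = { "off", "on", "0", "1" };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            parse_preempt(samples[i]);
            printf("preempt=%s -> kernel_preemption=%d\n",
                   samples[i], kernel_preemption);
        }
        return 0;
    }
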
/*
* this is the entry point to schedule() from in-kernel preemption
* off of preempt_enable. Kernel preemptions off return from interrupt
@@ -5479,6 +5690,8 @@ asmlinkage void __sched preempt_schedule(void)
struct task_struct *task = current;
int saved_lock_depth;
+ if (!kernel_preemption)
+ return;
/*
* If there is a non-zero preempt_count or interrupts are disabled,
* we do not want to preempt the current task. Just return..
@@ -5487,6 +5700,7 @@ asmlinkage void __sched preempt_schedule(void)
return;
do {
+ local_irq_disable();
add_preempt_count(PREEMPT_ACTIVE);
/*
@@ -5496,9 +5710,9 @@ asmlinkage void __sched preempt_schedule(void)
*/
saved_lock_depth = task->lock_depth;
task->lock_depth = -1;
- schedule();
+ __schedule();
task->lock_depth = saved_lock_depth;
- sub_preempt_count(PREEMPT_ACTIVE);
+ local_irq_enable();
/*
* Check again in case we missed a preemption opportunity
@@ -5510,10 +5724,10 @@ asmlinkage void __sched preempt_schedule(void)
EXPORT_SYMBOL(preempt_schedule);
/*
- * this is the entry point to schedule() from kernel preemption
- * off of irq context.
- * Note, that this is called and return with irqs disabled. This will
- * protect us against recursive calling from irq.
+ * this is the entry point for the IRQ return path. Called with
+ * interrupts disabled. To avoid infinite irq-entry recursion problems
+ * with fast-paced IRQ sources we do all of this carefully to never
+ * enable interrupts again.
*/
asmlinkage void __sched preempt_schedule_irq(void)
{
@@ -5521,10 +5735,17 @@ asmlinkage void __sched preempt_schedule_irq(void)
struct task_struct *task = current;
int saved_lock_depth;
- /* Catch callers which need to be fixed */
- BUG_ON(ti->preempt_count || !irqs_disabled());
+ if (!kernel_preemption)
+ return;
+ /*
+ * If there is a non-zero preempt_count then just return.
+ * (interrupts are disabled)
+ */
+ if (unlikely(ti->preempt_count))
+ return;
do {
+ local_irq_disable();
add_preempt_count(PREEMPT_ACTIVE);
/*
@@ -5534,11 +5755,9 @@ asmlinkage void __sched preempt_schedule_irq(void)
*/
saved_lock_depth = task->lock_depth;
task->lock_depth = -1;
- local_irq_enable();
- schedule();
+ __schedule();
local_irq_disable();
task->lock_depth = saved_lock_depth;
- sub_preempt_count(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -5553,7 +5772,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
void *key)
{
- return try_to_wake_up(curr->private, mode, sync);
+ return try_to_wake_up(curr->private, mode, sync, 0);
}
EXPORT_SYMBOL(default_wake_function);
@@ -5596,7 +5815,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode,
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, 0, key);
+ __wake_up_common(q, mode, nr_exclusive, 1, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(__wake_up);
@@ -5676,7 +5895,7 @@ void complete(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
x->done++;
- __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
+ __wake_up_common(&x->wait, TASK_NORMAL, 1, 1, NULL);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
@@ -5696,7 +5915,7 @@ void complete_all(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
x->done += UINT_MAX/2;
- __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
+ __wake_up_common(&x->wait, TASK_NORMAL, 0, 1, NULL);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete_all);
@@ -5932,6 +6151,25 @@ void task_setprio(struct task_struct *p, int prio)
BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags);
+
+ /*
+ * Idle task boosting is a no-no in general. There is one
+ * exception, when NOHZ is active:
+ *
+ * The idle task calls get_next_timer_interrupt() and holds
+ * the timer wheel base->lock on the CPU and another CPU wants
+ * to access the timer (probably to cancel it). We can safely
+ * ignore the boosting request, as the idle CPU runs this code
+ * with interrupts disabled and will complete the lock
+ * protected section without being interrupted. So there is no
+ * real need to boost.
+ */
+ if (unlikely(p == rq->idle)) {
+ WARN_ON(p != rq->curr);
+ WARN_ON(p->pi_blocked_on);
+ goto out_unlock;
+ }
+
update_rq_clock(rq);
oldprio = p->prio;
@@ -5958,6 +6196,8 @@ void task_setprio(struct task_struct *p, int prio)
check_class_changed(rq, p, prev_class, oldprio, running);
}
+
+out_unlock:
task_rq_unlock(rq, &flags);
}
@@ -6598,6 +6838,7 @@ SYSCALL_DEFINE0(sched_yield)
__release(rq->lock);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
_raw_spin_unlock(&rq->lock);
+ local_irq_enable();
preempt_enable_and_schedule();
@@ -6609,9 +6850,40 @@ static inline int should_resched(void)
return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
}
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
+void __might_sleep(char *file, int line)
+{
+#ifdef in_atomic
+ static unsigned long prev_jiffy; /* ratelimiting */
+
+ if ((!in_atomic() && !irqs_disabled()) ||
+ system_state != SYSTEM_RUNNING || oops_in_progress)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR
+ "BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ printk(KERN_ERR
+ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+ dump_stack();
+#endif
+}
+EXPORT_SYMBOL(__might_sleep);
+#endif
+
static void __cond_resched(void)
{
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
__might_sleep(__FILE__, __LINE__);
#endif
/*
@@ -6620,10 +6892,11 @@ static void __cond_resched(void)
* cond_resched() call.
*/
do {
+ local_irq_disable();
add_preempt_count(PREEMPT_ACTIVE);
- schedule();
- sub_preempt_count(PREEMPT_ACTIVE);
+ __schedule();
} while (need_resched());
+ local_irq_enable();
}
int __sched _cond_resched(void)
@@ -6662,9 +6935,16 @@ int cond_resched_lock(spinlock_t *lock)
}
EXPORT_SYMBOL(cond_resched_lock);
+/*
+ * Voluntarily preempt a process context that has softirqs disabled:
+ */
int __sched cond_resched_softirq(void)
{
- BUG_ON(!in_softirq());
+#ifndef CONFIG_PREEMPT_SOFTIRQS
+ WARN_ON_ONCE(!in_softirq());
+ if (!in_softirq())
+ return 0;
+#endif
if (should_resched()) {
local_bh_enable();
@@ -6676,17 +6956,75 @@ int __sched cond_resched_softirq(void)
}
EXPORT_SYMBOL(cond_resched_softirq);
+/*
+ * Voluntarily preempt a softirq context (possible with softirq threading):
+ */
+int __sched cond_resched_softirq_context(void)
+{
+ WARN_ON_ONCE(!in_softirq());
+
+ if (softirq_need_resched() && system_state == SYSTEM_RUNNING) {
+ raw_local_irq_disable();
+ _local_bh_enable();
+ raw_local_irq_enable();
+ __cond_resched();
+ local_bh_disable();
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(cond_resched_softirq_context);
+
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+int voluntary_preemption = 1;
+EXPORT_SYMBOL(voluntary_preemption);
+
+static int __init voluntary_preempt_setup (char *str)
+{
+ if (!strncmp(str, "off", 3))
+ voluntary_preemption = 0;
+ else
+ get_option(&str, &voluntary_preemption);
+ if (!voluntary_preemption)
+ printk("turning off voluntary preemption!\n");
+
+ return 1;
+}
+
+__setup("voluntary-preempt=", voluntary_preempt_setup);
+
+#endif
+
/**
* yield - yield the current processor to other threads.
*
* This is a shortcut for kernel-space yielding - it marks the
* thread runnable and calls sys_sched_yield().
*/
-void __sched yield(void)
+void __sched __yield(void)
{
set_current_state(TASK_RUNNING);
sys_sched_yield();
}
+
+void __sched yield(void)
+{
+ static int once = 1;
+
+ /*
+ * It's a bug to rely on yield() with RT priorities. We print
+ * the first occurrence after bootup ... this will still give
+ * us an idea about the scope of the problem, without spamming
+ * the syslog:
+ */
+ if (once && rt_task(current)) {
+ once = 0;
+ printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n",
+ current->comm, current->pid);
+ dump_stack();
+ }
+ __yield();
+}
EXPORT_SYMBOL(yield);
/*
@@ -6860,6 +7198,7 @@ void sched_show_task(struct task_struct *p)
void show_state_filter(unsigned long state_filter)
{
struct task_struct *g, *p;
+ int do_unlock = 1;
#if BITS_PER_LONG == 32
printk(KERN_INFO
@@ -6868,7 +7207,16 @@ void show_state_filter(unsigned long state_filter)
printk(KERN_INFO
" task PC stack pid father\n");
#endif
+#ifdef CONFIG_PREEMPT_RT
+ if (!read_trylock(&tasklist_lock)) {
+ printk("hm, tasklist_lock write-locked.\n");
+ printk("ignoring ...\n");
+ do_unlock = 0;
+ }
+#else
read_lock(&tasklist_lock);
+#endif
+
do_each_thread(g, p) {
/*
* reset the NMI-timeout, listing all files on a slow
@@ -6884,7 +7232,8 @@ void show_state_filter(unsigned long state_filter)
#ifdef CONFIG_SCHED_DEBUG
sysrq_sched_debug_show();
#endif
- read_unlock(&tasklist_lock);
+ if (do_unlock)
+ read_unlock(&tasklist_lock);
/*
* Only show locks if all tasks are dumped:
*/
@@ -6920,7 +7269,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
__set_task_cpu(idle, cpu);
rq->curr = rq->idle = idle;
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
idle->oncpu = 1;
#endif
atomic_spin_unlock_irqrestore(&rq->lock, flags);
@@ -7056,11 +7405,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
struct rq *rq_dest, *rq_src;
+ unsigned long flags;
int ret = 0, on_rq;
if (unlikely(!cpu_active(dest_cpu)))
return ret;
+ /*
+ * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock)
+ * disabling interrupts - which it does not do on PREEMPT_RT:
+ */
+ local_irq_save(flags);
+
rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);
@@ -7085,6 +7441,8 @@ done:
ret = 1;
fail:
double_rq_unlock(rq_src, rq_dest);
+ local_irq_restore(flags);
+
return ret;
}
@@ -7273,7 +7631,11 @@ void idle_task_exit(void)
if (mm != &init_mm)
switch_mm(mm, &init_mm, current);
+#ifdef CONFIG_PREEMPT_RT
+ mmdrop_delayed(mm);
+#else
mmdrop(mm);
+#endif
}
/* called under rq->lock with disabled interrupts */
@@ -9404,6 +9766,9 @@ void __init sched_init(void)
atomic_inc(&init_mm.mm_count);
enter_lazy_tlb(&init_mm, current);
+#ifdef CONFIG_PREEMPT_RT
+ printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n");
+#endif
/*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
@@ -9434,36 +9799,6 @@ void __init sched_init(void)
scheduler_running = 1;
}
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
-{
-#ifdef in_atomic
- static unsigned long prev_jiffy; /* ratelimiting */
-
- if ((!in_atomic() && !irqs_disabled()) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
- return;
- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
- return;
- prev_jiffy = jiffies;
-
- printk(KERN_ERR
- "BUG: sleeping function called from invalid context at %s:%d\n",
- file, line);
- printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(),
- current->pid, current->comm);
-
- debug_show_held_locks(current);
- if (irqs_disabled())
- print_irqtrace_events(current);
- dump_stack();
-#endif
-}
-EXPORT_SYMBOL(__might_sleep);
-#endif
-
#ifdef CONFIG_MAGIC_SYSRQ
static void normalize_task(struct rq *rq, struct task_struct *p)
{
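
mmdrop_delayed(), used in finish_task_switch() and idle_task_exit() above, is defined elsewhere in the patch; as a rough illustration of the idea only, here is a hedged user-space sketch (all names invented) in which the hot path merely queues the object and a separate reaper thread performs the expensive free.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    struct deferred {
        struct deferred *next;
        void *payload;
    };

    static pthread_mutex_t reap_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t reap_cond = PTHREAD_COND_INITIALIZER;
    static struct deferred *reap_list;

    /* cheap enough for a hot path: one lock, no free() */
    static void drop_delayed(struct deferred *obj)
    {
        pthread_mutex_lock(&reap_lock);
        obj->next = reap_list;
        reap_list = obj;
        pthread_cond_signal(&reap_cond);
        pthread_mutex_unlock(&reap_lock);
    }

    static void *reaper(void *unused)
    {
        (void)unused;
        for (;;) {
            struct deferred *batch;

            pthread_mutex_lock(&reap_lock);
            while (!reap_list)
                pthread_cond_wait(&reap_cond, &reap_lock);
            batch = reap_list;
            reap_list = NULL;
            pthread_mutex_unlock(&reap_lock);

            while (batch) {         /* the expensive work happens here */
                struct deferred *next = batch->next;

                free(batch->payload);
                free(batch);
                batch = next;
            }
            puts("reaper: freed a batch");
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t tid;

        pthread_create(&tid, NULL, reaper, NULL);
        for (int i = 0; i < 8; i++) {
            struct deferred *obj = malloc(sizeof(*obj));

            obj->payload = malloc(64);
            drop_delayed(obj);      /* fast path: just enqueue */
        }
        sleep(1);
        return 0;
    }
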
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 899d71ff8bdf..97216fb322ed 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -280,6 +280,19 @@ static void print_cpu(struct seq_file *m, int cpu)
P(cpu_load[2]);
P(cpu_load[3]);
P(cpu_load[4]);
+#ifdef CONFIG_PREEMPT_RT
+ /* Print rt related rq stats */
+ P(rt.rt_nr_running);
+ P(rt.rt_nr_uninterruptible);
+# ifdef CONFIG_SCHEDSTATS
+ P(rto_schedule);
+ P(rto_schedule_tail);
+ P(rto_wakeup);
+ P(rto_pulled);
+ P(rto_pushed);
+# endif
+#endif
+
#undef P
#undef PN
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 54779ebe48cc..adcbc682c5b9 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -860,6 +860,55 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
}
}
+static inline void incr_rt_nr_uninterruptible(struct task_struct *p,
+ struct rq *rq)
+{
+ rq->rt.rt_nr_uninterruptible++;
+}
+
+static inline void decr_rt_nr_uninterruptible(struct task_struct *p,
+ struct rq *rq)
+{
+ rq->rt.rt_nr_uninterruptible--;
+}
+
+unsigned long rt_nr_running(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_online_cpu(i)
+ sum += cpu_rq(i)->rt.rt_nr_running;
+
+ return sum;
+}
+
+unsigned long rt_nr_running_cpu(int cpu)
+{
+ return cpu_rq(cpu)->rt.rt_nr_running;
+}
+
+unsigned long rt_nr_uninterruptible(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_online_cpu(i)
+ sum += cpu_rq(i)->rt.rt_nr_uninterruptible;
+
+ /*
+ * Since we read the counters lockless, it might be slightly
+ * inaccurate. Do not allow it to go below zero though:
+ */
+ if (unlikely((long)sum < 0))
+ sum = 0;
+
+ return sum;
+}
+
+unsigned long rt_nr_uninterruptible_cpu(int cpu)
+{
+ return cpu_rq(cpu)->rt.rt_nr_uninterruptible;
+}
+
/*
* Adding/removing a task to/from a priority array:
*/
@@ -872,6 +921,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
enqueue_rt_entity(rt_se);
+ if (p->state == TASK_UNINTERRUPTIBLE)
+ decr_rt_nr_uninterruptible(p, rq);
+
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
@@ -883,6 +935,10 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
+
+ if (p->state == TASK_UNINTERRUPTIBLE)
+ incr_rt_nr_uninterruptible(p, rq);
+
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
@@ -1462,8 +1518,10 @@ static int pull_rt_task(struct rq *this_rq)
static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
- if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
+ if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) {
pull_rt_task(rq);
+ schedstat_inc(rq, rto_schedule);
+ }
}
/*
@@ -1545,7 +1603,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
*/
if (weight > 1)
enqueue_pushable_task(rq, p);
-
}
if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
diff --git a/kernel/signal.c b/kernel/signal.c
index bee1953821c9..88a4ee3d7f95 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -916,8 +916,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
trace_sched_signal_send(sig, t);
+#ifdef CONFIG_SMP
assert_spin_locked(&t->sighand->siglock);
-
+#endif
if (!prepare_signal(sig, t, from_ancestor_ns))
return 0;
@@ -1692,15 +1693,8 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
read_lock(&tasklist_lock);
if (may_ptrace_stop()) {
do_notify_parent_cldstop(current, CLD_TRAPPED);
- /*
- * Don't want to allow preemption here, because
- * sys_ptrace() needs this task to be inactive.
- *
- * XXX: implement read_unlock_no_resched().
- */
- preempt_disable();
read_unlock(&tasklist_lock);
- preempt_enable_and_schedule();
+ schedule();
} else {
/*
* By the time we got the lock, our tracer went away.
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5fc1b0eefe9b..aae8d459f728 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -8,15 +8,23 @@
* Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
*
* Remote softirq infrastructure is by Jens Axboe.
+ *
+ * Softirq-split implementation by
+ * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar
*/
#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/wait.h>
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
#include <linux/init.h>
+#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
+#include <linux/delay.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
@@ -54,29 +62,122 @@ EXPORT_SYMBOL(irq_stat);
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+struct softirqdata {
+ int nr;
+ unsigned long cpu;
+ struct task_struct *tsk;
+ int running;
+};
+
+static DEFINE_PER_CPU(struct softirqdata [NR_SOFTIRQS], ksoftirqd);
char *softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK",
"TASKLET", "SCHED", "HRTIMER", "RCU"
};
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * On preempt-rt a softirq might be blocked on a lock. There might be
+ * no other runnable task on this CPU because the lock owner runs on
+ * some other CPU. So we have to go into idle with the pending bit
+ * set. Therefore we need to check this, otherwise we warn about false
+ * positives, which confuse users and defeat the whole purpose of
+ * this test.
+ *
+ * This code is called with interrupts disabled.
+ */
+void softirq_check_pending_idle(void)
+{
+ static int rate_limit;
+ u32 warnpending = 0, pending = local_softirq_pending();
+ int curr = 0;
+
+ if (rate_limit >= 10)
+ return;
+
+ while (pending) {
+ if (pending & 1) {
+ struct task_struct *tsk;
+
+ tsk = __get_cpu_var(ksoftirqd)[curr].tsk;
+ /*
+ * The wakeup code in rtmutex.c wakes up the
+ * task _before_ it sets pi_blocked_on to NULL
+ * under tsk->pi_lock. So we need to check for
+ * both: state and pi_blocked_on.
+ */
+ atomic_spin_lock(&tsk->pi_lock);
+
+ if (!tsk->pi_blocked_on &&
+ !(tsk->state == TASK_RUNNING) &&
+ !(tsk->state & TASK_RUNNING_MUTEX))
+ warnpending |= 1 << curr;
+
+ atomic_spin_unlock(&tsk->pi_lock);
+ }
+ pending >>= 1;
+ curr++;
+ }
+
+ if (warnpending) {
+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
+ warnpending);
+ rate_limit++;
+ }
+}
+
+#else
+/*
+ * On !PREEMPT_RT we just printk rate limited:
+ */
+void softirq_check_pending_idle(void)
+{
+ static int rate_limit;
+
+ if (rate_limit < 10) {
+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
+ local_softirq_pending());
+ rate_limit++;
+ }
+}
+
+#endif
+
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
* to the pending events, so lets the scheduler to balance
* the softirq load for us.
*/
-static void wakeup_softirqd(void)
+static void wakeup_softirqd(int softirq)
{
/* Interrupts are disabled: no need to stop preemption */
- struct task_struct *tsk = __get_cpu_var(ksoftirqd);
+ struct task_struct *tsk = __get_cpu_var(ksoftirqd)[softirq].tsk;
if (tsk && tsk->state != TASK_RUNNING)
wake_up_process(tsk);
}
/*
+ * Wake up the softirq threads which have work
+ */
+static void trigger_softirqs(void)
+{
+ u32 pending = local_softirq_pending();
+ int curr = 0;
+
+ while (pending) {
+ if (pending & 1)
+ wakeup_softirqd(curr);
+ pending >>= 1;
+ curr++;
+ }
+}
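
A hedged user-space analogue of the per-vector wakeup done by trigger_softirqs() above (vector_thread(), trigger_pending() and the printed messages are invented): each bit of a pending mask is owned by a dedicated worker thread, and the raiser walks the mask and wakes only the owners of the set bits.

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    #define NR_VECTORS 4

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond[NR_VECTORS];
    static unsigned int pending_mask;

    static void *vector_thread(void *arg)
    {
        long nr = (long)arg;

        pthread_mutex_lock(&lock);
        for (;;) {
            while (!(pending_mask & (1u << nr)))
                pthread_cond_wait(&cond[nr], &lock);
            pending_mask &= ~(1u << nr);
            pthread_mutex_unlock(&lock);

            printf("vector %ld: handling work\n", nr);

            pthread_mutex_lock(&lock);
        }
        return NULL;
    }

    static void trigger_pending(unsigned int mask)
    {
        pthread_mutex_lock(&lock);
        pending_mask |= mask;
        /* walk the mask and wake only the threads that own set bits */
        for (int nr = 0; mask; nr++, mask >>= 1)
            if (mask & 1)
                pthread_cond_signal(&cond[nr]);
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        pthread_t tid[NR_VECTORS];

        for (long nr = 0; nr < NR_VECTORS; nr++) {
            pthread_cond_init(&cond[nr], NULL);
            pthread_create(&tid[nr], NULL, vector_thread, (void *)nr);
        }
        trigger_pending((1u << 1) | (1u << 3));
        sleep(1);
        return 0;
    }
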
+
+#ifndef CONFIG_PREEMPT_HARDIRQS
+
+/*
* This one is for softirq.c-internal use,
* where hardirqs are disabled legitimately:
*/
@@ -128,7 +229,6 @@ EXPORT_SYMBOL(local_bh_disable);
*/
void _local_bh_enable(void)
{
- WARN_ON_ONCE(in_irq());
WARN_ON_ONCE(!irqs_disabled());
if (softirq_count() == SOFTIRQ_OFFSET)
@@ -138,45 +238,72 @@ void _local_bh_enable(void)
EXPORT_SYMBOL(_local_bh_enable);
-static inline void _local_bh_enable_ip(unsigned long ip)
+void local_bh_enable(void)
{
- WARN_ON_ONCE(in_irq() || irqs_disabled());
#ifdef CONFIG_TRACE_IRQFLAGS
- local_irq_disable();
+ unsigned long flags;
+
+ WARN_ON_ONCE(in_irq());
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_save(flags);
#endif
/*
* Are softirqs going to be turned on now:
*/
if (softirq_count() == SOFTIRQ_OFFSET)
- trace_softirqs_on(ip);
+ trace_softirqs_on((unsigned long)__builtin_return_address(0));
/*
* Keep preemption disabled until we are done with
* softirq processing:
- */
- sub_preempt_count(SOFTIRQ_OFFSET - 1);
+ */
+ sub_preempt_count(SOFTIRQ_OFFSET - 1);
if (unlikely(!in_interrupt() && local_softirq_pending()))
do_softirq();
dec_preempt_count();
#ifdef CONFIG_TRACE_IRQFLAGS
- local_irq_enable();
+ local_irq_restore(flags);
#endif
preempt_check_resched();
}
-
-void local_bh_enable(void)
-{
- _local_bh_enable_ip((unsigned long)__builtin_return_address(0));
-}
EXPORT_SYMBOL(local_bh_enable);
void local_bh_enable_ip(unsigned long ip)
{
- _local_bh_enable_ip(ip);
+#ifdef CONFIG_TRACE_IRQFLAGS
+ unsigned long flags;
+
+ WARN_ON_ONCE(in_irq());
+
+ local_irq_save(flags);
+#endif
+ /*
+ * Are softirqs going to be turned on now:
+ */
+ if (softirq_count() == SOFTIRQ_OFFSET)
+ trace_softirqs_on(ip);
+ /*
+ * Keep preemption disabled until we are done with
+ * softirq processing:
+ */
+ sub_preempt_count(SOFTIRQ_OFFSET - 1);
+
+ if (unlikely(!in_interrupt() && local_softirq_pending()))
+ do_softirq();
+
+ dec_preempt_count();
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_restore(flags);
+#endif
+ preempt_check_resched();
}
EXPORT_SYMBOL(local_bh_enable_ip);
+#endif
+
/*
* We restart softirq processing MAX_SOFTIRQ_RESTART times,
* and we fall back to softirqd after that.
@@ -186,66 +313,148 @@ EXPORT_SYMBOL(local_bh_enable_ip);
* we want to handle softirqs as soon as possible, but they
* should not be able to lock up the box.
*/
-#define MAX_SOFTIRQ_RESTART 10
+#define MAX_SOFTIRQ_RESTART 20
-asmlinkage void __do_softirq(void)
+static DEFINE_PER_CPU(u32, softirq_running);
+
+/*
+ * Debug check for leaking preempt counts in h->action handlers:
+ */
+
+static inline void debug_check_preempt_count_start(__u32 *preempt_count)
{
- struct softirq_action *h;
- __u32 pending;
+#ifdef CONFIG_DEBUG_PREEMPT
+ *preempt_count = preempt_count();
+#endif
+}
+
+static inline void
+debug_check_preempt_count_stop(__u32 *preempt_count, struct softirq_action *h)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (*preempt_count == preempt_count())
+ return;
+
+ print_symbol("BUG: %Ps exited with wrong preemption count!\n",
+ (unsigned long)h->action);
+ printk("=> enter: %08x, exit: %08x.\n", *preempt_count, preempt_count());
+ preempt_count() = *preempt_count;
+#endif
+}
+
+/*
+ * Execute softirq handlers:
+ */
+static void ___do_softirq(const int same_prio_only)
+{
+ __u32 pending, available_mask, same_prio_skipped, preempt_count;
int max_restart = MAX_SOFTIRQ_RESTART;
- int cpu;
+ struct softirq_action *h;
+ int cpu, softirq;
pending = local_softirq_pending();
account_system_vtime(current);
- __local_bh_disable((unsigned long)__builtin_return_address(0));
- lockdep_softirq_enter();
-
cpu = smp_processor_id();
restart:
+ available_mask = -1;
+ softirq = 0;
+ same_prio_skipped = 0;
+
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
- local_irq_enable();
-
h = softirq_vec;
do {
- if (pending & 1) {
- int prev_count = preempt_count();
- kstat_incr_softirqs_this_cpu(h - softirq_vec);
-
- trace_softirq_entry(h, softirq_vec);
- h->action(h);
- trace_softirq_exit(h, softirq_vec);
- if (unlikely(prev_count != preempt_count())) {
- printk(KERN_ERR "huh, entered softirq %td %s %p"
- "with preempt_count %08x,"
- " exited with %08x?\n", h - softirq_vec,
- softirq_to_name[h - softirq_vec],
- h->action, prev_count, preempt_count());
- preempt_count() = prev_count;
+ u32 softirq_mask = 1 << softirq;
+
+ if (!(pending & 1))
+ goto next;
+
+ debug_check_preempt_count_start(&preempt_count);
+
+#if defined(CONFIG_PREEMPT_SOFTIRQS) && defined(CONFIG_PREEMPT_HARDIRQS)
+ /*
+ * If executed by a same-prio hardirq thread
+ * then skip pending softirqs that belong
+ * to softirq threads with different priority:
+ */
+ if (same_prio_only) {
+ struct task_struct *tsk;
+
+ tsk = __get_cpu_var(ksoftirqd)[softirq].tsk;
+ if (tsk && tsk->normal_prio != current->normal_prio) {
+ same_prio_skipped |= softirq_mask;
+ available_mask &= ~softirq_mask;
+ goto next;
}
-
- rcu_bh_qsctr_inc(cpu);
}
+#endif
+ /*
+ * Is this softirq already being processed?
+ */
+ if (per_cpu(softirq_running, cpu) & softirq_mask) {
+ available_mask &= ~softirq_mask;
+ goto next;
+ }
+ per_cpu(softirq_running, cpu) |= softirq_mask;
+ kstat_incr_softirqs_this_cpu(h - softirq_vec);
+ local_irq_enable();
+
+ trace_softirq_entry(h, softirq_vec);
+ h->action(h);
+ trace_softirq_exit(h, softirq_vec);
+
+ debug_check_preempt_count_stop(&preempt_count, h);
+
+ rcu_bh_qsctr_inc(cpu);
+ cond_resched_softirq_context();
+ local_irq_disable();
+ per_cpu(softirq_running, cpu) &= ~softirq_mask;
+
+next:
h++;
+ softirq++;
pending >>= 1;
} while (pending);
- local_irq_disable();
-
+ or_softirq_pending(same_prio_skipped);
pending = local_softirq_pending();
- if (pending && --max_restart)
- goto restart;
+ if (pending & available_mask) {
+ if (--max_restart)
+ goto restart;
+ }
if (pending)
- wakeup_softirqd();
+ trigger_softirqs();
+}
+
+asmlinkage void __do_softirq(void)
+{
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+ /*
+ * 'preempt harder'. Push all softirq processing off to ksoftirqd.
+ */
+ if (softirq_preemption) {
+ if (local_softirq_pending())
+ trigger_softirqs();
+ return;
+ }
+#endif
+ /*
+ * 'immediate' softirq execution:
+ */
+ __local_bh_disable((unsigned long)__builtin_return_address(0));
+ lockdep_softirq_enter();
+
+ ___do_softirq(0);
lockdep_softirq_exit();
account_system_vtime(current);
_local_bh_enable();
+
}
#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -316,19 +525,11 @@ void irq_exit(void)
*/
inline void raise_softirq_irqoff(unsigned int nr)
{
- __raise_softirq_irqoff(nr);
+ __do_raise_softirq_irqoff(nr);
- /*
- * If we're in an interrupt or softirq, we're done
- * (this also catches softirq-disabled code). We will
- * actually run the softirq once we return from
- * the irq or softirq.
- *
- * Otherwise we wake up ksoftirqd to make sure we
- * schedule the softirq soon.
- */
- if (!in_interrupt())
- wakeup_softirqd();
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+ wakeup_softirqd(nr);
+#endif
}
void raise_softirq(unsigned int nr)
@@ -357,15 +558,45 @@ struct tasklet_head
static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
+static void inline
+__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
+{
+ if (tasklet_trylock(t)) {
+again:
+ /* We may have been preempted before tasklet_trylock
+ * and __tasklet_action may have already run.
+ * So double-check the sched bit while the tasklet
+ * is locked before adding it to the list.
+ */
+ if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
+ t->next = NULL;
+ *head->tail = t;
+ head->tail = &(t->next);
+ raise_softirq_irqoff(nr);
+ tasklet_unlock(t);
+ } else {
+ /* This is subtle. If we hit the corner case above,
+ * it is possible that we get preempted right here,
+ * and another task has successfully called
+ * tasklet_schedule(), then this function, and
+ * failed on the trylock. Thus we must be sure
+ * before releasing the tasklet lock, that the
+ * SCHED_BIT is clear. Otherwise the tasklet
+ * may get its SCHED_BIT set, but not added to the
+ * list
+ */
+ if (!tasklet_tryunlock(t))
+ goto again;
+ }
+ }
+}
+
void __tasklet_schedule(struct tasklet_struct *t)
{
unsigned long flags;
local_irq_save(flags);
- t->next = NULL;
- *__get_cpu_var(tasklet_vec).tail = t;
- __get_cpu_var(tasklet_vec).tail = &(t->next);
- raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ);
local_irq_restore(flags);
}
@@ -376,10 +607,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
unsigned long flags;
local_irq_save(flags);
- t->next = NULL;
- *__get_cpu_var(tasklet_hi_vec).tail = t;
- __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
- raise_softirq_irqoff(HI_SOFTIRQ);
+ __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ);
local_irq_restore(flags);
}
@@ -387,50 +615,119 @@ EXPORT_SYMBOL(__tasklet_hi_schedule);
void __tasklet_hi_schedule_first(struct tasklet_struct *t)
{
- BUG_ON(!irqs_disabled());
-
- t->next = __get_cpu_var(tasklet_hi_vec).head;
- __get_cpu_var(tasklet_hi_vec).head = t;
- __raise_softirq_irqoff(HI_SOFTIRQ);
+ __tasklet_hi_schedule(t);
}
EXPORT_SYMBOL(__tasklet_hi_schedule_first);
-static void tasklet_action(struct softirq_action *a)
+void tasklet_enable(struct tasklet_struct *t)
{
- struct tasklet_struct *list;
+ if (!atomic_dec_and_test(&t->count))
+ return;
+ if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
+ tasklet_schedule(t);
+}
- local_irq_disable();
- list = __get_cpu_var(tasklet_vec).head;
- __get_cpu_var(tasklet_vec).head = NULL;
- __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head;
- local_irq_enable();
+EXPORT_SYMBOL(tasklet_enable);
+
+void tasklet_hi_enable(struct tasklet_struct *t)
+{
+ if (!atomic_dec_and_test(&t->count))
+ return;
+ if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
+ tasklet_hi_schedule(t);
+}
+
+EXPORT_SYMBOL(tasklet_hi_enable);
+
+static void
+__tasklet_action(struct softirq_action *a, struct tasklet_struct *list)
+{
+ int loops = 1000000;
while (list) {
struct tasklet_struct *t = list;
list = list->next;
- if (tasklet_trylock(t)) {
- if (!atomic_read(&t->count)) {
- if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
- BUG();
- t->func(t->data);
- tasklet_unlock(t);
- continue;
- }
- tasklet_unlock(t);
+ /*
+ * Should always succeed - after a tasklist got on the
+ * list (after getting the SCHED bit set from 0 to 1),
+ * nothing but the tasklet softirq it got queued to can
+ * lock it:
+ */
+ if (!tasklet_trylock(t)) {
+ WARN_ON(1);
+ continue;
}
- local_irq_disable();
t->next = NULL;
- *__get_cpu_var(tasklet_vec).tail = t;
- __get_cpu_var(tasklet_vec).tail = &(t->next);
- __raise_softirq_irqoff(TASKLET_SOFTIRQ);
- local_irq_enable();
+
+ /*
+ * If we cannot handle the tasklet because it's disabled,
+ * mark it as pending. tasklet_enable() will later
+ * re-schedule the tasklet.
+ */
+ if (unlikely(atomic_read(&t->count))) {
+out_disabled:
+ /* implicit unlock: */
+ wmb();
+ t->state = TASKLET_STATEF_PENDING;
+ continue;
+ }
+
+ /*
+ * From this point on, the tasklet might be rescheduled
+ * on another CPU, but it can only be added to another
+ * CPU's tasklet list if we unlock the tasklet (which we
+ * don't do yet).
+ */
+ if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+ WARN_ON(1);
+
+again:
+ t->func(t->data);
+
+ /*
+ * Try to unlock the tasklet. We must use cmpxchg, because
+ * another CPU might have scheduled or disabled the tasklet.
+ * We only allow the STATE_RUN -> 0 transition here.
+ */
+ while (!tasklet_tryunlock(t)) {
+ /*
+ * If it got disabled meanwhile, bail out:
+ */
+ if (atomic_read(&t->count))
+ goto out_disabled;
+ /*
+ * If it got scheduled meanwhile, re-execute
+ * the tasklet function:
+ */
+ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+ goto again;
+ if (!--loops) {
+ printk("hm, tasklet state: %08lx\n", t->state);
+ WARN_ON(1);
+ tasklet_unlock(t);
+ break;
+ }
+ }
}
}
+static void tasklet_action(struct softirq_action *a)
+{
+ struct tasklet_struct *list;
+
+ local_irq_disable();
+ list = __get_cpu_var(tasklet_vec).head;
+ __get_cpu_var(tasklet_vec).head = NULL;
+ __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head;
+ local_irq_enable();
+
+ __tasklet_action(a, list);
+}
+
static void tasklet_hi_action(struct softirq_action *a)
{
struct tasklet_struct *list;
@@ -441,29 +738,7 @@ static void tasklet_hi_action(struct softirq_action *a)
__get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head;
local_irq_enable();
- while (list) {
- struct tasklet_struct *t = list;
-
- list = list->next;
-
- if (tasklet_trylock(t)) {
- if (!atomic_read(&t->count)) {
- if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
- BUG();
- t->func(t->data);
- tasklet_unlock(t);
- continue;
- }
- tasklet_unlock(t);
- }
-
- local_irq_disable();
- t->next = NULL;
- *__get_cpu_var(tasklet_hi_vec).tail = t;
- __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
- __raise_softirq_irqoff(HI_SOFTIRQ);
- local_irq_enable();
- }
+ __tasklet_action(a, list);
}
@@ -486,7 +761,7 @@ void tasklet_kill(struct tasklet_struct *t)
while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
do {
- yield();
+ msleep(1);
} while (test_bit(TASKLET_STATE_SCHED, &t->state));
}
tasklet_unlock_wait(t);
@@ -697,33 +972,89 @@ void __init softirq_init(void)
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}
-static int ksoftirqd(void * __bind_cpu)
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+
+void tasklet_unlock_wait(struct tasklet_struct *t)
{
+ while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
+ /*
+ * Hack for now to avoid this busy-loop:
+ */
+#ifdef CONFIG_PREEMPT_RT
+ msleep(1);
+#else
+ barrier();
+#endif
+ }
+}
+EXPORT_SYMBOL(tasklet_unlock_wait);
+
+#endif
+
+static int ksoftirqd(void * __data)
+{
+ /* Priority needs to be below hardirqs */
+ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 - 1};
+ struct softirqdata *data = __data;
+ u32 softirq_mask = (1 << data->nr);
+ struct softirq_action *h;
+ int cpu = data->cpu;
+
+ sys_sched_setscheduler(current->pid, SCHED_FIFO, &param);
+ current->flags |= PF_SOFTIRQ;
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
preempt_disable();
- if (!local_softirq_pending()) {
+ if (!(local_softirq_pending() & softirq_mask)) {
+sleep_more:
preempt_enable_and_schedule();
preempt_disable();
}
__set_current_state(TASK_RUNNING);
+ data->running = 1;
- while (local_softirq_pending()) {
+ while (local_softirq_pending() & softirq_mask) {
/* Preempt disable stops cpu going offline.
If already offline, we'll be on wrong CPU:
don't process */
- if (cpu_is_offline((long)__bind_cpu))
+ if (cpu_is_offline(cpu))
goto wait_to_die;
- do_softirq();
+
+ /*
+ * Is the softirq already being executed by
+ * a hardirq context?
+ */
+ local_irq_disable();
+ if (per_cpu(softirq_running, cpu) & softirq_mask) {
+ local_irq_enable();
+ set_current_state(TASK_INTERRUPTIBLE);
+ goto sleep_more;
+ }
+ per_cpu(softirq_running, cpu) |= softirq_mask;
__preempt_enable_no_resched();
+ set_softirq_pending(local_softirq_pending() & ~softirq_mask);
+ local_bh_disable();
+ local_irq_enable();
+
+ h = &softirq_vec[data->nr];
+ if (h)
+ h->action(h);
+ rcu_bh_qsctr_inc(data->cpu);
+
+ local_irq_disable();
+ per_cpu(softirq_running, cpu) &= ~softirq_mask;
+ _local_bh_enable();
+ local_irq_enable();
+
cond_resched();
preempt_disable();
- rcu_qsctr_inc((long)__bind_cpu);
+ rcu_qsctr_inc(data->cpu);
}
preempt_enable();
set_current_state(TASK_INTERRUPTIBLE);
+ data->running = 0;
}
__set_current_state(TASK_RUNNING);
return 0;
@@ -773,7 +1104,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
BUG();
}
-static void takeover_tasklets(unsigned int cpu)
+void takeover_tasklets(unsigned int cpu)
{
/* CPU is dead, so no lock needed. */
local_irq_disable();
@@ -799,49 +1130,77 @@ static void takeover_tasklets(unsigned int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
+static const char *softirq_names [] =
+{
+ [HI_SOFTIRQ] = "high",
+ [SCHED_SOFTIRQ] = "sched",
+ [TIMER_SOFTIRQ] = "timer",
+ [NET_TX_SOFTIRQ] = "net-tx",
+ [NET_RX_SOFTIRQ] = "net-rx",
+ [BLOCK_SOFTIRQ] = "block",
+ [TASKLET_SOFTIRQ] = "tasklet",
+#ifdef CONFIG_HIGH_RES_TIMERS
+ [HRTIMER_SOFTIRQ] = "hrtimer",
+#endif
+ [RCU_SOFTIRQ] = "rcu",
+};
+
static int __cpuinit cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
- int hotcpu = (unsigned long)hcpu;
+ int hotcpu = (unsigned long)hcpu, i;
struct task_struct *p;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
- if (IS_ERR(p)) {
- printk("ksoftirqd for %i failed\n", hotcpu);
- return NOTIFY_BAD;
+ for (i = 0; i < NR_SOFTIRQS; i++) {
+ per_cpu(ksoftirqd, hotcpu)[i].nr = i;
+ per_cpu(ksoftirqd, hotcpu)[i].cpu = hotcpu;
+ per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL;
+ }
+ for (i = 0; i < NR_SOFTIRQS; i++) {
+ p = kthread_create(ksoftirqd,
+ &per_cpu(ksoftirqd, hotcpu)[i],
+ "sirq-%s/%d", softirq_names[i],
+ hotcpu);
+ if (IS_ERR(p)) {
+ printk("ksoftirqd %d for %i failed\n", i,
+ hotcpu);
+ return NOTIFY_BAD;
+ }
+ kthread_bind(p, hotcpu);
+ per_cpu(ksoftirqd, hotcpu)[i].tsk = p;
}
- kthread_bind(p, hotcpu);
- per_cpu(ksoftirqd, hotcpu) = p;
- break;
+ break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- wake_up_process(per_cpu(ksoftirqd, hotcpu));
+ for (i = 0; i < NR_SOFTIRQS; i++)
+ wake_up_process(per_cpu(ksoftirqd, hotcpu)[i].tsk);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
- if (!per_cpu(ksoftirqd, hotcpu))
- break;
- /* Unbind so it can run. Fall thru. */
- kthread_bind(per_cpu(ksoftirqd, hotcpu),
- cpumask_any(cpu_online_mask));
+ /* Fall through */
+
case CPU_DEAD:
case CPU_DEAD_FROZEN: {
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ struct sched_param param;
- p = per_cpu(ksoftirqd, hotcpu);
- per_cpu(ksoftirqd, hotcpu) = NULL;
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
- kthread_stop(p);
+ for (i = 0; i < NR_SOFTIRQS; i++) {
+ param.sched_priority = MAX_RT_PRIO-1;
+ p = per_cpu(ksoftirqd, hotcpu)[i].tsk;
+ sched_setscheduler(p, SCHED_FIFO, &param);
+ per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL;
+ kthread_stop(p);
+ }
takeover_tasklets(hotcpu);
break;
}
#endif /* CONFIG_HOTPLUG_CPU */
- }
+ }
return NOTIFY_OK;
}
@@ -861,6 +1220,34 @@ static __init int spawn_ksoftirqd(void)
}
early_initcall(spawn_ksoftirqd);
+
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+
+int softirq_preemption = 1;
+
+EXPORT_SYMBOL(softirq_preemption);
+
+/*
+ * Real-Time Preemption depends on softirq threading:
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+static int __init softirq_preempt_setup (char *str)
+{
+ if (!strncmp(str, "off", 3))
+ softirq_preemption = 0;
+ else
+ get_option(&str, &softirq_preemption);
+ if (!softirq_preemption)
+ printk("turning off softirq preemption!\n");
+
+ return 1;
+}
+
+__setup("softirq-preempt=", softirq_preempt_setup);
+#endif
+#endif
+
#ifdef CONFIG_SMP
/*
* Call a function on all processors
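
The SCHED/RUN handling in __tasklet_common_schedule() and __tasklet_action() above is subtle; this hedged user-space sketch (bit names and helpers invented, the disabled/PENDING case omitted) shows only the core transitions: trylock sets RUN only if it was clear, tryunlock allows only the RUN -> 0 transition, and a schedule that races with the handler leaves SCHED set so the handler is run again before unlocking.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define T_SCHED 0x1u
    #define T_RUN   0x2u

    static atomic_uint state;

    static bool t_trylock(void)
    {
        return !(atomic_fetch_or(&state, T_RUN) & T_RUN);
    }

    static bool t_tryunlock(void)
    {
        unsigned int expected = T_RUN;

        /* only RUN -> 0 is allowed; fails if SCHED was set meanwhile */
        return atomic_compare_exchange_strong(&state, &expected, 0);
    }

    static bool t_schedule(void)
    {
        /* true only for the 0 -> SCHED transition (would need queueing) */
        return !(atomic_fetch_or(&state, T_SCHED) & T_SCHED);
    }

    static void run_tasklet(void (*func)(void))
    {
        if (!t_trylock())
            return;                 /* already running elsewhere */
    again:
        atomic_fetch_and(&state, ~T_SCHED);
        func();
        while (!t_tryunlock()) {
            /* got rescheduled while running: run the handler again */
            if (atomic_load(&state) & T_SCHED)
                goto again;
        }
    }

    static void handler(void)
    {
        static int raced;

        puts("handler runs");
        /* simulate a tasklet_schedule() racing with the handler */
        if (!raced++)
            t_schedule();
    }

    int main(void)
    {
        if (t_schedule())
            run_tasklet(handler);
        printf("final state: %#x\n", (unsigned int)atomic_load(&state));
        return 0;
    }
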
diff --git a/kernel/sys.c b/kernel/sys.c
index b3f1097c76fa..eb040a48e349 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,6 +33,7 @@
#include <linux/getcpu.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/seccomp.h>
+#include <linux/hardirq.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
#include <linux/fs_struct.h>
@@ -280,6 +281,15 @@ out_unlock:
*/
void emergency_restart(void)
{
+ /*
+ * Call the notifier chain if we are not in an
+ * atomic context:
+ */
+#ifdef CONFIG_PREEMPT
+ if (!in_atomic() && !irqs_disabled())
+ blocking_notifier_call_chain(&reboot_notifier_list,
+ SYS_RESTART, NULL);
+#endif
machine_emergency_restart();
}
EXPORT_SYMBOL_GPL(emergency_restart);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fea1bc74ca94..8e15027e4a58 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -254,13 +254,7 @@ void tick_nohz_stop_sched_tick(int inidle)
goto end;
if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
- static int ratelimit;
-
- if (ratelimit < 10) {
- printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
- local_softirq_pending());
- ratelimit++;
- }
+ softirq_check_pending_idle();
goto end;
}
@@ -693,6 +687,7 @@ void tick_setup_sched_timer(void)
* Emulate tick processing via per-CPU hrtimers:
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ ts->sched_timer.irqsafe = 1;
ts->sched_timer.function = tick_sched_timer;
/* Get the next period (per cpu) */
diff --git a/kernel/timer.c b/kernel/timer.c
index a7f07d5a6241..8137cce92a2b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -34,6 +34,7 @@
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
+#include <linux/kallsyms.h>
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
@@ -71,6 +72,7 @@ struct tvec_root {
struct tvec_base {
spinlock_t lock;
struct timer_list *running_timer;
+ wait_queue_head_t wait_for_running_timer;
unsigned long timer_jiffies;
struct tvec_root tv1;
struct tvec tv2;
@@ -318,9 +320,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
static inline void set_running_timer(struct tvec_base *base,
struct timer_list *timer)
{
-#ifdef CONFIG_SMP
base->running_timer = timer;
-#endif
}
static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
@@ -630,8 +630,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
debug_timer_activate(timer);
+ preempt_disable();
new_base = __get_cpu_var(tvec_bases);
-
cpu = smp_processor_id();
#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
@@ -642,6 +642,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
cpu = preferred_cpu;
}
#endif
+ preempt_enable();
+
new_base = per_cpu(tvec_bases, cpu);
if (base != new_base) {
@@ -661,7 +663,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
timer_set_base(timer, base);
}
}
-
timer->expires = expires;
internal_add_timer(base, timer);
@@ -795,6 +796,18 @@ void add_timer_on(struct timer_list *timer, int cpu)
}
EXPORT_SYMBOL_GPL(add_timer_on);
+/*
+ * Wait for a running timer
+ */
+void wait_for_running_timer(struct timer_list *timer)
+{
+ struct tvec_base *base = timer->base;
+
+ if (base->running_timer == timer)
+ wait_event(base->wait_for_running_timer,
+ base->running_timer != timer);
+}
+
/**
* del_timer - deactive a timer.
* @timer: the timer to be deactivated
@@ -826,7 +839,34 @@ int del_timer(struct timer_list *timer)
}
EXPORT_SYMBOL(del_timer);
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS)
+/*
+ * This function checks whether a timer is active and not running on any
+ * CPU. Upon successful (ret >= 0) exit the timer is not queued and the
+ * handler is not running on any CPU.
+ *
+ * It must not be called from interrupt contexts.
+ */
+int timer_pending_sync(struct timer_list *timer)
+{
+ struct tvec_base *base;
+ unsigned long flags;
+ int ret = -1;
+
+ base = lock_timer_base(timer, &flags);
+
+ if (base->running_timer == timer)
+ goto out;
+
+ ret = 0;
+ if (timer_pending(timer))
+ ret = 1;
+out:
+ spin_unlock_irqrestore(&base->lock, flags);
+
+ return ret;
+}
+
/**
* try_to_del_timer_sync - Try to deactivate a timer
* @timer: timer do del
@@ -891,7 +931,7 @@ int del_timer_sync(struct timer_list *timer)
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
return ret;
- cpu_relax();
+ wait_for_running_timer(timer);
}
}
EXPORT_SYMBOL(del_timer_sync);
@@ -936,6 +976,20 @@ static inline void __run_timers(struct tvec_base *base)
struct list_head *head = &work_list;
int index = base->timer_jiffies & TVR_MASK;
+ if (softirq_need_resched()) {
+ spin_unlock_irq(&base->lock);
+ wake_up(&base->wait_for_running_timer);
+ cond_resched_softirq_context();
+ cpu_relax();
+ spin_lock_irq(&base->lock);
+ /*
+ * We can simply continue after preemption, nobody
+ * else can touch timer_jiffies so 'index' is still
+ * valid. Any new jiffy will be taken care of in
+ * subsequent loops:
+ */
+ }
+
/*
* Cascade timers:
*/
@@ -989,18 +1043,17 @@ static inline void __run_timers(struct tvec_base *base)
lock_map_release(&lockdep_map);
if (preempt_count != preempt_count()) {
- printk(KERN_ERR "huh, entered %p "
- "with preempt_count %08x, exited"
- " with %08x?\n",
- fn, preempt_count,
- preempt_count());
- BUG();
+ print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn);
+ printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count());
+ preempt_count() = preempt_count;
}
}
+ set_running_timer(base, NULL);
+ cond_resched_softirq_context();
spin_lock_irq(&base->lock);
}
}
- set_running_timer(base, NULL);
+ wake_up(&base->wait_for_running_timer);
spin_unlock_irq(&base->lock);
}
@@ -1133,9 +1186,22 @@ unsigned long get_next_timer_interrupt(unsigned long now)
struct tvec_base *base = __get_cpu_var(tvec_bases);
unsigned long expires;
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * On PREEMPT_RT we cannot sleep here. If the trylock does not
+ * succeed then we return the worst-case 'expires in 1 tick'
+ * value:
+ */
+ if (spin_trylock(&base->lock)) {
+ expires = __next_timer_interrupt(base);
+ spin_unlock(&base->lock);
+ } else
+ expires = now + 1;
+#else
spin_lock(&base->lock);
expires = __next_timer_interrupt(base);
spin_unlock(&base->lock);
+#endif
if (time_before_eq(expires, now))
return now;
@@ -1158,7 +1224,6 @@ void update_process_times(int user_tick)
run_local_timers();
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, user_tick);
- printk_tick();
scheduler_tick();
run_posix_cpu_timers(p);
}
@@ -1168,10 +1233,11 @@ void update_process_times(int user_tick)
*/
static void run_timer_softirq(struct softirq_action *h)
{
- struct tvec_base *base = __get_cpu_var(tvec_bases);
+ struct tvec_base *base = per_cpu(tvec_bases, raw_smp_processor_id());
perf_counter_do_pending();
+ printk_tick();
hrtimer_run_pending();
if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1512,6 +1578,7 @@ static int __cpuinit init_timers_cpu(int cpu)
}
spin_lock_init(&base->lock);
+ init_waitqueue_head(&base->wait_for_running_timer);
for (j = 0; j < TVN_SIZE; j++) {
INIT_LIST_HEAD(base->tv5.vec + j);
@@ -1543,6 +1610,7 @@ static void __cpuinit migrate_timers(int cpu)
{
struct tvec_base *old_base;
struct tvec_base *new_base;
+ unsigned long flags;
int i;
BUG_ON(cpu_online(cpu));
@@ -1552,8 +1620,11 @@ static void __cpuinit migrate_timers(int cpu)
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
*/
- spin_lock_irq(&new_base->lock);
- spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+ local_irq_save(flags);
+ while (!spin_trylock(&new_base->lock))
+ cpu_relax();
+ while (!spin_trylock(&old_base->lock))
+ cpu_relax();
BUG_ON(old_base->running_timer);
@@ -1567,7 +1638,9 @@ static void __cpuinit migrate_timers(int cpu)
}
spin_unlock(&old_base->lock);
- spin_unlock_irq(&new_base->lock);
+ spin_unlock(&new_base->lock);
+ local_irq_restore(flags);
+
put_cpu_var(tvec_bases);
}
#endif /* CONFIG_HOTPLUG_CPU */
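
A hedged user-space analogue of the wait_for_running_timer()/del_timer_sync() change above (struct fake_timer and all helpers are invented): instead of spinning with cpu_relax(), the canceller sleeps on a condition variable until the callback runner announces that the timer is no longer executing.

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    struct fake_timer {
        void (*fn)(void);
    };

    static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t base_wait = PTHREAD_COND_INITIALIZER;
    static struct fake_timer *running_timer;

    static void run_timer(struct fake_timer *t)
    {
        pthread_mutex_lock(&base_lock);
        running_timer = t;
        pthread_mutex_unlock(&base_lock);

        t->fn();                    /* callback runs without the lock */

        pthread_mutex_lock(&base_lock);
        running_timer = NULL;
        pthread_cond_broadcast(&base_wait); /* wake the sync waiters */
        pthread_mutex_unlock(&base_lock);
    }

    static void wait_for_running_timer(struct fake_timer *t)
    {
        pthread_mutex_lock(&base_lock);
        while (running_timer == t)
            pthread_cond_wait(&base_wait, &base_lock);
        pthread_mutex_unlock(&base_lock);
    }

    static void slow_callback(void)
    {
        puts("callback running");
        sleep(1);
    }

    static void *runner(void *arg)
    {
        run_timer(arg);
        return NULL;
    }

    int main(void)
    {
        struct fake_timer t = { .fn = slow_callback };
        pthread_t tid;

        pthread_create(&tid, NULL, runner, &t);
        usleep(100 * 1000);             /* let the callback start */
        wait_for_running_timer(&t);     /* sleeps instead of busy-waiting */
        puts("timer no longer running");
        pthread_join(tid, NULL);
        return 0;
    }
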
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 019f380fd764..86947452df6d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -349,6 +349,7 @@ config STACK_TRACER
config HW_BRANCH_TRACER
depends on HAVE_HW_BRANCH_TRACER
+ depends on !PREEMPT_RT
bool "Trace hw branches"
select GENERIC_TRACER
help
@@ -376,7 +377,7 @@ config KMEMTRACE
If unsure, say N.
config WORKQUEUE_TRACER
- bool "Trace workqueues"
+ bool "Trace workqueues" if !PREEMPT_RT
select GENERIC_TRACER
help
The workqueue tracer provides some statistical informations
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e1d23c26308..d3a7a6a4fb2e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -377,7 +377,8 @@ static int function_stat_show(struct seq_file *m, void *v)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
seq_printf(m, " ");
avg = rec->time;
- do_div(avg, rec->counter);
+ if (rec->counter)
+ do_div(avg, rec->counter);
mutex_lock(&mutex);
trace_seq_init(&s);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dd185a741e4c..90f024dc4614 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -274,6 +274,10 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
*/
void trace_wake_up(void)
{
+#ifdef CONFIG_PREEMPT_RT
+ if (in_atomic() || irqs_disabled())
+ return;
+#endif
/*
* The runqueue_is_locked() can fail, but this is the best we
* have for now:
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 420ec3487579..4fbc5f996282 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -470,7 +470,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
int ret, len;
int i;
- sprintf(msecs_str, "%lu", (unsigned long) duration);
+ snprintf(msecs_str, sizeof(msecs_str), "%lu", (unsigned long) duration);
/* Print msecs */
ret = trace_seq_printf(s, "%s", msecs_str);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d8818..0a98bef8bd35 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/syscalls.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
@@ -36,6 +37,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
+#include <asm/uaccess.h>
+
/*
* The per-CPU workqueue (if single thread, we always use the first
* possible cpu).
@@ -159,13 +162,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
*
* We queue the work to the CPU on which it was submitted, but if the CPU dies
* it can be processed by another CPU.
+ *
+ * In particular, there is no such guarantee on PREEMPT_RT.
*/
int queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
- int ret;
+ int ret = 0, cpu = raw_smp_processor_id();
- ret = queue_work_on(get_cpu(), wq, work);
- put_cpu();
+ ret = queue_work_on(cpu, wq, work);
return ret;
}
@@ -202,7 +206,7 @@ static void delayed_work_timer_fn(unsigned long __data)
struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
struct workqueue_struct *wq = cwq->wq;
- __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
+ __queue_work(wq_per_cpu(wq, raw_smp_processor_id()), &dwork->work);
}
/**
@@ -883,6 +887,49 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
cwq->thread = NULL;
}
+void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu,
+ int policy, int rt_priority, int nice)
+{
+ struct sched_param param = { .sched_priority = rt_priority };
+ struct cpu_workqueue_struct *cwq;
+ mm_segment_t oldfs = get_fs();
+ struct task_struct *p;
+ unsigned long flags;
+ int ret;
+
+ cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+ spin_lock_irqsave(&cwq->lock, flags);
+ p = cwq->thread;
+ spin_unlock_irqrestore(&cwq->lock, flags);
+
+ set_user_nice(p, nice);
+
+ set_fs(KERNEL_DS);
+ ret = sys_sched_setscheduler(p->pid, policy, &param);
+ set_fs(oldfs);
+
+ WARN_ON(ret);
+}
+
+void set_workqueue_prio(struct workqueue_struct *wq, int policy,
+ int rt_priority, int nice)
+{
+ int cpu;
+
+ /* We don't need the distraction of CPUs appearing and vanishing. */
+ get_online_cpus();
+ spin_lock(&workqueue_lock);
+ if (is_wq_single_threaded(wq))
+ set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice);
+ else {
+ for_each_online_cpu(cpu)
+ set_workqueue_thread_prio(wq, cpu, policy,
+ rt_priority, nice);
+ }
+ spin_unlock(&workqueue_lock);
+ put_online_cpus();
+}
+
/**
* destroy_workqueue - safely terminate a workqueue
* @wq: target workqueue
@@ -1015,4 +1062,5 @@ void __init init_workqueues(void)
hotcpu_notifier(workqueue_cpu_callback, 0);
keventd_wq = create_workqueue("events");
BUG_ON(!keventd_wq);
+ set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20);
}
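
A hedged user-space analogue of set_workqueue_thread_prio() above (worker() and the priority value are examples only; elevating a thread to SCHED_FIFO needs CAP_SYS_NICE or root): a worker thread is switched to the FIFO class with pthread_setschedparam().

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static void *worker(void *unused)
    {
        (void)unused;
        sleep(1);                   /* stand-in for queued work */
        return NULL;
    }

    int main(void)
    {
        struct sched_param param = { .sched_priority = 1 };
        pthread_t tid;
        int ret;

        pthread_create(&tid, NULL, worker, NULL);

        ret = pthread_setschedparam(tid, SCHED_FIFO, &param);
        if (ret)
            fprintf(stderr, "pthread_setschedparam: %s\n", strerror(ret));
        else
            puts("worker switched to SCHED_FIFO prio 1");

        pthread_join(tid, NULL);
        return 0;
    }
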
diff --git a/lib/Kconfig b/lib/Kconfig
index bb1326d3839c..faefb8065b07 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -179,6 +179,7 @@ config HAVE_LMB
config CPUMASK_OFFSTACK
bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
+ depends on !PREEMPT_RT && BROKEN
help
Use dynamic allocation for cpumask_var_t, instead of putting
them on the stack. This is a bit more expensive, but avoids
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 12327b2bb785..8a2ddd3f0922 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -397,6 +397,8 @@ config DEBUG_RT_MUTEXES
help
This allows rt mutex semantics violations and rt mutex related
deadlocks (lockups) to be detected and reported automatically.
+ When real-time preemption is enabled, this includes spinlocks,
+ rwlocks, mutexes and (rw)semaphores.
config DEBUG_PI_LIST
bool
@@ -420,7 +422,7 @@ config DEBUG_SPINLOCK
config DEBUG_MUTEXES
bool "Mutex debugging: basic checks"
- depends on DEBUG_KERNEL
+ depends on DEBUG_KERNEL && !PREEMPT_RT
help
This feature allows mutex semantics violations to be detected and
reported.
diff --git a/lib/Makefile b/lib/Makefile
index 2e78277eff9d..ceeef2464d3f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,7 +34,8 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
-lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
+obj-$(CONFIG_PREEMPT_RT) += plist.o
+obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o
lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index f7874462effd..709c432b8a92 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -24,7 +24,7 @@
*
* Don't use in new code.
*/
-static DEFINE_SEMAPHORE(kernel_sem);
+DEFINE_SEMAPHORE(kernel_sem);
/*
* Re-acquire the kernel semaphore.
@@ -35,21 +35,23 @@ static DEFINE_SEMAPHORE(kernel_sem);
* about recursion, both due to the down() and due to the enabling of
* preemption. schedule() will re-check the preemption flag after
* reacquiring the semaphore.
+ *
+ * Called with interrupts disabled.
*/
int __lockfunc __reacquire_kernel_lock(void)
{
struct task_struct *task = current;
int saved_lock_depth = task->lock_depth;
+ local_irq_enable();
BUG_ON(saved_lock_depth < 0);
task->lock_depth = -1;
- __preempt_enable_no_resched();
down(&kernel_sem);
- preempt_disable();
task->lock_depth = saved_lock_depth;
+ local_irq_disable();
return 0;
}
@@ -67,11 +69,15 @@ void __lockfunc lock_kernel(void)
struct task_struct *task = current;
int depth = task->lock_depth + 1;
- if (likely(!depth))
+ if (likely(!depth)) {
/*
* No recursion worries - we set up lock_depth _after_
*/
down(&kernel_sem);
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ current->last_kernel_lock = __builtin_return_address(0);
+#endif
+ }
task->lock_depth = depth;
}
@@ -82,8 +88,12 @@ void __lockfunc unlock_kernel(void)
BUG_ON(task->lock_depth < 0);
- if (likely(--task->lock_depth < 0))
+ if (likely(--task->lock_depth < 0)) {
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ current->last_kernel_lock = NULL;
+#endif
up(&kernel_sem);
+ }
}
EXPORT_SYMBOL(lock_kernel);
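
The lock_depth accounting above is easy to misread, so here is a hedged
walk-through of the recursion scheme: only the outermost lock_kernel() takes
kernel_sem, inner calls merely bump the per-task counter.  do_something() is
purely illustrative:

	#include <linux/smp_lock.h>

	static void do_something(void)
	{
		lock_kernel();		/* lock_depth -1 -> 0: takes kernel_sem */
		lock_kernel();		/* lock_depth  0 -> 1: counter only */

		/* ... code relying on the big kernel lock ... */

		unlock_kernel();	/* lock_depth  1 -> 0: counter only */
		unlock_kernel();	/* lock_depth  0 -> -1: releases kernel_sem */
	}
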
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 619313ed6c46..65e7eab8498e 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -158,7 +158,7 @@ static void init_shared_classes(void)
local_bh_disable(); \
local_irq_disable(); \
lockdep_softirq_enter(); \
- WARN_ON(!in_softirq());
+ /* FIXME: preemptible softirqs. WARN_ON(!in_softirq()); */
#define SOFTIRQ_EXIT() \
lockdep_softirq_exit(); \
@@ -550,6 +550,11 @@ GENERATE_TESTCASE(init_held_rsem)
#undef E
/*
+ * FIXME: turn these into raw-spinlock tests on -rt
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+/*
* locking an irq-safe lock with irqs enabled:
*/
#define E1() \
@@ -890,6 +895,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
#include "locking-selftest-softirq.h"
// GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft)
+#endif /* !CONFIG_PREEMPT_RT */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map)
# define I_RWLOCK(x) lockdep_reset_lock(&rwlock_##x.dep_map)
@@ -998,7 +1005,7 @@ static inline void print_testname(const char *testname)
#define DO_TESTCASE_1(desc, name, nr) \
print_testname(desc"/"#nr); \
- dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \
+ dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \
printk("\n");
#define DO_TESTCASE_1B(desc, name, nr) \
@@ -1006,17 +1013,17 @@ static inline void print_testname(const char *testname)
dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \
printk("\n");
-#define DO_TESTCASE_3(desc, name, nr) \
- print_testname(desc"/"#nr); \
- dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \
- dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \
+#define DO_TESTCASE_3(desc, name, nr) \
+ print_testname(desc"/"#nr); \
+ dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \
+ dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \
dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \
printk("\n");
-#define DO_TESTCASE_3RW(desc, name, nr) \
- print_testname(desc"/"#nr); \
+#define DO_TESTCASE_3RW(desc, name, nr) \
+ print_testname(desc"/"#nr); \
dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\
- dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \
+ dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \
dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \
printk("\n");
@@ -1047,7 +1054,7 @@ static inline void print_testname(const char *testname)
print_testname(desc); \
dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \
dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \
- dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \
+ dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \
dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \
dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \
dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \
@@ -1179,6 +1186,7 @@ void locking_selftest(void)
/*
* irq-context testcases:
*/
+#ifndef CONFIG_PREEMPT_RT
DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1);
DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A);
DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B);
@@ -1188,6 +1196,7 @@ void locking_selftest(void)
DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
// DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
+#endif
if (unexpected_testcase_failures) {
printk("-----------------------------------------------------------------\n");
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 23abbd93cae1..e209012e5d31 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -157,12 +157,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
* succeed in getting a node here (and never reach
* kmem_cache_alloc)
*/
+ rtp = &get_cpu_var(radix_tree_preloads);
rtp = &__get_cpu_var(radix_tree_preloads);
if (rtp->nr) {
ret = rtp->nodes[rtp->nr - 1];
rtp->nodes[rtp->nr - 1] = NULL;
rtp->nr--;
}
+ put_cpu_var(radix_tree_preloads);
}
if (ret == NULL)
ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
@@ -195,6 +197,8 @@ radix_tree_node_free(struct radix_tree_node *node)
call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}
+#ifndef CONFIG_PREEMPT_RT
+
/*
* Load up this CPU's radix_tree_node buffer with sufficient objects to
* ensure that the addition of a single element in the tree cannot fail. On
@@ -227,6 +231,8 @@ out:
}
EXPORT_SYMBOL(radix_tree_preload);
+#endif
+
/*
* Return the maximum key which can be store into a
* radix tree with height HEIGHT.
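
For reference, a hedged sketch of the get_cpu_var()/put_cpu_var() pairing the
hunk above switches to: unlike a bare __get_cpu_var() access, the pair keeps
preemption disabled for the whole per-CPU update.  my_counter is hypothetical:

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(unsigned long, my_counter);	/* hypothetical */

	static void bump_my_counter(void)
	{
		unsigned long *p;

		p = &get_cpu_var(my_counter);	/* preempt_disable() + this CPU's copy */
		(*p)++;
		put_cpu_var(my_counter);	/* preempt_enable() */
	}
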
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 0d475d8167bf..e6dcd3b0d974 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>
+#include <linux/interrupt.h>
/**
* sg_next - return the next scatterlist entry in a list
@@ -399,7 +400,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
flush_kernel_dcache_page(miter->page);
if (miter->__flags & SG_MITER_ATOMIC) {
- WARN_ON(!irqs_disabled());
+ WARN_ON_NONRT(!irqs_disabled());
kunmap_atomic(miter->addr, KM_BIO_SRC_IRQ);
} else
kunmap(miter->page);
@@ -439,7 +440,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents,
sg_miter_start(&miter, sgl, nents, sg_flags);
- local_irq_save(flags);
+ local_irq_save_nort(flags);
while (sg_miter_next(&miter) && offset < buflen) {
unsigned int len;
@@ -456,7 +457,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents,
sg_miter_stop(&miter);
- local_irq_restore(flags);
+ local_irq_restore_nort(flags);
return offset;
}
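
The *_nort and WARN_ON_NONRT() annotations used above are defined elsewhere
in the -rt patch; as a hedged approximation (not the authoritative
definitions), they behave roughly as follows, so the section stays
interrupt-safe on mainline but remains preemptible on PREEMPT_RT:

	#ifndef CONFIG_PREEMPT_RT
	# define local_irq_save_nort(flags)	local_irq_save(flags)
	# define local_irq_restore_nort(flags)	local_irq_restore(flags)
	# define WARN_ON_NONRT(cond)		WARN_ON(cond)
	#else
	# define local_irq_save_nort(flags)	local_save_flags(flags)
	# define local_irq_restore_nort(flags)	do { (void)(flags); } while (0)
	# define WARN_ON_NONRT(cond)		do { } while (0)
	#endif
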
diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c
index 5f41fad4d302..70b5c1c27c07 100644
--- a/lib/spinlock_debug.c
+++ b/lib/spinlock_debug.c
@@ -31,6 +31,7 @@ void __atomic_spin_lock_init(atomic_spinlock_t *lock, const char *name,
EXPORT_SYMBOL(__atomic_spin_lock_init);
+#ifndef CONFIG_PREEMPT_RT
void __rwlock_init(rwlock_t *lock, const char *name,
struct lock_class_key *key)
{
@@ -46,8 +47,8 @@ void __rwlock_init(rwlock_t *lock, const char *name,
lock->owner = SPINLOCK_OWNER_INIT;
lock->owner_cpu = -1;
}
-
EXPORT_SYMBOL(__rwlock_init);
+#endif
static void spin_bug(atomic_spinlock_t *lock, const char *msg)
{
@@ -154,6 +155,8 @@ void _raw_spin_unlock(atomic_spinlock_t *lock)
__raw_spin_unlock(&lock->raw_lock);
}
+#ifndef CONFIG_PREEMPT_RT
+
static void rwlock_bug(rwlock_t *lock, const char *msg)
{
if (!debug_locks_off())
@@ -295,3 +298,4 @@ void _raw_write_unlock(rwlock_t *lock)
debug_write_unlock(lock);
__raw_write_unlock(&lock->raw_lock);
}
+#endif
diff --git a/mm/bounce.c b/mm/bounce.c
index a2b76a588e34..4a91eedbf3ac 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
+#include <linux/interrupt.h>
#include <asm/tlbflush.h>
#include <trace/events/block.h>
@@ -49,11 +50,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
unsigned long flags;
unsigned char *vto;
- local_irq_save(flags);
+ local_irq_save_nort(flags);
vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
memcpy(vto + to->bv_offset, vfrom, to->bv_len);
kunmap_atomic(vto, KM_BOUNCE_READ);
- local_irq_restore(flags);
+ local_irq_restore_nort(flags);
}
#else /* CONFIG_HIGHMEM */
diff --git a/mm/filemap.c b/mm/filemap.c
index ccea3b665c12..769d389262d7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1890,7 +1890,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
char *kaddr;
size_t copied;
- BUG_ON(!in_atomic());
+// BUG_ON(!in_atomic());
kaddr = kmap_atomic(page, KM_USER0);
if (likely(i->nr_segs == 1)) {
int left;
diff --git a/mm/highmem.c b/mm/highmem.c
index 25878cc49daa..66e915acd783 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -14,6 +14,11 @@
* based on Linus' idea.
*
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ *
+ * Largely rewritten to get rid of all global locks
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
*/
#include <linux/mm.h>
@@ -26,18 +31,15 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
+#include <linux/hardirq.h>
+
#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
-/*
- * Virtual_count is not a pure "count".
- * 0 means that it is not mapped, and has not been mapped
- * since a TLB flush - it is usable.
- * 1 means that there are no users, but it has been mapped
- * since the last TLB flush - so we can't use it.
- * n means that there are (n-1) current users of it.
- */
#ifdef CONFIG_HIGHMEM
+static int __set_page_address(struct page *page, void *virtual, int pos);
+
unsigned long totalhigh_pages __read_mostly;
EXPORT_SYMBOL(totalhigh_pages);
@@ -58,13 +60,21 @@ unsigned int nr_free_highpages (void)
return pages;
}
-static int pkmap_count[LAST_PKMAP];
-static unsigned int last_pkmap_nr;
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
+/*
+ * count is not a pure "count".
+ * 0 means it is owned exclusively by someone.
+ * 1 means it is free for use - either mapped or not.
+ * n means that there are (n-1) current users of it.
+ */
+static atomic_t pkmap_count[LAST_PKMAP];
+static atomic_t pkmap_hand;
+static atomic_t pkmap_free;
+static atomic_t pkmap_users;
pte_t * pkmap_page_table;
-static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
+static DECLARE_WAIT_QUEUE_HEAD(pkmap_wait);
+
/*
* Most architectures have no use for kmap_high_get(), so let's abstract
@@ -85,131 +95,261 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
#endif
-static void flush_all_zero_pkmaps(void)
+/*
+ * Try to free a given kmap slot.
+ *
+ * Returns:
+ * -1 - in use
+ * 0 - free, no TLB flush needed
+ * 1 - free, needs TLB flush
+ */
+static int pkmap_try_free(int pos)
{
- int i;
- int need_flush = 0;
+ if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1)
+ return -1;
+ atomic_dec(&pkmap_free);
+ /*
+ * TODO: add a young bit to make it CLOCK
+ */
+ if (!pte_none(pkmap_page_table[pos])) {
+ struct page *page = pte_page(pkmap_page_table[pos]);
+ unsigned long addr = PKMAP_ADDR(pos);
+ pte_t *ptep = &pkmap_page_table[pos];
+
+ VM_BUG_ON(addr != (unsigned long)page_address(page));
- flush_cache_kmaps();
+ if (!__set_page_address(page, NULL, pos))
+ BUG();
+ flush_kernel_dcache_page(page);
+ pte_clear(&init_mm, addr, ptep);
+
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline void pkmap_put(atomic_t *counter)
+{
+ switch (atomic_dec_return(counter)) {
+ case 0:
+ BUG();
+
+ case 1:
+ atomic_inc(&pkmap_free);
+ wake_up(&pkmap_wait);
+ }
+}
+
+#define TLB_BATCH 32
+
+static int pkmap_get_free(void)
+{
+ int i, pos, flush;
+
+restart:
for (i = 0; i < LAST_PKMAP; i++) {
- struct page *page;
+ pos = atomic_inc_return(&pkmap_hand) & LAST_PKMAP_MASK;
+ flush = pkmap_try_free(pos);
+ if (flush >= 0)
+ goto got_one;
+ }
+
+ atomic_dec(&pkmap_free);
+ /*
+ * wait for somebody else to unmap their entries
+ */
+ if (likely(!in_interrupt()))
+ wait_event(pkmap_wait, atomic_read(&pkmap_free) != 0);
+
+ goto restart;
+
+got_one:
+ if (flush) {
+#if 0
+ flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1));
+#else
+ int pos2 = (pos + 1) & LAST_PKMAP_MASK;
+ int nr;
+ int entries[TLB_BATCH];
/*
- * zero means we don't have anything to do,
- * >1 means that it is still in use. Only
- * a count of 1 means that it is free but
- * needs to be unmapped
+ * For those architectures that cannot help but flush the
+ * whole TLB, flush some more entries to make it worthwhile.
+ * Scan ahead of the hand to minimise search distances.
*/
- if (pkmap_count[i] != 1)
- continue;
- pkmap_count[i] = 0;
+ for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH;
+ i++, pos2 = (pos2 + 1) & LAST_PKMAP_MASK) {
+
+ flush = pkmap_try_free(pos2);
+ if (flush < 0)
+ continue;
+
+ if (!flush) {
+ atomic_t *counter = &pkmap_count[pos2];
+ VM_BUG_ON(atomic_read(counter) != 0);
+ atomic_set(counter, 2);
+ pkmap_put(counter);
+ } else
+ entries[nr++] = pos2;
+ }
+ flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
- /* sanity check */
- BUG_ON(pte_none(pkmap_page_table[i]));
+ for (i = 0; i < nr; i++) {
+ atomic_t *counter = &pkmap_count[entries[i]];
+ VM_BUG_ON(atomic_read(counter) != 0);
+ atomic_set(counter, 2);
+ pkmap_put(counter);
+ }
+#endif
+ }
+ return pos;
+}
+
+static unsigned long pkmap_insert(struct page *page)
+{
+ int pos = pkmap_get_free();
+ unsigned long vaddr = PKMAP_ADDR(pos);
+ pte_t *ptep = &pkmap_page_table[pos];
+ pte_t entry = mk_pte(page, kmap_prot);
+ atomic_t *counter = &pkmap_count[pos];
+ VM_BUG_ON(atomic_read(counter) != 0);
+
+ set_pte_at(&init_mm, vaddr, ptep, entry);
+ if (unlikely(!__set_page_address(page, (void *)vaddr, pos))) {
/*
- * Don't need an atomic fetch-and-clear op here;
- * no-one has the page mapped, and cannot get at
- * its virtual address (and hence PTE) without first
- * getting the kmap_lock (which is held here).
- * So no dangers, even with speculative execution.
+ * concurrent pkmap_inserts for this page -
+ * the other won the race, release this entry.
+ *
+ * we can still clear the pte without a tlb flush since
+ * it couldn't have been used yet.
*/
- page = pte_page(pkmap_page_table[i]);
- pte_clear(&init_mm, (unsigned long)page_address(page),
- &pkmap_page_table[i]);
+ pte_clear(&init_mm, vaddr, ptep);
+ VM_BUG_ON(atomic_read(counter) != 0);
+ atomic_set(counter, 2);
+ pkmap_put(counter);
+ vaddr = 0;
+ } else
+ atomic_set(counter, 2);
- set_page_address(page, NULL);
- need_flush = 1;
- }
- if (need_flush)
- flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+ return vaddr;
}
-/**
- * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings
+/*
+ * Flush all unused kmap mappings in order to remove stray mappings.
*/
void kmap_flush_unused(void)
{
- lock_kmap();
- flush_all_zero_pkmaps();
- unlock_kmap();
+ WARN_ON_ONCE(1);
}
-static inline unsigned long map_new_virtual(struct page *page)
+/*
+ * Avoid starvation deadlock by limiting the number of tasks that can obtain a
+ * kmap to (LAST_PKMAP - KM_TYPE_NR*NR_CPUS)/2.
+ */
+static void kmap_account(void)
{
- unsigned long vaddr;
- int count;
-
-start:
- count = LAST_PKMAP;
- /* Find an empty entry */
- for (;;) {
- last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
- if (!last_pkmap_nr) {
- flush_all_zero_pkmaps();
- count = LAST_PKMAP;
- }
- if (!pkmap_count[last_pkmap_nr])
- break; /* Found a usable entry */
- if (--count)
- continue;
+ int weight;
+#ifndef CONFIG_PREEMPT_RT
+ if (in_interrupt()) {
+ /* irqs can always get them */
+ weight = -1;
+ } else
+#endif
+ if (current->flags & PF_KMAP) {
+ current->flags &= ~PF_KMAP;
+ /* we already accounted the second */
+ weight = 0;
+ } else {
+ /* mark 1, account 2 */
+ current->flags |= PF_KMAP;
+ weight = 2;
+ }
+
+ if (weight > 0) {
/*
- * Sleep for somebody else to unmap their entries
+ * reserve KM_TYPE_NR maps per CPU for interrupt context
*/
- {
- DECLARE_WAITQUEUE(wait, current);
-
- __set_current_state(TASK_UNINTERRUPTIBLE);
- add_wait_queue(&pkmap_map_wait, &wait);
- unlock_kmap();
- schedule();
- remove_wait_queue(&pkmap_map_wait, &wait);
- lock_kmap();
-
- /* Somebody else might have mapped it while we slept */
- if (page_address(page))
- return (unsigned long)page_address(page);
-
- /* Re-start */
- goto start;
+ const int target = LAST_PKMAP
+#ifndef CONFIG_PREEMPT_RT
+ - KM_TYPE_NR*NR_CPUS
+#endif
+ ;
+
+again:
+ wait_event(pkmap_wait,
+ atomic_read(&pkmap_users) + weight <= target);
+
+ if (atomic_add_return(weight, &pkmap_users) > target) {
+ atomic_sub(weight, &pkmap_users);
+ goto again;
}
}
- vaddr = PKMAP_ADDR(last_pkmap_nr);
- set_pte_at(&init_mm, vaddr,
- &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
+}
- pkmap_count[last_pkmap_nr] = 1;
- set_page_address(page, (void *)vaddr);
+static void kunmap_account(void)
+{
+ int weight;
- return vaddr;
+#ifndef CONFIG_PREEMPT_RT
+ if (in_irq()) {
+ weight = -1;
+ } else
+#endif
+ if (current->flags & PF_KMAP) {
+ /* there was only 1 kmap, un-account both */
+ current->flags &= ~PF_KMAP;
+ weight = 2;
+ } else {
+ /* there were two kmaps, un-account per kunmap */
+ weight = 1;
+ }
+
+ if (weight > 0)
+ atomic_sub(weight, &pkmap_users);
+ wake_up(&pkmap_wait);
}
-/**
- * kmap_high - map a highmem page into memory
- * @page: &struct page to map
- *
- * Returns the page's virtual memory address.
- *
- * We cannot call this from interrupts, as it may block.
- */
void *kmap_high(struct page *page)
{
unsigned long vaddr;
- /*
- * For highmem pages, we can't trust "virtual" until
- * after we have the lock.
- */
- lock_kmap();
+
+ kmap_account();
+again:
vaddr = (unsigned long)page_address(page);
+ if (vaddr) {
+ atomic_t *counter = &pkmap_count[PKMAP_NR(vaddr)];
+ if (atomic_inc_not_zero(counter)) {
+ /*
+ * atomic_inc_not_zero implies a (memory) barrier on success
+ * so page address will be reloaded.
+ */
+ unsigned long vaddr2 = (unsigned long)page_address(page);
+ if (likely(vaddr == vaddr2))
+ return (void *)vaddr;
+
+ /*
+ * Oops, we got someone else.
+ *
+ * This can happen if we get preempted after
+ * page_address() and before atomic_inc_not_zero()
+ * and during that preemption this slot is freed and
+ * reused.
+ */
+ pkmap_put(counter);
+ goto again;
+ }
+ }
+
+ vaddr = pkmap_insert(page);
if (!vaddr)
- vaddr = map_new_virtual(page);
- pkmap_count[PKMAP_NR(vaddr)]++;
- BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
- unlock_kmap();
- return (void*) vaddr;
+ goto again;
+
+ return (void *)vaddr;
}
EXPORT_SYMBOL(kmap_high);
@@ -240,51 +380,12 @@ void *kmap_high_get(struct page *page)
}
#endif
-/**
- * kunmap_high - map a highmem page into memory
- * @page: &struct page to unmap
- *
- * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
- * only from user context.
- */
-void kunmap_high(struct page *page)
+ void kunmap_high(struct page *page)
{
- unsigned long vaddr;
- unsigned long nr;
- unsigned long flags;
- int need_wakeup;
-
- lock_kmap_any(flags);
- vaddr = (unsigned long)page_address(page);
+ unsigned long vaddr = (unsigned long)page_address(page);
BUG_ON(!vaddr);
- nr = PKMAP_NR(vaddr);
-
- /*
- * A count must never go down to zero
- * without a TLB flush!
- */
- need_wakeup = 0;
- switch (--pkmap_count[nr]) {
- case 0:
- BUG();
- case 1:
- /*
- * Avoid an unnecessary wake_up() function call.
- * The common case is pkmap_count[] == 1, but
- * no waiters.
- * The tasks queued in the wait-queue are guarded
- * by both the lock in the wait-queue-head and by
- * the kmap_lock. As the kmap_lock is held here,
- * no need for the wait-queue-head's lock. Simply
- * test if the queue is empty.
- */
- need_wakeup = waitqueue_active(&pkmap_map_wait);
- }
- unlock_kmap_any(flags);
-
- /* do wake-up, if needed, race-free outside of the spin lock */
- if (need_wakeup)
- wake_up(&pkmap_map_wait);
+ pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]);
+ kunmap_account();
}
EXPORT_SYMBOL(kunmap_high);
@@ -295,19 +396,13 @@ EXPORT_SYMBOL(kunmap_high);
#define PA_HASH_ORDER 7
/*
- * Describes one page->virtual association
+ * Describes one page->virtual address association.
*/
-struct page_address_map {
+static struct page_address_map {
struct page *page;
void *virtual;
struct list_head list;
-};
-
-/*
- * page_address_map freelist, allocated from page_address_maps.
- */
-static struct list_head page_address_pool; /* freelist */
-static spinlock_t pool_lock; /* protects page_address_pool */
+} page_address_maps[LAST_PKMAP];
/*
* Hash table bucket
@@ -328,29 +423,37 @@ static struct page_address_slot *page_slot(struct page *page)
*
* Returns the page's virtual address.
*/
-void *page_address(struct page *page)
-{
- unsigned long flags;
- void *ret;
- struct page_address_slot *pas;
- if (!PageHighMem(page))
- return lowmem_page_address(page);
+static void *__page_address(struct page_address_slot *pas, struct page *page)
+{
+ void *ret = NULL;
- pas = page_slot(page);
- ret = NULL;
- spin_lock_irqsave(&pas->lock, flags);
if (!list_empty(&pas->lh)) {
struct page_address_map *pam;
list_for_each_entry(pam, &pas->lh, list) {
if (pam->page == page) {
ret = pam->virtual;
- goto done;
+ break;
}
}
}
-done:
+
+ return ret;
+}
+
+void *page_address(struct page *page)
+{
+ unsigned long flags;
+ void *ret;
+ struct page_address_slot *pas;
+
+ if (!PageHighMem(page))
+ return lowmem_page_address(page);
+
+ pas = page_slot(page);
+ spin_lock_irqsave(&pas->lock, flags);
+ ret = __page_address(pas, page);
spin_unlock_irqrestore(&pas->lock, flags);
return ret;
}
@@ -362,62 +465,90 @@ EXPORT_SYMBOL(page_address);
* @page: &struct page to set
* @virtual: virtual address to use
*/
-void set_page_address(struct page *page, void *virtual)
+static int __set_page_address(struct page *page, void *virtual, int pos)
{
+ int ret = 0;
unsigned long flags;
struct page_address_slot *pas;
struct page_address_map *pam;
- BUG_ON(!PageHighMem(page));
+ VM_BUG_ON(!PageHighMem(page));
+ VM_BUG_ON(atomic_read(&pkmap_count[pos]) != 0);
+ VM_BUG_ON(pos < 0 || pos >= LAST_PKMAP);
pas = page_slot(page);
- if (virtual) { /* Add */
- BUG_ON(list_empty(&page_address_pool));
-
- spin_lock_irqsave(&pool_lock, flags);
- pam = list_entry(page_address_pool.next,
- struct page_address_map, list);
- list_del(&pam->list);
- spin_unlock_irqrestore(&pool_lock, flags);
-
- pam->page = page;
- pam->virtual = virtual;
-
- spin_lock_irqsave(&pas->lock, flags);
- list_add_tail(&pam->list, &pas->lh);
- spin_unlock_irqrestore(&pas->lock, flags);
- } else { /* Remove */
- spin_lock_irqsave(&pas->lock, flags);
- list_for_each_entry(pam, &pas->lh, list) {
- if (pam->page == page) {
- list_del(&pam->list);
- spin_unlock_irqrestore(&pas->lock, flags);
- spin_lock_irqsave(&pool_lock, flags);
- list_add_tail(&pam->list, &page_address_pool);
- spin_unlock_irqrestore(&pool_lock, flags);
- goto done;
- }
+ pam = &page_address_maps[pos];
+
+ spin_lock_irqsave(&pas->lock, flags);
+ if (virtual) { /* add */
+ VM_BUG_ON(!list_empty(&pam->list));
+
+ if (!__page_address(pas, page)) {
+ pam->page = page;
+ pam->virtual = virtual;
+ list_add_tail(&pam->list, &pas->lh);
+ ret = 1;
+ }
+ } else { /* remove */
+ if (!list_empty(&pam->list)) {
+ list_del_init(&pam->list);
+ ret = 1;
}
- spin_unlock_irqrestore(&pas->lock, flags);
}
-done:
- return;
+ spin_unlock_irqrestore(&pas->lock, flags);
+
+ return ret;
}
-static struct page_address_map page_address_maps[LAST_PKMAP];
+int set_page_address(struct page *page, void *virtual)
+{
+ /*
+ * set_page_address is not supposed to be called when using
+ * hashed virtual addresses.
+ */
+ BUG();
+ return 0;
+}
-void __init page_address_init(void)
+void __init __page_address_init(void)
{
int i;
- INIT_LIST_HEAD(&page_address_pool);
for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
- list_add(&page_address_maps[i].list, &page_address_pool);
+ INIT_LIST_HEAD(&page_address_maps[i].list);
+
for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
INIT_LIST_HEAD(&page_address_htable[i].lh);
spin_lock_init(&page_address_htable[i].lock);
}
- spin_lock_init(&pool_lock);
+}
+
+#elif defined (CONFIG_HIGHMEM) /* HASHED_PAGE_VIRTUAL */
+
+static int __set_page_address(struct page *page, void *virtual, int pos)
+{
+ return set_page_address(page, virtual);
+}
+
+#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
+
+#if defined(CONFIG_HIGHMEM) || defined(HASHED_PAGE_VIRTUAL)
+
+void __init page_address_init(void)
+{
+#ifdef CONFIG_HIGHMEM
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pkmap_count); i++)
+ atomic_set(&pkmap_count[i], 1);
+ atomic_set(&pkmap_hand, 0);
+ atomic_set(&pkmap_free, LAST_PKMAP);
+ atomic_set(&pkmap_users, 0);
+#endif
+
+#ifdef HASHED_PAGE_VIRTUAL
+ __page_address_init();
+#endif
}
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
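
The highmem rewrite above replaces the global kmap_lock with per-slot atomic
counters.  A hedged, stand-alone distillation of that counter protocol
follows; slot_get(), slot_put() and slot_try_free() are illustrative names,
not functions from the patch:

	#include <linux/kernel.h>
	#include <asm/atomic.h>

	/*
	 *  0 - slot owned exclusively (being set up or torn down)
	 *  1 - slot free; it may still be mapped, but has no users
	 *  n - (n-1) active users
	 */
	static atomic_t slot = ATOMIC_INIT(1);

	static int slot_get(void)
	{
		/* Piggy-back on the slot unless someone owns it exclusively. */
		return atomic_inc_not_zero(&slot);
	}

	static void slot_put(void)
	{
		switch (atomic_dec_return(&slot)) {
		case 0:
			BUG();	/* put without a matching get */
		case 1:
			/* last user gone: slot is free again, wake any waiters */
			break;
		}
	}

	static int slot_try_free(void)
	{
		/* Claim the slot exclusively, but only if it has no users. */
		return atomic_cmpxchg(&slot, 1, 0) == 1;
	}
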
diff --git a/mm/memory.c b/mm/memory.c
index a6fb32fc67dc..c39396955651 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -959,17 +959,14 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
+unsigned long unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
long zap_work = ZAP_BLOCK_SIZE;
- unsigned long tlb_start = 0; /* For tlb_finish_mmu */
- int tlb_start_valid = 0;
unsigned long start = start_addr;
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
- int fullmm = (*tlbp)->fullmm;
struct mm_struct *mm = vma->vm_mm;
mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -990,11 +987,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
untrack_pfn_vma(vma, 0, 0);
while (start != end) {
- if (!tlb_start_valid) {
- tlb_start = start;
- tlb_start_valid = 1;
- }
-
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
* It is undesirable to test vma->vm_file as it
@@ -1015,7 +1007,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
start = end;
} else
- start = unmap_page_range(*tlbp, vma,
+ start = unmap_page_range(tlb, vma,
start, end, &zap_work, details);
if (zap_work > 0) {
@@ -1023,19 +1015,13 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
break;
}
- tlb_finish_mmu(*tlbp, tlb_start, start);
-
if (need_resched() ||
(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
- if (i_mmap_lock) {
- *tlbp = NULL;
+ if (i_mmap_lock)
goto out;
- }
cond_resched();
}
- *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
- tlb_start_valid = 0;
zap_work = ZAP_BLOCK_SIZE;
}
}
@@ -1055,16 +1041,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
struct mm_struct *mm = vma->vm_mm;
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
unsigned long end = address + size;
unsigned long nr_accounted = 0;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
- if (tlb)
- tlb_finish_mmu(tlb, address, end);
+ tlb_finish_mmu(&tlb, address, end);
return end;
}
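
The conversion above turns struct mmu_gather from a per-CPU object handed out
by tlb_gather_mmu() into a caller-provided, typically on-stack structure.  A
hedged sketch of the resulting caller pattern; my_unmap() is hypothetical:

	#include <linux/mm.h>
	#include <asm/tlb.h>

	static void my_unmap(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
	{
		struct mm_struct *mm = vma->vm_mm;
		struct mmu_gather tlb;		/* lives on the stack now */
		unsigned long nr_accounted = 0;

		tlb_gather_mmu(&tlb, mm, 0);	/* was: tlb = tlb_gather_mmu(mm, 0) */
		unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
		tlb_finish_mmu(&tlb, start, end);
	}
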
diff --git a/mm/mmap.c b/mm/mmap.c
index 34579b23ebd5..c7c61d04aac3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1775,17 +1775,17 @@ static void unmap_region(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
unsigned long nr_accounted = 0;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+ free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
- tlb_finish_mmu(tlb, start, end);
+ tlb_finish_mmu(&tlb, start, end);
}
/*
@@ -1967,10 +1967,16 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
#ifdef CONFIG_DEBUG_VM
- if (unlikely(down_read_trylock(&mm->mmap_sem))) {
+# ifdef CONFIG_PREEMPT_RT
+ if (unlikely(!rwsem_is_locked(&mm->mmap_sem))) {
WARN_ON(1);
- up_read(&mm->mmap_sem);
}
+# else
+ if (unlikely(down_read_trylock(&mm->mmap_sem))) {
+ WARN_ON(1);
+ up_read(&mm->mmap_sem);
+ }
+# endif
#endif
}
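
A hedged restatement of the two debug idioms contrasted in the hunk above,
wrapped into a hypothetical helper; on PREEMPT_RT the trylock probe is
replaced by the cheaper rwsem_is_locked() test:

	#include <linux/mm.h>

	static inline void assert_mmap_sem_write_locked(struct mm_struct *mm)
	{
	#ifdef CONFIG_PREEMPT_RT
		WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
	#else
		/* If we can take it for read, nobody holds it for write. */
		if (unlikely(down_read_trylock(&mm->mmap_sem))) {
			WARN_ON(1);
			up_read(&mm->mmap_sem);
		}
	#endif
	}
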
@@ -2084,7 +2090,7 @@ EXPORT_SYMBOL(do_brk);
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
unsigned long end;
@@ -2109,13 +2115,13 @@ void exit_mmap(struct mm_struct *mm)
lru_add_drain();
flush_cache_mm(mm);
- tlb = tlb_gather_mmu(mm, 1);
+ tlb_gather_mmu(&tlb, mm, 1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
- tlb_finish_mmu(tlb, 0, end);
+ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ tlb_finish_mmu(&tlb, 0, end);
/*
* Walk the list again, actually closing and freeing it,
diff --git a/net/core/dev.c b/net/core/dev.c
index 6a94475aee85..24cfd4493036 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1887,42 +1887,52 @@ gso:
Check this and shot the lock. It is not prone from deadlocks.
Either shot noqueue qdisc, it is even simpler 8)
*/
- if (dev->flags & IFF_UP) {
- int cpu = smp_processor_id(); /* ok because BHs are off */
+ if (!(dev->flags & IFF_UP))
+ goto err;
- if (txq->xmit_lock_owner != cpu) {
+ /* Recursion is detected! It is possible, unfortunately: */
+ if (netif_tx_lock_recursion(txq))
+ goto err_recursion;
- HARD_TX_LOCK(dev, txq, cpu);
+ HARD_TX_LOCK(dev, txq);
- if (!netif_tx_queue_stopped(txq)) {
- rc = 0;
- if (!dev_hard_start_xmit(skb, dev, txq)) {
- HARD_TX_UNLOCK(dev, txq);
- goto out;
- }
- }
- HARD_TX_UNLOCK(dev, txq);
- if (net_ratelimit())
- printk(KERN_CRIT "Virtual device %s asks to "
- "queue packet!\n", dev->name);
- } else {
- /* Recursion is detected! It is possible,
- * unfortunately */
- if (net_ratelimit())
- printk(KERN_CRIT "Dead loop on virtual device "
- "%s, fix it urgently!\n", dev->name);
- }
+ if (netif_tx_queue_stopped(txq))
+ goto err_tx_unlock;
+
+ if (dev_hard_start_xmit(skb, dev, txq))
+ goto err_tx_unlock;
+
+ rc = 0;
+ HARD_TX_UNLOCK(dev, txq);
+
+out:
+ rcu_read_unlock_bh();
+ return rc;
+
+err_recursion:
+ if (net_ratelimit()) {
+ printk(KERN_CRIT
+ "Dead loop on virtual device %s, fix it urgently!\n",
+ dev->name);
+ }
+ goto err;
+
+err_tx_unlock:
+ HARD_TX_UNLOCK(dev, txq);
+
+ if (net_ratelimit()) {
+ printk(KERN_CRIT "Virtual device %s asks to queue packet!\n",
+ dev->name);
}
+ /* Fall through: */
+err:
rc = -ENETDOWN;
rcu_read_unlock_bh();
out_kfree_skb:
kfree_skb(skb);
return rc;
-out:
- rcu_read_unlock_bh();
- return rc;
}
@@ -1995,8 +2005,8 @@ int netif_rx_ni(struct sk_buff *skb)
{
int err;
- preempt_disable();
err = netif_rx(skb);
+ preempt_disable();
if (local_softirq_pending())
do_softirq();
preempt_enable();
@@ -2008,7 +2018,8 @@ EXPORT_SYMBOL(netif_rx_ni);
static void net_tx_action(struct softirq_action *h)
{
- struct softnet_data *sd = &__get_cpu_var(softnet_data);
+ struct softnet_data *sd = &per_cpu(softnet_data,
+ raw_smp_processor_id());
if (sd->completion_queue) {
struct sk_buff *clist;
@@ -2024,6 +2035,11 @@ static void net_tx_action(struct softirq_action *h)
WARN_ON(atomic_read(&skb->users));
__kfree_skb(skb);
+ /*
+ * Safe to reschedule - the list is private
+ * at this point.
+ */
+ cond_resched_softirq_context();
}
}
@@ -2042,6 +2058,22 @@ static void net_tx_action(struct softirq_action *h)
head = head->next_sched;
root_lock = qdisc_lock(q);
+ /*
+ * We are executing in softirq context here, and
+ * if softirqs are preemptible, we must avoid
+ * infinite reactivation of the softirq by
+ * either the tx handler, or by netif_schedule().
+ * (it would result in an infinitely looping
+ * softirq context)
+ * So we take the spinlock unconditionally.
+ */
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+ spin_lock(root_lock);
+ smp_mb__before_clear_bit();
+ clear_bit(__QDISC_STATE_SCHED, &q->state);
+ qdisc_run(q);
+ spin_unlock(root_lock);
+#else
if (spin_trylock(root_lock)) {
smp_mb__before_clear_bit();
clear_bit(__QDISC_STATE_SCHED,
@@ -2058,6 +2090,7 @@ static void net_tx_action(struct softirq_action *h)
&q->state);
}
}
+#endif
}
}
}
@@ -2270,7 +2303,7 @@ int netif_receive_skb(struct sk_buff *skb)
skb->dev = orig_dev->master;
}
- __get_cpu_var(netdev_rx_stat).total++;
+ per_cpu(netdev_rx_stat, raw_smp_processor_id()).total++;
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
@@ -2660,9 +2693,10 @@ EXPORT_SYMBOL(napi_gro_frags);
static int process_backlog(struct napi_struct *napi, int quota)
{
int work = 0;
- struct softnet_data *queue = &__get_cpu_var(softnet_data);
+ struct softnet_data *queue;
unsigned long start_time = jiffies;
+ queue = &per_cpu(softnet_data, raw_smp_processor_id());
napi->weight = weight_p;
do {
struct sk_buff *skb;
@@ -2694,7 +2728,7 @@ void __napi_schedule(struct napi_struct *n)
local_irq_save(flags);
list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);
@@ -2848,7 +2882,7 @@ out:
softnet_break:
__get_cpu_var(netdev_rx_stat).time_squeeze++;
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}
@@ -4644,7 +4678,7 @@ static void __netdev_init_queue_locks_one(struct net_device *dev,
{
spin_lock_init(&dev_queue->_xmit_lock);
netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
- dev_queue->xmit_lock_owner = -1;
+ dev_queue->xmit_lock_owner = (void *)-1;
}
static void netdev_init_queue_locks(struct net_device *dev)
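
HARD_TX_LOCK() loses its cpu argument above and xmit_lock_owner becomes a
pointer; the matching netif_tx_lock_recursion() helper is defined elsewhere in
the -rt patch.  Purely as a hedged guess at its shape (not the authoritative
definition), it is assumed to compare against the owning task rather than a
CPU id, which stays correct even when the transmit path is preempted and
migrated between CPUs:

	#include <linux/netdevice.h>

	/* Assumed sketch - the real helper lives elsewhere in the -rt patch. */
	static inline int netif_tx_lock_recursion(struct netdev_queue *txq)
	{
		return txq->xmit_lock_owner == (void *)current;
	}
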
diff --git a/net/core/flow.c b/net/core/flow.c
index 96015871ecea..f032d1c37192 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -39,9 +39,10 @@ atomic_t flow_cache_genid = ATOMIC_INIT(0);
static u32 flow_hash_shift;
#define flow_hash_size (1 << flow_hash_shift)
-static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
-#define flow_table(cpu) (per_cpu(flow_tables, cpu))
+static DEFINE_PER_CPU_LOCKED(struct flow_cache_entry **, flow_tables);
+
+#define flow_table(cpu) (per_cpu_var_locked(flow_tables, cpu))
static struct kmem_cache *flow_cachep __read_mostly;
@@ -168,24 +169,24 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
flow_resolve_t resolver)
{
- struct flow_cache_entry *fle, **head;
+ struct flow_cache_entry **table, *fle, **head;
unsigned int hash;
int cpu;
local_bh_disable();
- cpu = smp_processor_id();
+ table = get_cpu_var_locked(flow_tables, &cpu);
fle = NULL;
/* Packet really early in init? Making flow_cache_init a
* pre-smp initcall would solve this. --RR */
- if (!flow_table(cpu))
+ if (!table)
goto nocache;
if (flow_hash_rnd_recalc(cpu))
flow_new_hash_rnd(cpu);
hash = flow_hash_code(key, cpu);
- head = &flow_table(cpu)[hash];
+ head = &table[hash];
for (fle = *head; fle; fle = fle->next) {
if (fle->family == family &&
fle->dir == dir &&
@@ -195,6 +196,7 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
if (ret)
atomic_inc(fle->object_ref);
+ put_cpu_var_locked(flow_tables, cpu);
local_bh_enable();
return ret;
@@ -220,6 +222,8 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
}
nocache:
+ put_cpu_var_locked(flow_tables, cpu);
+
{
int err;
void *obj;
@@ -249,14 +253,15 @@ nocache:
static void flow_cache_flush_tasklet(unsigned long data)
{
struct flow_flush_info *info = (void *)data;
+ struct flow_cache_entry **table;
int i;
int cpu;
- cpu = smp_processor_id();
+ table = get_cpu_var_locked(flow_tables, &cpu);
for (i = 0; i < flow_hash_size; i++) {
struct flow_cache_entry *fle;
- fle = flow_table(cpu)[i];
+ fle = table[i];
for (; fle; fle = fle->next) {
unsigned genid = atomic_read(&flow_cache_genid);
@@ -267,6 +272,7 @@ static void flow_cache_flush_tasklet(unsigned long data)
atomic_dec(fle->object_ref);
}
}
+ put_cpu_var_locked(flow_tables, cpu);
if (atomic_dec_and_test(&info->cpuleft))
complete(&info->completion);
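
The flow-cache hunks above move to the -rt per-CPU-locked accessors: instead
of relying on preemption being disabled, get_cpu_var_locked() hands back the
per-CPU object with a per-CPU lock held, so the critical section may sleep.
A hedged sketch of the idiom; my_table is hypothetical, the accessors
themselves come from the -rt patch:

	static DEFINE_PER_CPU_LOCKED(struct flow_cache_entry **, my_table);

	static void touch_my_table(void)
	{
		struct flow_cache_entry **table;
		int cpu;

		table = get_cpu_var_locked(my_table, &cpu);	/* lock this CPU's slot */
		/* ... use table; sleeping is allowed here on -rt ... */
		put_cpu_var_locked(my_table, cpu);		/* unlock */
	}
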
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index df30feb2fc72..0daa44b46aba 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -69,20 +69,20 @@ static void queue_process(struct work_struct *work)
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
- local_irq_save(flags);
- __netif_tx_lock(txq, smp_processor_id());
+ local_irq_save_nort(flags);
+ __netif_tx_lock(txq);
if (netif_tx_queue_stopped(txq) ||
netif_tx_queue_frozen(txq) ||
ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
skb_queue_head(&npinfo->txq, skb);
__netif_tx_unlock(txq);
- local_irq_restore(flags);
+ local_irq_restore_nort(flags);
schedule_delayed_work(&npinfo->tx_work, HZ/10);
return;
}
__netif_tx_unlock(txq);
- local_irq_restore(flags);
+ local_irq_restore_nort(flags);
}
}
@@ -153,7 +153,7 @@ static void poll_napi(struct net_device *dev)
int budget = 16;
list_for_each_entry(napi, &dev->napi_list, dev_list) {
- if (napi->poll_owner != smp_processor_id() &&
+ if (napi->poll_owner != raw_smp_processor_id() &&
spin_trylock(&napi->poll_lock)) {
budget = poll_one_napi(dev->npinfo, napi, budget);
spin_unlock(&napi->poll_lock);
@@ -214,30 +214,35 @@ static void refill_skbs(void)
static void zap_completion_queue(void)
{
- unsigned long flags;
struct softnet_data *sd = &get_cpu_var(softnet_data);
+ struct sk_buff *clist = NULL;
+ unsigned long flags;
if (sd->completion_queue) {
- struct sk_buff *clist;
local_irq_save(flags);
clist = sd->completion_queue;
sd->completion_queue = NULL;
local_irq_restore(flags);
-
- while (clist != NULL) {
- struct sk_buff *skb = clist;
- clist = clist->next;
- if (skb->destructor) {
- atomic_inc(&skb->users);
- dev_kfree_skb_any(skb); /* put this one back */
- } else {
- __kfree_skb(skb);
- }
- }
}
+
+ /*
+ * Took the list private, can drop our softnet
+ * reference:
+ */
put_cpu_var(softnet_data);
+
+ while (clist != NULL) {
+ struct sk_buff *skb = clist;
+ clist = clist->next;
+ if (skb->destructor) {
+ atomic_inc(&skb->users);
+ dev_kfree_skb_any(skb); /* put this one back */
+ } else {
+ __kfree_skb(skb);
+ }
+ }
}
static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
@@ -245,13 +250,26 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
int count = 0;
struct sk_buff *skb;
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * On -rt skb_pool.lock is schedulable, so if we are
+ * in an atomic context we just try to dequeue from the
+ * pool and fail if we cannot get one.
+ */
+ if (in_atomic() || irqs_disabled())
+ goto pick_atomic;
+#endif
zap_completion_queue();
refill_skbs();
repeat:
skb = alloc_skb(len, GFP_ATOMIC);
- if (!skb)
+ if (!skb) {
+#ifdef CONFIG_PREEMPT_RT
+pick_atomic:
+#endif
skb = skb_dequeue(&skb_pool);
+ }
if (!skb) {
if (++count < 10) {
@@ -271,7 +289,7 @@ static int netpoll_owner_active(struct net_device *dev)
struct napi_struct *napi;
list_for_each_entry(napi, &dev->napi_list, dev_list) {
- if (napi->poll_owner == smp_processor_id())
+ if (napi->poll_owner == raw_smp_processor_id())
return 1;
}
return 0;
@@ -297,7 +315,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
- local_irq_save(flags);
+ local_irq_save_nort(flags);
/* try until next clock tick */
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
tries > 0; --tries) {
@@ -319,7 +337,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
udelay(USEC_PER_POLL);
}
- local_irq_restore(flags);
+ local_irq_restore_nort(flags);
}
if (status != NETDEV_TX_OK) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9e0597d189b0..27d2eb26d0fc 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -395,7 +395,7 @@ static void skb_release_head_state(struct sk_buff *skb)
secpath_put(skb->sp);
#endif
if (skb->destructor) {
- WARN_ON(in_irq());
+// WARN_ON(in_irq());
skb->destructor(skb);
}
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
diff --git a/net/core/sock.c b/net/core/sock.c
index bbb25be7ddfe..04dbb0c80dd5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2080,8 +2080,9 @@ static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
- int cpu = smp_processor_id();
+ int cpu = get_cpu();
per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
+ put_cpu();
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
@@ -2127,7 +2128,9 @@ static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
- __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
+ int cpu = get_cpu();
+ per_cpu(prot_inuse, cpu).val[prot->inuse_idx] += val;
+ put_cpu();
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
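
Both variants above pin the task to one CPU for the duration of the per-CPU
statistics update.  A hedged, generic sketch of the get_cpu()/put_cpu()
pattern; my_stat is a hypothetical per-CPU counter:

	#include <linux/percpu.h>
	#include <linux/smp.h>

	static DEFINE_PER_CPU(int, my_stat);	/* hypothetical */

	static void my_stat_add(int val)
	{
		int cpu = get_cpu();		/* disables preemption */

		per_cpu(my_stat, cpu) += val;
		put_cpu();			/* re-enables preemption */
	}
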
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 97c410e84388..c883e293f5aa 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -201,7 +201,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
*/
static struct sock *icmp_sk(struct net *net)
{
- return net->ipv4.icmp_sk[smp_processor_id()];
+ /*
+ * Should be safe on PREEMPT_SOFTIRQS/HARDIRQS to use raw-smp-processor-id:
+ */
+ return net->ipv4.icmp_sk[raw_smp_processor_id()];
}
static inline struct sock *icmp_xmit_lock(struct net *net)
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 7505dff4ffdf..2c4b93ef1ff1 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -261,7 +261,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
xt_info_rdlock_bh();
private = table->private;
- table_base = private->entries[smp_processor_id()];
+ table_base = private->entries[raw_smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
back = get_entry(table_base, private->underflow[hook]);
@@ -709,7 +709,7 @@ static void get_counters(const struct xt_table_info *t,
{
unsigned int cpu;
unsigned int i;
- unsigned int curcpu;
+ unsigned int curcpu = NR_CPUS;
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
@@ -719,6 +719,7 @@ static void get_counters(const struct xt_table_info *t,
* if new softirq were to run and call ipt_do_table
*/
local_bh_disable();
+#ifndef CONFIG_PREEMPT_RT
curcpu = smp_processor_id();
i = 0;
@@ -727,7 +728,7 @@ static void get_counters(const struct xt_table_info *t,
set_entry_to_counter,
counters,
&i);
-
+#endif
for_each_possible_cpu(cpu) {
if (cpu == curcpu)
continue;
@@ -1183,7 +1184,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
i = 0;
/* Choose the copy that is on our node */
- curcpu = smp_processor_id();
+ curcpu = raw_smp_processor_id();
loc_cpu_entry = private->entries[curcpu];
xt_info_wrlock(curcpu);
ARPT_ENTRY_ITERATE(loc_cpu_entry,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index fdefae6b5dfc..5e6e773fe03f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -348,7 +348,7 @@ ipt_do_table(struct sk_buff *skb,
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
xt_info_rdlock_bh();
private = table->private;
- table_base = private->entries[smp_processor_id()];
+ table_base = private->entries[raw_smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
@@ -892,7 +892,7 @@ get_counters(const struct xt_table_info *t,
{
unsigned int cpu;
unsigned int i;
- unsigned int curcpu;
+ unsigned int curcpu = NR_CPUS;
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
@@ -902,6 +902,7 @@ get_counters(const struct xt_table_info *t,
* if new softirq were to run and call ipt_do_table
*/
local_bh_disable();
+#ifndef CONFIG_PREEMPT_RT
curcpu = smp_processor_id();
i = 0;
@@ -910,7 +911,7 @@ get_counters(const struct xt_table_info *t,
set_entry_to_counter,
counters,
&i);
-
+#endif
for_each_possible_cpu(cpu) {
if (cpu == curcpu)
continue;
@@ -1391,7 +1392,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
i = 0;
/* Choose the copy that is on our node */
- curcpu = smp_processor_id();
+ curcpu = raw_smp_processor_id();
loc_cpu_entry = private->entries[curcpu];
xt_info_wrlock(curcpu);
IPT_ENTRY_ITERATE(loc_cpu_entry,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 568270884c70..2cfa9cb6b3b1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -204,13 +204,13 @@ struct rt_hash_bucket {
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
- defined(CONFIG_PROVE_LOCKING)
+ defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_PREEMPT_RT)
/*
* Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
* The size of this table is a power of two and depends on the number of CPUS.
* (on lockdep we have a quite big spinlock_t, so keep the size down there)
*/
-#ifdef CONFIG_LOCKDEP
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT)
# define RT_HASH_LOCK_SZ 256
#else
# if NR_CPUS >= 32
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index ced1f2c0cb65..2683fd10a91c 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -375,7 +375,7 @@ ip6t_do_table(struct sk_buff *skb,
xt_info_rdlock_bh();
private = table->private;
- table_base = private->entries[smp_processor_id()];
+ table_base = private->entries[raw_smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
@@ -921,7 +921,7 @@ get_counters(const struct xt_table_info *t,
{
unsigned int cpu;
unsigned int i;
- unsigned int curcpu;
+ unsigned int curcpu = NR_CPUS;
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
@@ -931,6 +931,8 @@ get_counters(const struct xt_table_info *t,
* if new softirq were to run and call ipt_do_table
*/
local_bh_disable();
+
+#ifndef CONFIG_PREEMPT_RT
curcpu = smp_processor_id();
i = 0;
@@ -939,7 +941,7 @@ get_counters(const struct xt_table_info *t,
set_entry_to_counter,
counters,
&i);
-
+#endif
for_each_possible_cpu(cpu) {
if (cpu == curcpu)
continue;
@@ -960,12 +962,13 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
unsigned int countersize;
struct xt_counters *counters;
struct xt_table_info *private = table->private;
+ int node = cpu_to_node(raw_smp_processor_id());
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc_node(countersize, numa_node_id());
+ counters = vmalloc_node(countersize, node);
if (counters == NULL)
return ERR_PTR(-ENOMEM);
@@ -1423,7 +1426,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
i = 0;
/* Choose the copy that is on our node */
- curcpu = smp_processor_id();
+ curcpu = raw_smp_processor_id();
xt_info_wrlock(curcpu);
loc_cpu_entry = private->entries[curcpu];
IP6T_ENTRY_ITERATE(loc_cpu_entry,
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 5bb34737501f..9fce0a411322 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -233,7 +233,7 @@ EXPORT_SYMBOL(nf_ct_attach);
void (*nf_ct_destroy)(struct nf_conntrack *);
EXPORT_SYMBOL(nf_ct_destroy);
-void nf_conntrack_destroy(struct nf_conntrack *nfct)
+static void __nf_conntrack_destroy(struct nf_conntrack *nfct)
{
void (*destroy)(struct nf_conntrack *);
@@ -243,6 +243,28 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct)
destroy(nfct);
rcu_read_unlock();
}
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * nf_conntrack_destroy() is called with preemption disabled
+ * and will call functions that might schedule on PREEMPT_RT.
+ * For PREEMPT_RT we use an RCU callback instead to handle
+ * the destruction.
+ */
+static void nf_conntrack_destroy_rcu(struct rcu_head *rhp)
+{
+ __nf_conntrack_destroy(container_of(rhp, struct nf_conntrack, rcu));
+}
+void nf_conntrack_destroy(struct nf_conntrack *nfct)
+{
+ call_rcu(&nfct->rcu, nf_conntrack_destroy_rcu);
+}
+#else /* !PREEMPT_RT */
+void nf_conntrack_destroy(struct nf_conntrack *nfct)
+{
+ __nf_conntrack_destroy(nfct);
+}
+#endif /* PREEMPT_RT */
EXPORT_SYMBOL(nf_conntrack_destroy);
#endif /* CONFIG_NF_CONNTRACK */
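
The PREEMPT_RT branch above defers the teardown through call_rcu().  A hedged,
generic sketch of that deferral pattern; struct my_obj and its helpers are
hypothetical:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct my_obj {
		struct rcu_head rcu;
		/* ... payload ... */
	};

	static void my_obj_free_rcu(struct rcu_head *rhp)
	{
		struct my_obj *obj = container_of(rhp, struct my_obj, rcu);

		/* Runs after a grace period, from RCU callback context. */
		kfree(obj);
	}

	static void my_obj_release(struct my_obj *obj)
	{
		/* Safe from contexts that must not sleep: just queue the callback. */
		call_rcu(&obj->rcu, my_obj_free_rcu);
	}
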
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 2936fa3b6dc8..f4e94fb17477 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1061,7 +1061,7 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
return -ENOBUFS;
if (info.delivered) {
- if (info.congested && (allocation & __GFP_WAIT))
+ if (info.congested && (allocation & __GFP_WAIT) && !rt_task(current))
yield();
return 0;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 27d03816ec3e..98d22ca42161 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -12,6 +12,7 @@
*/
#include <linux/bitops.h>
+#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -24,6 +25,7 @@
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
+#include <linux/delay.h>
#include <net/pkt_sched.h>
/* Main transmission queue. */
@@ -78,7 +80,7 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,
{
int ret;
- if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
+ if (unlikely(netif_tx_lock_recursion(dev_queue))) {
/*
* Same CPU holding the lock. It may be a transient
* configuration error, when hard_start_xmit() recurses. We
@@ -95,7 +97,9 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,
* Another cpu is holding lock, requeue & delay xmits for
* some time.
*/
+ preempt_disable(); /* FIXME: we need an _rt version of this */
__get_cpu_var(netdev_rx_stat).cpu_collision++;
+ preempt_enable();
ret = dev_requeue_skb(skb, q);
}
@@ -141,7 +145,7 @@ static inline int qdisc_restart(struct Qdisc *q)
dev = qdisc_dev(q);
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
- HARD_TX_LOCK(dev, txq, smp_processor_id());
+ HARD_TX_LOCK(dev, txq);
if (!netif_tx_queue_stopped(txq) &&
!netif_tx_queue_frozen(txq))
ret = dev_hard_start_xmit(skb, dev, txq);
@@ -713,9 +717,12 @@ void dev_deactivate(struct net_device *dev)
/* Wait for outstanding qdisc-less dev_queue_xmit calls. */
synchronize_rcu();
- /* Wait for outstanding qdisc_run calls. */
+ /*
+ * Wait for outstanding qdisc_run calls.
+ * TODO: shouldn't this be wakeup-based, instead of polling?
+ */
while (some_qdisc_is_busy(dev))
- yield();
+ msleep(1);
}
static void dev_init_scheduler_queue(struct net_device *dev,
diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
index 6a12dd9f1181..b987a5fda7b4 100755
--- a/scripts/mkcompile_h
+++ b/scripts/mkcompile_h
@@ -2,7 +2,8 @@ TARGET=$1
ARCH=$2
SMP=$3
PREEMPT=$4
-CC=$5
+PREEMPT_RT=$5
+CC=$6
vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
@@ -45,6 +46,7 @@ UTS_VERSION="#$VERSION"
CONFIG_FLAGS=""
if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
+if [ -n "$PREEMPT_RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
# Truncate to maximum length