From ffebdff74ff04666b221c568daabd878f5939738 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Thu, 30 Jul 2015 16:24:43 -0700
Subject: x86/apic: Serialize LVTT and TSC_DEADLINE writes

commit 5d7c631d926b59aa16f3c56eaeb83f1036c81dc7 upstream.

The APIC LVTT register is MMIO mapped but the TSC_DEADLINE register is an
MSR. The write to the TSC_DEADLINE MSR is not serializing, so it's not
guaranteed that the write to LVTT has reached the APIC before the
TSC_DEADLINE MSR is written. In such a case the write to the MSR is
ignored and as a consequence the local timer interrupt never fires.

The SDM decribes this issue for xAPIC and x2APIC modes. The
serialization methods recommended by the SDM differ.

xAPIC:
 "1. Memory-mapped write to LVT Timer Register, setting bits 18:17 to 10b.
  2. WRMSR to the IA32_TSC_DEADLINE MSR a value much larger than current time-stamp counter.
  3. If RDMSR of the IA32_TSC_DEADLINE MSR returns zero, go to step 2.
  4. WRMSR to the IA32_TSC_DEADLINE MSR the desired deadline."

x2APIC:
 "To allow for efficient access to the APIC registers in x2APIC mode,
  the serializing semantics of WRMSR are relaxed when writing to the
  APIC registers. Thus, system software should not use 'WRMSR to APIC
  registers in x2APIC mode' as a serializing instruction. Read and write
  accesses to the APIC registers will occur in program order. A WRMSR to
  an APIC register may complete before all preceding stores are globally
  visible; software can prevent this by inserting a serializing
  instruction, an SFENCE, or an MFENCE before the WRMSR."

The xAPIC method is to just wait for the memory mapped write to hit
the LVTT by checking whether the MSR write has reached the hardware.
There is no reason why a proper MFENCE after the memory mapped write would
not do the same. Andi Kleen confirmed that MFENCE is sufficient for the
xAPIC case as well.

Issue MFENCE before writing to the TSC_DEADLINE MSR. This can be done
unconditionally as all CPUs which have TSC_DEADLINE also have MFENCE
support.

[ tglx: Massaged the changelog ]

Signed-off-by: Shaohua Li <shli@fb.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Cc: <Kernel-team@fb.com>
Cc: <lenb@kernel.org>
Cc: <fenghua.yu@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20150909041352.GA2059853@devbig257.prn2.facebook.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/kernel/apic/apic.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 033eb44dc661..9620d18cb638 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -350,6 +350,13 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 	apic_write(APIC_LVTT, lvtt_value);
 
 	if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
+		/*
+		 * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode,
+		 * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized.
+		 * According to Intel, MFENCE can do the serialization here.
+		 */
+		asm volatile("mfence" : : : "memory");
+
 		printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
 		return;
 	}
-- 
cgit v1.2.3


From 9870892fc277debf23fd5f6bef2fef5ae77b97fb Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 16 Sep 2015 14:10:03 +0100
Subject: x86/platform: Fix Geode LX timekeeping in the generic x86 build

commit 03da3ff1cfcd7774c8780d2547ba0d995f7dc03d upstream.

In 2007, commit 07190a08eef36 ("Mark TSC on GeodeLX reliable")
bypassed verification of the TSC on Geode LX. However, this code
(now in the check_system_tsc_reliable() function in
arch/x86/kernel/tsc.c) was only present if CONFIG_MGEODE_LX was
set.

OpenWRT has recently started building its generic Geode target
for Geode GX, not LX, to include support for additional
platforms. This broke the timekeeping on LX-based devices,
because the TSC wasn't marked as reliable:
https://dev.openwrt.org/ticket/20531

By adding a runtime check on is_geode_lx(), we can also include
the fix if CONFIG_MGEODEGX1 or CONFIG_X86_GENERIC are set, thus
fixing the problem.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: Andres Salomon <dilinger@queued.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1442409003.131189.87.camel@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/kernel/tsc.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 27e3a14fc917..9714a7aa32fc 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -20,6 +20,7 @@
 #include <asm/hypervisor.h>
 #include <asm/nmi.h>
 #include <asm/x86_init.h>
+#include <asm/geode.h>
 
 unsigned int __read_mostly cpu_khz;	/* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
@@ -806,15 +807,17 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 
 static void __init check_system_tsc_reliable(void)
 {
-#ifdef CONFIG_MGEODE_LX
-	/* RTSC counts during suspend */
+#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
+	if (is_geode_lx()) {
+		/* RTSC counts during suspend */
 #define RTSC_SUSP 0x100
-	unsigned long res_low, res_high;
+		unsigned long res_low, res_high;
 
-	rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
-	/* Geode_LX - the OLPC CPU has a very reliable TSC */
-	if (res_low & RTSC_SUSP)
-		tsc_clocksource_reliable = 1;
+		rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+		/* Geode_LX - the OLPC CPU has a very reliable TSC */
+		if (res_low & RTSC_SUSP)
+			tsc_clocksource_reliable = 1;
+	}
 #endif
 	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
 		tsc_clocksource_reliable = 1;
-- 
cgit v1.2.3


From fb7eff9cfdffc4cb2f3d75024bda6c2a56ab12e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20M=C3=BCller?= <dmueller@suse.com>
Date: Thu, 1 Oct 2015 13:43:42 +0200
Subject: Use WARN_ON_ONCE for missing X86_FEATURE_NRIPS

commit d2922422c48df93f3edff7d872ee4f3191fefb08 upstream.

The cpu feature flags are not ever going to change, so warning
everytime can cause a lot of kernel log spam
(in our case more than 10GB/hour).

The warning seems to only occur when nested virtualization is
enabled, so it's probably triggered by a KVM bug.  This is a
sensible and safe change anyway, and the KVM bug fix might not
be suitable for stable releases anyway.

Signed-off-by: Dirk Mueller <dmueller@suse.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 224d2ef754cc..3deddd796f76 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -496,7 +496,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	if (svm->vmcb->control.next_rip != 0) {
-		WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS));
+		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
 		svm->next_rip = svm->vmcb->control.next_rip;
 	}
 
-- 
cgit v1.2.3


From 4f9d53595caf9c801c8b4389bc0c64e9c16488f0 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Thu, 1 Oct 2015 09:04:22 -0400
Subject: x86/mm: Set NX on gap between __ex_table and rodata

commit ab76f7b4ab2397ffdd2f1eb07c55697d19991d10 upstream.

Unused space between the end of __ex_table and the start of
rodata can be left W+x in the kernel page tables.  Extend the
setting of the NX bit to cover this gap by starting from
text_end rather than rodata_start.

  Before:
  ---[ High Kernel Mapping ]---
  0xffffffff80000000-0xffffffff81000000          16M                               pmd
  0xffffffff81000000-0xffffffff81600000           6M     ro         PSE     GLB x  pmd
  0xffffffff81600000-0xffffffff81754000        1360K     ro                 GLB x  pte
  0xffffffff81754000-0xffffffff81800000         688K     RW                 GLB x  pte
  0xffffffff81800000-0xffffffff81a00000           2M     ro         PSE     GLB NX pmd
  0xffffffff81a00000-0xffffffff81b3b000        1260K     ro                 GLB NX pte
  0xffffffff81b3b000-0xffffffff82000000        4884K     RW                 GLB NX pte
  0xffffffff82000000-0xffffffff82200000           2M     RW         PSE     GLB NX pmd
  0xffffffff82200000-0xffffffffa0000000         478M                               pmd

  After:
  ---[ High Kernel Mapping ]---
  0xffffffff80000000-0xffffffff81000000          16M                               pmd
  0xffffffff81000000-0xffffffff81600000           6M     ro         PSE     GLB x  pmd
  0xffffffff81600000-0xffffffff81754000        1360K     ro                 GLB x  pte
  0xffffffff81754000-0xffffffff81800000         688K     RW                 GLB NX pte
  0xffffffff81800000-0xffffffff81a00000           2M     ro         PSE     GLB NX pmd
  0xffffffff81a00000-0xffffffff81b3b000        1260K     ro                 GLB NX pte
  0xffffffff81b3b000-0xffffffff82000000        4884K     RW                 GLB NX pte
  0xffffffff82000000-0xffffffff82200000           2M     RW         PSE     GLB NX pmd
  0xffffffff82200000-0xffffffffa0000000         478M                               pmd

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1443704662-3138-1-git-send-email-sds@tycho.nsa.gov
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/init_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 2db3f30bed75..b04e50262088 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1163,7 +1163,7 @@ void mark_rodata_ro(void)
 	 * has been zapped already via cleanup_highmem().
 	 */
 	all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
-	set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
+	set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
 
 	rodata_test();
 
-- 
cgit v1.2.3


From 919845cb331e066a042a8060e84bb9dc6ba390bf Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Fri, 25 Sep 2015 11:59:52 +0200
Subject: x86/xen: Support kexec/kdump in HVM guests by doing a soft reset

commit 0b34a166f291d255755be46e43ed5497cdd194f2 upstream.

Currently there is a number of issues preventing PVHVM Xen guests from
doing successful kexec/kdump:

  - Bound event channels.
  - Registered vcpu_info.
  - PIRQ/emuirq mappings.
  - shared_info frame after XENMAPSPACE_shared_info operation.
  - Active grant mappings.

Basically, newly booted kernel stumbles upon already set up Xen
interfaces and there is no way to reestablish them. In Xen-4.7 a new
feature called 'soft reset' is coming. A guest performing kexec/kdump
operation is supposed to call SCHEDOP_shutdown hypercall with
SHUTDOWN_soft_reset reason before jumping to new kernel. Hypervisor
(with some help from toolstack) will do full domain cleanup (but
keeping its memory and vCPU contexts intact) returning the guest to
the state it had when it was first booted and thus allowing it to
start over.

Doing SHUTDOWN_soft_reset on Xen hypervisors which don't support it is
probably OK as by default all unknown shutdown reasons cause domain
destroy with a message in toolstack log: 'Unknown shutdown reason code
5. Destroying domain.'  which gives a clue to what the problem is and
eliminates false expectations.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/xen/enlighten.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 13d926282c89..511630db00a8 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,10 @@
 #include <linux/memblock.h>
 #include <linux/edd.h>
 
+#ifdef CONFIG_KEXEC_CORE
+#include <linux/kexec.h>
+#endif
+
 #include <xen/xen.h>
 #include <xen/events.h>
 #include <xen/interface/xen.h>
@@ -1744,6 +1748,21 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
 	.notifier_call	= xen_hvm_cpu_notify,
 };
 
+#ifdef CONFIG_KEXEC_CORE
+static void xen_hvm_shutdown(void)
+{
+	native_machine_shutdown();
+	if (kexec_in_progress)
+		xen_reboot(SHUTDOWN_soft_reset);
+}
+
+static void xen_hvm_crash_shutdown(struct pt_regs *regs)
+{
+	native_machine_crash_shutdown(regs);
+	xen_reboot(SHUTDOWN_soft_reset);
+}
+#endif
+
 static void __init xen_hvm_guest_init(void)
 {
 	init_hvm_pv_info();
@@ -1758,6 +1777,10 @@ static void __init xen_hvm_guest_init(void)
 	x86_init.irqs.intr_init = xen_init_IRQ;
 	xen_hvm_init_time_ops();
 	xen_hvm_init_mmu_ops();
+#ifdef CONFIG_KEXEC_CORE
+	machine_ops.shutdown = xen_hvm_shutdown;
+	machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
+#endif
 }
 
 static bool __init xen_hvm_platform(void)
-- 
cgit v1.2.3


From f7c7bb9f06eb2af9c03b29e019d45ad4e943f900 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 16 Aug 2013 14:17:19 -0700
Subject: x86: Add 1/2/4/8 byte optimization to 64bit
 __copy_{from,to}_user_inatomic

commit ff47ab4ff3cddfa7bc1b25b990e24abe2ae474ff upstream.

The 64bit __copy_{from,to}_user_inatomic always called
copy_from_user_generic, but skipped the special optimizations for 1/2/4/8
byte accesses.

This especially hurts the futex call, which accesses the 4 byte futex
user value with a complicated fast string operation in a function call,
instead of a single movl.

Use __copy_{from,to}_user for _inatomic instead to get the same
optimizations. The only problem was the might_fault() in those functions.
So move that to new wrapper and call __copy_{f,t}_user_nocheck()
from *_inatomic directly.

32bit already did this correctly by duplicating the code.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Link: http://lkml.kernel.org/r/1376687844-19857-2-git-send-email-andi@firstfloor.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/uaccess_64.h | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 142810c457dc..34df5c22df90 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -77,11 +77,10 @@ int copy_to_user(void __user *dst, const void *src, unsigned size)
 }
 
 static __always_inline __must_check
-int __copy_from_user(void *dst, const void __user *src, unsigned size)
+int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
 {
 	int ret = 0;
 
-	might_fault();
 	if (!__builtin_constant_p(size))
 		return copy_user_generic(dst, (__force void *)src, size);
 	switch (size) {
@@ -121,11 +120,17 @@ int __copy_from_user(void *dst, const void __user *src, unsigned size)
 }
 
 static __always_inline __must_check
-int __copy_to_user(void __user *dst, const void *src, unsigned size)
+int __copy_from_user(void *dst, const void __user *src, unsigned size)
+{
+	might_fault();
+	return __copy_from_user_nocheck(dst, src, size);
+}
+
+static __always_inline __must_check
+int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
 {
 	int ret = 0;
 
-	might_fault();
 	if (!__builtin_constant_p(size))
 		return copy_user_generic((__force void *)dst, src, size);
 	switch (size) {
@@ -164,6 +169,13 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
 	}
 }
 
+static __always_inline __must_check
+int __copy_to_user(void __user *dst, const void *src, unsigned size)
+{
+	might_fault();
+	return __copy_to_user_nocheck(dst, src, size);
+}
+
 static __always_inline __must_check
 int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 {
@@ -220,13 +232,13 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 static __must_check __always_inline int
 __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
 {
-	return copy_user_generic(dst, (__force const void *)src, size);
+	return __copy_from_user_nocheck(dst, (__force const void *)src, size);
 }
 
 static __must_check __always_inline int
 __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
 {
-	return copy_user_generic((__force void *)dst, src, size);
+	return __copy_to_user_nocheck((__force void *)dst, src, size);
 }
 
 extern long __copy_user_nocache(void *dst, const void __user *src,
-- 
cgit v1.2.3