From 1adbb5db21a1e08537061607abc9b0a9e7e12848 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o
Date: Thu, 24 Jan 2013 23:24:56 -0500
Subject: quota: autoload the quota_v2 module for QFMT_VFS_V1 quota format

commit c3ad83d9efdfe6a86efd44945a781f00c879b7b4 upstream.

Otherwise, ext4 file systems with the quota feature enabled will get a
very confusing "No such process" error message if the quota code is
built as a module and the quota_v2 module has not been loaded.

Signed-off-by: "Theodore Ts'o"
Reviewed-by: Carlos Maiolino
Acked-by: Jan Kara
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/quota.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/quota.h b/include/linux/quota.h
index 9a85412e0db6..a6dd99587762 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -413,6 +413,7 @@ struct quota_module_name {
 #define INIT_QUOTA_MODULE_NAMES {\
 	{QFMT_VFS_OLD, "quota_v1"},\
 	{QFMT_VFS_V0, "quota_v2"},\
+	{QFMT_VFS_V1, "quota_v2"},\
 	{0, NULL}}
 #else
--
cgit v1.2.3

From ffbf1423cc7e8ed018d0ba8fbe3f9f3bb816fa4c Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 6 Feb 2013 12:55:23 +0100
Subject: iommu/amd: Initialize device table after dma_ops

commit f528d980c17b8714aedc918ba86e058af914d66b upstream.

When dma_ops are initialized the unity mappings are created. The
init_device_table_dma() function makes sure DMA from all devices is
blocked by default. This opens a short window in time where DMA to
unity mapped regions is blocked by the IOMMU.

Make sure this does not happen by initializing the device table after
dma_ops.

Signed-off-by: Joerg Roedel
Signed-off-by: Shuah Khan
Signed-off-by: Greg Kroah-Hartman
---
 arch/x86/kernel/amd_iommu_init.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 33df6e82f653..d86aa3f15041 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1363,6 +1363,7 @@ static struct syscore_ops amd_iommu_syscore_ops = {
  */
 static int __init amd_iommu_init(void)
 {
+	struct amd_iommu *iommu;
 	int i, ret = 0;

 	/*
@@ -1411,9 +1412,6 @@ static int __init amd_iommu_init(void)
 	if (amd_iommu_pd_alloc_bitmap == NULL)
 		goto free;

-	/* init the device table */
-	init_device_table();
-
 	/*
 	 * let all alias entries point to itself
 	 */
@@ -1463,6 +1461,12 @@ static int __init amd_iommu_init(void)
 	if (ret)
 		goto free_disable;

+	/* init the device table */
+	init_device_table();
+
+	for_each_iommu(iommu)
+		iommu_flush_all_caches(iommu);
+
 	amd_iommu_init_api();

 	amd_iommu_init_notifier();
--
cgit v1.2.3

From ffe56d754e59cdb086b76b95fe0c3a0302ec46c8 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 20 Feb 2013 15:24:12 -0800
Subject: posix-timer: Don't call idr_find() with out-of-range ID

commit e182bb38d7db7494fa5dcd82da17fe0dedf60ecf upstream.

When idr_find() was fed a negative ID, it used to look up the ID
ignoring the sign bit before the recent ("idr: remove MAX_IDR_MASK and
move left MAX_IDR_* into idr.c") patch. Now a negative ID triggers a
WARN_ON_ONCE().

__lock_timer() feeds timer_id from userland directly to idr_find()
without sanitizing it, which can trigger the above malfunctions.

Add a range check on @timer_id before invoking idr_find() in
__lock_timer(). While timer_t is defined as int by all archs at the
moment, Andrew worries that it may be defined as a larger type later
on. Make the test cover larger integers too so that it at least is
guaranteed to not return the wrong timer.
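
As a rough user-space sketch of why the single unsigned comparison in
the new check rejects both negative IDs and IDs above INT_MAX even for
a wider timer_t (an editorial illustration, not part of the patch; the
timer_t_demo type and the sample values are assumptions made for the
demo):

  /*
   * Illustrative only: casting a possibly-wider, possibly-negative
   * timer id to unsigned long long and comparing against INT_MAX
   * rejects both negative ids and ids above INT_MAX, which is the
   * property the __lock_timer() check relies on.
   */
  #include <limits.h>
  #include <stdio.h>

  typedef long long timer_t_demo;   /* stand-in for a wider timer_t */

  static int id_is_valid(timer_t_demo timer_id)
  {
          return (unsigned long long)timer_id <= INT_MAX;
  }

  int main(void)
  {
          timer_t_demo ids[] = { 0, 42, INT_MAX, -1, (timer_t_demo)INT_MAX + 1 };
          unsigned i;

          for (i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
                  printf("%lld -> %s\n", (long long)ids[i],
                         id_is_valid(ids[i]) ? "lookup allowed" : "rejected");
          return 0;
  }
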
Note that WARN_ON_ONCE() in idr_find() on id < 0 is transitional precaution while moving away from ignoring MSB. Once it's gone we can remove the guard as long as timer_t isn't larger than int. Signed-off-by: Tejun Heo Reported-by: Sasha Levin Cc: Andrew Morton Link: http://lkml.kernel.org/r/20130220232412.GL3570@htj.dyndns.org Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/posix-timers.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4556182527f3..d2da8ad45b30 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -639,6 +639,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; + /* + * timer_t could be any type >= int and we want to make sure any + * @timer_id outside positive int range fails lookup. + */ + if ((unsigned long long)timer_id > INT_MAX) + return NULL; + rcu_read_lock(); timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { -- cgit v1.2.3 From 85fed56cdbc7973f34524ee9efa885ef66e4d831 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 Feb 2013 15:18:38 -0500 Subject: ftrace: Call ftrace cleanup module notifier after all other notifiers commit 8c189ea64eea01ca20d102ddb74d6936dd16c579 upstream. Commit: c1bf08ac "ftrace: Be first to run code modification on modules" changed ftrace module notifier's priority to INT_MAX in order to process the ftrace nops before anything else could touch them (namely kprobes). This was the correct thing to do. Unfortunately, the ftrace module notifier also contains the ftrace clean up code. As opposed to the set up code, this code should be run *after* all the module notifiers have run in case a module is doing correct clean-up and unregisters its ftrace hooks. Basically, ftrace needs to do clean up on module removal, as it needs to know about code being removed so that it doesn't try to modify that code. But after it removes the module from its records, if a ftrace user tries to remove a probe, that removal will fail due as the record of that code segment no longer exists. Nothing really bad happens if the probe removal is called after ftrace did the clean up, but the ftrace removal function will return an error. Correct code (such as kprobes) will produce a WARN_ON() if it fails to remove the probe. As people get annoyed by frivolous warnings, it's best to do the ftrace clean up after everything else. By splitting the ftrace_module_notifier into two notifiers, one that does the module load setup that is run at high priority, and the other that is called for module clean up that is run at low priority, the problem is solved. Reported-by: Frank Ch. 
Eigler Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/ftrace.c | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e96eee3a8606..86fd4170d248 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3432,37 +3432,51 @@ static void ftrace_init_module(struct module *mod, ftrace_process_locs(mod, start, end); } -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify_enter(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; - switch (val) { - case MODULE_STATE_COMING: + if (val == MODULE_STATE_COMING) ftrace_init_module(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); - break; - case MODULE_STATE_GOING: + return 0; +} + +static int ftrace_module_notify_exit(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + if (val == MODULE_STATE_GOING) ftrace_release_mod(mod); - break; - } return 0; } #else -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify_enter(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +static int ftrace_module_notify_exit(struct notifier_block *self, + unsigned long val, void *data) { return 0; } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { - .notifier_call = ftrace_module_notify, +struct notifier_block ftrace_module_enter_nb = { + .notifier_call = ftrace_module_notify_enter, .priority = INT_MAX, /* Run before anything that can use kprobes */ }; +struct notifier_block ftrace_module_exit_nb = { + .notifier_call = ftrace_module_notify_exit, + .priority = INT_MIN, /* Run after anything that can remove kprobes */ +}; + extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; @@ -3494,9 +3508,13 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_nb); + ret = register_module_notifier(&ftrace_module_enter_nb); + if (ret) + pr_warning("Failed to register trace ftrace module enter notifier\n"); + + ret = register_module_notifier(&ftrace_module_exit_nb); if (ret) - pr_warning("Failed to register trace ftrace module notifier\n"); + pr_warning("Failed to register trace ftrace module exit notifier\n"); set_ftrace_early_filters(); -- cgit v1.2.3 From 9b1d040fee4c82b3c69283cbb8a4d2a1dc062a21 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 25 Feb 2013 15:54:08 -0500 Subject: doc, xen: Mention 'earlyprintk=xen' in the documentation. commit 2482a92e7d17187301d7313cfe5021b13393a0b4 upstream. The earlyprintk for Xen PV guests utilizes a simple hypercall (console_io) to provide output to Xen emergency console. Note that the Xen hypervisor should be booted with 'loglevel=all' to output said information. Reported-by: H. Peter Anvin Signed-off-by: Konrad Rzeszutek Wilk Link: http://lkml.kernel.org/r/1361825650-14031-2-git-send-email-konrad.wilk@oracle.com Signed-off-by: H. 
Peter Anvin
Signed-off-by: Greg Kroah-Hartman
---
 Documentation/kernel-parameters.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 397ee05132a3..019569d24ce0 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -679,6 +679,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.

 	earlyprintk=	[X86,SH,BLACKFIN]
 			earlyprintk=vga
+			earlyprintk=xen
 			earlyprintk=serial[,ttySn[,baudrate]]
 			earlyprintk=ttySn[,baudrate]
 			earlyprintk=dbgp[debugController#]
@@ -696,6 +697,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			The VGA output is eventually overwritten by the real
 			console.

+			The xen output can only be used by Xen PV guests.
+
 	ekgdboc=	[X86,KGDB] Allow early kernel console debugging
 			ekgdboc=kbd

--
cgit v1.2.3

From 834f139f1613ba0ca3239a6a12be97c70698f189 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Mon, 25 Feb 2013 15:54:09 -0500
Subject: doc, kernel-parameters: Document 'console=hvc'

commit a2fd6419174470f5ae6383f5037d0ee21ed9833f upstream.

Both the PowerPC hypervisor and Xen hypervisor can utilize the hvc
driver.

Signed-off-by: Konrad Rzeszutek Wilk
Link: http://lkml.kernel.org/r/1361825650-14031-3-git-send-email-konrad.wilk@oracle.com
Signed-off-by: H. Peter Anvin
Signed-off-by: Greg Kroah-Hartman
---
 Documentation/kernel-parameters.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 019569d24ce0..389923426cae 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -531,6 +531,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			UART at the specified I/O port or MMIO address,
 			switching to the matching ttyS device later.  The
 			options are the same as for ttyS, above.
+		hvc<n>	Use the hypervisor console device <n>. This is for
+			both Xen and PowerPC hypervisors.

 		If the device connected to the port is not a TTY but a braille
 		device, prepend "brl," before the device type, for instance
--
cgit v1.2.3

From 93b6b299f7a97a8ce2f8ab7b14195f8d2d131905 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin"
Date: Wed, 27 Feb 2013 12:46:40 -0800
Subject: x86: Make sure we can boot in the case the BDA contains pure garbage

commit 7c10093692ed2e6f318387d96b829320aa0ca64c upstream.

On non-BIOS platforms it is possible that the BIOS data area contains
garbage instead of being zeroed or something equivalent (firmware
people: we are talking of 1.5K here, so please do the sane thing.)

We need on the order of 20-30K of low memory in order to boot, which
may grow up to < 64K in the future. We probably want to avoid the
lowest of the low memory. At the same time, it seems extremely
unlikely that a legitimate EBDA would ever reach down to the 128K mark
(which would require it to be over half a megabyte in size.) Thus,
pick 128K as the cutoff for "this is insane, ignore."

We may still end up reserving a bunch of extra memory in the low
megabyte, but that is not really a major issue these days. In the
worst case we lose 512K of RAM.

This code really should be merged with trim_bios_range() in
arch/x86/kernel/setup.c, but that is a bigger patch for a later merge
window.

Reported-by: Darren Hart
Signed-off-by: H.
Peter Anvin Cc: Matt Fleming Link: http://lkml.kernel.org/n/tip-oebml055yyfm8yxmria09rja@git.kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/head.c | 53 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index af0699ba48cf..f6c467484eb4 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -5,8 +5,6 @@ #include #include -#define BIOS_LOWMEM_KILOBYTES 0x413 - /* * The BIOS places the EBDA/XBDA at the top of conventional * memory, and usually decreases the reported amount of @@ -16,17 +14,30 @@ * chipset: reserve a page before VGA to prevent PCI prefetch * into it (errata #56). Usually the page is reserved anyways, * unless you have no PS/2 mouse plugged in. + * + * This functions is deliberately very conservative. Losing + * memory in the bottom megabyte is rarely a problem, as long + * as we have enough memory to install the trampoline. Using + * memory that is in use by the BIOS or by some DMA device + * the BIOS didn't shut down *is* a big problem. */ + +#define BIOS_LOWMEM_KILOBYTES 0x413 +#define LOWMEM_CAP 0x9f000U /* Absolute maximum */ +#define INSANE_CUTOFF 0x20000U /* Less than this = insane */ + void __init reserve_ebda_region(void) { unsigned int lowmem, ebda_addr; - /* To determine the position of the EBDA and the */ - /* end of conventional memory, we need to look at */ - /* the BIOS data area. In a paravirtual environment */ - /* that area is absent. We'll just have to assume */ - /* that the paravirt case can handle memory setup */ - /* correctly, without our help. */ + /* + * To determine the position of the EBDA and the + * end of conventional memory, we need to look at + * the BIOS data area. In a paravirtual environment + * that area is absent. We'll just have to assume + * that the paravirt case can handle memory setup + * correctly, without our help. + */ if (paravirt_enabled()) return; @@ -37,19 +48,23 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); - /* Fixup: bios puts an EBDA in the top 64K segment */ - /* of conventional memory, but does not adjust lowmem. */ - if ((lowmem - ebda_addr) <= 0x10000) - lowmem = ebda_addr; + /* + * Note: some old Dells seem to need 4k EBDA without + * reporting so, so just consider the memory above 0x9f000 + * to be off limits (bugzilla 2990). + */ + + /* If the EBDA address is below 128K, assume it is bogus */ + if (ebda_addr < INSANE_CUTOFF) + ebda_addr = LOWMEM_CAP; - /* Fixup: bios does not report an EBDA at all. */ - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ - if ((ebda_addr == 0) && (lowmem >= 0x9f000)) - lowmem = 0x9f000; + /* If lowmem is less than 128K, assume it is bogus */ + if (lowmem < INSANE_CUTOFF) + lowmem = LOWMEM_CAP; - /* Paranoia: should never happen, but... */ - if ((lowmem == 0) || (lowmem >= 0x100000)) - lowmem = 0x9f000; + /* Use the lower of the lowmem and EBDA markers as the cutoff */ + lowmem = min(lowmem, ebda_addr); + lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */ /* reserve all memory between lowmem and the 1MB mark */ memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved"); -- cgit v1.2.3 From 954497ea2037e8c42fe1c7a0882030b62ac9116c Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Mon, 18 Feb 2013 18:31:37 -0800 Subject: target: Add missing mapped_lun bounds checking during make_mappedlun setup commit fbbf8555a986ed31e54f006b6cc637ea4ff1425b upstream. 
This patch adds missing bounds checking for the configfs provided mapped_lun value during target_fabric_make_mappedlun() setup ahead of se_lun_acl initialization. This addresses a potential OOPs when using a mapped_lun value that exceeds the hardcoded TRANSPORT_MAX_LUNS_PER_TPG-1 value within se_node_acl->device_list[]. Reported-by: Jan Engelhardt Cc: Jan Engelhardt Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_fabric_configfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c index 07ab5a3bb8e8..6246f28eac41 100644 --- a/drivers/target/target_core_fabric_configfs.c +++ b/drivers/target/target_core_fabric_configfs.c @@ -355,6 +355,14 @@ static struct config_group *target_fabric_make_mappedlun( ret = -EINVAL; goto out; } + if (mapped_lun > (TRANSPORT_MAX_LUNS_PER_TPG-1)) { + pr_err("Mapped LUN: %lu exceeds TRANSPORT_MAX_LUNS_PER_TPG" + "-1: %u for Target Portal Group: %u\n", mapped_lun, + TRANSPORT_MAX_LUNS_PER_TPG-1, + se_tpg->se_tpg_tfo->tpg_get_tag(se_tpg)); + ret = -EINVAL; + goto out; + } lacl = core_dev_init_initiator_node_lun_acl(se_tpg, mapped_lun, config_item_name(acl_ci), &ret); -- cgit v1.2.3 From ad315127433dc9a8147e7f6b43f21b3d0061003a Mon Sep 17 00:00:00 2001 From: "Xiaowei.Hu" Date: Wed, 27 Feb 2013 17:02:49 -0800 Subject: ocfs2: ac->ac_allow_chain_relink=0 won't disable group relink commit 309a85b6861fedbb48a22d45e0e079d1be993b3a upstream. ocfs2_block_group_alloc_discontig() disables chain relink by setting ac->ac_allow_chain_relink = 0 because it grabs clusters from multiple cluster groups. It doesn't keep the credits for all chain relink,but ocfs2_claim_suballoc_bits overrides this in this call trace: ocfs2_block_group_claim_bits()->ocfs2_claim_clusters()-> __ocfs2_claim_clusters()->ocfs2_claim_suballoc_bits() ocfs2_claim_suballoc_bits set ac->ac_allow_chain_relink = 1; then call ocfs2_search_chain() one time and disable it again, and then we run out of credits. Fix is to allow relink by default and disable it in ocfs2_block_group_alloc_discontig. Without this patch, End-users will run into a crash due to run out of credits, backtrace like this: RIP: 0010:[] [] jbd2_journal_dirty_metadata+0x164/0x170 [jbd2] RSP: 0018:ffff8801b919b5b8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88022139ddc0 RCX: ffff880159f652d0 RDX: ffff880178aa3000 RSI: ffff880159f652d0 RDI: ffff880087f09bf8 RBP: ffff8801b919b5e8 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000001e00 R11: 00000000000150b0 R12: ffff880159f652d0 R13: ffff8801a0cae908 R14: ffff880087f09bf8 R15: ffff88018d177800 FS: 00007fc9b0b6b6e0(0000) GS:ffff88022fd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 000000000040819c CR3: 0000000184017000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process dd (pid: 9945, threadinfo ffff8801b919a000, task ffff880149a264c0) Call Trace: ocfs2_journal_dirty+0x2f/0x70 [ocfs2] ocfs2_relink_block_group+0x111/0x480 [ocfs2] ocfs2_search_chain+0x455/0x9a0 [ocfs2] ... 
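
The design choice behind the fix can be sketched in a few lines of
stand-alone C (editorial illustration only; the struct below is a
stand-in for ocfs2_alloc_context, and it assumes the context starts
out zeroed, which is what makes the inverted "disable" flag give the
desired "relink allowed by default" behaviour):

  /*
   * With a "disable" flag instead of an "allow" flag, a freshly
   * zeroed allocation context permits chain relinking, and only the
   * discontig path (and the per-chain fallback loop) opts out.
   */
  #include <stdio.h>
  #include <string.h>

  struct alloc_ctx_demo {
          int ac_disable_chain_relink;  /* 0 by default => relink allowed */
  };

  static int may_relink(const struct alloc_ctx_demo *ac)
  {
          return !ac->ac_disable_chain_relink;
  }

  int main(void)
  {
          struct alloc_ctx_demo ac;

          memset(&ac, 0, sizeof(ac));   /* how a fresh context starts out */
          printf("default path:   relink %s\n",
                 may_relink(&ac) ? "allowed" : "disabled");

          ac.ac_disable_chain_relink = 1;  /* discontig alloc opts out */
          printf("discontig path: relink %s\n",
                 may_relink(&ac) ? "allowed" : "disabled");
          return 0;
  }
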
Signed-off-by: Xiaowei.Hu Reviewed-by: Srinivas Eeda Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/suballoc.c | 7 +++---- fs/ocfs2/suballoc.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index f169da4624fd..b7e74b580c0f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -642,7 +642,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle, * cluster groups will be staying in cache for the duration of * this operation. */ - ac->ac_allow_chain_relink = 0; + ac->ac_disable_chain_relink = 1; /* Claim the first region */ status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, @@ -1823,7 +1823,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, * Do this *after* figuring out how many bits we're taking out * of our target group. */ - if (ac->ac_allow_chain_relink && + if (!ac->ac_disable_chain_relink && (prev_group_bh) && (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) { status = ocfs2_relink_block_group(handle, alloc_inode, @@ -1928,7 +1928,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, victim = ocfs2_find_victim_chain(cl); ac->ac_chain = victim; - ac->ac_allow_chain_relink = 1; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); @@ -1947,7 +1946,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, * searching each chain in order. Don't allow chain relinking * because we only calculate enough journal credits for one * relink per alloc. */ - ac->ac_allow_chain_relink = 0; + ac->ac_disable_chain_relink = 1; for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { if (i == victim) continue; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index b8afabfeede4..a36d0aa50911 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -49,7 +49,7 @@ struct ocfs2_alloc_context { /* these are used by the chain search */ u16 ac_chain; - int ac_allow_chain_relink; + int ac_disable_chain_relink; group_search_t *ac_group_search; u64 ac_last_group; -- cgit v1.2.3 From 975398374dac831d5d5c2825615c489aee24ea3a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 20 Dec 2012 10:31:11 +0000 Subject: xen-blkback: do not leak mode property commit 9d092603cc306ee6edfe917bf9ab8beb5f32d7bc upstream. "be->mode" is obtained from xenbus_read(), which does a kmalloc() for the message body. The short string is never released, so do it along with freeing "be" itself, and make sure the string isn't kept when backend_changed() doesn't complete successfully (which made it desirable to slightly re-structure that function, so that the error cleanup can be done in one place). 
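
The ownership rule the fix enforces can be sketched in a small
user-space analogue (editorial illustration only; strdup() stands in
for xenbus_read() and the struct and function names are invented for
the demo):

  /*
   * The string returned by xenbus_read() is heap-allocated, so whoever
   * stores it in the backend must free it on every exit path: both on
   * teardown and when backend setup fails after the string was read.
   */
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  struct backend_info_demo {
          char *mode;               /* owned by this struct once assigned */
  };

  static int backend_changed_demo(struct backend_info_demo *be, int fail_later)
  {
          be->mode = strdup("w");   /* analogue of xenbus_read(..., "mode") */
          if (!be->mode)
                  return -1;

          if (fail_later) {         /* analogue of xen_vbd_create() failing */
                  free(be->mode);   /* don't keep a stale, leaked string */
                  be->mode = NULL;
                  return -1;
          }
          return 0;
  }

  static void backend_remove_demo(struct backend_info_demo *be)
  {
          free(be->mode);           /* analogue of the added kfree(be->mode) */
          be->mode = NULL;
  }

  int main(void)
  {
          struct backend_info_demo be = { 0 };

          if (backend_changed_demo(&be, 0) == 0)
                  printf("mode = %s\n", be.mode);
          backend_remove_demo(&be);
          return 0;
  }
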
Reported-by: Olaf Hering Signed-off-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Greg Kroah-Hartman --- drivers/block/xen-blkback/xenbus.c | 49 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 6cc0db1bf522..97ded2552d9c 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -400,6 +400,7 @@ static int xen_blkbk_remove(struct xenbus_device *dev) be->blkif = NULL; } + kfree(be->mode); kfree(be); dev_set_drvdata(&dev->dev, NULL); return 0; @@ -482,6 +483,7 @@ static void backend_changed(struct xenbus_watch *watch, = container_of(watch, struct backend_info, backend_watch); struct xenbus_device *dev = be->dev; int cdrom = 0; + unsigned long handle; char *device_type; DPRINTK(""); @@ -501,10 +503,10 @@ static void backend_changed(struct xenbus_watch *watch, return; } - if ((be->major || be->minor) && - ((be->major != major) || (be->minor != minor))) { - pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n", - be->major, be->minor, major, minor); + if (be->major | be->minor) { + if (be->major != major || be->minor != minor) + pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n", + be->major, be->minor, major, minor); return; } @@ -522,36 +524,33 @@ static void backend_changed(struct xenbus_watch *watch, kfree(device_type); } - if (be->major == 0 && be->minor == 0) { - /* Front end dir is a number, which is used as the handle. */ - - char *p = strrchr(dev->otherend, '/') + 1; - long handle; - err = strict_strtoul(p, 0, &handle); - if (err) - return; + /* Front end dir is a number, which is used as the handle. */ + err = strict_strtoul(strrchr(dev->otherend, '/') + 1, 0, &handle); + if (err) + return; - be->major = major; - be->minor = minor; + be->major = major; + be->minor = minor; - err = xen_vbd_create(be->blkif, handle, major, minor, - (NULL == strchr(be->mode, 'w')), cdrom); - if (err) { - be->major = 0; - be->minor = 0; - xenbus_dev_fatal(dev, err, "creating vbd structure"); - return; - } + err = xen_vbd_create(be->blkif, handle, major, minor, + !strchr(be->mode, 'w'), cdrom); + if (err) + xenbus_dev_fatal(dev, err, "creating vbd structure"); + else { err = xenvbd_sysfs_addif(dev); if (err) { xen_vbd_free(&be->blkif->vbd); - be->major = 0; - be->minor = 0; xenbus_dev_fatal(dev, err, "creating sysfs entries"); - return; } + } + if (err) { + kfree(be->mode); + be->mode = NULL; + be->major = 0; + be->minor = 0; + } else { /* We're potentially connected now */ xen_update_blkif_status(be->blkif); } -- cgit v1.2.3 From 4ec348232dc21cf79b62f32e0bdb099c9d817941 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 Feb 2013 17:03:34 -0800 Subject: idr: fix a subtle bug in idr_get_next() commit 6cdae7416a1c45c2ce105a78187d9b7e8feb9e24 upstream. The iteration logic of idr_get_next() is borrowed mostly verbatim from idr_for_each(). It walks down the tree looking for the slot matching the current ID. If the matching slot is not found, the ID is incremented by the distance of single slot at the given level and repeats. The implementation assumes that during the whole iteration id is aligned to the layer boundaries of the level closest to the leaf, which is true for all iterations starting from zero or an existing element and thus is fine for idr_for_each(). 
However, idr_get_next() may be given any point and if the starting id
hits in the middle of a non-existent layer, increment to the next
layer will end up skipping the same offset into it. For example, an
IDR with IDs filled between [64, 127] would look like the following.

        [  0  64 ... ]
        /----/  |
        |       |
       NULL   [ 64 ... 127 ]

If idr_get_next() is called with 63 as the starting point, it will try
to follow down the pointer from 0. As it is NULL, it will then try to
proceed to the next slot in the same level by adding the slot distance
at that level, which is 64 - making the next try 127. It goes around
the loop and finds and returns 127, skipping [64, 126].

Note that this bug also triggers in an idr_for_each_entry() loop which
deletes during iteration, as deletions can make layers go away leaving
the iteration with an unaligned ID into missing layers.

Fix it by ensuring proceeding to the next slot doesn't carry over the
unaligned offset - i.e. use round_up(id + 1, slot_distance) instead of
id += slot_distance.

Signed-off-by: Tejun Heo
Reported-by: David Teigland
Cc: KAMEZAWA Hiroyuki
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 lib/idr.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/idr.c b/lib/idr.c
index e15502e8b21e..b0540c6a0ecb 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -621,7 +621,14 @@ void *idr_get_next(struct idr *idp, int *nextidp)
 			return p;
 		}

-		id += 1 << n;
+		/*
+		 * Proceed to the next layer at the current level.  Unlike
+		 * idr_for_each(), @id isn't guaranteed to be aligned to
+		 * layer boundary at this point and adding 1 << n may
+		 * incorrectly skip IDs.  Make sure we jump to the
+		 * beginning of the next layer using round_up().
+		 */
+		id = round_up(id + 1, 1 << n);
 		while (n < fls(id)) {
 			n += IDR_BITS;
 			p = *--paa;
--
cgit v1.2.3

From cbf253c44b52b3c6b921a22f05243a1dc7a00122 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 27 Feb 2013 17:04:04 -0800
Subject: firewire: add minor number range check to fw_device_init()

commit 3bec60d511179853138836ae6e1b61fe34d9235f upstream.

fw_device_init() didn't check whether the allocated minor number is
too large. Fail if it overflows MINORBITS.

Signed-off-by: Tejun Heo
Suggested-by: Stefan Richter
Acked-by: Stefan Richter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 drivers/firewire/core-device.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c
index 9f661e069318..812cea37a5b5 100644
--- a/drivers/firewire/core-device.c
+++ b/drivers/firewire/core-device.c
@@ -995,6 +995,10 @@ static void fw_device_init(struct work_struct *work)
 	ret = idr_pre_get(&fw_device_idr, GFP_KERNEL) ?
 	      idr_get_new(&fw_device_idr, device, &minor) :
 	      -ENOMEM;
+	if (minor >= 1 << MINORBITS) {
+		idr_remove(&fw_device_idr, minor);
+		minor = -ENOSPC;
+	}
 	up_write(&fw_device_rwsem);

 	if (ret < 0)
--
cgit v1.2.3

From d6bf427fdac5fb26636d72f04289780a0ff2ca93 Mon Sep 17 00:00:00 2001
From: Xi Wang
Date: Wed, 27 Feb 2013 17:05:21 -0800
Subject: sysctl: fix null checking in bin_dn_node_address()

commit df1778be1a33edffa51d094eeda87c858ded6560 upstream.

The null check of `strchr() + 1' is broken, since that expression is
always non-null, leading to an OOB read. Instead, check the result of
strchr().

Signed-off-by: Xi Wang
Cc: "Eric W.
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl_binary.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e055e8b533ce..17c20c7563a5 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1194,9 +1194,10 @@ static ssize_t bin_dn_node_address(struct file *file, /* Convert the decnet address to binary */ result = -EIO; - nodep = strchr(buf, '.') + 1; + nodep = strchr(buf, '.'); if (!nodep) goto out; + ++nodep; area = simple_strtoul(buf, NULL, 10); node = simple_strtoul(nodep, NULL, 10); -- cgit v1.2.3 From 5dec43e3d688390256df3343bb444e7ed0022f44 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 27 Nov 2012 13:35:09 -0300 Subject: media: rc: unlock on error in show_protocols() commit 30ebc5e44d057a1619ad63fe32c8c1670c37c4b8 upstream. We recently introduced a new return -ENODEV in this function but we need to unlock before returning. [mchehab@redhat.com: found two patches with the same fix. Merged SOB's/acks into one patch] Acked-by: Herton R. Krzesinski Signed-off-by: Dan Carpenter Signed-off-by: Douglas Bagnall Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/rc-main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 9cfb56d8cd82..62910ac90f69 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -775,8 +775,10 @@ static ssize_t show_protocols(struct device *device, } else if (dev->raw) { enabled = dev->raw->enabled_protocols; allowed = ir_raw_get_allowed_protocols(); - } else + } else { + mutex_unlock(&dev->lock); return -ENODEV; + } IR_dprintk(1, "allowed - 0x%llx, enabled - 0x%llx\n", (long long)allowed, -- cgit v1.2.3 From fd80f53550720f200ca0469e9419c750f895ab50 Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Fri, 1 Feb 2013 21:31:27 -0500 Subject: ext4: fix race in ext4_mb_add_n_trim() commit f1167009711032b0d747ec89a632a626c901a1ad upstream. In ext4_mb_add_n_trim(), lg_prealloc_lock should be taken when changing the lg_prealloc_list. Signed-off-by: Niu Yawei Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/mballoc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b6adf68a5c02..31bbdb52b899 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4111,7 +4111,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; /* Add the prealloc space to lg */ - rcu_read_lock(); + spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], pa_inode_list) { spin_lock(&tmp_pa->pa_lock); @@ -4135,12 +4135,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) if (!added) list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list[order]); - rcu_read_unlock(); + spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ if (lg_prealloc_count > 8) { ext4_mb_discard_lg_preallocations(sb, lg, - order, lg_prealloc_count); + order, lg_prealloc_count); return; } return ; -- cgit v1.2.3 From 4bf6d0956ad678f3ef59362cb41dae05a2259f78 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Sun, 10 Feb 2013 11:33:48 -0500 Subject: svcrpc: make svc_age_temp_xprts enqueue under sv_lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e75bafbff2270993926abcc31358361db74a9bc2 upstream. svc_age_temp_xprts expires xprts in a two-step process: first it takes the sv_lock and moves the xprts to expire off their server-wide list (sv_tempsocks or sv_permsocks) to a local list. Then it drops the sv_lock and enqueues and puts each one. I see no reason for this: svc_xprt_enqueue() will take sp_lock, but the sv_lock and sp_lock are not otherwise nested anywhere (and documentation at the top of this file claims it's correct to nest these with sp_lock inside.) Tested-by: Jason Tibbitts Tested-by: Paweł Sikora Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/svc_xprt.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 05dbccf48f63..e47876cf6180 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -824,7 +824,6 @@ static void svc_age_temp_xprts(unsigned long closure) struct svc_serv *serv = (struct svc_serv *)closure; struct svc_xprt *xprt; struct list_head *le, *next; - LIST_HEAD(to_be_aged); dprintk("svc_age_temp_xprts\n"); @@ -845,25 +844,15 @@ static void svc_age_temp_xprts(unsigned long closure) if (atomic_read(&xprt->xpt_ref.refcount) > 1 || test_bit(XPT_BUSY, &xprt->xpt_flags)) continue; - svc_xprt_get(xprt); - list_move(le, &to_be_aged); + list_del_init(le); set_bit(XPT_CLOSE, &xprt->xpt_flags); set_bit(XPT_DETACHED, &xprt->xpt_flags); - } - spin_unlock_bh(&serv->sv_lock); - - while (!list_empty(&to_be_aged)) { - le = to_be_aged.next; - /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */ - list_del_init(le); - xprt = list_entry(le, struct svc_xprt, xpt_list); - dprintk("queuing xprt %p for closing\n", xprt); /* a thread will dequeue and close it soon */ svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); } + spin_unlock_bh(&serv->sv_lock); mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } -- cgit v1.2.3 From 603b86549a4d6928d1059b19df2dfc5d61070533 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 26 Nov 2012 05:57:27 +0000 Subject: vhost: fix length for cross region descriptor commit bd97120fc3d1a11f3124c7c9ba1d91f51829eb85 upstream. If a single descriptor crosses a region, the second chunk length should be decremented by size translated so far, instead it includes the full descriptor length. Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/vhost/vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 61047fe31206..e3fac280e462 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -986,7 +986,7 @@ static int translate_desc(struct vhost_dev *dev, u64 addr, u32 len, } _iov = iov + ret; size = reg->memory_size - addr + reg->guest_phys_addr; - _iov->iov_len = min((u64)len, size); + _iov->iov_len = min((u64)len - s, size); _iov->iov_base = (void __user *)(unsigned long) (reg->userspace_addr + addr - reg->guest_phys_addr); s += size; -- cgit v1.2.3 From 964b12560e1d50f31bc1cc0ac662d52bdbdb6f40 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 19 Feb 2013 14:56:51 +0100 Subject: ptrace: introduce signal_wake_up_state() and ptrace_signal_wake_up() Upstream commit 910ffdb18a6408e14febbb6e4b6840fd2c928c82. Cleanup and preparation for the next change. signal_wake_up(resume => true) is overused. None of ptrace/jctl callers actually want to wakeup a TASK_WAKEKILL task, but they can't specify the necessary mask. Turn signal_wake_up() into signal_wake_up_state(state), reintroduce signal_wake_up() as a trivial helper, and add ptrace_signal_wake_up() which adds __TASK_TRACED. This way ptrace_signal_wake_up() can work "inside" ptrace_request() even if the tracee doesn't have the TASK_WAKEKILL bit set. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds Signed-off-by: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- include/linux/sched.h | 11 ++++++++++- kernel/ptrace.c | 4 ++-- kernel/signal.c | 12 +++--------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dae42e70295..d728baba741e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2564,7 +2564,16 @@ static inline void thread_group_cputime_init(struct signal_struct *sig) extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); -extern void signal_wake_up(struct task_struct *t, int resume_stopped); +extern void signal_wake_up_state(struct task_struct *t, unsigned int state); + +static inline void signal_wake_up(struct task_struct *t, bool resume) +{ + signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); +} +static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) +{ + signal_wake_up_state(t, resume ? __TASK_TRACED : 0); +} /* * Wrappers for p->thread_info->cpu access. No-op on UP. diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2df115790cd9..d77fa9eb0820 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -92,7 +92,7 @@ void __ptrace_unlink(struct task_struct *child) * TASK_KILLABLE sleeps. 
*/ if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) - signal_wake_up(child, task_is_traced(child)); + ptrace_signal_wake_up(child, true); spin_unlock(&child->sighand->siglock); } @@ -245,7 +245,7 @@ static int ptrace_attach(struct task_struct *task) */ if (task_is_stopped(task)) { task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; - signal_wake_up(task, 1); + signal_wake_up_state(task, __TASK_STOPPED); wait_trap = true; } diff --git a/kernel/signal.c b/kernel/signal.c index 43fee1cf50d0..8b0dd5bb4da3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -631,23 +631,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * No need to set need_resched since signal event passing * goes through ->blocked */ -void signal_wake_up(struct task_struct *t, int resume) +void signal_wake_up_state(struct task_struct *t, unsigned int state) { - unsigned int mask; - set_tsk_thread_flag(t, TIF_SIGPENDING); - /* - * For SIGKILL, we want to wake it up in the stopped/traced/killable + * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it * executing another processor and just now entering stopped state. * By using wake_up_state, we ensure the process will wake up and * handle its death signal. */ - mask = TASK_INTERRUPTIBLE; - if (resume) - mask |= TASK_WAKEKILL; - if (!wake_up_state(t, mask)) + if (!wake_up_state(t, state | TASK_INTERRUPTIBLE)) kick_process(t); } -- cgit v1.2.3 From a214998c48e204227d52a81dd459ea51a8a9ae36 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 19 Feb 2013 14:56:52 +0100 Subject: ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL Upstream commit 9899d11f654474d2d54ea52ceaa2a1f4db3abd68. putreg() assumes that the tracee is not running and pt_regs_access() can safely play with its stack. However a killed tracee can return from ptrace_stop() to the low-level asm code and do RESTORE_REST, this means that debugger can actually read/modify the kernel stack until the tracee does SAVE_REST again. set_task_blockstep() can race with SIGKILL too and in some sense this race is even worse, the very fact the tracee can be woken up breaks the logic. As Linus suggested we can clear TASK_WAKEKILL around the arch_ptrace() call, this ensures that nobody can ever wakeup the tracee while the debugger looks at it. Not only this fixes the mentioned problems, we can do some cleanups/simplifications in arch_ptrace() paths. Probably ptrace_unfreeze_traced() needs more callers, for example it makes sense to make the tracee killable for oom-killer before access_process_vm(). While at it, add the comment into may_ptrace_stop() to explain why ptrace_stop() still can't rely on SIGKILL and signal_pending_state(). 
Reported-by: Salman Qazi Reported-by: Suleiman Souhlal Suggested-by: Linus Torvalds Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds Signed-off-by: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- kernel/ptrace.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++--------- kernel/signal.c | 5 +++++ 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d77fa9eb0820..40581ee56802 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -38,6 +38,36 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) child->parent = new_parent; } +/* Ensure that nothing can wake it up, even SIGKILL */ +static bool ptrace_freeze_traced(struct task_struct *task) +{ + bool ret = false; + + spin_lock_irq(&task->sighand->siglock); + if (task_is_traced(task) && !__fatal_signal_pending(task)) { + task->state = __TASK_TRACED; + ret = true; + } + spin_unlock_irq(&task->sighand->siglock); + + return ret; +} + +static void ptrace_unfreeze_traced(struct task_struct *task) +{ + if (task->state != __TASK_TRACED) + return; + + WARN_ON(!task->ptrace || task->parent != current); + + spin_lock_irq(&task->sighand->siglock); + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + else + task->state = TASK_TRACED; + spin_unlock_irq(&task->sighand->siglock); +} + /** * __ptrace_unlink - unlink ptracee and restore its execution state * @child: ptracee to be unlinked @@ -112,23 +142,29 @@ int ptrace_check_attach(struct task_struct *child, int kill) * be changed by us so it's not changing right after this. */ read_lock(&tasklist_lock); - if ((child->ptrace & PT_PTRACED) && child->parent == current) { + if (child->ptrace && child->parent == current) { + WARN_ON(child->state == __TASK_TRACED); /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). */ - spin_lock_irq(&child->sighand->siglock); - WARN_ON_ONCE(task_is_stopped(child)); - if (task_is_traced(child) || kill) + if (kill || ptrace_freeze_traced(child)) ret = 0; - spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!ret && !kill) - ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; + if (!ret && !kill) { + if (!wait_task_inactive(child, __TASK_TRACED)) { + /* + * This can only happen if may_ptrace_stop() fails and + * ptrace_stop() changes ->state back to TASK_RUNNING, + * so we should not worry about leaking __TASK_TRACED. + */ + WARN_ON(child->state == __TASK_TRACED); + ret = -ESRCH; + } + } - /* All systems go.. */ return ret; } @@ -777,6 +813,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out_put_task_struct; ret = arch_ptrace(child, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(child); out_put_task_struct: put_task_struct(child); @@ -915,8 +953,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, } ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (!ret) + if (!ret) { ret = compat_arch_ptrace(child, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(child); + } out_put_task_struct: put_task_struct(child); diff --git a/kernel/signal.c b/kernel/signal.c index 8b0dd5bb4da3..51f2e694ec69 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1669,6 +1669,10 @@ static inline int may_ptrace_stop(void) * If SIGKILL was already sent before the caller unlocked * ->siglock we must see ->core_state != NULL. Otherwise it * is safe to enter schedule(). 
+ * + * This is almost outdated, a task with the pending SIGKILL can't + * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported + * after SIGKILL was already dequeued. */ if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) @@ -1800,6 +1804,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) if (gstop_done) do_notify_parent_cldstop(current, false, why); + /* tasklist protects us from ptrace_freeze_traced() */ __set_current_state(TASK_RUNNING); if (clear_code) current->exit_code = 0; -- cgit v1.2.3 From 8a9279a5af607d9f59b3921c639df139a28bcef6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 19 Feb 2013 14:56:53 +0100 Subject: wake_up_process() should be never used to wakeup a TASK_STOPPED/TRACED task Upstream commit 9067ac85d533651b98c2ff903182a20cbb361fcb. wake_up_process() should never wakeup a TASK_STOPPED/TRACED task. Change it to use TASK_NORMAL and add the WARN_ON(). TASK_ALL has no other users, probably can be killed. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds Cc: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index aacd55f8d4ea..cd2b7cb638fe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2778,7 +2778,8 @@ out: */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_ALL, 0); + WARN_ON(task_is_stopped_or_traced(p)); + return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); -- cgit v1.2.3 From 43f514598ad16af1c6dd08dd429631eaafef7849 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 4 Feb 2013 19:39:52 +0000 Subject: unbreak automounter support on 64-bit kernel with 32-bit userspace (v2) commit 4f4ffc3a5398ef9bdbb32db04756d7d34e356fcf upstream. automount-support is broken on the parisc architecture, because the existing #if list does not include a check for defined(__hppa__). The HPPA (parisc) architecture is similiar to other 64bit Linux targets where we have to define autofs_wqt_t (which is passed back and forth to user space) as int type which has a size of 32bit across 32 and 64bit kernels. During the discussion on the mailing list, H. Peter Anvin suggested to invert the #if list since only specific platforms (specifically those who do not have a 32bit userspace, like IA64 and Alpha) should have autofs_wqt_t as unsigned long type. This suggestion is probably the best way to go, since Arm64 (and maybe others?) seems to have a non-working automounter. So in the long run even for other new upcoming architectures this inverted check seem to be the best solution, since it will not require them to change this #if again (unless they are 64bit only). Signed-off-by: Helge Deller Acked-by: H. Peter Anvin Acked-by: Ian Kent Acked-by: Catalin Marinas CC: James Bottomley CC: Rolf Eike Beer Signed-off-by: Greg Kroah-Hartman --- include/linux/auto_fs.h | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h index da64e15004b6..6cdabb422de6 100644 --- a/include/linux/auto_fs.h +++ b/include/linux/auto_fs.h @@ -31,25 +31,16 @@ #define AUTOFS_MIN_PROTO_VERSION AUTOFS_PROTO_VERSION /* - * Architectures where both 32- and 64-bit binaries can be executed - * on 64-bit kernels need this. This keeps the structure format - * uniform, and makes sure the wait_queue_token isn't too big to be - * passed back down to the kernel. 
- * - * This assumes that on these architectures: - * mode 32 bit 64 bit - * ------------------------- - * int 32 bit 32 bit - * long 32 bit 64 bit - * - * If so, 32-bit user-space code should be backwards compatible. + * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed + * back to the kernel via ioctl from userspace. On architectures where 32- and + * 64-bit userspace binaries can be executed it's important that the size of + * autofs_wqt_t stays constant between 32- and 64-bit Linux kernels so that we + * do not break the binary ABI interface by changing the structure size. */ - -#if defined(__sparc__) || defined(__mips__) || defined(__x86_64__) \ - || defined(__powerpc__) || defined(__s390__) -typedef unsigned int autofs_wqt_t; -#else +#if defined(__ia64__) || defined(__alpha__) /* pure 64bit architectures */ typedef unsigned long autofs_wqt_t; +#else +typedef unsigned int autofs_wqt_t; #endif /* Packet types */ -- cgit v1.2.3 From f39edfbf6dbf8e29cfbafd67d93fa1e30196701c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 7 Feb 2013 09:44:13 -0800 Subject: x86: Do not leak kernel page mapping locations commit e575a86fdc50d013bf3ad3aa81d9100e8e6cc60d upstream. Without this patch, it is trivial to determine kernel page mappings by examining the error code reported to dmesg[1]. Instead, declare the entire kernel memory space as a violation of a present page. Additionally, since show_unhandled_signals is enabled by default, switch branch hinting to the more realistic expectation, and unobfuscate the setting of the PF_PROT bit to improve readability. [1] http://vulnfactory.org/blog/2013/02/06/a-linux-memory-trick/ Reported-by: Dan Rosenberg Suggested-by: Brad Spengler Signed-off-by: Kees Cook Acked-by: H. Peter Anvin Cc: Paul E. McKenney Cc: Frederic Weisbecker Cc: Eric W. Biederman Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20130207174413.GA12485@www.outflux.net Signed-off-by: Ingo Molnar Signed-off-by: CAI Qian Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/fault.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2dbf6bf4c7e5..3b2ad913d6ce 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -720,12 +720,15 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; - if (unlikely(show_unhandled_signals)) + /* Kernel addresses are always protection faults: */ + if (address >= TASK_SIZE) + error_code |= PF_PROT; + + if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); - /* Kernel addresses are always protection faults: */ tsk->thread.cr2 = address; - tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); -- cgit v1.2.3 From 2212f47b734e5b9461b5c3f555dc653ea7aa212f Mon Sep 17 00:00:00 2001 From: Stoney Wang Date: Thu, 7 Feb 2013 10:53:02 -0800 Subject: x86/apic: Work around boot failure on HP ProLiant DL980 G7 Server systems commit cb214ede7657db458fd0b2a25ea0b28dbf900ebc upstream. When a HP ProLiant DL980 G7 Server boots a regular kernel, there will be intermittent lost interrupts which could result in a hang or (in extreme cases) data loss. The reason is that this system only supports x2apic physical mode, while the kernel boots with a logical-cluster default setting. 
This bug can be worked around by specifying the "x2apic_phys" or
"nox2apic" boot option, but we want to handle this system without
requiring manual workarounds.

The BIOS sets ACPI_FADT_APIC_PHYSICAL in the FADT table. As all
apicids are smaller than 255, the BIOS needs to pass control to the
OS in xapic mode, according to the x2apic spec, chapter 2.9.

The current code handles x2apic as follows when the BIOS hands over
with xapic mode enabled:

When the user specifies x2apic_phys, or the FADT indicates PHYSICAL:

1. During the MADT OEM check, the apic driver is first set to the
   xapic logical or the xapic phys driver.
2. enable_IR_x2apic() will enable x2apic_mode.
3. If the user specifies x2apic_phys on the boot line,
   x2apic_phys_probe() will install the correct x2apic phys driver
   and use x2apic phys mode. Otherwise it will skip that driver and
   let x2apic_cluster_probe() take over and install the x2apic
   cluster driver (the wrong one) even though the FADT indicates
   PHYSICAL, because x2apic_phys_probe() does not check the FADT.

Add a check of x2apic_fadt_phys() in x2apic_phys_probe() to fix the
problem.

Signed-off-by: Stoney Wang
[ updated the changelog and simplified the code ]
Signed-off-by: Yinghai Lu
Signed-off-by: Zhang Lin-Bao
[ make a patch specially for 3.0.66 ]
Link: http://lkml.kernel.org/r/1360263182-16226-1-git-send-email-yinghai@kernel.org
Signed-off-by: Ingo Molnar
Signed-off-by: Greg Kroah-Hartman
---
 arch/x86/kernel/apic/x2apic_phys.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index f5373dfde21e..db4f70492ea7 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -20,12 +20,19 @@ static int set_x2apic_phys_mode(char *arg)
 }
 early_param("x2apic_phys", set_x2apic_phys_mode);

+static bool x2apic_fadt_phys(void)
+{
+	if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
+		(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
+		printk(KERN_DEBUG "System requires x2apic physical mode\n");
+		return true;
+	}
+	return false;
+}
+
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-	if (x2apic_phys)
-		return x2apic_enabled();
-	else
-		return 0;
+	return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
 }

 static void
@@ -108,7 +115,7 @@ static void init_x2apic_ldr(void)

 static int x2apic_phys_probe(void)
 {
-	if (x2apic_mode && x2apic_phys)
+	if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys()))
 		return 1;

 	return apic == &apic_x2apic_phys;
--
cgit v1.2.3

From 6403d47ff9392807fcfa4464527193e0cab65b2a Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Fri, 25 Jan 2013 16:08:01 +0800
Subject: cpuset: fix cpuset_print_task_mems_allowed() vs rename() race

commit 63f43f55c9bbc14f76b582644019b8a07dc8219a upstream.

rename() will change dentry->d_name. The result of this race can be
worse than seeing a partially rewritten name, but we might access a
stale pointer because rename() will re-allocate memory to hold a
longer name.

It is safe under the protection of dentry->d_lock.

v2: check NULL dentry before acquiring the dentry lock.
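
A small user-space analogue of the pattern the fix applies (editorial
sketch only; pthread_mutex_t stands in for dentry->d_lock, and the
struct and names are invented for the demo):

  /*
   * The safe pattern: take the lock that rename-style updates also
   * take, copy the name into a fixed-size local buffer, drop the
   * lock, and only then use the copy.
   */
  #include <pthread.h>
  #include <stdio.h>

  #define NAME_LEN 64            /* analogue of CPUSET_NAME_LEN */

  struct named_obj_demo {
          pthread_mutex_t lock;  /* what a concurrent rename would also hold */
          const char *name;      /* may be swapped and freed by a renamer */
  };

  static void snapshot_name(struct named_obj_demo *obj, char *buf, size_t len)
  {
          pthread_mutex_lock(&obj->lock);
          /* bounded copy while the pointer is guaranteed to stay valid */
          snprintf(buf, len, "%s", obj->name ? obj->name : "/");
          pthread_mutex_unlock(&obj->lock);
  }

  int main(void)
  {
          struct named_obj_demo obj = {
                  .lock = PTHREAD_MUTEX_INITIALIZER,
                  .name = "my_cpuset",
          };
          char buf[NAME_LEN];

          snapshot_name(&obj, buf, sizeof(buf));
          printf("cpuset=%s\n", buf);  /* safe even if obj.name changes later */
          return 0;
  }

The copy is taken under the same lock a rename would hold, so it stays
valid after the lock is dropped even if the name is later reallocated.
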
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6cbe0330249d..ea76c9c5d42d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2499,8 +2499,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) dentry = task_cs(tsk)->css.cgroup->dentry; spin_lock(&cpuset_buffer_lock); - snprintf(cpuset_name, CPUSET_NAME_LEN, - dentry ? (const char *)dentry->d_name.name : "/"); + + if (!dentry) { + strcpy(cpuset_name, "/"); + } else { + spin_lock(&dentry->d_lock); + strlcpy(cpuset_name, (const char *)dentry->d_name.name, + CPUSET_NAME_LEN); + spin_unlock(&dentry->d_lock); + } + nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", -- cgit v1.2.3 From cc0e3e13b0a90e5ff42d5b134939eacf5e7e497c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 24 Jan 2013 14:43:28 +0800 Subject: cgroup: fix exit() vs rmdir() race commit 71b5707e119653039e6e95213f00479668c79b75 upstream. In cgroup_exit() put_css_set_taskexit() is called without any lock, which might lead to accessing a freed cgroup: thread1 thread2 --------------------------------------------- exit() cgroup_exit() put_css_set_taskexit() atomic_dec(cgrp->count); rmdir(); /* not safe !! */ check_for_release(cgrp); rcu_read_lock() can be used to make sure the cgroup is alive. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1749dcd976b9..b964f9e406f1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -359,12 +359,20 @@ static void __put_css_set(struct css_set *cg, int taskexit) struct cgroup *cgrp = link->cgrp; list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); + + /* + * We may not be holding cgroup_mutex, and if cgrp->count is + * dropped to 0 the cgroup can be destroyed at any time, hence + * rcu_read_lock is used to keep it alive. + */ + rcu_read_lock(); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } + rcu_read_unlock(); kfree(link); } -- cgit v1.2.3 From 55bce39db26a3b1d6b18fc0ae5d57948946f0c49 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 4 Mar 2013 06:09:07 +0800 Subject: dca: check against empty dca_domains list before unregister provider fix In 3.0.67, commit 7a9a20ea77e7508c795dead9ab2f6c98a617762d (dca: check against empty dca_domains list before unregister provider), upstream commit c419fcfd071cf34ba00f9f65282583772d2655e7, added a fail path to unregister_dca_provider. It added there also a call to raw_spin_unlock_irqrestore. But in 3.0, the lock is not raw, so this results in: drivers/dca/dca-core.c: In function 'unregister_dca_provider': drivers/dca/dca-core.c:413: warning: passing argument 1 of '_raw_spin_unlock_irqrestore' from incompatible pointer type Fix it by calling spin_unlock_irqrestore properly. 
Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/dca/dca-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dca/dca-core.c b/drivers/dca/dca-core.c index 7065851d0b79..605fd20b1e53 100644 --- a/drivers/dca/dca-core.c +++ b/drivers/dca/dca-core.c @@ -410,7 +410,7 @@ void unregister_dca_provider(struct dca_provider *dca, struct device *dev) spin_lock_irqsave(&dca_lock, flags); if (list_empty(&dca_domains)) { - raw_spin_unlock_irqrestore(&dca_lock, flags); + spin_unlock_irqrestore(&dca_lock, flags); return; } -- cgit v1.2.3 From d81d788db85abd39fd7753e2482f748c48de202a Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 4 Mar 2013 06:09:07 +0800 Subject: s390/kvm: Fix store status for ACRS/FPRS fix In 3.0.67, commit 58c9ce6fad8e00d9726447f939fe7e78e2aec891 (s390/kvm: Fix store status for ACRS/FPRS), upstream commit 15bc8d8457875f495c59d933b05770ba88d1eacb, added a call to save_access_regs() to save the ACRS. But we do not have the ACRS in kvm_run in 3.0 yet, so this results in: arch/s390/kvm/kvm-s390.c: In function 'kvm_s390_vcpu_store_status': arch/s390/kvm/kvm-s390.c:593: error: 'struct kvm_run' has no member named 's' Fix it by saving guest_acrs, which is where the ACRS live in 3.0. Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 25ab200f4aa2..f9804b7aec18 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -590,7 +590,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) * it into the save area */ save_fp_regs(&vcpu->arch.guest_fpregs); - save_access_regs(vcpu->run->s.regs.acrs); + save_access_regs(vcpu->arch.guest_acrs); if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs), vcpu->arch.guest_fpregs.fprs, 128, prefix)) -- cgit v1.2.3 From 5e6af63de116db55be85da4374441f7963d11281 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Wed, 27 Feb 2013 12:52:45 +0000 Subject: staging: comedi: ni_labpc: correct differential channel sequence for AI commands Commit 4c4bc25d0fa6beaf054c0b4c3b324487f266c820 upstream. Tuomas reported problems getting meaningful output from a Lab-PC+ in differential mode for AI cmds, but AI insn reads gave correct readings. He tracked it down to two problems, one of which is addressed by this patch. It seems the setting of the channel bits for particular scanning modes was incorrect for differential mode. (Only half the number of channels are available in differential mode; comedi refers to them as channels 0, 1, 2 and 3, but the hardware documentation refers to them as channels 0, 2, 4 and 6.) In differential mode, the setting of the channel enable bits in the command1 register should depend on whether the scan enable bit is set. Effectively, we need to double the comedi channel number when the scan enable bit is not set in differential mode. The scan enable bit gets set when the AI scan mode is `MODE_MULT_CHAN_UP` or `MODE_MULT_CHAN_DOWN`, and gets cleared when the AI scan mode is `MODE_SINGLE_CHAN` or `MODE_SINGLE_CHAN_INTERVAL`. The existing test for whether the comedi channel number needs to be doubled in differential mode is incorrect in `labpc_ai_cmd()`. This patch corrects the test. Thanks to Tuomas for suggesting the fix.
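To make the channel arithmetic concrete: in differential mode the board exposes only every other input, so when the scan enable bit is clear the comedi channel number has to be doubled before it is written into the command1 register. The helper below is invented for this sketch (the driver open-codes the same test in the hunk that follows); MODE_SINGLE_CHAN, MODE_SINGLE_CHAN_INTERVAL and AREF_DIFF are assumed to come from the driver and comedi headers:

/* Illustrative only: map a comedi AI channel to the hardware channel
 * number programmed into COMMAND1, per the rule described above. */
static unsigned int labpc_hw_chan(unsigned int comedi_chan,
                                  unsigned int aref, int scan_mode)
{
        int scan_bit_clear = (scan_mode == MODE_SINGLE_CHAN ||
                              scan_mode == MODE_SINGLE_CHAN_INTERVAL);

        /* Differential inputs are hardware channels 0, 2, 4, 6; comedi
         * calls them 0, 1, 2, 3.  Double only when scanning is disabled. */
        if (aref == AREF_DIFF && scan_bit_clear)
                return comedi_chan * 2;

        return comedi_chan;
}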
Signed-off-by: Ian Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/ni_labpc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/staging/comedi/drivers/ni_labpc.c b/drivers/staging/comedi/drivers/ni_labpc.c index ab8f37022a3c..9b204b4d1e73 100644 --- a/drivers/staging/comedi/drivers/ni_labpc.c +++ b/drivers/staging/comedi/drivers/ni_labpc.c @@ -1241,7 +1241,9 @@ static int labpc_ai_cmd(struct comedi_device *dev, struct comedi_subdevice *s) else channel = CR_CHAN(cmd->chanlist[0]); /* munge channel bits for differential / scan disabled mode */ - if (labpc_ai_scan_mode(cmd) != MODE_SINGLE_CHAN && aref == AREF_DIFF) + if ((labpc_ai_scan_mode(cmd) == MODE_SINGLE_CHAN || + labpc_ai_scan_mode(cmd) == MODE_SINGLE_CHAN_INTERVAL) && + aref == AREF_DIFF) channel *= 2; devpriv->command1_bits |= ADC_CHAN_BITS(channel); devpriv->command1_bits |= thisboard->ai_range_code[range]; -- cgit v1.2.3 From 544da4403b05e874e134919db7124def003ef7cf Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Wed, 27 Feb 2013 12:52:46 +0000 Subject: staging: comedi: ni_labpc: set up command4 register *after* command3 Commit 22056e2b46246d97ff0f7c6e21a77b8daa07f02c upstream. Tuomas reported problems getting meaningful output from a Lab-PC+ in differential mode for AI cmds, but AI insn reads gave correct readings. He tracked it down to two problems, one of which is addressed by this patch. It seems that writing to the command3 register after writing to the command4 register in `labpc_ai_cmd()` messes up the differential reference bit setting in the command4 register. Set up the command4 register after the command3 register (as in `labpc_ai_rinsn()`) to avoid the problem. Thanks to Tuomas for suggesting the fix. Signed-off-by: Ian Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/ni_labpc.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/staging/comedi/drivers/ni_labpc.c b/drivers/staging/comedi/drivers/ni_labpc.c index 9b204b4d1e73..897359d79e50 100644 --- a/drivers/staging/comedi/drivers/ni_labpc.c +++ b/drivers/staging/comedi/drivers/ni_labpc.c @@ -1259,21 +1259,6 @@ static int labpc_ai_cmd(struct comedi_device *dev, struct comedi_subdevice *s) devpriv->write_byte(devpriv->command1_bits, dev->iobase + COMMAND1_REG); } - /* setup any external triggering/pacing (command4 register) */ - devpriv->command4_bits = 0; - if (cmd->convert_src != TRIG_EXT) - devpriv->command4_bits |= EXT_CONVERT_DISABLE_BIT; - /* XXX should discard first scan when using interval scanning - * since manual says it is not synced with scan clock */ - if (labpc_use_continuous_mode(cmd) == 0) { - devpriv->command4_bits |= INTERVAL_SCAN_EN_BIT; - if (cmd->scan_begin_src == TRIG_EXT) - devpriv->command4_bits |= EXT_SCAN_EN_BIT; - } - /* single-ended/differential */ - if (aref == AREF_DIFF) - devpriv->command4_bits |= ADC_DIFF_BIT; - devpriv->write_byte(devpriv->command4_bits, dev->iobase + COMMAND4_REG); devpriv->write_byte(cmd->chanlist_len, dev->iobase + INTERVAL_COUNT_REG); @@ -1351,6 +1336,22 @@ static int labpc_ai_cmd(struct comedi_device *dev, struct comedi_subdevice *s) devpriv->command3_bits &= ~ADC_FNE_INTR_EN_BIT; devpriv->write_byte(devpriv->command3_bits, dev->iobase + COMMAND3_REG); + /* setup any external triggering/pacing (command4 register) */ + devpriv->command4_bits = 0; + if (cmd->convert_src != TRIG_EXT) + devpriv->command4_bits |= EXT_CONVERT_DISABLE_BIT; + /* XXX should discard first scan when using 
interval scanning + * since manual says it is not synced with scan clock */ + if (labpc_use_continuous_mode(cmd) == 0) { + devpriv->command4_bits |= INTERVAL_SCAN_EN_BIT; + if (cmd->scan_begin_src == TRIG_EXT) + devpriv->command4_bits |= EXT_SCAN_EN_BIT; + } + /* single-ended/differential */ + if (aref == AREF_DIFF) + devpriv->command4_bits |= ADC_DIFF_BIT; + devpriv->write_byte(devpriv->command4_bits, dev->iobase + COMMAND4_REG); + /* startup acquisition */ /* command2 reg */ -- cgit v1.2.3 From 6c80ee53f3737993f693bf2a3b4b3e4bbde9b51f Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Wed, 27 Feb 2013 10:56:19 +0000 Subject: staging: comedi: check s->async for poll(), read() and write() commit cc400e185c07c15a42d2635995f422de5b94b696 upstream. Some low-level comedi drivers (incorrectly) point `dev->read_subdev` or `dev->write_subdev` to a subdevice that does not support asynchronous commands. Comedi's poll(), read() and write() file operation handlers assume these subdevices do support asynchronous commands. In particular, they assume `s->async` is valid (where `s` points to the read or write subdevice), which it won't be if it has been set incorrectly. This can lead to a NULL pointer dereference. Check `s->async` is non-NULL in `comedi_poll()`, `comedi_read()` and `comedi_write()` to avoid the bug. Signed-off-by: Ian Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/comedi_fops.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/staging/comedi/comedi_fops.c b/drivers/staging/comedi/comedi_fops.c index 4b9d8f003289..ee33cba59a4f 100644 --- a/drivers/staging/comedi/comedi_fops.c +++ b/drivers/staging/comedi/comedi_fops.c @@ -1577,7 +1577,7 @@ static unsigned int comedi_poll(struct file *file, poll_table * wait) mask = 0; read_subdev = comedi_get_read_subdevice(dev_file_info); - if (read_subdev) { + if (read_subdev && read_subdev->async) { poll_wait(file, &read_subdev->async->wait_head, wait); if (!read_subdev->busy || comedi_buf_read_n_available(read_subdev->async) > 0 @@ -1587,7 +1587,7 @@ static unsigned int comedi_poll(struct file *file, poll_table * wait) } } write_subdev = comedi_get_write_subdevice(dev_file_info); - if (write_subdev) { + if (write_subdev && write_subdev->async) { poll_wait(file, &write_subdev->async->wait_head, wait); comedi_buf_write_alloc(write_subdev->async, write_subdev->async->prealloc_bufsz); @@ -1629,7 +1629,7 @@ static ssize_t comedi_write(struct file *file, const char __user *buf, } s = comedi_get_write_subdevice(dev_file_info); - if (s == NULL) { + if (s == NULL || s->async == NULL) { retval = -EIO; goto done; } @@ -1740,7 +1740,7 @@ static ssize_t comedi_read(struct file *file, char __user *buf, size_t nbytes, } s = comedi_get_read_subdevice(dev_file_info); - if (s == NULL) { + if (s == NULL || s->async == NULL) { retval = -EIO; goto done; } -- cgit v1.2.3 From e28c3f2b514b5581e15614f7cf976131092cf4b6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 4 Mar 2013 06:09:28 +0800 Subject: Linux 3.0.68 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7d4347a8dcf4..8f3b7a815d94 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 0 -SUBLEVEL = 67 +SUBLEVEL = 68 EXTRAVERSION = NAME = Sneaky Weasel -- cgit v1.2.3
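The comedi_fops change above is an instance of a general defensive pattern for file-operation handlers: validate every optional sub-object before the first dereference and fail with a clear error code. A condensed sketch of that shape, with invented names (my_read, my_get_read_subdevice, my_do_async_read) standing in for the comedi helpers and only the usual linux/fs.h definitions assumed:

#include <linux/fs.h>
#include <linux/uaccess.h>

struct my_async;                                /* buffer/queue state, details omitted */
struct my_subdevice { struct my_async *async; };

static struct my_subdevice *my_get_read_subdevice(struct file *file);
static ssize_t my_do_async_read(struct my_async *async, char __user *buf, size_t nbytes);

static ssize_t my_read(struct file *file, char __user *buf,
                       size_t nbytes, loff_t *offset)
{
        struct my_subdevice *s = my_get_read_subdevice(file);

        /*
         * A subdevice may exist but still lack async command support,
         * so both the subdevice and its async state must be checked
         * before anything dereferences s->async.
         */
        if (s == NULL || s->async == NULL)
                return -EIO;

        return my_do_async_read(s->async, buf, nbytes);
}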