From d40bc96257fe070796c63934913f95cc183016b0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 26 Feb 2018 10:52:46 -0800
Subject: test_bpf: add a schedule point

test_bpf() is taking 1.6 seconds nowadays, it is time
to add a schedule point in it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 lib/test_bpf.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'lib')

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index b4e22345963f..e6f550608d72 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -24,6 +24,7 @@
 #include <linux/if_vlan.h>
 #include <linux/random.h>
 #include <linux/highmem.h>
+#include <linux/sched.h>
 
 /* General test specific settings */
 #define MAX_SUBTESTS	3
@@ -6582,6 +6583,7 @@ static __init int test_bpf(void)
 		struct bpf_prog *fp;
 		int err;
 
+		cond_resched();
 		if (exclude_test(i))
 			continue;
 
-- 
cgit v1.2.3


From 9960d7669eaa42e82a2f4393adf549191de2e587 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 28 Feb 2018 08:39:20 -0800
Subject: test_bpf: reduce MAX_TESTRUNS

For tests that are using the maximal number of BPF instruction, each
run takes 20 usec. Looping 10,000 times on them totals 200 ms, which
is bad when the loop is not preemptible.

test_bpf: #264 BPF_MAXINSNS: Call heavy transformations jited:1 19248
18548 PASS
test_bpf: #269 BPF_MAXINSNS: ld_abs+get_processor_id jited:1 20896 PASS

Lets divide by ten the number of iterations, so that max latency is
20ms. We could use need_resched() to break the loop earlier if we
believe 20 ms is too much.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 lib/test_bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index e6f550608d72..2efb213716fa 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -28,7 +28,7 @@
 
 /* General test specific settings */
 #define MAX_SUBTESTS	3
-#define MAX_TESTRUNS	10000
+#define MAX_TESTRUNS	1000
 #define MAX_DATA	128
 #define MAX_INSNS	512
 #define MAX_K		0xffffFFFF
-- 
cgit v1.2.3


From d3dcf8eb615537526bd42ff27a081d46d337816e Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Sun, 4 Mar 2018 17:29:48 +0200
Subject: rhashtable: Fix rhlist duplicates insertion

When inserting duplicate objects (those with the same key),
current rhlist implementation messes up the chain pointers by
updating the bucket pointer instead of prev next pointer to the
newly inserted node. This causes missing elements on removal and
travesal.

Fix that by properly updating pprev pointer to point to
the correct rhash_head next pointer.

Issue: 1241076
Change-Id: I86b2c140bcb4aeb10b70a72a267ff590bb2b17e7
Fixes: ca26893f05e8 ('rhashtable: Add rhlist interface')
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 3825c30aaa36..47de025b6245 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -506,8 +506,10 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 		if (!key ||
 		    (ht->p.obj_cmpfn ?
 		     ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
-		     rhashtable_compare(&arg, rht_obj(ht, head))))
+		     rhashtable_compare(&arg, rht_obj(ht, head)))) {
+			pprev = &head->next;
 			continue;
+		}
 
 		if (!ht->rhlist)
 			return rht_obj(ht, head);
-- 
cgit v1.2.3


From 499ac3b60f657dae82055fc81c7b01e6242ac9bc Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Sun, 4 Mar 2018 17:29:49 +0200
Subject: test_rhashtable: add test case for rhltable with duplicate objects

Tries to insert duplicates in the middle of bucket's chain:
bucket 1:  [[val 21 (tid=1)]] -> [[ val 1 (tid=2),  val 1 (tid=0) ]]

Reuses tid to distinguish the elements insertion order.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/test_rhashtable.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)

(limited to 'lib')

diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 76d3667fdea2..f4000c137dbe 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -79,6 +79,21 @@ struct thread_data {
 	struct test_obj *objs;
 };
 
+static u32 my_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct test_obj_rhl *obj = data;
+
+	return (obj->value.id % 10) << RHT_HASH_RESERVED_SPACE;
+}
+
+static int my_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	const struct test_obj_rhl *test_obj = obj;
+	const struct test_obj_val *val = arg->key;
+
+	return test_obj->value.id - val->id;
+}
+
 static struct rhashtable_params test_rht_params = {
 	.head_offset = offsetof(struct test_obj, node),
 	.key_offset = offsetof(struct test_obj, value),
@@ -87,6 +102,17 @@ static struct rhashtable_params test_rht_params = {
 	.nulls_base = (3U << RHT_BASE_SHIFT),
 };
 
+static struct rhashtable_params test_rht_params_dup = {
+	.head_offset = offsetof(struct test_obj_rhl, list_node),
+	.key_offset = offsetof(struct test_obj_rhl, value),
+	.key_len = sizeof(struct test_obj_val),
+	.hashfn = jhash,
+	.obj_hashfn = my_hashfn,
+	.obj_cmpfn = my_cmpfn,
+	.nelem_hint = 128,
+	.automatic_shrinking = false,
+};
+
 static struct semaphore prestart_sem;
 static struct semaphore startup_sem = __SEMAPHORE_INITIALIZER(startup_sem, 0);
 
@@ -465,6 +491,112 @@ static int __init test_rhashtable_max(struct test_obj *array,
 	return err;
 }
 
+static unsigned int __init print_ht(struct rhltable *rhlt)
+{
+	struct rhashtable *ht;
+	const struct bucket_table *tbl;
+	char buff[512] = "";
+	unsigned int i, cnt = 0;
+
+	ht = &rhlt->ht;
+	tbl = rht_dereference(ht->tbl, ht);
+	for (i = 0; i < tbl->size; i++) {
+		struct rhash_head *pos, *next;
+		struct test_obj_rhl *p;
+
+		pos = rht_dereference(tbl->buckets[i], ht);
+		next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL;
+
+		if (!rht_is_a_nulls(pos)) {
+			sprintf(buff, "%s\nbucket[%d] -> ", buff, i);
+		}
+
+		while (!rht_is_a_nulls(pos)) {
+			struct rhlist_head *list = container_of(pos, struct rhlist_head, rhead);
+			sprintf(buff, "%s[[", buff);
+			do {
+				pos = &list->rhead;
+				list = rht_dereference(list->next, ht);
+				p = rht_obj(ht, pos);
+
+				sprintf(buff, "%s val %d (tid=%d)%s", buff, p->value.id, p->value.tid,
+					list? ", " : " ");
+				cnt++;
+			} while (list);
+
+			pos = next,
+			next = !rht_is_a_nulls(pos) ?
+				rht_dereference(pos->next, ht) : NULL;
+
+			sprintf(buff, "%s]]%s", buff, !rht_is_a_nulls(pos) ? " -> " : "");
+		}
+	}
+	printk(KERN_ERR "\n---- ht: ----%s\n-------------\n", buff);
+
+	return cnt;
+}
+
+static int __init test_insert_dup(struct test_obj_rhl *rhl_test_objects,
+				  int cnt, bool slow)
+{
+	struct rhltable rhlt;
+	unsigned int i, ret;
+	const char *key;
+	int err = 0;
+
+	err = rhltable_init(&rhlt, &test_rht_params_dup);
+	if (WARN_ON(err))
+		return err;
+
+	for (i = 0; i < cnt; i++) {
+		rhl_test_objects[i].value.tid = i;
+		key = rht_obj(&rhlt.ht, &rhl_test_objects[i].list_node.rhead);
+		key += test_rht_params_dup.key_offset;
+
+		if (slow) {
+			err = PTR_ERR(rhashtable_insert_slow(&rhlt.ht, key,
+							     &rhl_test_objects[i].list_node.rhead));
+			if (err == -EAGAIN)
+				err = 0;
+		} else
+			err = rhltable_insert(&rhlt,
+					      &rhl_test_objects[i].list_node,
+					      test_rht_params_dup);
+		if (WARN(err, "error %d on element %d/%d (%s)\n", err, i, cnt, slow? "slow" : "fast"))
+			goto skip_print;
+	}
+
+	ret = print_ht(&rhlt);
+	WARN(ret != cnt, "missing rhltable elements (%d != %d, %s)\n", ret, cnt, slow? "slow" : "fast");
+
+skip_print:
+	rhltable_destroy(&rhlt);
+
+	return 0;
+}
+
+static int __init test_insert_duplicates_run(void)
+{
+	struct test_obj_rhl rhl_test_objects[3] = {};
+
+	pr_info("test inserting duplicates\n");
+
+	/* two different values that map to same bucket */
+	rhl_test_objects[0].value.id = 1;
+	rhl_test_objects[1].value.id = 21;
+
+	/* and another duplicate with same as [0] value
+	 * which will be second on the bucket list */
+	rhl_test_objects[2].value.id = rhl_test_objects[0].value.id;
+
+	test_insert_dup(rhl_test_objects, 2, false);
+	test_insert_dup(rhl_test_objects, 3, false);
+	test_insert_dup(rhl_test_objects, 2, true);
+	test_insert_dup(rhl_test_objects, 3, true);
+
+	return 0;
+}
+
 static int thread_lookup_test(struct thread_data *tdata)
 {
 	unsigned int entries = tdata->entries;
@@ -613,6 +745,8 @@ static int __init test_rht_init(void)
 	do_div(total_time, runs);
 	pr_info("Average test time: %llu\n", total_time);
 
+	test_insert_duplicates_run();
+
 	if (!tcount)
 		return 0;
 
-- 
cgit v1.2.3


From 0862ca422b79cb5aa70823ee0f07f6b468f86070 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 9 Mar 2018 15:50:59 -0800
Subject: bug: use %pB in BUG and stack protector failure

The BUG and stack protector reports were still using a raw %p.  This
changes it to %pB for more meaningful output.

Link: http://lkml.kernel.org/r/20180301225704.GA34198@beast
Fixes: ad67b74d2469 ("printk: hash addresses printed with %p")
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Richard Weinberger <richard.weinberger@gmail.com>,
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/bug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/bug.c b/lib/bug.c
index c1b0fad31b10..44f432cb064d 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -191,7 +191,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 	if (file)
 		pr_crit("kernel BUG at %s:%u!\n", file, line);
 	else
-		pr_crit("Kernel BUG at %p [verbose debug info unavailable]\n",
+		pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n",
 			(void *)bugaddr);
 
 	return BUG_TRAP_TYPE_BUG;
-- 
cgit v1.2.3


From 1b4cfe3c0a30dde968fb43c577a8d7e262a145ee Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 9 Mar 2018 15:51:02 -0800
Subject: lib/bug.c: exclude non-BUG/WARN exceptions from report_bug()

Commit b8347c219649 ("x86/debug: Handle warnings before the notifier
chain, to fix KGDB crash") changed the ordering of fixups, and did not
take into account the case of x86 processing non-WARN() and non-BUG()
exceptions.  This would lead to output of a false BUG line with no other
information.

In the case of a refcount exception, it would be immediately followed by
the refcount WARN(), producing very strange double-"cut here":

  lkdtm: attempting bad refcount_inc() overflow
  ------------[ cut here ]------------
  Kernel BUG at 0000000065f29de5 [verbose debug info unavailable]
  ------------[ cut here ]------------
  refcount_t overflow at lkdtm_REFCOUNT_INC_OVERFLOW+0x6b/0x90 in cat[3065], uid/euid: 0/0
  WARNING: CPU: 0 PID: 3065 at kernel/panic.c:657 refcount_error_report+0x9a/0xa4
  ...

In the prior ordering, exceptions were searched first:

   do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
   ...
                if (fixup_exception(regs, trapnr))
                        return 0;

  -               if (fixup_bug(regs, trapnr))
  -                       return 0;
  -

As a result, fixup_bugs()'s is_valid_bugaddr() didn't take into account
needing to search the exception list first, since that had already
happened.

So, instead of searching the exception list twice (once in
is_valid_bugaddr() and then again in fixup_exception()), just add a
simple sanity check to report_bug() that will immediately bail out if a
BUG() (or WARN()) entry is not found.

Link: http://lkml.kernel.org/r/20180301225934.GA34350@beast
Fixes: b8347c219649 ("x86/debug: Handle warnings before the notifier chain, to fix KGDB crash")
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Richard Weinberger <richard.weinberger@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/bug.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'lib')

diff --git a/lib/bug.c b/lib/bug.c
index 44f432cb064d..1077366f496b 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -150,6 +150,8 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 		return BUG_TRAP_TYPE_NONE;
 
 	bug = find_bug(bugaddr);
+	if (!bug)
+		return BUG_TRAP_TYPE_NONE;
 
 	file = NULL;
 	line = 0;
-- 
cgit v1.2.3


From ac68b1b3b9c73e652dc7ce0585672e23c5a2dca4 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Fri, 9 Mar 2018 15:51:20 -0800
Subject: lib/test_kmod.c: fix limit check on number of test devices created

As reported by Dan the parentheses is in the wrong place, and since
unlikely() call returns either 0 or 1 it's never less than zero.  The
second issue is that signed integer overflows like "INT_MAX + 1" are
undefined behavior.

Since num_test_devs represents the number of devices, we want to stop
prior to hitting the max, and not rely on the wrap arround at all.  So
just cap at num_test_devs + 1, prior to assigning a new device.

Link: http://lkml.kernel.org/r/20180224030046.24238-1-mcgrof@kernel.org
Fixes: d9c6a72d6fa2 ("kmod: add test driver to stress test the module loader")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_kmod.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/test_kmod.c b/lib/test_kmod.c
index e372b97eee13..0e5b7a61460b 100644
--- a/lib/test_kmod.c
+++ b/lib/test_kmod.c
@@ -1141,7 +1141,7 @@ static struct kmod_test_device *register_test_dev_kmod(void)
 	mutex_lock(&reg_dev_mutex);
 
 	/* int should suffice for number of devices, test for wrap */
-	if (unlikely(num_test_devs + 1) < 0) {
+	if (num_test_devs + 1 == INT_MAX) {
 		pr_err("reached limit of number of test devices\n");
 		goto out;
 	}
-- 
cgit v1.2.3


From 8df3aaaf9b5f8bdfc4036695fa22f35b45b4d92f Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@purestorage.com>
Date: Tue, 13 Mar 2018 11:36:49 -0700
Subject: btree: avoid variable-length allocations

geo->keylen cannot be larger than 4.  So we might as well make
fixed-size allocations.

Given the one remaining user, geo->keylen cannot even be larger than 1.
Logfs used to have 64bit and 128bit keys, tcm_qla2xxx only has 32bit
keys.  But let's not break the code if we don't have to.

Signed-off-by: Joern Engel <joern@purestorage.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/btree.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'lib')

diff --git a/lib/btree.c b/lib/btree.c
index f93a945274af..590facba2c50 100644
--- a/lib/btree.c
+++ b/lib/btree.c
@@ -3,7 +3,7 @@
  *
  * As should be obvious for Linux kernel code, license is GPLv2
  *
- * Copyright (c) 2007-2008 Joern Engel <joern@logfs.org>
+ * Copyright (c) 2007-2008 Joern Engel <joern@purestorage.com>
  * Bits and pieces stolen from Peter Zijlstra's code, which is
  * Copyright 2007, Red Hat Inc. Peter Zijlstra
  * GPLv2
@@ -76,6 +76,8 @@ struct btree_geo btree_geo128 = {
 };
 EXPORT_SYMBOL_GPL(btree_geo128);
 
+#define MAX_KEYLEN	(2 * LONG_PER_U64)
+
 static struct kmem_cache *btree_cachep;
 
 void *btree_alloc(gfp_t gfp_mask, void *pool_data)
@@ -313,7 +315,7 @@ void *btree_get_prev(struct btree_head *head, struct btree_geo *geo,
 {
 	int i, height;
 	unsigned long *node, *oldnode;
-	unsigned long *retry_key = NULL, key[geo->keylen];
+	unsigned long *retry_key = NULL, key[MAX_KEYLEN];
 
 	if (keyzero(geo, __key))
 		return NULL;
@@ -639,8 +641,8 @@ EXPORT_SYMBOL_GPL(btree_remove);
 int btree_merge(struct btree_head *target, struct btree_head *victim,
 		struct btree_geo *geo, gfp_t gfp)
 {
-	unsigned long key[geo->keylen];
-	unsigned long dup[geo->keylen];
+	unsigned long key[MAX_KEYLEN];
+	unsigned long dup[MAX_KEYLEN];
 	void *val;
 	int err;
 
-- 
cgit v1.2.3


From b3a5d111994450909158929560906f2c1c6c1d85 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 14 Mar 2018 12:45:12 -0700
Subject: percpu_ref: Update doc to dissuade users from depending on internal
 RCU grace periods

percpu_ref internally uses sched-RCU to implement the percpu -> atomic
mode switching and the documentation suggested that this could be
depended upon.  This doesn't seem like a good idea.

* percpu_ref uses sched-RCU which has different grace periods regular
  RCU.  Users may combine percpu_ref with regular RCU usage and
  incorrectly believe that regular RCU grace periods are performed by
  percpu_ref.  This can lead to, for example, use-after-free due to
  premature freeing.

* percpu_ref has a grace period when switching from percpu to atomic
  mode.  It doesn't have one between the last put and release.  This
  distinction is subtle and can lead to surprising bugs.

* percpu_ref allows starting in and switching to atomic mode manually
  for debugging and other purposes.  This means that there may not be
  any grace periods from kill to release.

This patch makes it clear that the grace periods are percpu_ref's
internal implementation detail and can't be depended upon by the
users.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 lib/percpu-refcount.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'lib')

diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 30e7dd88148b..9f96fa7bc000 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -322,6 +322,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);
  * This function normally doesn't block and can be called from any context
  * but it may block if @confirm_kill is specified and @ref is in the
  * process of switching to atomic mode by percpu_ref_switch_to_atomic().
+ *
+ * There are no implied RCU grace periods between kill and release.
  */
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_kill)
-- 
cgit v1.2.3


From 52fda36d63bfc8c8e8ae5eda8eb5ac6f52cd67ed Mon Sep 17 00:00:00 2001
From: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Date: Tue, 20 Mar 2018 09:58:51 -0300
Subject: test_bpf: Fix testing with CONFIG_BPF_JIT_ALWAYS_ON=y on other arches

Function bpf_fill_maxinsns11 is designed to not be able to be JITed on
x86_64. So, it fails when CONFIG_BPF_JIT_ALWAYS_ON=y, and
commit 09584b406742 ("bpf: fix selftests/bpf test_kmod.sh failure when
CONFIG_BPF_JIT_ALWAYS_ON=y") makes sure that failure is detected on that
case.

However, it does not fail on other architectures, which have a different
JIT compiler design. So, test_bpf has started to fail to load on those.

After this fix, test_bpf loads fine on both x86_64 and ppc64el.

Fixes: 09584b406742 ("bpf: fix selftests/bpf test_kmod.sh failure when CONFIG_BPF_JIT_ALWAYS_ON=y")
Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Reviewed-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 lib/test_bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 2efb213716fa..3e9335493fe4 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -5467,7 +5467,7 @@ static struct bpf_test tests[] = {
 	{
 		"BPF_MAXINSNS: Jump, gap, jump, ...",
 		{ },
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_X86)
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 #else
 		CLASSIC | FLAG_NO_DATA,
-- 
cgit v1.2.3


From b6bdb7517c3d3f41f20e5c2948d6bc3f8897394e Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Thu, 22 Mar 2018 16:17:20 -0700
Subject: mm/vmalloc: add interfaces to free unmapped page table

On architectures with CONFIG_HAVE_ARCH_HUGE_VMAP set, ioremap() may
create pud/pmd mappings.  A kernel panic was observed on arm64 systems
with Cortex-A75 in the following steps as described by Hanjun Guo.

 1. ioremap a 4K size, valid page table will build,
 2. iounmap it, pte0 will set to 0;
 3. ioremap the same address with 2M size, pgd/pmd is unchanged,
    then set the a new value for pmd;
 4. pte0 is leaked;
 5. CPU may meet exception because the old pmd is still in TLB,
    which will lead to kernel panic.

This panic is not reproducible on x86.  INVLPG, called from iounmap,
purges all levels of entries associated with purged address on x86.  x86
still has memory leak.

The patch changes the ioremap path to free unmapped page table(s) since
doing so in the unmap path has the following issues:

 - The iounmap() path is shared with vunmap(). Since vmap() only
   supports pte mappings, making vunmap() to free a pte page is an
   overhead for regular vmap users as they do not need a pte page freed
   up.

 - Checking if all entries in a pte page are cleared in the unmap path
   is racy, and serializing this check is expensive.

 - The unmap path calls free_vmap_area_noflush() to do lazy TLB purges.
   Clearing a pud/pmd entry before the lazy TLB purges needs extra TLB
   purge.

Add two interfaces, pud_free_pmd_page() and pmd_free_pte_page(), which
clear a given pud/pmd entry and free up a page for the lower level
entries.

This patch implements their stub functions on x86 and arm64, which work
as workaround.

[akpm@linux-foundation.org: fix typo in pmd_free_pte_page() stub]
Link: http://lkml.kernel.org/r/20180314180155.19492-2-toshi.kani@hpe.com
Fixes: e61ce6ade404e ("mm: change ioremap to set up huge I/O mappings")
Reported-by: Lei Li <lious.lilei@hisilicon.com>
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Wang Xuefeng <wxf.wang@hisilicon.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/ioremap.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/ioremap.c b/lib/ioremap.c
index b808a390e4c3..54e5bbaa3200 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -91,7 +91,8 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 
 		if (ioremap_pmd_enabled() &&
 		    ((next - addr) == PMD_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
+		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
+		    pmd_free_pte_page(pmd)) {
 			if (pmd_set_huge(pmd, phys_addr + addr, prot))
 				continue;
 		}
@@ -117,7 +118,8 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 
 		if (ioremap_pud_enabled() &&
 		    ((next - addr) == PUD_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
+		    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
+		    pud_free_pmd_page(pud)) {
 			if (pud_set_huge(pud, phys_addr + addr, prot))
 				continue;
 		}
-- 
cgit v1.2.3