From accb3e424b8b6b9fa51b77fcf7e6673f40d182a6 Mon Sep 17 00:00:00 2001
From: Misono Tomohiro <misono.tomohiro@jp.fujitsu.com>
Date: Tue, 31 Jul 2018 16:20:21 +0900
Subject: btrfs: replace: Reset on-disk dev stats value after replace

[ Upstream commit 1e7e1f9e3aba00c9b9c323bfeeddafe69ff21ff6 ]

on-disk devs stats value is updated in btrfs_run_dev_stats(),
which is called during commit transaction, if device->dev_stats_ccnt
is not zero.

Since current replace operation does not touch dev_stats_ccnt,
on-disk dev stats value is not updated. Therefore "btrfs device stats"
may return old device's value after umount/mount
(Example: See "btrfs ins dump-t -t DEV $DEV" after btrfs/100 finish).

Fix this by just incrementing dev_stats_ccnt in
btrfs_dev_replace_finishing() when replace is succeeded and this will
update the values.

Signed-off-by: Misono Tomohiro <misono.tomohiro@jp.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/dev-replace.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1e668fb7dd4c..176a27bc63aa 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -573,6 +573,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 
 	btrfs_rm_dev_replace_unblocked(fs_info);
 
+	/*
+	 * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
+	 * update on-disk dev stats value during commit transaction
+	 */
+	atomic_inc(&tgt_device->dev_stats_ccnt);
+
 	/*
 	 * this is again a consistent state where no dev_replace procedure
 	 * is running, the target device is part of the filesystem, the
-- 
cgit v1.2.3


From 510825b3f8c1f5dc29b81660e1eb68e7fb0b8d50 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 3 Jul 2018 17:10:07 +0800
Subject: btrfs: relocation: Only remove reloc rb_trees if reloc control has
 been initialized

[ Upstream commit 389305b2aa68723c754f88d9dbd268a400e10664 ]

Invalid reloc tree can cause kernel NULL pointer dereference when btrfs
does some cleanup of the reloc roots.

It turns out that fs_info::reloc_ctl can be NULL in
btrfs_recover_relocation() as we allocate relocation control after all
reloc roots have been verified.
So when we hit: note, we haven't called set_reloc_control() thus
fs_info::reloc_ctl is still NULL.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=199833
Reported-by: Xu Wen <wen.xu@gatech.edu>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Tested-by: Gu Jinxiang <gujx@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/relocation.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 9ebe027cc4b7..cfe913d2d3df 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1318,18 +1318,19 @@ static void __del_reloc_root(struct btrfs_root *root)
 	struct mapping_node *node = NULL;
 	struct reloc_control *rc = root->fs_info->reloc_ctl;
 
-	spin_lock(&rc->reloc_root_tree.lock);
-	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
-			      root->node->start);
-	if (rb_node) {
-		node = rb_entry(rb_node, struct mapping_node, rb_node);
-		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+	if (rc) {
+		spin_lock(&rc->reloc_root_tree.lock);
+		rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+				      root->node->start);
+		if (rb_node) {
+			node = rb_entry(rb_node, struct mapping_node, rb_node);
+			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+		}
+		spin_unlock(&rc->reloc_root_tree.lock);
+		if (!node)
+			return;
+		BUG_ON((struct btrfs_root *)node->data != root);
 	}
-	spin_unlock(&rc->reloc_root_tree.lock);
-
-	if (!node)
-		return;
-	BUG_ON((struct btrfs_root *)node->data != root);
 
 	spin_lock(&root->fs_info->trans_lock);
 	list_del_init(&root->root_list);
-- 
cgit v1.2.3


From 02e48c4d57ccaa89ad1d9fbf39ae6a4bb20aa4e5 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 22 Jun 2018 12:35:00 +0800
Subject: btrfs: Don't remove block group that still has pinned down bytes

[ Upstream commit 43794446548730ac8461be30bbe47d5d027d1d16 ]

[BUG]
Under certain KVM load and LTP tests, it is possible to hit the
following calltrace if quota is enabled:

BTRFS critical (device vda2): unable to find logical 8820195328 length 4096
BTRFS critical (device vda2): unable to find logical 8820195328 length 4096

WARNING: CPU: 0 PID: 49 at ../block/blk-core.c:172 blk_status_to_errno+0x1a/0x30
CPU: 0 PID: 49 Comm: kworker/u2:1 Not tainted 4.12.14-15-default #1 SLE15 (unreleased)
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
task: ffff9f827b340bc0 task.stack: ffffb4f8c0304000
RIP: 0010:blk_status_to_errno+0x1a/0x30
Call Trace:
 submit_extent_page+0x191/0x270 [btrfs]
 ? btrfs_create_repair_bio+0x130/0x130 [btrfs]
 __do_readpage+0x2d2/0x810 [btrfs]
 ? btrfs_create_repair_bio+0x130/0x130 [btrfs]
 ? run_one_async_done+0xc0/0xc0 [btrfs]
 __extent_read_full_page+0xe7/0x100 [btrfs]
 ? run_one_async_done+0xc0/0xc0 [btrfs]
 read_extent_buffer_pages+0x1ab/0x2d0 [btrfs]
 ? run_one_async_done+0xc0/0xc0 [btrfs]
 btree_read_extent_buffer_pages+0x94/0xf0 [btrfs]
 read_tree_block+0x31/0x60 [btrfs]
 read_block_for_search.isra.35+0xf0/0x2e0 [btrfs]
 btrfs_search_slot+0x46b/0xa00 [btrfs]
 ? kmem_cache_alloc+0x1a8/0x510
 ? btrfs_get_token_32+0x5b/0x120 [btrfs]
 find_parent_nodes+0x11d/0xeb0 [btrfs]
 ? leaf_space_used+0xb8/0xd0 [btrfs]
 ? btrfs_leaf_free_space+0x49/0x90 [btrfs]
 ? btrfs_find_all_roots_safe+0x93/0x100 [btrfs]
 btrfs_find_all_roots_safe+0x93/0x100 [btrfs]
 btrfs_find_all_roots+0x45/0x60 [btrfs]
 btrfs_qgroup_trace_extent_post+0x20/0x40 [btrfs]
 btrfs_add_delayed_data_ref+0x1a3/0x1d0 [btrfs]
 btrfs_alloc_reserved_file_extent+0x38/0x40 [btrfs]
 insert_reserved_file_extent.constprop.71+0x289/0x2e0 [btrfs]
 btrfs_finish_ordered_io+0x2f4/0x7f0 [btrfs]
 ? pick_next_task_fair+0x2cd/0x530
 ? __switch_to+0x92/0x4b0
 btrfs_worker_helper+0x81/0x300 [btrfs]
 process_one_work+0x1da/0x3f0
 worker_thread+0x2b/0x3f0
 ? process_one_work+0x3f0/0x3f0
 kthread+0x11a/0x130
 ? kthread_create_on_node+0x40/0x40
 ret_from_fork+0x35/0x40

BTRFS critical (device vda2): unable to find logical 8820195328 length 16384
BTRFS: error (device vda2) in btrfs_finish_ordered_io:3023: errno=-5 IO failure
BTRFS info (device vda2): forced readonly
BTRFS error (device vda2): pending csums is 2887680

[CAUSE]
It's caused by race with block group auto removal:

- There is a meta block group X, which has only one tree block
  The tree block belongs to fs tree 257.
- In current transaction, some operation modified fs tree 257
  The tree block gets COWed, so the block group X is empty, and marked
  as unused, queued to be deleted.
- Some workload (like fsync) wakes up cleaner_kthread()
  Which will call btrfs_delete_unused_bgs() to remove unused block
  groups.
  So block group X along its chunk map get removed.
- Some delalloc work finished for fs tree 257
  Quota needs to get the original reference of the extent, which will
  read tree blocks of commit root of 257.
  Then since the chunk map gets removed, the above warning gets
  triggered.

[FIX]
Just let btrfs_delete_unused_bgs() skip block group which still has
pinned bytes.

However there is a minor side effect: currently we only queue empty
blocks at update_block_group(), and such empty block group with pinned
bytes won't go through update_block_group() again, such block group
won't be removed, until it gets new extent allocated and removed.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 493c7354ec0b..a72f941ca750 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10410,7 +10410,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		/* Don't want to race with allocators so take the groups_sem */
 		down_write(&space_info->groups_sem);
 		spin_lock(&block_group->lock);
-		if (block_group->reserved ||
+		if (block_group->reserved || block_group->pinned ||
 		    btrfs_block_group_used(&block_group->item) ||
 		    block_group->ro ||
 		    list_is_singular(&block_group->list)) {
-- 
cgit v1.2.3


From a632d2d1849f9370d87e397319304f3787b5d05b Mon Sep 17 00:00:00 2001
From: Ethan Lien <ethanlien@synology.com>
Date: Mon, 2 Jul 2018 15:44:58 +0800
Subject: btrfs: use correct compare function of dirty_metadata_bytes

commit d814a49198eafa6163698bdd93961302f3a877a4 upstream.

We use customized, nodesize batch value to update dirty_metadata_bytes.
We should also use batch version of compare function or we will easily
goto fast path and get false result from percpu_counter_compare().

Fixes: e2d845211eda ("Btrfs: use percpu counter for dirty metadata count")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Ethan Lien <ethanlien@synology.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
nb: Rebased on 4.4.y ]
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/disk-io.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d106b981d86f..ae6e3a30e61e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1011,8 +1011,9 @@ static int btree_writepages(struct address_space *mapping,
 
 		fs_info = BTRFS_I(mapping->host)->root->fs_info;
 		/* this is a bit racy, but that's ok */
-		ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
-					     BTRFS_DIRTY_METADATA_THRESH);
+		ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+					     BTRFS_DIRTY_METADATA_THRESH,
+					     fs_info->dirty_metadata_batch);
 		if (ret < 0)
 			return 0;
 	}
@@ -3987,8 +3988,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
 	if (flush_delayed)
 		btrfs_balance_delayed_items(root);
 
-	ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
-				     BTRFS_DIRTY_METADATA_THRESH);
+	ret = __percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+				     BTRFS_DIRTY_METADATA_THRESH,
+				     root->fs_info->dirty_metadata_batch);
 	if (ret > 0) {
 		balance_dirty_pages_ratelimited(
 				   root->fs_info->btree_inode->i_mapping);
-- 
cgit v1.2.3