From d04c6b88320debb403bff8d8b634a1efa48b8d3d Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 15 Jun 2015 09:41:14 -0400
Subject: btrfs: make btrfs_issue_discard return bytes discarded

Initially this will just be the length argument passed to it,
but the following patches will adjust that to reflect re-alignment
and skipped blocks.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 07204bf601ed..16655bb5f293 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1883,10 +1883,17 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int btrfs_issue_discard(struct block_device *bdev,
-				u64 start, u64 len)
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+			       u64 *discarded_bytes)
 {
-	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+	int ret = 0;
+
+	*discarded_bytes = 0;
+	ret = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+	if (!ret)
+		*discarded_bytes = len;
+
+	return ret;
 }
 
 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1907,14 +1914,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 
 
 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+			u64 bytes;
 			if (!stripe->dev->can_discard)
 				continue;
 
 			ret = btrfs_issue_discard(stripe->dev->bdev,
 						  stripe->physical,
-						  stripe->length);
+						  stripe->length,
+						  &bytes);
 			if (!ret)
-				discarded_bytes += stripe->length;
+				discarded_bytes += bytes;
 			else if (ret != -EOPNOTSUPP)
 				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
 
-- 
cgit v1.2.3


From 4d89d377bbb0e34cd1571b57a984c2326cab69b5 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 15 Jun 2015 09:41:15 -0400
Subject: btrfs: btrfs_issue_discard ensure offset/length are aligned to sector
 boundaries

It's possible, though unexpected, to pass unaligned offsets and lengths
to btrfs_issue_discard.  We then shift the offset/length values to sector
units.  If an unaligned offset has been passed, it will result in the
entire sector being discarded, possibly losing data.  An unaligned
length is safe but we'll end up returning an inaccurate number of
discarded bytes.

This patch aligns the offset to the 512B boundary, adjusts the length,
and warns, since we shouldn't be discarding on an offset that isn't
aligned with our sector size.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 16655bb5f293..7aa6ad1f014d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1887,12 +1887,21 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
 			       u64 *discarded_bytes)
 {
 	int ret = 0;
+	u64 aligned_start = ALIGN(start, 1 << 9);
 
-	*discarded_bytes = 0;
-	ret = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
-	if (!ret)
-		*discarded_bytes = len;
+	if (WARN_ON(start != aligned_start)) {
+		len -= aligned_start - start;
+		len = round_down(len, 1 << 9);
+		start = aligned_start;
+	}
 
+	*discarded_bytes = 0;
+	if (len) {
+		ret = blkdev_issue_discard(bdev, start >> 9, len >> 9,
+					   GFP_NOFS, 0);
+		if (!ret)
+			*discarded_bytes = len;
+	}
 	return ret;
 }
 
-- 
cgit v1.2.3


From 86557861dfe4f8defde0df40620b97cc60285aa4 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 15 Jun 2015 09:41:16 -0400
Subject: btrfs: skip superblocks during discard

Btrfs doesn't track superblocks with extent records so there is nothing
persistent on-disk to indicate that those blocks are in use.  We track
the superblocks in memory to ensure they don't get used by removing them
from the free space cache when we load a block group from disk.  Prior
to 47ab2a6c6a (Btrfs: remove empty block groups automatically), that
was fine since the block group would never be reclaimed so the superblock
was always safe.  Once we started removing the empty block groups, we
were protected by the fact that discards weren't being properly issued
for unused space either via FITRIM or -odiscard.  The block groups were
still being released, but the blocks remained on disk.

In order to properly discard unused block groups, we need to filter out
the superblocks from the discard range.  Superblocks are located at fixed
locations on each device, so it makes sense to filter them out in
btrfs_issue_discard, which is used by both -odiscard and FITRIM.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 55 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7aa6ad1f014d..d763457b3cce 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1883,10 +1883,12 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
 			       u64 *discarded_bytes)
 {
-	int ret = 0;
+	int j, ret = 0;
+	u64 bytes_left, end;
 	u64 aligned_start = ALIGN(start, 1 << 9);
 
 	if (WARN_ON(start != aligned_start)) {
@@ -1896,11 +1898,60 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
 	}
 
 	*discarded_bytes = 0;
-	if (len) {
-		ret = blkdev_issue_discard(bdev, start >> 9, len >> 9,
+
+	if (!len)
+		return 0;
+
+	end = start + len;
+	bytes_left = len;
+
+	/* Skip any superblocks on this device. */
+	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+		u64 sb_start = btrfs_sb_offset(j);
+		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
+		u64 size = sb_start - start;
+
+		if (!in_range(sb_start, start, bytes_left) &&
+		    !in_range(sb_end, start, bytes_left) &&
+		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
+			continue;
+
+		/*
+		 * Superblock spans beginning of range.  Adjust start and
+		 * try again.
+		 */
+		if (sb_start <= start) {
+			start += sb_end - start;
+			if (start > end) {
+				bytes_left = 0;
+				break;
+			}
+			bytes_left = end - start;
+			continue;
+		}
+
+		if (size) {
+			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+						   GFP_NOFS, 0);
+			if (!ret)
+				*discarded_bytes += size;
+			else if (ret != -EOPNOTSUPP)
+				return ret;
+		}
+
+		start = sb_end;
+		if (start > end) {
+			bytes_left = 0;
+			break;
+		}
+		bytes_left = end - start;
+	}
+
+	if (bytes_left) {
+		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
 					   GFP_NOFS, 0);
 		if (!ret)
-			*discarded_bytes = len;
+			*discarded_bytes += bytes_left;
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From 499f377f49f085ee4aa214c738e948e88626f39b Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 15 Jun 2015 09:41:17 -0400
Subject: btrfs: iterate over unused chunk space in FITRIM

Since we now clean up block groups automatically as they become
empty, iterating over block groups is no longer sufficient to discard
unused space.

This patch iterates over the unused chunk space and discards any regions
that are unallocated, regardless of whether they were ever used.  This is
a change for btrfs but is consistent with other file systems.

We do this in a transactionless manner since the discard process can take
a substantial amount of time and a transaction would need to be started
before the acquisition of the device list lock.  That would mean a
transaction would be held open across /all/ of the discards collectively.
In order to prevent other threads from allocating or freeing chunks, we
hold the chunks lock across the search and discard calls.  We release it
between searches to allow the file system to perform more-or-less
normally.  Since the running transaction can commit and disappear while
we're using the transaction pointer, we take a reference to it and
release it after the search.  This is safe since it would happen normally
at the end of the transaction commit after any locks are released anyway.
We also take the commit_root_sem to protect against a transaction starting
and committing while we're running.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.c     |  63 ++++++++++++++++++------------
 fs/btrfs/volumes.h     |   3 ++
 3 files changed, 143 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d763457b3cce..15411aefbfa0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10135,10 +10135,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 	return unpin_extent_range(root, start, end, false);
 }
 
+/*
+ * It used to be that old block groups would be left around forever.
+ * Iterating over them would be enough to trim unused space.  Since we
+ * now automatically remove them, we also need to iterate over unallocated
+ * space.
+ *
+ * We don't want a transaction for this since the discard may take a
+ * substantial amount of time.  We don't require that a transaction be
+ * running, but we do need to take a running transaction into account
+ * to ensure that we're not discarding chunks that were released in
+ * the current transaction.
+ *
+ * Holding the chunks lock will prevent other threads from allocating
+ * or releasing chunks, but it won't prevent a running transaction
+ * from committing and releasing the memory that the pending chunks
+ * list head uses.  For that, we need to take a reference to the
+ * transaction.
+ */
+static int btrfs_trim_free_extents(struct btrfs_device *device,
+				   u64 minlen, u64 *trimmed)
+{
+	u64 start = 0, len = 0;
+	int ret;
+
+	*trimmed = 0;
+
+	/* Not writeable = nothing to do. */
+	if (!device->writeable)
+		return 0;
+
+	/* No free space = nothing to do. */
+	if (device->total_bytes <= device->bytes_used)
+		return 0;
+
+	ret = 0;
+
+	while (1) {
+		struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+		struct btrfs_transaction *trans;
+		u64 bytes;
+
+		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+		if (ret)
+			return ret;
+
+		down_read(&fs_info->commit_root_sem);
+
+		spin_lock(&fs_info->trans_lock);
+		trans = fs_info->running_transaction;
+		if (trans)
+			atomic_inc(&trans->use_count);
+		spin_unlock(&fs_info->trans_lock);
+
+		ret = find_free_dev_extent_start(trans, device, minlen, start,
+						 &start, &len);
+		if (trans)
+			btrfs_put_transaction(trans);
+
+		if (ret) {
+			up_read(&fs_info->commit_root_sem);
+			mutex_unlock(&fs_info->chunk_mutex);
+			if (ret == -ENOSPC)
+				ret = 0;
+			break;
+		}
+
+		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+		up_read(&fs_info->commit_root_sem);
+		mutex_unlock(&fs_info->chunk_mutex);
+
+		if (ret)
+			break;
+
+		start += len;
+		*trimmed += bytes;
+
+		if (fatal_signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		cond_resched();
+	}
+
+	return ret;
+}
+
 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_group_cache *cache = NULL;
+	struct btrfs_device *device;
+	struct list_head *devices;
 	u64 group_trimmed;
 	u64 start;
 	u64 end;
@@ -10193,6 +10282,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 		cache = next_block_group(fs_info->tree_root, cache);
 	}
 
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	devices = &root->fs_info->fs_devices->alloc_list;
+	list_for_each_entry(device, devices, dev_alloc_list) {
+		ret = btrfs_trim_free_extents(device, range->minlen,
+					      &group_trimmed);
+		if (ret)
+			break;
+
+		trimmed += group_trimmed;
+	}
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
 	range->len = trimmed;
 	return ret;
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9b95503ddd00..141c6051cf58 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1116,15 +1116,18 @@ out:
 	return ret;
 }
 
-static int contains_pending_extent(struct btrfs_trans_handle *trans,
+static int contains_pending_extent(struct btrfs_transaction *transaction,
 				   struct btrfs_device *device,
 				   u64 *start, u64 len)
 {
+	struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
 	struct extent_map *em;
-	struct list_head *search_list = &trans->transaction->pending_chunks;
+	struct list_head *search_list = &fs_info->pinned_chunks;
 	int ret = 0;
 	u64 physical_start = *start;
 
+	if (transaction)
+		search_list = &transaction->pending_chunks;
 again:
 	list_for_each_entry(em, search_list, list) {
 		struct map_lookup *map;
@@ -1159,8 +1162,8 @@ again:
 			}
 		}
 	}
-	if (search_list == &trans->transaction->pending_chunks) {
-		search_list = &trans->root->fs_info->pinned_chunks;
+	if (search_list != &fs_info->pinned_chunks) {
+		search_list = &fs_info->pinned_chunks;
 		goto again;
 	}
 
@@ -1169,12 +1172,13 @@ again:
 
 
 /*
- * find_free_dev_extent - find free space in the specified device
- * @device:	the device which we search the free space in
- * @num_bytes:	the size of the free space that we need
- * @start:	store the start of the free space.
- * @len:	the size of the free space. that we find, or the size of the max
- * 		free space if we don't find suitable free space
+ * find_free_dev_extent_start - find free space in the specified device
+ * @device:	  the device which we search the free space in
+ * @num_bytes:	  the size of the free space that we need
+ * @search_start: the position from which to begin the search
+ * @start:	  store the start of the free space.
+ * @len:	  the size of the free space. that we find, or the size
+ *		  of the max free space if we don't find suitable free space
  *
  * this uses a pretty simple search, the expectation is that it is
  * called very infrequently and that a given device has a small number
@@ -1188,9 +1192,9 @@ again:
  * But if we don't find suitable free space, it is used to store the size of
  * the max free space.
  */
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_device *device, u64 num_bytes,
-			 u64 *start, u64 *len)
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+			       struct btrfs_device *device, u64 num_bytes,
+			       u64 search_start, u64 *start, u64 *len)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = device->dev_root;
@@ -1200,19 +1204,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	u64 max_hole_start;
 	u64 max_hole_size;
 	u64 extent_end;
-	u64 search_start;
 	u64 search_end = device->total_bytes;
 	int ret;
 	int slot;
 	struct extent_buffer *l;
 
-	/* FIXME use last free of some kind */
-
-	/* we don't want to overwrite the superblock on the drive,
-	 * so we make sure to start at an offset of at least 1MB
-	 */
-	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1273,7 +1269,7 @@ again:
 			 * Have to check before we set max_hole_start, otherwise
 			 * we could end up sending back this offset anyway.
 			 */
-			if (contains_pending_extent(trans, device,
+			if (contains_pending_extent(transaction, device,
 						    &search_start,
 						    hole_size)) {
 				if (key.offset >= search_start) {
@@ -1322,7 +1318,7 @@ next:
 	if (search_end > search_start) {
 		hole_size = search_end - search_start;
 
-		if (contains_pending_extent(trans, device, &search_start,
+		if (contains_pending_extent(transaction, device, &search_start,
 					    hole_size)) {
 			btrfs_release_path(path);
 			goto again;
@@ -1348,6 +1344,24 @@ out:
 	return ret;
 }
 
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+			 struct btrfs_device *device, u64 num_bytes,
+			 u64 *start, u64 *len)
+{
+	struct btrfs_root *root = device->dev_root;
+	u64 search_start;
+
+	/* FIXME use last free of some kind */
+
+	/*
+	 * we don't want to overwrite the superblock on the drive,
+	 * so we make sure to start at an offset of at least 1MB
+	 */
+	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+	return find_free_dev_extent_start(trans->transaction, device,
+					  num_bytes, search_start, start, len);
+}
+
 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 			  struct btrfs_device *device,
 			  u64 start, u64 *dev_extent_len)
@@ -4200,7 +4214,8 @@ again:
 		u64 start = new_size;
 		u64 len = old_size - new_size;
 
-		if (contains_pending_extent(trans, device, &start, len)) {
+		if (contains_pending_extent(trans->transaction, device,
+					    &start, len)) {
 			unlock_chunks(root);
 			checked_pending_chunks = true;
 			failed = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 210a64390f40..57b0217b5300 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -455,6 +455,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+			 struct btrfs_device *device, u64 num_bytes,
+			 u64 search_start, u64 *start, u64 *max_avail);
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_device *device, u64 num_bytes,
 			 u64 *start, u64 *max_avail);
-- 
cgit v1.2.3


From e44163e177960ee60e32a73bffdd53c3a5827406 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 15 Jun 2015 09:41:18 -0400
Subject: btrfs: explictly delete unused block groups in close_ctree and
 ro-remount

The cleaner thread may already be sleeping by the time we enter
close_ctree.  If that's the case, we'll skip removing any unused
block groups queued for removal, even during a normal umount.
They'll be cleaned up automatically at next mount, but users
expect a umount to be a clean synchronization point, especially
when used on thin-provisioned storage with -odiscard.  We also
explicitly remove unused block groups in the ro-remount path
for the same reason.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/disk-io.c |  9 +++++++++
 fs/btrfs/super.c   | 11 +++++++++++
 2 files changed, 20 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 84cbbb2d562e..053109ba26b7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3767,6 +3767,15 @@ void close_ctree(struct btrfs_root *root)
 	cancel_work_sync(&fs_info->async_reclaim_work);
 
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+		/*
+		 * If the cleaner thread is stopped and there are
+		 * block groups queued for removal, the deletion will be
+		 * skipped when we quit the cleaner thread.
+		 */
+		mutex_lock(&root->fs_info->cleaner_mutex);
+		btrfs_delete_unused_bgs(root->fs_info);
+		mutex_unlock(&root->fs_info->cleaner_mutex);
+
 		ret = btrfs_commit_super(root);
 		if (ret)
 			btrfs_err(fs_info, "commit super ret %d", ret);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index cd7ef34d2dce..a1077e0ffaa8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1650,6 +1650,17 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 
 		sb->s_flags |= MS_RDONLY;
 
+		/*
+		 * Setting MS_RDONLY will put the cleaner thread to
+		 * sleep at the next loop if it's already active.
+		 * If it's already asleep, we'll leave unused block
+		 * groups on disk until we're mounted read-write again
+		 * unless we clean them up here.
+		 */
+		mutex_lock(&root->fs_info->cleaner_mutex);
+		btrfs_delete_unused_bgs(fs_info);
+		mutex_unlock(&root->fs_info->cleaner_mutex);
+
 		btrfs_dev_replace_suspend_for_unmount(fs_info);
 		btrfs_scrub_cancel(fs_info);
 		btrfs_pause_balance(fs_info);
-- 
cgit v1.2.3


From e33e17ee1098d8d751552ac11c111e1c1a3db014 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 15 Jun 2015 09:41:19 -0400
Subject: btrfs: add missing discards when unpinning extents with -o discard

When we clear the dirty bits in btrfs_delete_unused_bgs for extents
in the empty block group, it results in btrfs_finish_extent_commit being
unable to discard the freed extents.

The block group removal patch added an alternate path to forget extents
other than btrfs_finish_extent_commit.  As a result, any extents that
would be freed when the block group is removed aren't discarded.  In my
test run, with a large copy of mixed sized files followed by removal, it
left nearly 2/3 of extents undiscarded.

To clean up the block groups, we add the removed block group onto a list
that will be discarded after transaction commit.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h            |  3 ++
 fs/btrfs/extent-tree.c      | 68 ++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/free-space-cache.c | 57 +++++++++++++++++++++----------------
 fs/btrfs/super.c            |  2 +-
 fs/btrfs/transaction.c      |  2 ++
 fs/btrfs/transaction.h      |  2 ++
 6 files changed, 105 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aac314e14188..19ef3f306559 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3437,6 +3437,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start,
 			     struct extent_map *em);
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root);
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -4073,6 +4075,7 @@ __cold
 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 		     unsigned int line, int errno, const char *fmt, ...);
 
+const char *btrfs_decode_error(int errno);
 
 __cold
 void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 15411aefbfa0..6b791f394698 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6131,20 +6131,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_group_cache *block_group, *tmp;
+	struct list_head *deleted_bgs;
 	struct extent_io_tree *unpin;
 	u64 start;
 	u64 end;
 	int ret;
 
-	if (trans->aborted)
-		return 0;
-
 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
 		unpin = &fs_info->freed_extents[1];
 	else
 		unpin = &fs_info->freed_extents[0];
 
-	while (1) {
+	while (!trans->aborted) {
 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY, NULL);
@@ -6163,6 +6162,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		cond_resched();
 	}
 
+	/*
+	 * Transaction is finished.  We don't need the lock anymore.  We
+	 * do need to clean up the block groups in case of a transaction
+	 * abort.
+	 */
+	deleted_bgs = &trans->transaction->deleted_bgs;
+	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
+		u64 trimmed = 0;
+
+		ret = -EROFS;
+		if (!trans->aborted)
+			ret = btrfs_discard_extent(root,
+						   block_group->key.objectid,
+						   block_group->key.offset,
+						   &trimmed);
+
+		list_del_init(&block_group->bg_list);
+		btrfs_put_block_group_trimming(block_group);
+		btrfs_put_block_group(block_group);
+
+		if (ret) {
+			const char *errstr = btrfs_decode_error(ret);
+			btrfs_warn(fs_info,
+				   "Discard failed while removing blockgroup: errno=%d %s\n",
+				   ret, errstr);
+		}
+	}
+
 	return 0;
 }
 
@@ -9903,6 +9930,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	 * currently running transaction might finish and a new one start,
 	 * allowing for new block groups to be created that can reuse the same
 	 * physical device locations unless we take this special care.
+	 *
+	 * There may also be an implicit trim operation if the file system
+	 * is mounted with -odiscard. The same protections must remain
+	 * in place until the extents have been discarded completely when
+	 * the transaction commit has completed.
 	 */
 	remove_em = (atomic_read(&block_group->trimming) == 0);
 	/*
@@ -9977,6 +10009,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 	spin_lock(&fs_info->unused_bgs_lock);
 	while (!list_empty(&fs_info->unused_bgs)) {
 		u64 start, end;
+		int trimming;
 
 		block_group = list_first_entry(&fs_info->unused_bgs,
 					       struct btrfs_block_group_cache,
@@ -10076,12 +10109,39 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		spin_unlock(&block_group->lock);
 		spin_unlock(&space_info->lock);
 
+		/* DISCARD can flip during remount */
+		trimming = btrfs_test_opt(root, DISCARD);
+
+		/* Implicit trim during transaction commit. */
+		if (trimming)
+			btrfs_get_block_group_trimming(block_group);
+
 		/*
 		 * Btrfs_remove_chunk will abort the transaction if things go
 		 * horribly wrong.
 		 */
 		ret = btrfs_remove_chunk(trans, root,
 					 block_group->key.objectid);
+
+		if (ret) {
+			if (trimming)
+				btrfs_put_block_group_trimming(block_group);
+			goto end_trans;
+		}
+
+		/*
+		 * If we're not mounted with -odiscard, we can just forget
+		 * about this block group. Otherwise we'll need to wait
+		 * until transaction commit to do the actual discard.
+		 */
+		if (trimming) {
+			WARN_ON(!list_empty(&block_group->bg_list));
+			spin_lock(&trans->transaction->deleted_bgs_lock);
+			list_move(&block_group->bg_list,
+				  &trans->transaction->deleted_bgs);
+			spin_unlock(&trans->transaction->deleted_bgs_lock);
+			btrfs_get_block_group(block_group);
+		}
 end_trans:
 		btrfs_end_transaction(trans, root);
 next:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index fb5a6b1c62a6..abe3a66bd3ba 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3272,35 +3272,23 @@ next:
 	return ret;
 }
 
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
-			   u64 *trimmed, u64 start, u64 end, u64 minlen)
+void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
 {
-	int ret;
+	atomic_inc(&cache->trimming);
+}
 
-	*trimmed = 0;
+void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
+{
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	bool cleanup;
 
 	spin_lock(&block_group->lock);
-	if (block_group->removed) {
-		spin_unlock(&block_group->lock);
-		return 0;
-	}
-	atomic_inc(&block_group->trimming);
+	cleanup = (atomic_dec_and_test(&block_group->trimming) &&
+		   block_group->removed);
 	spin_unlock(&block_group->lock);
 
-	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
-	if (ret)
-		goto out;
-
-	ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
-out:
-	spin_lock(&block_group->lock);
-	if (atomic_dec_and_test(&block_group->trimming) &&
-	    block_group->removed) {
-		struct extent_map_tree *em_tree;
-		struct extent_map *em;
-
-		spin_unlock(&block_group->lock);
-
+	if (cleanup) {
 		lock_chunks(block_group->fs_info->chunk_root);
 		em_tree = &block_group->fs_info->mapping_tree.map_tree;
 		write_lock(&em_tree->lock);
@@ -3324,10 +3312,31 @@ out:
 		 * this block group have left 1 entry each one. Free them.
 		 */
 		__btrfs_remove_free_space_cache(block_group->free_space_ctl);
-	} else {
+	}
+}
+
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+			   u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+	int ret;
+
+	*trimmed = 0;
+
+	spin_lock(&block_group->lock);
+	if (block_group->removed) {
 		spin_unlock(&block_group->lock);
+		return 0;
 	}
+	btrfs_get_block_group_trimming(block_group);
+	spin_unlock(&block_group->lock);
+
+	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+	if (ret)
+		goto out;
 
+	ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+out:
+	btrfs_put_block_group_trimming(block_group);
 	return ret;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a1077e0ffaa8..8da24e242896 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -69,7 +69,7 @@ static struct file_system_type btrfs_fs_type;
 
 static int btrfs_remount(struct super_block *sb, int *flags, char *data);
 
-static const char *btrfs_decode_error(int errno)
+const char *btrfs_decode_error(int errno)
 {
 	char *errstr = "unknown";
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f5021fcb154e..44da9299a25b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -258,6 +258,8 @@ loop:
 	mutex_init(&cur_trans->cache_write_mutex);
 	cur_trans->num_dirty_bgs = 0;
 	spin_lock_init(&cur_trans->dirty_bgs_lock);
+	INIT_LIST_HEAD(&cur_trans->deleted_bgs);
+	spin_lock_init(&cur_trans->deleted_bgs_lock);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
 			     fs_info->btree_inode->i_mapping);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index eb09c2067fa8..edc2fbc262d7 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -74,6 +74,8 @@ struct btrfs_transaction {
 	 */
 	struct mutex cache_write_mutex;
 	spinlock_t dirty_bgs_lock;
+	struct list_head deleted_bgs;
+	spinlock_t deleted_bgs_lock;
 	struct btrfs_delayed_ref_root delayed_refs;
 	int aborted;
 	int dirty_bg_run;
-- 
cgit v1.2.3


From bb53eda9029fd52b466fa501ba4aa58e94789b18 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 15 Jul 2015 23:26:43 +0100
Subject: Btrfs: fix stale directory entries after fsync log replay

We have another case where after an fsync log replay we get an inode with
a wrong link count (smaller than it should be) and a number of directory
entries greater than its link count. This happens when we add a new link
hard link to our inode A and then we fsync some other inode B that has
the side effect of logging the parent directory inode too. In this case
at log replay time we add the new hard link to our inode (the item with
key BTRFS_INODE_REF_KEY) when processing the parent directory but we
never adjust the link count of our inode A. As a result we get stale dir
entries for our inode A that can never be deleted and therefore it makes
it impossible to remove the parent directory (as its i_size can never
decrease back to 0).

A simple reproducer for fstests that triggers this issue:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      _cleanup_flakey
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter
  . ./common/dmflakey

  # real QA test starts here
  _need_to_be_root
  _supported_fs generic
  _supported_os Linux
  _require_scratch
  _require_dm_flakey
  _require_metadata_journaling $SCRATCH_DEV

  rm -f $seqres.full

  _scratch_mkfs >>$seqres.full 2>&1
  _init_flakey
  _mount_flakey

  # Create our test directory and files.
  mkdir $SCRATCH_MNT/testdir
  touch $SCRATCH_MNT/testdir/foo
  touch $SCRATCH_MNT/testdir/bar

  # Make sure everything done so far is durably persisted.
  sync

  # Create one hard link for file foo and another one for file bar. After
  # that fsync only the file bar.
  ln $SCRATCH_MNT/testdir/bar $SCRATCH_MNT/testdir/bar_link
  ln $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/foo_link
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/bar

  # Silently drop all writes on scratch device to simulate power failure.
  _load_flakey_table $FLAKEY_DROP_WRITES
  _unmount_flakey

  # Allow writes again and mount the fs to trigger log/journal replay.
  _load_flakey_table $FLAKEY_ALLOW_WRITES
  _mount_flakey

  # Now verify both our files have a link count of 2.
  echo "Link count for file foo: $(stat --format=%h $SCRATCH_MNT/testdir/foo)"
  echo "Link count for file bar: $(stat --format=%h $SCRATCH_MNT/testdir/bar)"

  # We should be able to remove all the links of our files in testdir, and
  # after that the parent directory should become empty and therefore
  # possible to remove it.
  rm -f $SCRATCH_MNT/testdir/*
  rmdir $SCRATCH_MNT/testdir

  _unmount_flakey

  # The fstests framework will call fsck against our filesystem which will verify
  # that all metadata is in a consistent state.

  status=0
  exit

The test fails with:

 -Link count for file foo: 2
 +Link count for file foo: 1
  Link count for file bar: 2
 +rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/testdir/foo_link': Stale file handle
 +rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/testdir': Directory not empty
 (...)
 _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent

And fsck's output:

  (...)
  checking fs roots
  root 5 inode 258 errors 2001, no inode item, link count wrong
      unresolved ref dir 257 index 5 namelen 8 name foo_link filetype 1 errors 4, no inode ref
  Checking filesystem on /dev/sdc
  (...)

So fix this by marking inodes for link count fixup at log replay time
whenever a directory entry is replayed if the entry was created in the
transaction where the fsync was made and if it points to a non-directory
inode.

This isn't a new problem/regression, the issue exists for a long time,
possibly since the log tree feature was added (2008).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/tree-log.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 60 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c45431e69ab..cb5666e7c3f9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1613,6 +1613,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root,
  * not exist in the FS, it is skipped.  fsyncs on directories
  * do not force down inodes inside that directory, just changes to the
  * names or unlinks in a directory.
+ *
+ * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
+ * non-existing inode) and 1 if the name was replayed.
  */
 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root,
@@ -1631,6 +1634,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	int exists;
 	int ret = 0;
 	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+	bool name_added = false;
 
 	dir = read_one_inode(root, key->objectid);
 	if (!dir)
@@ -1708,6 +1712,8 @@ out:
 	}
 	kfree(name);
 	iput(dir);
+	if (!ret && name_added)
+		ret = 1;
 	return ret;
 
 insert:
@@ -1723,6 +1729,8 @@ insert:
 			      name, name_len, log_type, &log_key);
 	if (ret && ret != -ENOENT && ret != -EEXIST)
 		goto out;
+	if (!ret)
+		name_added = true;
 	update_size = false;
 	ret = 0;
 	goto out;
@@ -1740,12 +1748,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
 					struct extent_buffer *eb, int slot,
 					struct btrfs_key *key)
 {
-	int ret;
+	int ret = 0;
 	u32 item_size = btrfs_item_size_nr(eb, slot);
 	struct btrfs_dir_item *di;
 	int name_len;
 	unsigned long ptr;
 	unsigned long ptr_end;
+	struct btrfs_path *fixup_path = NULL;
 
 	ptr = btrfs_item_ptr_offset(eb, slot);
 	ptr_end = ptr + item_size;
@@ -1755,12 +1764,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
 			return -EIO;
 		name_len = btrfs_dir_name_len(eb, di);
 		ret = replay_one_name(trans, root, path, eb, di, key);
-		if (ret)
-			return ret;
+		if (ret < 0)
+			break;
 		ptr = (unsigned long)(di + 1);
 		ptr += name_len;
+
+		/*
+		 * If this entry refers to a non-directory (directories can not
+		 * have a link count > 1) and it was added in the transaction
+		 * that was not committed, make sure we fixup the link count of
+		 * the inode it the entry points to. Otherwise something like
+		 * the following would result in a directory pointing to an
+		 * inode with a wrong link that does not account for this dir
+		 * entry:
+		 *
+		 * mkdir testdir
+		 * touch testdir/foo
+		 * touch testdir/bar
+		 * sync
+		 *
+		 * ln testdir/bar testdir/bar_link
+		 * ln testdir/foo testdir/foo_link
+		 * xfs_io -c "fsync" testdir/bar
+		 *
+		 * <power failure>
+		 *
+		 * mount fs, log replay happens
+		 *
+		 * File foo would remain with a link count of 1 when it has two
+		 * entries pointing to it in the directory testdir. This would
+		 * make it impossible to ever delete the parent directory has
+		 * it would result in stale dentries that can never be deleted.
+		 */
+		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+			struct btrfs_key di_key;
+
+			if (!fixup_path) {
+				fixup_path = btrfs_alloc_path();
+				if (!fixup_path) {
+					ret = -ENOMEM;
+					break;
+				}
+			}
+
+			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+			ret = link_to_fixup_dir(trans, root, fixup_path,
+						di_key.objectid);
+			if (ret)
+				break;
+		}
+		ret = 0;
 	}
-	return 0;
+	btrfs_free_path(fixup_path);
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3


From bde6c242027b0f1d697d5333950b3a05761d40e4 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 24 Jul 2015 00:00:19 +0100
Subject: Btrfs: fix stale dir entries after unlink, inode eviction and fsync

If we remove a hard link from an inode, the inode gets evicted, then
we fsync the inode and then power fail/crash, when the log tree is
replayed, the parent directory inode still has entries pointing to
the name that no longer exists, while our inode no longer has the
BTRFS_INODE_REF_KEY item matching the deleted hard link (as expected),
leaving the filesystem in an inconsistent state. The stale directory
entries can not be deleted (an attempt to delete them causes -ESTALE
errors), which makes it impossible to delete the parent directory.

This happens because we track the id of the transaction where the last
unlink operation for the inode happened (last_unlink_trans) in an
in-memory only field of the inode, that is, a value that is never
persisted in the inode item stored on the fs/subvol btree. So if an
inode is evicted and loaded again, the value for last_unlink_trans is
set to 0, which prevents the fsync from logging the parent directory
at btrfs_log_inode_parent(). So fix this by setting last_unlink_trans
to the id of the transaction that last modified the inode when we
load the inode. This is a pessimistic approach but it always ensures
correctness with the trade off of ocassional full transaction commits
when an fsync is done against the inode in the same transaction where
it was evicted and reloaded when our inode is a directory and often
logging its parent unnecessarily when our inode is not a directory.

The following test case for fstests triggers the problem:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      _cleanup_flakey
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter
  . ./common/dmflakey

  # real QA test starts here
  _need_to_be_root
  _supported_fs generic
  _supported_os Linux
  _require_scratch
  _require_dm_flakey
  _require_metadata_journaling $SCRATCH_DEV

  rm -f $seqres.full

  _scratch_mkfs >>$seqres.full 2>&1
  _init_flakey
  _mount_flakey

  # Create our test file with 2 hard links.
  mkdir $SCRATCH_MNT/testdir
  touch $SCRATCH_MNT/testdir/foo
  ln $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/bar

  # Make sure everything done so far is durably persisted.
  sync

  # Now remove one of the links, trigger inode eviction and then fsync
  # our inode.
  unlink $SCRATCH_MNT/testdir/bar
  echo 2 > /proc/sys/vm/drop_caches
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/foo

  # Silently drop all writes on our scratch device to simulate a power failure.
  _load_flakey_table $FLAKEY_DROP_WRITES
  _unmount_flakey

  # Allow writes again and mount the fs to trigger log/journal replay.
  _load_flakey_table $FLAKEY_ALLOW_WRITES
  _mount_flakey

  # Now verify our directory entries.
  echo "Entries in testdir:"
  ls -1 $SCRATCH_MNT/testdir

  # If we remove our inode, its parent should become empty and therefore we should
  # be able to remove the parent.
  rm -f $SCRATCH_MNT/testdir/*
  rmdir $SCRATCH_MNT/testdir

  _unmount_flakey

  # The fstests framework will call fsck against our filesystem which will verify
  # that all metadata is in a consistent state.

  status=0
  exit

The test failed on btrfs with:

  generic/098 4s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad)
    --- tests/generic/098.out	2015-07-23 18:01:12.616175932 +0100
    +++ /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad	2015-07-23 18:04:58.924138308 +0100
    @@ -1,3 +1,6 @@
     QA output created by 098
     Entries in testdir:
    +bar
     foo
    +rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/testdir/foo': Stale file handle
    +rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/testdir': Directory not empty
    ...
    (Run 'diff -u tests/generic/098.out /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad'  to see the entire diff)
  _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent (see /home/fdmanana/git/hub/xfstests/results//generic/098.full)

  $ cat /home/fdmanana/git/hub/xfstests/results//generic/098.full
  (...)
  checking fs roots
  root 5 inode 258 errors 2001, no inode item, link count wrong
     unresolved ref dir 257 index 0 namelen 3 name foo filetype 1 errors 6, no dir index, no inode ref
     unresolved ref dir 257 index 3 namelen 3 name bar filetype 1 errors 5, no dir item, no inode ref
  Checking filesystem on /dev/sdc
  (...)

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e33dff356460..79a73645346e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3654,6 +3654,35 @@ cache_index:
 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			&BTRFS_I(inode)->runtime_flags);
 
+	/*
+	 * We don't persist the id of the transaction where an unlink operation
+	 * against the inode was last made. So here we assume the inode might
+	 * have been evicted, and therefore the exact value of last_unlink_trans
+	 * lost, and set it to last_trans to avoid metadata inconsistencies
+	 * between the inode and its parent if the inode is fsync'ed and the log
+	 * replayed. For example, in the scenario:
+	 *
+	 * touch mydir/foo
+	 * ln mydir/foo mydir/bar
+	 * sync
+	 * unlink mydir/bar
+	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
+	 * xfs_io -c fsync mydir/foo
+	 * <power failure>
+	 * mount fs, triggers fsync log replay
+	 *
+	 * We must make sure that when we fsync our inode foo we also log its
+	 * parent inode, otherwise after log replay the parent still has the
+	 * dentry with the "bar" name but our inode foo has a link count of 1
+	 * and doesn't have an inode ref with the name "bar" anymore.
+	 *
+	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
+	 * but it guarantees correctness at the expense of ocassional full
+	 * transaction commits on fsync if our inode is a directory, or if our
+	 * inode is not a directory, logging its parent unnecessarily.
+	 */
+	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+
 	path->slots[0]++;
 	if (inode->i_nlink != 1 ||
 	    path->slots[0] >= btrfs_header_nritems(leaf))
-- 
cgit v1.2.3


From d6589101b67a55107652050dfbf414403a93e351 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 29 Jul 2015 17:21:17 +0100
Subject: Btrfs: teach backref walking about backrefs with underflowed offset
 values

When cloning/deduplicating file extents (through the clone and extent_same
ioctls) we can get data back references with offset values that are a
result of an unsigned integer arithmetic underflow, that is, values that
are much larger then they could be otherwise.

This is not a problem when decrementing or dropping the back references
(happens when we overwrite the extents or punch a hole for example, through
__btrfs_drop_extents()), since we compute the same too large offset value,
but it is a problem for the backref walking code, used by an incremental
send and the ioctls that are used by the btrfs tool "inspect-internal"
commands, as it makes it miss the corresponding file extent items because
the search key is set for an extent item that starts at an offset matching
the exceptionally large offset value of the data back reference. For an
incremental send this causes the send ioctl to fail with -EIO.

So teach the backref walking code to deal with these cases by setting the
search key's offset to 0 if the backref's offset value is larger than
LLONG_MAX (the largest possible file offset). This makes sure the backref
walking code finds the corresponding file extent items at the expense of
scanning more items and leafs in the btree.

Fixing the clone/dedup ioctls to not produce such underflowed results would
require major changes breaking backward compatibility, updating user space
tools, etc.

Simple reproducer case for fstests:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"

  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      rm -fr $send_files_dir
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter

  # real QA test starts here
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _require_cloner
  _need_to_be_root

  send_files_dir=$TEST_DIR/btrfs-test-$seq

  rm -f $seqres.full
  rm -fr $send_files_dir
  mkdir $send_files_dir

  _scratch_mkfs >>$seqres.full 2>&1
  _scratch_mount

  # Create our test file with a single extent of 64K starting at file
  # offset 128K.
  $XFS_IO_PROG -f -c "pwrite -S 0xaa 128K 64K" $SCRATCH_MNT/foo \
      | _filter_xfs_io

  _run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
      $SCRATCH_MNT/mysnap1

  # Now clone parts of the original extent into lower offsets of the file.
  #
  # The first clone operation adds a file extent item to file offset 0
  # that points to our initial extent with a data offset of 16K. The
  # corresponding data back reference in the extent tree has an offset of
  # 18446744073709535232, which is the result of file_offset - data_offset
  # = 0 - 16K.
  #
  # The second clone operation adds a file extent item to file offset 16K
  # that points to our initial extent with a data offset of 48K. The
  # corresponding data back reference in the extent tree has an offset of
  # 18446744073709518848, which is the result of file_offset - data_offset
  # = 16K - 48K.
  #
  # Those large back reference offsets (result of unsigned arithmetic
  # underflow) confused the back reference walking code (used by an
  # incremental send and the multiple inspect-internal ioctls) and made it
  # miss the back references, which for the case of an incremental send it
  # made it fail with -EIO and print a message like the following to
  # dmesg:
  #
  # "BTRFS error (device sdc): did not find backref in send_root. \
  #  inode=257, offset=0, disk_byte=12845056 found extent=12845056"
  #
  $CLONER_PROG -s $(((128 + 16) * 1024)) -d 0 -l $((16 * 1024)) \
      $SCRATCH_MNT/foo $SCRATCH_MNT/foo
  $CLONER_PROG -s $(((128 + 48) * 1024)) -d $((16 * 1024)) \
      -l $((16 * 1024)) $SCRATCH_MNT/foo $SCRATCH_MNT/foo

  _run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
      $SCRATCH_MNT/mysnap2

  _run_btrfs_util_prog send $SCRATCH_MNT/mysnap1 -f $send_files_dir/1.snap
  _run_btrfs_util_prog send -p $SCRATCH_MNT/mysnap1 $SCRATCH_MNT/mysnap2 \
      -f $send_files_dir/2.snap

  echo "File digest in the original filesystem:"
  md5sum $SCRATCH_MNT/mysnap2/foo | _filter_scratch

  # Now recreate the filesystem by receiving both send streams and verify
  # we get the same file contents that the original filesystem had.
  _scratch_unmount
  _scratch_mkfs >>$seqres.full 2>&1
  _scratch_mount

  _run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap
  _run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/2.snap

  echo "File digest in the new filesystem:"
  md5sum $SCRATCH_MNT/mysnap2/foo | _filter_scratch

  status=0
  exit

The test's expected golden output is:

  wrote 65536/65536 bytes at offset 131072
  XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
  File digest in the original filesystem:
  6c6079335cff141b8a31233ead04cbff  SCRATCH_MNT/mysnap2/foo
  File digest in the new filesystem:
  6c6079335cff141b8a31233ead04cbff  SCRATCH_MNT/mysnap2/foo

But it failed with:

    (...)
    @@ -1,7 +1,5 @@
     QA output created by 097
     wrote 65536/65536 bytes at offset 131072
     XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
    -File digest in the original filesystem:
    -6c6079335cff141b8a31233ead04cbff  SCRATCH_MNT/mysnap2/foo
    -File digest in the new filesystem:
    -6c6079335cff141b8a31233ead04cbff  SCRATCH_MNT/mysnap2/foo
    ...

  $ cat /home/fdmanana/git/hub/xfstests/results//btrfs/097.full
  (...)
  ERROR: send ioctl failed with -5: Input/output error

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/backref.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 802fabb30e15..a0ca5757a3ff 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -206,10 +206,33 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 		return -ENOMEM;
 
 	ref->root_id = root_id;
-	if (key)
+	if (key) {
 		ref->key_for_search = *key;
-	else
+		/*
+		 * We can often find data backrefs with an offset that is too
+		 * large (>= LLONG_MAX, maximum allowed file offset) due to
+		 * underflows when subtracting a file's offset with the data
+		 * offset of its corresponding extent data item. This can
+		 * happen for example in the clone ioctl.
+		 * So if we detect such case we set the search key's offset to
+		 * zero to make sure we will find the matching file extent item
+		 * at add_all_parents(), otherwise we will miss it because the
+		 * offset taken form the backref is much larger then the offset
+		 * of the file extent item. This can make us scan a very large
+		 * number of file extent items, but at least it will not make
+		 * us miss any.
+		 * This is an ugly workaround for a behaviour that should have
+		 * never existed, but it does and a fix for the clone ioctl
+		 * would touch a lot of places, cause backwards incompatibility
+		 * and would not fix the problem for extents cloned with older
+		 * kernels.
+		 */
+		if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY &&
+		    ref->key_for_search.offset >= LLONG_MAX)
+			ref->key_for_search.offset = 0;
+	} else {
 		memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+	}
 
 	ref->inode_list = NULL;
 	ref->level = level;
-- 
cgit v1.2.3


From dd81d459a37d73cfa39896bd070e7b92e66e3628 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naota@elisp.net>
Date: Tue, 30 Jun 2015 11:25:43 +0900
Subject: btrfs: fix search key advancing condition

The search key advancing condition used in copy_to_sk() is loose. It can
advance the key even if it reaches sk->max_*: e.g. when the max key = (512,
1024, -1) and the current key = (512, 1025, 10), it increments the
offset by 1, continues hopeless search from (512, 1025, 11). This issue
make ioctl() to take unexpectedly long time scanning all the leaf a blocks
one by one.

This commit fix the problem using standard way of key comparison:
btrfs_comp_cpu_keys()

Signed-off-by: Naohiro Aota <naota@elisp.net>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0770c91586ca..3e2a80433504 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1933,6 +1933,7 @@ static noinline int copy_to_sk(struct btrfs_root *root,
 	u64 found_transid;
 	struct extent_buffer *leaf;
 	struct btrfs_ioctl_search_header sh;
+	struct btrfs_key test;
 	unsigned long item_off;
 	unsigned long item_len;
 	int nritems;
@@ -2016,12 +2017,17 @@ static noinline int copy_to_sk(struct btrfs_root *root,
 	}
 advance_key:
 	ret = 0;
-	if (key->offset < (u64)-1 && key->offset < sk->max_offset)
+	test.objectid = sk->max_objectid;
+	test.type = sk->max_type;
+	test.offset = sk->max_offset;
+	if (btrfs_comp_cpu_keys(key, &test) >= 0)
+		ret = 1;
+	else if (key->offset < (u64)-1)
 		key->offset++;
-	else if (key->type < (u8)-1 && key->type < sk->max_type) {
+	else if (key->type < (u8)-1) {
 		key->offset = 0;
 		key->type++;
-	} else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
+	} else if (key->objectid < (u64)-1) {
 		key->offset = 0;
 		key->type = 0;
 		key->objectid++;
-- 
cgit v1.2.3


From 18aa09229741364280d0a1670597b5207fc05b8d Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 5 Aug 2015 16:49:08 +0100
Subject: Btrfs: fix stale dir entries after removing a link and fsync

We have one more case where after a log tree is replayed we get
inconsistent metadata leading to stale directory entries, due to
some directories having entries pointing to some inode while the
inode does not have a matching BTRFS_INODE_[REF|EXTREF]_KEY item.

To trigger the problem we need to have a file with multiple hard links
belonging to different parent directories. Then if one of those hard
links is removed and we fsync the file using one of its other links
that belongs to a different parent directory, we end up not logging
the fact that the removed hard link doesn't exists anymore in the
parent directory.

Simple reproducer:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      _cleanup_flakey
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter
  . ./common/dmflakey

  # real QA test starts here
  _need_to_be_root
  _supported_fs generic
  _supported_os Linux
  _require_scratch
  _require_dm_flakey
  _require_metadata_journaling $SCRATCH_DEV

  rm -f $seqres.full

  _scratch_mkfs >>$seqres.full 2>&1
  _init_flakey
  _mount_flakey

  # Create our test directory and file.
  mkdir $SCRATCH_MNT/testdir
  touch $SCRATCH_MNT/foo
  ln $SCRATCH_MNT/foo $SCRATCH_MNT/testdir/foo2
  ln $SCRATCH_MNT/foo $SCRATCH_MNT/testdir/foo3

  # Make sure everything done so far is durably persisted.
  sync

  # Now we remove one of our file's hardlinks in the directory testdir.
  unlink $SCRATCH_MNT/testdir/foo3

  # We now fsync our file using the "foo" link, which has a parent that
  # is not the directory "testdir".
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo

  # Silently drop all writes and unmount to simulate a crash/power
  # failure.
  _load_flakey_table $FLAKEY_DROP_WRITES
  _unmount_flakey

  # Allow writes again, mount to trigger journal/log replay.
  _load_flakey_table $FLAKEY_ALLOW_WRITES
  _mount_flakey

  # After the journal/log is replayed we expect to not see the "foo3"
  # link anymore and we should be able to remove all names in the
  # directory "testdir" and then remove it (no stale directory entries
  # left after the journal/log replay).
  echo "Entries in testdir:"
  ls -1 $SCRATCH_MNT/testdir

  rm -f $SCRATCH_MNT/testdir/*
  rmdir $SCRATCH_MNT/testdir

  _unmount_flakey

  status=0
  exit

The test fails with:

  $ ./check generic/107
  FSTYP         -- btrfs
  PLATFORM      -- Linux/x86_64 debian3 4.1.0-rc6-btrfs-next-11+
  MKFS_OPTIONS  -- /dev/sdc
  MOUNT_OPTIONS -- /dev/sdc /home/fdmanana/btrfs-tests/scratch_1

  generic/107 3s ... - output mismatch (see .../results/generic/107.out.bad)
    --- tests/generic/107.out	2015-08-01 01:39:45.807462161 +0100
    +++ /home/fdmanana/git/hub/xfstests/results//generic/107.out.bad
    @@ -1,3 +1,5 @@
     QA output created by 107
     Entries in testdir:
     foo2
    +foo3
    +rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/testdir': Directory not empty
    ...
    _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent \
      (see /home/fdmanana/git/hub/xfstests/results//generic/107.full)
    _check_dmesg: something found in dmesg (see .../results/generic/107.dmesg)
  Ran: generic/107
  Failures: generic/107
  Failed 1 of 1 tests

  $ cat /home/fdmanana/git/hub/xfstests/results//generic/107.full
  (...)
  checking fs roots
  root 5 inode 257 errors 200, dir isize wrong
	unresolved ref dir 257 index 3 namelen 4 name foo3 filetype 1 errors 5, no dir item, no inode ref
  (...)

And produces the following warning in dmesg:

  [127298.759064] BTRFS info (device dm-0): failed to delete reference to foo3, inode 258 parent 257
  [127298.762081] ------------[ cut here ]------------
  [127298.763311] WARNING: CPU: 10 PID: 7891 at fs/btrfs/inode.c:3956 __btrfs_unlink_inode+0x182/0x35a [btrfs]()
  [127298.767327] BTRFS: Transaction aborted (error -2)
  (...)
  [127298.788611] Call Trace:
  [127298.789137]  [<ffffffff8145f077>] dump_stack+0x4f/0x7b
  [127298.790090]  [<ffffffff81095de5>] ? console_unlock+0x356/0x3a2
  [127298.791157]  [<ffffffff8104b3b0>] warn_slowpath_common+0xa1/0xbb
  [127298.792323]  [<ffffffffa065ad09>] ? __btrfs_unlink_inode+0x182/0x35a [btrfs]
  [127298.793633]  [<ffffffff8104b410>] warn_slowpath_fmt+0x46/0x48
  [127298.794699]  [<ffffffffa065ad09>] __btrfs_unlink_inode+0x182/0x35a [btrfs]
  [127298.797640]  [<ffffffffa065be8f>] btrfs_unlink_inode+0x1e/0x40 [btrfs]
  [127298.798876]  [<ffffffffa065bf11>] btrfs_unlink+0x60/0x9b [btrfs]
  [127298.800154]  [<ffffffff8116fb48>] vfs_unlink+0x9c/0xed
  [127298.801303]  [<ffffffff81173481>] do_unlinkat+0x12b/0x1fb
  [127298.802450]  [<ffffffff81253855>] ? lockdep_sys_exit_thunk+0x12/0x14
  [127298.803797]  [<ffffffff81174056>] SyS_unlinkat+0x29/0x2b
  [127298.805017]  [<ffffffff81465197>] system_call_fastpath+0x12/0x6f
  [127298.806310] ---[ end trace bbfddacb7aaada7b ]---
  [127298.807325] BTRFS warning (device dm-0): __btrfs_unlink_inode:3956: Aborting unused transaction(No such entry).

So fix this by logging all parent inodes, current and old ones, to make
sure we do not get stale entries after log replay. This is not a simple
solution such as triggering a full transaction commit because it would
imply full transaction commit when an inode is fsynced in the same
transaction that modified it and reloaded it after eviction (because its
last_unlink_trans is set to the same value as its last_trans as of the
commit with the title "Btrfs: fix stale dir entries after unlink, inode
eviction and fsync"), and it would also make fstest generic/066 fail
since one of the fsyncs triggers a full commit and the next fsync will
not find the inode in the log anymore (therefore not removing the xattr).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/tree-log.c | 158 +++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 138 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cb5666e7c3f9..9314adeba946 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4960,6 +4960,94 @@ next_dir_inode:
 	return ret;
 }
 
+static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+				 struct inode *inode,
+				 struct btrfs_log_ctx *ctx)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	const u64 ino = btrfs_ino(inode);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->skip_locking = 1;
+	path->search_commit_root = 1;
+
+	key.objectid = ino;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	while (true) {
+		struct extent_buffer *leaf = path->nodes[0];
+		int slot = path->slots[0];
+		u32 cur_offset = 0;
+		u32 item_size;
+		unsigned long ptr;
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
+		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
+			break;
+
+		item_size = btrfs_item_size_nr(leaf, slot);
+		ptr = btrfs_item_ptr_offset(leaf, slot);
+		while (cur_offset < item_size) {
+			struct btrfs_key inode_key;
+			struct inode *dir_inode;
+
+			inode_key.type = BTRFS_INODE_ITEM_KEY;
+			inode_key.offset = 0;
+
+			if (key.type == BTRFS_INODE_EXTREF_KEY) {
+				struct btrfs_inode_extref *extref;
+
+				extref = (struct btrfs_inode_extref *)
+					(ptr + cur_offset);
+				inode_key.objectid = btrfs_inode_extref_parent(
+					leaf, extref);
+				cur_offset += sizeof(*extref);
+				cur_offset += btrfs_inode_extref_name_len(leaf,
+					extref);
+			} else {
+				inode_key.objectid = key.offset;
+				cur_offset = item_size;
+			}
+
+			dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
+					       root, NULL);
+			/* If parent inode was deleted, skip it. */
+			if (IS_ERR(dir_inode))
+				continue;
+
+			ret = btrfs_log_inode(trans, root, dir_inode,
+					      LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+			iput(dir_inode);
+			if (ret)
+				goto out;
+		}
+		path->slots[0]++;
+	}
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log.  A minimal inode and backref
@@ -4979,9 +5067,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	struct dentry *old_parent = NULL;
 	int ret = 0;
 	u64 last_committed = root->fs_info->last_trans_committed;
-	const struct dentry * const first_parent = parent;
-	const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
-				 last_committed);
 	bool log_dentries = false;
 	struct inode *orig_inode = inode;
 
@@ -5042,6 +5127,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
 		log_dentries = true;
 
+	/*
+	 * On unlink we must make sure all our current and old parent directores
+	 * inodes are fully logged. This is to prevent leaving dangling
+	 * directory index entries in directories that were our parents but are
+	 * not anymore. Not doing this results in old parent directory being
+	 * impossible to delete after log replay (rmdir will always fail with
+	 * error -ENOTEMPTY).
+	 *
+	 * Example 1:
+	 *
+	 * mkdir testdir
+	 * touch testdir/foo
+	 * ln testdir/foo testdir/bar
+	 * sync
+	 * unlink testdir/bar
+	 * xfs_io -c fsync testdir/foo
+	 * <power failure>
+	 * mount fs, triggers log replay
+	 *
+	 * If we don't log the parent directory (testdir), after log replay the
+	 * directory still has an entry pointing to the file inode using the bar
+	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
+	 * the file inode has a link count of 1.
+	 *
+	 * Example 2:
+	 *
+	 * mkdir testdir
+	 * touch foo
+	 * ln foo testdir/foo2
+	 * ln foo testdir/foo3
+	 * sync
+	 * unlink testdir/foo3
+	 * xfs_io -c fsync foo
+	 * <power failure>
+	 * mount fs, triggers log replay
+	 *
+	 * Similar as the first example, after log replay the parent directory
+	 * testdir still has an entry pointing to the inode file with name foo3
+	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
+	 * and has a link count of 2.
+	 */
+	if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+		ret = btrfs_log_all_parents(trans, orig_inode, ctx);
+		if (ret)
+			goto end_trans;
+	}
+
 	while (1) {
 		if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
 			break;
@@ -5050,23 +5182,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		if (root != BTRFS_I(inode)->root)
 			break;
 
-		/*
-		 * On unlink we must make sure our immediate parent directory
-		 * inode is fully logged. This is to prevent leaving dangling
-		 * directory index entries and a wrong directory inode's i_size.
-		 * Not doing so can result in a directory being impossible to
-		 * delete after log replay (rmdir will always fail with error
-		 * -ENOTEMPTY).
-		 */
-		if (did_unlink && parent == first_parent)
-			inode_only = LOG_INODE_ALL;
-		else
-			inode_only = LOG_INODE_EXISTS;
-
-		if (BTRFS_I(inode)->generation >
-		    root->fs_info->last_trans_committed ||
-		    inode_only == LOG_INODE_ALL) {
-			ret = btrfs_log_inode(trans, root, inode, inode_only,
+		if (BTRFS_I(inode)->generation > last_committed) {
+			ret = btrfs_log_inode(trans, root, inode,
+					      LOG_INODE_EXISTS,
 					      0, LLONG_MAX, ctx);
 			if (ret)
 				goto end_trans;
-- 
cgit v1.2.3


From a323e8139c3617b2bf975317725d0fd962886d06 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Thu, 23 Jul 2015 12:29:49 +0800
Subject: btrfs: Fix scrub panic when leaf crosses stripes

Scrub panic in following operation:
  mkfs.ext4 /dev/vdh
  btrfs-convert /dev/vdh
  mount /dev/vdh /mnt/tmp1
  btrfs scrub start -B /dev/vdh
  (panic)

Reason:
  1: In some case, leaf created by btrfs-convert was splited into 2
     strips.
  2: Scrub bypassed part of above wrong leaf data, but remain data
     caused panic in scrub_checksum_tree_block().

For reason 1:
  we can get following information after some simple operation.
  a. mkfs.ext4 /dev/vdh
     btrfs-convert /dev/vdh
  b. btrfs-debug-tree /dev/vdh
     we can see following item in extent tree:
     item 25 key (27054080 METADATA_ITEM 0) itemoff 15083 itemsize 33
     Its logical address is [27054080, 27070464)
     and acrossed 2 strips:
     [27000832, 27066368)
     [27066368, 27131904)
  Will be fixed in btrfs-progs(btrfs-convert, btrfsck, ...)

For reason 2:
  Scrub is trying to do a "bypass" in this case, but the result is
  "panic", because current code lacks of some condition in bypass,
  and let some wrong leaf data escaped.

This patch fixed above scrub code.

Before patch:
  # btrfs scrub start -B /dev/vdh
  (panic)

After patch:
  # btrfs scrub start -B /dev/vdh
  scrub done for 353cec8f-da31-4a94-aa35-be72d997b06e
  ...
  # dmesg
  ...
  [   59.088697] BTRFS error (device vdh): scrub: tree block 27054080 spanning stripes, ignored. logical=27000832
  [   59.089929] BTRFS error (device vdh): scrub: tree block 27054080 spanning stripes, ignored. logical=27066368
  #

Reported-by: Chris Murphy <lists@colorremedies.com>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 94db0fa5225a..35d49b28a688 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2881,11 +2881,12 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 			flags = btrfs_extent_flags(l, extent);
 			generation = btrfs_extent_generation(l, extent);
 
-			if (key.objectid < logic_start &&
-			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
-				btrfs_err(fs_info,
-					  "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
-					   key.objectid, logic_start);
+			if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+			    (key.objectid < logic_start ||
+			     key.objectid + bytes >
+			     logic_start + map->stripe_len)) {
+				btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+					  key.objectid, logic_start);
 				goto next;
 			}
 again:
@@ -3212,8 +3213,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			flags = btrfs_extent_flags(l, extent);
 			generation = btrfs_extent_generation(l, extent);
 
-			if (key.objectid < logical &&
-			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+			if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+			    (key.objectid < logical ||
+			     key.objectid + bytes >
+			     logical + map->stripe_len)) {
 				btrfs_err(fs_info,
 					   "scrub: tree block %llu spanning "
 					   "stripes, ignored. logical=%llu",
-- 
cgit v1.2.3


From 78fa177029802f7f10953d357067171f39a79b81 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Jul 2015 17:54:50 +0800
Subject: btrfs: Show detail information when mount failed on missing devices

When mount failed because missing device, we can see following
dmesg:
 [ 1060.267743] BTRFS: too many missing devices, writeable mount is not allowed
 [ 1060.273158] BTRFS: open_ctree failed

This patch add missing_device_number and tolerated_missing_device_number
to above output, to let user know what really happened, and helps
bug-report and debug.

dmesg after patch:
 [  127.050367] BTRFS: missing devices(1) exceeds the limit(0), writeable mount is not allowed
 [  127.056099] BTRFS: open_ctree failed

Changelog v1->v2:
1: Changed to more clear description, suggested-by:
   Anand Jain <anand.jain@oracle.com>

Suggested-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/disk-io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f556c3732c2c..e49ae5ea9040 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2950,8 +2950,9 @@ retry_root_backup:
 	if (fs_info->fs_devices->missing_devices >
 	     fs_info->num_tolerated_disk_barrier_failures &&
 	    !(sb->s_flags & MS_RDONLY)) {
-		printk(KERN_WARNING "BTRFS: "
-			"too many missing devices, writeable mount is not allowed\n");
+		pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n",
+			fs_info->fs_devices->missing_devices,
+			fs_info->num_tolerated_disk_barrier_failures);
 		goto fail_sysfs;
 	}
 
-- 
cgit v1.2.3


From f2f66a2f886383fb76aca8ecc1bcc116c5d1f6fe Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Tue, 21 Jul 2015 12:22:29 +0800
Subject: btrfs: Check cancel and pause in interval of scrub operation

Old code checking cancel and pause request inside scrub stripe
operation, like:
  loop() {
    if (parity) {
      scrub_parity_stripe();
      continue;
    }

    check_cancel_and_pause()

    scrub_normal_stripe();
  }

Reason is when introduce raid56 stripe scrub, new code is inserted
simplely to front of loop.

Better to:
  loop() {
    check_cancel_and_pause()

    if (parity)
      scrub_parity_stripe();
    else
      scrub_normal_stripe();
  }

This patch adjusted code place to realize above sequence.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 35d49b28a688..d99cdb11d4d9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3105,22 +3105,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	 */
 	ret = 0;
 	while (physical < physical_end) {
-		/* for raid56, we skip parity stripe */
-		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-			ret = get_raid56_logic_offset(physical, num,
-					map, &logical, &stripe_logical);
-			logical += base;
-			if (ret) {
-				stripe_logical += base;
-				stripe_end = stripe_logical + increment - 1;
-				ret = scrub_raid56_parity(sctx, map, scrub_dev,
-						ppath, stripe_logical,
-						stripe_end);
-				if (ret)
-					goto out;
-				goto skip;
-			}
-		}
 		/*
 		 * canceled?
 		 */
@@ -3145,6 +3129,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			scrub_blocked_if_needed(fs_info);
 		}
 
+		/* for raid56, we skip parity stripe */
+		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+			ret = get_raid56_logic_offset(physical, num, map,
+						      &logical,
+						      &stripe_logical);
+			logical += base;
+			if (ret) {
+				stripe_logical += base;
+				stripe_end = stripe_logical + increment - 1;
+				ret = scrub_raid56_parity(sctx, map, scrub_dev,
+							  ppath, stripe_logical,
+							  stripe_end);
+				if (ret)
+					goto out;
+				goto skip;
+			}
+		}
+
 		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
 			key.type = BTRFS_METADATA_ITEM_KEY;
 		else
-- 
cgit v1.2.3


From 6fa96d72f79a15579da2bb63c65cafb210915b48 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Tue, 21 Jul 2015 12:22:30 +0800
Subject: btrfs: Free checksum list on scrub_extent() fail

When scrub_extent() failed, we need to free previois created
checksum list.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d99cdb11d4d9..6987de6b1ed8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2924,10 +2924,12 @@ again:
 						      extent_dev, flags,
 						      generation,
 						      extent_mirror_num);
+
+			scrub_free_csums(sctx);
+
 			if (ret)
 				goto out;
 
-			scrub_free_csums(sctx);
 			if (extent_logical + extent_len <
 			    key.objectid + bytes) {
 				logic_start += map->stripe_len;
@@ -3262,10 +3264,12 @@ again:
 					   extent_physical, extent_dev, flags,
 					   generation, extent_mirror_num,
 					   extent_logical - logical + physical);
+
+			scrub_free_csums(sctx);
+
 			if (ret)
 				goto out;
 
-			scrub_free_csums(sctx);
 			if (extent_logical + extent_len <
 			    key.objectid + bytes) {
 				if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-- 
cgit v1.2.3


From a0dd59de3c73fbb3b738eaf333732f2f27254a2c Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Tue, 21 Jul 2015 15:42:26 +0800
Subject: btrfs: Fix calculate typo caused by ambiguous meaning of logic_end

For example, in scrub_raid56_parity(), following lines are used
to judge is all data processed:
 place1: if (key.objectid > logic_end) ...
 place2: if (logic_start >= logic_end) ...
 ...
 (place2 is typo, is should be ">", it is copied from other
  place, where logic_end's meaning is different, long story...)

We can fix above typo directly, but the root reason is ambiguous
meaning of logic_end in scrub raid56 parity.

In other place, XXX_end is pointed to data which is not included,
and we need to process segment of [XXX_start, XXX_end).

But for scrub raid56 parity, logic_end is pointed to lattest data
need to process, and introduced many "+ 1" and "- 1" in code as
below:
 length = sparity->logic_end - sparity->logic_start + 1
 logic_end - logic_start + 1
 stripe_logical + increment - 1

This patch changed logic_end's meaning to make it in normal understanding
in raid56 parity functions and data struct alone with above bugfix.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6987de6b1ed8..185595a7be10 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2702,7 +2702,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 			   sparity->nsectors))
 		goto out;
 
-	length = sparity->logic_end - sparity->logic_start + 1;
+	length = sparity->logic_end - sparity->logic_start;
 	ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
 			       sparity->logic_start,
 			       &length, &bbio, 0, 1);
@@ -2868,7 +2868,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 			    key.type != BTRFS_METADATA_ITEM_KEY)
 				goto next;
 
-			if (key.objectid > logic_end) {
+			if (key.objectid >= logic_end) {
 				stop_loop = 1;
 				break;
 			}
@@ -2958,7 +2958,7 @@ next:
 out:
 	if (ret < 0)
 		scrub_parity_mark_sectors_error(sparity, logic_start,
-						logic_end - logic_start + 1);
+						logic_end - logic_start);
 	scrub_parity_put(sparity);
 	scrub_submit(sctx);
 	mutex_lock(&sctx->wr_ctx.wr_lock);
@@ -3139,7 +3139,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			logical += base;
 			if (ret) {
 				stripe_logical += base;
-				stripe_end = stripe_logical + increment - 1;
+				stripe_end = stripe_logical + increment;
 				ret = scrub_raid56_parity(sctx, map, scrub_dev,
 							  ppath, stripe_logical,
 							  stripe_end);
@@ -3287,7 +3287,7 @@ loop:
 					if (ret && physical < physical_end) {
 						stripe_logical += base;
 						stripe_end = stripe_logical +
-								increment - 1;
+								increment;
 						ret = scrub_raid56_parity(sctx,
 							map, scrub_dev, ppath,
 							stripe_logical,
-- 
cgit v1.2.3


From fe8cf654b1ccf7f0b49cc0de9e6ad8a56529e384 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Wed, 22 Jul 2015 13:14:47 +0800
Subject: btrfs: Load only necessary csums into list in scrub

We need not load csum of whole strip in scrub because strip is trimed
before use, it is to say, what we really need to calculate csum is
data between [extent_logical, extent_len).

This patch changed to use above segment for btrfs_lookup_csums_range()
in scrub_stripe()

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 185595a7be10..b03dd10e1d25 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3254,9 +3254,11 @@ again:
 						   &extent_dev,
 						   &extent_mirror_num);
 
-			ret = btrfs_lookup_csums_range(csum_root, logical,
-						logical + map->stripe_len - 1,
-						&sctx->csum_list, 1);
+			ret = btrfs_lookup_csums_range(csum_root,
+						       extent_logical,
+						       extent_logical +
+						       extent_len - 1,
+						       &sctx->csum_list, 1);
 			if (ret)
 				goto out;
 
-- 
cgit v1.2.3


From d7cad2389560f3249435ac928bc13e9408ae467e Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Wed, 22 Jul 2015 13:14:48 +0800
Subject: btrfs: Bypass unrelated items before accessing its contents in scrub

When we access extent_root in scrub_stripe() and
scrub_raid56_parity(), we need bypass unrelated tree item firstly
before using its contents to do other condition.

It is not a bug fix, only making code sequence in logic.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b03dd10e1d25..fadf5fcd9306 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2856,6 +2856,10 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 			}
 			btrfs_item_key_to_cpu(l, &key, slot);
 
+			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+			    key.type != BTRFS_METADATA_ITEM_KEY)
+				goto next;
+
 			if (key.type == BTRFS_METADATA_ITEM_KEY)
 				bytes = root->nodesize;
 			else
@@ -2864,10 +2868,6 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 			if (key.objectid + bytes <= logic_start)
 				goto next;
 
-			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
-			    key.type != BTRFS_METADATA_ITEM_KEY)
-				goto next;
-
 			if (key.objectid >= logic_end) {
 				stop_loop = 1;
 				break;
@@ -3193,6 +3193,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			}
 			btrfs_item_key_to_cpu(l, &key, slot);
 
+			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+			    key.type != BTRFS_METADATA_ITEM_KEY)
+				goto next;
+
 			if (key.type == BTRFS_METADATA_ITEM_KEY)
 				bytes = root->nodesize;
 			else
@@ -3201,10 +3205,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			if (key.objectid + bytes <= logical)
 				goto next;
 
-			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
-			    key.type != BTRFS_METADATA_ITEM_KEY)
-				goto next;
-
 			if (key.objectid >= logical + map->stripe_len) {
 				/* out of this device extent */
 				if (key.objectid >= logic_end)
-- 
cgit v1.2.3


From 868f401ae38acb439005626c04d575e64c5ae760 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 5 Aug 2015 16:43:27 +0800
Subject: btrfs: Use ref_cnt for set_block_group_ro()

More than one code call set_block_group_ro() and restore rw in fail.

Old code use bool bit to save blockgroup's ro state, it can not
support parallel case(it is confirmd exist in my debug log).

This patch use ref count to store ro state, and rename
set_block_group_ro/set_block_group_rw
to
inc_block_group_ro/dec_block_group_ro.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  6 +++---
 fs/btrfs/extent-tree.c | 42 +++++++++++++++++++++---------------------
 fs/btrfs/relocation.c  | 14 ++++++--------
 3 files changed, 30 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aac314e14188..f57e6cae394b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1300,7 +1300,7 @@ struct btrfs_block_group_cache {
 	/* for raid56, this is a full stripe, without parity */
 	unsigned long full_stripe_len;
 
-	unsigned int ro:1;
+	unsigned int ro;
 	unsigned int iref:1;
 	unsigned int has_caching_ctl:1;
 	unsigned int removed:1;
@@ -3495,9 +3495,9 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
 void btrfs_block_rsv_release(struct btrfs_root *root,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
 			      struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 07204bf601ed..5cefa02b40a9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8723,14 +8723,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
-static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 {
 	struct btrfs_space_info *sinfo = cache->space_info;
 	u64 num_bytes;
 	u64 min_allocable_bytes;
 	int ret = -ENOSPC;
 
-
 	/*
 	 * We need some metadata space and system metadata space for
 	 * allocating chunks in some corner cases until we force to set
@@ -8747,6 +8746,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 	spin_lock(&cache->lock);
 
 	if (cache->ro) {
+		cache->ro++;
 		ret = 0;
 		goto out;
 	}
@@ -8758,7 +8758,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
 	    min_allocable_bytes <= sinfo->total_bytes) {
 		sinfo->bytes_readonly += num_bytes;
-		cache->ro = 1;
+		cache->ro++;
 		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
 		ret = 0;
 	}
@@ -8768,7 +8768,7 @@ out:
 	return ret;
 }
 
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache)
 
 {
@@ -8776,8 +8776,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 	u64 alloc_flags;
 	int ret;
 
-	BUG_ON(cache->ro);
-
 again:
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
@@ -8820,7 +8818,7 @@ again:
 			goto out;
 	}
 
-	ret = set_block_group_ro(cache, 0);
+	ret = inc_block_group_ro(cache, 0);
 	if (!ret)
 		goto out;
 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -8828,7 +8826,7 @@ again:
 			     CHUNK_ALLOC_FORCE);
 	if (ret < 0)
 		goto out;
-	ret = set_block_group_ro(cache, 0);
+	ret = inc_block_group_ro(cache, 0);
 out:
 	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
 		alloc_flags = update_block_group_flags(root, cache->flags);
@@ -8891,7 +8889,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 	return free_bytes;
 }
 
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
 			      struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_space_info *sinfo = cache->space_info;
@@ -8901,11 +8899,13 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
 
 	spin_lock(&sinfo->lock);
 	spin_lock(&cache->lock);
-	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
-		    cache->bytes_super - btrfs_block_group_used(&cache->item);
-	sinfo->bytes_readonly -= num_bytes;
-	cache->ro = 0;
-	list_del_init(&cache->ro_list);
+	if (!--cache->ro) {
+		num_bytes = cache->key.offset - cache->reserved -
+			    cache->pinned - cache->bytes_super -
+			    btrfs_block_group_used(&cache->item);
+		sinfo->bytes_readonly -= num_bytes;
+		list_del_init(&cache->ro_list);
+	}
 	spin_unlock(&cache->lock);
 	spin_unlock(&sinfo->lock);
 }
@@ -9421,7 +9421,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
 		set_avail_alloc_bits(root->fs_info, cache->flags);
 		if (btrfs_chunk_readonly(root, cache->key.objectid)) {
-			set_block_group_ro(cache, 1);
+			inc_block_group_ro(cache, 1);
 		} else if (btrfs_block_group_used(&cache->item) == 0) {
 			spin_lock(&info->unused_bgs_lock);
 			/* Should always be true but just in case. */
@@ -9449,11 +9449,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		list_for_each_entry(cache,
 				&space_info->block_groups[BTRFS_RAID_RAID0],
 				list)
-			set_block_group_ro(cache, 1);
+			inc_block_group_ro(cache, 1);
 		list_for_each_entry(cache,
 				&space_info->block_groups[BTRFS_RAID_SINGLE],
 				list)
-			set_block_group_ro(cache, 1);
+			inc_block_group_ro(cache, 1);
 	}
 
 	init_global_block_rsv(info);
@@ -9941,7 +9941,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		spin_unlock(&block_group->lock);
 
 		/* We don't want to force the issue, only flip if it's ok. */
-		ret = set_block_group_ro(block_group, 0);
+		ret = inc_block_group_ro(block_group, 0);
 		up_write(&space_info->groups_sem);
 		if (ret < 0) {
 			ret = 0;
@@ -9955,7 +9955,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		/* 1 for btrfs_orphan_reserve_metadata() */
 		trans = btrfs_start_transaction(root, 1);
 		if (IS_ERR(trans)) {
-			btrfs_set_block_group_rw(root, block_group);
+			btrfs_dec_block_group_ro(root, block_group);
 			ret = PTR_ERR(trans);
 			goto next;
 		}
@@ -9982,14 +9982,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 				  EXTENT_DIRTY, GFP_NOFS);
 		if (ret) {
 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-			btrfs_set_block_group_rw(root, block_group);
+			btrfs_dec_block_group_ro(root, block_group);
 			goto end_trans;
 		}
 		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
 				  EXTENT_DIRTY, GFP_NOFS);
 		if (ret) {
 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-			btrfs_set_block_group_rw(root, block_group);
+			btrfs_dec_block_group_ro(root, block_group);
 			goto end_trans;
 		}
 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 88cbb5995667..52fe55ad11d6 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4215,14 +4215,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
 	BUG_ON(!rc->block_group);
 
-	if (!rc->block_group->ro) {
-		ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
-		if (ret) {
-			err = ret;
-			goto out;
-		}
-		rw = 1;
+	ret = btrfs_inc_block_group_ro(extent_root, rc->block_group);
+	if (ret) {
+		err = ret;
+		goto out;
 	}
+	rw = 1;
 
 	path = btrfs_alloc_path();
 	if (!path) {
@@ -4294,7 +4292,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
 out:
 	if (err && rw)
-		btrfs_set_block_group_rw(extent_root, rc->block_group);
+		btrfs_dec_block_group_ro(extent_root, rc->block_group);
 	iput(rc->data_inode);
 	btrfs_put_block_group(rc->block_group);
 	kfree(rc);
-- 
cgit v1.2.3


From 0e22be890ef385de6816ec87a8a41c0b0f4a7a23 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 5 Aug 2015 16:43:28 +0800
Subject: btrfs: Separate scrub_blocked_if_needed() to scrub_pause_on/off()

It can reduce current duplicated code which is similar to
scrub_blocked_if_needed() but can not call it because little
different.
It also used by my next patch which is in same case.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index fadf5fcd9306..08872026a254 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -332,11 +332,14 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 	}
 }
 
-static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+static void scrub_pause_on(struct btrfs_fs_info *fs_info)
 {
 	atomic_inc(&fs_info->scrubs_paused);
 	wake_up(&fs_info->scrub_pause_wait);
+}
 
+static void scrub_pause_off(struct btrfs_fs_info *fs_info)
+{
 	mutex_lock(&fs_info->scrub_lock);
 	__scrub_blocked_if_needed(fs_info);
 	atomic_dec(&fs_info->scrubs_paused);
@@ -345,6 +348,12 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 	wake_up(&fs_info->scrub_pause_wait);
 }
 
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+	scrub_pause_on(fs_info);
+	scrub_pause_off(fs_info);
+}
+
 /*
  * used for workers that require transaction commits (i.e., for the
  * NOCOW case)
-- 
cgit v1.2.3


From b708ce969af3ceadca18aac5bffe48fe977473b1 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 5 Aug 2015 16:43:29 +0800
Subject: btrfs: use scrub_pause_on/off() to reduce code in
 scrub_enumerate_chunks()

Use new intruduced scrub_pause_on/off() can make this code block
clean and more readable.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 08872026a254..281de3f2041d 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3503,8 +3503,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 
 		wait_event(sctx->list_wait,
 			   atomic_read(&sctx->bios_in_flight) == 0);
-		atomic_inc(&fs_info->scrubs_paused);
-		wake_up(&fs_info->scrub_pause_wait);
+
+		scrub_pause_on(fs_info);
 
 		/*
 		 * must be called before we decrease @scrub_paused.
@@ -3515,11 +3515,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			   atomic_read(&sctx->workers_pending) == 0);
 		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
 
-		mutex_lock(&fs_info->scrub_lock);
-		__scrub_blocked_if_needed(fs_info);
-		atomic_dec(&fs_info->scrubs_paused);
-		mutex_unlock(&fs_info->scrub_lock);
-		wake_up(&fs_info->scrub_pause_wait);
+		scrub_pause_off(fs_info);
 
 		btrfs_put_block_group(cache);
 		if (ret)
-- 
cgit v1.2.3


From 55e3a601c81cdca4497bf855fa4d331f8e830744 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 5 Aug 2015 16:43:30 +0800
Subject: btrfs: Fix data checksum error cause by replace with io-load.

xfstests btrfs/070 sometimes failed.
In my test machine, its fail rate is about 30%.
In another vm(vmware), its fail rate is about 50%.

Reason:
  btrfs/070 do replace and defrag with fsstress simultaneously,
  after above operation, checksum error is found by scrub.

  Actually, it have no relationship with defrag operation, only
  replace with fsstress can trigger this bug.

  New data writen to target device have possibility rewrited by
  old data from source device by replace code in debug, to avoid
  above problem, we can set target block group to readonly in
  replace period, so new data requested by other operation will
  not write to same place with replace code.

  Before patch(4.1-rc3):
    30% failed in 100 xfstests.
  After patch:
    0% failed in 300 xfstests.

It also happened in btrfs/071 as it's another scrub with IO load tests.

Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c   | 34 +++++++++++++++++++++++++++-------
 fs/btrfs/volumes.c |  2 ++
 2 files changed, 29 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 281de3f2041d..7555ddc5289f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3407,7 +3407,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 	u64 chunk_tree;
 	u64 chunk_objectid;
 	u64 chunk_offset;
-	int ret;
+	int ret = 0;
 	int slot;
 	struct extent_buffer *l;
 	struct btrfs_key key;
@@ -3435,8 +3435,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			if (path->slots[0] >=
 			    btrfs_header_nritems(path->nodes[0])) {
 				ret = btrfs_next_leaf(root, path);
-				if (ret)
+				if (ret < 0)
+					break;
+				if (ret > 0) {
+					ret = 0;
 					break;
+				}
+			} else {
+				ret = 0;
 			}
 		}
 
@@ -3478,6 +3484,22 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		if (!cache)
 			goto skip;
 
+		/*
+		 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
+		 * to avoid deadlock caused by:
+		 * btrfs_inc_block_group_ro()
+		 * -> btrfs_wait_for_commit()
+		 * -> btrfs_commit_transaction()
+		 * -> btrfs_scrub_pause()
+		 */
+		scrub_pause_on(fs_info);
+		ret = btrfs_inc_block_group_ro(root, cache);
+		scrub_pause_off(fs_info);
+		if (ret) {
+			btrfs_put_block_group(cache);
+			break;
+		}
+
 		dev_replace->cursor_right = found_key.offset + length;
 		dev_replace->cursor_left = found_key.offset;
 		dev_replace->item_needs_writeback = 1;
@@ -3517,6 +3539,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 
 		scrub_pause_off(fs_info);
 
+		btrfs_dec_block_group_ro(root, cache);
+
 		btrfs_put_block_group(cache);
 		if (ret)
 			break;
@@ -3539,11 +3563,7 @@ skip:
 
 	btrfs_free_path(path);
 
-	/*
-	 * ret can still be 1 from search_slot or next_leaf,
-	 * that's not an error
-	 */
-	return ret < 0 ? ret : 0;
+	return ret;
 }
 
 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fbe7c104531c..b7fb0b297f71 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2785,7 +2785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 		return -ENOSPC;
 
 	/* step one, relocate all the extents inside this chunk */
+	btrfs_scrub_pause(root);
 	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+	btrfs_scrub_continue(root);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From 4b3576e450a1539492e26e2d7b1e2e69578c032e Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 5 Aug 2015 18:00:02 +0800
Subject: btrfs: Error handle for get_ref_objectid_v0() in
 relocate_block_group()

We need error checking code for get_ref_objectid_v0() in
relocate_block_group(), to avoid unpredictable result, especially
for accessing uninitialized value(when function failed) after
this line.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/relocation.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 52fe55ad11d6..1659c94f179c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3976,6 +3976,10 @@ restart:
 			       sizeof(struct btrfs_extent_item_v0));
 			ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
 						  &path_change);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
 			if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
 				flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
 			else
-- 
cgit v1.2.3


From 4624900dd353488479a12abf8dffe2c1786cdb89 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 5 Aug 2015 18:00:03 +0800
Subject: btrfs: Cleanup: Remove objectid's init-value in create_reloc_inode()

objectid's init-value is not used in any case, remove it.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/relocation.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 1659c94f179c..46989289c6e6 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4144,7 +4144,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root;
 	struct btrfs_key key;
-	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+	u64 objectid;
 	int err = 0;
 
 	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
-- 
cgit v1.2.3


From dc2ee4e244138124a05cdc39365b38d4cc77e3fa Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 5 Aug 2015 18:00:04 +0800
Subject: btrfs: Cleanup: Remove chunk_objectid argument from
 btrfs_relocate_chunk()

Remove chunk_objectid argument from btrfs_relocate_chunk() because
it is not necessary, it can also cleanup some code in caller for
prepare its value.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b7fb0b297f71..fb9abf1678d0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2755,9 +2755,7 @@ out:
 	return ret;
 }
 
-static int btrfs_relocate_chunk(struct btrfs_root *root,
-				u64 chunk_objectid,
-				u64 chunk_offset)
+static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
 {
 	struct btrfs_root *extent_root;
 	struct btrfs_trans_handle *trans;
@@ -2857,7 +2855,6 @@ again:
 
 		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
 			ret = btrfs_relocate_chunk(chunk_root,
-						   found_key.objectid,
 						   found_key.offset);
 			if (ret == -ENOSPC)
 				failed++;
@@ -3377,7 +3374,6 @@ again:
 		}
 
 		ret = btrfs_relocate_chunk(chunk_root,
-					   found_key.objectid,
 					   found_key.offset);
 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 		if (ret && ret != -ENOSPC)
@@ -4079,7 +4075,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	struct btrfs_dev_extent *dev_extent = NULL;
 	struct btrfs_path *path;
 	u64 length;
-	u64 chunk_objectid;
 	u64 chunk_offset;
 	int ret;
 	int slot;
@@ -4156,11 +4151,10 @@ again:
 			break;
 		}
 
-		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
 		btrfs_release_path(path);
 
-		ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
+		ret = btrfs_relocate_chunk(root, chunk_offset);
 		mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
 		if (ret && ret != -ENOSPC)
 			goto done;
-- 
cgit v1.2.3


From 147d256e0980e31505d25d721be979d6a8d2148c Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Thu, 6 Aug 2015 20:58:11 +0800
Subject: btrfs: Remove unnecessary variants in relocation.c

These arguments are not used in functions, remove them for cleanup
and make kernel stack happy.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  3 +--
 fs/btrfs/relocation.c  | 13 +++++--------
 fs/btrfs/transaction.c |  2 +-
 3 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f57e6cae394b..f335c18bd263 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4185,8 +4185,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct extent_buffer *buf,
 			  struct extent_buffer *cow);
-void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
-			      struct btrfs_pending_snapshot *pending,
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
 			      u64 *bytes_to_reserve);
 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 			      struct btrfs_pending_snapshot *pending);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 46989289c6e6..303babeef505 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2523,8 +2523,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
  * counted. return -ENOENT if the block is root of reloc tree.
  */
 static noinline_for_stack
-struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
-				   struct backref_node *node)
+struct btrfs_root *select_one_root(struct backref_node *node)
 {
 	struct backref_node *next;
 	struct btrfs_root *root;
@@ -2912,7 +2911,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
 		return 0;
 
 	BUG_ON(node->processed);
-	root = select_one_root(trans, node);
+	root = select_one_root(node);
 	if (root == ERR_PTR(-ENOENT)) {
 		update_processed_blocks(rc, node);
 		goto out;
@@ -3755,8 +3754,7 @@ out:
  * helper to find next unprocessed extent
  */
 static noinline_for_stack
-int find_next_extent(struct btrfs_trans_handle *trans,
-		     struct reloc_control *rc, struct btrfs_path *path,
+int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
 		     struct btrfs_key *extent_key)
 {
 	struct btrfs_key key;
@@ -3951,7 +3949,7 @@ restart:
 			continue;
 		}
 
-		ret = find_next_extent(trans, rc, path, &key);
+		ret = find_next_extent(rc, path, &key);
 		if (ret < 0)
 			err = ret;
 		if (ret != 0)
@@ -4596,8 +4594,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
  * called before creating snapshot. it calculates metadata reservation
  * requried for relocating tree blocks in the snapshot
  */
-void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
-			      struct btrfs_pending_snapshot *pending,
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
 			      u64 *bytes_to_reserve)
 {
 	struct btrfs_root *root;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f5021fcb154e..91f44c9f7ebc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1301,7 +1301,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_set_skip_qgroup(trans, objectid);
 
-	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+	btrfs_reloc_pre_snapshot(pending, &to_reserve);
 
 	if (to_reserve > 0) {
 		pending->error = btrfs_block_rsv_add(root,
-- 
cgit v1.2.3


From 93314e3b64fd2e77237fdba7cfcc0d38dcd05579 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Thu, 6 Aug 2015 21:56:58 +0800
Subject: btrfs: abort transaction on btrfs_reloc_cow_block()

When btrfs_reloc_cow_block() failed in __btrfs_cow_block(), current
code just return a err-value to caller, but leave new_created extent
buffer exist and locked.

Then subsequent code (in relocate) try to lock above eb again,
and caused deadlock without any dmesg.
(eb lock use wait_event(), so no lockdep message)

It is hard to do recover work in __btrfs_cow_block() at this error
point, but we can abort transaction to avoid deadlock and operate on
unstable state.a

It also helps developer to find wrong place quickly.
(better than a frozen fs without any dmesg before patch)

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 54114b4887dd..5f745eadf77d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1159,8 +1159,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
 		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
-		if (ret)
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
 			return ret;
+		}
 	}
 
 	if (buf == root->node) {
-- 
cgit v1.2.3


From d02207512df120a37a8f1e50a5f90c555266282c Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Thu, 6 Aug 2015 22:16:23 +0800
Subject: btrfs: Fix wrong comment of btrfs_alloc_tree_block()

These wrong comment was copyed from another function(expired) from
init, this patch fixed them.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5cefa02b40a9..e2ad72b241ad 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7567,9 +7567,6 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
 
 /*
  * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
  * returns the tree buffer or an ERR_PTR on error.
  */
 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From 9ed0dea09fc5e8cb1926c15fc986a46a2db43da6 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Thu, 6 Aug 2015 22:16:24 +0800
Subject: btrfs: Remove root argument in extent_data_ref_count()

Because it is never used.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e2ad72b241ad..59d59d98bca1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1316,8 +1316,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static noinline u32 extent_data_ref_count(struct btrfs_root *root,
-					  struct btrfs_path *path,
+static noinline u32 extent_data_ref_count(struct btrfs_path *path,
 					  struct btrfs_extent_inline_ref *iref)
 {
 	struct btrfs_key key;
@@ -6349,7 +6348,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	} else {
 		if (found_extent) {
 			BUG_ON(is_data && refs_to_drop !=
-			       extent_data_ref_count(root, path, iref));
+			       extent_data_ref_count(path, iref));
 			if (iref) {
 				BUG_ON(path->slots[0] != extent_slot);
 			} else {
-- 
cgit v1.2.3


From 166f66d0bc94a14ec4bc6ee6ab079532df4a8d9e Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Thu, 6 Aug 2015 22:39:36 +0800
Subject: btrfs: Add WARN_ON() for double lock in btrfs_tree_lock()

When a task trying to double lock a extent buffer, there are no
lockdep warning about it because this lock may be in "blocking_lock"
state, and make us hard to debug.

This patch add a WARN_ON() for above condition, it can not report
all deadlock cases(as lock between tasks), but at least helps us
some.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/locking.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index f8229ef1b46d..d7e6baf1b205 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -241,6 +241,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
  */
 void btrfs_tree_lock(struct extent_buffer *eb)
 {
+	WARN_ON(eb->lock_owner == current->pid);
 again:
 	wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
 	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
-- 
cgit v1.2.3


From acdf898de8903f50bb10bbce4b774432bcd63c85 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 28 Jul 2015 18:03:30 +0800
Subject: Btrfs: fix warning in backref walking

When we do backref walking, we search firstly in queued delayed refs
and then the on-disk backrefs, but we parse differently for shared
references, for delayed refs we also add 'ref->root' while for on-disk
backrefs we don't, this can prevent us from merging refs indexed
by the same bytenr and cause find_parent_nodes() to throw a warning at
'WARN_ON(ref->count < 0)', for example, when we have a shared data extent
with 'ref_cnt=1' and a delayed shared data with a BTRFS_DROP_DELAYED_REF,
that happens.

For shared references, no matter if it's delayed or on-disk, ref->root is
not at all used, instead it's ref->parent that really matters, so this has
delayed refs handled as the same way as on-disk refs.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/backref.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a0ca5757a3ff..ecbc63d3143e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -655,7 +655,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = __add_prelim_ref(prefs, ref->root, NULL,
+			ret = __add_prelim_ref(prefs, 0, NULL,
 					       ref->level + 1, ref->parent,
 					       node->bytenr,
 					       node->ref_mod * sgn, GFP_ATOMIC);
@@ -687,11 +687,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			struct btrfs_delayed_data_ref *ref;
 
 			ref = btrfs_delayed_node_to_data_ref(node);
-
-			key.objectid = ref->objectid;
-			key.type = BTRFS_EXTENT_DATA_KEY;
-			key.offset = ref->offset;
-			ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+			ret = __add_prelim_ref(prefs, 0, NULL, 0,
 					       ref->parent, node->bytenr,
 					       node->ref_mod * sgn, GFP_ATOMIC);
 			break;
-- 
cgit v1.2.3


From 4a3560c4f3f0f92d3b673944753e3e947e030bc4 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 7 Aug 2015 16:48:41 +0800
Subject: Btrfs: fix defrag to merge tail file extent

The file layout is

[extent 1]...[extent n][4k extent][HOLE][extent x]

extent 1~n and 4k extent can be merged during defrag, and the whole
defrag bytes is larger than our defrag thresh(256k), 4k extent as a
tail is left unmerged since we check if its next extent can be merged
(the next one is a hole, so the check will fail), the layout thus can
be

[new extent][4k extent][HOLE][extent x]
 (1~n)

To fix it, beside looking at the next one, this also looks at the
previous one by checking @defrag_end, which is set to 0 when we
decide to stop merging contiguous extents, otherwise, we can merge
the previous one with our extent.

Also, this makes btrfs behave consistent with how xfs and ext4 do.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3e2a80433504..d1e4cac83311 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1030,6 +1030,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
 	struct extent_map *em;
 	int ret = 1;
 	bool next_mergeable = true;
+	bool prev_mergeable = true;
 
 	/*
 	 * make sure that once we start defragging an extent, we keep on
@@ -1050,13 +1051,16 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
 		goto out;
 	}
 
+	if (!*defrag_end)
+		prev_mergeable = false;
+
 	next_mergeable = defrag_check_next_extent(inode, em);
 	/*
 	 * we hit a real extent, if it is big or the next extent is not a
 	 * real extent, don't bother defragging it
 	 */
 	if (!compress && (*last_len == 0 || *last_len >= thresh) &&
-	    (em->len >= thresh || !next_mergeable))
+	    (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
 		ret = 0;
 out:
 	/*
-- 
cgit v1.2.3


From 293a8489f300536dc6d996c35a6ebb89aa03bab2 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.de>
Date: Tue, 30 Jun 2015 14:42:06 -0700
Subject: btrfs: fix clone / extent-same deadlocks

Clone and extent same lock their source and target inodes in opposite order.
In addition to this, the range locking in clone doesn't take ordering into
account. Fix this by having clone use the same locking helpers as
btrfs-extent-same.

In addition, I do a small cleanup of the locking helpers, removing a case
(both inodes being the same) which was poorly accounted for and never
actually used by the callers.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 34 ++++++++--------------------------
 1 file changed, 8 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d1e4cac83311..0adf5422fce9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2852,8 +2852,7 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
 		swap(inode1, inode2);
 
 	mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
-	if (inode1 != inode2)
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+	mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
 }
 
 static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
@@ -2871,8 +2870,7 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
 		swap(loff1, loff2);
 	}
 	lock_extent_range(inode1, loff1, len);
-	if (inode1 != inode2)
-		lock_extent_range(inode2, loff2, len);
+	lock_extent_range(inode2, loff2, len);
 }
 
 struct cmp_pages {
@@ -3797,13 +3795,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		goto out_fput;
 
 	if (!same_inode) {
-		if (inode < src) {
-			mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-			mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
-		} else {
-			mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
-			mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
-		}
+		btrfs_double_inode_lock(src, inode);
 	} else {
 		mutex_lock(&src->i_mutex);
 	}
@@ -3853,8 +3845,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
 		lock_extent_range(src, lock_start, lock_len);
 	} else {
-		lock_extent_range(src, off, len);
-		lock_extent_range(inode, destoff, len);
+		btrfs_double_extent_lock(src, off, inode, destoff, len);
 	}
 
 	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
@@ -3865,9 +3856,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
 		unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
 	} else {
-		unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
-		unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
-			      destoff + len - 1);
+		btrfs_double_extent_unlock(src, off, inode, destoff, len);
 	}
 	/*
 	 * Truncate page cache pages so that future reads will see the cloned
@@ -3876,17 +3865,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	truncate_inode_pages_range(&inode->i_data, destoff,
 				   PAGE_CACHE_ALIGN(destoff + len) - 1);
 out_unlock:
-	if (!same_inode) {
-		if (inode < src) {
-			mutex_unlock(&src->i_mutex);
-			mutex_unlock(&inode->i_mutex);
-		} else {
-			mutex_unlock(&inode->i_mutex);
-			mutex_unlock(&src->i_mutex);
-		}
-	} else {
+	if (!same_inode)
+		btrfs_double_inode_unlock(src, inode);
+	else
 		mutex_unlock(&src->i_mutex);
-	}
 out_fput:
 	fdput(src_file);
 out_drop_write:
-- 
cgit v1.2.3


From 03679ade86b2b1c370c8790f1ffcbcdef6d49f9f Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Fri, 19 Jun 2015 11:52:48 -0700
Subject: Btrfs: remove misleading handling of missing device scrub

scrub_submit() claims that it can handle a bio with a NULL block device,
but this is misleading, as calling bio_add_page() on a bio with a NULL
->bi_bdev would've already crashed. Delete this, as we're about to
properly handle a missing block device.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 7555ddc5289f..a12c450e55fa 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2096,21 +2096,7 @@ static void scrub_submit(struct scrub_ctx *sctx)
 	sbio = sctx->bios[sctx->curr];
 	sctx->curr = -1;
 	scrub_pending_bio_inc(sctx);
-
-	if (!sbio->bio->bi_bdev) {
-		/*
-		 * this case should not happen. If btrfs_map_block() is
-		 * wrong, it could happen for dev-replace operations on
-		 * missing devices when no mirrors are available, but in
-		 * this case it should already fail the mount.
-		 * This case is handled correctly (but _very_ slowly).
-		 */
-		printk_ratelimited(KERN_WARNING
-			"BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
-		bio_endio(sbio->bio, -EIO);
-	} else {
-		btrfsic_submit_bio(READ, sbio->bio);
-	}
+	btrfsic_submit_bio(READ, sbio->bio);
 }
 
 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
-- 
cgit v1.2.3


From 7cb2c4202ed5730ecbf13c5d34c2cadff4cbe88f Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Fri, 19 Jun 2015 11:52:49 -0700
Subject: Btrfs: count devices correctly in readahead during RAID 5/6 replace

Commit 5fbc7c59fd22 ("Btrfs: fix unfinished readahead thread for raid5/6
degraded mounting") fixed a problem where we would skip a missing device
when we shouldn't have because there are no other mirrors to read from
in RAID 5/6. After commit 2c8cdd6ee4e7 ("Btrfs, replace: write dirty
pages into the replace target device"), the fix doesn't work when we're
doing a missing device replace on RAID 5/6 because the replace device is
counted as a mirror so we're tricked into thinking we can safely skip
the missing device. The fix is to count only the real stripes and decide
based on that.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/reada.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 0e7beea92b4c..4645cd16d5ba 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -328,6 +328,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	struct btrfs_device *prev_dev;
 	u32 blocksize;
 	u64 length;
+	int real_stripes;
 	int nzones = 0;
 	int i;
 	unsigned long index = logical >> PAGE_CACHE_SHIFT;
@@ -369,7 +370,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 		goto error;
 	}
 
-	for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
+	real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+	for (nzones = 0; nzones < real_stripes; ++nzones) {
 		struct reada_zone *zone;
 
 		dev = bbio->stripes[nzones].dev;
-- 
cgit v1.2.3


From b4ee1782686d5b7a97826d67fdeaefaedbca23ce Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Fri, 19 Jun 2015 11:52:50 -0700
Subject: Btrfs: add RAID 5/6 BTRFS_RBIO_REBUILD_MISSING operation

The current RAID 5/6 recovery code isn't quite prepared to handle
missing devices. In particular, it expects a bio that we previously
attempted to use in the read path, meaning that it has valid pages
allocated. However, missing devices have a NULL blkdev, and we can't
call bio_add_page() on a bio with a NULL blkdev. We could do manual
manipulation of bio->bi_io_vec, but that's pretty gross. So instead, add
a separate path that allows us to manually add pages to the rbio.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/raid56.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/raid56.h | 10 +++++--
 fs/btrfs/scrub.c  |  3 +-
 3 files changed, 86 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index fa72068bd256..6fe2613ef288 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -61,9 +61,10 @@
 #define RBIO_CACHE_SIZE 1024
 
 enum btrfs_rbio_ops {
-	BTRFS_RBIO_WRITE	= 0,
-	BTRFS_RBIO_READ_REBUILD	= 1,
-	BTRFS_RBIO_PARITY_SCRUB	= 2,
+	BTRFS_RBIO_WRITE,
+	BTRFS_RBIO_READ_REBUILD,
+	BTRFS_RBIO_PARITY_SCRUB,
+	BTRFS_RBIO_REBUILD_MISSING,
 };
 
 struct btrfs_raid_bio {
@@ -602,6 +603,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
 	    cur->operation == BTRFS_RBIO_PARITY_SCRUB)
 		return 0;
 
+	if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
+	    cur->operation == BTRFS_RBIO_REBUILD_MISSING)
+		return 0;
+
 	return 1;
 }
 
@@ -793,7 +798,10 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 
 			if (next->operation == BTRFS_RBIO_READ_REBUILD)
 				async_read_rebuild(next);
-			else if (next->operation == BTRFS_RBIO_WRITE) {
+			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
+				steal_rbio(rbio, next);
+				async_read_rebuild(next);
+			} else if (next->operation == BTRFS_RBIO_WRITE) {
 				steal_rbio(rbio, next);
 				async_rmw_stripe(next);
 			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
@@ -1809,7 +1817,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 	faila = rbio->faila;
 	failb = rbio->failb;
 
-	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
 		spin_lock_irq(&rbio->bio_list_lock);
 		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 		spin_unlock_irq(&rbio->bio_list_lock);
@@ -1834,7 +1843,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 			 * if we're rebuilding a read, we have to use
 			 * pages from the bio list
 			 */
-			if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+			if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+			     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
 			    (stripe == faila || stripe == failb)) {
 				page = page_in_rbio(rbio, stripe, pagenr, 0);
 			} else {
@@ -1943,7 +1953,8 @@ pstripe:
 			 * if we're rebuilding a read, we have to use
 			 * pages from the bio list
 			 */
-			if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+			if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+			     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
 			    (stripe == faila || stripe == failb)) {
 				page = page_in_rbio(rbio, stripe, pagenr, 0);
 			} else {
@@ -1964,6 +1975,8 @@ cleanup_io:
 		else
 			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 
+		rbio_orig_end_io(rbio, err, err == 0);
+	} else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
 		rbio_orig_end_io(rbio, err, err == 0);
 	} else if (err == 0) {
 		rbio->faila = -1;
@@ -2101,7 +2114,8 @@ out:
 	return 0;
 
 cleanup:
-	if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
+	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
 		rbio_orig_end_io(rbio, -EIO, 0);
 	return -EIO;
 }
@@ -2232,8 +2246,9 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
 	return rbio;
 }
 
-void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
-				   struct page *page, u64 logical)
+/* Used for both parity scrub and missing. */
+void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
+			    u64 logical)
 {
 	int stripe_offset;
 	int index;
@@ -2668,3 +2683,55 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
 	if (!lock_stripe_add(rbio))
 		async_scrub_parity(rbio);
 }
+
+/* The following code is used for dev replace of a missing RAID 5/6 device. */
+
+struct btrfs_raid_bio *
+raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio,
+			  struct btrfs_bio *bbio, u64 length)
+{
+	struct btrfs_raid_bio *rbio;
+
+	rbio = alloc_rbio(root, bbio, length);
+	if (IS_ERR(rbio))
+		return NULL;
+
+	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
+	bio_list_add(&rbio->bio_list, bio);
+	/*
+	 * This is a special bio which is used to hold the completion handler
+	 * and make the scrub rbio is similar to the other types
+	 */
+	ASSERT(!bio->bi_iter.bi_size);
+
+	rbio->faila = find_logical_bio_stripe(rbio, bio);
+	if (rbio->faila == -1) {
+		BUG();
+		kfree(rbio);
+		return NULL;
+	}
+
+	return rbio;
+}
+
+static void missing_raid56_work(struct btrfs_work *work)
+{
+	struct btrfs_raid_bio *rbio;
+
+	rbio = container_of(work, struct btrfs_raid_bio, work);
+	__raid56_parity_recover(rbio);
+}
+
+static void async_missing_raid56(struct btrfs_raid_bio *rbio)
+{
+	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+			missing_raid56_work, NULL, NULL);
+
+	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
+}
+
+void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
+{
+	if (!lock_stripe_add(rbio))
+		async_missing_raid56(rbio);
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 2b5d7977d83b..8b694699d502 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -48,15 +48,21 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 			       struct btrfs_bio *bbio, u64 stripe_len);
 
+void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
+			    u64 logical);
+
 struct btrfs_raid_bio *
 raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
 			       struct btrfs_bio *bbio, u64 stripe_len,
 			       struct btrfs_device *scrub_dev,
 			       unsigned long *dbitmap, int stripe_nsectors);
-void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
-				   struct page *page, u64 logical);
 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
 
+struct btrfs_raid_bio *
+raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio,
+			  struct btrfs_bio *bbio, u64 length);
+void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
+
 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
 #endif
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a12c450e55fa..038162456cfa 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2720,8 +2720,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 		goto rbio_out;
 
 	list_for_each_entry(spage, &sparity->spages, list)
-		raid56_parity_add_scrub_pages(rbio, spage->page,
-					      spage->logical);
+		raid56_add_scrub_pages(rbio, spage->page, spage->logical);
 
 	scrub_pending_bio_inc(sctx);
 	raid56_parity_submit_scrub_rbio(rbio);
-- 
cgit v1.2.3


From 73ff61dbe5edeb1799d7e91c8b0641f87feb75fa Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Fri, 19 Jun 2015 11:52:51 -0700
Subject: Btrfs: fix device replace of a missing RAID 5/6 device

The original implementation of device replace on RAID 5/6 seems to have
missed support for replacing a missing device. When this is attempted,
we end up calling bio_add_page() on a bio with a NULL ->bi_bdev, which
crashes when we try to dereference it. This happens because
btrfs_map_block() has no choice but to return us the missing device
because RAID 5/6 don't have any alternate mirrors to read from, and a
missing device has a NULL bdev.

The idea implemented here is to handle the missing device case
separately, which better only happen when we're replacing a missing RAID
5/6 device. We use the new BTRFS_RBIO_REBUILD_MISSING operation to
reconstruct the data from parity, check it with
scrub_recheck_block_checksum(), and write it out with
scrub_write_block_to_dev_replace().

Reported-by: Philip <bugzilla@philip-seeger.de>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=96141
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 147 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 038162456cfa..6bce7f2ff805 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -125,6 +125,7 @@ struct scrub_block {
 		/* It is for the data with checksum */
 		unsigned int	data_corrected:1;
 	};
+	struct btrfs_work	work;
 };
 
 /* Used for the chunks with parity stripe such RAID5/6 */
@@ -2173,6 +2174,134 @@ again:
 	return 0;
 }
 
+static void scrub_missing_raid56_end_io(struct bio *bio, int error)
+{
+	struct scrub_block *sblock = bio->bi_private;
+	struct btrfs_fs_info *fs_info = sblock->sctx->dev_root->fs_info;
+
+	if (error)
+		sblock->no_io_error_seen = 0;
+
+	btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
+}
+
+static void scrub_missing_raid56_worker(struct btrfs_work *work)
+{
+	struct scrub_block *sblock = container_of(work, struct scrub_block, work);
+	struct scrub_ctx *sctx = sblock->sctx;
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+	unsigned int is_metadata;
+	unsigned int have_csum;
+	u8 *csum;
+	u64 generation;
+	u64 logical;
+	struct btrfs_device *dev;
+
+	is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA);
+	have_csum = sblock->pagev[0]->have_csum;
+	csum = sblock->pagev[0]->csum;
+	generation = sblock->pagev[0]->generation;
+	logical = sblock->pagev[0]->logical;
+	dev = sblock->pagev[0]->dev;
+
+	if (sblock->no_io_error_seen) {
+		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
+					     have_csum, csum, generation,
+					     sctx->csum_size);
+	}
+
+	if (!sblock->no_io_error_seen) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors++;
+		spin_unlock(&sctx->stat_lock);
+		printk_ratelimited_in_rcu(KERN_ERR
+			"BTRFS: I/O error rebulding logical %llu for dev %s\n",
+			logical, rcu_str_deref(dev->name));
+	} else if (sblock->header_error || sblock->checksum_error) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		printk_ratelimited_in_rcu(KERN_ERR
+			"BTRFS: failed to rebuild valid logical %llu for dev %s\n",
+			logical, rcu_str_deref(dev->name));
+	} else {
+		scrub_write_block_to_dev_replace(sblock);
+	}
+
+	scrub_block_put(sblock);
+
+	if (sctx->is_dev_replace &&
+	    atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+		mutex_lock(&sctx->wr_ctx.wr_lock);
+		scrub_wr_submit(sctx);
+		mutex_unlock(&sctx->wr_ctx.wr_lock);
+	}
+
+	scrub_pending_bio_dec(sctx);
+}
+
+static void scrub_missing_raid56_pages(struct scrub_block *sblock)
+{
+	struct scrub_ctx *sctx = sblock->sctx;
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+	u64 length = sblock->page_count * PAGE_SIZE;
+	u64 logical = sblock->pagev[0]->logical;
+	struct btrfs_bio *bbio;
+	struct bio *bio;
+	struct btrfs_raid_bio *rbio;
+	int ret;
+	int i;
+
+	ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
+			       &bbio, 0, 1);
+	if (ret || !bbio || !bbio->raid_map)
+		goto bbio_out;
+
+	if (WARN_ON(!sctx->is_dev_replace ||
+		    !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+		/*
+		 * We shouldn't be scrubbing a missing device. Even for dev
+		 * replace, we should only get here for RAID 5/6. We either
+		 * managed to mount something with no mirrors remaining or
+		 * there's a bug in scrub_remap_extent()/btrfs_map_block().
+		 */
+		goto bbio_out;
+	}
+
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+	if (!bio)
+		goto bbio_out;
+
+	bio->bi_iter.bi_sector = logical >> 9;
+	bio->bi_private = sblock;
+	bio->bi_end_io = scrub_missing_raid56_end_io;
+
+	rbio = raid56_alloc_missing_rbio(sctx->dev_root, bio, bbio, length);
+	if (!rbio)
+		goto rbio_out;
+
+	for (i = 0; i < sblock->page_count; i++) {
+		struct scrub_page *spage = sblock->pagev[i];
+
+		raid56_add_scrub_pages(rbio, spage->page, spage->logical);
+	}
+
+	btrfs_init_work(&sblock->work, btrfs_scrub_helper,
+			scrub_missing_raid56_worker, NULL, NULL);
+	scrub_block_get(sblock);
+	scrub_pending_bio_inc(sctx);
+	raid56_submit_missing_rbio(rbio);
+	return;
+
+rbio_out:
+	bio_put(bio);
+bbio_out:
+	btrfs_put_bbio(bbio);
+	spin_lock(&sctx->stat_lock);
+	sctx->stat.malloc_errors++;
+	spin_unlock(&sctx->stat_lock);
+}
+
 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 		       u64 physical, struct btrfs_device *dev, u64 flags,
 		       u64 gen, int mirror_num, u8 *csum, int force,
@@ -2236,19 +2365,27 @@ leave_nomem:
 	}
 
 	WARN_ON(sblock->page_count == 0);
-	for (index = 0; index < sblock->page_count; index++) {
-		struct scrub_page *spage = sblock->pagev[index];
-		int ret;
+	if (dev->missing) {
+		/*
+		 * This case should only be hit for RAID 5/6 device replace. See
+		 * the comment in scrub_missing_raid56_pages() for details.
+		 */
+		scrub_missing_raid56_pages(sblock);
+	} else {
+		for (index = 0; index < sblock->page_count; index++) {
+			struct scrub_page *spage = sblock->pagev[index];
+			int ret;
 
-		ret = scrub_add_page_to_rd_bio(sctx, spage);
-		if (ret) {
-			scrub_block_put(sblock);
-			return ret;
+			ret = scrub_add_page_to_rd_bio(sctx, spage);
+			if (ret) {
+				scrub_block_put(sblock);
+				return ret;
+			}
 		}
-	}
 
-	if (force)
-		scrub_submit(sctx);
+		if (force)
+			scrub_submit(sctx);
+	}
 
 	/* last one frees, either here or in bio completion for last page */
 	scrub_block_put(sblock);
-- 
cgit v1.2.3


From 4a770891d9ddf94df985ca438e78d355b8469247 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Fri, 19 Jun 2015 11:52:52 -0700
Subject: Btrfs: fix parity scrub of RAID 5/6 with missing device

When testing the previous patch, Zhao Lei reported a similar bug when
attempting to scrub a degraded RAID 5/6 filesystem with a missing
device, leading to NULL pointer dereferences from the RAID 5/6 parity
scrubbing code.

The first cause was the same as in the previous patch: attempting to
call bio_add_page() on a missing block device. To fix this,
scrub_extent_for_parity() can just mark the sectors on the missing
device as errors instead of attempting to read from it.

Additionally, the code uses scrub_remap_extent() to map the extent of
the corresponding data stripe, but the extent wasn't already mapped. If
scrub_remap_extent() finds a missing block device, it doesn't initialize
extent_dev, so we're left with a NULL struct btrfs_device. The solution
is to use btrfs_map_block() directly.

Reported-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6bce7f2ff805..c69c75e7b841 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2696,6 +2696,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
 	u8 csum[BTRFS_CSUM_SIZE];
 	u32 blocksize;
 
+	if (dev->missing) {
+		scrub_parity_mark_sectors_error(sparity, logical, len);
+		return 0;
+	}
+
 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
 		blocksize = sctx->sectorsize;
 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
@@ -2905,6 +2910,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 	struct btrfs_root *root = fs_info->extent_root;
 	struct btrfs_root *csum_root = fs_info->csum_root;
 	struct btrfs_extent_item *extent;
+	struct btrfs_bio *bbio = NULL;
 	u64 flags;
 	int ret;
 	int slot;
@@ -2914,6 +2920,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 	u64 extent_logical;
 	u64 extent_physical;
 	u64 extent_len;
+	u64 mapped_length;
 	struct btrfs_device *extent_dev;
 	struct scrub_parity *sparity;
 	int nsectors;
@@ -3037,10 +3044,21 @@ again:
 			scrub_parity_mark_sectors_data(sparity, extent_logical,
 						       extent_len);
 
-			scrub_remap_extent(fs_info, extent_logical,
-					   extent_len, &extent_physical,
-					   &extent_dev,
-					   &extent_mirror_num);
+			mapped_length = extent_len;
+			ret = btrfs_map_block(fs_info, READ, extent_logical,
+					      &mapped_length, &bbio, 0);
+			if (!ret) {
+				if (!bbio || mapped_length < extent_len)
+					ret = -EIO;
+			}
+			if (ret) {
+				btrfs_put_bbio(bbio);
+				goto out;
+			}
+			extent_physical = bbio->stripes[0].physical;
+			extent_mirror_num = bbio->mirror_num;
+			extent_dev = bbio->stripes[0].dev;
+			btrfs_put_bbio(bbio);
 
 			ret = btrfs_lookup_csums_range(csum_root,
 						extent_logical,
-- 
cgit v1.2.3


From a4027a20c57a2c8779e17a61425737634bb7163d Mon Sep 17 00:00:00 2001
From: Byongho Lee <bhlee.kernel@gmail.com>
Date: Fri, 10 Jul 2015 13:10:26 +0900
Subject: Btrfs: remove unused mutex from struct 'btrfs_fs_info'

The code using 'ordered_extent_flush_mutex' mutex has removed by below
commit.
 - 8d875f95da43c6a8f18f77869f2ef26e9594fecc
   btrfs: disable strict file flushes for renames and truncates
But the mutex still lives in struct 'btrfs_fs_info'.

So, this patch removes the mutex from struct 'btrfs_fs_info' and its
initialization code.

Signed-off-by: Byongho Lee <bhlee.kernel@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h   | 6 ------
 fs/btrfs/disk-io.c | 1 -
 2 files changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f335c18bd263..d4042c89d29b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1518,12 +1518,6 @@ struct btrfs_fs_info {
 	 */
 	struct mutex ordered_operations_mutex;
 
-	/*
-	 * Same as ordered_operations_mutex except this is for ordered extents
-	 * and not the operations.
-	 */
-	struct mutex ordered_extent_flush_mutex;
-
 	struct rw_semaphore commit_root_sem;
 
 	struct rw_semaphore cleanup_work_sem;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e49ae5ea9040..f7536bcf7cee 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2608,7 +2608,6 @@ int open_ctree(struct super_block *sb,
 
 
 	mutex_init(&fs_info->ordered_operations_mutex);
-	mutex_init(&fs_info->ordered_extent_flush_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
-- 
cgit v1.2.3


From da2f0f74cf7d074e5a8918c8efdf6aba4a989b4a Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Thu, 2 Jul 2015 13:57:22 -0700
Subject: Btrfs: add support for blkio controllers

This attaches accounting information to bios as we submit them so the
new blkio controllers can throttle on btrfs filesystems.

Not much is required, we're just associating bios with blkcgs during clone,
calling wbc_init_bio()/wbc_account_io() during writepages submission,
and attaching the bios to the current context during direct IO.

Finally if we are splitting bios during btrfs_map_bio, this attaches
accounting information to the split.

The end result is able to throttle nicely on single disk filesystems.  A
little more work is required for multi-device filesystems.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/disk-io.c   |  1 +
 fs/btrfs/extent_io.c | 16 +++++++++++++---
 fs/btrfs/inode.c     |  6 +++++-
 fs/btrfs/super.c     |  1 +
 fs/btrfs/volumes.c   |  8 ++++++++
 5 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f7536bcf7cee..230546b45474 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1724,6 +1724,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 	bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
 	bdi->congested_fn	= btrfs_congested_fn;
 	bdi->congested_data	= info;
+	bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
 	return 0;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 02d05817cbdf..b9755ce98218 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2730,6 +2730,9 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
 		btrfs_bio->csum = NULL;
 		btrfs_bio->csum_allocated = NULL;
 		btrfs_bio->end_io = NULL;
+		/* FIXME, put this into bio_clone_bioset */
+		if (bio->bi_css)
+			bio_associate_blkcg(new, bio->bi_css);
 	}
 	return new;
 }
@@ -2790,6 +2793,7 @@ static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
 }
 
 static int submit_extent_page(int rw, struct extent_io_tree *tree,
+			      struct writeback_control *wbc,
 			      struct page *page, sector_t sector,
 			      size_t size, unsigned long offset,
 			      struct block_device *bdev,
@@ -2826,6 +2830,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			}
 			bio = NULL;
 		} else {
+			if (wbc)
+				wbc_account_io(wbc, page, page_size);
 			return 0;
 		}
 	}
@@ -2841,6 +2847,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
+	if (wbc) {
+		wbc_init_bio(wbc, bio);
+		wbc_account_io(wbc, page, page_size);
+	}
 
 	if (bio_ret)
 		*bio_ret = bio;
@@ -3051,7 +3061,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 		}
 
 		pnr -= page->index;
-		ret = submit_extent_page(rw, tree, page,
+		ret = submit_extent_page(rw, tree, NULL, page,
 					 sector, disk_io_size, pg_offset,
 					 bdev, bio, pnr,
 					 end_bio_extent_readpage, mirror_num,
@@ -3446,7 +3456,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
 				       page->index, cur, end);
 			}
 
-			ret = submit_extent_page(write_flags, tree, page,
+			ret = submit_extent_page(write_flags, tree, wbc, page,
 						 sector, iosize, pg_offset,
 						 bdev, &epd->bio, max_nr,
 						 end_bio_extent_writepage,
@@ -3749,7 +3759,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 
 		clear_page_dirty_for_io(p);
 		set_page_writeback(p);
-		ret = submit_extent_page(rw, tree, p, offset >> 9,
+		ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
 					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
 					 -1, end_bio_extent_buffer_writepage,
 					 0, epd->bio_flags, bio_flags);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 79a73645346e..bda3c41dc9d5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7987,7 +7987,11 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
 				       u64 first_sector, gfp_t gfp_flags)
 {
 	int nr_vecs = bio_get_nr_vecs(bdev);
-	return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
+	struct bio *bio;
+	bio = btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
+	if (bio)
+		bio_associate_current(bio);
+	return bio;
 }
 
 static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index cd7ef34d2dce..d366dd4664d0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1033,6 +1033,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_flags |= MS_POSIXACL;
 #endif
 	sb->s_flags |= MS_I_VERSION;
+	sb->s_iflags |= SB_I_CGROUPWB;
 	err = open_ctree(sb, fs_devices, (char *)data);
 	if (err) {
 		printk(KERN_ERR "BTRFS: open_ctree failed\n");
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fb9abf1678d0..88e2fe931bde 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5942,6 +5942,14 @@ again:
 	if (!bio)
 		return -ENOMEM;
 
+	if (first_bio->bi_ioc) {
+		get_io_context_active(first_bio->bi_ioc);
+		bio->bi_ioc = first_bio->bi_ioc;
+	}
+	if (first_bio->bi_css) {
+		css_get(first_bio->bi_css);
+		bio->bi_css = first_bio->bi_css;
+	}
 	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
 		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
 				 bvec->bv_offset) < bvec->bv_len) {
-- 
cgit v1.2.3


From 34eb2a524997e5cd7117569b1fda925516adf6ac Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 17 Aug 2015 18:44:45 +0800
Subject: btrfs: Remove useless condition in start_log_trans()

Dan Carpenter <dan.carpenter@oracle.com> reported a smatch warning
for start_log_trans():
 fs/btrfs/tree-log.c:178 start_log_trans()
 warn: we tested 'root->log_root' before and it was 'false'

 fs/btrfs/tree-log.c
 147          if (root->log_root) {
 We test "root->log_root" here.
 ...

Reason:
 Condition of:
 fs/btrfs/tree-log.c:178: if (!root->log_root) {
 is not necessary after commit: 7237f1833

 It caused a smatch warning, and no functionally error.

Fix:
 Deleting above condition will make smatch shut up,
 but a better way is to do cleanup for start_log_trans()
 to remove duplicated code and make code more readable.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/tree-log.c | 43 +++++++++++++++++--------------------------
 1 file changed, 17 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9314adeba946..2e65e8e73844 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_log_ctx *ctx)
 {
-	int index;
-	int ret;
+	int ret = 0;
 
 	mutex_lock(&root->log_mutex);
+
 	if (root->log_root) {
 		if (btrfs_need_log_full_commit(root->fs_info, trans)) {
 			ret = -EAGAIN;
 			goto out;
 		}
+
 		if (!root->log_start_pid) {
-			root->log_start_pid = current->pid;
 			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+			root->log_start_pid = current->pid;
 		} else if (root->log_start_pid != current->pid) {
 			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 		}
+	} else {
+		mutex_lock(&root->fs_info->tree_log_mutex);
+		if (!root->fs_info->log_root_tree)
+			ret = btrfs_init_log_root_tree(trans, root->fs_info);
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		if (ret)
+			goto out;
 
-		atomic_inc(&root->log_batch);
-		atomic_inc(&root->log_writers);
-		if (ctx) {
-			index = root->log_transid % 2;
-			list_add_tail(&ctx->list, &root->log_ctxs[index]);
-			ctx->log_transid = root->log_transid;
-		}
-		mutex_unlock(&root->log_mutex);
-		return 0;
-	}
-
-	ret = 0;
-	mutex_lock(&root->fs_info->tree_log_mutex);
-	if (!root->fs_info->log_root_tree)
-		ret = btrfs_init_log_root_tree(trans, root->fs_info);
-	mutex_unlock(&root->fs_info->tree_log_mutex);
-	if (ret)
-		goto out;
-
-	if (!root->log_root) {
 		ret = btrfs_add_log_tree(trans, root);
 		if (ret)
 			goto out;
+
+		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+		root->log_start_pid = current->pid;
 	}
-	clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
-	root->log_start_pid = current->pid;
+
 	atomic_inc(&root->log_batch);
 	atomic_inc(&root->log_writers);
 	if (ctx) {
-		index = root->log_transid % 2;
+		int index = root->log_transid % 2;
 		list_add_tail(&ctx->list, &root->log_ctxs[index]);
 		ctx->log_transid = root->log_transid;
 	}
+
 out:
 	mutex_unlock(&root->log_mutex);
 	return ret;
-- 
cgit v1.2.3


From 60d53eb3107c8e8960e8d7c22aa4e69aac7a8fe6 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 17 Aug 2015 18:44:46 +0800
Subject: btrfs: Remove unused arguments in tree-log.c

Following arguments are not used in tree-log.c:
 insert_one_name(): path, type
 wait_log_commit(): trans
 wait_for_writer(): trans

This patch remove them.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/tree-log.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2e65e8e73844..6d650468d21a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1540,9 +1540,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
  */
 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root,
-				    struct btrfs_path *path,
 				    u64 dirid, u64 index,
-				    char *name, int name_len, u8 type,
+				    char *name, int name_len,
 				    struct btrfs_key *location)
 {
 	struct inode *inode;
@@ -1716,8 +1715,8 @@ insert:
 		goto out;
 	}
 	btrfs_release_path(path);
-	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
-			      name, name_len, log_type, &log_key);
+	ret = insert_one_name(trans, root, key->objectid, key->offset,
+			      name, name_len, &log_key);
 	if (ret && ret != -ENOENT && ret != -EEXIST)
 		goto out;
 	if (!ret)
@@ -2582,8 +2581,7 @@ static int update_log_root(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static void wait_log_commit(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, int transid)
+static void wait_log_commit(struct btrfs_root *root, int transid)
 {
 	DEFINE_WAIT(wait);
 	int index = transid % 2;
@@ -2608,8 +2606,7 @@ static void wait_log_commit(struct btrfs_trans_handle *trans,
 		 atomic_read(&root->log_commit[index]));
 }
 
-static void wait_for_writer(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_root *root)
 {
 	DEFINE_WAIT(wait);
 
@@ -2689,7 +2686,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	index1 = log_transid % 2;
 	if (atomic_read(&root->log_commit[index1])) {
-		wait_log_commit(trans, root, log_transid);
+		wait_log_commit(root, log_transid);
 		mutex_unlock(&root->log_mutex);
 		return ctx->log_ret;
 	}
@@ -2698,7 +2695,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	/* wait for previous tree log sync to complete */
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-		wait_log_commit(trans, root, log_transid - 1);
+		wait_log_commit(root, log_transid - 1);
 
 	while (1) {
 		int batch = atomic_read(&root->log_batch);
@@ -2709,7 +2706,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 			schedule_timeout_uninterruptible(1);
 			mutex_lock(&root->log_mutex);
 		}
-		wait_for_writer(trans, root);
+		wait_for_writer(root);
 		if (batch == atomic_read(&root->log_batch))
 			break;
 	}
@@ -2806,7 +2803,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
 						mark);
 		btrfs_wait_logged_extents(trans, log, log_transid);
-		wait_log_commit(trans, log_root_tree,
+		wait_log_commit(log_root_tree,
 				root_log_ctx.log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
 		if (!ret)
@@ -2817,11 +2814,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	atomic_set(&log_root_tree->log_commit[index2], 1);
 
 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
-		wait_log_commit(trans, log_root_tree,
+		wait_log_commit(log_root_tree,
 				root_log_ctx.log_transid - 1);
 	}
 
-	wait_for_writer(trans, log_root_tree);
+	wait_for_writer(log_root_tree);
 
 	/*
 	 * now that we've moved on to the tree of log tree roots,
-- 
cgit v1.2.3


From d1b5c5671d010de1df78d3efddb84bf22bfafd1e Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 19 Aug 2015 14:17:40 +0200
Subject: btrfs: Prevent from early transaction abort

Btrfs relies on GFP_NOFS allocation when committing the transaction but
this allocation context is rather weak wrt. reclaim capabilities. The
page allocator currently tries hard to not fail these allocations if
they are small (<=PAGE_ALLOC_COSTLY_ORDER) so this is not a problem
currently but there is an attempt to move away from the default no-fail
behavior and allow these allocation to fail more eagerly. And this would
lead to a pre-mature transaction abort as follows:

[   55.328093] Call Trace:
[   55.328890]  [<ffffffff8154e6f0>] dump_stack+0x4f/0x7b
[   55.330518]  [<ffffffff8108fa28>] ? console_unlock+0x334/0x363
[   55.332738]  [<ffffffff8110873e>] __alloc_pages_nodemask+0x81d/0x8d4
[   55.334910]  [<ffffffff81100752>] pagecache_get_page+0x10e/0x20c
[   55.336844]  [<ffffffffa007d916>] alloc_extent_buffer+0xd0/0x350 [btrfs]
[   55.338973]  [<ffffffffa0059d8c>] btrfs_find_create_tree_block+0x15/0x17 [btrfs]
[   55.341329]  [<ffffffffa004f728>] btrfs_alloc_tree_block+0x18c/0x405 [btrfs]
[   55.343566]  [<ffffffffa003fa34>] split_leaf+0x1e4/0x6a6 [btrfs]
[   55.345577]  [<ffffffffa0040567>] btrfs_search_slot+0x671/0x831 [btrfs]
[   55.347679]  [<ffffffff810682d7>] ? get_parent_ip+0xe/0x3e
[   55.349434]  [<ffffffffa0041cb2>] btrfs_insert_empty_items+0x5d/0xa8 [btrfs]
[   55.351681]  [<ffffffffa004ecfb>] __btrfs_run_delayed_refs+0x7a6/0xf35 [btrfs]
[   55.353979]  [<ffffffffa00512ea>] btrfs_run_delayed_refs+0x6e/0x226 [btrfs]
[   55.356212]  [<ffffffffa0060e21>] ? start_transaction+0x192/0x534 [btrfs]
[   55.358378]  [<ffffffffa0060e21>] ? start_transaction+0x192/0x534 [btrfs]
[   55.360626]  [<ffffffffa0060221>] btrfs_commit_transaction+0x4c/0xaba [btrfs]
[   55.362894]  [<ffffffffa0060e21>] ? start_transaction+0x192/0x534 [btrfs]
[   55.365221]  [<ffffffffa0073428>] btrfs_sync_file+0x29c/0x310 [btrfs]
[   55.367273]  [<ffffffff81186808>] vfs_fsync_range+0x8f/0x9e
[   55.369047]  [<ffffffff81186833>] vfs_fsync+0x1c/0x1e
[   55.370654]  [<ffffffff81186869>] do_fsync+0x34/0x4e
[   55.372246]  [<ffffffff81186ab3>] SyS_fsync+0x10/0x14
[   55.373851]  [<ffffffff81554f97>] system_call_fastpath+0x12/0x6f
[   55.381070] BTRFS: error (device hdb1) in btrfs_run_delayed_refs:2821: errno=-12 Out of memory
[   55.382431] BTRFS warning (device hdb1): Skipping commit of aborted transaction.
[   55.382433] BTRFS warning (device hdb1): cleanup_transaction:1692: Aborting unused transaction(IO failure).
[   55.384280] ------------[ cut here ]------------
[   55.384312] WARNING: CPU: 0 PID: 3010 at fs/btrfs/delayed-ref.c:438 btrfs_select_ref_head+0xd9/0xfe [btrfs]()
[...]
[   55.384337] Call Trace:
[   55.384353]  [<ffffffff8154e6f0>] dump_stack+0x4f/0x7b
[   55.384357]  [<ffffffff8107f717>] ? down_trylock+0x2d/0x37
[   55.384359]  [<ffffffff81046977>] warn_slowpath_common+0xa1/0xbb
[   55.384398]  [<ffffffffa00a1d6b>] ? btrfs_select_ref_head+0xd9/0xfe [btrfs]
[   55.384400]  [<ffffffff81046a34>] warn_slowpath_null+0x1a/0x1c
[   55.384423]  [<ffffffffa00a1d6b>] btrfs_select_ref_head+0xd9/0xfe [btrfs]
[   55.384446]  [<ffffffffa004e5f7>] ? __btrfs_run_delayed_refs+0xa2/0xf35 [btrfs]
[   55.384455]  [<ffffffffa004e600>] __btrfs_run_delayed_refs+0xab/0xf35 [btrfs]
[   55.384476]  [<ffffffffa00512ea>] btrfs_run_delayed_refs+0x6e/0x226 [btrfs]
[   55.384499]  [<ffffffffa0060e21>] ? start_transaction+0x192/0x534 [btrfs]
[   55.384521]  [<ffffffffa0060e21>] ? start_transaction+0x192/0x534 [btrfs]
[   55.384543]  [<ffffffffa0060221>] btrfs_commit_transaction+0x4c/0xaba [btrfs]
[   55.384565]  [<ffffffffa0060e21>] ? start_transaction+0x192/0x534 [btrfs]
[   55.384588]  [<ffffffffa0073428>] btrfs_sync_file+0x29c/0x310 [btrfs]
[   55.384591]  [<ffffffff81186808>] vfs_fsync_range+0x8f/0x9e
[   55.384592]  [<ffffffff81186833>] vfs_fsync+0x1c/0x1e
[   55.384593]  [<ffffffff81186869>] do_fsync+0x34/0x4e
[   55.384594]  [<ffffffff81186ab3>] SyS_fsync+0x10/0x14
[   55.384595]  [<ffffffff81554f97>] system_call_fastpath+0x12/0x6f
[...]
[   55.384608] ---[ end trace c29799da1d4dd621 ]---
[   55.437323] BTRFS info (device hdb1): forced readonly
[   55.438815] BTRFS info (device hdb1): delayed_refs has NO entry

Fix this by being explicit about the no-fail behavior of this allocation
path and use __GFP_NOFAIL.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b9755ce98218..3cfbd6261f9b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4624,9 +4624,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
 {
 	struct extent_buffer *eb = NULL;
 
-	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
-	if (eb == NULL)
-		return NULL;
+	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
 	eb->start = start;
 	eb->len = len;
 	eb->fs_info = fs_info;
@@ -4884,7 +4882,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 		return NULL;
 
 	for (i = 0; i < num_pages; i++, index++) {
-		p = find_or_create_page(mapping, index, GFP_NOFS);
+		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
 		if (!p)
 			goto free_eb;
 
-- 
cgit v1.2.3


From 277fb5fc177dc4674ef6151a7697f5396bbdff11 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 19 Aug 2015 14:17:41 +0200
Subject: btrfs: use __GFP_NOFAIL in alloc_btrfs_bio

alloc_btrfs_bio relies on GFP_NOFS allocation when committing the
transaction but this allocation context is rather weak wrt. reclaim
capabilities. The page allocator currently tries hard to not fail these
allocations if they are small (<=PAGE_ALLOC_COSTLY_ORDER) but it can
still fail if the _current_ process is the OOM killer victim. Moreover
there is an attempt to move away from the default no-fail behavior and
allow these allocation to fail more eagerly. This would lead to:

[   37.928625] kernel BUG at fs/btrfs/extent_io.c:4045

which is clearly undesirable and the nofail behavior should be explicit
if the allocation failure cannot be tolerated.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7c84a8122c37..53a38075911e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5082,9 +5082,7 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
 		 * and the stripes
 		 */
 		sizeof(u64) * (total_stripes),
-		GFP_NOFS);
-	if (!bbio)
-		return NULL;
+		GFP_NOFS|__GFP_NOFAIL);
 
 	atomic_set(&bbio->error, 0);
 	atomic_set(&bbio->refs, 1);
-- 
cgit v1.2.3


From 1f9b8c8fbc9a4d029760b16f477b9d15500e3a34 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 12 Aug 2015 11:54:35 +0100
Subject: Btrfs: check if previous transaction aborted to avoid fs corruption

While we are committing a transaction, it's possible the previous one is
still finishing its commit and therefore we wait for it to finish first.
However we were not checking if that previous transaction ended up getting
aborted after we waited for it to commit, so we ended up committing the
current transaction which can lead to fs corruption because the new
superblock can point to trees that have had one or more nodes/leafs that
were never durably persisted.
The following sequence diagram exemplifies how this is possible:

          CPU 0                                                        CPU 1

  transaction N starts

  (...)

  btrfs_commit_transaction(N)

    cur_trans->state = TRANS_STATE_COMMIT_START;
    (...)
    cur_trans->state = TRANS_STATE_COMMIT_DOING;
    (...)

    cur_trans->state = TRANS_STATE_UNBLOCKED;
    root->fs_info->running_transaction = NULL;

                                                              btrfs_start_transaction()
                                                                 --> starts transaction N + 1

    btrfs_write_and_wait_transaction(trans, root);
      --> starts writing all new or COWed ebs created
          at transaction N

                                                              creates some new ebs, COWs some
                                                              existing ebs but doesn't COW or
                                                              deletes eb X

                                                              btrfs_commit_transaction(N + 1)
                                                                (...)
                                                                cur_trans->state = TRANS_STATE_COMMIT_START;
                                                                (...)
                                                                wait_for_commit(root, prev_trans);
                                                                  --> prev_trans == transaction N

    btrfs_write_and_wait_transaction() continues
    writing ebs
       --> fails writing eb X, we abort transaction N
           and set bit BTRFS_FS_STATE_ERROR on
           fs_info->fs_state, so no new transactions
           can start after setting that bit

       cleanup_transaction()
         btrfs_cleanup_one_transaction()
           wakes up task at CPU 1

                                                                continues, doesn't abort because
                                                                cur_trans->aborted (transaction N + 1)
                                                                is zero, and no checks for bit
                                                                BTRFS_FS_STATE_ERROR in fs_info->fs_state
                                                                are made

                                                                btrfs_write_and_wait_transaction(trans, root);
                                                                  --> succeeds, no errors during writeback

                                                                write_ctree_super(trans, root, 0);
                                                                  --> succeeds
                                                                  --> we have now a superblock that points us
                                                                      to some root that uses eb X, which was
                                                                      never written to disk

In this scenario future attempts to read eb X from disk results in an
error message like "parent transid verify failed on X wanted Y found Z".

So fix this by aborting the current transaction if after waiting for the
previous transaction we verify that it was aborted.

Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/transaction.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 20267d47dbcd..68ad89e23713 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1895,8 +1895,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			spin_unlock(&root->fs_info->trans_lock);
 
 			wait_for_commit(root, prev_trans);
+			ret = prev_trans->aborted;
 
 			btrfs_put_transaction(prev_trans);
+			if (ret)
+				goto cleanup_transaction;
 		} else {
 			spin_unlock(&root->fs_info->trans_lock);
 		}
-- 
cgit v1.2.3


From b84b8390d6009cde5134f775a251103c14bbed74 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 19 Aug 2015 11:09:40 +0100
Subject: Btrfs: fix file read corruption after extent cloning and fsync

If we partially clone one extent of a file into a lower offset of the
file, fsync the file, power fail and then mount the fs to trigger log
replay, we can get multiple checksum items in the csum tree that overlap
each other and result in checksum lookup failures later. Those failures
can make file data read requests assume a checksum value of 0, but they
will not return an error (-EIO for example) to userspace exactly because
the expected checksum value 0 is a special value that makes the read bio
endio callback return success and set all the bytes of the corresponding
page with the value 0x01 (at fs/btrfs/inode.c:__readpage_endio_check()).
From a userspace perspective this is equivalent to file corruption
because we are not returning what was written to the file.

Details about how this can happen, and why, are included inline in the
following reproducer test case for fstests and the comment added to
tree-log.c.

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      _cleanup_flakey
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter
  . ./common/dmflakey

  # real QA test starts here
  _need_to_be_root
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _require_dm_flakey
  _require_cloner
  _require_metadata_journaling $SCRATCH_DEV

  rm -f $seqres.full

  _scratch_mkfs >>$seqres.full 2>&1
  _init_flakey
  _mount_flakey

  # Create our test file with a single 100K extent starting at file
  # offset 800K. We fsync the file here to make the fsync log tree gets
  # a single csum item that covers the whole 100K extent, which causes
  # the second fsync, done after the cloning operation below, to not
  # leave in the log tree two csum items covering two sub-ranges
  # ([0, 20K[ and [20K, 100K[)) of our extent.
  $XFS_IO_PROG -f -c "pwrite -S 0xaa 800K 100K"  \
                  -c "fsync"                     \
                   $SCRATCH_MNT/foo | _filter_xfs_io

  # Now clone part of our extent into file offset 400K. This adds a file
  # extent item to our inode's metadata that points to the 100K extent
  # we created before, using a data offset of 20K and a data length of
  # 20K, so that it refers to the sub-range [20K, 40K[ of our original
  # extent.
  $CLONER_PROG -s $((800 * 1024 + 20 * 1024)) -d $((400 * 1024)) \
      -l $((20 * 1024)) $SCRATCH_MNT/foo $SCRATCH_MNT/foo

  # Now fsync our file to make sure the extent cloning is durably
  # persisted. This fsync will not add a second csum item to the log
  # tree containing the checksums for the blocks in the sub-range
  # [20K, 40K[ of our extent, because there was already a csum item in
  # the log tree covering the whole extent, added by the first fsync
  # we did before.
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo

  echo "File digest before power failure:"
  md5sum $SCRATCH_MNT/foo | _filter_scratch

  # Silently drop all writes and ummount to simulate a crash/power
  # failure.
  _load_flakey_table $FLAKEY_DROP_WRITES
  _unmount_flakey

  # Allow writes again, mount to trigger log replay and validate file
  # contents.
  # The fsync log replay first processes the file extent item
  # corresponding to the file offset 400K (the one which refers to the
  # [20K, 40K[ sub-range of our 100K extent) and then processes the file
  # extent item for file offset 800K. It used to happen that when
  # processing the later, it erroneously left in the csum tree 2 csum
  # items that overlapped each other, 1 for the sub-range [20K, 40K[ and
  # 1 for the whole range of our extent. This introduced a problem where
  # subsequent lookups for the checksums of blocks within the range
  # [40K, 100K[ of our extent would not find anything because lookups in
  # the csum tree ended up looking only at the smaller csum item, the
  # one covering the subrange [20K, 40K[. This made read requests assume
  # an expected checksum with a value of 0 for those blocks, which caused
  # checksum verification failure when the read operations finished.
  # However those checksum failure did not result in read requests
  # returning an error to user space (like -EIO for e.g.) because the
  # expected checksum value had the special value 0, and in that case
  # btrfs set all bytes of the corresponding pages with the value 0x01
  # and produce the following warning in dmesg/syslog:
  #
  #  "BTRFS warning (device dm-0): csum failed ino 257 off 917504 csum\
  #   1322675045 expected csum 0"
  #
  _load_flakey_table $FLAKEY_ALLOW_WRITES
  _mount_flakey

  echo "File digest after log replay:"
  # Must match the same digest he had after cloning the extent and
  # before the power failure happened.
  md5sum $SCRATCH_MNT/foo | _filter_scratch

  _unmount_flakey

  status=0
  exit

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/tree-log.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6d650468d21a..1bbaace73383 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -722,11 +722,65 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 						&ordered_sums, 0);
 			if (ret)
 				goto out;
+			/*
+			 * Now delete all existing cums in the csum root that
+			 * cover our range. We do this because we can have an
+			 * extent that is completely referenced by one file
+			 * extent item and partially referenced by another
+			 * file extent item (like after using the clone or
+			 * extent_same ioctls). In this case if we end up doing
+			 * the replay of the one that partially references the
+			 * extent first, and we do not do the csum deletion
+			 * below, we can get 2 csum items in the csum tree that
+			 * overlap each other. For example, imagine our log has
+			 * the two following file extent items:
+			 *
+			 * key (257 EXTENT_DATA 409600)
+			 *     extent data disk byte 12845056 nr 102400
+			 *     extent data offset 20480 nr 20480 ram 102400
+			 *
+			 * key (257 EXTENT_DATA 819200)
+			 *     extent data disk byte 12845056 nr 102400
+			 *     extent data offset 0 nr 102400 ram 102400
+			 *
+			 * Where the second one fully references the 100K extent
+			 * that starts at disk byte 12845056, and the log tree
+			 * has a single csum item that covers the entire range
+			 * of the extent:
+			 *
+			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+			 *
+			 * After the first file extent item is replayed, the
+			 * csum tree gets the following csum item:
+			 *
+			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+			 *
+			 * Which covers the 20K sub-range starting at offset 20K
+			 * of our extent. Now when we replay the second file
+			 * extent item, if we do not delete existing csum items
+			 * that cover any of its blocks, we end up getting two
+			 * csum items in our csum tree that overlap each other:
+			 *
+			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+			 *
+			 * Which is a problem, because after this anyone trying
+			 * to lookup up for the checksum of any block of our
+			 * extent starting at an offset of 40K or higher, will
+			 * end up looking at the second csum item only, which
+			 * does not contain the checksum for any block starting
+			 * at offset 40K or higher of our extent.
+			 */
 			while (!list_empty(&ordered_sums)) {
 				struct btrfs_ordered_sum *sums;
 				sums = list_entry(ordered_sums.next,
 						struct btrfs_ordered_sum,
 						list);
+				if (!ret)
+					ret = btrfs_del_csums(trans,
+						      root->fs_info->csum_root,
+						      sums->bytenr,
+						      sums->len);
 				if (!ret)
 					ret = btrfs_csum_file_blocks(trans,
 						root->fs_info->csum_root,
-- 
cgit v1.2.3


From 3a9508b0221dfd290b95fb0ab199958fe078bbdf Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Fri, 21 Aug 2015 10:05:39 -0700
Subject: btrfs: fix compile when block cgroups are not enabled

bio->bi_css and bio->bi_ioc don't exist when block cgroups are not on.
This adds an ifdef around them.  It's not perfect, but our
use of bi_ioc is being removed in the 4.3 merge window.

The bi_css usage really should go into bio_clone, but I want to make
sure that doesn't introduce problems for other bio_clone use cases.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.c | 3 +++
 fs/btrfs/volumes.c   | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3cfbd6261f9b..fa19f2f68c1b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2730,9 +2730,12 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
 		btrfs_bio->csum = NULL;
 		btrfs_bio->csum_allocated = NULL;
 		btrfs_bio->end_io = NULL;
+
+#ifdef CONFIG_BLK_CGROUP
 		/* FIXME, put this into bio_clone_bioset */
 		if (bio->bi_css)
 			bio_associate_blkcg(new, bio->bi_css);
+#endif
 	}
 	return new;
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 53a38075911e..69520dfa8960 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5955,6 +5955,7 @@ again:
 	if (!bio)
 		return -ENOMEM;
 
+#ifdef CONFIG_BLK_CGROUP
 	if (first_bio->bi_ioc) {
 		get_io_context_active(first_bio->bi_ioc);
 		bio->bi_ioc = first_bio->bi_ioc;
@@ -5963,6 +5964,7 @@ again:
 		css_get(first_bio->bi_css);
 		bio->bi_css = first_bio->bi_css;
 	}
+#endif
 	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
 		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
 				 bvec->bv_offset) < bvec->bv_len) {
-- 
cgit v1.2.3