diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-05-18 09:29:48 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-05-18 09:29:48 +1000 |
commit | 4f47e7917c1761c062bf93154a729de7e96bcc55 (patch) | |
tree | c79ed7734ed2e49431fab29783ed58916eadc8c5 | |
parent | cfb825fa2e18837a28a015dc8a68a6456f84972b (diff) | |
parent | 4b1370b4e34a1487b79173bf4201201cc58c9979 (diff) |
Merge remote-tracking branch 'btrfs-kdave/for-next'
-rw-r--r-- | fs/btrfs/check-integrity.c | 35 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 18 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 77 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 41 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 146 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 37 | ||||
-rw-r--r-- | fs/btrfs/file-item.c | 31 | ||||
-rw-r--r-- | fs/btrfs/file.c | 41 | ||||
-rw-r--r-- | fs/btrfs/inode-map.c | 4 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 296 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 12 | ||||
-rw-r--r-- | fs/btrfs/qgroup.c | 220 | ||||
-rw-r--r-- | fs/btrfs/qgroup.h | 8 | ||||
-rw-r--r-- | fs/btrfs/reada.c | 1 | ||||
-rw-r--r-- | fs/btrfs/relocation.c | 12 | ||||
-rw-r--r-- | fs/btrfs/send.c | 8 | ||||
-rw-r--r-- | fs/btrfs/sysfs.c | 41 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 18 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 26 | ||||
-rw-r--r-- | fs/btrfs/volumes.h | 2 | ||||
-rw-r--r-- | include/trace/events/btrfs.h | 36 |
21 files changed, 706 insertions, 404 deletions
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ab14c2e635ca..6cabc8acee2a 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1668,14 +1668,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, dev_bytenr += (j - i) * PAGE_SIZE; i = j; } - for (i = 0; i < num_pages; i++) { + for (i = 0; i < num_pages; i++) block_ctx->datav[i] = kmap(block_ctx->pagev[i]); - if (!block_ctx->datav[i]) { - pr_info("btrfsic: kmap() failed (dev %s)!\n", - block_ctx->dev->name); - return -1; - } - } return block_ctx->len; } @@ -2822,44 +2816,47 @@ static void __btrfsic_submit_bio(struct bio *bio) dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { - unsigned int i; + unsigned int i = 0; u64 dev_bytenr; u64 cur_bytenr; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; int bio_is_patched; char **mapped_datav; + unsigned int segs = bio_segments(bio); dev_bytenr = 512 * bio->bi_iter.bi_sector; bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_vcnt, + bio_op(bio), bio->bi_opf, segs, (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - mapped_datav = kmalloc_array(bio->bi_vcnt, + mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); if (!mapped_datav) goto leave; cur_bytenr = dev_bytenr; - bio_for_each_segment_all(bvec, bio, i) { - BUG_ON(bvec->bv_len != PAGE_SIZE); - mapped_datav[i] = kmap(bvec->bv_page); + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_len != PAGE_SIZE); + mapped_datav[i] = kmap(bvec.bv_page); + i++; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec->bv_len, bvec->bv_offset); - cur_bytenr += bvec->bv_len; + i, cur_bytenr, bvec.bv_len, bvec.bv_offset); + cur_bytenr += bvec.bv_len; } btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_datav, bio->bi_vcnt, + mapped_datav, segs, bio, &bio_is_patched, NULL, bio->bi_opf); - bio_for_each_segment_all(bvec, bio, i) - kunmap(bvec->bv_page); + bio_for_each_segment(bvec, bio, iter) + kunmap(bvec.bv_page); kfree(mapped_datav); } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 643c70d2b2e6..160e4b8b510a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,7 +48,6 @@ struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; extern struct kmem_cache *btrfs_trans_handle_cachep; -extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; @@ -716,6 +715,8 @@ struct btrfs_delayed_root; #define BTRFS_FS_BTREE_ERR 11 #define BTRFS_FS_LOG1_ERR 12 #define BTRFS_FS_LOG2_ERR 13 +#define BTRFS_FS_QUOTA_OVERRIDE 14 + /* * Indicate that a whole-filesystem exclusive operation is running * (device replace, resize, device add/delete, balance) @@ -748,8 +749,7 @@ struct btrfs_fs_info { struct rb_root block_group_cache_tree; /* keep track of unallocated space */ - spinlock_t free_chunk_lock; - u64 free_chunk_space; + atomic64_t free_chunk_space; struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; @@ -2703,9 +2703,13 @@ enum btrfs_flush_state { COMMIT_TRANS = 6, }; -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -2722,8 +2726,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8685d67185d0..f52e98210590 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -89,7 +89,6 @@ struct btrfs_end_io_wq { struct btrfs_fs_info *info; int error; enum btrfs_wq_endio_type metadata; - struct list_head list; struct btrfs_work work; }; @@ -120,7 +119,6 @@ void btrfs_end_io_wq_exit(void) struct async_submit_bio { struct inode *inode; struct bio *bio; - struct list_head list; extent_submit_bio_hook_t *submit_bio_start; extent_submit_bio_hook_t *submit_bio_done; int mirror_num; @@ -2626,7 +2624,6 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); - spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->qgroup_op_lock); @@ -2667,7 +2664,7 @@ int open_ctree(struct super_block *sb, fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; - fs_info->free_chunk_space = 0; + atomic64_set(&fs_info->free_chunk_space, 0); fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ @@ -3467,10 +3464,12 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - if (i == 0) - ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh); - else + if (i == 0) { + ret = btrfsic_submit_bh(REQ_OP_WRITE, + REQ_SYNC | REQ_FUA, bh); + } else { ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + } if (ret) errors++; } @@ -3507,6 +3506,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait) if (wait) { bio = device->flush_bio; if (!bio) + /* + * This means the alloc has failed with ENOMEM, however + * here we return 0, as its not a device error. + */ return 0; wait_for_completion(&device->flush_wait); @@ -3535,7 +3538,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; device->flush_bio = bio; @@ -3546,6 +3549,32 @@ static int write_dev_flush(struct btrfs_device *device, int wait) return 0; } +static int check_barrier_error(struct btrfs_fs_devices *fsdevs) +{ + int submit_flush_error = 0; + int dev_flush_error = 0; + struct btrfs_device *dev; + int tolerance; + + list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) { + if (!dev->bdev) { + submit_flush_error++; + dev_flush_error++; + continue; + } + if (dev->last_flush_error == -ENOMEM) + submit_flush_error++; + if (dev->last_flush_error && dev->last_flush_error != -ENOMEM) + dev_flush_error++; + } + + tolerance = fsdevs->fs_info->num_tolerated_disk_barrier_failures; + if (submit_flush_error > tolerance || dev_flush_error > tolerance) + return -EIO; + + return 0; +} + /* * send an empty flush down to each device in parallel, * then wait for them @@ -3573,6 +3602,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) ret = write_dev_flush(dev, 0); if (ret) errors_send++; + dev->last_flush_error = ret; } /* wait for all the barriers */ @@ -3587,12 +3617,30 @@ static int barrier_all_devices(struct btrfs_fs_info *info) continue; ret = write_dev_flush(dev, 1); - if (ret) + if (ret) { + dev->last_flush_error = ret; errors_wait++; + } + } + + /* + * Try hard in case of flush. Lets say, in RAID1 we have + * the following situation + * dev1: EIO dev2: ENOMEM + * this is not a fatal error as we hope to recover from + * ENOMEM in the next attempt to flush. + * But the following is considered as fatal + * dev1: ENOMEM dev2: ENOMEM + * dev1: bdev == NULL dev2: ENOMEM + */ + if (errors_send || errors_wait) { + /* + * At some point we need the status of all disks + * to arrive at the volume status. So error checking + * is being pushed to a separate loop. + */ + return check_barrier_error(info->fs_devices); } - if (errors_send > info->num_tolerated_disk_barrier_failures || - errors_wait > info->num_tolerated_disk_barrier_failures) - return -EIO; return 0; } @@ -4576,11 +4624,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); - - /* - memset(cur_trans, 0, sizeof(*cur_trans)); - kmem_cache_free(btrfs_transaction_cachep, cur_trans); - */ } static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e390451c72e6..6e9e4461c2e9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3364,6 +3364,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; u64 num_pages = 0; @@ -3483,7 +3484,7 @@ again: num_pages *= 16; num_pages *= PAGE_SIZE; - ret = btrfs_check_data_free_space(inode, 0, num_pages); + ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); if (ret) goto out_put; @@ -3514,6 +3515,7 @@ out: block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); + extent_changeset_free(data_reserved); return ret; } @@ -4277,12 +4279,8 @@ commit_trans: return ret; } -/* - * New check_data_free_space() with ability for precious data reservation - * Will replace old btrfs_check_data_free_space(), but for patch split, - * add a new function first and then replace it. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret; @@ -4297,9 +4295,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(inode, start, len); - if (ret) + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); + if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; return ret; } @@ -4340,7 +4340,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4350,7 +4351,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) start = round_down(start, root->fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(inode, start, len); - btrfs_qgroup_free_data(inode, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len); } static void force_metadata_allocation(struct btrfs_fs_info *info) @@ -4645,9 +4646,7 @@ static int can_overcommit(struct btrfs_root *root, used += space_info->bytes_may_use; - spin_lock(&fs_info->free_chunk_lock); - avail = fs_info->free_chunk_space; - spin_unlock(&fs_info->free_chunk_lock); + avail = atomic64_read(&fs_info->free_chunk_space); /* * If we have dup, raid1 or raid10 then only half of the free @@ -6123,6 +6122,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * @inode: inode we're writing to * @start: start range we are writing to * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. * * This will do the following things * @@ -6140,16 +6141,17 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * Return 0 for success * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, start, len); + ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); if (ret < 0) - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, *reserved, start, len); return ret; } @@ -6168,10 +6170,11 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) * list if there are no delalloc bytes left. * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { btrfs_delalloc_release_metadata(BTRFS_I(inode), len); - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, reserved, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d8da3edf2ac3..da4658feb69d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2458,7 +2458,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) if (!uptodate) { ClearPageUptodate(page); SetPageError(page); - ret = ret < 0 ? ret : -EIO; + ret = err < 0 ? err : -EIO; mapping_set_error(page->mapping, ret); } } @@ -2713,7 +2713,7 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) struct btrfs_io_bio *btrfs_bio; struct bio *new; - new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); + new = bio_clone_fast(bio, gfp_mask, btrfs_bioset); if (new) { btrfs_bio = btrfs_io_bio(new); btrfs_bio->csum = NULL; @@ -2739,6 +2739,24 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) return bio; } +struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) +{ + struct bio *bio; + struct btrfs_io_bio *btrfs_bio; + + /* this will fail when it's backed by a bioset */ + bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset); + ASSERT(bio); + + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; + + bio_trim(bio, offset >> 9, size >> 9); + btrfs_bio->iter = bio->bi_iter; + return bio; +} static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) @@ -4377,6 +4395,123 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, return NULL; } +/* + * To cache previous fiemap extent + * + * Will be used for merging fiemap extent + */ +struct fiemap_cache { + u64 offset; + u64 phys; + u64 len; + u32 flags; + bool cached; +}; + +/* + * Helper to submit fiemap extent. + * + * Will try to merge current fiemap extent specified by @offset, @phys, + * @len and @flags with cached one. + * And only when we fails to merge, cached one will be submitted as + * fiemap extent. + * + * Return value is the same as fiemap_fill_next_extent(). + */ +static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + u64 offset, u64 phys, u64 len, u32 flags) +{ + int ret = 0; + + if (!cache->cached) + goto assign; + + /* + * Sanity check, extent_fiemap() should have ensured that new + * fiemap extent won't overlap with cahced one. + * Not recoverable. + * + * NOTE: Physical address can overlap, due to compression + */ + if (cache->offset + cache->len > offset) { + WARN_ON(1); + return -EINVAL; + } + + /* + * Only merges fiemap extents if + * 1) Their logical addresses are continuous + * + * 2) Their physical addresses are continuous + * So truly compressed (physical size smaller than logical size) + * extents won't get merged with each other + * + * 3) Share same flags except FIEMAP_EXTENT_LAST + * So regular extent won't get merged with prealloc extent + */ + if (cache->offset + cache->len == offset && + cache->phys + cache->len == phys && + (cache->flags & ~FIEMAP_EXTENT_LAST) == + (flags & ~FIEMAP_EXTENT_LAST)) { + cache->len += len; + cache->flags |= flags; + goto try_submit_last; + } + + /* Not mergeable, need to submit cached one */ + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret) + return ret; +assign: + cache->cached = true; + cache->offset = offset; + cache->phys = phys; + cache->len = len; + cache->flags = flags; +try_submit_last: + if (cache->flags & FIEMAP_EXTENT_LAST) { + ret = fiemap_fill_next_extent(fieinfo, cache->offset, + cache->phys, cache->len, cache->flags); + cache->cached = false; + } + return ret; +} + +/* + * Sanity check for fiemap cache + * + * All fiemap cache should be submitted by emit_fiemap_extent() + * Iteration should be terminated either by last fiemap extent or + * fieinfo->fi_extents_max. + * So no cached fiemap should exist. + */ +static int check_fiemap_cache(struct btrfs_fs_info *fs_info, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + int ret; + + if (!cache->cached) + return 0; + + /* Small and recoverbale problem, only to info developer */ +#ifdef CONFIG_BTRFS_DEBUG + WARN_ON(1); +#endif + btrfs_warn(fs_info, + "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x", + cache->offset, cache->phys, cache->len, cache->flags); + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret > 0) + ret = 0; + return ret; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -4394,6 +4529,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(inode)->root; + struct fiemap_cache cache = { 0 }; int end = 0; u64 em_start = 0; u64 em_len = 0; @@ -4573,8 +4709,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_LAST; end = 1; } - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); + ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, + em_len, flags); if (ret) { if (ret == 1) ret = 0; @@ -4582,6 +4718,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } } out_free: + if (!ret) + ret = check_fiemap_cache(root->fs_info, fieinfo, &cache); free_extent_map(em); out: btrfs_free_path(path); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1eafa2f0ede3..204c7660a746 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -205,12 +205,46 @@ struct extent_buffer { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - u64 bytes_changed; + unsigned int bytes_changed; /* Changed ranges */ struct ulist range_changed; }; +static inline void extent_changeset_init(struct extent_changeset *changeset) +{ + changeset->bytes_changed = 0; + ulist_init(&changeset->range_changed); +} + +static inline struct extent_changeset *extent_changeset_alloc(void) +{ + struct extent_changeset *ret; + + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + extent_changeset_init(ret); + return ret; +} + +static inline void extent_changeset_release(struct extent_changeset *changeset) +{ + if (!changeset) + return; + changeset->bytes_changed = 0; + ulist_release(&changeset->range_changed); +} + +static inline void extent_changeset_free(struct extent_changeset *changeset) +{ + if (!changeset) + return; + extent_changeset_release(changeset); + kfree(changeset); +} + static inline void extent_set_compress_type(unsigned long *bio_flags, int compress_type) { @@ -464,6 +498,7 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); +struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); struct btrfs_fs_info; struct btrfs_inode; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 64fcb31d7163..9f6062c82419 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -164,7 +164,8 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -177,7 +178,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 page_bytes_left; u32 diff; int nblocks; - int count = 0, i; + int count = 0; u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); path = btrfs_alloc_path(); @@ -206,8 +207,6 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (bio->bi_iter.bi_size > PAGE_SIZE * 8) path->reada = READA_FORWARD; - WARN_ON(bio->bi_vcnt <= 0); - /* * the free space stuff is only read when it hasn't been * updated in the current transaction. So, we can safely @@ -223,13 +222,13 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (dio) offset = logical_offset; - bio_for_each_segment_all(bvec, bio, i) { - page_bytes_left = bvec->bv_len; + bio_for_each_segment(bvec, bio, iter) { + page_bytes_left = bvec.bv_len; if (count) goto next; if (!dio) - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, (u32 *)csum, nblocks); if (count) @@ -440,15 +439,15 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; char *data; - struct bio_vec *bvec; + struct bvec_iter iter; + struct bio_vec bvec; int index; int nr_sectors; - int i, j; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; + int i; u64 offset; - WARN_ON(bio->bi_vcnt <= 0); sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), GFP_NOFS); if (!sums) @@ -465,19 +464,19 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; - bio_for_each_segment_all(bvec, bio, j) { + bio_for_each_segment(bvec, bio, iter) { if (!contig) - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + offset = page_offset(bvec.bv_page) + bvec.bv_offset; if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ } - data = kmap_atomic(bvec->bv_page); + data = kmap_atomic(bvec.bv_page); nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, - bvec->bv_len + fs_info->sectorsize + bvec.bv_len + fs_info->sectorsize - 1); for (i = 0; i < nr_sectors; i++) { @@ -504,12 +503,12 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, + total_bytes; index = 0; - data = kmap_atomic(bvec->bv_page); + data = kmap_atomic(bvec.bv_page); } sums->sums[index] = ~(u32)0; sums->sums[index] - = btrfs_csum_data(data + bvec->bv_offset + = btrfs_csum_data(data + bvec.bv_offset + (i * fs_info->sectorsize), sums->sums[index], fs_info->sectorsize); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index da1096eb1a40..0502bd2272fe 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1581,6 +1581,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; u64 lockend; @@ -1628,7 +1629,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); - ret = btrfs_check_data_free_space(inode, pos, write_bytes); + extent_changeset_release(data_reserved); + ret = btrfs_check_data_free_space(inode, &data_reserved, pos, + write_bytes); if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && @@ -1657,8 +1660,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, pos, - write_bytes); + btrfs_free_reserved_data_space(inode, + data_reserved, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -1740,8 +1744,9 @@ again: __pos = round_down(pos, fs_info->sectorsize) + (dirty_pages << PAGE_SHIFT); - btrfs_delalloc_release_space(inode, __pos, - release_bytes); + btrfs_delalloc_release_space(inode, + data_reserved, __pos, + release_bytes); } } @@ -1796,12 +1801,13 @@ again: btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); } else { - btrfs_delalloc_release_space(inode, - round_down(pos, fs_info->sectorsize), - release_bytes); + btrfs_delalloc_release_space(inode, data_reserved, + round_down(pos, fs_info->sectorsize), + release_bytes); } } + extent_changeset_free(data_reserved); return num_written ? num_written : ret; } @@ -2769,6 +2775,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; struct list_head reserve_list; @@ -2898,8 +2905,8 @@ static long btrfs_fallocate(struct file *file, int mode, free_extent_map(em); break; } - ret = btrfs_qgroup_reserve_data(inode, cur_offset, - last_byte - cur_offset); + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + cur_offset, last_byte - cur_offset); if (ret < 0) { free_extent_map(em); break; @@ -2910,8 +2917,8 @@ static long btrfs_fallocate(struct file *file, int mode, * range, free reserved data space first, otherwise * it'll result in false ENOSPC error. */ - btrfs_free_reserved_data_space(inode, cur_offset, - last_byte - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -2930,8 +2937,9 @@ static long btrfs_fallocate(struct file *file, int mode, range->len, i_blocksize(inode), offset + len, &alloc_hint); else - btrfs_free_reserved_data_space(inode, range->start, - range->len); + btrfs_free_reserved_data_space(inode, + data_reserved, range->start, + range->len); list_del(&range->list); kfree(range); } @@ -2969,8 +2977,9 @@ out: inode_unlock(inode); /* Let go of our reservation. */ if (ret != 0) - btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + alloc_start, alloc_end - cur_offset); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 5c6c20ec64d8..d02019747d00 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -400,6 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_path *path; struct inode *inode; struct btrfs_block_rsv *rsv; + struct extent_changeset *data_reserved = NULL; u64 num_bytes; u64 alloc_hint = 0; int ret; @@ -492,7 +493,7 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc); if (ret) goto out_put; @@ -516,6 +517,7 @@ out: trans->bytes_reserved = num_bytes; btrfs_free_path(path); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 17cbe9306faf..8c37b4fa4cbb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -86,7 +86,6 @@ static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; -struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; @@ -350,7 +349,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -2035,6 +2034,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_writepage_fixup *fixup; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct page *page; struct inode *inode; u64 page_start; @@ -2072,7 +2072,7 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, PAGE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); @@ -2092,6 +2092,7 @@ out_page: unlock_page(page); put_page(page); kfree(fixup); + extent_changeset_free(data_reserved); } /* @@ -2143,6 +2144,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; + u64 qg_released; int extent_inserted = 0; int ret; @@ -2198,13 +2200,17 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes, &ins); + /* * Release the reserved range from inode dirty range map, as it is * already moved into delayed_ref_head */ - btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + if (ret < 0) + goto out; + qg_released = ret; + ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins); out: btrfs_free_path(path); @@ -2926,7 +2932,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) * space for NOCOW range. * As NOCOW won't cause a new delayed ref, just free the space */ - btrfs_qgroup_free_data(inode, ordered_extent->file_offset, + btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (nolock) @@ -2952,7 +2958,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ret = test_range_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 1, cached_state); + EXTENT_DEFRAG, 0, cached_state); if (ret) { u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); if (0 && last_snapshot >= BTRFS_I(inode)->generation) @@ -4762,6 +4768,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; @@ -4776,7 +4783,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, round_down(from, blocksize), blocksize); if (ret) goto out; @@ -4784,7 +4791,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, round_down(from, blocksize), blocksize); ret = -ENOMEM; @@ -4856,11 +4863,12 @@ again: out_unlock: if (ret) - btrfs_delalloc_release_space(inode, block_start, + btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize); unlock_page(page); put_page(page); out: + extent_changeset_free(data_reserved); return ret; } @@ -5255,7 +5263,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * Note, end is the bytenr of last byte, so we need + 1 here. */ if (state->state & EXTENT_DELALLOC) - btrfs_qgroup_free_data(inode, start, end - start + 1); + btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -5868,7 +5876,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_item *item; struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; @@ -5919,7 +5926,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) continue; } - item = btrfs_item_nr(slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) @@ -7480,11 +7486,11 @@ out: bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) { struct radix_tree_root *root = &inode->i_mapping->page_tree; - int found = false; + bool found = false; void **pagep = NULL; struct page *page = NULL; - int start_idx; - int end_idx; + unsigned long start_idx; + unsigned long end_idx; start_idx = start >> PAGE_SHIFT; @@ -7981,6 +7987,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, struct bio *bio; int isector; int read_mode = 0; + int segs; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -7996,9 +8003,9 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, return -EIO; } - if ((failed_bio->bi_vcnt > 1) - || (failed_bio->bi_io_vec->bv_len - > btrfs_inode_sectorsize(inode))) + segs = bio_segments(failed_bio); + if (segs > 1 || + (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode))) read_mode |= REQ_FAILFAST_DEV; isector = start - btrfs_io_bio(failed_bio)->logical; @@ -8056,36 +8063,40 @@ static int __btrfs_correct_data_nocsum(struct inode *inode, struct btrfs_io_bio *io_bio) { struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_retry_complete done; u64 start; unsigned int pgoff; u32 sectorsize; int nr_sectors; - int i; int ret; + int err = 0; fs_info = BTRFS_I(inode)->root->fs_info; sectorsize = fs_info->sectorsize; start = io_bio->logical; done.inode = inode; + io_bio->bio.bi_iter = io_bio->iter; - bio_for_each_segment_all(bvec, &io_bio->bio, i) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - pgoff = bvec->bv_offset; + bio_for_each_segment(bvec, &io_bio->bio, iter) { + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); + pgoff = bvec.bv_offset; next_block_or_try_again: done.uptodate = 0; done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, pgoff, start, start + sectorsize - 1, io_bio->mirror_num, btrfs_retry_endio_nocsum, &done); - if (ret) - return ret; + if (ret) { + err = ret; + goto next; + } wait_for_completion(&done.done); @@ -8094,6 +8105,7 @@ next_block_or_try_again: goto next_block_or_try_again; } +next: start += sectorsize; nr_sectors--; @@ -8104,7 +8116,7 @@ next_block_or_try_again: } } - return 0; + return err; } static void btrfs_retry_endio(struct bio *bio) @@ -8145,7 +8157,8 @@ static int __btrfs_subio_endio_read(struct inode *inode, struct btrfs_io_bio *io_bio, int err) { struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_retry_complete done; u64 start; u64 offset = 0; @@ -8153,7 +8166,7 @@ static int __btrfs_subio_endio_read(struct inode *inode, int nr_sectors; unsigned int pgoff; int csum_pos; - int i; + int uptodate = !!(err == 0); int ret; fs_info = BTRFS_I(inode)->root->fs_info; @@ -8162,24 +8175,27 @@ static int __btrfs_subio_endio_read(struct inode *inode, err = 0; start = io_bio->logical; done.inode = inode; + io_bio->bio.bi_iter = io_bio->iter; - bio_for_each_segment_all(bvec, &io_bio->bio, i) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + bio_for_each_segment(bvec, &io_bio->bio, iter) { + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - pgoff = bvec->bv_offset; + pgoff = bvec.bv_offset; next_block: - csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); - ret = __readpage_endio_check(inode, io_bio, csum_pos, - bvec->bv_page, pgoff, start, - sectorsize); - if (likely(!ret)) - goto next; + if (uptodate) { + csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); + ret = __readpage_endio_check(inode, io_bio, csum_pos, + bvec.bv_page, pgoff, + start, sectorsize); + if (likely(!ret)) + goto next; + } try_again: done.uptodate = 0; done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, pgoff, start, start + sectorsize - 1, io_bio->mirror_num, btrfs_retry_endio, &done); @@ -8234,8 +8250,11 @@ static void btrfs_endio_direct_read(struct bio *bio) struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); int err = bio->bi_error; - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) { err = btrfs_subio_endio_read(inode, io_bio, err); + if (!err) + bio->bi_error = 0; + } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -8358,16 +8377,6 @@ out: bio_put(bio); } -static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, - u64 first_sector, gfp_t gfp_flags) -{ - struct bio *bio; - bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags); - if (bio) - bio_associate_current(bio); - return bio; -} - static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, @@ -8457,24 +8466,23 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *bio; struct bio *orig_bio = dip->orig_bio; - struct bio_vec *bvec; u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; - u64 submit_len = 0; u64 map_length; - u32 blocksize = fs_info->sectorsize; int async_submit = 0; - int nr_sectors; + u64 submit_len; + int clone_offset = 0; + int clone_len; int ret; - int i, j; map_length = orig_bio->bi_iter.bi_size; + submit_len = map_length; ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, &map_length, NULL, 0); if (ret) return -EIO; - if (map_length >= orig_bio->bi_iter.bi_size) { + if (map_length >= submit_len) { bio = orig_bio; dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; goto submit; @@ -8486,70 +8494,52 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, else async_submit = 1; - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); - if (!bio) - return -ENOMEM; - - bio->bi_opf = orig_bio->bi_opf; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; + /* bio split */ + ASSERT(map_length <= INT_MAX); atomic_inc(&dip->pending_bios); + while (submit_len > 0) { + clone_len = min_t(int, submit_len, map_length); - bio_for_each_segment_all(bvec, orig_bio, j) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - i = 0; -next_block: - if (unlikely(map_length < submit_len + blocksize || - bio_add_page(bio, bvec->bv_page, blocksize, - bvec->bv_offset + (i * blocksize)) < blocksize)) { - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. Otherwise, the dip might get freed - * before we're done setting it up - */ - atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, - file_offset, skip_sum, - async_submit); - if (ret) { - bio_put(bio); - atomic_dec(&dip->pending_bios); - goto out_err; - } - - start_sector += submit_len >> 9; - file_offset += submit_len; + /* + * This will never fail as it's passing GPF_NOFS and + * the allocation is backed by btrfs_bioset. + */ + bio = btrfs_bio_clone_partial(orig_bio, clone_offset, + clone_len); + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; + + ASSERT(submit_len >= clone_len); + submit_len -= clone_len; + if (submit_len == 0) + break; - submit_len = 0; + /* + * Increase the count before we submit the bio so we know + * the end IO handler won't happen before we increase the + * count. Otherwise, the dip might get freed before we're + * done setting it up. + */ + atomic_inc(&dip->pending_bios); - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, - start_sector, GFP_NOFS); - if (!bio) - goto out_err; - bio->bi_opf = orig_bio->bi_opf; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, + async_submit); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } - map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), - start_sector << 9, - &map_length, NULL, 0); - if (ret) { - bio_put(bio); - goto out_err; - } + clone_offset += clone_len; + start_sector += clone_len >> 9; + file_offset += clone_len; - goto next_block; - } else { - submit_len += blocksize; - if (--nr_sectors) { - i++; - goto next_block; - } - } + map_length = submit_len; + ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), + start_sector << 9, &map_length, NULL, 0); + if (ret) + goto out_err; } submit: @@ -8577,16 +8567,16 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, loff_t file_offset) { struct btrfs_dio_private *dip = NULL; - struct bio *io_bio = NULL; - struct btrfs_io_bio *btrfs_bio; + struct bio *bio = NULL; + struct btrfs_io_bio *io_bio; int skip_sum; bool write = (bio_op(dio_bio) == REQ_OP_WRITE); int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); - if (!io_bio) { + bio = btrfs_bio_clone(dio_bio, GFP_NOFS); + if (!bio) { ret = -ENOMEM; goto free_ordered; } @@ -8602,17 +8592,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, dip->logical_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; - io_bio->bi_private = dip; - dip->orig_bio = io_bio; + bio->bi_private = dip; + dip->orig_bio = bio; dip->dio_bio = dio_bio; atomic_set(&dip->pending_bios, 0); - btrfs_bio = btrfs_io_bio(io_bio); - btrfs_bio->logical = file_offset; + io_bio = btrfs_io_bio(bio); + io_bio->logical = file_offset; if (write) { - io_bio->bi_end_io = btrfs_endio_direct_write; + bio->bi_end_io = btrfs_endio_direct_write; } else { - io_bio->bi_end_io = btrfs_endio_direct_read; + bio->bi_end_io = btrfs_endio_direct_read; dip->subio_endio = btrfs_subio_endio_read; } @@ -8635,8 +8625,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, if (!ret) return; - if (btrfs_bio->end_io) - btrfs_bio->end_io(btrfs_bio, ret); + if (io_bio->end_io) + io_bio->end_io(io_bio, ret); free_ordered: /* @@ -8648,16 +8638,16 @@ free_ordered: * same as btrfs_endio_direct_[write|read] because we can't call these * callbacks - they require an allocated dip and a clone of dio_bio. */ - if (io_bio && dip) { - io_bio->bi_error = -EIO; - bio_endio(io_bio); + if (bio && dip) { + bio->bi_error = -EIO; + bio_endio(bio); /* - * The end io callbacks free our dip, do the final put on io_bio + * The end io callbacks free our dip, do the final put on bio * and all the cleanup and final put for dio_bio (through * dio_end_io()). */ dip = NULL; - io_bio = NULL; + bio = NULL; } else { if (write) __endio_write_update_ordered(inode, @@ -8675,8 +8665,8 @@ free_ordered: */ dio_end_io(dio_bio, ret); } - if (io_bio) - bio_put(io_bio); + if (bio) + bio_put(bio); kfree(dip); } @@ -8720,6 +8710,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_data dio_data = { 0 }; + struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; int flags = 0; @@ -8756,7 +8747,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) inode_unlock(inode); relock = true; } - ret = btrfs_delalloc_reserve_space(inode, offset, count); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + offset, count); if (ret) goto out; dio_data.outstanding_extents = count_max_extents(count); @@ -8788,8 +8780,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, offset, - dio_data.reserve); + btrfs_delalloc_release_space(inode, data_reserved, + offset, dio_data.reserve); /* * On error we might have left some ordered extents * without submitting corresponding bios for them, so @@ -8804,8 +8796,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.unsubmitted_oe_range_start, false); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, offset, - count - (size_t)ret); + btrfs_delalloc_release_space(inode, data_reserved, + offset, count - (size_t)ret); } out: if (wakeup) @@ -8813,6 +8805,7 @@ out: if (relock) inode_lock(inode); + extent_changeset_free(data_reserved); return ret; } @@ -9003,7 +8996,7 @@ again: * free the entire extent. */ if (PageDirty(page)) - btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -9045,6 +9038,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; unsigned long zero_start; loff_t size; @@ -9070,7 +9064,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, reserved_space); if (!ret) { ret = file_update_time(vmf->vma->vm_file); @@ -9124,8 +9118,8 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, page_start, - PAGE_SIZE - reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE - reserved_space); } } @@ -9176,13 +9170,16 @@ again: out_unlock: if (!ret) { sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); out: - btrfs_delalloc_release_space(inode, page_start, reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, page_start, + reserved_space); out_noreserve: sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return ret; } @@ -9514,7 +9511,6 @@ void btrfs_destroy_cachep(void) rcu_barrier(); kmem_cache_destroy(btrfs_inode_cachep); kmem_cache_destroy(btrfs_trans_handle_cachep); - kmem_cache_destroy(btrfs_transaction_cachep); kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); } @@ -9534,12 +9530,6 @@ int btrfs_init_cachep(void) if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", - sizeof(struct btrfs_transaction), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); - if (!btrfs_transaction_cachep) - goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, SLAB_MEM_SPREAD, NULL); @@ -10538,7 +10528,7 @@ next: btrfs_end_transaction(trans); } if (cur_offset < end) - btrfs_free_reserved_data_space(inode, cur_offset, + btrfs_free_reserved_data_space(inode, NULL, cur_offset, end - cur_offset + 1); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e176375f374f..2bbcb334a32a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1127,6 +1127,7 @@ static int cluster_pages_for_defrag(struct inode *inode, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_io_tree *tree; + struct extent_changeset *data_reserved = NULL; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); file_end = (isize - 1) >> PAGE_SHIFT; @@ -1135,7 +1136,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); if (ret) @@ -1226,7 +1227,7 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, (page_cnt - i_done) << PAGE_SHIFT); } @@ -1247,15 +1248,17 @@ again: unlock_page(pages[i]); put_page(pages[i]); } + extent_changeset_free(data_reserved); return i_done; out: for (i = 0; i < i_done; i++) { unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); + extent_changeset_free(data_reserved); return ret; } @@ -4897,7 +4900,6 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) goto out; } - /* FIXME: check if the IDs really exist */ if (sa->assign) { ret = btrfs_add_qgroup_relation(trans, fs_info, sa->src, sa->dst); @@ -4956,7 +4958,6 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto out; } - /* FIXME: check if the IDs really exist */ if (sa->create) { ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid); } else { @@ -5010,7 +5011,6 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) qgroupid = root->root_key.objectid; } - /* FIXME: check if the IDs really exist */ ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim); err = btrfs_end_transaction(trans); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index deffbeb74a0b..acfe37360084 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1406,38 +1406,6 @@ out: return ret; } -int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_qgroup_extent_record *record; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - u64 qgroup_to_skip; - int ret = 0; - - delayed_refs = &trans->transaction->delayed_refs; - qgroup_to_skip = delayed_refs->qgroup_to_skip; - - /* - * No need to do lock, since this function will only be called in - * btrfs_commit_transaction(). - */ - node = rb_first(&delayed_refs->dirty_extent_root); - while (node) { - record = rb_entry(node, struct btrfs_qgroup_extent_record, - node); - if (WARN_ON(!record->old_roots)) - ret = btrfs_find_all_roots(NULL, fs_info, - record->bytenr, 0, &record->old_roots); - if (ret < 0) - break; - if (qgroup_to_skip) - ulist_del(record->old_roots, qgroup_to_skip, 0); - node = rb_next(node); - } - return ret; -} - int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) @@ -1918,6 +1886,33 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, return 0; } +/* + * Helper to check if the @roots is a list of fs tree roots + * Return 0 for definitely not a fs/subvol tree roots ulist + * Return 1 for possible fs/subvol tree roots ulist(including empty) + */ +static int maybe_fs_roots(struct ulist *roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + + /* Empty one, still possible for fs roots */ + if (!roots || roots->nnodes == 0) + return 1; + + ULIST_ITER_INIT(&uiter); + unode = ulist_next(roots, &uiter); + if (!unode) + return 1; + + /* + * If it contains fs tree roots, then it must belongs to fs/subvol + * trees. + * If it contains non-fs tree, it won't be shared to fs/subvol trees. + */ + return is_fstree(unode->val); +} + int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -1934,10 +1929,20 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return 0; - if (new_roots) + if (new_roots) { + if (!maybe_fs_roots(new_roots)) + goto out_free; nr_new_roots = new_roots->nnodes; - if (old_roots) + } + if (old_roots) { + if (!maybe_fs_roots(old_roots)) + goto out_free; nr_old_roots = old_roots->nnodes; + } + + /* Quick exit, either not fs tree roots, or won't affect any qgroup */ + if (nr_old_roots == 0 && nr_new_roots == 0) + goto out_free; BUG_ON(!fs_info->quota_root); @@ -2017,6 +2022,19 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, if (!ret) { /* + * old roots should be searched when inserting qgroup + * extent record + */ + if (WARN_ON(!record->old_roots)) { + /* Search commit root to find old_roots */ + ret = btrfs_find_all_roots(NULL, fs_info, + record->bytenr, 0, + &record->old_roots); + if (ret < 0) + goto cleanup; + } + + /* * Use SEQ_LAST as time_seq to do special search, which * doesn't lock tree or delayed_refs and search current * root. It's safe inside commit_transaction(). @@ -2025,8 +2043,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, record->bytenr, SEQ_LAST, &new_roots); if (ret < 0) goto cleanup; - if (qgroup_to_skip) + if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); + ulist_del(record->old_roots, qgroup_to_skip, + 0); + } ret = btrfs_qgroup_account_extent(trans, fs_info, record->bytenr, record->num_bytes, record->old_roots, new_roots); @@ -2338,6 +2359,11 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) if (num_bytes == 0) return 0; + + if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && + capable(CAP_SYS_RESOURCE)) + enforce = false; + retry: spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -2806,55 +2832,130 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) * Return <0 for error (including -EQUOT) * * NOTE: this function may sleep for memory allocation. + * if btrfs_qgroup_reserve_data() is called multiple times with + * same @reserved, caller must ensure when error happens it's OK + * to free *ALL* reserved space. */ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_changeset changeset; struct ulist_node *unode; struct ulist_iterator uiter; + struct extent_changeset *reserved; + u64 orig_reserved; + u64 to_reserve; int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || !is_fstree(root->objectid) || len == 0) return 0; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + /* @reserved parameter is mandatory for qgroup */ + if (WARN_ON(!reserved_ret)) + return -EINVAL; + if (!*reserved_ret) { + *reserved_ret = extent_changeset_alloc(); + if (!*reserved_ret) + return -ENOMEM; + } + reserved = *reserved_ret; + /* Record already reserved space */ + orig_reserved = reserved->bytes_changed; ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, reserved); + + /* Newly reserved space */ + to_reserve = reserved->bytes_changed - orig_reserved; trace_btrfs_qgroup_reserve_data(inode, start, len, - changeset.bytes_changed, - QGROUP_RESERVE); + to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; - ret = qgroup_reserve(root, changeset.bytes_changed, true); + ret = qgroup_reserve(root, to_reserve, true); if (ret < 0) goto cleanup; - ulist_release(&changeset.range_changed); return ret; cleanup: - /* cleanup already reserved ranges */ + /* cleanup *ALL* already reserved ranges */ ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(&changeset.range_changed, &uiter))) + while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, GFP_NOFS); - ulist_release(&changeset.range_changed); + extent_changeset_release(reserved); + return ret; +} + +/* Free ranges specified by @reserved, normally in error path */ +static int qgroup_free_reserved_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct extent_changeset changeset; + int freed = 0; + int ret; + + extent_changeset_init(&changeset); + len = round_up(start + len, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(&reserved->range_changed, &uiter))) { + u64 range_start = unode->val; + /* unode->aux is the inclusive end */ + u64 range_len = unode->aux - range_start + 1; + u64 free_start; + u64 free_len; + + extent_changeset_release(&changeset); + + /* Only free range in range [start, start + len) */ + if (range_start >= start + len || + range_start + range_len <= start) + continue; + free_start = max(range_start, start); + free_len = min(start + len, range_start + range_len) - + free_start; + /* + * TODO: To also modify reserved->ranges_reserved to reflect + * the modification. + * + * However as long as we free qgroup reserved according to + * EXTENT_QGROUP_RESERVED, we won't double free. + * So not need to rush. + */ + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + free_start, free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + if (ret < 0) + goto out; + freed += changeset.bytes_changed; + } + btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed); + ret = freed; +out: + extent_changeset_release(&changeset); return ret; } -static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, - int free) +static int __btrfs_qgroup_release_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len, + int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; int ret; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + /* In release case, we shouldn't have @reserved */ + WARN_ON(!free && reserved); + if (free && reserved) + return qgroup_free_reserved_data(inode, reserved, start, len); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) @@ -2868,8 +2969,9 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, BTRFS_I(inode)->root->objectid, changeset.bytes_changed); + ret = changeset.bytes_changed; out: - ulist_release(&changeset.range_changed); + extent_changeset_release(&changeset); return ret; } @@ -2878,14 +2980,17 @@ out: * * Should be called when a range of pages get invalidated before reaching disk. * Or for error cleanup case. + * if @reserved is given, only reserved range in [@start, @start + @len) will + * be freed. * * For data written to disk, use btrfs_qgroup_release_data(). * * NOTE: This function may sleep for memory allocation. */ -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); } /* @@ -2905,7 +3010,7 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) */ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, @@ -2969,8 +3074,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) struct ulist_iterator iter; int ret; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_QGROUP_RESERVED, &changeset); @@ -2987,5 +3091,5 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) changeset.bytes_changed); } - ulist_release(&changeset.range_changed); + extent_changeset_release(&changeset); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index fe04d3f295c6..102aa7fb342b 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -134,8 +134,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); /* * Inform qgroup to trace one dirty extent, its info is recorded in @record. * So qgroup can account it at transaction committing time. @@ -243,9 +241,11 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, #endif /* New io_tree based accurate qgroup reserve API */ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, bool enforce); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index a17e775a4a89..ab852b8e3e37 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -66,7 +66,6 @@ struct reada_extctl { struct reada_extent { u64 logical; struct btrfs_key top; - int err; struct list_head extctl; int refcnt; spinlock_t lock; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d60df51959f7..2115f3e02dd5 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3093,11 +3093,12 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset; + struct extent_changeset *data_reserved = NULL; BUG_ON(cluster->start != cluster->boundary[0]); inode_lock(inode); - ret = btrfs_check_data_free_space(inode, prealloc_start, + ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, prealloc_end + 1 - prealloc_start); if (ret) goto out; @@ -3113,8 +3114,8 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; if (cur_offset < start) - btrfs_free_reserved_data_space(inode, cur_offset, - start - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); @@ -3125,10 +3126,11 @@ int prealloc_file_extent_cluster(struct inode *inode, nr++; } if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space(inode, cur_offset, - prealloc_end + 1 - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, prealloc_end + 1 - cur_offset); out: inode_unlock(inode); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fc496a6f842a..e8185c83f667 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2769,12 +2769,10 @@ out: struct recorded_ref { struct list_head list; - char *dir_path; char *name; struct fs_path *full_path; u64 dir; u64 dir_gen; - int dir_path_len; int name_len; }; @@ -2798,12 +2796,6 @@ static int __record_ref(struct list_head *head, u64 dir, ref->name = (char *)kbasename(ref->full_path->start); ref->name_len = ref->full_path->end - ref->name; - ref->dir_path = ref->full_path->start; - if (ref->name == ref->full_path->start) - ref->dir_path_len = 0; - else - ref->dir_path_len = ref->full_path->end - - ref->full_path->start - 1 - ref->name_len; list_add_tail(&ref->list, head); return 0; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 1f157fba8940..c2d5f3580b4c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -447,11 +447,52 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); +static ssize_t quota_override_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + int quota_override; + + quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + return snprintf(buf, PAGE_SIZE, "%d\n", quota_override); +} + +static ssize_t quota_override_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long knob; + int err; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = kstrtoul(buf, 10, &knob); + if (err) + return err; + if (knob > 1) + return -EINVAL; + + if (knob) + set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + else + clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + + return len; +} + +BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), BTRFS_ATTR_PTR(nodesize), BTRFS_ATTR_PTR(sectorsize), BTRFS_ATTR_PTR(clone_alignment), + BTRFS_ATTR_PTR(quota_override), NULL, }; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2168654c90a1..a109db56c4bf 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -93,7 +93,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) btrfs_put_block_group_trimming(cache); btrfs_put_block_group(cache); } - kmem_cache_free(btrfs_transaction_cachep, transaction); + kfree(transaction); } } @@ -228,7 +228,7 @@ loop: */ BUG_ON(type == TRANS_JOIN_NOLOCK); - cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); + cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS); if (!cur_trans) return -ENOMEM; @@ -238,11 +238,11 @@ loop: * someone started a transaction after we unlocked. Make sure * to redo the checks above */ - kmem_cache_free(btrfs_transaction_cachep, cur_trans); + kfree(cur_trans); goto loop; } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); - kmem_cache_free(btrfs_transaction_cachep, cur_trans); + kfree(cur_trans); return -EROFS; } @@ -1374,9 +1374,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = commit_fs_roots(trans, fs_info); if (ret) goto out; - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret < 0) - goto out; ret = btrfs_qgroup_account_extents(trans, fs_info); if (ret < 0) goto out; @@ -2180,13 +2177,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } - /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 017b67daa3bb..e37f95976443 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2417,9 +2417,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path fs_info->fs_devices->total_devices++; fs_info->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += device->total_bytes; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(device->total_bytes, &fs_info->free_chunk_space); if (!blk_queue_nonrot(q)) fs_info->fs_devices->rotating = 1; @@ -2874,9 +2872,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_bytes_used(device, device->bytes_used - dev_extent_len); - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += dev_extent_len; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); } @@ -4409,9 +4405,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_device_set_total_bytes(device, new_size); if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space -= diff; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_sub(diff, &fs_info->free_chunk_space); } mutex_unlock(&fs_info->chunk_mutex); @@ -4535,9 +4529,7 @@ done: btrfs_device_set_total_bytes(device, old_size); if (device->writeable) device->fs_devices->total_rw_bytes += diff; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += diff; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(diff, &fs_info->free_chunk_space); mutex_unlock(&fs_info->chunk_mutex); } return ret; @@ -4882,9 +4874,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); } - spin_lock(&info->free_chunk_lock); - info->free_chunk_space -= (stripe_size * map->num_stripes); - spin_unlock(&info->free_chunk_lock); + atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); free_extent_map(em); check_raid56_incompat_flag(info, type); @@ -6684,10 +6674,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device->in_fs_metadata = 1; if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += device->total_bytes - - device->bytes_used; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(device->total_bytes - device->bytes_used, + &fs_info->free_chunk_space); } ret = 0; return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c7d0fbc915ca..58b97b6f5f02 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -74,6 +74,7 @@ struct btrfs_device { int missing; int can_discard; int is_tgtdev_for_dev_replace; + int last_flush_error; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED seqcount_t data_seqcount; @@ -279,6 +280,7 @@ struct btrfs_io_bio { u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; u8 *csum_allocated; btrfs_io_bio_end_io_t *end_io; + struct bvec_iter iter; struct bio bio; }; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index e37973526153..cd99a3658156 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1410,42 +1410,6 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy, TP_ARGS(wq) ); -DECLARE_EVENT_CLASS(btrfs__qgroup_data_map, - - TP_PROTO(struct inode *inode, u64 free_reserved), - - TP_ARGS(inode, free_reserved), - - TP_STRUCT__entry_btrfs( - __field( u64, rootid ) - __field( unsigned long, ino ) - __field( u64, free_reserved ) - ), - - TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), - __entry->rootid = BTRFS_I(inode)->root->objectid; - __entry->ino = inode->i_ino; - __entry->free_reserved = free_reserved; - ), - - TP_printk_btrfs("rootid=%llu ino=%lu free_reserved=%llu", - __entry->rootid, __entry->ino, __entry->free_reserved) -); - -DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_init_data_rsv_map, - - TP_PROTO(struct inode *inode, u64 free_reserved), - - TP_ARGS(inode, free_reserved) -); - -DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_free_data_rsv_map, - - TP_PROTO(struct inode *inode, u64 free_reserved), - - TP_ARGS(inode, free_reserved) -); - #define BTRFS_QGROUP_OPERATIONS \ { QGROUP_RESERVE, "reserve" }, \ { QGROUP_RELEASE, "release" }, \ |