summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-07-17 20:53:57 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-07-17 20:53:57 -0700
commit3f8476fe892c58c0583f98f17e98fd67c3fec466 (patch)
tree0787ca7ae13ef1937498c88de0ed0d09613087b5
parenteb254374a30cc53f976f2302f2198813a3b687ea (diff)
parent665022d72f9b5762f21b5ea02fa0503d04802849 (diff)
Merge tag 'dm-4.2-fixes-2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper fixes from Mike Snitzer: - revert a request-based DM core change that caused IO latency to increase and adversely impact both throughput and system load - fix for a use after free bug in DM core's device cleanup - a couple DM btree removal fixes (used by dm-thinp) - a DM thinp fix for order-5 allocation failure - a DM thinp fix to not degrade to read-only metadata mode when in out-of-data-space mode for longer than the 'no_space_timeout' - fix a long-standing oversight in both dm-thinp and dm-cache by now exporting 'needs_check' in status if it was set in metadata - fix an embarrassing dm-cache busy-loop that caused worker threads to eat cpu even if no IO was actively being issued to the cache device * tag 'dm-4.2-fixes-2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: dm cache: avoid calls to prealloc_free_structs() if possible dm cache: avoid preallocation if no work in writeback_some_dirty_blocks() dm cache: do not wake_worker() in free_migration() dm cache: display 'needs_check' in status if it is set dm thin: display 'needs_check' in status if it is set dm thin: stay in out-of-data-space mode once no_space_timeout expires dm: fix use after free crash due to incorrect cleanup sequence Revert "dm: only run the queue on completion if congested or no requests pending" dm btree: silence lockdep lock inversion in dm_btree_del() dm thin: allocate the cell_sort_array dynamically dm btree remove: fix bug in redistribute3
-rw-r--r--Documentation/device-mapper/cache.txt6
-rw-r--r--Documentation/device-mapper/thin-provisioning.txt9
-rw-r--r--drivers/md/dm-cache-target.c38
-rw-r--r--drivers/md/dm-thin.c44
-rw-r--r--drivers/md/dm.c12
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c6
-rw-r--r--drivers/md/persistent-data/dm-btree.c2
7 files changed, 82 insertions, 35 deletions
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index 82960cffbad3..785eab87aa71 100644
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -258,6 +258,12 @@ cache metadata mode : ro if read-only, rw if read-write
no further I/O will be permitted and the status will just
contain the string 'Fail'. The userspace recovery tools
should then be used.
+needs_check : 'needs_check' if set, '-' if not set
+ A metadata operation has failed, resulting in the needs_check
+ flag being set in the metadata's superblock. The metadata
+ device must be deactivated and checked/repaired before the
+ cache can be made fully operational again. '-' indicates
+ needs_check is not set.
Messages
--------
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 4f67578b2954..1699a55b7b70 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -296,7 +296,7 @@ ii) Status
underlying device. When this is enabled when loading the table,
it can get disabled if the underlying device doesn't support it.
- ro|rw
+ ro|rw|out_of_data_space
If the pool encounters certain types of device failures it will
drop into a read-only metadata mode in which no changes to
the pool metadata (like allocating new blocks) are permitted.
@@ -314,6 +314,13 @@ ii) Status
module parameter can be used to change this timeout -- it
defaults to 60 seconds but may be disabled using a value of 0.
+ needs_check
+ A metadata operation has failed, resulting in the needs_check
+ flag being set in the metadata's superblock. The metadata
+ device must be deactivated and checked/repaired before the
+ thin-pool can be made fully operational again. '-' indicates
+ needs_check is not set.
+
iii) Messages
create_thin <dev id>
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1b4e1756b169..b680da5d7b93 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -424,7 +424,6 @@ static void free_migration(struct dm_cache_migration *mg)
wake_up(&cache->migration_wait);
mempool_free(mg, cache->migration_pool);
- wake_worker(cache);
}
static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
@@ -1947,6 +1946,7 @@ static int commit_if_needed(struct cache *cache)
static void process_deferred_bios(struct cache *cache)
{
+ bool prealloc_used = false;
unsigned long flags;
struct bio_list bios;
struct bio *bio;
@@ -1981,13 +1981,16 @@ static void process_deferred_bios(struct cache *cache)
process_discard_bio(cache, &structs, bio);
else
process_bio(cache, &structs, bio);
+ prealloc_used = true;
}
- prealloc_free_structs(cache, &structs);
+ if (prealloc_used)
+ prealloc_free_structs(cache, &structs);
}
static void process_deferred_cells(struct cache *cache)
{
+ bool prealloc_used = false;
unsigned long flags;
struct dm_bio_prison_cell *cell, *tmp;
struct list_head cells;
@@ -2015,9 +2018,11 @@ static void process_deferred_cells(struct cache *cache)
}
process_cell(cache, &structs, cell);
+ prealloc_used = true;
}
- prealloc_free_structs(cache, &structs);
+ if (prealloc_used)
+ prealloc_free_structs(cache, &structs);
}
static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
@@ -2062,7 +2067,7 @@ static void process_deferred_writethrough_bios(struct cache *cache)
static void writeback_some_dirty_blocks(struct cache *cache)
{
- int r = 0;
+ bool prealloc_used = false;
dm_oblock_t oblock;
dm_cblock_t cblock;
struct prealloc structs;
@@ -2072,23 +2077,21 @@ static void writeback_some_dirty_blocks(struct cache *cache)
memset(&structs, 0, sizeof(structs));
while (spare_migration_bandwidth(cache)) {
- if (prealloc_data_structs(cache, &structs))
- break;
+ if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
+ break; /* no work to do */
- r = policy_writeback_work(cache->policy, &oblock, &cblock, busy);
- if (r)
- break;
-
- r = get_cell(cache, oblock, &structs, &old_ocell);
- if (r) {
+ if (prealloc_data_structs(cache, &structs) ||
+ get_cell(cache, oblock, &structs, &old_ocell)) {
policy_set_dirty(cache->policy, oblock);
break;
}
writeback(cache, &structs, oblock, cblock, old_ocell);
+ prealloc_used = true;
}
- prealloc_free_structs(cache, &structs);
+ if (prealloc_used)
+ prealloc_free_structs(cache, &structs);
}
/*----------------------------------------------------------------
@@ -3496,7 +3499,7 @@ static void cache_resume(struct dm_target *ti)
* <#demotions> <#promotions> <#dirty>
* <#features> <features>*
* <#core args> <core args>
- * <policy name> <#policy args> <policy args>* <cache metadata mode>
+ * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
*/
static void cache_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
@@ -3582,6 +3585,11 @@ static void cache_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("rw ");
+ if (dm_cache_metadata_needs_check(cache->cmd))
+ DMEMIT("needs_check ");
+ else
+ DMEMIT("- ");
+
break;
case STATUSTYPE_TABLE:
@@ -3820,7 +3828,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 7, 0},
+ .version = {1, 8, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c33f61a4cc28..1c50c580215c 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -18,6 +18,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <linux/sort.h>
#include <linux/rbtree.h>
@@ -268,7 +269,7 @@ struct pool {
process_mapping_fn process_prepared_mapping;
process_mapping_fn process_prepared_discard;
- struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE];
+ struct dm_bio_prison_cell **cell_sort_array;
};
static enum pool_mode get_pool_mode(struct pool *pool);
@@ -2281,18 +2282,23 @@ static void do_waker(struct work_struct *ws)
queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}
+static void notify_of_pool_mode_change_to_oods(struct pool *pool);
+
/*
* We're holding onto IO to allow userland time to react. After the
* timeout either the pool will have been resized (and thus back in
- * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
*/
static void do_no_space_timeout(struct work_struct *ws)
{
struct pool *pool = container_of(to_delayed_work(ws), struct pool,
no_space_timeout);
- if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
- set_pool_mode(pool, PM_READ_ONLY);
+ if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
+ pool->pf.error_if_no_space = true;
+ notify_of_pool_mode_change_to_oods(pool);
+ error_retry_list(pool);
+ }
}
/*----------------------------------------------------------------*/
@@ -2370,6 +2376,14 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
dm_device_name(pool->pool_md), new_mode);
}
+static void notify_of_pool_mode_change_to_oods(struct pool *pool)
+{
+ if (!pool->pf.error_if_no_space)
+ notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)");
+ else
+ notify_of_pool_mode_change(pool, "out-of-data-space (error IO)");
+}
+
static bool passdown_enabled(struct pool_c *pt)
{
return pt->adjusted_pf.discard_passdown;
@@ -2454,7 +2468,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
* frequently seeing this mode.
*/
if (old_mode != new_mode)
- notify_of_pool_mode_change(pool, "out-of-data-space");
+ notify_of_pool_mode_change_to_oods(pool);
pool->process_bio = process_bio_read_only;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell_read_only;
@@ -2777,6 +2791,7 @@ static void __pool_destroy(struct pool *pool)
{
__pool_table_remove(pool);
+ vfree(pool->cell_sort_array);
if (dm_pool_metadata_close(pool->pmd) < 0)
DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
@@ -2889,6 +2904,13 @@ static struct pool *pool_create(struct mapped_device *pool_md,
goto bad_mapping_pool;
}
+ pool->cell_sort_array = vmalloc(sizeof(*pool->cell_sort_array) * CELL_SORT_ARRAY_SIZE);
+ if (!pool->cell_sort_array) {
+ *error = "Error allocating cell sort array";
+ err_p = ERR_PTR(-ENOMEM);
+ goto bad_sort_array;
+ }
+
pool->ref_count = 1;
pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
@@ -2897,6 +2919,8 @@ static struct pool *pool_create(struct mapped_device *pool_md,
return pool;
+bad_sort_array:
+ mempool_destroy(pool->mapping_pool);
bad_mapping_pool:
dm_deferred_set_destroy(pool->all_io_ds);
bad_all_io_ds:
@@ -3714,6 +3738,7 @@ static void emit_flags(struct pool_features *pf, char *result,
* Status line is:
* <transaction id> <used metadata sectors>/<total metadata sectors>
* <used data sectors>/<total data sectors> <held metadata root>
+ * <pool mode> <discard config> <no space config> <needs_check>
*/
static void pool_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
@@ -3815,6 +3840,11 @@ static void pool_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("queue_if_no_space ");
+ if (dm_pool_metadata_needs_check(pool->pmd))
+ DMEMIT("needs_check ");
+ else
+ DMEMIT("- ");
+
break;
case STATUSTYPE_TABLE:
@@ -3918,7 +3948,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 15, 0},
+ .version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -4305,7 +4335,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 15, 0},
+ .version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f331d888e7f5..ab37ae114e94 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1067,13 +1067,10 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
*/
static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
- int nr_requests_pending;
-
atomic_dec(&md->pending[rw]);
/* nudge anyone waiting on suspend queue */
- nr_requests_pending = md_in_flight(md);
- if (!nr_requests_pending)
+ if (!md_in_flight(md))
wake_up(&md->wait);
/*
@@ -1085,8 +1082,7 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
if (run_queue) {
if (md->queue->mq_ops)
blk_mq_run_hw_queues(md->queue, true);
- else if (!nr_requests_pending ||
- (nr_requests_pending >= md->queue->nr_congestion_on))
+ else
blk_run_queue_async(md->queue);
}
@@ -2281,8 +2277,6 @@ static void dm_init_old_md_queue(struct mapped_device *md)
static void cleanup_mapped_device(struct mapped_device *md)
{
- cleanup_srcu_struct(&md->io_barrier);
-
if (md->wq)
destroy_workqueue(md->wq);
if (md->kworker_task)
@@ -2294,6 +2288,8 @@ static void cleanup_mapped_device(struct mapped_device *md)
if (md->bs)
bioset_free(md->bs);
+ cleanup_srcu_struct(&md->io_barrier);
+
if (md->disk) {
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index e04cfd2d60ef..9836c0ae897c 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -309,8 +309,8 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
if (s < 0 && nr_center < -s) {
/* not enough in central node */
- shift(left, center, nr_center);
- s = nr_center - target;
+ shift(left, center, -nr_center);
+ s += nr_center;
shift(left, right, s);
nr_right += s;
} else
@@ -323,7 +323,7 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
if (s > 0 && nr_center < s) {
/* not enough in central node */
shift(center, right, nr_center);
- s = target - nr_center;
+ s -= nr_center;
shift(left, right, s);
nr_left -= s;
} else
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 200ac12a1d40..fdd3793e22f9 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -255,7 +255,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
int r;
struct del_stack *s;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kmalloc(sizeof(*s), GFP_NOIO);
if (!s)
return -ENOMEM;
s->info = info;