From 9284bcf4e335e5f18a8bc7b26461c33ab60d0689 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Oct 2010 08:10:18 -0600 Subject: block: check for proper length of iov entries in blk_rq_map_user_iov() Ensure that we pass down properly validated iov segments before calling into the mapping or copy functions. Reported-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-map.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index d4a586d8691..5d5dbe47c22 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -205,6 +205,8 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, unaligned = 1; break; } + if (!iov[i].iov_len) + return -EINVAL; } if (unaligned || (q->dma_pad_mask & len) || map_data) -- cgit v1.2.3 From 9f864c80913467312c7b8690e41fb5ebd1b50e92 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Oct 2010 11:31:42 -0600 Subject: block: take care not to overflow when calculating total iov length Reported-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index a8b5a10eb5b..4f4230b79bb 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -321,33 +321,47 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->iovec_count) { const int size = sizeof(struct sg_iovec) * hdr->iovec_count; size_t iov_data_len; - struct sg_iovec *iov; + struct sg_iovec *sg_iov; + struct iovec *iov; + int i; - iov = kmalloc(size, GFP_KERNEL); - if (!iov) { + sg_iov = kmalloc(size, GFP_KERNEL); + if (!sg_iov) { ret = -ENOMEM; goto out; } - if (copy_from_user(iov, hdr->dxferp, size)) { - kfree(iov); + if (copy_from_user(sg_iov, hdr->dxferp, size)) { + kfree(sg_iov); ret = -EFAULT; goto out; } + /* + * Sum up the vecs, making sure they don't overflow + */ + iov = (struct iovec *) sg_iov; + iov_data_len = 0; + for (i = 0; i < hdr->iovec_count; i++) { + if (iov_data_len + iov[i].iov_len < iov_data_len) { + kfree(sg_iov); + ret = -EINVAL; + goto out; + } + iov_data_len += iov[i].iov_len; + } + /* SG_IO howto says that the shorter of the two wins */ - iov_data_len = iov_length((struct iovec *)iov, - hdr->iovec_count); if (hdr->dxfer_len < iov_data_len) { - hdr->iovec_count = iov_shorten((struct iovec *)iov, + hdr->iovec_count = iov_shorten(iov, hdr->iovec_count, hdr->dxfer_len); iov_data_len = hdr->dxfer_len; } - ret = blk_rq_map_user_iov(q, rq, NULL, iov, hdr->iovec_count, + ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count, iov_data_len, GFP_KERNEL); - kfree(iov); + kfree(sg_iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, GFP_KERNEL); -- cgit v1.2.3 From 77304d2abac6101f7249754ffdd4421258877ab0 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 8 Nov 2010 14:39:12 +0100 Subject: block: read i_size with i_size_read() Convert direct reads of an inode's i_size to using i_size_read(). i_size_{read,write} use a seqcount to protect reads from accessing incomple writes. Concurrent i_size_write()s require mutual exclussion to protect the seqcount that is used by i_size_{read,write}. But i_size_read() callers do not need to use additional locking. Signed-off-by: Mike Snitzer Acked-by: NeilBrown Acked-by: Lars Ellenberg Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- block/compat_ioctl.c | 4 ++-- block/ioctl.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f0834e2f572..17fcb83670c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1351,7 +1351,7 @@ static void handle_bad_sector(struct bio *bio) bdevname(bio->bi_bdev, b), bio->bi_rw, (unsigned long long)bio->bi_sector + bio_sectors(bio), - (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); + (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); set_bit(BIO_EOF, &bio->bi_flags); } @@ -1404,7 +1404,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) return 0; /* Test device or partition size, when known. */ - maxsector = bio->bi_bdev->bd_inode->i_size >> 9; + maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; if (maxsector) { sector_t sector = bio->bi_sector; diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 119f07b74dc..58c6ee5b010 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -744,13 +744,13 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKGETSIZE: - size = bdev->bd_inode->i_size; + size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) return -EFBIG; return compat_put_ulong(arg, size >> 9); case BLKGETSIZE64_32: - return compat_put_u64(arg, bdev->bd_inode->i_size); + return compat_put_u64(arg, i_size_read(bdev->bd_inode)); case BLKTRACESETUP32: case BLKTRACESTART: /* compatible */ diff --git a/block/ioctl.c b/block/ioctl.c index d724ceb1d46..38aa194f63e 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -125,7 +125,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, start >>= 9; len >>= 9; - if (start + len > (bdev->bd_inode->i_size >> 9)) + if (start + len > (i_size_read(bdev->bd_inode) >> 9)) return -EINVAL; if (secure) flags |= BLKDEV_DISCARD_SECURE; @@ -307,12 +307,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, ret = blkdev_reread_part(bdev); break; case BLKGETSIZE: - size = bdev->bd_inode->i_size; + size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) return -EFBIG; return put_ulong(arg, size >> 9); case BLKGETSIZE64: - return put_u64(arg, bdev->bd_inode->i_size); + return put_u64(arg, i_size_read(bdev->bd_inode)); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACESETUP: -- cgit v1.2.3 From a014741c0adfb8fb79952939ca087cf03d272bb9 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Mon, 8 Nov 2010 14:42:40 +0100 Subject: block: ioctl: fix information leak to userland Structure hd_geometry is copied to userland with 4 padding bytes between cylinders and start fields uninitialized on 64-bit platforms. It leads to leaking of contents of kernel stack memory. Currently there is no memset() in real implementations of getgeo() in drivers/block/, so it makes sense to have memset() in blkdev_ioctl(). Signed-off-by: Vasiliy Kulikov Signed-off-by: Jens Axboe --- block/ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 38aa194f63e..3d866d0037f 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -242,6 +242,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, * We need to set the startsect first, the driver may * want to override it. */ + memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); ret = disk->fops->getgeo(bdev, &geo); if (ret) -- cgit v1.2.3 From 02e031cbc843b010e72fcc05c76113c688b2860f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Nov 2010 14:54:09 +0100 Subject: block: remove REQ_HARDBARRIER REQ_HARDBARRIER is dead now, so remove the leftovers. What's left at this point is: - various checks inside the block layer. - sanity checks in bio based drivers. - now unused bio_empty_barrier helper. - Xen blockfront use of BLKIF_OP_WRITE_BARRIER - it's dead for a while, but Xen really needs to sort out it's barrier situaton. - setting of ordered tags in uas - dead code copied from old scsi drivers. - scsi different retry for barriers - it's dead and should have been removed when flushes were converted to FS requests. - blktrace handling of barriers - removed. Someone who knows blktrace better should add support for REQ_FLUSH and REQ_FUA, though. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ------- block/elevator.c | 4 ++-- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 17fcb83670c..4ce953f1b39 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1194,13 +1194,6 @@ static int __make_request(struct request_queue *q, struct bio *bio) int where = ELEVATOR_INSERT_SORT; int rw_flags; - /* REQ_HARDBARRIER is no more */ - if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER, - "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) { - bio_endio(bio, -EOPNOTSUPP); - return 0; - } - /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even diff --git a/block/elevator.c b/block/elevator.c index 282e8308f7e..2569512830d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -429,7 +429,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) q->nr_sorted--; boundary = q->end_sector; - stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED; + stop_flags = REQ_SOFTBARRIER | REQ_STARTED; list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); @@ -691,7 +691,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) void __elv_add_request(struct request_queue *q, struct request *rq, int where, int plug) { - if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { + if (rq->cmd_flags & REQ_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS || (rq->cmd_flags & REQ_DISCARD)) { -- cgit v1.2.3 From cedb4a7d9f6aedb0dce94d6285b69dcb3c10fa05 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Nov 2010 13:37:54 +0100 Subject: block: remove unused copy_io_context() Reported-by: Oleg Nesterov Signed-off-by: Jens Axboe --- block/blk-ioc.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d22c4c55c40..3c7a339fe38 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -153,20 +153,6 @@ struct io_context *get_io_context(gfp_t gfp_flags, int node) } EXPORT_SYMBOL(get_io_context); -void copy_io_context(struct io_context **pdst, struct io_context **psrc) -{ - struct io_context *src = *psrc; - struct io_context *dst = *pdst; - - if (src) { - BUG_ON(atomic_long_read(&src->refcount) == 0); - atomic_long_inc(&src->refcount); - put_io_context(dst); - *pdst = src; - } -} -EXPORT_SYMBOL(copy_io_context); - static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", -- cgit v1.2.3 From c2f6805d470af369a7337801deeecea800dbfe1c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 15 Nov 2010 19:32:42 +0100 Subject: blk-throttle: Fix calculation of max number of WRITES to be dispatched o Currently we try to dispatch more READS and less WRITES (75%, 25%) in one dispatch round. ummy pointed out that there is a bug in max_nr_writes calculation. This patch fixes it. Reported-by: ummy y Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 56ad4531b41..004be80fd89 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -645,7 +645,7 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, { unsigned int nr_reads = 0, nr_writes = 0; unsigned int max_nr_reads = throtl_grp_quantum*3/4; - unsigned int max_nr_writes = throtl_grp_quantum - nr_reads; + unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; struct bio *bio; /* Try to dispatch 75% READS and 25% WRITES */ -- cgit v1.2.3 From 451a3c24b0135bce54542009b5fde43846c7cf67 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Nov 2010 16:26:55 +0100 Subject: BKL: remove extraneous #include The big kernel lock has been removed from all these files at some point, leaving only the #include. Remove this too as a cleanup. Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- block/compat_ioctl.c | 1 - block/ioctl.c | 1 - 2 files changed, 2 deletions(-) (limited to 'block') diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 58c6ee5b010..cc3eb78e333 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include diff --git a/block/ioctl.c b/block/ioctl.c index 3d866d0037f..a9a302eba01 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 5478755616ae2ef1ce144dded589b62b2a50d575 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Mon, 29 Nov 2010 10:03:55 +0100 Subject: block: check for proper length of iov entries earlier in blk_rq_map_user_iov() commit 9284bcf checks for proper length of iov entries in blk_rq_map_user_iov(). But if the map is unaligned, kernel will break out the loop without checking for the proper length. So we need to check the proper length before the unalign check. Signed-off-by: Xiaotian Feng Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-map.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 5d5dbe47c22..e663ac2d8e6 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -201,12 +201,13 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, for (i = 0; i < iov_count; i++) { unsigned long uaddr = (unsigned long)iov[i].iov_base; + if (!iov[i].iov_len) + return -EINVAL; + if (uaddr & queue_dma_alignment(q)) { unaligned = 1; break; } - if (!iov[i].iov_len) - return -EINVAL; } if (unaligned || (q->dma_pad_mask & len) || map_data) -- cgit v1.2.3 From d1ae8ffdfaa16b2ab2e9346e81cf0ab6eaaae347 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 1 Dec 2010 19:34:46 +0100 Subject: blk-throttle: Trim/adjust slice_end once a bio has been dispatched o During some testing I did following and noticed throttling stops working. - Put a very low limit on a cgroup, say 1 byte per second. - Start some reads, this will set slice_end to a very high value. - Change the limit to higher value say 1MB/s - Now IO unthrottles and finishes as expected. - Try to do the read again but IO is not limited to 1MB/s as expected. o What is happening. - Initially low value of limit sets slice_end to a very high value. - During updation of limit, slice_end is not being truncated. - Very high value of slice_end leads to keeping the existing slice valid for a very long time and new slice does not start. - tg_may_dispatch() is called in blk_throtle_bio(), and trim_slice() is not called in this path. So slice_start is some old value and practically we are able to do huge amount of IO. o There are many ways it can be fixed. I have fixed it by trying to adjust/cleanup slice_end in trim_slice(). Generally we extend slices if bio is big and can't be dispatched in one slice. After dispatch of bio, readjust the slice_end to make sure we don't end up with huge values. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 004be80fd89..2d134b7c40a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -355,6 +355,12 @@ throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) tg->slice_end[rw], jiffies); } +static inline void throtl_set_slice_end(struct throtl_data *td, + struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +{ + tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); +} + static inline void throtl_extend_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { @@ -391,6 +397,16 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) if (throtl_slice_used(td, tg, rw)) return; + /* + * A bio has been dispatched. Also adjust slice_end. It might happen + * that initially cgroup limit was very low resulting in high + * slice_end, but later limit was bumped up and bio was dispached + * sooner, then we need to reduce slice_end. A high bogus slice_end + * is bad because it does not allow new slice to start. + */ + + throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); + time_elapsed = jiffies - tg->slice_start[rw]; nr_slices = time_elapsed / throtl_slice; -- cgit v1.2.3 From 04a6b516cdc6efc2500b52a540cf65be8c5aaf9e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 1 Dec 2010 19:34:52 +0100 Subject: blk-throttle: Correct the placement of smp_rmb() o I was discussing what are the variable being updated without spin lock and why do we need barriers and Oleg pointed out that location of smp_rmb() should be between read of td->limits_changed and tg->limits_changed. This patch fixes it. o Following is one possible sequence of events. Say cpu0 is executing throtl_update_blkio_group_read_bps() and cpu1 is executing throtl_process_limit_change(). cpu0 cpu1 tg->limits_changed = true; smp_mb__before_atomic_inc(); atomic_inc(&td->limits_changed); if (!atomic_read(&td->limits_changed)) return; if (tg->limits_changed) do_something; If cpu0 has updated tg->limits_changed and td->limits_changed, we want to make sure that if update to td->limits_changed is visible on cpu1, then update to tg->limits_changed should also be visible. Oleg pointed out to ensure that we need to insert an smp_rmb() between td->limits_changed read and tg->limits_changed read. o I had erroneously put smp_rmb() before atomic_read(&td->limits_changed). This patch fixes it. Reported-by: Oleg Nesterov Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 2d134b7c40a..381b09bb562 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -725,26 +725,21 @@ static void throtl_process_limit_change(struct throtl_data *td) struct throtl_grp *tg; struct hlist_node *pos, *n; - /* - * Make sure atomic_inc() effects from - * throtl_update_blkio_group_read_bps(), group of functions are - * visible. - * Is this required or smp_mb__after_atomic_inc() was suffcient - * after the atomic_inc(). - */ - smp_rmb(); if (!atomic_read(&td->limits_changed)) return; throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); - hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { - /* - * Do I need an smp_rmb() here to make sure tg->limits_changed - * update is visible. I am relying on smp_rmb() at the - * beginning of function and not putting a new one here. - */ + /* + * Make sure updates from throtl_update_blkio_group_read_bps() group + * of functions to tg->limits_changed are visible. We do not + * want update td->limits_changed to be visible but update to + * tg->limits_changed not being visible yet on this cpu. Hence + * the read barrier. + */ + smp_rmb(); + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { if (throtl_tg_on_rr(tg) && tg->limits_changed) { throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" " riops=%u wiops=%u", tg->bps[READ], -- cgit v1.2.3 From c7a841f3aca469187db76842676951a672fd27d1 Mon Sep 17 00:00:00 2001 From: James Smart Date: Sun, 14 Nov 2010 11:12:04 -0500 Subject: [SCSI] bsg: correct fault if queue object removed while dev_t open This patch corrects an issue in bsg that results in a general protection fault if an LLD is removed while an application is using an open file handle to a bsg device, and the application issues an ioctl. The fault occurs because the class_dev is NULL, having been cleared in bsg_unregister_queue() when the driver was removed. With this patch, a check is made for the class_dev, and the application will receive ENXIO if the related object is gone. Signed-off-by: Carl Lajeunesse Signed-off-by: James Smart Signed-off-by: James Bottomley --- block/bsg.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index f20d6a789d4..0c8b64a1648 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -250,6 +250,14 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, int ret, rw; unsigned int dxfer_len; void *dxferp = NULL; + struct bsg_class_device *bcd = &q->bsg_dev; + + /* if the LLD has been removed then the bsg_unregister_queue will + * eventually be called and the class_dev was freed, so we can no + * longer use this request_queue. Return no such address. + */ + if (!bcd->class_dev) + return ERR_PTR(-ENXIO); dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, -- cgit v1.2.3 From e692cb668fdd5a712c6ed2a2d6f2a36ee83997b4 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 1 Dec 2010 19:41:49 +0100 Subject: block: Deprecate QUEUE_FLAG_CLUSTER and use queue_limits instead When stacking devices, a request_queue is not always available. This forced us to have a no_cluster flag in the queue_limits that could be used as a carrier until the request_queue had been set up for a metadevice. There were several problems with that approach. First of all it was up to the stacking device to remember to set queue flag after stacking had completed. Also, the queue flag and the queue limits had to be kept in sync at all times. We got that wrong, which could lead to us issuing commands that went beyond the max scatterlist limit set by the driver. The proper fix is to avoid having two flags for tracking the same thing. We deprecate QUEUE_FLAG_CLUSTER and use the queue limit directly in the block layer merging functions. The queue_limit 'no_cluster' is turned into 'cluster' to avoid double negatives and to ease stacking. Clustering defaults to being enabled as before. The queue flag logic is removed from the stacking function, and explicitly setting the cluster flag is no longer necessary in DM and MD. Reported-by: Ed Lin Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-merge.c | 6 +++--- block/blk-settings.c | 25 ++----------------------- block/blk-sysfs.c | 2 +- 3 files changed, 6 insertions(+), 27 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 77b7c26df6b..74bc4a768f3 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -21,7 +21,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, return 0; fbio = bio; - cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); + cluster = blk_queue_cluster(q); seg_size = 0; nr_phys_segs = 0; for_each_bio(bio) { @@ -87,7 +87,7 @@ EXPORT_SYMBOL(blk_recount_segments); static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) { - if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + if (!blk_queue_cluster(q)) return 0; if (bio->bi_seg_back_size + nxt->bi_seg_front_size > @@ -123,7 +123,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, int nsegs, cluster; nsegs = 0; - cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); + cluster = blk_queue_cluster(q); /* * for each bio in rq diff --git a/block/blk-settings.c b/block/blk-settings.c index 701859fb964..e55f5fc4ca2 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -126,7 +126,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->alignment_offset = 0; lim->io_opt = 0; lim->misaligned = 0; - lim->no_cluster = 0; + lim->cluster = 1; } EXPORT_SYMBOL(blk_set_default_limits); @@ -464,15 +464,6 @@ EXPORT_SYMBOL(blk_queue_io_opt); void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) { blk_stack_limits(&t->limits, &b->limits, 0); - - if (!t->queue_lock) - WARN_ON_ONCE(1); - else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { - unsigned long flags; - spin_lock_irqsave(t->queue_lock, flags); - queue_flag_clear(QUEUE_FLAG_CLUSTER, t); - spin_unlock_irqrestore(t->queue_lock, flags); - } } EXPORT_SYMBOL(blk_queue_stack_limits); @@ -545,7 +536,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->io_min = max(t->io_min, b->io_min); t->io_opt = lcm(t->io_opt, b->io_opt); - t->no_cluster |= b->no_cluster; + t->cluster &= b->cluster; t->discard_zeroes_data &= b->discard_zeroes_data; /* Physical block size a multiple of the logical block size? */ @@ -641,7 +632,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset) { struct request_queue *t = disk->queue; - struct request_queue *b = bdev_get_queue(bdev); if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; @@ -652,17 +642,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", top, bottom); } - - if (!t->queue_lock) - WARN_ON_ONCE(1); - else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { - unsigned long flags; - - spin_lock_irqsave(t->queue_lock, flags); - if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) - queue_flag_clear(QUEUE_FLAG_CLUSTER, t); - spin_unlock_irqrestore(t->queue_lock, flags); - } } EXPORT_SYMBOL(disk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 013457f47fd..41fb69150b4 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -119,7 +119,7 @@ static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char * static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { - if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + if (blk_queue_cluster(q)) return queue_var_show(queue_max_segment_size(q), (page)); return queue_var_show(PAGE_CACHE_SIZE, (page)); -- cgit v1.2.3 From 72d4cd9f38b5ed96b75df4c622be25e1c2648dd3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 17 Dec 2010 08:34:20 +0100 Subject: block: max hardware sectors limit wrapper Implement blk_limits_max_hw_sectors() and make blk_queue_max_hw_sectors() a wrapper around it. DM needs this to avoid setting queue_limits' max_hw_sectors and max_sectors directly. dm_set_device_limits() now leverages blk_limits_max_hw_sectors() logic to establish the appropriate max_hw_sectors minimum (PAGE_SIZE). Fixes issue where DM was incorrectly setting max_sectors rather than max_hw_sectors (which caused dm_merge_bvec()'s max_hw_sectors check to be ineffective). Signed-off-by: Mike Snitzer Cc: stable@kernel.org Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index e55f5fc4ca2..36c8c1f2af1 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -229,8 +229,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_queue_max_hw_sectors - set max sectors for a request for this queue - * @q: the request queue for the device + * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request + * @limits: the queue limits * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: @@ -244,7 +244,7 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); * per-device basis in /sys/block//queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. **/ -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) { if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); @@ -252,9 +252,23 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto __func__, max_hw_sectors); } - q->limits.max_hw_sectors = max_hw_sectors; - q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); + limits->max_hw_sectors = max_hw_sectors; + limits->max_sectors = min_t(unsigned int, max_hw_sectors, + BLK_DEF_MAX_SECTORS); +} +EXPORT_SYMBOL(blk_limits_max_hw_sectors); + +/** + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device + * @max_hw_sectors: max hardware sectors in the usual 512b unit + * + * Description: + * See description for blk_limits_max_hw_sectors(). + **/ +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +{ + blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); -- cgit v1.2.3