From 68a9f5e7007c1afa2cf6830b690a90d0187c0684 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:53:44 +1000 Subject: xfs: implement iomap based buffered write path Convert XFS to use the new iomap based multipage write path. This involves implementing the ->iomap_begin and ->iomap_end methods, and switching the buffered file write, page_mkwrite and xfs_iozero paths to the new iomap helpers. With this change __xfs_get_blocks will never be used for buffered writes, and the code handling them can be removed. Based on earlier code from Dave Chinner. Signed-off-by: Christoph Hellwig Reviewed-by: Bob Peterson Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 212 ------------------------------------------------------ 1 file changed, 212 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4c463b99fe57..2ac9f7e5f504 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1427,216 +1427,6 @@ xfs_vm_direct_IO( xfs_get_blocks_direct, endio, NULL, flags); } -/* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have made it to disk yet - * as the page is still locked at this point. - */ -STATIC void -xfs_vm_kill_delalloc_range( - struct inode *inode, - loff_t start, - loff_t end) -{ - struct xfs_inode *ip = XFS_I(inode); - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error; - - start_fsb = XFS_B_TO_FSB(ip->i_mount, start); - end_fsb = XFS_B_TO_FSB(ip->i_mount, end); - if (end_fsb <= start_fsb) - return; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "xfs_vm_write_failed: unable to clean up ino %lld", - ip->i_ino); - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); -} - -STATIC void -xfs_vm_write_failed( - struct inode *inode, - struct page *page, - loff_t pos, - unsigned len) -{ - loff_t block_offset; - loff_t block_start; - loff_t block_end; - loff_t from = pos & (PAGE_SIZE - 1); - loff_t to = from + len; - struct buffer_head *bh, *head; - struct xfs_mount *mp = XFS_I(inode)->i_mount; - - /* - * The request pos offset might be 32 or 64 bit, this is all fine - * on 64-bit platform. However, for 64-bit pos request on 32-bit - * platform, the high 32-bit will be masked off if we evaluate the - * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is - * 0xfffff000 as an unsigned long, hence the result is incorrect - * which could cause the following ASSERT failed in most cases. - * In order to avoid this, we can evaluate the block_offset of the - * start of the page by using shifts rather than masks the mismatch - * problem. - */ - block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT; - - ASSERT(block_offset + from == pos); - - head = page_buffers(page); - block_start = 0; - for (bh = head; bh != head || !block_start; - bh = bh->b_this_page, block_start = block_end, - block_offset += bh->b_size) { - block_end = block_start + bh->b_size; - - /* skip buffers before the write */ - if (block_end <= from) - continue; - - /* if the buffer is after the write, we're done */ - if (block_start >= to) - break; - - /* - * Process delalloc and unwritten buffers beyond EOF. We can - * encounter unwritten buffers in the event that a file has - * post-EOF unwritten extents and an extending write happens to - * fail (e.g., an unaligned write that also involves a delalloc - * to the same page). - */ - if (!buffer_delay(bh) && !buffer_unwritten(bh)) - continue; - - if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) && - block_offset < i_size_read(inode)) - continue; - - if (buffer_delay(bh)) - xfs_vm_kill_delalloc_range(inode, block_offset, - block_offset + bh->b_size); - - /* - * This buffer does not contain data anymore. make sure anyone - * who finds it knows that for certain. - */ - clear_buffer_delay(bh); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_dirty(bh); - clear_buffer_unwritten(bh); - } - -} - -/* - * This used to call block_write_begin(), but it unlocks and releases the page - * on error, and we need that page to be able to punch stale delalloc blocks out - * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at - * the appropriate point. - */ -STATIC int -xfs_vm_write_begin( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata) -{ - pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; - int status; - struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; - - ASSERT(len <= PAGE_SIZE); - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - - status = __block_write_begin(page, pos, len, xfs_get_blocks); - if (xfs_mp_fail_writes(mp)) - status = -EIO; - if (unlikely(status)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - - xfs_vm_write_failed(inode, page, pos, len); - unlock_page(page); - - /* - * If the write is beyond EOF, we only want to kill blocks - * allocated in this write, not blocks that were previously - * written successfully. - */ - if (xfs_mp_fail_writes(mp)) - isize = 0; - if (pos + len > isize) { - ssize_t start = max_t(ssize_t, pos, isize); - - truncate_pagecache_range(inode, start, pos + len); - } - - put_page(page); - page = NULL; - } - - *pagep = page; - return status; -} - -/* - * On failure, we only need to kill delalloc blocks beyond EOF in the range of - * this specific write because they will never be written. Previous writes - * beyond EOF where block allocation succeeded do not need to be trashed, so - * only new blocks from this write should be trashed. For blocks within - * EOF, generic_write_end() zeros them so they are safe to leave alone and be - * written with all the other valid data. - */ -STATIC int -xfs_vm_write_end( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned copied, - struct page *page, - void *fsdata) -{ - int ret; - - ASSERT(len <= PAGE_SIZE); - - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (unlikely(ret < len)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - loff_t to = pos + len; - - if (to > isize) { - /* only kill blocks in this write beyond EOF */ - if (pos > isize) - isize = pos; - xfs_vm_kill_delalloc_range(inode, isize, to); - truncate_pagecache_range(inode, isize, to); - } - } - return ret; -} - STATIC sector_t xfs_vm_bmap( struct address_space *mapping, @@ -1747,8 +1537,6 @@ const struct address_space_operations xfs_address_space_operations = { .set_page_dirty = xfs_vm_set_page_dirty, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, - .write_begin = xfs_vm_write_begin, - .write_end = xfs_vm_write_end, .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, -- cgit v1.2.3 From 6e8a27a816390d1fbcc28903da4dd9bc348e19ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:53:45 +1000 Subject: xfs: remove buffered write support from __xfs_get_blocks Signed-off-by: Christoph Hellwig Reviewed-by: Bob Peterson Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 71 +++++++++++++++---------------------------------------- 1 file changed, 19 insertions(+), 52 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2ac9f7e5f504..80714ebd54c0 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1143,6 +1143,8 @@ __xfs_get_blocks( ssize_t size; int new = 0; + BUG_ON(create && !direct); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1150,22 +1152,14 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && direct && offset >= i_size_read(inode)) + if (!create && offset >= i_size_read(inode)) return 0; /* * Direct I/O is usually done on preallocated files, so try getting - * a block mapping without an exclusive lock first. For buffered - * writes we already have the exclusive iolock anyway, so avoiding - * a lock roundtrip here by taking the ilock exclusive from the - * beginning is a useful micro optimization. + * a block mapping without an exclusive lock first. */ - if (create && !direct) { - lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockmode); - } else { - lockmode = xfs_ilock_data_map_shared(ip); - } + lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); if (offset + size > mp->m_super->s_maxbytes) @@ -1184,37 +1178,19 @@ __xfs_get_blocks( (imap.br_startblock == HOLESTARTBLOCK || imap.br_startblock == DELAYSTARTBLOCK) || (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - if (direct || xfs_get_extsz_hint(ip)) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); - } else { - /* - * Delalloc reservations do not require a transaction, - * we can go on without dropping the lock here. If we - * are allocating a new delalloc block, make sure that - * we set the new flag so that we mark the buffer new so - * that we know that it is newly allocated if the write - * fails. - */ - if (nimaps && imap.br_startblock == HOLESTARTBLOCK) - new = 1; - error = xfs_iomap_write_delay(ip, offset, size, &imap); - if (error) - goto out_unlock; + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + if (error) + return error; + new = 1; - xfs_iunlock(ip, lockmode); - } trace_xfs_get_blocks_alloc(ip, offset, size, ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN : XFS_IO_DELALLOC, &imap); @@ -1235,9 +1211,7 @@ __xfs_get_blocks( } /* trim mapping down to size requested */ - if (direct || size > (1 << inode->i_blkbits)) - xfs_map_trim_size(inode, iblock, bh_result, - &imap, offset, size); + xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); /* * For unwritten extents do not report a disk address in the buffered @@ -1250,7 +1224,7 @@ __xfs_get_blocks( if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); /* direct IO needs special help */ - if (create && direct) { + if (create) { if (dax_fault) ASSERT(!ISUNWRITTEN(&imap)); else @@ -1279,14 +1253,7 @@ __xfs_get_blocks( (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - if (imap.br_startblock == DELAYSTARTBLOCK) { - BUG_ON(direct); - if (create) { - set_buffer_uptodate(bh_result); - set_buffer_mapped(bh_result); - set_buffer_delay(bh_result); - } - } + BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK); return 0; -- cgit v1.2.3 From fa8d972d055c723cc427e14d4d7919640f418730 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:38:01 +1000 Subject: xfs: direct calls in the direct I/O path We control both the callers and callees of ->direct_IO, so remove the indirect calls. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4c463b99fe57..3ba0809e0be8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1336,7 +1336,7 @@ xfs_get_blocks_dax_fault( * whereas if we have flags set we will always be called in task context * (i.e. from a workqueue). */ -STATIC int +int xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, @@ -1407,24 +1407,10 @@ xfs_vm_direct_IO( struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - dio_iodone_t *endio = NULL; - int flags = 0; - struct block_device *bdev; - - if (iov_iter_rw(iter) == WRITE) { - endio = xfs_end_io_direct_write; - flags = DIO_ASYNC_EXTEND; - } - - if (IS_DAX(inode)) { - return dax_do_io(iocb, inode, iter, - xfs_get_blocks_direct, endio, 0); - } - - bdev = xfs_find_bdev_for_inode(inode); - return __blockdev_direct_IO(iocb, inode, bdev, iter, - xfs_get_blocks_direct, endio, NULL, flags); + /* + * We just need the method present so that open/fcntl allow direct I/O. + */ + return -EINVAL; } /* -- cgit v1.2.3 From 99579ccec4e271c3d4d4e7c946058766812afdab Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 22 Jul 2016 09:50:38 +1000 Subject: xfs: skip dirty pages in ->releasepage() XFS has had scattered reports of delalloc blocks present at ->releasepage() time. This results in a warning with a stack trace similar to the following: ... Call Trace: [] dump_stack+0x63/0x84 [] warn_slowpath_common+0x97/0xe0 [] warn_slowpath_null+0x1a/0x20 [] xfs_vm_releasepage+0x10f/0x140 [] ? page_mkclean_one+0xd0/0xd0 [] ? anon_vma_prepare+0x150/0x150 [] try_to_release_page+0x32/0x50 [] shrink_active_list+0x3ce/0x3e0 [] shrink_lruvec+0x687/0x7d0 [] shrink_zone+0xdc/0x2c0 [] kswapd+0x4f9/0x970 [] ? mem_cgroup_shrink_node_zone+0x1a0/0x1a0 [] kthread+0xc9/0xe0 [] ? kthread_stop+0x100/0x100 [] ret_from_fork+0x3f/0x70 [] ? kthread_stop+0x100/0x100 This occurs because it is possible for shrink_active_list() to send pages marked dirty to ->releasepage() when certain buffer_head threshold conditions are met. shrink_active_list() doesn't check the page dirty state apparently to handle an old ext3 corner case where in some cases clean pages would not have the dirty bit cleared, thus it is up to the filesystem to determine how to handle the page. XFS currently handles the delalloc case properly, but this behavior makes the warning spurious. Update the XFS ->releasepage() handler to explicitly skip dirty pages. Retain the existing delalloc/unwritten checks so we continue to warn if such buffers exist on clean pages when they shouldn't. Diagnosed-by: Dave Chinner Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3ba0809e0be8..6135787500fc 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1040,6 +1040,20 @@ xfs_vm_releasepage( trace_xfs_releasepage(page->mapping->host, page, 0, 0); + /* + * mm accommodates an old ext3 case where clean pages might not have had + * the dirty bit cleared. Thus, it can send actual dirty pages to + * ->releasepage() via shrink_active_list(). Conversely, + * block_invalidatepage() can send pages that are still marked dirty + * but otherwise have invalidated buffers. + * + * We've historically freed buffers on the latter. Instead, quietly + * filter out all dirty pages to avoid spurious buffer state warnings. + * This can likely be removed once shrink_active_list() is fixed. + */ + if (PageDirty(page)) + return 0; + xfs_count_page_state(page, &delalloc, &unwritten); if (WARN_ON_ONCE(delalloc)) -- cgit v1.2.3 From 28b783e47ad702b8e0f4861ef94cdfce6abd7c80 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jul 2016 09:56:38 +1000 Subject: xfs: bufferhead chains are invalid after end_page_writeback In xfs_finish_page_writeback(), we have a loop that looks like this: do { if (off < bvec->bv_offset) goto next_bh; if (off > end) break; bh->b_end_io(bh, !error); next_bh: off += bh->b_size; } while ((bh = bh->b_this_page) != head); The b_end_io function is end_buffer_async_write(), which will call end_page_writeback() once all the buffers have marked as no longer under IO. This issue here is that the only thing currently protecting both the bufferhead chain and the page from being reclaimed is the PageWriteback state held on the page. While we attempt to limit the loop to just the buffers covered by the IO, we still read from the buffer size and follow the next pointer in the bufferhead chain. There is no guarantee that either of these are valid after the PageWriteback flag has been cleared. Hence, loops like this are completely unsafe, and result in use-after-free issues. One such problem was caught by Calvin Owens with KASAN: ..... INFO: Freed in 0x103fc80ec age=18446651500051355200 cpu=2165122683 pid=-1 free_buffer_head+0x41/0x90 __slab_free+0x1ed/0x340 kmem_cache_free+0x270/0x300 free_buffer_head+0x41/0x90 try_to_free_buffers+0x171/0x240 xfs_vm_releasepage+0xcb/0x3b0 try_to_release_page+0x106/0x190 shrink_page_list+0x118e/0x1a10 shrink_inactive_list+0x42c/0xdf0 shrink_zone_memcg+0xa09/0xfa0 shrink_zone+0x2c3/0xbc0 ..... Call Trace: [] dump_stack+0x68/0x94 [] print_trailer+0x115/0x1a0 [] object_err+0x34/0x40 [] kasan_report_error+0x217/0x530 [] __asan_report_load8_noabort+0x43/0x50 [] xfs_destroy_ioend+0x3bf/0x4c0 [] xfs_end_bio+0x154/0x220 [] bio_endio+0x158/0x1b0 [] blk_update_request+0x18b/0xb80 [] scsi_end_request+0x97/0x5a0 [] scsi_io_completion+0x438/0x1690 [] scsi_finish_command+0x375/0x4e0 [] scsi_softirq_done+0x280/0x340 Where the access is occuring during IO completion after the buffer had been freed from direct memory reclaim. Prevent use-after-free accidents in this end_io processing loop by pre-calculating the loop conditionals before calling bh->b_end_io(). The loop is already limited to just the bufferheads covered by the IO in progress, so the offset checks are sufficient to prevent accessing buffers in the chain after end_page_writeback() has been called by the the bh->b_end_io() callout. Yet another example of why Bufferheads Must Die. cc: # 4.7 Signed-off-by: Dave Chinner Reported-and-Tested-by: Calvin Owens Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6135787500fc..f1c7f8cec22a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -87,6 +87,12 @@ xfs_find_bdev_for_inode( * We're now finished for good with this page. Update the page state via the * associated buffer_heads, paying attention to the start and end offsets that * we need to process on the page. + * + * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last + * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or + * the page at all, as we may be racing with memory reclaim and it can free both + * the bufferhead chain and the page as it will see the page as clean and + * unused. */ static void xfs_finish_page_writeback( @@ -95,8 +101,9 @@ xfs_finish_page_writeback( int error) { unsigned int end = bvec->bv_offset + bvec->bv_len - 1; - struct buffer_head *head, *bh; + struct buffer_head *head, *bh, *next; unsigned int off = 0; + unsigned int bsize; ASSERT(bvec->bv_offset < PAGE_SIZE); ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0); @@ -105,15 +112,17 @@ xfs_finish_page_writeback( bh = head = page_buffers(bvec->bv_page); + bsize = bh->b_size; do { + next = bh->b_this_page; if (off < bvec->bv_offset) goto next_bh; if (off > end) break; bh->b_end_io(bh, !error); next_bh: - off += bh->b_size; - } while ((bh = bh->b_this_page) != head); + off += bsize; + } while ((bh = next) != head); } /* -- cgit v1.2.3