From 6e6938b6d3130305a5960c86b1a9b21e58cf6144 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 6 Jun 2010 10:38:15 -0600 Subject: writeback: introduce .tagged_writepages for the WB_SYNC_NONE sync stage sync(2) is performed in two stages: the WB_SYNC_NONE sync and the WB_SYNC_ALL sync. Identify the first stage with .tagged_writepages and do livelock prevention for it, too. Jan's commit f446daaea9 ("mm: implement writeback livelock avoidance using page tagging") is a partial fix in that it only fixed the WB_SYNC_ALL phase livelock. Although ext4 is tested to no longer livelock with commit f446daaea9, it may due to some "redirty_tail() after pages_skipped" effect which is by no means a guarantee for _all_ the file systems. Note that writeback_inodes_sb() is called by not only sync(), they are treated the same because the other callers also need livelock prevention. Impact: It changes the order in which pages/inodes are synced to disk. Now in the WB_SYNC_NONE stage, it won't proceed to write the next inode until finished with the current inode. Acked-by: Jan Kara CC: Dave Chinner Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0f015a0468d..5ed2ce9a28d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -36,6 +36,7 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; @@ -650,6 +651,7 @@ static long wb_writeback(struct bdi_writeback *wb, { struct writeback_control wbc = { .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, .older_than_this = NULL, .for_kupdate = work->for_kupdate, .for_background = work->for_background, @@ -657,7 +659,7 @@ static long wb_writeback(struct bdi_writeback *wb, }; unsigned long oldest_jif; long wrote = 0; - long write_chunk; + long write_chunk = MAX_WRITEBACK_PAGES; struct inode *inode; if (wbc.for_kupdate) { @@ -683,9 +685,7 @@ static long wb_writeback(struct bdi_writeback *wb, * (quickly) tag currently dirty pages * (maybe slowly) sync all tagged pages */ - if (wbc.sync_mode == WB_SYNC_NONE) - write_chunk = MAX_WRITEBACK_PAGES; - else + if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) write_chunk = LONG_MAX; wbc.wb_start = jiffies; /* livelock avoidance */ @@ -1188,10 +1188,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .done = &done, - .nr_pages = nr, + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); -- cgit v1.2.3 From 94c3dcbb0b0cdfd82cedd21705424d8044edc42c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 27 Apr 2011 19:05:21 -0600 Subject: writeback: update dirtied_when for synced inode to prevent livelock Explicitly update .dirtied_when on synced inodes, so that they are no longer considered for writeback in the next round. It can prevent both of the following livelock schemes: - while true; do echo data >> f; done - while true; do touch f; done (in theory) The exact livelock condition is, during sync(1): (1) no new inodes are dirtied (2) an inode being actively dirtied On (2), the inode will be tagged and synced with .nr_to_write=LONG_MAX. When finished, it will be redirty_tail()ed because it's still dirty and (.nr_to_write > 0). redirty_tail() won't update its ->dirtied_when on condition (1). The sync work will then revisit it on the next queue_io() and find it eligible again because its old ->dirtied_when predates the sync work start time. We'll do more aggressive "keep writeback as long as we wrote something" logic in wb_writeback(). The "use LONG_MAX .nr_to_write" trick in commit b9543dac5bbc ("writeback: avoid livelocking WB_SYNC_ALL writeback") will no longer be enough to stop sync livelock. Reviewed-by: Jan Kara Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5ed2ce9a28d..fe190a8b0bc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -419,6 +419,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { + /* + * Sync livelock prevention. Each inode is tagged and synced in + * one shot. If still dirty, it will be redirty_tail()'ed below. + * Update the dirty time to prevent enqueue and sync it again. + */ + if ((inode->i_state & I_DIRTY) && + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) + inode->dirtied_when = jiffies; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() -- cgit v1.2.3 From cb9bd1159c5fe8995e151fa7df10fa19f8c119cc Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 21 Jul 2010 22:50:57 -0600 Subject: writeback: introduce writeback_control.inodes_written The flusher works on dirty inodes in batches, and may quit prematurely if the batch of inodes happen to be metadata-only dirtied: in this case wbc->nr_to_write won't be decreased at all, which stands for "no pages written" but also mis-interpreted as "no progress". So introduce writeback_control.inodes_written to count the inodes get cleaned from VFS POV. A non-zero value means there are some progress on writeback, in which case more writeback can be tried. Acked-by: Jan Kara Acked-by: Mel Gorman Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index fe190a8b0bc..e4504299f4a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -464,6 +464,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * No need to add it back to the LRU. */ list_del_init(&inode->i_wb_list); + wbc->inodes_written++; } } inode_sync_complete(inode); @@ -725,6 +726,7 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.more_io = 0; wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; + wbc.inodes_written = 0; trace_wbc_writeback_start(&wbc, wb->bdi); if (work->sb) @@ -741,6 +743,8 @@ static long wb_writeback(struct bdi_writeback *wb, */ if (wbc.nr_to_write <= 0) continue; + if (wbc.inodes_written) + continue; /* * Didn't write everything and we don't have more IO, bail */ -- cgit v1.2.3 From e6fb6da2e10682d477f2fdb749451d9fe5d168e8 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 22 Jul 2010 10:23:44 -0600 Subject: writeback: try more writeback as long as something was written writeback_inodes_wb()/__writeback_inodes_sb() are not aggressive in that they only populate possibly a subset of eligible inodes into b_io at entrance time. When the queued set of inodes are all synced, they just return, possibly with all queued inode pages written but still wbc.nr_to_write > 0. For kupdate and background writeback, there may be more eligible inodes sitting in b_dirty when the current set of b_io inodes are completed. So it is necessary to try another round of writeback as long as we made some progress in this round. When there are no more eligible inodes, no more inodes will be enqueued in queue_io(), hence nothing could/will be synced and we may safely bail. For example, imagine 100 inodes i0, i1, i2, ..., i90, i91, i99 At queue_io() time, i90-i99 happen to be expired and moved to s_io for IO. When finished successfully, if their total size is less than MAX_WRITEBACK_PAGES, nr_to_write will be > 0. Then wb_writeback() will quit the background work (w/o this patch) while it's still over background threshold. This will be a fairly normal/frequent case I guess. Now that we do tagged sync and update inode->dirtied_when after the sync, this change won't livelock sync(1). I actually tried to write 1 page per 1ms with this command write-and-fsync -n10000 -S 1000 -c 4096 /fs/test and do sync(1) at the same time. The sync completes quickly on ext4, xfs, btrfs. Acked-by: Jan Kara Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e4504299f4a..271cf2150ba 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -739,22 +739,22 @@ static long wb_writeback(struct bdi_writeback *wb, wrote += write_chunk - wbc.nr_to_write; /* - * If we consumed everything, see if we have more + * Did we write something? Try for more + * + * Dirty inodes are moved to b_io for writeback in batches. + * The completion of the current batch does not necessarily + * mean the overall work is done. So we keep looping as long + * as made some progress on cleaning pages or inodes. */ - if (wbc.nr_to_write <= 0) + if (wbc.nr_to_write < write_chunk) continue; if (wbc.inodes_written) continue; /* - * Didn't write everything and we don't have more IO, bail + * No more inodes for IO, bail */ if (!wbc.more_io) break; - /* - * Did we write something? Try for more - */ - if (wbc.nr_to_write < write_chunk) - continue; /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise -- cgit v1.2.3 From ba9aa8399fda48510d80c2fed1afb8fedbe1bb41 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 21 Jul 2010 20:32:30 -0600 Subject: writeback: the kupdate expire timestamp should be a moving target Dynamically compute the dirty expire timestamp at queue_io() time. writeback_control.older_than_this used to be determined at entrance to the kupdate writeback work. This _static_ timestamp may go stale if the kupdate work runs on and on. The flusher may then stuck with some old busy inodes, never considering newly expired inodes thereafter. This has two possible problems: - It is unfair for a large dirty inode to delay (for a long time) the writeback of small dirty inodes. - As time goes by, the large and busy dirty inode may contain only _freshly_ dirtied pages. Ignoring newly expired dirty inodes risks delaying the expired dirty pages to the end of LRU lists, triggering the evil pageout(). Nevertheless this patch merely addresses part of the problem. v2: keep policy changes inside wb_writeback() and keep the wbc.older_than_this visibility as suggested by Dave. CC: Dave Chinner Acked-by: Jan Kara Acked-by: Mel Gorman Signed-off-by: Itaru Kitayama Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 271cf2150ba..0adee7853b8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -672,11 +672,6 @@ static long wb_writeback(struct bdi_writeback *wb, long write_chunk = MAX_WRITEBACK_PAGES; struct inode *inode; - if (wbc.for_kupdate) { - wbc.older_than_this = &oldest_jif; - oldest_jif = jiffies - - msecs_to_jiffies(dirty_expire_interval * 10); - } if (!wbc.range_cyclic) { wbc.range_start = 0; wbc.range_end = LLONG_MAX; @@ -723,6 +718,12 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_background && !over_bground_thresh()) break; + if (work->for_kupdate) { + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + wbc.older_than_this = &oldest_jif; + } + wbc.more_io = 0; wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; -- cgit v1.2.3 From 424b351fe1901fc909fd0ca4f21dab58f24c1aac Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 21 Jul 2010 20:11:53 -0600 Subject: writeback: refill b_io iff empty There is no point to carry different refill policies between for_kupdate and other type of works. Use a consistent "refill b_io iff empty" policy which can guarantee fairness in an easy to understand way. A b_io refill will setup a _fixed_ work set with all currently eligible inodes and start a new round of walk through b_io. The "fixed" work set means no new inodes will be added to the work set during the walk. Only when a complete walk over b_io is done, new inodes that are eligible at the time will be enqueued and the walk be started over. This procedure provides fairness among the inodes because it guarantees each inode to be synced once and only once at each round. So all inodes will be free from starvations. This change relies on wb_writeback() to keep retrying as long as we made some progress on cleaning some pages and/or inodes. Without that ability, the old logic on background works relies on aggressively queuing all eligible inodes into b_io at every time. But that's not a guarantee. The below test script completes a slightly faster now: 2.6.39-rc3 2.6.39-rc3-dyn-expire+ ------------------------------------------------ all elapsed 256.043 252.367 stddev 24.381 12.530 tar elapsed 30.097 28.808 dd elapsed 13.214 11.782 #!/bin/zsh cp /c/linux-2.6.38.3.tar.bz2 /dev/shm/ umount /dev/sda7 mkfs.xfs -f /dev/sda7 mount /dev/sda7 /fs echo 3 > /proc/sys/vm/drop_caches tic=$(cat /proc/uptime|cut -d' ' -f2) cd /fs time tar jxf /dev/shm/linux-2.6.38.3.tar.bz2 & time dd if=/dev/zero of=/fs/zero bs=1M count=1000 & wait sync tac=$(cat /proc/uptime|cut -d' ' -f2) echo elapsed: $((tac - tic)) It maintains roughly the same small vs. large file writeout shares, and offers large files better chances to be written in nice 4M chunks. Analyzes from Dave Chinner in great details: Let's say we have lots of inodes with 100 dirty pages being created, and one large writeback going on. We expire 8 new inodes for every 1024 pages we write back. With the old code, we do: b_more_io (large inode) -> b_io (1l) 8 newly expired inodes -> b_io (1l, 8s) writeback large inode 1024 pages -> b_more_io b_more_io (large inode) -> b_io (8s, 1l) 8 newly expired inodes -> b_io (8s, 1l, 8s) writeback 8 small inodes 800 pages 1 large inode 224 pages -> b_more_io b_more_io (large inode) -> b_io (8s, 1l) 8 newly expired inodes -> b_io (8s, 1l, 8s) ..... Your new code: b_more_io (large inode) -> b_io (1l) 8 newly expired inodes -> b_io (1l, 8s) writeback large inode 1024 pages -> b_more_io (b_io == 8s) writeback 8 small inodes 800 pages b_io empty: (1800 pages written) b_more_io (large inode) -> b_io (1l) 14 newly expired inodes -> b_io (1l, 14s) writeback large inode 1024 pages -> b_more_io (b_io == 14s) writeback 10 small inodes 1000 pages 1 small inode 24 pages -> b_more_io (1l, 1s(24)) writeback 5 small inodes 500 pages b_io empty: (2548 pages written) b_more_io (large inode) -> b_io (1l, 1s(24)) 20 newly expired inodes -> b_io (1l, 1s(24), 20s) ...... Rough progression of pages written at b_io refill: Old code: total large file % of writeback 1024 224 21.9% (fixed) New code: total large file % of writeback 1800 1024 ~55% 2550 1024 ~40% 3050 1024 ~33% 3500 1024 ~29% 3950 1024 ~26% 4250 1024 ~24% 4500 1024 ~22.7% 4700 1024 ~21.7% 4800 1024 ~21.3% 4800 1024 ~21.3% (pretty much steady state from here) Ok, so the steady state is reached with a similar percentage of writeback to the large file as the existing code. Ok, that's good, but providing some evidence that is doesn't change the shared of writeback to the large should be in the commit message ;) The other advantage to this is that we always write 1024 page chunks to the large file, rather than smaller "whatever remains" chunks. CC: Jan Kara Acked-by: Mel Gorman Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0adee7853b8..664acdb2e7e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -589,7 +589,8 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ spin_lock(&inode_wb_list_lock); - if (!wbc->for_kupdate || list_empty(&wb->b_io)) + + if (list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); while (!list_empty(&wb->b_io)) { @@ -616,7 +617,7 @@ static void __writeback_inodes_sb(struct super_block *sb, WARN_ON(!rwsem_is_locked(&sb->s_umount)); spin_lock(&inode_wb_list_lock); - if (!wbc->for_kupdate || list_empty(&wb->b_io)) + if (list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); writeback_sb_inodes(sb, wb, wbc, true); spin_unlock(&inode_wb_list_lock); -- cgit v1.2.3 From f758eeabeb96f878c860e8f110f94ec8820822a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Apr 2011 18:19:44 -0600 Subject: writeback: split inode_wb_list_lock into bdi_writeback.list_lock Split the global inode_wb_list_lock into a per-bdi_writeback list_lock, as it's currently the most contended lock in the system for metadata heavy workloads. It won't help for single-filesystem workloads for which we'll need the I/O-less balance_dirty_pages, but at least we can dedicate a cpu to spinning on each bdi now for larger systems. Based on earlier patches from Nick Piggin and Dave Chinner. It reduces lock contentions to 1/4 in this test case: 10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram lock_stat version 0.3 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- vanilla 2.6.39-rc3: inode_wb_list_lock: 42590 44433 0.12 147.74 144127.35 252274 886792 0.08 121.34 917211.23 ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 34 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 12893 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 10702 [] writeback_single_inode+0x16d/0x20a ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 19 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 5550 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 8511 [] writeback_sb_inodes+0x10f/0x157 2.6.39-rc3 + patch: &(&wb->list_lock)->rlock: 11383 11657 0.14 151.69 40429.51 90825 527918 0.11 145.90 556843.37 ------------------------ &(&wb->list_lock)->rlock 10 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 1493 [] writeback_inodes_wb+0x3d/0x150 &(&wb->list_lock)->rlock 3652 [] writeback_sb_inodes+0x123/0x16f &(&wb->list_lock)->rlock 1412 [] writeback_single_inode+0x17f/0x223 ------------------------ &(&wb->list_lock)->rlock 3 [] bdi_lock_two+0x46/0x4b &(&wb->list_lock)->rlock 6 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 2061 [] __mark_inode_dirty+0x173/0x1cf &(&wb->list_lock)->rlock 2629 [] writeback_sb_inodes+0x123/0x16f hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment Signed-off-by: Christoph Hellwig Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 97 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 48 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 664acdb2e7e..36a30917e0d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -181,12 +181,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - spin_lock(&inode_wb_list_lock); + struct backing_dev_info *bdi = inode_to_bdi(inode); + + spin_lock(&bdi->wb.list_lock); list_del_init(&inode->i_wb_list); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); } - /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. @@ -196,11 +197,9 @@ void inode_wb_list_del(struct inode *inode) * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ -static void redirty_tail(struct inode *inode) +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -214,11 +213,9 @@ static void redirty_tail(struct inode *inode) /* * requeue inode for re-scanning after bdi->b_io list is exhausted. */ -static void requeue_io(struct inode *inode) +static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); list_move(&inode->i_wb_list, &wb->b_more_io); } @@ -226,7 +223,7 @@ static void inode_sync_complete(struct inode *inode) { /* * Prevent speculative execution through - * spin_unlock(&inode_wb_list_lock); + * spin_unlock(&wb->list_lock); */ smp_mb(); @@ -302,7 +299,7 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } @@ -317,7 +314,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) /* * Wait for writeback on an inode to complete. */ -static void inode_wait_for_writeback(struct inode *inode) +static void inode_wait_for_writeback(struct inode *inode, + struct bdi_writeback *wb) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; @@ -325,15 +323,15 @@ static void inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under inode_wb_list_lock and + * Write out an inode's dirty pages. Called under wb->list_lock and * inode->i_lock. Either the caller has an active reference on the inode or * the inode has I_WILL_FREE set. * @@ -344,13 +342,14 @@ static void inode_wait_for_writeback(struct inode *inode) * livelocks, etc. */ static int -writeback_single_inode(struct inode *inode, struct writeback_control *wbc) +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; unsigned dirty; int ret; - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); if (!atomic_read(&inode->i_count)) @@ -368,14 +367,14 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completed a full scan of b_io. */ if (wbc->sync_mode != WB_SYNC_ALL) { - requeue_io(inode); + requeue_io(inode, wb); return 0; } /* * It's a data-integrity sync. We must wait. */ - inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode, wb); } BUG_ON(inode->i_state & I_SYNC); @@ -384,7 +383,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); ret = do_writepages(mapping, wbc); @@ -415,7 +414,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { @@ -438,7 +437,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* * slice used up: queue for next turn */ - requeue_io(inode); + requeue_io(inode, wb); } else { /* * Writeback blocked by something other than @@ -447,7 +446,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - redirty_tail(inode); + redirty_tail(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -456,7 +455,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * submission or metadata updates after data IO * completion. */ - redirty_tail(inode); + redirty_tail(inode, wb); } else { /* * The inode is clean. At this point we either have @@ -521,7 +520,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * superblock, move all inodes not belonging * to it back onto the dirty list. */ - redirty_tail(inode); + redirty_tail(inode, wb); continue; } @@ -541,7 +540,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); - requeue_io(inode); + requeue_io(inode, wb); continue; } @@ -557,19 +556,19 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, __iget(inode); pages_skipped = wbc->pages_skipped; - writeback_single_inode(inode, wbc); + writeback_single_inode(inode, wb, wbc); if (wbc->pages_skipped != pages_skipped) { /* * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - redirty_tail(inode); + redirty_tail(inode, wb); } spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); iput(inode); cond_resched(); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); if (wbc->nr_to_write <= 0) { wbc->more_io = 1; return 1; @@ -588,7 +587,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -598,7 +597,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, struct super_block *sb = inode->i_sb; if (!pin_sb_for_writeback(sb)) { - requeue_io(inode); + requeue_io(inode, wb); continue; } ret = writeback_sb_inodes(sb, wb, wbc, false); @@ -607,7 +606,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (ret) break; } - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); /* Leave any unwritten inodes on b_io */ } @@ -616,11 +615,11 @@ static void __writeback_inodes_sb(struct super_block *sb, { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); writeback_sb_inodes(sb, wb, wbc, true); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); } /* @@ -762,15 +761,15 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); spin_lock(&inode->i_lock); - inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); } - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); } return wrote; @@ -1104,10 +1103,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) } spin_unlock(&inode->i_lock); - spin_lock(&inode_wb_list_lock); + spin_lock(&bdi->wb.list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); @@ -1309,6 +1308,7 @@ EXPORT_SYMBOL(sync_inodes_sb); */ int write_inode_now(struct inode *inode, int sync) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; int ret; struct writeback_control wbc = { .nr_to_write = LONG_MAX, @@ -1321,11 +1321,11 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, &wbc); + ret = writeback_single_inode(inode, wb, &wbc); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1345,13 +1345,14 @@ EXPORT_SYMBOL(write_inode_now); */ int sync_inode(struct inode *inode, struct writeback_control *wbc) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; int ret; - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, wbc); + ret = writeback_single_inode(inode, wb, wbc); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); return ret; } EXPORT_SYMBOL(sync_inode); -- cgit v1.2.3 From e8dfc30582995ae12454cda517b17d6294175b07 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 21 Apr 2011 12:06:32 -0600 Subject: writeback: elevate queue_io() into wb_writeback() Code refactor for more logical code layout. No behavior change. - remove the mis-named __writeback_inodes_sb() - wb_writeback()/writeback_inodes_wb() will decide when to queue_io() before calling __writeback_inodes_wb() Acked-by: Jan Kara Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 36a30917e0d..565b1fd15be 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -580,17 +580,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, return 1; } -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +static void __writeback_inodes_wb(struct bdi_writeback *wb, + struct writeback_control *wbc) { int ret = 0; if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&wb->list_lock); - - if (list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -606,19 +602,16 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (ret) break; } - spin_unlock(&wb->list_lock); /* Leave any unwritten inodes on b_io */ } -static void __writeback_inodes_sb(struct super_block *sb, - struct bdi_writeback *wb, struct writeback_control *wbc) +void writeback_inodes_wb(struct bdi_writeback *wb, + struct writeback_control *wbc) { - WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); - writeback_sb_inodes(sb, wb, wbc, true); + __writeback_inodes_wb(wb, wbc); spin_unlock(&wb->list_lock); } @@ -685,7 +678,7 @@ static long wb_writeback(struct bdi_writeback *wb, * The intended call sequence for WB_SYNC_ALL writeback is: * * wb_writeback() - * __writeback_inodes_sb() <== called only once + * writeback_sb_inodes() <== called only once * write_cache_pages() <== called once for each inode * (quickly) tag currently dirty pages * (maybe slowly) sync all tagged pages @@ -694,6 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb, write_chunk = LONG_MAX; wbc.wb_start = jiffies; /* livelock avoidance */ + spin_lock(&wb->list_lock); for (;;) { /* * Stop writeback when nr_pages has been consumed @@ -730,10 +724,12 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.inodes_written = 0; trace_wbc_writeback_start(&wbc, wb->bdi); + if (list_empty(&wb->b_io)) + queue_io(wb, wbc.older_than_this); if (work->sb) - __writeback_inodes_sb(work->sb, wb, &wbc); + writeback_sb_inodes(work->sb, wb, &wbc, true); else - writeback_inodes_wb(wb, &wbc); + __writeback_inodes_wb(wb, &wbc); trace_wbc_writeback_written(&wbc, wb->bdi); work->nr_pages -= write_chunk - wbc.nr_to_write; @@ -761,7 +757,6 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&wb->list_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); @@ -769,8 +764,8 @@ static long wb_writeback(struct bdi_writeback *wb, inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); } - spin_unlock(&wb->list_lock); } + spin_unlock(&wb->list_lock); return wrote; } -- cgit v1.2.3 From e185dda89d69cde142b48059413a03561f41f78a Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 23 Apr 2011 11:26:07 -0600 Subject: writeback: avoid extra sync work at enqueue time This removes writeback_control.wb_start and does more straightforward sync livelock prevention by setting .older_than_this to prevent extra inodes from being enqueued in the first place. Acked-by: Jan Kara Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 565b1fd15be..d0553f33fb5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -544,15 +544,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, continue; } - /* - * Was this inode dirtied after sync_sb_inodes was called? - * This keeps sync from extra jobs and livelock. - */ - if (inode_dirtied_after(inode, wbc->wb_start)) { - spin_unlock(&inode->i_lock); - return 1; - } - __iget(inode); pages_skipped = wbc->pages_skipped; @@ -585,9 +576,6 @@ static void __writeback_inodes_wb(struct bdi_writeback *wb, { int ret = 0; - if (!wbc->wb_start) - wbc->wb_start = jiffies; /* livelock avoidance */ - while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; @@ -686,7 +674,9 @@ static long wb_writeback(struct bdi_writeback *wb, if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) write_chunk = LONG_MAX; - wbc.wb_start = jiffies; /* livelock avoidance */ + oldest_jif = jiffies; + wbc.older_than_this = &oldest_jif; + spin_lock(&wb->list_lock); for (;;) { /* -- cgit v1.2.3 From b7a2441f9966fe3e1be960a876ab52e6029ea005 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 21 Jul 2010 22:19:51 -0600 Subject: writeback: remove writeback_control.more_io When wbc.more_io was first introduced, it indicates whether there are at least one superblock whose s_more_io contains more IO work. Now with the per-bdi writeback, it can be replaced with a simple b_more_io test. Acked-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Minchan Kim Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d0553f33fb5..f43c479feee 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -560,12 +560,8 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, iput(inode); cond_resched(); spin_lock(&wb->list_lock); - if (wbc->nr_to_write <= 0) { - wbc->more_io = 1; + if (wbc->nr_to_write <= 0) return 1; - } - if (!list_empty(&wb->b_more_io)) - wbc->more_io = 1; } /* b_io is empty */ return 1; @@ -708,7 +704,6 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.older_than_this = &oldest_jif; } - wbc.more_io = 0; wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; wbc.inodes_written = 0; @@ -740,7 +735,7 @@ static long wb_writeback(struct bdi_writeback *wb, /* * No more inodes for IO, bail */ - if (!wbc.more_io) + if (list_empty(&wb->b_more_io)) break; /* * Nothing written. Wait for some inode to -- cgit v1.2.3 From 251d6a471c831e22880b3c146bb4556ddfb1dc82 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 1 Dec 2010 17:33:37 -0600 Subject: writeback: trace event writeback_single_inode It is valuable to know how the dirty inodes are iterated and their IO size. "writeback_single_inode: bdi 8:0: ino=134246746 state=I_DIRTY_SYNC|I_SYNC age=414 index=0 to_write=1024 wrote=0" - "state" reflects inode->i_state at the end of writeback_single_inode() - "index" reflects mapping->writeback_index after the ->writepages() call - "to_write" is the wbc->nr_to_write at entrance of writeback_single_inode() - "wrote" is the number of pages actually written v2: add trace event writeback_single_inode_requeue as proposed by Dave. CC: Dave Chinner Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f43c479feee..5185fad48b6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -346,6 +346,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; + long nr_to_write = wbc->nr_to_write; unsigned dirty; int ret; @@ -368,6 +369,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, */ if (wbc->sync_mode != WB_SYNC_ALL) { requeue_io(inode, wb); + trace_writeback_single_inode_requeue(inode, wbc, + nr_to_write); return 0; } @@ -467,6 +470,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, } } inode_sync_complete(inode); + trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } -- cgit v1.2.3 From e84d0a4f8e39a73003a6ec9a11b07702745f4c1f Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 23 Apr 2011 12:27:27 -0600 Subject: writeback: trace event writeback_queue_io Note that it adds a little overheads to account the moved/enqueued inodes from b_dirty to b_io. The "moved" accounting may be later used to limit the number of inodes that can be moved in one shot, in order to keep spinlock hold time under control. Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5185fad48b6..6caa98247a5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -248,15 +248,16 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) /* * Move expired dirty inodes from @delaying_queue to @dispatch_queue. */ -static void move_expired_inodes(struct list_head *delaying_queue, +static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - unsigned long *older_than_this) + unsigned long *older_than_this) { LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; struct inode *inode; int do_sb_sort = 0; + int moved = 0; while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); @@ -267,12 +268,13 @@ static void move_expired_inodes(struct list_head *delaying_queue, do_sb_sort = 1; sb = inode->i_sb; list_move(&inode->i_wb_list, &tmp); + moved++; } /* just one sb in list, splice to dispatch_queue and we're done */ if (!do_sb_sort) { list_splice(&tmp, dispatch_queue); - return; + goto out; } /* Move inodes from one superblock together */ @@ -284,6 +286,8 @@ static void move_expired_inodes(struct list_head *delaying_queue, list_move(&inode->i_wb_list, dispatch_queue); } } +out: + return moved; } /* @@ -299,9 +303,11 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { + int moved; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); + trace_writeback_queue_io(wb, older_than_this, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) -- cgit v1.2.3 From d46db3d58233be4be980eb1e42eebe7808bcabab Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 4 May 2011 19:54:37 -0600 Subject: writeback: make writeback_control.nr_to_write straight Pass struct wb_writeback_work all the way down to writeback_sb_inodes(), and initialize the struct writeback_control there. struct writeback_control is basically designed to control writeback of a single file, but we keep abuse it for writing multiple files in writeback_sb_inodes() and its callers. It immediately clean things up, e.g. suddenly wbc.nr_to_write vs work->nr_pages starts to make sense, and instead of saving and restoring pages_skipped in writeback_sb_inodes it can always start with a clean zero value. It also makes a neat IO pattern change: large dirty files are now written in the full 4MB writeback chunk size, rather than whatever remained quota in wbc->nr_to_write. Acked-by: Jan Kara Proposed-by: Christoph Hellwig Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 196 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 111 insertions(+), 85 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6caa98247a5..2c947da39f6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -29,12 +29,22 @@ #include #include "internal.h" +/* + * The maximum number of pages to writeout in a single bdi flush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024L + /* * Passed into wb_writeback(), essentially a subset of writeback_control */ struct wb_writeback_work { long nr_pages; struct super_block *sb; + unsigned long *older_than_this; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; @@ -472,7 +482,6 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * No need to add it back to the LRU. */ list_del_init(&inode->i_wb_list); - wbc->inodes_written++; } } inode_sync_complete(inode); @@ -506,6 +515,31 @@ static bool pin_sb_for_writeback(struct super_block *sb) return false; } +static long writeback_chunk_size(struct wb_writeback_work *work) +{ + long pages; + + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * writeback_sb_inodes() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) + pages = LONG_MAX; + else + pages = min(MAX_WRITEBACK_PAGES, work->nr_pages); + + return pages; +} + /* * Write a portion of b_io inodes which belong to @sb. * @@ -513,18 +547,30 @@ static bool pin_sb_for_writeback(struct super_block *sb) * inodes. Otherwise write only ones which go sequentially * in reverse order. * - * Return 1, if the caller writeback routine should be - * interrupted. Otherwise return 0. + * Return the number of pages and/or inodes written. */ -static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, - struct writeback_control *wbc, bool only_this_sb) +static long writeback_sb_inodes(struct super_block *sb, + struct bdi_writeback *wb, + struct wb_writeback_work *work) { + struct writeback_control wbc = { + .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .range_cyclic = work->range_cyclic, + .range_start = 0, + .range_end = LLONG_MAX, + }; + unsigned long start_time = jiffies; + long write_chunk; + long wrote = 0; /* count both pages and inodes */ + while (!list_empty(&wb->b_io)) { - long pages_skipped; struct inode *inode = wb_inode(wb->b_io.prev); if (inode->i_sb != sb) { - if (only_this_sb) { + if (work->sb) { /* * We only want to write back data for this * superblock, move all inodes not belonging @@ -539,7 +585,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * Bounce back to the caller to unpin this and * pin the next superblock. */ - return 0; + break; } /* @@ -553,12 +599,18 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, requeue_io(inode, wb); continue; } - __iget(inode); + write_chunk = writeback_chunk_size(work); + wbc.nr_to_write = write_chunk; + wbc.pages_skipped = 0; + + writeback_single_inode(inode, wb, &wbc); - pages_skipped = wbc->pages_skipped; - writeback_single_inode(inode, wb, wbc); - if (wbc->pages_skipped != pages_skipped) { + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; + if (!(inode->i_state & I_DIRTY)) + wrote++; + if (wbc.pages_skipped) { /* * writeback is not making progress due to locked * buffers. Skip this inode for now. @@ -570,17 +622,25 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, iput(inode); cond_resched(); spin_lock(&wb->list_lock); - if (wbc->nr_to_write <= 0) - return 1; + /* + * bail out to wb_writeback() often enough to check + * background threshold and other termination conditions. + */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; + } } - /* b_io is empty */ - return 1; + return wrote; } -static void __writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +static long __writeback_inodes_wb(struct bdi_writeback *wb, + struct wb_writeback_work *work) { - int ret = 0; + unsigned long start_time = jiffies; + long wrote = 0; while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -590,33 +650,37 @@ static void __writeback_inodes_wb(struct bdi_writeback *wb, requeue_io(inode, wb); continue; } - ret = writeback_sb_inodes(sb, wb, wbc, false); + wrote += writeback_sb_inodes(sb, wb, work); drop_super(sb); - if (ret) - break; + /* refer to the same tests at the end of writeback_sb_inodes */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; + } } /* Leave any unwritten inodes on b_io */ + return wrote; } -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) { + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + }; + spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); - __writeback_inodes_wb(wb, wbc); + queue_io(wb, NULL); + __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); -} -/* - * The maximum number of pages to writeout in a single bdi flush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 + return nr_pages - work.nr_pages; +} static inline bool over_bground_thresh(void) { @@ -646,42 +710,13 @@ static inline bool over_bground_thresh(void) static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { - struct writeback_control wbc = { - .sync_mode = work->sync_mode, - .tagged_writepages = work->tagged_writepages, - .older_than_this = NULL, - .for_kupdate = work->for_kupdate, - .for_background = work->for_background, - .range_cyclic = work->range_cyclic, - }; + long nr_pages = work->nr_pages; unsigned long oldest_jif; - long wrote = 0; - long write_chunk = MAX_WRITEBACK_PAGES; struct inode *inode; - - if (!wbc.range_cyclic) { - wbc.range_start = 0; - wbc.range_end = LLONG_MAX; - } - - /* - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX - * here avoids calling into writeback_inodes_wb() more than once. - * - * The intended call sequence for WB_SYNC_ALL writeback is: - * - * wb_writeback() - * writeback_sb_inodes() <== called only once - * write_cache_pages() <== called once for each inode - * (quickly) tag currently dirty pages - * (maybe slowly) sync all tagged pages - */ - if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) - write_chunk = LONG_MAX; + long progress; oldest_jif = jiffies; - wbc.older_than_this = &oldest_jif; + work->older_than_this = &oldest_jif; spin_lock(&wb->list_lock); for (;;) { @@ -711,24 +746,17 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_kupdate) { oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - wbc.older_than_this = &oldest_jif; + work->older_than_this = &oldest_jif; } - wbc.nr_to_write = write_chunk; - wbc.pages_skipped = 0; - wbc.inodes_written = 0; - - trace_wbc_writeback_start(&wbc, wb->bdi); + trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) - queue_io(wb, wbc.older_than_this); + queue_io(wb, work->older_than_this); if (work->sb) - writeback_sb_inodes(work->sb, wb, &wbc, true); + progress = writeback_sb_inodes(work->sb, wb, work); else - __writeback_inodes_wb(wb, &wbc); - trace_wbc_writeback_written(&wbc, wb->bdi); - - work->nr_pages -= write_chunk - wbc.nr_to_write; - wrote += write_chunk - wbc.nr_to_write; + progress = __writeback_inodes_wb(wb, work); + trace_writeback_written(wb->bdi, work); /* * Did we write something? Try for more @@ -738,9 +766,7 @@ static long wb_writeback(struct bdi_writeback *wb, * mean the overall work is done. So we keep looping as long * as made some progress on cleaning pages or inodes. */ - if (wbc.nr_to_write < write_chunk) - continue; - if (wbc.inodes_written) + if (progress) continue; /* * No more inodes for IO, bail @@ -753,8 +779,8 @@ static long wb_writeback(struct bdi_writeback *wb, * we'll just busyloop. */ if (!list_empty(&wb->b_more_io)) { + trace_writeback_wait(wb->bdi, work); inode = wb_inode(wb->b_more_io.prev); - trace_wbc_writeback_wait(&wbc, wb->bdi); spin_lock(&inode->i_lock); inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); @@ -762,7 +788,7 @@ static long wb_writeback(struct bdi_writeback *wb, } spin_unlock(&wb->list_lock); - return wrote; + return nr_pages - work->nr_pages; } /* -- cgit v1.2.3 From e98be2d599207c6b31e9bb340d52a231b2f3662d Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 29 Aug 2010 11:22:30 -0600 Subject: writeback: bdi write bandwidth estimation The estimation value will start from 100MB/s and adapt to the real bandwidth in seconds. It tries to update the bandwidth only when disk is fully utilized. Any inactive period of more than one second will be skipped. The estimated bandwidth will be reflecting how fast the device can writeout when _fully utilized_, and won't drop to 0 when it goes idle. The value will remain constant at disk idle time. At busy write time, if not considering fluctuations, it will also remain high unless be knocked down by possible concurrent reads that compete for the disk time and bandwidth with async writes. The estimation is not done purely in the flusher because there is no guarantee for write_cache_pages() to return timely to update bandwidth. The bdi->avg_write_bandwidth smoothing is very effective for filtering out sudden spikes, however may be a little biased in long term. The overheads are low because the bdi bandwidth update only occurs at 200ms intervals. The 200ms update interval is suitable, because it's not possible to get the real bandwidth for the instance at all, due to large fluctuations. The NFS commits can be as large as seconds worth of data. One XFS completion may be as large as half second worth of data if we are going to increase the write chunk to half second worth of data. In ext4, fluctuations with time period of around 5 seconds is observed. And there is another pattern of irregular periods of up to 20 seconds on SSD tests. That's why we are not only doing the estimation at 200ms intervals, but also averaging them over a period of 3 seconds and then go further to do another level of smoothing in avg_write_bandwidth. CC: Li Shaohua CC: Peter Zijlstra Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2c947da39f6..5826992910e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -692,6 +692,16 @@ static inline bool over_bground_thresh(void) global_page_state(NR_UNSTABLE_NFS) > background_thresh); } +/* + * Called under wb->list_lock. If there are multiple wb per bdi, + * only the flusher working on the first wb should do it. + */ +static void wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long start_time) +{ + __bdi_update_bandwidth(wb->bdi, start_time); +} + /* * Explicit flushing or periodic writeback of "old" data. * @@ -710,6 +720,7 @@ static inline bool over_bground_thresh(void) static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { + unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; unsigned long oldest_jif; struct inode *inode; @@ -758,6 +769,8 @@ static long wb_writeback(struct bdi_writeback *wb, progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb->bdi, work); + wb_update_bandwidth(wb, wb_start); + /* * Did we write something? Try for more * -- cgit v1.2.3 From c42843f2f0bbc9d716a32caf667d18fc2bf3bc4c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 2 Mar 2011 15:54:09 -0600 Subject: writeback: introduce smoothed global dirty limit The start of a heavy weight application (ie. KVM) may instantly knock down determine_dirtyable_memory() if the swap is not enabled or full. global_dirty_limits() and bdi_dirty_limit() will in turn get global/bdi dirty thresholds that are _much_ lower than the global/bdi dirty pages. balance_dirty_pages() will then heavily throttle all dirtiers including the light ones, until the dirty pages drop below the new dirty thresholds. During this _deep_ dirty-exceeded state, the system may appear rather unresponsive to the users. About "deep" dirty-exceeded: task_dirty_limit() assigns 1/8 lower dirty threshold to heavy dirtiers than light ones, and the dirty pages will be throttled around the heavy dirtiers' dirty threshold and reasonably below the light dirtiers' dirty threshold. In this state, only the heavy dirtiers will be throttled and the dirty pages are carefully controlled to not exceed the light dirtiers' dirty threshold. However if the threshold itself suddenly drops below the number of dirty pages, the light dirtiers will get heavily throttled. So introduce global_dirty_limit for tracking the global dirty threshold with policies - follow downwards slowly - follow up in one shot global_dirty_limit can effectively mask out the impact of sudden drop of dirtyable memory. It will be used in the next patch for two new type of dirty limits. Note that the new dirty limits are not going to avoid throttling the light dirtiers, but could limit their sleep time to 200ms. Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5826992910e..227ff12257f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -699,7 +699,7 @@ static inline bool over_bground_thresh(void) static void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - __bdi_update_bandwidth(wb->bdi, start_time); + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); } /* -- cgit v1.2.3 From 1a12d8bd7b2998be01ee55edb64e7473728abb9c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 29 Aug 2010 13:28:09 -0600 Subject: writeback: scale IO chunk size up to half device bandwidth Originally, MAX_WRITEBACK_PAGES was hard-coded to 1024 because of a concern of not holding I_SYNC for too long. (At least, that was the comment previously.) This doesn't make sense now because the only time we wait for I_SYNC is if we are calling sync or fsync, and in that case we need to write out all of the data anyway. Previously there may have been other code paths that waited on I_SYNC, but not any more. -- Theodore Ts'o So remove the MAX_WRITEBACK_PAGES constraint. The writeback pages will adapt to as large as the storage device can write within 500ms. XFS is observed to do IO completions in a batch, and the batch size is equal to the write chunk size. To avoid dirty pages to suddenly drop out of balance_dirty_pages()'s dirty control scope and create large fluctuations, the chunk size is also limited to half the control scope. The balance_dirty_pages() control scrope is [(background_thresh + dirty_thresh) / 2, dirty_thresh] which is by default [15%, 20%] of global dirty pages, whose range size is dirty_thresh / DIRTY_FULL_SCOPE. The adpative write chunk size will be rounded to the nearest 4MB boundary. http://bugzilla.kernel.org/show_bug.cgi?id=13930 CC: Theodore Ts'o CC: Dave Chinner CC: Chris Mason CC: Peter Zijlstra Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 227ff12257f..50445cf0b83 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -29,15 +29,6 @@ #include #include "internal.h" -/* - * The maximum number of pages to writeout in a single bdi flush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024L - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -515,7 +506,8 @@ static bool pin_sb_for_writeback(struct super_block *sb) return false; } -static long writeback_chunk_size(struct wb_writeback_work *work) +static long writeback_chunk_size(struct backing_dev_info *bdi, + struct wb_writeback_work *work) { long pages; @@ -534,8 +526,13 @@ static long writeback_chunk_size(struct wb_writeback_work *work) */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) pages = LONG_MAX; - else - pages = min(MAX_WRITEBACK_PAGES, work->nr_pages); + else { + pages = min(bdi->avg_write_bandwidth / 2, + global_dirty_limit / DIRTY_SCOPE); + pages = min(pages, work->nr_pages); + pages = round_down(pages + MIN_WRITEBACK_PAGES, + MIN_WRITEBACK_PAGES); + } return pages; } @@ -600,7 +597,7 @@ static long writeback_sb_inodes(struct super_block *sb, continue; } __iget(inode); - write_chunk = writeback_chunk_size(work); + write_chunk = writeback_chunk_size(wb->bdi, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; -- cgit v1.2.3 From fcc5c22218a18509a7412bf074fc9a7a5d874a8a Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 11 Jul 2011 23:08:50 -0700 Subject: writeback: don't busy retry writeback on new/freeing inodes Fix a system hang bug introduced by commit b7a2441f9966 ("writeback: remove writeback_control.more_io") and e8dfc3058 ("writeback: elevate queue_io() into wb_writeback()") easily reproducible with high memory pressure and lots of file creation/deletions, for example, a kernel build in limited memory. It hangs when some inode is in the I_NEW, I_FREEING or I_WILL_FREE state, the flusher will get stuck busy retrying that inode, never releasing wb->list_lock. The lock in turn blocks all kinds of other tasks when they are trying to grab it. As put by Jan, it's a safe change regarding data integrity. I_FREEING or I_WILL_FREE inodes are written back by iput_final() and it is reclaim code that is responsible for eventually removing them. So writeback code can safely ignore them. I_NEW inodes should move out of this state when they are fully set up and in the writeback round following that, we will consider them for writeback. So the change makes sense. CC: Jan Kara Reported-by: Hugh Dickins Tested-by: Hugh Dickins Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 50445cf0b83..6d49439ca31 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -593,7 +593,7 @@ static long writeback_sb_inodes(struct super_block *sb, spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); - requeue_io(inode, wb); + redirty_tail(inode, wb); continue; } __iget(inode); -- cgit v1.2.3