From a5989bdc981ec85e0734ac22519cc0b780813d7b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 16 Sep 2009 19:22:48 +0200
Subject: fs: Fix busyloop in wb_writeback()

If all inodes are under writeback (e.g. in case when there's only one inode
with dirty pages), wb_writeback() with WB_SYNC_NONE work basically degrades
to busylooping until I_SYNC flags of the inode is cleared. Fix the problem by
waiting on I_SYNC flags of an inode on b_more_io list in case we failed to
write anything.

Tested-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8e1e5e1..c59d673 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -706,6 +706,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
+	struct inode *inode;
 
 	if (wbc.for_kupdate) {
 		wbc.older_than_this = &oldest_jif;
@@ -747,8 +748,24 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * If we ran out of stuff to write, bail unless more_io got set
 		 */
 		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
-			if (wbc.more_io && !wbc.for_kupdate)
+			if (wbc.more_io && !wbc.for_kupdate) {
+				if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+					continue;
+				/*
+				 * Nothing written. Wait for some inode to
+				 * become available for writeback. Otherwise
+				 * we'll just busyloop.
+				 */
+				spin_lock(&inode_lock);
+				if (!list_empty(&wb->b_more_io))  {
+					inode = list_entry(
+							wb->b_more_io.prev,
+							struct inode, i_list);
+					inode_wait_for_writeback(inode);
+				}
+				spin_unlock(&inode_lock);
 				continue;
+			}
 			break;
 		}
 	}
-- 
cgit v0.10.2


From 3a2e9a5a2afc1a2d2c548b8987f133235cebe933 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 23 Sep 2009 21:56:00 +0800
Subject: writeback: balance_dirty_pages() shall write more than dirtied pages

Some filesystem may choose to write much more than ratelimit_pages
before calling balance_dirty_pages_ratelimited_nr(). So it is safer to
determine number to write based on real number of dirtied pages.

Otherwise it is possible that
  loop {
    btrfs_file_write():     dirty 1024 pages
    balance_dirty_pages():  write up to 48 pages (= ratelimit_pages * 1.5)
  }
in which the writeback rate cannot keep up with dirty rate, and the
dirty pages go all the way beyond dirty_thresh.

The increased write_chunk may make the dirtier more bumpy.
So filesystems shall be take care not to dirty too much at
a time (eg. > 4MB) without checking the ratelimit.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5f378dd5..cbd4cba 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -44,12 +44,15 @@ static long ratelimit_pages = 32;
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
  * non-background writeback, this is how many pages it will attempt to write.
- * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
+ * It should be somewhat larger than dirtied pages to ensure that reasonably
  * large amounts of I/O are submitted.
  */
-static inline long sync_writeback_pages(void)
+static inline long sync_writeback_pages(unsigned long dirtied)
 {
-	return ratelimit_pages + ratelimit_pages / 2;
+	if (dirtied < ratelimit_pages)
+		dirtied = ratelimit_pages;
+
+	return dirtied + dirtied / 2;
 }
 
 /* The following parameters are exported via /proc/sys/vm */
@@ -477,7 +480,8 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
  * If we're over `background_thresh' then pdflush is woken to perform some
  * writeout.
  */
-static void balance_dirty_pages(struct address_space *mapping)
+static void balance_dirty_pages(struct address_space *mapping,
+				unsigned long write_chunk)
 {
 	long nr_reclaimable, bdi_nr_reclaimable;
 	long nr_writeback, bdi_nr_writeback;
@@ -485,7 +489,6 @@ static void balance_dirty_pages(struct address_space *mapping)
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
 	unsigned long pages_written = 0;
-	unsigned long write_chunk = sync_writeback_pages();
 	unsigned long pause = 1;
 
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
@@ -640,9 +643,10 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 	p =  &__get_cpu_var(bdp_ratelimits);
 	*p += nr_pages_dirtied;
 	if (unlikely(*p >= ratelimit)) {
+		ratelimit = sync_writeback_pages(*p);
 		*p = 0;
 		preempt_enable();
-		balance_dirty_pages(mapping);
+		balance_dirty_pages(mapping, ratelimit);
 		return;
 	}
 	preempt_enable();
-- 
cgit v0.10.2


From d3ddec7635b6fb37cb49e3553bdeea59642be653 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 23 Sep 2009 20:33:40 +0800
Subject: writeback: stop background writeback when below background threshold

Treat bdi_start_writeback(0) as a special request to do background write,
and stop such work when we are below the background dirty threshold.

Also simplify the (nr_pages <= 0) checks. Since we already pass in
nr_pages=LONG_MAX for WB_SYNC_ALL and background writes, we don't
need to worry about it being decreased to zero.

Reported-by: Richard Kennedy <richard@rsk.demon.co.uk>
CC: Jan Kara <jack@suse.cz>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c59d673..476be9b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,8 +41,9 @@ struct wb_writeback_args {
 	long nr_pages;
 	struct super_block *sb;
 	enum writeback_sync_modes sync_mode;
-	int for_kupdate;
-	int range_cyclic;
+	int for_kupdate:1;
+	int range_cyclic:1;
+	int for_background:1;
 };
 
 /*
@@ -257,6 +258,15 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 		.range_cyclic	= 1,
 	};
 
+	/*
+	 * We treat @nr_pages=0 as the special case to do background writeback,
+	 * ie. to sync pages until the background dirty threshold is reached.
+	 */
+	if (!nr_pages) {
+		args.nr_pages = LONG_MAX;
+		args.for_background = 1;
+	}
+
 	bdi_alloc_queue_work(bdi, &args);
 }
 
@@ -720,20 +730,16 @@ static long wb_writeback(struct bdi_writeback *wb,
 
 	for (;;) {
 		/*
-		 * Don't flush anything for non-integrity writeback where
-		 * no nr_pages was given
+		 * Stop writeback when nr_pages has been consumed
 		 */
-		if (!args->for_kupdate && args->nr_pages <= 0 &&
-		     args->sync_mode == WB_SYNC_NONE)
+		if (args->nr_pages <= 0)
 			break;
 
 		/*
-		 * If no specific pages were given and this is just a
-		 * periodic background writeout and we are below the
-		 * background dirty threshold, don't do anything
+		 * For background writeout, stop when we are below the
+		 * background dirty threshold
 		 */
-		if (args->for_kupdate && args->nr_pages <= 0 &&
-		    !over_bground_thresh())
+		if (args->for_background && !over_bground_thresh())
 			break;
 
 		wbc.more_io = 0;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cbd4cba..3c78fc3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -593,10 +593,10 @@ static void balance_dirty_pages(struct address_space *mapping,
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-	    (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
-					  + global_page_state(NR_UNSTABLE_NFS))
+	    (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
+			       + global_page_state(NR_UNSTABLE_NFS))
 					  > background_thresh)))
-		bdi_start_writeback(bdi, nr_writeback);
+		bdi_start_writeback(bdi, 0);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
-- 
cgit v0.10.2


From 7fbdea32328312c65870c397a0a436c3226c8631 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 23 Sep 2009 20:33:41 +0800
Subject: writeback: kupdate writeback shall not stop when more io is possible

Fix the kupdate case, which disregards wbc.more_io and stop writeback
prematurely even when there are more inodes to be synced.

wbc.more_io should always be respected.

Also remove the pages_skipped check. It will set when some page(s) of some
inode(s) cannot be written for now. Such inodes will be delayed for a while.
This variable has nothing to do with whether there are other writeable inodes.

CC: Jan Kara <jack@suse.cz>
CC: Dave Chinner <david@fromorbit.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 476be9b..551684d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -753,8 +753,8 @@ static long wb_writeback(struct bdi_writeback *wb,
 		/*
 		 * If we ran out of stuff to write, bail unless more_io got set
 		 */
-		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
-			if (wbc.more_io && !wbc.for_kupdate) {
+		if (wbc.nr_to_write > 0) {
+			if (wbc.more_io) {
 				if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
 					continue;
 				/*
-- 
cgit v0.10.2


From ae1b7f7d4b9ea587fda95c38301f4e72e8146634 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 23 Sep 2009 20:33:42 +0800
Subject: writeback: cleanup writeback_single_inode()

Make the if-else straight in writeback_single_inode().
No behavior change.

Cc: Jan Kara <jack@suse.cz>
Cc: Michael Rubin <mrubin@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 551684d..916e834 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -449,8 +449,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
-		if (!(inode->i_state & I_DIRTY) &&
-		    mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		if (inode->i_state & I_DIRTY) {
+			/*
+			 * Someone redirtied the inode while were writing back
+			 * the pages.
+			 */
+			redirty_tail(inode);
+		} else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 			/*
 			 * We didn't write back all the pages.  nfs_writepages()
 			 * sometimes bales out without doing anything. Redirty
@@ -494,12 +499,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 				inode->i_state |= I_DIRTY_PAGES;
 				redirty_tail(inode);
 			}
-		} else if (inode->i_state & I_DIRTY) {
-			/*
-			 * Someone redirtied the inode while were writing back
-			 * the pages.
-			 */
-			redirty_tail(inode);
 		} else if (atomic_read(&inode->i_count)) {
 			/*
 			 * The inode is clean, inuse
-- 
cgit v0.10.2


From 71fd05a887e0f3f6bfff76ff81b33776177d0606 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 23 Sep 2009 19:32:26 +0200
Subject: writeback: improve readability of the wb_writeback() continue/break
 logic

And throw some comments in there, too.

Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 916e834..15e375b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -750,29 +750,32 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
 		/*
-		 * If we ran out of stuff to write, bail unless more_io got set
+		 * If we consumed everything, see if we have more
 		 */
-		if (wbc.nr_to_write > 0) {
-			if (wbc.more_io) {
-				if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
-					continue;
-				/*
-				 * Nothing written. Wait for some inode to
-				 * become available for writeback. Otherwise
-				 * we'll just busyloop.
-				 */
-				spin_lock(&inode_lock);
-				if (!list_empty(&wb->b_more_io))  {
-					inode = list_entry(
-							wb->b_more_io.prev,
-							struct inode, i_list);
-					inode_wait_for_writeback(inode);
-				}
-				spin_unlock(&inode_lock);
-				continue;
-			}
+		if (wbc.nr_to_write <= 0)
+			continue;
+		/*
+		 * Didn't write everything and we don't have more IO, bail
+		 */
+		if (!wbc.more_io)
 			break;
+		/*
+		 * Did we write something? Try for more
+		 */
+		if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+			continue;
+		/*
+		 * Nothing written. Wait for some inode to
+		 * become available for writeback. Otherwise
+		 * we'll just busyloop.
+		 */
+		spin_lock(&inode_lock);
+		if (!list_empty(&wb->b_more_io))  {
+			inode = list_entry(wb->b_more_io.prev,
+						struct inode, i_list);
+			inode_wait_for_writeback(inode);
 		}
+		spin_unlock(&inode_lock);
 	}
 
 	return wrote;
-- 
cgit v0.10.2


From 5b0830cb9085f4b69f9d57d7f3aaff322ffbec26 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 23 Sep 2009 19:37:09 +0200
Subject: writeback: get rid to incorrect references to pdflush in comments

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/buffer.c b/fs/buffer.c
index 90a9886..fc22b45 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -274,7 +274,7 @@ void invalidate_bdev(struct block_device *bdev)
 }
 
 /*
- * Kick pdflush then try to free up some ZONE_NORMAL memory.
+ * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
  */
 static void free_more_memory(void)
 {
@@ -1699,9 +1699,9 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 		/*
 		 * If it's a fully non-blocking write attempt and we cannot
 		 * lock the buffer then redirty the page.  Note that this can
-		 * potentially cause a busy-wait loop from pdflush and kswapd
-		 * activity, but those code paths have their own higher-level
-		 * throttling.
+		 * potentially cause a busy-wait loop from writeback threads
+		 * and kswapd activity, but those code paths have their own
+		 * higher-level throttling.
 		 */
 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
 			lock_buffer(bh);
@@ -3191,7 +3191,7 @@ void block_sync_page(struct page *page)
  * still running obsolete flush daemons, so we terminate them here.
  *
  * Use of bdflush() is deprecated and will be removed in a future kernel.
- * The `pdflush' kernel threads fully replace bdflush daemons and this call.
+ * The `flush-X' kernel threads fully replace bdflush daemons and this call.
  */
 SYSCALL_DEFINE2(bdflush, int, func, long, data)
 {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 15e375b..15944f7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -320,7 +320,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
 	 * For inodes being constantly redirtied, dirtied_when can get stuck.
 	 * It _appears_ to be in the future, but is actually in distant past.
 	 * This test is necessary to prevent such wrapped-around relative times
-	 * from permanently stopping the whole pdflush writeback.
+	 * from permanently stopping the whole bdi writeback.
 	 */
 	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
 #endif
@@ -1085,9 +1085,6 @@ EXPORT_SYMBOL(__mark_inode_dirty);
  * If older_than_this is non-NULL, then only write out inodes which
  * had their first dirtying at a time earlier than *older_than_this.
  *
- * If we're a pdlfush thread, then implement pdflush collision avoidance
- * against the entire list.
- *
  * If `bdi' is non-zero then we're being asked to writeback a specific queue.
  * This function assumes that the blockdev superblock's inodes are backed by
  * a variety of queues, so all inodes are searched.  For other superblocks,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3c78fc3..8bef063 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -58,7 +58,7 @@ static inline long sync_writeback_pages(unsigned long dirtied)
 /* The following parameters are exported via /proc/sys/vm */
 
 /*
- * Start background writeback (via pdflush) at this percentage
+ * Start background writeback (via writeback threads) at this percentage
  */
 int dirty_background_ratio = 10;
 
@@ -477,8 +477,8 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
  * the caller to perform writeback if the system is over `vm_dirty_ratio'.
- * If we're over `background_thresh' then pdflush is woken to perform some
- * writeout.
+ * If we're over `background_thresh' then the writeback threads are woken to
+ * perform some writeout.
  */
 static void balance_dirty_pages(struct address_space *mapping,
 				unsigned long write_chunk)
@@ -582,7 +582,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		bdi->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
-		return;		/* pdflush is already working this queue */
+		return;
 
 	/*
 	 * In laptop mode, we wait until hitting the higher threshold before
diff --git a/mm/shmem.c b/mm/shmem.c
index b206a7a..aa94811 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1046,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	 * sync from ever calling shmem_writepage; but a stacking filesystem
 	 * may use the ->writepage of its underlying filesystem, in which case
 	 * tmpfs should write out to swap only in response to memory pressure,
-	 * and not for pdflush or sync.  However, in those cases, we do still
-	 * want to check if there's a redundant swappage to be discarded.
+	 * and not for the writeback threads or sync.  However, in those cases,
+	 * we do still want to check if there's a redundant swappage to be
+	 * discarded.
 	 */
 	if (wbc->for_reclaim)
 		swap = get_swap_page();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 613e89f..359c3c5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1709,10 +1709,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
  *
  * If the caller is !__GFP_FS then the probability of a failure is reasonably
  * high - the zone may be full of dirty or under-writeback pages, which this
- * caller can't do much about.  We kick pdflush and take explicit naps in the
- * hope that some of these pages can be written.  But if the allocating task
- * holds filesystem locks which prevent writeout this might not work, and the
- * allocation attempt will fail.
+ * caller can't do much about.  We kick the writeback threads and take explicit
+ * naps in the hope that some of these pages can be written.  But if the
+ * allocating task holds filesystem locks which prevent writeout this might not
+ * work, and the allocation attempt will fail.
  *
  * returns:	0, if no pages reclaimed
  * 		else, the number of pages reclaimed
-- 
cgit v0.10.2


From 5c03449d34debca0deab58046377e1175c1bcd7e Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Thu, 24 Sep 2009 14:42:33 +0200
Subject: writeback: move inodes from one super_block together

__mark_inode_dirty adds inode to wb dirty list in random order. If a disk has
several partitions, writeback might keep spindle moving between partitions.
To reduce the move, better write big chunk of one partition and then move to
another. Inodes from one fs usually are in one partion, so idealy move indoes
from one fs together should reduce spindle move. This patch tries to address
this. Before per-bdi writeback is added, the behavior is write indoes
from one fs first and then another, so the patch restores previous behavior.
The loop in the patch is a bit ugly, should we add a dirty list for each
superblock in bdi_writeback?

Test in a two partition disk with attached fio script shows about 3% ~ 6%
improvement.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 15944f7..b27406d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -334,13 +334,28 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 			       struct list_head *dispatch_queue,
 				unsigned long *older_than_this)
 {
+	LIST_HEAD(tmp);
+	struct list_head *pos, *node;
+	struct super_block *sb;
+	struct inode *inode;
+
 	while (!list_empty(delaying_queue)) {
-		struct inode *inode = list_entry(delaying_queue->prev,
-						struct inode, i_list);
+		inode = list_entry(delaying_queue->prev, struct inode, i_list);
 		if (older_than_this &&
 		    inode_dirtied_after(inode, *older_than_this))
 			break;
-		list_move(&inode->i_list, dispatch_queue);
+		list_move(&inode->i_list, &tmp);
+	}
+
+	/* Move inodes from one superblock together */
+	while (!list_empty(&tmp)) {
+		inode = list_entry(tmp.prev, struct inode, i_list);
+		sb = inode->i_sb;
+		list_for_each_prev_safe(pos, node, &tmp) {
+			inode = list_entry(pos, struct inode, i_list);
+			if (inode->i_sb == sb)
+				list_move(&inode->i_list, dispatch_queue);
+		}
 	}
 }
 
-- 
cgit v0.10.2


From cf137307cd9827495b65e7d74ea2b610daa9898b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 24 Sep 2009 15:12:57 +0200
Subject: writeback: don't resort for a single super_block in
 move_expired_inodes()

If we only moved inodes from a single super_block to the temporary
list, there's no point in doing a resort for multiple super_blocks.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index b27406d..225c7316 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -336,17 +336,27 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 {
 	LIST_HEAD(tmp);
 	struct list_head *pos, *node;
-	struct super_block *sb;
+	struct super_block *sb = NULL;
 	struct inode *inode;
+	int do_sb_sort = 0;
 
 	while (!list_empty(delaying_queue)) {
 		inode = list_entry(delaying_queue->prev, struct inode, i_list);
 		if (older_than_this &&
 		    inode_dirtied_after(inode, *older_than_this))
 			break;
+		if (sb && sb != inode->i_sb)
+			do_sb_sort = 1;
+		sb = inode->i_sb;
 		list_move(&inode->i_list, &tmp);
 	}
 
+	/* just one sb in list, splice to dispatch_queue and we're done */
+	if (!do_sb_sort) {
+		list_splice(&tmp, dispatch_queue);
+		return;
+	}
+
 	/* Move inodes from one superblock together */
 	while (!list_empty(&tmp)) {
 		inode = list_entry(tmp.prev, struct inode, i_list);
-- 
cgit v0.10.2


From 9ecc2738ac2371f88dff5d48914b4e35c45203cd Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 24 Sep 2009 15:25:11 +0200
Subject: writeback: make the super_block pinning more efficient

Currently we pin the inode->i_sb for every single inode. This
increases cache traffic on sb->s_umount sem. Lets instead
cache the inode sb pin state and keep the super_block pinned
for as long as keep writing out inodes from the same
super_block.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 225c7316..c6bf775 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -540,6 +540,17 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
+static void unpin_sb_for_writeback(struct super_block **psb)
+{
+	struct super_block *sb = *psb;
+
+	if (sb) {
+		up_read(&sb->s_umount);
+		put_super(sb);
+		*psb = NULL;
+	}
+}
+
 /*
  * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
  * before calling writeback. So make sure that we do pin it, so it doesn't
@@ -549,11 +560,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * 1 if we failed.
  */
 static int pin_sb_for_writeback(struct writeback_control *wbc,
-				   struct inode *inode)
+				struct inode *inode, struct super_block **psb)
 {
 	struct super_block *sb = inode->i_sb;
 
 	/*
+	 * If this sb is already pinned, nothing more to do. If not and
+	 * *psb is non-NULL, unpin the old one first
+	 */
+	if (sb == *psb)
+		return 0;
+	else if (*psb)
+		unpin_sb_for_writeback(psb);
+
+	/*
 	 * Caller must already hold the ref for this
 	 */
 	if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -566,7 +586,7 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
 	if (down_read_trylock(&sb->s_umount)) {
 		if (sb->s_root) {
 			spin_unlock(&sb_lock);
-			return 0;
+			goto pinned;
 		}
 		/*
 		 * umounted, drop rwsem again and fall through to failure
@@ -577,24 +597,15 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
 	sb->s_count--;
 	spin_unlock(&sb_lock);
 	return 1;
-}
-
-static void unpin_sb_for_writeback(struct writeback_control *wbc,
-				   struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-
-	if (wbc->sync_mode == WB_SYNC_ALL)
-		return;
-
-	up_read(&sb->s_umount);
-	put_super(sb);
+pinned:
+	*psb = sb;
+	return 0;
 }
 
 static void writeback_inodes_wb(struct bdi_writeback *wb,
 				struct writeback_control *wbc)
 {
-	struct super_block *sb = wbc->sb;
+	struct super_block *sb = wbc->sb, *pin_sb = NULL;
 	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
@@ -653,7 +664,7 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		if (inode_dirtied_after(inode, start))
 			break;
 
-		if (pin_sb_for_writeback(wbc, inode)) {
+		if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
 			requeue_io(inode);
 			continue;
 		}
@@ -662,7 +673,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
-		unpin_sb_for_writeback(wbc, inode);
 		if (wbc->pages_skipped != pages_skipped) {
 			/*
 			 * writeback is not making progress due to locked
@@ -682,6 +692,8 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 			wbc->more_io = 1;
 	}
 
+	unpin_sb_for_writeback(&pin_sb);
+
 	spin_unlock(&inode_lock);
 	/* Leave any unwritten inodes on b_io */
 }
-- 
cgit v0.10.2


From b3af9468aebf5fcb573d0a116b31d2be1d43c0e9 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 25 Sep 2009 06:04:10 +0200
Subject: writeback: don't delay inodes redirtied by a fast dirtier

Debug traces show that in per-bdi writeback, the inode under writeback
almost always get redirtied by a busy dirtier.  We used to call
redirty_tail() in this case, which could delay inode for up to 30s.

This is unacceptable because it now happens so frequently for plain cp/dd,
that the accumulated delays could make writeback of big files very slow.

So let's distinguish between data redirty and metadata only redirty.
The first one is caused by a busy dirtier, while the latter one could
happen in XFS, NFS, etc. when they are doing delalloc or updating isize.

The inode being busy dirtied will now be requeued for next io, while
the inode being redirtied by fs will continue to be delayed to avoid
repeated IO.

CC: Jan Kara <jack@suse.cz>
CC: Theodore Ts'o <tytso@mit.edu>
CC: Dave Chinner <david@fromorbit.com>
CC: Chris Mason <chris.mason@oracle.com>
CC: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c6bf775..52aa545 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -474,10 +474,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
-		if (inode->i_state & I_DIRTY) {
+		if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
 			/*
-			 * Someone redirtied the inode while were writing back
-			 * the pages.
+			 * More pages get dirtied by a fast dirtier.
+			 */
+			goto select_queue;
+		} else if (inode->i_state & I_DIRTY) {
+			/*
+			 * At least XFS will redirty the inode during the
+			 * writeback (delalloc) and on io completion (isize).
 			 */
 			redirty_tail(inode);
 		} else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
@@ -502,6 +507,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 				 * soon as the queue becomes uncongested.
 				 */
 				inode->i_state |= I_DIRTY_PAGES;
+select_queue:
 				if (wbc->nr_to_write <= 0) {
 					/*
 					 * slice used up: queue for next turn
-- 
cgit v0.10.2


From 56a131dcf7ed36c3c6e36bea448b674ea85ed5bb Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 25 Sep 2009 17:15:03 +0200
Subject: writeback: writeback_inodes_sb() should use bdi_start_writeback()

Pointless to iterate other devices looking for a super, when
we have a bdi mapping.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 52aa545..fb61178 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1206,7 +1206,7 @@ void writeback_inodes_sb(struct super_block *sb)
 	nr_to_write = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	bdi_writeback_all(sb, nr_to_write);
+	bdi_start_writeback(sb->s_bdi, nr_to_write);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
-- 
cgit v0.10.2