From ffbd517d5a8c8e93ddd11046434fb029f3df73aa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 20 Apr 2009 15:50:09 -0400
Subject: Btrfs: use WRITE_SYNC for synchronous writes

Part of reducing fsync/O_SYNC/O_DIRECT latencies is using WRITE_SYNC for
writes we plan on waiting on in the near future.  This patch
mirrors recent changes in other filesystems and the generic code to
use WRITE_SYNC when WB_SYNC_ALL is passed and to use WRITE_SYNC for
other latency critical writes.

Btrfs uses async worker threads for checksumming before the write is done,
and then again to actually submit the bios.  The bio submission code just
runs a per-device list of bios that need to be sent down the pipe.

This list is split into low priority and high priority lists so the
WRITE_SYNC IO happens first.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92caa80..fec18b4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2095,10 +2095,10 @@ static int write_dev_supers(struct btrfs_device *device,
 				device->barriers = 0;
 				get_bh(bh);
 				lock_buffer(bh);
-				ret = submit_bh(WRITE, bh);
+				ret = submit_bh(WRITE_SYNC, bh);
 			}
 		} else {
-			ret = submit_bh(WRITE, bh);
+			ret = submit_bh(WRITE_SYNC, bh);
 		}
 
 		if (!ret && wait) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb2bee8..483b672 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -50,7 +50,10 @@ struct extent_page_data {
 	/* tells writepage not to lock the state bits for this range
 	 * it still does the unlocking
 	 */
-	int extent_locked;
+	unsigned int extent_locked:1;
+
+	/* tells the submit_bio code to use a WRITE_SYNC */
+	unsigned int sync_io:1;
 };
 
 int __init extent_io_init(void)
@@ -2136,8 +2139,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 delalloc_end;
 	int page_started;
 	int compressed;
+	int write_flags;
 	unsigned long nr_written = 0;
 
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		write_flags = WRITE_SYNC_PLUG;
+	else
+		write_flags = WRITE;
+
 	WARN_ON(!PageLocked(page));
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
@@ -2314,9 +2323,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 				       (unsigned long long)end);
 			}
 
-			ret = submit_extent_page(WRITE, tree, page, sector,
-						 iosize, pg_offset, bdev,
-						 &epd->bio, max_nr,
+			ret = submit_extent_page(write_flags, tree, page,
+						 sector, iosize, pg_offset,
+						 bdev, &epd->bio, max_nr,
 						 end_bio_extent_writepage,
 						 0, 0, 0);
 			if (ret)
@@ -2460,15 +2469,23 @@ retry:
 	return ret;
 }
 
-static noinline void flush_write_bio(void *data)
+static void flush_epd_write_bio(struct extent_page_data *epd)
 {
-	struct extent_page_data *epd = data;
 	if (epd->bio) {
-		submit_one_bio(WRITE, epd->bio, 0, 0);
+		if (epd->sync_io)
+			submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
+		else
+			submit_one_bio(WRITE, epd->bio, 0, 0);
 		epd->bio = NULL;
 	}
 }
 
+static noinline void flush_write_bio(void *data)
+{
+	struct extent_page_data *epd = data;
+	flush_epd_write_bio(epd);
+}
+
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent,
 			  struct writeback_control *wbc)
@@ -2480,6 +2497,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.tree = tree,
 		.get_extent = get_extent,
 		.extent_locked = 0,
+		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
 	struct writeback_control wbc_writepages = {
 		.bdi		= wbc->bdi,
@@ -2490,13 +2508,11 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.range_end	= (loff_t)-1,
 	};
 
-
 	ret = __extent_writepage(page, wbc, &epd);
 
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
 				 __extent_writepage, &epd, flush_write_bio);
-	if (epd.bio)
-		submit_one_bio(WRITE, epd.bio, 0, 0);
+	flush_epd_write_bio(&epd);
 	return ret;
 }
 
@@ -2515,6 +2531,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		.tree = tree,
 		.get_extent = get_extent,
 		.extent_locked = 1,
+		.sync_io = mode == WB_SYNC_ALL,
 	};
 	struct writeback_control wbc_writepages = {
 		.bdi		= inode->i_mapping->backing_dev_info,
@@ -2540,8 +2557,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		start += PAGE_CACHE_SIZE;
 	}
 
-	if (epd.bio)
-		submit_one_bio(WRITE, epd.bio, 0, 0);
+	flush_epd_write_bio(&epd);
 	return ret;
 }
 
@@ -2556,13 +2572,13 @@ int extent_writepages(struct extent_io_tree *tree,
 		.tree = tree,
 		.get_extent = get_extent,
 		.extent_locked = 0,
+		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
 
 	ret = extent_write_cache_pages(tree, mapping, wbc,
 				       __extent_writepage, &epd,
 				       flush_write_bio);
-	if (epd.bio)
-		submit_one_bio(WRITE, epd.bio, 0, 0);
+	flush_epd_write_bio(&epd);
 	return ret;
 }
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 53c87b1..d6f0806 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -489,7 +489,7 @@ again:
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
 	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
 
 	/* The compression code will leave pages locked but return from
 	 * writepage without setting the page writeback.  Starting again
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0913e4..e53835b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 	return NULL;
 }
 
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+			struct bio *head, struct bio *tail)
+{
+
+	struct bio *old_head;
+
+	old_head = pending_bios->head;
+	pending_bios->head = head;
+	if (pending_bios->tail)
+		tail->bi_next = old_head;
+	else
+		pending_bios->tail = tail;
+}
+
 /*
  * we try to collect pending bios for a device so we don't get a large
  * number of procs sending bios down to the same device.  This greatly
@@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	struct bio *pending;
 	struct backing_dev_info *bdi;
 	struct btrfs_fs_info *fs_info;
+	struct btrfs_pending_bios *pending_bios;
 	struct bio *tail;
 	struct bio *cur;
 	int again = 0;
-	unsigned long num_run = 0;
+	unsigned long num_run;
+	unsigned long num_sync_run;
 	unsigned long limit;
 	unsigned long last_waited = 0;
 
@@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
+	/* we want to make sure that every time we switch from the sync
+	 * list to the normal list, we unplug
+	 */
+	num_sync_run = 0;
+
 loop:
 	spin_lock(&device->io_lock);
+	num_run = 0;
 
 loop_lock:
+
 	/* take all the bios off the list at once and process them
 	 * later on (without the lock held).  But, remember the
 	 * tail and other pointers so the bios can be properly reinserted
 	 * into the list if we hit congestion
 	 */
-	pending = device->pending_bios;
-	tail = device->pending_bio_tail;
+	if (device->pending_sync_bios.head)
+		pending_bios = &device->pending_sync_bios;
+	else
+		pending_bios = &device->pending_bios;
+
+	pending = pending_bios->head;
+	tail = pending_bios->tail;
 	WARN_ON(pending && !tail);
-	device->pending_bios = NULL;
-	device->pending_bio_tail = NULL;
 
 	/*
 	 * if pending was null this time around, no bios need processing
@@ -176,16 +202,41 @@ loop_lock:
 	 * device->running_pending is used to synchronize with the
 	 * schedule_bio code.
 	 */
-	if (pending) {
-		again = 1;
-		device->running_pending = 1;
-	} else {
+	if (device->pending_sync_bios.head == NULL &&
+	    device->pending_bios.head == NULL) {
 		again = 0;
 		device->running_pending = 0;
+	} else {
+		again = 1;
+		device->running_pending = 1;
 	}
+
+	pending_bios->head = NULL;
+	pending_bios->tail = NULL;
+
 	spin_unlock(&device->io_lock);
 
+	/*
+	 * if we're doing the regular priority list, make sure we unplug
+	 * for any high prio bios we've sent down
+	 */
+	if (pending_bios == &device->pending_bios && num_sync_run > 0) {
+		num_sync_run = 0;
+		blk_run_backing_dev(bdi, NULL);
+	}
+
 	while (pending) {
+
+		rmb();
+		if (pending_bios != &device->pending_sync_bios &&
+		    device->pending_sync_bios.head &&
+		    num_run > 16) {
+			cond_resched();
+			spin_lock(&device->io_lock);
+			requeue_list(pending_bios, pending, tail);
+			goto loop_lock;
+		}
+
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
@@ -196,10 +247,18 @@ loop_lock:
 			wake_up(&fs_info->async_submit_wait);
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
-		bio_get(cur);
 		submit_bio(cur->bi_rw, cur);
-		bio_put(cur);
 		num_run++;
+		if (bio_sync(cur))
+			num_sync_run++;
+
+		if (need_resched()) {
+			if (num_sync_run) {
+				blk_run_backing_dev(bdi, NULL);
+				num_sync_run = 0;
+			}
+			cond_resched();
+		}
 
 		/*
 		 * we made progress, there is more work to do and the bdi
@@ -208,7 +267,6 @@ loop_lock:
 		 */
 		if (pending && bdi_write_congested(bdi) && num_run > 16 &&
 		    fs_info->fs_devices->open_devices > 1) {
-			struct bio *old_head;
 			struct io_context *ioc;
 
 			ioc = current->io_context;
@@ -233,17 +291,17 @@ loop_lock:
 				 * against it before looping
 				 */
 				last_waited = ioc->last_waited;
+				if (need_resched()) {
+					if (num_sync_run) {
+						blk_run_backing_dev(bdi, NULL);
+						num_sync_run = 0;
+					}
+					cond_resched();
+				}
 				continue;
 			}
 			spin_lock(&device->io_lock);
-
-			old_head = device->pending_bios;
-			device->pending_bios = pending;
-			if (device->pending_bio_tail)
-				tail->bi_next = old_head;
-			else
-				device->pending_bio_tail = tail;
-
+			requeue_list(pending_bios, pending, tail);
 			device->running_pending = 1;
 
 			spin_unlock(&device->io_lock);
@@ -251,11 +309,18 @@ loop_lock:
 			goto done;
 		}
 	}
+
+	if (num_sync_run) {
+		num_sync_run = 0;
+		blk_run_backing_dev(bdi, NULL);
+	}
+
+	cond_resched();
 	if (again)
 		goto loop;
 
 	spin_lock(&device->io_lock);
-	if (device->pending_bios)
+	if (device->pending_bios.head || device->pending_sync_bios.head)
 		goto loop_lock;
 	spin_unlock(&device->io_lock);
 
@@ -2497,7 +2562,7 @@ again:
 			max_errors = 1;
 		}
 	}
-	if (multi_ret && rw == WRITE &&
+	if (multi_ret && (rw & (1 << BIO_RW)) &&
 	    stripes_allocated < stripes_required) {
 		stripes_allocated = map->num_stripes;
 		free_extent_map(em);
@@ -2762,6 +2827,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 				 int rw, struct bio *bio)
 {
 	int should_queue = 1;
+	struct btrfs_pending_bios *pending_bios;
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
@@ -2783,13 +2849,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	bio->bi_rw |= rw;
 
 	spin_lock(&device->io_lock);
+	if (bio_sync(bio))
+		pending_bios = &device->pending_sync_bios;
+	else
+		pending_bios = &device->pending_bios;
 
-	if (device->pending_bio_tail)
-		device->pending_bio_tail->bi_next = bio;
+	if (pending_bios->tail)
+		pending_bios->tail->bi_next = bio;
 
-	device->pending_bio_tail = bio;
-	if (!device->pending_bios)
-		device->pending_bios = bio;
+	pending_bios->tail = bio;
+	if (!pending_bios->head)
+		pending_bios->head = bio;
 	if (device->running_pending)
 		should_queue = 0;
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2185de7..5836327 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,13 +23,22 @@
 #include "async-thread.h"
 
 struct buffer_head;
+struct btrfs_pending_bios {
+	struct bio *head;
+	struct bio *tail;
+};
+
 struct btrfs_device {
 	struct list_head dev_list;
 	struct list_head dev_alloc_list;
 	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_root *dev_root;
-	struct bio *pending_bios;
-	struct bio *pending_bio_tail;
+
+	/* regular prio bios */
+	struct btrfs_pending_bios pending_bios;
+	/* WRITE_SYNC bios */
+	struct btrfs_pending_bios pending_sync_bios;
+
 	int running_pending;
 	u64 generation;
 
-- 
cgit v0.10.2