From f772b3d9ca135087a70406d8466e57d1cf29237e Mon Sep 17 00:00:00 2001
From: Vasily Tarasov <vtaras@openvz.org>
Date: Tue, 27 Mar 2007 08:52:47 +0200
Subject: block: blk_max_pfn is somtimes wrong

There is a small problem in handling page bounce.

At the moment blk_max_pfn equals max_pfn, which is in fact not maximum
possible _number_ of a page frame, but the _amount_ of page frames.  For
example for the 32bit x86 node with 4Gb RAM, max_pfn = 0x100000, but not
0xFFFF.

request_queue structure has a member q->bounce_pfn and queue needs bounce
pages for the pages _above_ this limit.  This routine is handled by
blk_queue_bounce(), where the following check is produced:

	if (q->bounce_pfn >= blk_max_pfn)
		return;

Assume, that a driver has set q->bounce_pfn to 0xFFFF, but blk_max_pfn
equals 0x10000.  In such situation the check above fails and for each bio
we always fall down for iterating over pages tied to the bio.

I want to notice, that for quite a big range of device drivers (ide, md,
...) such problem doesn't happen because they use BLK_BOUNCE_ANY for
bounce_pfn.  BLK_BOUNCE_ANY is defined as blk_max_pfn << PAGE_SHIFT, and
then the check above doesn't fail.  But for other drivers, which obtain
reuired value from drivers, it fails.  For example sata_nv uses
ATA_DMA_MASK or dev->dma_mask.

I propose to use (max_pfn - 1) for blk_max_pfn.  And the same for
blk_max_low_pfn.  The patch also cleanses some checks related with
bounce_pfn.

Signed-off-by: Vasily Tarasov <vtaras@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 38c293b..3de0695 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1221,7 +1221,7 @@ void blk_recount_segments(request_queue_t *q, struct bio *bio)
 		 * considered part of another segment, since that might
 		 * change with the bounce page.
 		 */
-		high = page_to_pfn(bv->bv_page) >= q->bounce_pfn;
+		high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
 		if (high || highprv)
 			goto new_hw_segment;
 		if (cluster) {
@@ -3658,8 +3658,8 @@ int __init blk_dev_init(void)
 	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
 	register_hotcpu_notifier(&blk_cpu_notifier);
 
-	blk_max_low_pfn = max_low_pfn;
-	blk_max_pfn = max_pfn;
+	blk_max_low_pfn = max_low_pfn - 1;
+	blk_max_pfn = max_pfn - 1;
 
 	return 0;
 }
diff --git a/mm/bounce.c b/mm/bounce.c
index 643efbe..ad401fc 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -204,7 +204,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
 		/*
 		 * is destination page below bounce pfn?
 		 */
-		if (page_to_pfn(page) < q->bounce_pfn)
+		if (page_to_pfn(page) <= q->bounce_pfn)
 			continue;
 
 		/*
-- 
cgit v0.10.2


From 1ffb96c587fa2af0d690dc3548a4a781c477bfb7 Mon Sep 17 00:00:00 2001
From: Thibaut VARENE <T-Bone@parisc-linux.org>
Date: Thu, 15 Mar 2007 12:59:19 +0100
Subject: make elv_register() output atomic

Booting 2.6.21-rc3-g45592145 I noticed the following on one of my
machines in the bootlog:

io scheduler noop registered<6>Time: jiffies clocksource has been installed.

io scheduler deadline registered (default)

Looking at block/elevator.c, it appears that elv_register() uses two
consecutive printks in a non-atomic way, leading to the above glitch. The
attached trivial patch fixes this issue, by using a single printk.

Signed-off-by: Thibaut VARENE <varenet@parisc-linux.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/elevator.c b/block/elevator.c
index 25f6ef2..96a00c8 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -964,17 +964,18 @@ void elv_unregister_queue(struct request_queue *q)
 
 int elv_register(struct elevator_type *e)
 {
+	char *def = "";
 	spin_lock_irq(&elv_list_lock);
 	BUG_ON(elevator_find(e->elevator_name));
 	list_add_tail(&e->list, &elv_list);
 	spin_unlock_irq(&elv_list_lock);
 
-	printk(KERN_INFO "io scheduler %s registered", e->elevator_name);
 	if (!strcmp(e->elevator_name, chosen_elevator) ||
 			(!*chosen_elevator &&
 			 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
-				printk(" (default)");
-	printk("\n");
+				def = " (default)";
+
+	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, def);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(elv_register);
-- 
cgit v0.10.2


From 485ddb4b9741bafb70b22e5c1f9b4f37dc3e85bd Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 27 Mar 2007 08:55:08 +0200
Subject: 1/2 splice: dont steal

Stealing pages with splice is problematic because we cannot just insert
an uptodate page into the pagecache and hope the filesystem can take care
of it later.

We also cannot just ClearPageUptodate, then hope prepare_write does not
write anything into the page, because I don't think prepare_write gives
that guarantee.

Remove support for SPLICE_F_MOVE for now. If we really want to bring it
back, we might be able to do so with a the new filesystem buffered write
aops APIs I'm working on. If we really don't want to bring it back, then
we should decide that sooner rather than later, and remove the flag and
all the stealing infrastructure before anybody starts using it.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/splice.c b/fs/splice.c
index 2fca6eb..badc78f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -576,76 +576,51 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	if (this_len + offset > PAGE_CACHE_SIZE)
 		this_len = PAGE_CACHE_SIZE - offset;
 
-	/*
-	 * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
-	 * page.
-	 */
-	if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
+find_page:
+	page = find_lock_page(mapping, index);
+	if (!page) {
+		ret = -ENOMEM;
+		page = page_cache_alloc_cold(mapping);
+		if (unlikely(!page))
+			goto out_ret;
+
 		/*
-		 * If steal succeeds, buf->page is now pruned from the
-		 * pagecache and we can reuse it. The page will also be
-		 * locked on successful return.
+		 * This will also lock the page
 		 */
-		if (buf->ops->steal(pipe, buf))
-			goto find_page;
-
-		page = buf->page;
-		if (add_to_page_cache(page, mapping, index, GFP_KERNEL)) {
-			unlock_page(page);
-			goto find_page;
-		}
-
-		page_cache_get(page);
-
-		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
-			lru_cache_add(page);
-	} else {
-find_page:
-		page = find_lock_page(mapping, index);
-		if (!page) {
-			ret = -ENOMEM;
-			page = page_cache_alloc_cold(mapping);
-			if (unlikely(!page))
-				goto out_ret;
+		ret = add_to_page_cache_lru(page, mapping, index,
+					    GFP_KERNEL);
+		if (unlikely(ret))
+			goto out;
+	}
 
-			/*
-			 * This will also lock the page
-			 */
-			ret = add_to_page_cache_lru(page, mapping, index,
-						    GFP_KERNEL);
+	/*
+	 * We get here with the page locked. If the page is also
+	 * uptodate, we don't need to do more. If it isn't, we
+	 * may need to bring it in if we are not going to overwrite
+	 * the full page.
+	 */
+	if (!PageUptodate(page)) {
+		if (this_len < PAGE_CACHE_SIZE) {
+			ret = mapping->a_ops->readpage(file, page);
 			if (unlikely(ret))
 				goto out;
-		}
 
-		/*
-		 * We get here with the page locked. If the page is also
-		 * uptodate, we don't need to do more. If it isn't, we
-		 * may need to bring it in if we are not going to overwrite
-		 * the full page.
-		 */
-		if (!PageUptodate(page)) {
-			if (this_len < PAGE_CACHE_SIZE) {
-				ret = mapping->a_ops->readpage(file, page);
-				if (unlikely(ret))
-					goto out;
-
-				lock_page(page);
-
-				if (!PageUptodate(page)) {
-					/*
-					 * Page got invalidated, repeat.
-					 */
-					if (!page->mapping) {
-						unlock_page(page);
-						page_cache_release(page);
-						goto find_page;
-					}
-					ret = -EIO;
-					goto out;
+			lock_page(page);
+
+			if (!PageUptodate(page)) {
+				/*
+				 * Page got invalidated, repeat.
+				 */
+				if (!page->mapping) {
+					unlock_page(page);
+					page_cache_release(page);
+					goto find_page;
 				}
-			} else
-				SetPageUptodate(page);
-		}
+				ret = -EIO;
+				goto out;
+			}
+		} else
+			SetPageUptodate(page);
 	}
 
 	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
-- 
cgit v0.10.2


From 08c72591636829d40bd695d43ec6d2a8191b668b Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 27 Mar 2007 08:55:39 +0200
Subject: 2/2 splice: dont readpage

Splice does not need to readpage to bring the page uptodate before writing
to it, because prepare_write will take care of that for us.

Splice is also wrong to SetPageUptodate before the page is actually uptodate.
This results in the old uninitialised memory leak. This gets fixed as a
matter of course when removing the readpage logic.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/splice.c b/fs/splice.c
index badc78f..ae50208 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -593,36 +593,6 @@ find_page:
 			goto out;
 	}
 
-	/*
-	 * We get here with the page locked. If the page is also
-	 * uptodate, we don't need to do more. If it isn't, we
-	 * may need to bring it in if we are not going to overwrite
-	 * the full page.
-	 */
-	if (!PageUptodate(page)) {
-		if (this_len < PAGE_CACHE_SIZE) {
-			ret = mapping->a_ops->readpage(file, page);
-			if (unlikely(ret))
-				goto out;
-
-			lock_page(page);
-
-			if (!PageUptodate(page)) {
-				/*
-				 * Page got invalidated, repeat.
-				 */
-				if (!page->mapping) {
-					unlock_page(page);
-					page_cache_release(page);
-					goto find_page;
-				}
-				ret = -EIO;
-				goto out;
-			}
-		} else
-			SetPageUptodate(page);
-	}
-
 	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
 	if (unlikely(ret)) {
 		loff_t isize = i_size_read(mapping->host);
-- 
cgit v0.10.2


From 40bee44eaef91b6030037c8bb47f909181fb1edc Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 21 Mar 2007 13:11:02 +0100
Subject: Export __splice_from_pipe()

Ocfs2 wants to implement it's own splice write actor so that it can better
manage cluster / page locks. This lets us re-use the rest of splice write
while only providing our own code where it's actually important.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/splice.c b/fs/splice.c
index ae50208..07f6556 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -651,9 +651,9 @@ out_ret:
  * key here is the 'actor' worker passed in that actually moves the data
  * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
  */
-static ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
-				  struct file *out, loff_t *ppos, size_t len,
-				  unsigned int flags, splice_actor *actor)
+ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
+			   struct file *out, loff_t *ppos, size_t len,
+			   unsigned int flags, splice_actor *actor)
 {
 	int ret, do_wakeup, err;
 	struct splice_desc sd;
@@ -747,6 +747,7 @@ static ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
 
 	return ret;
 }
+EXPORT_SYMBOL(__splice_from_pipe);
 
 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 			 loff_t *ppos, size_t len, unsigned int flags,
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 2e19478..8bcbc54 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -99,4 +99,8 @@ extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *,
 				loff_t *, size_t, unsigned int,
 				splice_actor *);
 
+extern ssize_t __splice_from_pipe(struct pipe_inode_info *, struct file *,
+				  loff_t *, size_t, unsigned int,
+				  splice_actor *);
+
 #endif
-- 
cgit v0.10.2