From f772b3d9ca135087a70406d8466e57d1cf29237e Mon Sep 17 00:00:00 2001 From: Vasily Tarasov Date: Tue, 27 Mar 2007 08:52:47 +0200 Subject: block: blk_max_pfn is sometimes wrong There is a small problem in handling page bounce. At the moment blk_max_pfn equals max_pfn, which is in fact not the maximum possible _number_ of a page frame, but the _amount_ of page frames. For example, for a 32-bit x86 node with 4GB of RAM, max_pfn = 0x100000, not 0xFFFFF. The request_queue structure has a member q->bounce_pfn, and the queue needs bounce pages for the pages _above_ this limit. This is handled by blk_queue_bounce(), where the following check is performed: if (q->bounce_pfn >= blk_max_pfn) return; Assume that a driver has set q->bounce_pfn to 0xFFFFF, but blk_max_pfn equals 0x100000. In that situation the check above fails, and for each bio we always fall through to iterating over the pages tied to the bio. Note that for quite a big range of device drivers (ide, md, ...) this problem doesn't happen because they use BLK_BOUNCE_ANY for bounce_pfn. BLK_BOUNCE_ANY is defined as blk_max_pfn << PAGE_SHIFT, and then the check above doesn't fail. But for other drivers, which obtain the required value from drivers, it fails. For example sata_nv uses ATA_DMA_MASK or dev->dma_mask. I propose to use (max_pfn - 1) for blk_max_pfn. And the same for blk_max_low_pfn. The patch also cleans up some checks related to bounce_pfn. Signed-off-by: Vasily Tarasov Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 38c293b..3de0695 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -1221,7 +1221,7 @@ void blk_recount_segments(request_queue_t *q, struct bio *bio) * considered part of another segment, since that might * change with the bounce page. 
*/ - high = page_to_pfn(bv->bv_page) >= q->bounce_pfn; + high = page_to_pfn(bv->bv_page) > q->bounce_pfn; if (high || highprv) goto new_hw_segment; if (cluster) { @@ -3658,8 +3658,8 @@ int __init blk_dev_init(void) open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); register_hotcpu_notifier(&blk_cpu_notifier); - blk_max_low_pfn = max_low_pfn; - blk_max_pfn = max_pfn; + blk_max_low_pfn = max_low_pfn - 1; + blk_max_pfn = max_pfn - 1; return 0; } diff --git a/mm/bounce.c b/mm/bounce.c index 643efbe..ad401fc 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -204,7 +204,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, /* * is destination page below bounce pfn? */ - if (page_to_pfn(page) < q->bounce_pfn) + if (page_to_pfn(page) <= q->bounce_pfn) continue; /* -- cgit v0.10.2 From 1ffb96c587fa2af0d690dc3548a4a781c477bfb7 Mon Sep 17 00:00:00 2001 From: Thibaut VARENE Date: Thu, 15 Mar 2007 12:59:19 +0100 Subject: make elv_register() output atomic Booting 2.6.21-rc3-g45592145 I noticed the following on one of my machines in the bootlog: io scheduler noop registered<6>Time: jiffies clocksource has been installed. io scheduler deadline registered (default) Looking at block/elevator.c, it appears that elv_register() uses two consecutive printks in a non-atomic way, leading to the above glitch. The attached trivial patch fixes this issue, by using a single printk. 
Signed-off-by: Thibaut VARENE Signed-off-by: Jens Axboe diff --git a/block/elevator.c b/block/elevator.c index 25f6ef2..96a00c8 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -964,17 +964,18 @@ void elv_unregister_queue(struct request_queue *q) int elv_register(struct elevator_type *e) { + char *def = ""; spin_lock_irq(&elv_list_lock); BUG_ON(elevator_find(e->elevator_name)); list_add_tail(&e->list, &elv_list); spin_unlock_irq(&elv_list_lock); - printk(KERN_INFO "io scheduler %s registered", e->elevator_name); if (!strcmp(e->elevator_name, chosen_elevator) || (!*chosen_elevator && !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) - printk(" (default)"); - printk("\n"); + def = " (default)"; + + printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, def); return 0; } EXPORT_SYMBOL_GPL(elv_register); -- cgit v0.10.2 From 485ddb4b9741bafb70b22e5c1f9b4f37dc3e85bd Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 27 Mar 2007 08:55:08 +0200 Subject: 1/2 splice: dont steal Stealing pages with splice is problematic because we cannot just insert an uptodate page into the pagecache and hope the filesystem can take care of it later. We also cannot just ClearPageUptodate, then hope prepare_write does not write anything into the page, because I don't think prepare_write gives that guarantee. Remove support for SPLICE_F_MOVE for now. If we really want to bring it back, we might be able to do so with the new filesystem buffered write aops APIs I'm working on. If we really don't want to bring it back, then we should decide that sooner rather than later, and remove the flag and all the stealing infrastructure before anybody starts using it. 
Signed-off-by: Nick Piggin Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index 2fca6eb..badc78f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -576,76 +576,51 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, if (this_len + offset > PAGE_CACHE_SIZE) this_len = PAGE_CACHE_SIZE - offset; - /* - * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full - * page. - */ - if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) { +find_page: + page = find_lock_page(mapping, index); + if (!page) { + ret = -ENOMEM; + page = page_cache_alloc_cold(mapping); + if (unlikely(!page)) + goto out_ret; + /* - * If steal succeeds, buf->page is now pruned from the - * pagecache and we can reuse it. The page will also be - * locked on successful return. + * This will also lock the page */ - if (buf->ops->steal(pipe, buf)) - goto find_page; - - page = buf->page; - if (add_to_page_cache(page, mapping, index, GFP_KERNEL)) { - unlock_page(page); - goto find_page; - } - - page_cache_get(page); - - if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add(page); - } else { -find_page: - page = find_lock_page(mapping, index); - if (!page) { - ret = -ENOMEM; - page = page_cache_alloc_cold(mapping); - if (unlikely(!page)) - goto out_ret; + ret = add_to_page_cache_lru(page, mapping, index, + GFP_KERNEL); + if (unlikely(ret)) + goto out; + } - /* - * This will also lock the page - */ - ret = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL); + /* + * We get here with the page locked. If the page is also + * uptodate, we don't need to do more. If it isn't, we + * may need to bring it in if we are not going to overwrite + * the full page. + */ + if (!PageUptodate(page)) { + if (this_len < PAGE_CACHE_SIZE) { + ret = mapping->a_ops->readpage(file, page); if (unlikely(ret)) goto out; - } - /* - * We get here with the page locked. If the page is also - * uptodate, we don't need to do more. 
If it isn't, we - * may need to bring it in if we are not going to overwrite - * the full page. - */ - if (!PageUptodate(page)) { - if (this_len < PAGE_CACHE_SIZE) { - ret = mapping->a_ops->readpage(file, page); - if (unlikely(ret)) - goto out; - - lock_page(page); - - if (!PageUptodate(page)) { - /* - * Page got invalidated, repeat. - */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto find_page; - } - ret = -EIO; - goto out; + lock_page(page); + + if (!PageUptodate(page)) { + /* + * Page got invalidated, repeat. + */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + goto find_page; } - } else - SetPageUptodate(page); - } + ret = -EIO; + goto out; + } + } else + SetPageUptodate(page); } ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); -- cgit v0.10.2 From 08c72591636829d40bd695d43ec6d2a8191b668b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 27 Mar 2007 08:55:39 +0200 Subject: 2/2 splice: dont readpage Splice does not need to readpage to bring the page uptodate before writing to it, because prepare_write will take care of that for us. Splice is also wrong to SetPageUptodate before the page is actually uptodate. This results in the old uninitialised memory leak. This gets fixed as a matter of course when removing the readpage logic. Signed-off-by: Nick Piggin Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index badc78f..ae50208 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -593,36 +593,6 @@ find_page: goto out; } - /* - * We get here with the page locked. If the page is also - * uptodate, we don't need to do more. If it isn't, we - * may need to bring it in if we are not going to overwrite - * the full page. - */ - if (!PageUptodate(page)) { - if (this_len < PAGE_CACHE_SIZE) { - ret = mapping->a_ops->readpage(file, page); - if (unlikely(ret)) - goto out; - - lock_page(page); - - if (!PageUptodate(page)) { - /* - * Page got invalidated, repeat. 
- */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto find_page; - } - ret = -EIO; - goto out; - } - } else - SetPageUptodate(page); - } - ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); if (unlikely(ret)) { loff_t isize = i_size_read(mapping->host); -- cgit v0.10.2 From 40bee44eaef91b6030037c8bb47f909181fb1edc Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 21 Mar 2007 13:11:02 +0100 Subject: Export __splice_from_pipe() Ocfs2 wants to implement its own splice write actor so that it can better manage cluster / page locks. This lets us re-use the rest of splice write while only providing our own code where it's actually important. Signed-off-by: Mark Fasheh Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index ae50208..07f6556 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -651,9 +651,9 @@ out_ret: * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 
*/ -static ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, - struct file *out, loff_t *ppos, size_t len, - unsigned int flags, splice_actor *actor) +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, splice_actor *actor) { int ret, do_wakeup, err; struct splice_desc sd; @@ -747,6 +747,7 @@ static ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, return ret; } +EXPORT_SYMBOL(__splice_from_pipe); ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 2e19478..8bcbc54 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -99,4 +99,8 @@ extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int, splice_actor *); +extern ssize_t __splice_from_pipe(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int, + splice_actor *); + #endif -- cgit v0.10.2