From a82c53a0e3f57f02782330372b7adad67b417645 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 9 May 2008 13:28:36 +0200 Subject: splice: fix sendfile() issue with relay Splice isn't always incrementing the ppos correctly, which broke relay splice. Signed-off-by: Tom Zanussi Tested-by: Dan Williams Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index 7815003..a048ad2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -983,7 +983,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, while (len) { size_t read_len; - loff_t pos = sd->pos; + loff_t pos = sd->pos, prev_pos = pos; ret = do_splice_to(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) @@ -998,15 +998,19 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * could get stuck data in the internal pipe: */ ret = actor(pipe, sd); - if (unlikely(ret <= 0)) + if (unlikely(ret <= 0)) { + sd->pos = prev_pos; goto out_release; + } bytes += ret; len -= ret; sd->pos = pos; - if (ret < read_len) + if (ret < read_len) { + sd->pos = prev_pos + ret; goto out_release; + } } done: @@ -1072,7 +1076,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) - *ppos += ret; + *ppos = sd.pos; return ret; } diff --git a/kernel/relay.c b/kernel/relay.c index bc24dcd..7de644c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1191,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in, ret = 0; spliced = 0; - while (len) { + while (len && !spliced) { ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); if (ret < 0) break; -- cgit v0.10.2 From ca39d651d17df49b6d11f851d56c0ce0ce01ea1a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 May 2008 21:27:41 +0200 Subject: splice: handle try_to_release_page() failure splice currently assumes that try_to_release_page() always suceeds, but it can return failure. If it does, we cannot steal the page. Acked-by: Mingming Cao diff --git a/fs/splice.c b/fs/splice.c index a048ad2..aa5f6f6 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -58,8 +58,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, */ wait_on_page_writeback(page); - if (PagePrivate(page)) - try_to_release_page(page, GFP_KERNEL); + if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) + goto out_unlock; /* * If we succeeded in removing the mapping, set LRU flag @@ -75,6 +75,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, * Raced with truncate or failed to remove page from current * address space, unlock and return failure. */ +out_unlock: unlock_page(page); return 1; } -- cgit v0.10.2 From 05caf8dbc1880415df3378cfd114d832c9618b60 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Thu, 22 May 2008 15:13:29 +0200 Subject: block: Move the second call to get_request to the end of the loop In function get_request_wait, the second call to get_request could be moved to the end of the while loop, because if the first call to get_request fails, the second call will fail without sleep. Signed-off-by: Zhang Yanmin Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 6a9cc0d..1905aab 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -806,35 +806,32 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, rq = get_request(q, rw_flags, bio, GFP_NOIO); while (!rq) { DEFINE_WAIT(wait); + struct io_context *ioc; struct request_list *rl = &q->rq; prepare_to_wait_exclusive(&rl->wait[rw], &wait, TASK_UNINTERRUPTIBLE); - rq = get_request(q, rw_flags, bio, GFP_NOIO); - - if (!rq) { - struct io_context *ioc; + blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); - blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); - - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); - io_schedule(); + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); + io_schedule(); - /* - * After sleeping, we become a "batching" process and - * will be able to allocate at least one request, and - * up to a big batch of them for a small period time. - * See ioc_batching, ioc_set_batching - */ - ioc = current_io_context(GFP_NOIO, q->node); - ioc_set_batching(q, ioc); + /* + * After sleeping, we become a "batching" process and + * will be able to allocate at least one request, and + * up to a big batch of them for a small period time. + * See ioc_batching, ioc_set_batching + */ + ioc = current_io_context(GFP_NOIO, q->node); + ioc_set_batching(q, ioc); - spin_lock_irq(q->queue_lock); - } + spin_lock_irq(q->queue_lock); finish_wait(&rl->wait[rw], &wait); - } + + rq = get_request(q, rw_flags, bio, GFP_NOIO); + }; return rq; } -- cgit v0.10.2 From be754d2c2161c0cce11d62727016985ecb76831b Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Fri, 23 May 2008 06:52:00 +0200 Subject: block: reorder cfq_queue to save space on 64bit builds saves 8 bytes of padding & increases objects/slab from 30 to 32 on my AMD64 config Signed-off-by: Richard Kennedy Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b399c62..4df3f052 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -124,6 +124,8 @@ struct cfq_data { struct cfq_queue { /* reference count */ atomic_t ref; + /* various state flags, see below */ + unsigned int flags; /* parent cfq_data */ struct cfq_data *cfqd; /* service_tree member */ @@ -138,14 +140,14 @@ struct cfq_queue { int queued[2]; /* currently allocated requests */ int allocated[2]; - /* pending metadata requests */ - int meta_pending; /* fifo list of requests in sort_list */ struct list_head fifo; unsigned long slice_end; long slice_resid; + /* pending metadata requests */ + int meta_pending; /* number of requests that are on the dispatch list or inside driver */ int dispatched; @@ -153,8 +155,6 @@ struct cfq_queue { unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; - /* various state flags, see below */ - unsigned int flags; }; enum cfqq_state_flags { -- cgit v0.10.2 From 9d5f09a424a67ddb959829894efb4c71cbf6d600 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Tue, 27 May 2008 14:54:41 +0200 Subject: Added in MESSAGE notes for blktraces Allows messages to be inserted into blktrace streams. Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe diff --git a/block/blktrace.c b/block/blktrace.c index b2cbb4e..20e11f3 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -75,6 +75,20 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } +void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +{ + int n; + va_list args; + static char bt_msg_buf[BLK_TN_MAX_MSG]; + + va_start(args, fmt); + n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args); + va_end(args); + + trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n); +} +EXPORT_SYMBOL_GPL(__trace_note_message); + static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) { diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index cfc3147..b7cd8f1 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -55,6 +55,7 @@ enum blktrace_act { enum blktrace_notify { __BLK_TN_PROCESS = 0, /* establish pid/name mapping */ __BLK_TN_TIMESTAMP, /* include system clock */ + __BLK_TN_MESSAGE, /* Character string message */ }; @@ -79,6 +80,7 @@ enum blktrace_notify { #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) +#define BLK_TN_MESSAGE (__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_IO_TRACE_MAGIC 0x65617400 #define BLK_IO_TRACE_VERSION 0x07 @@ -149,7 +151,28 @@ extern void blk_trace_shutdown(struct request_queue *); extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *); extern int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct blk_user_trace_setup *buts); +extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); +/** + * blk_add_trace_msg - Add a (simple) message to the blktrace stream + * @q: queue the io is for + * @fmt: format to print message in + * args... Variable argument list for format + * + * Description: + * Records a (simple) message onto the blktrace stream. + * + * NOTE: BLK_TN_MAX_MSG characters are output at most. + * NOTE: Can not use 'static inline' due to presence of var args... + * + **/ +#define blk_add_trace_msg(q, fmt, ...) \ + do { \ + struct blk_trace *bt = (q)->blk_trace; \ + if (unlikely(bt)) \ + __trace_note_message(bt, fmt, ##__VA_ARGS__); \ + } while (0) +#define BLK_TN_MAX_MSG 1024 /** * blk_add_trace_rq - Add a trace for a request oriented action @@ -299,6 +322,8 @@ extern int blk_trace_remove(struct request_queue *q); #define blk_trace_setup(q, name, dev, arg) (-ENOTTY) #define blk_trace_startstop(q, start) (-ENOTTY) #define blk_trace_remove(q) (-ENOTTY) +#define blk_add_trace_msg(q, fmt, ...) do { } while (0) + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #endif /* __KERNEL__ */ #endif -- cgit v0.10.2 From 4722dc52a891ab6cb2d637ddb87233e0ce277827 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Tue, 27 May 2008 14:55:00 +0200 Subject: Added in elevator switch message to blktrace stream Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe diff --git a/block/elevator.c b/block/elevator.c index 980f8ae..902dd13 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1110,6 +1110,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); spin_unlock_irq(q->queue_lock); + blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); + return 1; fail_register: -- cgit v0.10.2 From 64565911cdb57c2f512a9715b985b5617402cc67 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 May 2008 14:45:33 +0200 Subject: block: make blktrace use per-cpu buffers for message notes Currently it uses a single static char array, but that risks being corrupted when multiple users issue message notes at the same time. Make the buffers dynamically allocated when the trace is setup and make them per-cpu instead. The default max message size of 1k is also very large, the interface is mainly for small text notes. So shrink it to 128 bytes. Signed-off-by: Jens Axboe diff --git a/block/blktrace.c b/block/blktrace.c index 20e11f3..7ae87cc 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -79,13 +79,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) { int n; va_list args; - static char bt_msg_buf[BLK_TN_MAX_MSG]; + char *buf; + preempt_disable(); + buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); - n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args); + n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + preempt_enable(); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -246,6 +249,7 @@ static void blk_trace_cleanup(struct blk_trace *bt) debugfs_remove(bt->dropped_file); blk_remove_tree(bt->dir); free_percpu(bt->sequence); + free_percpu(bt->msg_data); kfree(bt); } @@ -360,6 +364,10 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->sequence) goto err; + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); + if (!bt->msg_data) + goto err; + ret = -ENOENT; dir = blk_create_tree(buts->name); if (!dir) @@ -406,6 +414,7 @@ err: if (bt->dropped_file) debugfs_remove(bt->dropped_file); free_percpu(bt->sequence); + free_percpu(bt->msg_data); if (bt->rchan) relay_close(bt->rchan); kfree(bt); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index b7cd8f1..e3ef903 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -121,6 +121,7 @@ struct blk_trace { int trace_state; struct rchan *rchan; unsigned long *sequence; + unsigned char *msg_data; u16 act_mask; u64 start_lba; u64 end_lba; @@ -172,7 +173,7 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); if (unlikely(bt)) \ __trace_note_message(bt, fmt, ##__VA_ARGS__); \ } while (0) -#define BLK_TN_MAX_MSG 1024 +#define BLK_TN_MAX_MSG 128 /** * blk_add_trace_rq - Add a trace for a request oriented action -- cgit v0.10.2 From d6de8be711b28049a5cb93c954722c311c7d3f7f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 May 2008 14:46:59 +0200 Subject: cfq-iosched: fix RCU problem in cfq_cic_lookup() cfq_cic_lookup() needs to properly protect ioc->ioc_data before dereferencing it and also exclude updaters of ioc->ioc_data as well. Also add a number of comments documenting why the existing RCU usage is OK. Thanks a lot to "Paul E. McKenney" for review and comments! Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4df3f052..d01b411 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1142,6 +1142,9 @@ static void cfq_put_queue(struct cfq_queue *cfqq) kmem_cache_free(cfq_pool, cfqq); } +/* + * Must always be called with the rcu_read_lock() held + */ static void __call_for_each_cic(struct io_context *ioc, void (*func)(struct io_context *, struct cfq_io_context *)) @@ -1197,6 +1200,11 @@ static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) cfq_cic_free(cic); } +/* + * Must be called with rcu_read_lock() held or preemption otherwise disabled. + * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), + * and ->trim() which is called with the task lock held + */ static void cfq_free_io_context(struct io_context *ioc) { /* @@ -1502,20 +1510,24 @@ static struct cfq_io_context * cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) { struct cfq_io_context *cic; + unsigned long flags; void *k; if (unlikely(!ioc)) return NULL; + rcu_read_lock(); + /* * we maintain a last-hit cache, to avoid browsing over the tree */ cic = rcu_dereference(ioc->ioc_data); - if (cic && cic->key == cfqd) + if (cic && cic->key == cfqd) { + rcu_read_unlock(); return cic; + } do { - rcu_read_lock(); cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); rcu_read_unlock(); if (!cic) @@ -1524,10 +1536,13 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) k = cic->key; if (unlikely(!k)) { cfq_drop_dead_cic(cfqd, ioc, cic); + rcu_read_lock(); continue; } + spin_lock_irqsave(&ioc->lock, flags); rcu_assign_pointer(ioc->ioc_data, cic); + spin_unlock_irqrestore(&ioc->lock, flags); break; } while (1); @@ -2134,6 +2149,10 @@ static void *cfq_init_queue(struct request_queue *q) static void cfq_slab_kill(void) { + /* + * Caller already ensured that pending RCU callbacks are completed, + * so we should have no busy allocations at this point. + */ if (cfq_pool) kmem_cache_destroy(cfq_pool); if (cfq_ioc_pool) @@ -2292,6 +2311,11 @@ static void __exit cfq_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); + + /* + * this also protects us from entering cfq_slab_kill() with + * pending RCU callbacks + */ if (elv_ioc_count_read(ioc_count)) wait_for_completion(ioc_gone); cfq_slab_kill(); -- cgit v0.10.2