From 9a11b4ed0e7c44bca7c939aa544c3c47aae40c12 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 May 2008 09:32:08 +0200 Subject: cfq-iosched: properly protect ioc_gone and ioc count If we have multiple tasks freeing cfq_io_contexts when cfq-iosched is being unloaded, we could complete() ioc_gone twice. Fix that by protecting ioc_gone complete() and clearing with a spinlock for just that purpose. Doesn't matter from a performance perspective, since it'll only enter that path when ioc_gone != NULL (when cfq-iosched is being rmmod'ed). Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d01b411..32aa367 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -48,6 +48,7 @@ static struct kmem_cache *cfq_ioc_pool; static DEFINE_PER_CPU(unsigned long, ioc_count); static struct completion *ioc_gone; +static DEFINE_SPINLOCK(ioc_gone_lock); #define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) @@ -1177,8 +1178,19 @@ static void cfq_cic_free_rcu(struct rcu_head *head) kmem_cache_free(cfq_ioc_pool, cic); elv_ioc_count_dec(ioc_count); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) - complete(ioc_gone); + if (ioc_gone) { + /* + * CFQ scheduler is exiting, grab exit lock and check + * the pending io context count. If it hits zero, + * complete ioc_gone and set it back to NULL + */ + spin_lock(&ioc_gone_lock); + if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + complete(ioc_gone); + ioc_gone = NULL; + } + spin_unlock(&ioc_gone_lock); + } } static void cfq_cic_free(struct cfq_io_context *cic) @@ -2317,7 +2329,7 @@ static void __exit cfq_exit(void) * pending RCU callbacks */ if (elv_ioc_count_read(ioc_count)) - wait_for_completion(ioc_gone); + wait_for_completion(&all_gone); cfq_slab_kill(); } -- cgit v0.10.2 From 863fddcb4b0caee4c2d5bd6e3b28779920516db3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 May 2008 09:35:22 +0200 Subject: as-iosched: properly protect ioc_gone and ioc count If we have multiple tasks freeing io contexts when as-iosched is being unloaded, we could complete() ioc_gone twice. Fix that by protecting ioc_gone complete() and clearing with a spinlock for just that purpose. Doesn't matter from a performance perspective, since it'll only enter that path when ioc_gone != NULL (when as-iosched is being rmmod'ed). Signed-off-by: Jens Axboe diff --git a/block/as-iosched.c b/block/as-iosched.c index 743f33a..9735acb 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -151,6 +151,7 @@ enum arq_state { static DEFINE_PER_CPU(unsigned long, ioc_count); static struct completion *ioc_gone; +static DEFINE_SPINLOCK(ioc_gone_lock); static void as_move_to_dispatch(struct as_data *ad, struct request *rq); static void as_antic_stop(struct as_data *ad); @@ -164,8 +165,19 @@ static void free_as_io_context(struct as_io_context *aic) { kfree(aic); elv_ioc_count_dec(ioc_count); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) - complete(ioc_gone); + if (ioc_gone) { + /* + * AS scheduler is exiting, grab exit lock and check + * the pending io context count. If it hits zero, + * complete ioc_gone and set it back to NULL. + */ + spin_lock(&ioc_gone_lock); + if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + complete(ioc_gone); + ioc_gone = NULL; + } + spin_unlock(&ioc_gone_lock); + } } static void as_trim(struct io_context *ioc) @@ -1493,7 +1505,7 @@ static void __exit as_exit(void) /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); if (elv_ioc_count_read(ioc_count)) - wait_for_completion(ioc_gone); + wait_for_completion(&all_gone); synchronize_rcu(); } -- cgit v0.10.2 From 7b679138b3237a9a3d45a4fda23a58ac79cd279c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 May 2008 12:23:07 +0200 Subject: cfq-iosched: add message logging through blktrace Now that blktrace has the ability to carry arbitrary messages in its stream, use that for some CFQ logging. Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 32aa367..0ebb626 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -11,6 +11,7 @@ #include #include #include +#include /* * tunables @@ -41,7 +42,7 @@ static int cfq_slice_idle = HZ / 125; #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) -#define RQ_CFQQ(rq) ((rq)->elevator_private2) +#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; @@ -156,6 +157,7 @@ struct cfq_queue { unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; + pid_t pid; }; enum cfqq_state_flags { @@ -199,6 +201,11 @@ CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); #undef CFQ_CFQQ_FNS +#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) +#define cfq_log(cfqd, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) + static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, struct io_context *, gfp_t); @@ -235,8 +242,10 @@ static inline int cfq_bio_sync(struct bio *bio) */ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) { - if (cfqd->busy_queues) + if (cfqd->busy_queues) { + cfq_log(cfqd, "schedule dispatch"); kblockd_schedule_work(&cfqd->unplug_work); + } } static int cfq_queue_empty(struct request_queue *q) @@ -271,6 +280,7 @@ static inline void cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; + cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } /* @@ -540,6 +550,7 @@ static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) */ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; @@ -553,6 +564,7 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) */ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); @@ -639,6 +651,8 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) struct cfq_data *cfqd = q->elevator->elevator_data; cfqd->rq_in_driver++; + cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", + cfqd->rq_in_driver); /* * If the depth is larger 1, it really could be queueing. But lets @@ -658,6 +672,8 @@ static void cfq_deactivate_request(struct request_queue *q, struct request *rq) WARN_ON(!cfqd->rq_in_driver); cfqd->rq_in_driver--; + cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", + cfqd->rq_in_driver); } static void cfq_remove_request(struct request *rq) @@ -747,6 +763,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "set_active"); cfqq->slice_end = 0; cfq_clear_cfqq_must_alloc_slice(cfqq); cfq_clear_cfqq_fifo_expire(cfqq); @@ -764,6 +781,8 @@ static void __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, int timed_out) { + cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); + if (cfq_cfqq_wait_request(cfqq)) del_timer(&cfqd->idle_slice_timer); @@ -773,8 +792,10 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * store what was left of this slice, if the queue idled/timed out */ - if (timed_out && !cfq_cfqq_slice_new(cfqq)) + if (timed_out && !cfq_cfqq_slice_new(cfqq)) { cfqq->slice_resid = cfqq->slice_end - jiffies; + cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); + } cfq_resort_rr_list(cfqd, cfqq); @@ -867,6 +888,12 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) return; /* + * still requests with the driver, don't idle + */ + if (cfqd->rq_in_driver) + return; + + /* * task has exited, don't wait */ cic = cfqd->active_cic; @@ -893,6 +920,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); mod_timer(&cfqd->idle_slice_timer, jiffies + sl); + cfq_log(cfqd, "arm_idle: %lu", sl); } /* @@ -903,6 +931,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq = RQ_CFQQ(rq); + cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); + cfq_remove_request(rq); cfqq->dispatched++; elv_dispatch_sort(q, rq); @@ -932,8 +962,9 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq) rq = rq_entry_fifo(cfqq->fifo.next); if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) - return NULL; + rq = NULL; + cfq_log_cfqq(cfqd, cfqq, "fifo=%p", rq); return rq; } @@ -1073,6 +1104,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) BUG_ON(cfqd->busy_queues); + cfq_log(cfqd, "forced_dispatch=%d\n", dispatched); return dispatched; } @@ -1113,6 +1145,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch); } + cfq_log(cfqd, "dispatched=%d", dispatched); return dispatched; } @@ -1131,6 +1164,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) if (!atomic_dec_and_test(&cfqq->ref)) return; + cfq_log_cfqq(cfqd, cfqq, "put_queue"); BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); BUG_ON(cfq_cfqq_on_rr(cfqq)); @@ -1439,6 +1473,8 @@ retry: cfq_mark_cfqq_idle_window(cfqq); cfq_mark_cfqq_sync(cfqq); } + cfqq->pid = current->pid; + cfq_log_cfqq(cfqd, cfqq, "alloced"); } if (new_cfqq) @@ -1687,7 +1723,7 @@ static void cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic) { - int enable_idle; + int old_idle, enable_idle; /* * Don't idle for async or idle io prio class @@ -1695,7 +1731,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) return; - enable_idle = cfq_cfqq_idle_window(cfqq); + old_idle = cfq_cfqq_idle_window(cfqq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (cfqd->hw_tag && CIC_SEEKY(cic))) @@ -1707,10 +1743,13 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, enable_idle = 1; } - if (enable_idle) - cfq_mark_cfqq_idle_window(cfqq); - else - cfq_clear_cfqq_idle_window(cfqq); + if (old_idle != enable_idle) { + cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); + if (enable_idle) + cfq_mark_cfqq_idle_window(cfqq); + else + cfq_clear_cfqq_idle_window(cfqq); + } } /* @@ -1769,6 +1808,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + cfq_log_cfqq(cfqd, cfqq, "preempt"); cfq_slice_expired(cfqd, 1); /* @@ -1830,6 +1870,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq = RQ_CFQQ(rq); + cfq_log_cfqq(cfqd, cfqq, "insert_request"); cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); cfq_add_rq_rb(rq); @@ -1847,6 +1888,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) unsigned long now; now = jiffies; + cfq_log_cfqq(cfqd, cfqq, "complete"); WARN_ON(!cfqd->rq_in_driver); WARN_ON(!cfqq->dispatched); @@ -2016,6 +2058,7 @@ queue_fail: cfq_schedule_dispatch(cfqd); spin_unlock_irqrestore(q->queue_lock, flags); + cfq_log(cfqd, "set_request fail"); return 1; } @@ -2041,6 +2084,8 @@ static void cfq_idle_slice_timer(unsigned long data) unsigned long flags; int timed_out = 1; + cfq_log(cfqd, "idle timer fired"); + spin_lock_irqsave(cfqd->queue->queue_lock, flags); cfqq = cfqd->active_queue; -- cgit v0.10.2 From 02c62304e6af60f1963695c6bc1bbffe619aa585 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Wed, 11 Jun 2008 09:12:52 +0200 Subject: Added in user-injected messages into blk traces This allows a user to annotate the blk trace stream: writing a suitable message to {/sys/kernel/debug}/block//msg will have it propagated into the trace stream. Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe diff --git a/block/blktrace.c b/block/blktrace.c index 8d3a277..eb9651c 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -244,6 +244,7 @@ err: static void blk_trace_cleanup(struct blk_trace *bt) { relay_close(bt->rchan); + debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); blk_remove_tree(bt->dir); free_percpu(bt->sequence); @@ -291,6 +292,44 @@ static const struct file_operations blk_dropped_fops = { .read = blk_dropped_read, }; +static int blk_msg_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + + return 0; +} + +static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *msg; + struct blk_trace *bt; + + if (count > BLK_TN_MAX_MSG) + return -EINVAL; + + msg = kmalloc(count, GFP_KERNEL); + if (msg == NULL) + return -ENOMEM; + + if (copy_from_user(msg, buffer, count)) { + kfree(msg); + return -EFAULT; + } + + bt = filp->private_data; + __trace_note_message(bt, "%s", msg); + kfree(msg); + + return count; +} + +static const struct file_operations blk_msg_fops = { + .owner = THIS_MODULE, + .open = blk_msg_open, + .write = blk_msg_write, +}; + /* * Keep track of how many times we encountered a full subbuffer, to aid * the user space app in telling how many lost events there were. @@ -380,6 +419,10 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->dropped_file) goto err; + bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); + if (!bt->msg_file) + goto err; + bt->rchan = relay_open("trace", dir, buts->buf_size, buts->buf_nr, &blk_relay_callbacks, bt); if (!bt->rchan) @@ -409,6 +452,8 @@ err: if (dir) blk_remove_tree(dir); if (bt) { + if (bt->msg_file) + debugfs_remove(bt->msg_file); if (bt->dropped_file) debugfs_remove(bt->dropped_file); free_percpu(bt->sequence); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index e3ef903..d084b8d 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -129,6 +129,7 @@ struct blk_trace { u32 dev; struct dentry *dir; struct dentry *dropped_file; + struct dentry *msg_file; atomic_t dropped; }; -- cgit v0.10.2 From 1c9ce5276324ae566ca409491b99a2cc8d5986fa Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 13 Jun 2008 09:41:00 +0200 Subject: block: export "ro" attribute Signed-off-by: Kay Sievers Signed-off-by: Jens Axboe diff --git a/block/genhd.c b/block/genhd.c index b922d48..43e468e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -400,6 +400,14 @@ static ssize_t disk_removable_show(struct device *dev, (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } +static ssize_t disk_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", disk->policy ? 1 : 0); +} + static ssize_t disk_size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -472,6 +480,7 @@ static ssize_t disk_fail_store(struct device *dev, static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); +static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, disk_size_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, disk_stat_show, NULL); @@ -483,6 +492,7 @@ static struct device_attribute dev_attr_fail = static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_removable.attr, + &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, -- cgit v0.10.2 From 244b4d56f85bcd11b21ab0b94845a3dabeed5c10 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Jun 2008 20:12:36 +0200 Subject: block: kill request_queue_t Everything was moved to struct request_queue a few kernel revisions ago, maintaining the deprecated typedef to avoid breaking things. Now the time has come to get rid of that typedef. Signed-off-by: Jens Axboe diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d2a1b71..6a3da67 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -23,7 +23,6 @@ struct scsi_ioctl_command; struct request_queue; -typedef struct request_queue request_queue_t __deprecated; struct elevator_queue; typedef struct elevator_queue elevator_t; struct request_pm_state; -- cgit v0.10.2 From 1c91fe1a0d577f2e53475e789c9d63a0feb7d93c Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 17 Jun 2008 10:47:08 +0200 Subject: xen/blkfront: Make sure we don't use bounce buffers, we don't need them. [ linux-2.6.18-xen changeset 667228bf8fc5 ] Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Jens Axboe diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index f2fff57..e8d3bf6 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -324,6 +324,9 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) /* Make sure buffer addresses are sector-aligned. */ blk_queue_dma_alignment(rq, 511); + /* Make sure we don't use bounce buffers. */ + blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY); + gd->queue = rq; return 0; -- cgit v0.10.2 From 440a01a7f46742400c74d9d346118523e81d188b Mon Sep 17 00:00:00 2001 From: Christian Limpach Date: Tue, 17 Jun 2008 10:47:08 +0200 Subject: xen/blkfront: Add the CDROM_GET_CAPABILITY ioctl to blkfront. Return 0 instead of -EINVAL if the blkfront device is a cdrom, i.e. had the VDISK_CDROM attribute. This allows udev's cdrom_id to correctly detect the device as a cdrom device. [ Add blkif_ioctl, and CDROMMULTISESSION ] [ linux-2.6.18-xen changeset d2bd9af846b5 ] Signed-off-by: Christian Limpach Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Jens Axboe diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index e8d3bf6..da3fee6 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -153,6 +154,40 @@ static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) return 0; } +int blkif_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument) +{ + struct blkfront_info *info = + inode->i_bdev->bd_disk->private_data; + int i; + + dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n", + command, (long)argument); + + switch (command) { + case CDROMMULTISESSION: + dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n"); + for (i = 0; i < sizeof(struct cdrom_multisession); i++) + if (put_user(0, (char __user *)(argument + i))) + return -EFAULT; + return 0; + + case CDROM_GET_CAPABILITY: { + struct gendisk *gd = info->gd; + if (gd->flags & GENHD_FL_CD) + return 0; + return -EINVAL; + } + + default: + /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", + command);*/ + return -EINVAL; /* same return as native Linux */ + } + + return 0; +} + /* * blkif_queue_request * @@ -974,6 +1009,7 @@ static struct block_device_operations xlvbd_block_fops = .open = blkif_open, .release = blkif_release, .getgeo = blkif_getgeo, + .ioctl = blkif_ioctl, }; -- cgit v0.10.2 From 04c0635058256e2f4618139c237e56b5a4bdbb8f Mon Sep 17 00:00:00 2001 From: Wim Colgate Date: Tue, 17 Jun 2008 10:47:08 +0200 Subject: xen/blkfront: Make sure that the device is fully ready before allowing release. [ linux-2.6.18-xen changeset c1c57fea77e9 ] Signed-off-by: Wim Colgate Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Jens Axboe diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index da3fee6..a39b4b2 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -997,7 +997,7 @@ static int blkif_release(struct inode *inode, struct file *filep) struct xenbus_device *dev = info->xbdev; enum xenbus_state state = xenbus_read_driver_state(dev->otherend); - if (state == XenbusStateClosing) + if (state == XenbusStateClosing && info->is_ready) blkfront_closing(dev); } return 0; -- cgit v0.10.2 From 5a60d0cd4ff227c4c5212898ecbeeaf5662eb5fa Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 17 Jun 2008 10:47:08 +0200 Subject: xen/blkfront: add __exit to module_exit() handlers Signed-off-by: Jan Beulich Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Jens Axboe diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index a39b4b2..b00682e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1045,7 +1045,7 @@ static int __init xlblk_init(void) module_init(xlblk_init); -static void xlblk_exit(void) +static void __exit xlblk_exit(void) { return xenbus_unregister_driver(&blkfront); } -- cgit v0.10.2 From a144ff09bc52ef3f3684ed23eadc9c7c0e57b3aa Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 17 Jun 2008 10:47:08 +0200 Subject: xen: Avoid allocations causing swap activity on the resume path Avoid allocations causing swap activity on the resume path by preventing the allocations from doing IO and allowing them to access the emergency pools. These paths are used when a frontend device is trying to connect to its backend driver over Xenbus. These reconnections are triggered on demand by IO, so by definition there is already IO underway, and further IO would naturally deadlock. On resume, this path is triggered when the running system tries to continue using its devices. If it cannot then the resume will fail; to try to avoid this we let it dip into the emergency pools. [ linux-2.6.18-xen changesets e8b49cfbdac, fdb998e79aba ] Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Jens Axboe diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index b00682e..9ae05c5 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -584,7 +584,7 @@ static int setup_blkring(struct xenbus_device *dev, info->ring_ref = GRANT_INVALID_REF; - sring = (struct blkif_sring *)__get_free_page(GFP_KERNEL); + sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!sring) { xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); return -ENOMEM; @@ -741,7 +741,8 @@ static int blkif_recover(struct blkfront_info *info) int j; /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmalloc(sizeof(info->shadow), GFP_KERNEL); + copy = kmalloc(sizeof(info->shadow), + GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); if (!copy) return -ENOMEM; memcpy(copy, info->shadow, sizeof(info->shadow)); diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index d26f69b..ef671d1 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1324,7 +1324,7 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info) goto fail; } - txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_KERNEL); + txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH); if (!txs) { err = -ENOMEM; xenbus_dev_fatal(dev, err, "allocating tx ring page"); @@ -1340,7 +1340,7 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info) } info->tx_ring_ref = err; - rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_KERNEL); + rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH); if (!rxs) { err = -ENOMEM; xenbus_dev_fatal(dev, err, "allocating rx ring page"); diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 0f86b0f..9678b3e 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -117,7 +117,7 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, char *path; va_start(ap, pathfmt); - path = kvasprintf(GFP_KERNEL, pathfmt, ap); + path = kvasprintf(GFP_NOIO | __GFP_HIGH, pathfmt, ap); va_end(ap); if (!path) { diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 227d53b1..7f2f91c 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -283,9 +283,9 @@ static char *join(const char *dir, const char *name) char *buffer; if (strlen(name) == 0) - buffer = kasprintf(GFP_KERNEL, "%s", dir); + buffer = kasprintf(GFP_NOIO | __GFP_HIGH, "%s", dir); else - buffer = kasprintf(GFP_KERNEL, "%s/%s", dir, name); + buffer = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/%s", dir, name); return (!buffer) ? ERR_PTR(-ENOMEM) : buffer; } @@ -297,7 +297,7 @@ static char **split(char *strings, unsigned int len, unsigned int *num) *num = count_strings(strings, len); /* Transfer to one big alloc for easy freeing. */ - ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL); + ret = kmalloc(*num * sizeof(char *) + len, GFP_NOIO | __GFP_HIGH); if (!ret) { kfree(strings); return ERR_PTR(-ENOMEM); @@ -751,7 +751,7 @@ static int process_msg(void) } - msg = kmalloc(sizeof(*msg), GFP_KERNEL); + msg = kmalloc(sizeof(*msg), GFP_NOIO | __GFP_HIGH); if (msg == NULL) { err = -ENOMEM; goto out; @@ -763,7 +763,7 @@ static int process_msg(void) goto out; } - body = kmalloc(msg->hdr.len + 1, GFP_KERNEL); + body = kmalloc(msg->hdr.len + 1, GFP_NOIO | __GFP_HIGH); if (body == NULL) { kfree(msg); err = -ENOMEM; -- cgit v0.10.2 From 51d654e1d885607a6edd02b337105fa5c28b6d33 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 17 Jun 2008 18:59:56 +0200 Subject: block: Globalize bio_set and bio_vec_slab Move struct bio_set and biovec_slab definitions to bio.h so they can be used outside of bio.c. Signed-off-by: Martin K. Petersen Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe diff --git a/fs/bio.c b/fs/bio.c index 7856257..7a6598a 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -28,25 +28,10 @@ #include #include /* for struct sg_iovec */ -#define BIO_POOL_SIZE 2 - static struct kmem_cache *bio_slab __read_mostly; -#define BIOVEC_NR_POOLS 6 - -/* - * a small number of entries is fine, not going to be performance critical. - * basically we just need to survive - */ -#define BIO_SPLIT_ENTRIES 2 mempool_t *bio_split_pool __read_mostly; -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - /* * if you change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an @@ -60,23 +45,12 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { #undef BV /* - * bio_set is used to allow other portions of the IO system to - * allocate their own private memory pools for bio and iovec structures. - * These memory pools in turn all allocate from the bio_slab - * and the bvec_slabs[]. - */ -struct bio_set { - mempool_t *bio_pool; - mempool_t *bvec_pools[BIOVEC_NR_POOLS]; -}; - -/* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. */ -static struct bio_set *fs_bio_set; +struct bio_set *fs_bio_set; -static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) +struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { struct bio_vec *bvl; diff --git a/include/linux/bio.h b/include/linux/bio.h index 61c15ea..49dfb3c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -333,6 +333,35 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *, int, int); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); +extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); + +/* + * bio_set is used to allow other portions of the IO system to + * allocate their own private memory pools for bio and iovec structures. + * These memory pools in turn all allocate from the bio_slab + * and the bvec_slabs[]. + */ +#define BIO_POOL_SIZE 2 +#define BIOVEC_NR_POOLS 6 + +struct bio_set { + mempool_t *bio_pool; + mempool_t *bvec_pools[BIOVEC_NR_POOLS]; +}; + +struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +}; + +extern struct bio_set *fs_bio_set; + +/* + * a small number of entries is fine, not going to be performance critical. + * basically we just need to survive + */ +#define BIO_SPLIT_ENTRIES 2 #ifdef CONFIG_HIGHMEM /* -- cgit v0.10.2 From 7ba1ba12eeef0aa7113beb16410ef8b7c748e18b Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 30 Jun 2008 20:04:41 +0200 Subject: block: Block layer data integrity support Some block devices support verifying the integrity of requests by way of checksums or other protection information that is submitted along with the I/O. This patch implements support for generating and verifying integrity metadata, as well as correctly merging, splitting and cloning bios and requests that have this extra information attached. See Documentation/block/data-integrity.txt for more information. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe diff --git a/block/Kconfig b/block/Kconfig index 3e97f2b..1ab7c15 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -81,6 +81,18 @@ config BLK_DEV_BSG If unsure, say N. +config BLK_DEV_INTEGRITY + bool "Block layer data integrity support" + ---help--- + Some storage devices allow extra information to be + stored/retrieved to help protect the data. The block layer + data integrity option provides hooks which can be used by + filesystems to ensure better data integrity. + + Say yes here if you have a storage device that provides the + T10/SCSI Data Integrity Field or the T13/ATA External Path + Protection. If in doubt, say N. + endif # BLOCK config BLOCK_COMPAT diff --git a/block/Makefile b/block/Makefile index 5a43c7d..045f7b6 100644 --- a/block/Makefile +++ b/block/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/blk-core.c b/block/blk-core.c index 1905aab..e0fb0bc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -143,6 +143,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio->bi_size -= nbytes; bio->bi_sector += (nbytes >> 9); + + if (bio_integrity(bio)) + bio_integrity_advance(bio, nbytes); + if (bio->bi_size == 0) bio_endio(bio, error); } else { @@ -1381,6 +1385,9 @@ end_io: */ blk_partition_remap(bio); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) + goto end_io; + if (old_sector != -1) blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, old_sector); diff --git a/block/blk-integrity.c b/block/blk-integrity.c new file mode 100644 index 0000000..65f23ef --- /dev/null +++ b/block/blk-integrity.c @@ -0,0 +1,382 @@ +/* + * blk-integrity.c - Block layer data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include +#include +#include +#include + +#include "blk.h" + +static struct kmem_cache *integrity_cachep; + +/** + * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements + * @rq: request with integrity metadata attached + * + * Description: Returns the number of elements required in a + * scatterlist corresponding to the integrity metadata in a request. + */ +int blk_rq_count_integrity_sg(struct request *rq) +{ + struct bio_vec *iv, *ivprv; + struct req_iterator iter; + unsigned int segments; + + ivprv = NULL; + segments = 0; + + rq_for_each_integrity_segment(iv, rq, iter) { + + if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + segments++; + + ivprv = iv; + } + + return segments; +} +EXPORT_SYMBOL(blk_rq_count_integrity_sg); + +/** + * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist + * @rq: request with integrity metadata attached + * @sglist: target scatterlist + * + * Description: Map the integrity vectors in request into a + * scatterlist. The scatterlist must be big enough to hold all + * elements. I.e. sized using blk_rq_count_integrity_sg(). + */ +int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +{ + struct bio_vec *iv, *ivprv; + struct req_iterator iter; + struct scatterlist *sg; + unsigned int segments; + + ivprv = NULL; + sg = NULL; + segments = 0; + + rq_for_each_integrity_segment(iv, rq, iter) { + + if (ivprv) { + if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + goto new_segment; + + sg->length += iv->bv_len; + } else { +new_segment: + if (!sg) + sg = sglist; + else { + sg->page_link &= ~0x02; + sg = sg_next(sg); + } + + sg_set_page(sg, iv->bv_page, iv->bv_len, iv->bv_offset); + segments++; + } + + ivprv = iv; + } + + if (sg) + sg_mark_end(sg); + + return segments; +} +EXPORT_SYMBOL(blk_rq_map_integrity_sg); + +/** + * blk_integrity_compare - Compare integrity profile of two block devices + * @b1: Device to compare + * @b2: Device to compare + * + * Description: Meta-devices like DM and MD need to verify that all + * sub-devices use the same integrity format before advertising to + * upper layers that they can send/receive integrity metadata. This + * function can be used to check whether two block devices have + * compatible integrity formats. + */ +int blk_integrity_compare(struct block_device *bd1, struct block_device *bd2) +{ + struct blk_integrity *b1 = bd1->bd_disk->integrity; + struct blk_integrity *b2 = bd2->bd_disk->integrity; + + BUG_ON(bd1->bd_disk == NULL); + BUG_ON(bd2->bd_disk == NULL); + + if (!b1 || !b2) + return 0; + + if (b1->sector_size != b2->sector_size) { + printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->sector_size, b2->sector_size); + return -1; + } + + if (b1->tuple_size != b2->tuple_size) { + printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->tuple_size, b2->tuple_size); + return -1; + } + + if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { + printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->tag_size, b2->tag_size); + return -1; + } + + if (strcmp(b1->name, b2->name)) { + printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->name, b2->name); + return -1; + } + + return 0; +} +EXPORT_SYMBOL(blk_integrity_compare); + +struct integrity_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_integrity *, char *); + ssize_t (*store)(struct blk_integrity *, const char *, size_t); +}; + +static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + struct integrity_sysfs_entry *entry = + container_of(attr, struct integrity_sysfs_entry, attr); + + return entry->show(bi, page); +} + +static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t count) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + struct integrity_sysfs_entry *entry = + container_of(attr, struct integrity_sysfs_entry, attr); + ssize_t ret = 0; + + if (entry->store) + ret = entry->store(bi, page, count); + + return ret; +} + +static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) +{ + if (bi != NULL && bi->name != NULL) + return sprintf(page, "%s\n", bi->name); + else + return sprintf(page, "none\n"); +} + +static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) +{ + if (bi != NULL) + return sprintf(page, "%u\n", bi->tag_size); + else + return sprintf(page, "0\n"); +} + +static ssize_t integrity_read_store(struct blk_integrity *bi, + const char *page, size_t count) +{ + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + set_bit(INTEGRITY_FLAG_READ, &bi->flags); + else + clear_bit(INTEGRITY_FLAG_READ, &bi->flags); + + return count; +} + +static ssize_t integrity_read_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%d\n", + test_bit(INTEGRITY_FLAG_READ, &bi->flags) ? 1 : 0); +} + +static ssize_t integrity_write_store(struct blk_integrity *bi, + const char *page, size_t count) +{ + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + else + clear_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + + return count; +} + +static ssize_t integrity_write_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%d\n", + test_bit(INTEGRITY_FLAG_WRITE, &bi->flags) ? 1 : 0); +} + +static struct integrity_sysfs_entry integrity_format_entry = { + .attr = { .name = "format", .mode = S_IRUGO }, + .show = integrity_format_show, +}; + +static struct integrity_sysfs_entry integrity_tag_size_entry = { + .attr = { .name = "tag_size", .mode = S_IRUGO }, + .show = integrity_tag_size_show, +}; + +static struct integrity_sysfs_entry integrity_read_entry = { + .attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR }, + .show = integrity_read_show, + .store = integrity_read_store, +}; + +static struct integrity_sysfs_entry integrity_write_entry = { + .attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR }, + .show = integrity_write_show, + .store = integrity_write_store, +}; + +static struct attribute *integrity_attrs[] = { + &integrity_format_entry.attr, + &integrity_tag_size_entry.attr, + &integrity_read_entry.attr, + &integrity_write_entry.attr, + NULL, +}; + +static struct sysfs_ops integrity_ops = { + .show = &integrity_attr_show, + .store = &integrity_attr_store, +}; + +static int __init blk_dev_integrity_init(void) +{ + integrity_cachep = kmem_cache_create("blkdev_integrity", + sizeof(struct blk_integrity), + 0, SLAB_PANIC, NULL); + return 0; +} +subsys_initcall(blk_dev_integrity_init); + +static void blk_integrity_release(struct kobject *kobj) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + + kmem_cache_free(integrity_cachep, bi); +} + +static struct kobj_type integrity_ktype = { + .default_attrs = integrity_attrs, + .sysfs_ops = &integrity_ops, + .release = blk_integrity_release, +}; + +/** + * blk_integrity_register - Register a gendisk as being integrity-capable + * @disk: struct gendisk pointer to make integrity-aware + * @template: integrity profile + * + * Description: When a device needs to advertise itself as being able + * to send/receive integrity metadata it must use this function to + * register the capability with the block layer. The template is a + * blk_integrity struct with values appropriate for the underlying + * hardware. See Documentation/block/data-integrity.txt. + */ +int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) +{ + struct blk_integrity *bi; + + BUG_ON(disk == NULL); + BUG_ON(template == NULL); + + if (disk->integrity == NULL) { + bi = kmem_cache_alloc(integrity_cachep, GFP_KERNEL | __GFP_ZERO); + if (!bi) + return -1; + + if (kobject_init_and_add(&bi->kobj, &integrity_ktype, + &disk->dev.kobj, "%s", "integrity")) { + kmem_cache_free(integrity_cachep, bi); + return -1; + } + + kobject_uevent(&bi->kobj, KOBJ_ADD); + + set_bit(INTEGRITY_FLAG_READ, &bi->flags); + set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->sector_size = disk->queue->hardsect_size; + disk->integrity = bi; + } else + bi = disk->integrity; + + /* Use the provided profile as template */ + bi->name = template->name; + bi->generate_fn = template->generate_fn; + bi->verify_fn = template->verify_fn; + bi->tuple_size = template->tuple_size; + bi->set_tag_fn = template->set_tag_fn; + bi->get_tag_fn = template->get_tag_fn; + bi->tag_size = template->tag_size; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_register); + +/** + * blk_integrity_unregister - Remove block integrity profile + * @disk: disk whose integrity profile to deallocate + * + * Description: This function frees all memory used by the block + * integrity profile. To be called at device teardown. + */ +void blk_integrity_unregister(struct gendisk *disk) +{ + struct blk_integrity *bi; + + if (!disk || !disk->integrity) + return; + + bi = disk->integrity; + + kobject_uevent(&bi->kobj, KOBJ_REMOVE); + kobject_del(&bi->kobj); + kobject_put(&disk->dev.kobj); + kmem_cache_free(integrity_cachep, bi); +} +EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-merge.c b/block/blk-merge.c index 651136a..5efc9e7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -441,6 +441,9 @@ static int attempt_merge(struct request_queue *q, struct request *req, || next->special) return 0; + if (blk_integrity_rq(req) != blk_integrity_rq(next)) + return 0; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn diff --git a/block/blk.h b/block/blk.h index 59776ab..c79f30e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,4 +51,12 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ + __rq_for_each_bio(_iter.bio, _rq) \ + bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) + +#endif /* BLK_DEV_INTEGRITY */ + #endif diff --git a/block/elevator.c b/block/elevator.c index 902dd13..1f5bfe6 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -86,6 +86,12 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) return 0; + /* + * only merge integrity protected bio into ditto rq + */ + if (bio_integrity(bio) != blk_integrity_rq(rq)) + return 0; + if (!elv_iosched_allow_merge(rq, bio)) return 0; diff --git a/fs/Makefile b/fs/Makefile index 1e7a11b..277b079 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -19,6 +19,7 @@ else obj-y += no-block.o endif +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_INOTIFY_USER) += inotify_user.o obj-$(CONFIG_EPOLL) += eventpoll.o diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c new file mode 100644 index 0000000..31b0887 --- /dev/null +++ b/fs/bio-integrity.c @@ -0,0 +1,708 @@ +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include +#include +#include +#include + +static struct kmem_cache *bio_integrity_slab __read_mostly; +static struct workqueue_struct *kintegrityd_wq; + +/** + * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * @bs: bio_set to allocate from + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs, struct bio_set *bs) +{ + struct bio_integrity_payload *bip; + struct bio_vec *iv; + unsigned long idx; + + BUG_ON(bio == NULL); + + bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "%s: could not alloc bip\n", __func__); + return NULL; + } + + memset(bip, 0, sizeof(*bip)); + + iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs); + if (unlikely(iv == NULL)) { + printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); + mempool_free(bip, bs->bio_integrity_pool); + return NULL; + } + + bip->bip_pool = idx; + bip->bip_vec = iv; + bip->bip_bio = bio; + bio->bi_integrity = bip; + + return bip; +} +EXPORT_SYMBOL(bio_integrity_alloc_bioset); + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) +{ + return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * @bs: bio_set this bio was allocated from + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). + */ +void bio_integrity_free(struct bio *bio, struct bio_set *bs) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip == NULL); + + /* A cloned bio doesn't own the integrity metadata */ + if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL) + kfree(bip->bip_buf); + + mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); + mempool_free(bip, bs->bio_integrity_pool); + + bio->bi_integrity = NULL; +} +EXPORT_SYMBOL(bio_integrity_free); + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio: bio to update + * @page: page containing integrity metadata + * @len: number of bytes of integrity metadata in page + * @offset: start offset within page + * + * Description: Attach a page containing integrity metadata to bio. + */ +int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_vec *iv; + + if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) { + printk(KERN_ERR "%s: bip_vec full\n", __func__); + return 0; + } + + iv = bip_vec_idx(bip, bip->bip_vcnt); + BUG_ON(iv == NULL); + BUG_ON(iv->bv_page != NULL); + + iv->bv_page = page; + iv->bv_len = len; + iv->bv_offset = offset; + bip->bip_vcnt++; + + return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +/** + * bio_integrity_enabled - Check whether integrity can be passed + * @bio: bio to check + * + * Description: Determines whether bio_integrity_prep() can be called + * on this bio or not. bio data direction and target device must be + * set prior to calling. The functions honors the write_generate and + * read_verify flags in sysfs. + */ +int bio_integrity_enabled(struct bio *bio) +{ + /* Already protected? */ + if (bio_integrity(bio)) + return 0; + + return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); +} +EXPORT_SYMBOL(bio_integrity_enabled); + +/** + * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto + * @bi: blk_integrity profile for device + * @sectors: Number of 512 sectors to convert + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the hardware + * sector size of the storage device. Convert the block layer sectors + * to physical sectors. + */ +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, unsigned int sectors) +{ + /* At this point there are only 512b or 4096b DIF/EPP devices */ + if (bi->sector_size == 4096) + return sectors >>= 3; + + return sectors; +} + +/** + * bio_integrity_tag_size - Retrieve integrity tag space + * @bio: bio to inspect + * + * Description: Returns the maximum number of tag bytes that can be + * attached to this bio. Filesystems can use this to determine how + * much metadata to attach to an I/O. + */ +unsigned int bio_integrity_tag_size(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + BUG_ON(bio->bi_size == 0); + + return bi->tag_size * (bio->bi_size / bi->sector_size); +} +EXPORT_SYMBOL(bio_integrity_tag_size); + +int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip->bip_buf == NULL); + + if (bi->tag_size == 0) + return -1; + + nr_sectors = bio_integrity_hw_sectors(bi, DIV_ROUND_UP(len, bi->tag_size)); + + if (nr_sectors * bi->tuple_size > bip->bip_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", + __func__, nr_sectors * bi->tuple_size, bip->bip_size); + return -1; + } + + if (set) + bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + else + bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + + return 0; +} + +/** + * bio_integrity_set_tag - Attach a tag buffer to a bio + * @bio: bio to attach buffer to + * @tag_buf: Pointer to a buffer containing tag data + * @len: Length of the included buffer + * + * Description: Use this function to tag a bio by leveraging the extra + * space provided by devices formatted with integrity protection. The + * size of the integrity buffer must be <= to the size reported by + * bio_integrity_tag_size(). + */ +int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != WRITE); + + return bio_integrity_tag(bio, tag_buf, len, 1); +} +EXPORT_SYMBOL(bio_integrity_set_tag); + +/** + * bio_integrity_get_tag - Retrieve a tag buffer from a bio + * @bio: bio to retrieve buffer from + * @tag_buf: Pointer to a buffer for the tag data + * @len: Length of the target buffer + * + * Description: Use this function to retrieve the tag buffer from a + * completed I/O. The size of the integrity buffer must be <= to the + * size reported by bio_integrity_tag_size(). + */ +int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != READ); + + return bio_integrity_tag(bio, tag_buf, len, 0); +} +EXPORT_SYMBOL(bio_integrity_get_tag); + +/** + * bio_integrity_generate - Generate integrity metadata for a bio + * @bio: bio to generate integrity metadata for + * + * Description: Generates integrity metadata for a bio by calling the + * block device's generation callback function. The bio must have a + * bip attached with enough room to accommodate the generated + * integrity metadata. + */ +static void bio_integrity_generate(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_sector; + unsigned int i, sectors, total; + void *prot_buf = bio->bi_integrity->bip_buf; + + total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + bi->generate_fn(&bix); + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio. The bio must have data + * direction, target device and start sector set priot to calling. In + * the WRITE case, integrity metadata will be generated using the + * block device's integrity function. In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. + */ +int bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct request_queue *q; + void *buf; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, offset, i; + unsigned int sectors; + + bi = bdev_get_integrity(bio->bi_bdev); + q = bdev_get_queue(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bio_integrity(bio)); + + sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = sectors * blk_integrity_tuple_size(bi); + buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); + if (unlikely(buf == NULL)) { + printk(KERN_ERR "could not allocate integrity buffer\n"); + return -EIO; + } + + end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long) buf) >> PAGE_SHIFT; + nr_pages = end - start; + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "could not allocate data integrity bioset\n"); + kfree(buf); + return -EIO; + } + + bip->bip_buf = buf; + bip->bip_size = len; + bip->bip_sector = bio->bi_sector; + + /* Map it */ + offset = offset_in_page(buf); + for (i = 0 ; i < nr_pages ; i++) { + int ret; + bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + ret = bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset); + + if (ret == 0) + return 0; + + if (ret < bytes) + break; + + buf += bytes; + len -= bytes; + offset = 0; + } + + /* Install custom I/O completion handler if read verify is enabled */ + if (bio_data_dir(bio) == READ) { + bip->bip_end_io = bio->bi_end_io; + bio->bi_end_io = bio_integrity_endio; + } + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + bio_integrity_generate(bio); + + return 0; +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify - Verify integrity metadata for a bio + * @bio: bio to verify + * + * Description: This function is called to verify the integrity of a + * bio. The data in the bio io_vec is compared to the integrity + * metadata returned by the HBA. + */ +static int bio_integrity_verify(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_integrity->bip_sector; + unsigned int i, sectors, total, ret; + void *prot_buf = bio->bi_integrity->bip_buf; + + ret = total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + ret = bi->verify_fn(&bix); + + if (ret) { + kunmap_atomic(kaddr, KM_USER0); + break; + } + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } + + return ret; +} + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work: Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request. The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. + */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + int error = bip->bip_error; + + if (bio_integrity_verify(bio)) { + clear_bit(BIO_UPTODATE, &bio->bi_flags); + error = -EIO; + } + + /* Restore original bio completion handler */ + bio->bi_end_io = bip->bip_end_io; + + if (bio->bi_end_io) + bio->bi_end_io(bio, error); +} + +/** + * bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * @error: Pointer to errno + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context. However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context. This function postpones completion + * accordingly. + */ +void bio_integrity_endio(struct bio *bio, int error) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip->bip_bio != bio); + + bip->bip_error = error; + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); +} +EXPORT_SYMBOL(bio_integrity_endio); + +/** + * bio_integrity_mark_head - Advance bip_vec skip bytes + * @bip: Integrity vector to advance + * @skip: Number of bytes to advance it + */ +void bio_integrity_mark_head(struct bio_integrity_payload *bip, unsigned int skip) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (skip == 0) { + bip->bip_idx = i; + return; + } else if (skip >= iv->bv_len) { + skip -= iv->bv_len; + } else { /* skip < iv->bv_len) */ + iv->bv_offset += skip; + iv->bv_len -= skip; + bip->bip_idx = i; + return; + } + } +} + +/** + * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long + * @bip: Integrity vector to truncate + * @len: New length of integrity vector + */ +void bio_integrity_mark_tail(struct bio_integrity_payload *bip, unsigned int len) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (len == 0) { + bip->bip_vcnt = i; + return; + } else if (len >= iv->bv_len) { + len -= iv->bv_len; + } else { /* len < iv->bv_len) */ + iv->bv_len = len; + len = 0; + } + } +} + +/** + * bio_integrity_advance - Advance integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly. + */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + + nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); + bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_advance); + +/** + * bio_integrity_trim - Trim integrity vector + * @bio: bio whose integrity vector to update + * @offset: offset to first data sector + * @sectors: number of data sectors + * + * Description: Used to trim the integrity vector in a cloned bio. + * The ivec will be advanced corresponding to 'offset' data sectors + * and the length will be truncated corresponding to 'len' data + * sectors. + */ +void bio_integrity_trim(struct bio *bio, unsigned int offset, unsigned int sectors) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + BUG_ON(!bio_flagged(bio, BIO_CLONED)); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + bip->bip_sector = bip->bip_sector + offset; + bio_integrity_mark_head(bip, offset * bi->tuple_size); + bio_integrity_mark_tail(bip, sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_split - Split integrity metadata + * @bio: Protected bio + * @bp: Resulting bio_pair + * @sectors: Offset + * + * Description: Splits an integrity page into a bio_pair. + */ +void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) +{ + struct blk_integrity *bi; + struct bio_integrity_payload *bip = bio->bi_integrity; + unsigned int nr_sectors; + + if (bio_integrity(bio) == 0) + return; + + bi = bdev_get_integrity(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bip->bip_vcnt != 1); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + + bp->bio1.bi_integrity = &bp->bip1; + bp->bio2.bi_integrity = &bp->bip2; + + bp->iv1 = bip->bip_vec[0]; + bp->iv2 = bip->bip_vec[0]; + + bp->bip1.bip_vec = &bp->iv1; + bp->bip2.bip_vec = &bp->iv2; + + bp->iv1.bv_len = sectors * bi->tuple_size; + bp->iv2.bv_offset += sectors * bi->tuple_size; + bp->iv2.bv_len -= sectors * bi->tuple_size; + + bp->bip1.bip_sector = bio->bi_integrity->bip_sector; + bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; + + bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; + bp->bip1.bip_idx = bp->bip2.bip_idx = 0; +} +EXPORT_SYMBOL(bio_integrity_split); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio: New bio + * @bio_src: Original bio + * @bs: bio_set to allocate bip from + * + * Description: Called to allocate a bip when cloning a bio + */ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, struct bio_set *bs) +{ + struct bio_integrity_payload *bip_src = bio_src->bi_integrity; + struct bio_integrity_payload *bip; + + BUG_ON(bip_src == NULL); + + bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + + if (bip == NULL) + return -EIO; + + memcpy(bip->bip_vec, bip_src->bip_vec, + bip_src->bip_vcnt * sizeof(struct bio_vec)); + + bip->bip_sector = bip_src->bip_sector; + bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_idx = bip_src->bip_idx; + + return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, + bio_integrity_slab); + if (!bs->bio_integrity_pool) + return -1; + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + if (bs->bio_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init_slab(void) +{ + bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, + SLAB_HWCACHE_ALIGN|SLAB_PANIC); +} +EXPORT_SYMBOL(bio_integrity_init_slab); + +static int __init integrity_init(void) +{ + kintegrityd_wq = create_workqueue("kintegrityd"); + + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + + return 0; +} +subsys_initcall(integrity_init); diff --git a/fs/bio.c b/fs/bio.c index 7a6598a..7761c84 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -50,6 +50,11 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { */ struct bio_set *fs_bio_set; +unsigned int bvec_nr_vecs(unsigned short idx) +{ + return bvec_slabs[idx].nr_vecs; +} + struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { struct bio_vec *bvl; @@ -91,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set) mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); } + if (bio_integrity(bio)) + bio_integrity_free(bio, bio_set); + mempool_free(bio, bio_set->bio_pool); } @@ -249,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) { struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); - if (b) { - b->bi_destructor = bio_fs_destructor; - __bio_clone(b, bio); + if (!b) + return NULL; + + b->bi_destructor = bio_fs_destructor; + __bio_clone(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, fs_bio_set); + + if (ret < 0) + return NULL; } return b; @@ -1223,6 +1241,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) bp->bio1.bi_private = bi; bp->bio2.bi_private = pool; + if (bio_integrity(bi)) + bio_integrity_split(bi, bp, first_sectors); + return bp; } @@ -1264,6 +1285,7 @@ void bioset_free(struct bio_set *bs) if (bs->bio_pool) mempool_destroy(bs->bio_pool); + bioset_integrity_free(bs); biovec_free_pools(bs); kfree(bs); @@ -1280,6 +1302,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) if (!bs->bio_pool) goto bad; + if (bioset_integrity_create(bs, bio_pool_size)) + goto bad; + if (!biovec_create_pools(bs, bvec_pool_size)) return bs; @@ -1306,6 +1331,7 @@ static int __init init_bio(void) { bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + bio_integrity_init_slab(); biovec_init_slabs(); fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); diff --git a/include/linux/bio.h b/include/linux/bio.h index 49dfb3c..6bfc3e8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -64,6 +64,7 @@ struct bio_vec { struct bio_set; struct bio; +struct bio_integrity_payload; typedef void (bio_end_io_t) (struct bio *, int); typedef void (bio_destructor_t) (struct bio *); @@ -112,6 +113,9 @@ struct bio { atomic_t bi_cnt; /* pin count */ void *bi_private; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload *bi_integrity; /* data integrity */ +#endif bio_destructor_t *bi_destructor; /* destructor */ }; @@ -271,6 +275,29 @@ static inline void *bio_data(struct bio *bio) */ #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) +#if defined(CONFIG_BLK_DEV_INTEGRITY) +/* + * bio integrity payload + */ +struct bio_integrity_payload { + struct bio *bip_bio; /* parent bio */ + struct bio_vec *bip_vec; /* integrity data vector */ + + sector_t bip_sector; /* virtual start sector */ + + void *bip_buf; /* generated integrity data */ + bio_end_io_t *bip_end_io; /* saved I/O completion fn */ + + int bip_error; /* saved I/O error */ + unsigned int bip_size; + + unsigned short bip_pool; /* pool the ivec came from */ + unsigned short bip_vcnt; /* # of integrity bio_vecs */ + unsigned short bip_idx; /* current bip_vec index */ + + struct work_struct bip_work; /* I/O completion */ +}; +#endif /* CONFIG_BLK_DEV_INTEGRITY */ /* * A bio_pair is used when we need to split a bio. @@ -283,10 +310,14 @@ static inline void *bio_data(struct bio *bio) * in bio2.bi_private */ struct bio_pair { - struct bio bio1, bio2; - struct bio_vec bv1, bv2; - atomic_t cnt; - int error; + struct bio bio1, bio2; + struct bio_vec bv1, bv2; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload bip1, bip2; + struct bio_vec iv1, iv2; +#endif + atomic_t cnt; + int error; }; extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors); @@ -334,6 +365,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *, extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); +extern unsigned int bvec_nr_vecs(unsigned short idx); /* * bio_set is used to allow other portions of the IO system to @@ -346,6 +378,9 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set struct bio_set { mempool_t *bio_pool; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + mempool_t *bio_integrity_pool; +#endif mempool_t *bvec_pools[BIOVEC_NR_POOLS]; }; @@ -410,5 +445,56 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) +#define bip_vec(bip) bip_vec_idx(bip, 0) + +#define __bip_for_each_vec(bvl, bip, i, start_idx) \ + for (bvl = bip_vec_idx((bip), (start_idx)), i = (start_idx); \ + i < (bip)->bip_vcnt; \ + bvl++, i++) + +#define bip_for_each_vec(bvl, bip, i) \ + __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) + +#define bio_integrity(bio) ((bio)->bi_integrity ? 1 : 0) + +extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); +extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); +extern void bio_integrity_free(struct bio *, struct bio_set *); +extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); +extern int bio_integrity_enabled(struct bio *bio); +extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); +extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); +extern int bio_integrity_prep(struct bio *); +extern void bio_integrity_endio(struct bio *, int); +extern void bio_integrity_advance(struct bio *, unsigned int); +extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); +extern void bio_integrity_split(struct bio *, struct bio_pair *, int); +extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *); +extern int bioset_integrity_create(struct bio_set *, int); +extern void bioset_integrity_free(struct bio_set *); +extern void bio_integrity_init_slab(void); + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +#define bio_integrity(a) (0) +#define bioset_integrity_create(a, b) (0) +#define bio_integrity_prep(a) (0) +#define bio_integrity_enabled(a) (0) +#define bio_integrity_clone(a, b, c) (0) +#define bioset_integrity_free(a) do { } while (0) +#define bio_integrity_free(a, b) do { } while (0) +#define bio_integrity_endio(a, b) do { } while (0) +#define bio_integrity_advance(a, b) do { } while (0) +#define bio_integrity_trim(a, b, c) do { } while (0) +#define bio_integrity_split(a, b, c) do { } while (0) +#define bio_integrity_set_tag(a, b, c) do { } while (0) +#define bio_integrity_get_tag(a, b, c) do { } while (0) +#define bio_integrity_init_slab(a) do { } while (0) + +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + #endif /* CONFIG_BLOCK */ #endif /* __LINUX_BIO_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a3da67..4a9ed45 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -112,6 +112,7 @@ enum rq_flag_bits { __REQ_ALLOCED, /* request came from our alloc pool */ __REQ_RW_META, /* metadata io request */ __REQ_COPY_USER, /* contains copies of user pages */ + __REQ_INTEGRITY, /* integrity metadata has been remapped */ __REQ_NR_BITS, /* stops here */ }; @@ -134,6 +135,7 @@ enum rq_flag_bits { #define REQ_ALLOCED (1 << __REQ_ALLOCED) #define REQ_RW_META (1 << __REQ_RW_META) #define REQ_COPY_USER (1 << __REQ_COPY_USER) +#define REQ_INTEGRITY (1 << __REQ_INTEGRITY) #define BLK_MAX_CDB 16 @@ -865,6 +867,109 @@ void kblockd_flush_work(struct work_struct *work); MODULE_ALIAS("block-major-" __stringify(major) "-*") +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define INTEGRITY_FLAG_READ 1 /* verify data integrity on read */ +#define INTEGRITY_FLAG_WRITE 2 /* generate data integrity on write */ + +struct blk_integrity_exchg { + void *prot_buf; + void *data_buf; + sector_t sector; + unsigned int data_size; + unsigned short sector_size; + const char *disk_name; +}; + +typedef void (integrity_gen_fn) (struct blk_integrity_exchg *); +typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *); +typedef void (integrity_set_tag_fn) (void *, void *, unsigned int); +typedef void (integrity_get_tag_fn) (void *, void *, unsigned int); + +struct blk_integrity { + integrity_gen_fn *generate_fn; + integrity_vrfy_fn *verify_fn; + integrity_set_tag_fn *set_tag_fn; + integrity_get_tag_fn *get_tag_fn; + + unsigned short flags; + unsigned short tuple_size; + unsigned short sector_size; + unsigned short tag_size; + + const char *name; + + struct kobject kobj; +}; + +extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); +extern void blk_integrity_unregister(struct gendisk *); +extern int blk_integrity_compare(struct block_device *, struct block_device *); +extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); +extern int blk_rq_count_integrity_sg(struct request *); + +static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) +{ + if (bi) + return bi->tuple_size; + + return 0; +} + +static inline struct blk_integrity *bdev_get_integrity(struct block_device *bdev) +{ + return bdev->bd_disk->integrity; +} + +static inline unsigned int bdev_get_tag_size(struct block_device *bdev) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi) + return bi->tag_size; + + return 0; +} + +static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return 0; + + if (rw == READ && bi->verify_fn != NULL && + test_bit(INTEGRITY_FLAG_READ, &bi->flags)) + return 1; + + if (rw == WRITE && bi->generate_fn != NULL && + test_bit(INTEGRITY_FLAG_WRITE, &bi->flags)) + return 1; + + return 0; +} + +static inline int blk_integrity_rq(struct request *rq) +{ + BUG_ON(rq->bio == NULL); + + return bio_integrity(rq->bio); +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +#define blk_integrity_rq(rq) (0) +#define blk_rq_count_integrity_sg(a) (0) +#define blk_rq_map_integrity_sg(a, b) (0) +#define bdev_get_integrity(a) (0) +#define bdev_get_tag_size(a) (0) +#define blk_integrity_compare(a, b) (0) +#define blk_integrity_register(a, b) (0) +#define blk_integrity_unregister(a) do { } while (0); + +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + + #else /* CONFIG_BLOCK */ /* * stubs for when the block layer is configured out diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ae7aec3..524ec96 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -141,6 +141,9 @@ struct gendisk { struct disk_stats dkstats; #endif struct work_struct async_notify; +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct blk_integrity *integrity; +#endif }; /* -- cgit v0.10.2 From c1c72b59941e2f5aad4b02609d7ee7b121734b8d Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 17 Jun 2008 18:59:57 +0200 Subject: block: Data integrity infrastructure documentation Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 4bd9ea5..44f52a4 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -26,3 +26,37 @@ Description: I/O statistics of partition . The format is the same as the above-written /sys/block//stat format. + + +What: /sys/block//integrity/format +Date: June 2008 +Contact: Martin K. Petersen +Description: + Metadata format for integrity capable block device. + E.g. T10-DIF-TYPE1-CRC. + + +What: /sys/block//integrity/read_verify +Date: June 2008 +Contact: Martin K. Petersen +Description: + Indicates whether the block layer should verify the + integrity of read requests serviced by devices that + support sending integrity metadata. + + +What: /sys/block//integrity/tag_size +Date: June 2008 +Contact: Martin K. Petersen +Description: + Number of bytes of integrity tag space available per + 512 bytes of data. + + +What: /sys/block//integrity/write_generate +Date: June 2008 +Contact: Martin K. Petersen +Description: + Indicates whether the block layer should automatically + generate checksums for write requests bound for + devices that support receiving integrity metadata. diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.txt new file mode 100644 index 0000000..e9dc8d8 --- /dev/null +++ b/Documentation/block/data-integrity.txt @@ -0,0 +1,327 @@ +---------------------------------------------------------------------- +1. INTRODUCTION + +Modern filesystems feature checksumming of data and metadata to +protect against data corruption. However, the detection of the +corruption is done at read time which could potentially be months +after the data was written. At that point the original data that the +application tried to write is most likely lost. + +The solution is to ensure that the disk is actually storing what the +application meant it to. Recent additions to both the SCSI family +protocols (SBC Data Integrity Field, SCC protection proposal) as well +as SATA/T13 (External Path Protection) try to remedy this by adding +support for appending integrity metadata to an I/O. The integrity +metadata (or protection information in SCSI terminology) includes a +checksum for each sector as well as an incrementing counter that +ensures the individual sectors are written in the right order. And +for some protection schemes also that the I/O is written to the right +place on disk. + +Current storage controllers and devices implement various protective +measures, for instance checksumming and scrubbing. But these +technologies are working in their own isolated domains or at best +between adjacent nodes in the I/O path. The interesting thing about +DIF and the other integrity extensions is that the protection format +is well defined and every node in the I/O path can verify the +integrity of the I/O and reject it if corruption is detected. This +allows not only corruption prevention but also isolation of the point +of failure. + +---------------------------------------------------------------------- +2. THE DATA INTEGRITY EXTENSIONS + +As written, the protocol extensions only protect the path between +controller and storage device. However, many controllers actually +allow the operating system to interact with the integrity metadata +(IMD). We have been working with several FC/SAS HBA vendors to enable +the protection information to be transferred to and from their +controllers. + +The SCSI Data Integrity Field works by appending 8 bytes of protection +information to each sector. The data + integrity metadata is stored +in 520 byte sectors on disk. Data + IMD are interleaved when +transferred between the controller and target. The T13 proposal is +similar. + +Because it is highly inconvenient for operating systems to deal with +520 (and 4104) byte sectors, we approached several HBA vendors and +encouraged them to allow separation of the data and integrity metadata +scatter-gather lists. + +The controller will interleave the buffers on write and split them on +read. This means that the Linux can DMA the data buffers to and from +host memory without changes to the page cache. + +Also, the 16-bit CRC checksum mandated by both the SCSI and SATA specs +is somewhat heavy to compute in software. Benchmarks found that +calculating this checksum had a significant impact on system +performance for a number of workloads. Some controllers allow a +lighter-weight checksum to be used when interfacing with the operating +system. Emulex, for instance, supports the TCP/IP checksum instead. +The IP checksum received from the OS is converted to the 16-bit CRC +when writing and vice versa. This allows the integrity metadata to be +generated by Linux or the application at very low cost (comparable to +software RAID5). + +The IP checksum is weaker than the CRC in terms of detecting bit +errors. However, the strength is really in the separation of the data +buffers and the integrity metadata. These two distinct buffers much +match up for an I/O to complete. + +The separation of the data and integrity metadata buffers as well as +the choice in checksums is referred to as the Data Integrity +Extensions. As these extensions are outside the scope of the protocol +bodies (T10, T13), Oracle and its partners are trying to standardize +them within the Storage Networking Industry Association. + +---------------------------------------------------------------------- +3. KERNEL CHANGES + +The data integrity framework in Linux enables protection information +to be pinned to I/Os and sent to/received from controllers that +support it. + +The advantage to the integrity extensions in SCSI and SATA is that +they enable us to protect the entire path from application to storage +device. However, at the same time this is also the biggest +disadvantage. It means that the protection information must be in a +format that can be understood by the disk. + +Generally Linux/POSIX applications are agnostic to the intricacies of +the storage devices they are accessing. The virtual filesystem switch +and the block layer make things like hardware sector size and +transport protocols completely transparent to the application. + +However, this level of detail is required when preparing the +protection information to send to a disk. Consequently, the very +concept of an end-to-end protection scheme is a layering violation. +It is completely unreasonable for an application to be aware whether +it is accessing a SCSI or SATA disk. + +The data integrity support implemented in Linux attempts to hide this +from the application. As far as the application (and to some extent +the kernel) is concerned, the integrity metadata is opaque information +that's attached to the I/O. + +The current implementation allows the block layer to automatically +generate the protection information for any I/O. Eventually the +intent is to move the integrity metadata calculation to userspace for +user data. Metadata and other I/O that originates within the kernel +will still use the automatic generation interface. + +Some storage devices allow each hardware sector to be tagged with a +16-bit value. The owner of this tag space is the owner of the block +device. I.e. the filesystem in most cases. The filesystem can use +this extra space to tag sectors as they see fit. Because the tag +space is limited, the block interface allows tagging bigger chunks by +way of interleaving. This way, 8*16 bits of information can be +attached to a typical 4KB filesystem block. + +This also means that applications such as fsck and mkfs will need +access to manipulate the tags from user space. A passthrough +interface for this is being worked on. + + +---------------------------------------------------------------------- +4. BLOCK LAYER IMPLEMENTATION DETAILS + +4.1 BIO + +The data integrity patches add a new field to struct bio when +CONFIG_BLK_DEV_INTEGRITY is enabled. bio->bi_integrity is a pointer +to a struct bip which contains the bio integrity payload. Essentially +a bip is a trimmed down struct bio which holds a bio_vec containing +the integrity metadata and the required housekeeping information (bvec +pool, vector count, etc.) + +A kernel subsystem can enable data integrity protection on a bio by +calling bio_integrity_alloc(bio). This will allocate and attach the +bip to the bio. + +Individual pages containing integrity metadata can subsequently be +attached using bio_integrity_add_page(). + +bio_free() will automatically free the bip. + + +4.2 BLOCK DEVICE + +Because the format of the protection data is tied to the physical +disk, each block device has been extended with a block integrity +profile (struct blk_integrity). This optional profile is registered +with the block layer using blk_integrity_register(). + +The profile contains callback functions for generating and verifying +the protection data, as well as getting and setting application tags. +The profile also contains a few constants to aid in completing, +merging and splitting the integrity metadata. + +Layered block devices will need to pick a profile that's appropriate +for all subdevices. blk_integrity_compare() can help with that. DM +and MD linear, RAID0 and RAID1 are currently supported. RAID4/5/6 +will require extra work due to the application tag. + + +---------------------------------------------------------------------- +5.0 BLOCK LAYER INTEGRITY API + +5.1 NORMAL FILESYSTEM + + The normal filesystem is unaware that the underlying block device + is capable of sending/receiving integrity metadata. The IMD will + be automatically generated by the block layer at submit_bio() time + in case of a WRITE. A READ request will cause the I/O integrity + to be verified upon completion. + + IMD generation and verification can be toggled using the + + /sys/block//integrity/write_generate + + and + + /sys/block//integrity/read_verify + + flags. + + +5.2 INTEGRITY-AWARE FILESYSTEM + + A filesystem that is integrity-aware can prepare I/Os with IMD + attached. It can also use the application tag space if this is + supported by the block device. + + + int bdev_integrity_enabled(block_device, int rw); + + bdev_integrity_enabled() will return 1 if the block device + supports integrity metadata transfer for the data direction + specified in 'rw'. + + bdev_integrity_enabled() honors the write_generate and + read_verify flags in sysfs and will respond accordingly. + + + int bio_integrity_prep(bio); + + To generate IMD for WRITE and to set up buffers for READ, the + filesystem must call bio_integrity_prep(bio). + + Prior to calling this function, the bio data direction and start + sector must be set, and the bio should have all data pages + added. It is up to the caller to ensure that the bio does not + change while I/O is in progress. + + bio_integrity_prep() should only be called if + bio_integrity_enabled() returned 1. + + + int bio_integrity_tag_size(bio); + + If the filesystem wants to use the application tag space it will + first have to find out how much storage space is available. + Because tag space is generally limited (usually 2 bytes per + sector regardless of sector size), the integrity framework + supports interleaving the information between the sectors in an + I/O. + + Filesystems can call bio_integrity_tag_size(bio) to find out how + many bytes of storage are available for that particular bio. + + Another option is bdev_get_tag_size(block_device) which will + return the number of available bytes per hardware sector. + + + int bio_integrity_set_tag(bio, void *tag_buf, len); + + After a successful return from bio_integrity_prep(), + bio_integrity_set_tag() can be used to attach an opaque tag + buffer to a bio. Obviously this only makes sense if the I/O is + a WRITE. + + + int bio_integrity_get_tag(bio, void *tag_buf, len); + + Similarly, at READ I/O completion time the filesystem can + retrieve the tag buffer using bio_integrity_get_tag(). + + +6.3 PASSING EXISTING INTEGRITY METADATA + + Filesystems that either generate their own integrity metadata or + are capable of transferring IMD from user space can use the + following calls: + + + struct bip * bio_integrity_alloc(bio, gfp_mask, nr_pages); + + Allocates the bio integrity payload and hangs it off of the bio. + nr_pages indicate how many pages of protection data need to be + stored in the integrity bio_vec list (similar to bio_alloc()). + + The integrity payload will be freed at bio_free() time. + + + int bio_integrity_add_page(bio, page, len, offset); + + Attaches a page containing integrity metadata to an existing + bio. The bio must have an existing bip, + i.e. bio_integrity_alloc() must have been called. For a WRITE, + the integrity metadata in the pages must be in a format + understood by the target device with the notable exception that + the sector numbers will be remapped as the request traverses the + I/O stack. This implies that the pages added using this call + will be modified during I/O! The first reference tag in the + integrity metadata must have a value of bip->bip_sector. + + Pages can be added using bio_integrity_add_page() as long as + there is room in the bip bio_vec array (nr_pages). + + Upon completion of a READ operation, the attached pages will + contain the integrity metadata received from the storage device. + It is up to the receiver to process them and verify data + integrity upon completion. + + +6.4 REGISTERING A BLOCK DEVICE AS CAPABLE OF EXCHANGING INTEGRITY + METADATA + + To enable integrity exchange on a block device the gendisk must be + registered as capable: + + int blk_integrity_register(gendisk, blk_integrity); + + The blk_integrity struct is a template and should contain the + following: + + static struct blk_integrity my_profile = { + .name = "STANDARDSBODY-TYPE-VARIANT-CSUM", + .generate_fn = my_generate_fn, + .verify_fn = my_verify_fn, + .get_tag_fn = my_get_tag_fn, + .set_tag_fn = my_set_tag_fn, + .tuple_size = sizeof(struct my_tuple_size), + .tag_size = , + }; + + 'name' is a text string which will be visible in sysfs. This is + part of the userland API so chose it carefully and never change + it. The format is standards body-type-variant. + E.g. T10-DIF-TYPE1-IP or T13-EPP-0-CRC. + + 'generate_fn' generates appropriate integrity metadata (for WRITE). + + 'verify_fn' verifies that the data buffer matches the integrity + metadata. + + 'tuple_size' must be set to match the size of the integrity + metadata per sector. I.e. 8 for DIF and EPP. + + 'tag_size' must be set to identify how many bytes of tag space + are available per hardware sector. For DIF this is either 2 or + 0 depending on the value of the Control Mode Page ATO bit. + + See 6.2 for a description of get_tag_fn and set_tag_fn. + +---------------------------------------------------------------------- +2007-12-24 Martin K. Petersen -- cgit v0.10.2 From b984679efe1a616ec4ac919dba08286d71593900 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Jun 2008 19:05:48 +0200 Subject: block: integrity checkpatch cleanups > 80 char lines and that sort of thing. Signed-off-by: Jens Axboe diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 65f23ef..4ffa381 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -178,8 +178,9 @@ static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, return entry->show(bi, page); } -static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t count) +static ssize_t integrity_attr_store(struct kobject *kobj, + struct attribute *attr, const char *page, + size_t count) { struct blk_integrity *bi = container_of(kobj, struct blk_integrity, kobj); @@ -326,7 +327,8 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) BUG_ON(template == NULL); if (disk->integrity == NULL) { - bi = kmem_cache_alloc(integrity_cachep, GFP_KERNEL | __GFP_ZERO); + bi = kmem_cache_alloc(integrity_cachep, + GFP_KERNEL | __GFP_ZERO); if (!bi) return -1; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 31b0887..63e2ee6 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -39,7 +39,10 @@ static struct workqueue_struct *kintegrityd_wq; * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. */ -struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs, struct bio_set *bs) +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs, + struct bio_set *bs) { struct bio_integrity_payload *bip; struct bio_vec *iv; @@ -81,7 +84,9 @@ EXPORT_SYMBOL(bio_integrity_alloc_bioset); * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. */ -struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) { return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); } @@ -174,7 +179,8 @@ EXPORT_SYMBOL(bio_integrity_enabled); * sector size of the storage device. Convert the block layer sectors * to physical sectors. */ -static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, unsigned int sectors) +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, + unsigned int sectors) { /* At this point there are only 512b or 4096b DIF/EPP devices */ if (bi->sector_size == 4096) @@ -212,7 +218,8 @@ int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) if (bi->tag_size == 0) return -1; - nr_sectors = bio_integrity_hw_sectors(bi, DIV_ROUND_UP(len, bi->tag_size)); + nr_sectors = bio_integrity_hw_sectors(bi, + DIV_ROUND_UP(len, bi->tag_size)); if (nr_sectors * bi->tuple_size > bip->bip_size) { printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", @@ -456,7 +463,7 @@ static int bio_integrity_verify(struct bio *bio) */ static void bio_integrity_verify_fn(struct work_struct *work) { - struct bio_integrity_payload *bip = + struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; int error = bip->bip_error; @@ -502,7 +509,8 @@ EXPORT_SYMBOL(bio_integrity_endio); * @bip: Integrity vector to advance * @skip: Number of bytes to advance it */ -void bio_integrity_mark_head(struct bio_integrity_payload *bip, unsigned int skip) +void bio_integrity_mark_head(struct bio_integrity_payload *bip, + unsigned int skip) { struct bio_vec *iv; unsigned int i; @@ -527,7 +535,8 @@ void bio_integrity_mark_head(struct bio_integrity_payload *bip, unsigned int ski * @bip: Integrity vector to truncate * @len: New length of integrity vector */ -void bio_integrity_mark_tail(struct bio_integrity_payload *bip, unsigned int len) +void bio_integrity_mark_tail(struct bio_integrity_payload *bip, + unsigned int len) { struct bio_vec *iv; unsigned int i; @@ -579,7 +588,8 @@ EXPORT_SYMBOL(bio_integrity_advance); * and the length will be truncated corresponding to 'len' data * sectors. */ -void bio_integrity_trim(struct bio *bio, unsigned int offset, unsigned int sectors) +void bio_integrity_trim(struct bio *bio, unsigned int offset, + unsigned int sectors) { struct bio_integrity_payload *bip = bio->bi_integrity; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); @@ -648,7 +658,8 @@ EXPORT_SYMBOL(bio_integrity_split); * * Description: Called to allocate a bip when cloning a bio */ -int bio_integrity_clone(struct bio *bio, struct bio *bio_src, struct bio_set *bs) +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + struct bio_set *bs) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; -- cgit v0.10.2 From da9cbc87395308a21465bd25441297bbba0477e1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Jun 2008 20:42:08 +0200 Subject: block: blkdev.h cleanup, move iocontext stuff to iocontext.h Signed-off-by: Jens Axboe diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4a9ed45..443df75 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -33,12 +33,6 @@ struct sg_io_hdr; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ -int put_io_context(struct io_context *ioc); -void exit_io_context(void); -struct io_context *get_io_context(gfp_t gfp_flags, int node); -struct io_context *alloc_io_context(gfp_t gfp_flags, int node); -void copy_io_context(struct io_context **pdst, struct io_context **psrc); - struct request; typedef void (rq_end_io_fn)(struct request *, int); @@ -981,17 +975,6 @@ static inline long nr_blockdev_pages(void) return 0; } -static inline void exit_io_context(void) -{ -} - -struct io_context; -static inline int put_io_context(struct io_context *ioc) -{ - return 1; -} - - #endif /* CONFIG_BLOCK */ #endif diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 2b7a118..08b987b 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -99,4 +99,22 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) return NULL; } +#ifdef CONFIG_BLOCK +int put_io_context(struct io_context *ioc); +void exit_io_context(void); +struct io_context *get_io_context(gfp_t gfp_flags, int node); +struct io_context *alloc_io_context(gfp_t gfp_flags, int node); +void copy_io_context(struct io_context **pdst, struct io_context **psrc); +#else +static inline void exit_io_context(void) +{ +} + +struct io_context; +static inline int put_io_context(struct io_context *ioc) +{ + return 1; +} +#endif + #endif diff --git a/kernel/exit.c b/kernel/exit.c index 8f6185e..ceb2587 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 19908b2..b71ccd0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include -- cgit v0.10.2 From 6e2401ad6f33de15ff00f78b88159f00a14f3b35 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Jun 2008 10:15:02 +0200 Subject: block: integrity cleanups - No need to check for NULL bio, we'll get an immediate oops anyway. - Make bio_integrity() a proper function. Signed-off-by: Jens Axboe diff --git a/include/linux/bio.h b/include/linux/bio.h index 6bfc3e8..0933a14 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -458,7 +458,14 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, #define bip_for_each_vec(bvl, bip, i) \ __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) -#define bio_integrity(bio) ((bio)->bi_integrity ? 1 : 0) +static inline int bio_integrity(struct bio *bio) +{ +#if defined(CONFIG_BLK_DEV_INTEGRITY) + return bio->bi_integrity != NULL; +#else + return 0; +#endif +} extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 443df75..d3ae9ad 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -860,7 +860,6 @@ void kblockd_flush_work(struct work_struct *work); #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ MODULE_ALIAS("block-major-" __stringify(major) "-*") - #if defined(CONFIG_BLK_DEV_INTEGRITY) #define INTEGRITY_FLAG_READ 1 /* verify data integrity on read */ @@ -945,8 +944,6 @@ static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) static inline int blk_integrity_rq(struct request *rq) { - BUG_ON(rq->bio == NULL); - return bio_integrity(rq->bio); } @@ -963,7 +960,6 @@ static inline int blk_integrity_rq(struct request *rq) #endif /* CONFIG_BLK_DEV_INTEGRITY */ - #else /* CONFIG_BLOCK */ /* * stubs for when the block layer is configured out -- cgit v0.10.2 From 0b07de85a76e1346e675f0e98437378932473df7 Mon Sep 17 00:00:00 2001 From: Adel Gadllah Date: Thu, 26 Jun 2008 13:48:27 +0200 Subject: allow userspace to modify scsi command filter on per device basis This patch exports the per-gendisk command filter to user space through sysfs, so it can be changed by the system administrator. All users of the old cmd filter have been converted to use the new one. Original patch from Peter Jones. Signed-off-by: Adel Gadllah Signed-off-by: Peter Jones Signed-off-by: Jens Axboe diff --git a/block/Makefile b/block/Makefile index 045f7b6..208000b 100644 --- a/block/Makefile +++ b/block/Makefile @@ -4,7 +4,8 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o + blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o \ + cmd-filter.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o diff --git a/block/bsg.c b/block/bsg.c index f0b7cd3..439940c 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -44,11 +44,12 @@ struct bsg_device { char name[BUS_ID_SIZE]; int max_queue; unsigned long flags; + struct blk_scsi_cmd_filter *cmd_filter; + mode_t *f_mode; }; enum { BSG_F_BLOCK = 1, - BSG_F_WRITE_PERM = 2, }; #define BSG_DEFAULT_CMDS 64 @@ -172,7 +173,7 @@ unlock: } static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, - struct sg_io_v4 *hdr, int has_write_perm) + struct sg_io_v4 *hdr, struct bsg_device *bd) { if (hdr->request_len > BLK_MAX_CDB) { rq->cmd = kzalloc(hdr->request_len, GFP_KERNEL); @@ -185,7 +186,8 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(rq->cmd, has_write_perm)) + if (blk_cmd_filter_verify_command(bd->cmd_filter, rq->cmd, + bd->f_mode)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -263,8 +265,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr) rq = blk_get_request(q, rw, GFP_KERNEL); if (!rq) return ERR_PTR(-ENOMEM); - ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, test_bit(BSG_F_WRITE_PERM, - &bd->flags)); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd); if (ret) goto out; @@ -566,12 +567,23 @@ static inline void bsg_set_block(struct bsg_device *bd, struct file *file) set_bit(BSG_F_BLOCK, &bd->flags); } -static inline void bsg_set_write_perm(struct bsg_device *bd, struct file *file) +static void bsg_set_cmd_filter(struct bsg_device *bd, + struct file *file) { - if (file->f_mode & FMODE_WRITE) - set_bit(BSG_F_WRITE_PERM, &bd->flags); - else - clear_bit(BSG_F_WRITE_PERM, &bd->flags); + struct inode *inode; + struct gendisk *disk; + + if (!file) + return; + + inode = file->f_dentry->d_inode; + if (!inode) + return; + + disk = inode->i_bdev->bd_disk; + + bd->cmd_filter = &disk->cmd_filter; + bd->f_mode = &file->f_mode; } /* @@ -595,6 +607,8 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) dprintk("%s: read %Zd bytes\n", bd->name, count); bsg_set_block(bd, file); + bsg_set_cmd_filter(bd, file); + bytes_read = 0; ret = __bsg_read(buf, count, bd, NULL, &bytes_read); *ppos = bytes_read; @@ -668,7 +682,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) dprintk("%s: write %Zd bytes\n", bd->name, count); bsg_set_block(bd, file); - bsg_set_write_perm(bd, file); + bsg_set_cmd_filter(bd, file); bytes_written = 0; ret = __bsg_write(bd, buf, count, &bytes_written); @@ -771,7 +785,9 @@ static struct bsg_device *bsg_add_device(struct inode *inode, } bd->queue = rq; + bsg_set_block(bd, file); + bsg_set_cmd_filter(bd, file); atomic_set(&bd->ref_count, 1); mutex_lock(&bsg_mutex); diff --git a/block/cmd-filter.c b/block/cmd-filter.c new file mode 100644 index 0000000..35e327c --- /dev/null +++ b/block/cmd-filter.c @@ -0,0 +1,325 @@ +/* + * Copyright 2004 Peter M. Jones + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +int blk_cmd_filter_verify_command(struct blk_scsi_cmd_filter *filter, + unsigned char *cmd, mode_t *f_mode) +{ + /* root can do any command. */ + if (capable(CAP_SYS_RAWIO)) + return 0; + + /* if there's no filter set, assume we're filtering everything out */ + if (!filter) + return -EPERM; + + /* Anybody who can open the device can do a read-safe command */ + if (test_bit(cmd[0], filter->read_ok)) + return 0; + + /* Write-safe commands require a writable open */ + if (test_bit(cmd[0], filter->write_ok) && (*f_mode & FMODE_WRITE)) + return 0; + + return -EPERM; +} +EXPORT_SYMBOL(blk_cmd_filter_verify_command); + +int blk_verify_command(struct file *file, unsigned char *cmd) +{ + struct gendisk *disk; + struct inode *inode; + + if (!file) + return -EINVAL; + + inode = file->f_dentry->d_inode; + if (!inode) + return -EINVAL; + + disk = inode->i_bdev->bd_disk; + + return blk_cmd_filter_verify_command(&disk->cmd_filter, + cmd, &file->f_mode); +} +EXPORT_SYMBOL(blk_verify_command); + +/* and now, the sysfs stuff */ +static ssize_t rcf_cmds_show(struct blk_scsi_cmd_filter *filter, char *page, + int rw) +{ + char *npage = page; + unsigned long *okbits; + int i; + + if (rw == READ) + okbits = filter->read_ok; + else + okbits = filter->write_ok; + + for (i = 0; i < BLK_SCSI_MAX_CMDS; i++) { + if (test_bit(i, okbits)) { + sprintf(npage, "%02x", i); + npage += 2; + if (i < BLK_SCSI_MAX_CMDS - 1) + sprintf(npage++, " "); + } + } + + if (npage != page) + npage += sprintf(npage, "\n"); + + return npage - page; +} + +static ssize_t rcf_readcmds_show(struct blk_scsi_cmd_filter *filter, char *page) +{ + return rcf_cmds_show(filter, page, READ); +} + +static ssize_t rcf_writecmds_show(struct blk_scsi_cmd_filter *filter, + char *page) +{ + return rcf_cmds_show(filter, page, WRITE); +} + +static ssize_t rcf_cmds_store(struct blk_scsi_cmd_filter *filter, + const char *page, size_t count, int rw) +{ + ssize_t ret = 0; + unsigned long okbits[BLK_SCSI_CMD_PER_LONG], *target_okbits; + int cmd, status, len; + substring_t ss; + + memset(&okbits, 0, sizeof(okbits)); + + for (len = strlen(page); len > 0; len -= 3) { + if (len < 2) + break; + ss.from = (char *) page + ret; + ss.to = (char *) page + ret + 2; + ret += 3; + status = match_hex(&ss, &cmd); + /* either of these cases means invalid input, so do nothing. */ + if (status || cmd >= BLK_SCSI_MAX_CMDS) + return -EINVAL; + + __set_bit(cmd, okbits); + } + + if (rw == READ) + target_okbits = filter->read_ok; + else + target_okbits = filter->write_ok; + + memmove(target_okbits, okbits, sizeof(okbits)); + return count; +} + +static ssize_t rcf_readcmds_store(struct blk_scsi_cmd_filter *filter, + const char *page, size_t count) +{ + return rcf_cmds_store(filter, page, count, READ); +} + +static ssize_t rcf_writecmds_store(struct blk_scsi_cmd_filter *filter, + const char *page, size_t count) +{ + return rcf_cmds_store(filter, page, count, WRITE); +} + +struct rcf_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_scsi_cmd_filter *, char *); + ssize_t (*store)(struct blk_scsi_cmd_filter *, const char *, size_t); +}; + +static struct rcf_sysfs_entry rcf_readcmds_entry = { + .attr = { .name = "read_table", .mode = S_IRUGO | S_IWUSR }, + .show = rcf_readcmds_show, + .store = rcf_readcmds_store, +}; + +static struct rcf_sysfs_entry rcf_writecmds_entry = { + .attr = {.name = "write_table", .mode = S_IRUGO | S_IWUSR }, + .show = rcf_writecmds_show, + .store = rcf_writecmds_store, +}; + +static struct attribute *default_attrs[] = { + &rcf_readcmds_entry.attr, + &rcf_writecmds_entry.attr, + NULL, +}; + +#define to_rcf(atr) container_of((atr), struct rcf_sysfs_entry, attr) + +static ssize_t +rcf_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct rcf_sysfs_entry *entry = to_rcf(attr); + struct blk_scsi_cmd_filter *filter; + + filter = container_of(kobj, struct blk_scsi_cmd_filter, kobj); + if (entry->show) + return entry->show(filter, page); + + return 0; +} + +static ssize_t +rcf_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct rcf_sysfs_entry *entry = to_rcf(attr); + struct blk_scsi_cmd_filter *filter; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (!entry->store) + return -EINVAL; + + filter = container_of(kobj, struct blk_scsi_cmd_filter, kobj); + return entry->store(filter, page, length); +} + +static struct sysfs_ops rcf_sysfs_ops = { + .show = rcf_attr_show, + .store = rcf_attr_store, +}; + +static struct kobj_type rcf_ktype = { + .sysfs_ops = &rcf_sysfs_ops, + .default_attrs = default_attrs, +}; + +static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter) +{ + /* Basic read-only commands */ + __set_bit(TEST_UNIT_READY, filter->read_ok); + __set_bit(REQUEST_SENSE, filter->read_ok); + __set_bit(READ_6, filter->read_ok); + __set_bit(READ_10, filter->read_ok); + __set_bit(READ_12, filter->read_ok); + __set_bit(READ_16, filter->read_ok); + __set_bit(READ_BUFFER, filter->read_ok); + __set_bit(READ_DEFECT_DATA, filter->read_ok); + __set_bit(READ_LONG, filter->read_ok); + __set_bit(INQUIRY, filter->read_ok); + __set_bit(MODE_SENSE, filter->read_ok); + __set_bit(MODE_SENSE_10, filter->read_ok); + __set_bit(LOG_SENSE, filter->read_ok); + __set_bit(START_STOP, filter->read_ok); + __set_bit(GPCMD_VERIFY_10, filter->read_ok); + __set_bit(VERIFY_16, filter->read_ok); + __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); + + /* Audio CD commands */ + __set_bit(GPCMD_PLAY_CD, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok); + __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok); + + /* CD/DVD data reading */ + __set_bit(GPCMD_READ_CD, filter->read_ok); + __set_bit(GPCMD_READ_CD_MSF, filter->read_ok); + __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok); + __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok); + __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok); + __set_bit(GPCMD_READ_HEADER, filter->read_ok); + __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok); + __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok); + __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok); + __set_bit(GPCMD_REPORT_KEY, filter->read_ok); + __set_bit(GPCMD_SCAN, filter->read_ok); + __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok); + __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok); + __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok); + __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok); + __set_bit(GPCMD_SEEK, filter->read_ok); + __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok); + + /* Basic writing commands */ + __set_bit(WRITE_6, filter->write_ok); + __set_bit(WRITE_10, filter->write_ok); + __set_bit(WRITE_VERIFY, filter->write_ok); + __set_bit(WRITE_12, filter->write_ok); + __set_bit(WRITE_VERIFY_12, filter->write_ok); + __set_bit(WRITE_16, filter->write_ok); + __set_bit(WRITE_LONG, filter->write_ok); + __set_bit(WRITE_LONG_2, filter->write_ok); + __set_bit(ERASE, filter->write_ok); + __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); + __set_bit(MODE_SELECT, filter->write_ok); + __set_bit(LOG_SELECT, filter->write_ok); + __set_bit(GPCMD_BLANK, filter->write_ok); + __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok); + __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok); + __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok); + __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok); + __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok); + __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok); + __set_bit(GPCMD_SEND_EVENT, filter->write_ok); + __set_bit(GPCMD_SEND_KEY, filter->write_ok); + __set_bit(GPCMD_SEND_OPC, filter->write_ok); + __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok); + __set_bit(GPCMD_SET_SPEED, filter->write_ok); + __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); + __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); + __set_bit(GPCMD_SET_STREAMING, filter->write_ok); +} + +int blk_register_filter(struct gendisk *disk) +{ + int ret; + struct blk_scsi_cmd_filter *filter = &disk->cmd_filter; + struct kobject *parent = kobject_get(disk->holder_dir->parent); + + if (!parent) + return -ENODEV; + + ret = kobject_init_and_add(&filter->kobj, &rcf_ktype, parent, + "%s", "cmd_filter"); + + if (ret < 0) + return ret; + + rcf_set_defaults(filter); + return 0; +} + +void blk_unregister_filter(struct gendisk *disk) +{ + struct blk_scsi_cmd_filter *filter = &disk->cmd_filter; + + kobject_put(&filter->kobj); + kobject_put(disk->holder_dir->parent); +} + diff --git a/block/genhd.c b/block/genhd.c index 43e468e..9074f38 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -189,6 +189,7 @@ void add_disk(struct gendisk *disk) disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); + blk_register_filter(disk); bdi = &disk->queue->backing_dev_info; bdi_register_dev(bdi, MKDEV(disk->major, disk->first_minor)); @@ -200,6 +201,7 @@ EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ void unlink_gendisk(struct gendisk *disk) { + blk_unregister_filter(disk); sysfs_remove_link(&disk->dev.kobj, "bdi"); bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 78199c0..c5b9bcf 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -105,120 +105,12 @@ static int sg_emulated_host(struct request_queue *q, int __user *p) return put_user(1, p); } -#define CMD_READ_SAFE 0x01 -#define CMD_WRITE_SAFE 0x02 -#define CMD_WARNED 0x04 -#define safe_for_read(cmd) [cmd] = CMD_READ_SAFE -#define safe_for_write(cmd) [cmd] = CMD_WRITE_SAFE - -int blk_verify_command(unsigned char *cmd, int has_write_perm) -{ - static unsigned char cmd_type[256] = { - - /* Basic read-only commands */ - safe_for_read(TEST_UNIT_READY), - safe_for_read(REQUEST_SENSE), - safe_for_read(READ_6), - safe_for_read(READ_10), - safe_for_read(READ_12), - safe_for_read(READ_16), - safe_for_read(READ_BUFFER), - safe_for_read(READ_DEFECT_DATA), - safe_for_read(READ_LONG), - safe_for_read(INQUIRY), - safe_for_read(MODE_SENSE), - safe_for_read(MODE_SENSE_10), - safe_for_read(LOG_SENSE), - safe_for_read(START_STOP), - safe_for_read(GPCMD_VERIFY_10), - safe_for_read(VERIFY_16), - - /* Audio CD commands */ - safe_for_read(GPCMD_PLAY_CD), - safe_for_read(GPCMD_PLAY_AUDIO_10), - safe_for_read(GPCMD_PLAY_AUDIO_MSF), - safe_for_read(GPCMD_PLAY_AUDIO_TI), - safe_for_read(GPCMD_PAUSE_RESUME), - - /* CD/DVD data reading */ - safe_for_read(GPCMD_READ_BUFFER_CAPACITY), - safe_for_read(GPCMD_READ_CD), - safe_for_read(GPCMD_READ_CD_MSF), - safe_for_read(GPCMD_READ_DISC_INFO), - safe_for_read(GPCMD_READ_CDVD_CAPACITY), - safe_for_read(GPCMD_READ_DVD_STRUCTURE), - safe_for_read(GPCMD_READ_HEADER), - safe_for_read(GPCMD_READ_TRACK_RZONE_INFO), - safe_for_read(GPCMD_READ_SUBCHANNEL), - safe_for_read(GPCMD_READ_TOC_PMA_ATIP), - safe_for_read(GPCMD_REPORT_KEY), - safe_for_read(GPCMD_SCAN), - safe_for_read(GPCMD_GET_CONFIGURATION), - safe_for_read(GPCMD_READ_FORMAT_CAPACITIES), - safe_for_read(GPCMD_GET_EVENT_STATUS_NOTIFICATION), - safe_for_read(GPCMD_GET_PERFORMANCE), - safe_for_read(GPCMD_SEEK), - safe_for_read(GPCMD_STOP_PLAY_SCAN), - - /* Basic writing commands */ - safe_for_write(WRITE_6), - safe_for_write(WRITE_10), - safe_for_write(WRITE_VERIFY), - safe_for_write(WRITE_12), - safe_for_write(WRITE_VERIFY_12), - safe_for_write(WRITE_16), - safe_for_write(WRITE_LONG), - safe_for_write(WRITE_LONG_2), - safe_for_write(ERASE), - safe_for_write(GPCMD_MODE_SELECT_10), - safe_for_write(MODE_SELECT), - safe_for_write(LOG_SELECT), - safe_for_write(GPCMD_BLANK), - safe_for_write(GPCMD_CLOSE_TRACK), - safe_for_write(GPCMD_FLUSH_CACHE), - safe_for_write(GPCMD_FORMAT_UNIT), - safe_for_write(GPCMD_REPAIR_RZONE_TRACK), - safe_for_write(GPCMD_RESERVE_RZONE_TRACK), - safe_for_write(GPCMD_SEND_DVD_STRUCTURE), - safe_for_write(GPCMD_SEND_EVENT), - safe_for_write(GPCMD_SEND_KEY), - safe_for_write(GPCMD_SEND_OPC), - safe_for_write(GPCMD_SEND_CUE_SHEET), - safe_for_write(GPCMD_SET_SPEED), - safe_for_write(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL), - safe_for_write(GPCMD_LOAD_UNLOAD), - safe_for_write(GPCMD_SET_STREAMING), - }; - unsigned char type = cmd_type[cmd[0]]; - - /* Anybody who can open the device can do a read-safe command */ - if (type & CMD_READ_SAFE) - return 0; - - /* Write-safe commands just require a writable open.. */ - if ((type & CMD_WRITE_SAFE) && has_write_perm) - return 0; - - /* And root can do any command.. */ - if (capable(CAP_SYS_RAWIO)) - return 0; - - if (!type) { - cmd_type[cmd[0]] = CMD_WARNED; - printk(KERN_WARNING "scsi: unknown opcode 0x%02x\n", cmd[0]); - } - - /* Otherwise fail it with an "Operation not permitted" */ - return -EPERM; -} -EXPORT_SYMBOL_GPL(blk_verify_command); - static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, - struct sg_io_hdr *hdr, int has_write_perm) + struct sg_io_hdr *hdr, struct file *file) { if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(rq->cmd, has_write_perm)) + if (blk_verify_command(file, rq->cmd)) return -EPERM; /* @@ -287,7 +179,7 @@ static int sg_io(struct file *file, struct request_queue *q, struct gendisk *bd_disk, struct sg_io_hdr *hdr) { unsigned long start_time; - int writing = 0, ret = 0, has_write_perm = 0; + int writing = 0, ret = 0; struct request *rq; char sense[SCSI_SENSE_BUFFERSIZE]; struct bio *bio; @@ -316,10 +208,7 @@ static int sg_io(struct file *file, struct request_queue *q, if (!rq) return -ENOMEM; - if (file) - has_write_perm = file->f_mode & FMODE_WRITE; - - if (blk_fill_sghdr_rq(q, rq, hdr, has_write_perm)) { + if (blk_fill_sghdr_rq(q, rq, hdr, file)) { blk_put_request(rq); return -EFAULT; } @@ -451,7 +340,7 @@ int sg_scsi_ioctl(struct file *file, struct request_queue *q, if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(rq->cmd, file->f_mode & FMODE_WRITE); + err = blk_verify_command(file, rq->cmd); if (err) goto error; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ea0edd1..f7abcca 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -182,8 +182,9 @@ static int sg_build_sgat(Sg_scatter_hold * schp, const Sg_fd * sfp, int tablesize); static ssize_t sg_new_read(Sg_fd * sfp, char __user *buf, size_t count, Sg_request * srp); -static ssize_t sg_new_write(Sg_fd * sfp, const char __user *buf, size_t count, - int blocking, int read_only, Sg_request ** o_srp); +static ssize_t sg_new_write(Sg_fd *sfp, struct file *file, + const char __user *buf, size_t count, int blocking, + int read_only, Sg_request **o_srp); static int sg_common_write(Sg_fd * sfp, Sg_request * srp, unsigned char *cmnd, int timeout, int blocking); static int sg_u_iovec(sg_io_hdr_t * hp, int sg_num, int ind, @@ -204,7 +205,6 @@ static Sg_request *sg_get_rq_mark(Sg_fd * sfp, int pack_id); static Sg_request *sg_add_request(Sg_fd * sfp); static int sg_remove_request(Sg_fd * sfp, Sg_request * srp); static int sg_res_in_use(Sg_fd * sfp); -static int sg_allow_access(unsigned char opcode, char dev_type); static int sg_build_direct(Sg_request * srp, Sg_fd * sfp, int dxfer_len); static Sg_device *sg_get_dev(int dev); #ifdef CONFIG_SCSI_PROC_FS @@ -544,7 +544,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) return -EFAULT; blocking = !(filp->f_flags & O_NONBLOCK); if (old_hdr.reply_len < 0) - return sg_new_write(sfp, buf, count, blocking, 0, NULL); + return sg_new_write(sfp, filp, buf, count, blocking, 0, NULL); if (count < (SZ_SG_HEADER + 6)) return -EIO; /* The minimum scsi command length is 6 bytes. */ @@ -621,8 +621,9 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) } static ssize_t -sg_new_write(Sg_fd * sfp, const char __user *buf, size_t count, - int blocking, int read_only, Sg_request ** o_srp) +sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf, + size_t count, int blocking, int read_only, + Sg_request **o_srp) { int k; Sg_request *srp; @@ -678,8 +679,7 @@ sg_new_write(Sg_fd * sfp, const char __user *buf, size_t count, sg_remove_request(sfp, srp); return -EFAULT; } - if (read_only && - (!sg_allow_access(cmnd[0], sfp->parentdp->device->type))) { + if (read_only && (!blk_verify_command(file, cmnd))) { sg_remove_request(sfp, srp); return -EPERM; } @@ -799,7 +799,7 @@ sg_ioctl(struct inode *inode, struct file *filp, if (!access_ok(VERIFY_WRITE, p, SZ_SG_IO_HDR)) return -EFAULT; result = - sg_new_write(sfp, p, SZ_SG_IO_HDR, + sg_new_write(sfp, filp, p, SZ_SG_IO_HDR, blocking, read_only, &srp); if (result < 0) return result; @@ -1048,7 +1048,7 @@ sg_ioctl(struct inode *inode, struct file *filp, if (copy_from_user(&opcode, siocp->data, 1)) return -EFAULT; - if (!sg_allow_access(opcode, sdp->device->type)) + if (!blk_verify_command(filp, &opcode)) return -EPERM; } return sg_scsi_ioctl(filp, sdp->device->request_queue, NULL, p); @@ -2506,26 +2506,6 @@ sg_page_free(struct page *page, int size) #define MAINTENANCE_IN_CMD 0xa3 #endif -static unsigned char allow_ops[] = { TEST_UNIT_READY, REQUEST_SENSE, - INQUIRY, READ_CAPACITY, READ_BUFFER, READ_6, READ_10, READ_12, - READ_16, MODE_SENSE, MODE_SENSE_10, LOG_SENSE, REPORT_LUNS, - SERVICE_ACTION_IN, RECEIVE_DIAGNOSTIC, READ_LONG, MAINTENANCE_IN_CMD -}; - -static int -sg_allow_access(unsigned char opcode, char dev_type) -{ - int k; - - if (TYPE_SCANNER == dev_type) /* TYPE_ROM maybe burner */ - return 1; - for (k = 0; k < sizeof (allow_ops); ++k) { - if (opcode == allow_ops[k]) - return 1; - } - return 0; -} - #ifdef CONFIG_SCSI_PROC_FS static int sg_idr_max_id(int id, void *p, void *data) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d3ae9ad..a842b77 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -671,7 +671,6 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); -extern int blk_verify_command(unsigned char *, int); extern void blk_unplug(struct request_queue *q); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) @@ -797,6 +796,15 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, extern int blkdev_issue_flush(struct block_device *, sector_t *); +/* +* command filter functions +*/ +extern int blk_verify_command(struct file *file, unsigned char *cmd); +extern int blk_cmd_filter_verify_command(struct blk_scsi_cmd_filter *filter, + unsigned char *cmd, mode_t *f_mode); +extern int blk_register_filter(struct gendisk *disk); +extern void blk_unregister_filter(struct gendisk *disk); + #define MAX_PHYS_SEGMENTS 128 #define MAX_HW_SEGMENTS 128 #define SAFE_MAX_SECTORS 255 diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 524ec96..e878741 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -110,6 +110,14 @@ struct hd_struct { #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 #define GENHD_FL_FAIL 64 +#define BLK_SCSI_MAX_CMDS (256) +#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) + +struct blk_scsi_cmd_filter { + unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; + unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; + struct kobject kobj; +}; struct gendisk { int major; /* major number of driver */ @@ -120,6 +128,7 @@ struct gendisk { struct hd_struct **part; /* [indexed by minor] */ struct block_device_operations *fops; struct request_queue *queue; + struct blk_scsi_cmd_filter cmd_filter; void *private_data; sector_t capacity; -- cgit v0.10.2 From c265a7f41706cee20508de5b4a919214cfd7a11b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jun 2008 13:49:33 +0200 Subject: cfq-iosched: get rid of enable_idle being unused warning Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0ebb626..1e2aff8 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1731,7 +1731,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) return; - old_idle = cfq_cfqq_idle_window(cfqq); + enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (cfqd->hw_tag && CIC_SEEKY(cic))) -- cgit v0.10.2 From 07359fc61bb8ed786f96a1c24cca6f94dd17e329 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 26 Jun 2008 19:39:23 +0200 Subject: block: add bounce support to blk_rq_map_user_iov blk_rq_map_user_iov can't handle the bounce buffer (it means that the bio_map_user_iov path doesn't work with a LLD that needs GFP_DMA). This patch fixes blk_rq_map_user_iov to support the bounce buffer. Signed-off-by: FUJITA Tomonori Cc: Mike Christie Signed-off-by: Jens Axboe diff --git a/block/blk-map.c b/block/blk-map.c index 0b1af5a..813011e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -210,6 +210,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; + blk_queue_bounce(q, &bio); bio_get(bio); blk_rq_bio_prep(q, rq, bio); rq->buffer = rq->data = NULL; -- cgit v0.10.2 From 2b272d4f7953a73ea1c1f7ba33d5a2d7439ce71b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jun 2008 19:45:54 +0200 Subject: sg: fix odd style (extra parenthesis) introduced by cmd filter patch Signed-off-by: Jens Axboe diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index f7abcca..62b5bd5 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -679,7 +679,7 @@ sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf, sg_remove_request(sfp, srp); return -EFAULT; } - if (read_only && (!blk_verify_command(file, cmnd))) { + if (read_only && !blk_verify_command(file, cmnd)) { sg_remove_request(sfp, srp); return -EPERM; } -- cgit v0.10.2 From 06a452e5b95eb669b7ad414ccf587dfc2d91b217 Mon Sep 17 00:00:00 2001 From: Adel Gadllah Date: Fri, 27 Jun 2008 09:16:17 +0200 Subject: cmdfilter: extend default read filter This patch adds the commands that the former sg filter allowed for read access to the cmdfilter to keep userspace apps that rely on them working. Signed-off-by: Adel Gadllah Signed-off-by: Jens Axboe diff --git a/block/cmd-filter.c b/block/cmd-filter.c index 35e327c..eec4404 100644 --- a/block/cmd-filter.c +++ b/block/cmd-filter.c @@ -219,6 +219,10 @@ static struct kobj_type rcf_ktype = { .default_attrs = default_attrs, }; +#ifndef MAINTENANCE_IN_CMD +#define MAINTENANCE_IN_CMD 0xa3 +#endif + static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter) { /* Basic read-only commands */ @@ -230,6 +234,7 @@ static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter) __set_bit(READ_16, filter->read_ok); __set_bit(READ_BUFFER, filter->read_ok); __set_bit(READ_DEFECT_DATA, filter->read_ok); + __set_bit(READ_CAPACITY, filter->read_ok); __set_bit(READ_LONG, filter->read_ok); __set_bit(INQUIRY, filter->read_ok); __set_bit(MODE_SENSE, filter->read_ok); @@ -238,6 +243,10 @@ static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter) __set_bit(START_STOP, filter->read_ok); __set_bit(GPCMD_VERIFY_10, filter->read_ok); __set_bit(VERIFY_16, filter->read_ok); + __set_bit(REPORT_LUNS, filter->read_ok); + __set_bit(SERVICE_ACTION_IN, filter->read_ok); + __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok); + __set_bit(MAINTENANCE_IN_CMD, filter->read_ok); __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); /* Audio CD commands */ diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 62b5bd5..fe694f0 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -2502,10 +2502,6 @@ sg_page_free(struct page *page, int size) __free_pages(page, order); } -#ifndef MAINTENANCE_IN_CMD -#define MAINTENANCE_IN_CMD 0xa3 -#endif - #ifdef CONFIG_SCSI_PROC_FS static int sg_idr_max_id(int id, void *p, void *data) -- cgit v0.10.2 From b24498d477a14680fc3bb3ad884fa9fa76a2d237 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Jun 2008 09:12:09 +0200 Subject: block: integrity flags can't use bit ops on unsigned short Just use normal open coded bit operations instead, they need not be atomic. Signed-off-by: Jens Axboe diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 4ffa381..3f1a847 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -217,17 +217,16 @@ static ssize_t integrity_read_store(struct blk_integrity *bi, unsigned long val = simple_strtoul(p, &p, 10); if (val) - set_bit(INTEGRITY_FLAG_READ, &bi->flags); + bi->flags |= INTEGRITY_FLAG_READ; else - clear_bit(INTEGRITY_FLAG_READ, &bi->flags); + bi->flags &= ~INTEGRITY_FLAG_READ; return count; } static ssize_t integrity_read_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", - test_bit(INTEGRITY_FLAG_READ, &bi->flags) ? 1 : 0); + return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_READ) != 0); } static ssize_t integrity_write_store(struct blk_integrity *bi, @@ -237,17 +236,16 @@ static ssize_t integrity_write_store(struct blk_integrity *bi, unsigned long val = simple_strtoul(p, &p, 10); if (val) - set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->flags |= INTEGRITY_FLAG_WRITE; else - clear_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->flags &= ~INTEGRITY_FLAG_WRITE; return count; } static ssize_t integrity_write_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", - test_bit(INTEGRITY_FLAG_WRITE, &bi->flags) ? 1 : 0); + return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_WRITE) != 0); } static struct integrity_sysfs_entry integrity_format_entry = { @@ -340,8 +338,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) kobject_uevent(&bi->kobj, KOBJ_ADD); - set_bit(INTEGRITY_FLAG_READ, &bi->flags); - set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE; bi->sector_size = disk->queue->hardsect_size; disk->integrity = bi; } else diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a842b77..7ab8aca 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -870,8 +870,8 @@ void kblockd_flush_work(struct work_struct *work); #if defined(CONFIG_BLK_DEV_INTEGRITY) -#define INTEGRITY_FLAG_READ 1 /* verify data integrity on read */ -#define INTEGRITY_FLAG_WRITE 2 /* generate data integrity on write */ +#define INTEGRITY_FLAG_READ 2 /* verify data integrity on read */ +#define INTEGRITY_FLAG_WRITE 4 /* generate data integrity on write */ struct blk_integrity_exchg { void *prot_buf; @@ -940,11 +940,11 @@ static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) return 0; if (rw == READ && bi->verify_fn != NULL && - test_bit(INTEGRITY_FLAG_READ, &bi->flags)) + (bi->flags & INTEGRITY_FLAG_READ)) return 1; if (rw == WRITE && bi->generate_fn != NULL && - test_bit(INTEGRITY_FLAG_WRITE, &bi->flags)) + (bi->flags & INTEGRITY_FLAG_WRITE)) return 1; return 0; -- cgit v0.10.2 From cc371e66e340f35eed8dc4651c7c18e754c7fb26 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 3 Jul 2008 09:53:43 +0200 Subject: Add bvec_merge_data to handle stacked devices and ->merge_bvec() When devices are stacked, one device's merge_bvec_fn may need to perform the mapping and then call one or more functions for its underlying devices. The following bio fields are used: bio->bi_sector bio->bi_bdev bio->bi_size bio->bi_rw using bio_data_dir() This patch creates a new struct bvec_merge_data holding a copy of those fields to avoid having to change them directly in the struct bio when going down the stack only to have to change them back again on the way back up. (And then when the bio gets mapped for real, the whole exercise gets repeated, but that's a problem for another day...) Signed-off-by: Alasdair G Kergon Cc: Neil Brown Cc: Milan Broz Signed-off-by: Jens Axboe diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 3ba1df9..589850c 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2633,11 +2633,12 @@ end_io: -static int pkt_merge_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *bvec) +static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, + struct bio_vec *bvec) { struct pktcdvd_device *pd = q->queuedata; - sector_t zone = ZONE(bio->bi_sector, pd); - int used = ((bio->bi_sector - zone) << 9) + bio->bi_size; + sector_t zone = ZONE(bmd->bi_sector, pd); + int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size; int remaining = (pd->settings.size << 9) - used; int remaining2; @@ -2645,7 +2646,7 @@ static int pkt_merge_bvec(struct request_queue *q, struct bio *bio, struct bio_v * A bio <= PAGE_SIZE must be allowed. If it crosses a packet * boundary, pkt_make_request() will split the bio. */ - remaining2 = PAGE_SIZE - bio->bi_size; + remaining2 = PAGE_SIZE - bmd->bi_size; remaining = max(remaining, remaining2); BUG_ON(remaining < 0); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 1074824..6a866d7 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -50,17 +50,19 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) /** * linear_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can take at this offset */ -static int linear_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int linear_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; dev_info_t *dev0; - unsigned long maxsectors, bio_sectors = bio->bi_size >> 9; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); dev0 = which_dev(mddev, sector); maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1)); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 914c04d..bcbb825 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -241,18 +241,20 @@ static int create_strip_zones (mddev_t *mddev) /** * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset */ -static int raid0_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid0_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a71277b..22bb2b1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -439,26 +439,27 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset * If near_copies == raid_disk, there are no striping issues, * but in that case, the function isn't called at all. */ -static int raid10_mergeable_bvec(struct request_queue *q, struct bio *bio, - struct bio_vec *bio_vec) +static int raid10_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= bio_vec->bv_len && bio_sectors == 0) - return bio_vec->bv_len; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; else return max; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 54c8ee2..9b00675 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3319,15 +3319,17 @@ static int raid5_congested(void *data, int bits) /* We want read requests to align with chunks where possible, * but write requests don't need to. */ -static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid5_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; - if (bio_data_dir(bio) == WRITE) + if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; diff --git a/fs/bio.c b/fs/bio.c index 7761c84..88322b0 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -325,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { prev->bv_len += len; - if (q->merge_bvec_fn && - q->merge_bvec_fn(q, bio, prev) < len) { - prev->bv_len -= len; - return 0; + + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + + if (q->merge_bvec_fn(q, &bvm, prev) < len) { + prev->bv_len -= len; + return 0; + } } goto done; @@ -369,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * queue to get further control */ if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + /* * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, bio, bvec) < len) { + if (q->merge_bvec_fn(q, &bvm, bvec) < len) { bvec->bv_page = NULL; bvec->bv_len = 0; bvec->bv_offset = 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7ab8aca..ff9d0bd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -254,7 +254,14 @@ typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unplug_fn) (struct request_queue *); struct bio_vec; -typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *); +struct bvec_merge_data { + struct block_device *bi_bdev; + sector_t bi_sector; + unsigned bi_size; + unsigned long bi_rw; +}; +typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *, + struct bio_vec *); typedef void (prepare_flush_fn) (struct request_queue *, struct request *); typedef void (softirq_done_fn)(struct request *); typedef int (dma_drain_needed_fn)(struct request *); -- cgit v0.10.2 From e180f5949327e897bc35a816f4f4010186632df9 Mon Sep 17 00:00:00 2001 From: maximilian attems Date: Tue, 1 Jul 2008 09:42:47 +0200 Subject: block: request_module(): use format string Avoid bad things happening if the module has a printk control string in its name. Signed-off-by: maximilian attems Signed-off-by: Jens Axboe diff --git a/block/elevator.c b/block/elevator.c index 1f5bfe6..ed6f8f3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -150,7 +150,7 @@ static struct elevator_type *elevator_get(const char *name) else sprintf(elv, "%s-iosched", name); - request_module(elv); + request_module("%s", elv); spin_lock(&elv_list_lock); e = elevator_find(name); } -- cgit v0.10.2 From e48ec69005f02b70b7ecfde1bc39a599086d16ef Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jul 2008 13:18:54 +0200 Subject: block: extend queue_flag bitops Add test_and_clear and test_and_set. Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index e0fb0bc..dbc7f42 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -205,8 +205,7 @@ void blk_plug_device(struct request_queue *q) if (blk_queue_stopped(q)) return; - if (!test_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { - __set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags); + if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); } @@ -221,10 +220,9 @@ int blk_remove_plug(struct request_queue *q) { WARN_ON(!irqs_disabled()); - if (!test_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) + if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) return 0; - queue_flag_clear(QUEUE_FLAG_PLUGGED, q); del_timer(&q->unplug_timer); return 1; } @@ -328,8 +326,7 @@ void blk_start_queue(struct request_queue *q) * one level of recursion is ok and is much faster than kicking * the unplug handling */ - if (!test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { - queue_flag_set(QUEUE_FLAG_REENTER, q); + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { q->request_fn(q); queue_flag_clear(QUEUE_FLAG_REENTER, q); } else { @@ -394,8 +391,7 @@ void __blk_run_queue(struct request_queue *q) * handling reinvoke the handler shortly if we already got there. */ if (!elv_queue_empty(q)) { - if (!test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { - queue_flag_set(QUEUE_FLAG_REENTER, q); + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { q->request_fn(q); queue_flag_clear(QUEUE_FLAG_REENTER, q); } else { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ff9d0bd..e04c4ac 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -428,6 +428,32 @@ static inline void queue_flag_set_unlocked(unsigned int flag, __set_bit(flag, &q->queue_flags); } +static inline int queue_flag_test_and_clear(unsigned int flag, + struct request_queue *q) +{ + WARN_ON_ONCE(!queue_is_locked(q)); + + if (test_bit(flag, &q->queue_flags)) { + __clear_bit(flag, &q->queue_flags); + return 1; + } + + return 0; +} + +static inline int queue_flag_test_and_set(unsigned int flag, + struct request_queue *q) +{ + WARN_ON_ONCE(!queue_is_locked(q)); + + if (!test_bit(flag, &q->queue_flags)) { + __set_bit(flag, &q->queue_flags); + return 0; + } + + return 1; +} + static inline void queue_flag_set(unsigned int flag, struct request_queue *q) { WARN_ON_ONCE(!queue_is_locked(q)); -- cgit v0.10.2 From 823ed72e8fe566983b121e8cc3147dd50ce63a8a Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 4 Jul 2008 09:28:32 +0200 Subject: block: use get_unaligned_* helpers Signed-off-by: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 41f818b..2f17462 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1003,7 +1003,7 @@ aoecmd_cfg_rsp(struct sk_buff *skb) * Enough people have their dip switches set backwards to * warrant a loud message for this special case. */ - aoemajor = be16_to_cpu(get_unaligned(&h->major)); + aoemajor = get_unaligned_be16(&h->major); if (aoemajor == 0xfff) { printk(KERN_ERR "aoe: Warning: shelf address is all ones. " "Check shelf dip switches.\n"); -- cgit v0.10.2 From be1fd70fea1100c57f3aa1934ebb93abc474e50c Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 4 Jul 2008 09:51:21 +0200 Subject: paride: push ioctl down into driver Leaves us with lock_kernel for two methods. Also remove a bogus printk with no printk level and return -ENOTTY not -EINVAL for correctness. Signed-off-by: Alan Cox Signed-off-by: Andrew Morton (Jens: added smp_lock.h include to pt.c, otherwise it wont compile because of missing {un}lock_kernel() definition) Signed-off-by: Jens Axboe diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 8b9549a..27455ee 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -146,6 +146,7 @@ static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3}; #include #include #include /* current, TASK_*, schedule_timeout() */ +#include #include @@ -189,8 +190,7 @@ module_param_array(drive3, int, NULL, 0); #define ATAPI_LOG_SENSE 0x4d static int pt_open(struct inode *inode, struct file *file); -static int pt_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg); +static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg); static int pt_release(struct inode *inode, struct file *file); static ssize_t pt_read(struct file *filp, char __user *buf, size_t count, loff_t * ppos); @@ -236,7 +236,7 @@ static const struct file_operations pt_fops = { .owner = THIS_MODULE, .read = pt_read, .write = pt_write, - .ioctl = pt_ioctl, + .unlocked_ioctl = pt_ioctl, .open = pt_open, .release = pt_release, }; @@ -685,8 +685,7 @@ out: return err; } -static int pt_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct pt_unit *tape = file->private_data; struct mtop __user *p = (void __user *)arg; @@ -700,23 +699,26 @@ static int pt_ioctl(struct inode *inode, struct file *file, switch (mtop.mt_op) { case MTREW: + lock_kernel(); pt_rewind(tape); + unlock_kernel(); return 0; case MTWEOF: + lock_kernel(); pt_write_fm(tape); + unlock_kernel(); return 0; default: - printk("%s: Unimplemented mt_op %d\n", tape->name, + /* FIXME: rate limit ?? */ + printk(KERN_DEBUG "%s: Unimplemented mt_op %d\n", tape->name, mtop.mt_op); return -EINVAL; } default: - printk("%s: Unimplemented ioctl 0x%x\n", tape->name, cmd); - return -EINVAL; - + return -ENOTTY; } } -- cgit v0.10.2 From 5b6155ee70e9c4d2ad7e6f514c8eee06e2711c3a Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 4 Jul 2008 09:29:16 +0200 Subject: pktcdvd: push BKL down into driver Push the lock_kernel down into the driver and switch to unlocked_ioctl [akpm@linux-foundation.org: build fix] Signed-off-by: Alan Cox Acked-by: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 589850c..85e44d0 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -2797,9 +2798,14 @@ out_mem: return ret; } -static int pkt_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) +static long pkt_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct pktcdvd_device *pd = inode->i_bdev->bd_disk->private_data; + struct inode *inode = file->f_path.dentry->d_inode; + struct pktcdvd_device *pd; + long ret; + + lock_kernel(); + pd = inode->i_bdev->bd_disk->private_data; VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd, imajor(inode), iminor(inode)); @@ -2812,7 +2818,8 @@ static int pkt_ioctl(struct inode *inode, struct file *file, unsigned int cmd, u case CDROM_LAST_WRITTEN: case CDROM_SEND_PACKET: case SCSI_IOCTL_SEND_COMMAND: - return blkdev_ioctl(pd->bdev->bd_inode, file, cmd, arg); + ret = blkdev_ioctl(pd->bdev->bd_inode, file, cmd, arg); + break; case CDROMEJECT: /* @@ -2821,14 +2828,15 @@ static int pkt_ioctl(struct inode *inode, struct file *file, unsigned int cmd, u */ if (pd->refcnt == 1) pkt_lock_door(pd, 0); - return blkdev_ioctl(pd->bdev->bd_inode, file, cmd, arg); + ret = blkdev_ioctl(pd->bdev->bd_inode, file, cmd, arg); + break; default: VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd); - return -ENOTTY; + ret = -ENOTTY; } - - return 0; + unlock_kernel(); + return ret; } static int pkt_media_changed(struct gendisk *disk) @@ -2850,7 +2858,7 @@ static struct block_device_operations pktcdvd_ops = { .owner = THIS_MODULE, .open = pkt_open, .release = pkt_close, - .ioctl = pkt_ioctl, + .unlocked_ioctl = pkt_ioctl, .media_changed = pkt_media_changed, }; @@ -3015,7 +3023,8 @@ static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd) mutex_unlock(&ctl_mutex); } -static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) +static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) { void __user *argp = (void __user *)arg; struct pkt_ctrl_command ctrl_cmd; @@ -3032,16 +3041,22 @@ static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cm case PKT_CTRL_CMD_SETUP: if (!capable(CAP_SYS_ADMIN)) return -EPERM; + lock_kernel(); ret = pkt_setup_dev(new_decode_dev(ctrl_cmd.dev), &pkt_dev); ctrl_cmd.pkt_dev = new_encode_dev(pkt_dev); + unlock_kernel(); break; case PKT_CTRL_CMD_TEARDOWN: if (!capable(CAP_SYS_ADMIN)) return -EPERM; + lock_kernel(); ret = pkt_remove_dev(new_decode_dev(ctrl_cmd.pkt_dev)); + unlock_kernel(); break; case PKT_CTRL_CMD_STATUS: + lock_kernel(); pkt_get_status(&ctrl_cmd); + unlock_kernel(); break; default: return -ENOTTY; @@ -3054,7 +3069,7 @@ static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cm static const struct file_operations pkt_ctl_fops = { - .ioctl = pkt_ctl_ioctl, + .unlocked_ioctl = pkt_ctl_ioctl, .owner = THIS_MODULE, }; -- cgit v0.10.2 From 2610324fcacf38a24b630090ebcb802538763187 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 4 Jul 2008 09:29:31 +0200 Subject: DAC960: push down BKL Signed-off-by: Alan Cox Cc: Richard Knutsson Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index cd03473..a002a38 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -6628,15 +6628,18 @@ static void DAC960_DestroyProcEntries(DAC960_Controller_T *Controller) * DAC960_gam_ioctl is the ioctl function for performing RAID operations. */ -static int DAC960_gam_ioctl(struct inode *inode, struct file *file, - unsigned int Request, unsigned long Argument) +static long DAC960_gam_ioctl(struct file *file, unsigned int Request, + unsigned long Argument) { - int ErrorCode = 0; + long ErrorCode = 0; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + lock_kernel(); switch (Request) { case DAC960_IOCTL_GET_CONTROLLER_COUNT: - return DAC960_ControllerCount; + ErrorCode = DAC960_ControllerCount; + break; case DAC960_IOCTL_GET_CONTROLLER_INFO: { DAC960_ControllerInfo_T __user *UserSpaceControllerInfo = @@ -6644,15 +6647,20 @@ static int DAC960_gam_ioctl(struct inode *inode, struct file *file, DAC960_ControllerInfo_T ControllerInfo; DAC960_Controller_T *Controller; int ControllerNumber; - if (UserSpaceControllerInfo == NULL) return -EINVAL; - ErrorCode = get_user(ControllerNumber, + if (UserSpaceControllerInfo == NULL) + ErrorCode = -EINVAL; + else ErrorCode = get_user(ControllerNumber, &UserSpaceControllerInfo->ControllerNumber); - if (ErrorCode != 0) return ErrorCode; + if (ErrorCode != 0) + break;; + ErrorCode = -ENXIO; if (ControllerNumber < 0 || - ControllerNumber > DAC960_ControllerCount - 1) - return -ENXIO; + ControllerNumber > DAC960_ControllerCount - 1) { + break; + } Controller = DAC960_Controllers[ControllerNumber]; - if (Controller == NULL) return -ENXIO; + if (Controller == NULL) + break;; memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T)); ControllerInfo.ControllerNumber = ControllerNumber; ControllerInfo.FirmwareType = Controller->FirmwareType; @@ -6665,8 +6673,9 @@ static int DAC960_gam_ioctl(struct inode *inode, struct file *file, ControllerInfo.PCI_Address = Controller->PCI_Address; strcpy(ControllerInfo.ModelName, Controller->ModelName); strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion); - return (copy_to_user(UserSpaceControllerInfo, &ControllerInfo, + ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo, sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0); + break; } case DAC960_IOCTL_V1_EXECUTE_COMMAND: { @@ -6684,30 +6693,39 @@ static int DAC960_gam_ioctl(struct inode *inode, struct file *file, int ControllerNumber, DataTransferLength; unsigned char *DataTransferBuffer = NULL; dma_addr_t DataTransferBufferDMA; - if (UserSpaceUserCommand == NULL) return -EINVAL; + if (UserSpaceUserCommand == NULL) { + ErrorCode = -EINVAL; + break; + } if (copy_from_user(&UserCommand, UserSpaceUserCommand, sizeof(DAC960_V1_UserCommand_T))) { ErrorCode = -EFAULT; - goto Failure1a; + break; } ControllerNumber = UserCommand.ControllerNumber; + ErrorCode = -ENXIO; if (ControllerNumber < 0 || ControllerNumber > DAC960_ControllerCount - 1) - return -ENXIO; + break; Controller = DAC960_Controllers[ControllerNumber]; - if (Controller == NULL) return -ENXIO; - if (Controller->FirmwareType != DAC960_V1_Controller) return -EINVAL; + if (Controller == NULL) + break; + ErrorCode = -EINVAL; + if (Controller->FirmwareType != DAC960_V1_Controller) + break; CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode; DataTransferLength = UserCommand.DataTransferLength; - if (CommandOpcode & 0x80) return -EINVAL; + if (CommandOpcode & 0x80) + break; if (CommandOpcode == DAC960_V1_DCDB) { if (copy_from_user(&DCDB, UserCommand.DCDB, sizeof(DAC960_V1_DCDB_T))) { ErrorCode = -EFAULT; - goto Failure1a; + break; } - if (DCDB.Channel >= DAC960_V1_MaxChannels) return -EINVAL; + if (DCDB.Channel >= DAC960_V1_MaxChannels) + break; if (!((DataTransferLength == 0 && DCDB.Direction == DAC960_V1_DCDB_NoDataTransfer) || @@ -6717,38 +6735,37 @@ static int DAC960_gam_ioctl(struct inode *inode, struct file *file, (DataTransferLength < 0 && DCDB.Direction == DAC960_V1_DCDB_DataTransferSystemToDevice))) - return -EINVAL; + break; if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength) != abs(DataTransferLength)) - return -EINVAL; + break; DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA); - if (DCDB_IOBUF == NULL) - return -ENOMEM; + if (DCDB_IOBUF == NULL) { + ErrorCode = -ENOMEM; + break; + } } + ErrorCode = -ENOMEM; if (DataTransferLength > 0) { DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, DataTransferLength, &DataTransferBufferDMA); - if (DataTransferBuffer == NULL) { - ErrorCode = -ENOMEM; - goto Failure1; - } + if (DataTransferBuffer == NULL) + break; memset(DataTransferBuffer, 0, DataTransferLength); } else if (DataTransferLength < 0) { DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, -DataTransferLength, &DataTransferBufferDMA); - if (DataTransferBuffer == NULL) { - ErrorCode = -ENOMEM; - goto Failure1; - } + if (DataTransferBuffer == NULL) + break; if (copy_from_user(DataTransferBuffer, UserCommand.DataTransferBuffer, -DataTransferLength)) { ErrorCode = -EFAULT; - goto Failure1; + break; } } if (CommandOpcode == DAC960_V1_DCDB) @@ -6825,8 +6842,7 @@ static int DAC960_gam_ioctl(struct inode *inode, struct file *file, if (DCDB_IOBUF != NULL) pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T), DCDB_IOBUF, DCDB_IOBUFDMA); - Failure1a: - return ErrorCode; + break; } case DAC960_IOCTL_V2_EXECUTE_COMMAND: { @@ -6844,32 +6860,43 @@ static int DAC960_gam_ioctl(struct inode *inode, struct file *file, dma_addr_t DataTransferBufferDMA; unsigned char *RequestSenseBuffer = NULL; dma_addr_t RequestSenseBufferDMA; - if (UserSpaceUserCommand == NULL) return -EINVAL; + + ErrorCode = -EINVAL; + if (UserSpaceUserCommand == NULL) + break; if (copy_from_user(&UserCommand, UserSpaceUserCommand, sizeof(DAC960_V2_UserCommand_T))) { ErrorCode = -EFAULT; - goto Failure2a; + break; } + ErrorCode = -ENXIO; ControllerNumber = UserCommand.ControllerNumber; if (ControllerNumber < 0 || ControllerNumber > DAC960_ControllerCount - 1) - return -ENXIO; + break; Controller = DAC960_Controllers[ControllerNumber]; - if (Controller == NULL) return -ENXIO; - if (Controller->FirmwareType != DAC960_V2_Controller) return -EINVAL; + if (Controller == NULL) + break; + if (Controller->FirmwareType != DAC960_V2_Controller){ + ErrorCode = -EINVAL; + break; + } DataTransferLength = UserCommand.DataTransferLength; + ErrorCode = -ENOMEM; if (DataTransferLength > 0) { DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, DataTransferLength, &DataTransferBufferDMA); - if (DataTransferBuffer == NULL) return -ENOMEM; + if (DataTransferBuffer == NULL) + break; memset(DataTransferBuffer, 0, DataTransferLength); } else if (DataTransferLength < 0) { DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, -DataTransferLength, &DataTransferBufferDMA); - if (DataTransferBuffer == NULL) return -ENOMEM; + if (DataTransferBuffer == NULL) + break; if (copy_from_user(DataTransferBuffer, UserCommand.DataTransferBuffer, -DataTransferLength)) { @@ -6979,8 +7006,7 @@ static int DAC960_gam_ioctl(struct inode *inode, struct file *file, if (RequestSenseBuffer != NULL) pci_free_consistent(Controller->PCIDevice, RequestSenseLength, RequestSenseBuffer, RequestSenseBufferDMA); - Failure2a: - return ErrorCode; + break; } case DAC960_IOCTL_V2_GET_HEALTH_STATUS: { @@ -6990,21 +7016,33 @@ static int DAC960_gam_ioctl(struct inode *inode, struct file *file, DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer; DAC960_Controller_T *Controller; int ControllerNumber; - if (UserSpaceGetHealthStatus == NULL) return -EINVAL; + if (UserSpaceGetHealthStatus == NULL) { + ErrorCode = -EINVAL; + break; + } if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus, - sizeof(DAC960_V2_GetHealthStatus_T))) - return -EFAULT; + sizeof(DAC960_V2_GetHealthStatus_T))) { + ErrorCode = -EFAULT; + break; + } + ErrorCode = -ENXIO; ControllerNumber = GetHealthStatus.ControllerNumber; if (ControllerNumber < 0 || ControllerNumber > DAC960_ControllerCount - 1) - return -ENXIO; + break; Controller = DAC960_Controllers[ControllerNumber]; - if (Controller == NULL) return -ENXIO; - if (Controller->FirmwareType != DAC960_V2_Controller) return -EINVAL; + if (Controller == NULL) + break; + if (Controller->FirmwareType != DAC960_V2_Controller) { + ErrorCode = -EINVAL; + break; + } if (copy_from_user(&HealthStatusBuffer, GetHealthStatus.HealthStatusBuffer, - sizeof(DAC960_V2_HealthStatusBuffer_T))) - return -EFAULT; + sizeof(DAC960_V2_HealthStatusBuffer_T))) { + ErrorCode = -EFAULT; + break; + } while (Controller->V2.HealthStatusBuffer->StatusChangeCounter == HealthStatusBuffer.StatusChangeCounter && Controller->V2.HealthStatusBuffer->NextEventSequenceNumber @@ -7012,21 +7050,28 @@ static int DAC960_gam_ioctl(struct inode *inode, struct file *file, { interruptible_sleep_on_timeout(&Controller->HealthStatusWaitQueue, DAC960_MonitoringTimerInterval); - if (signal_pending(current)) return -EINTR; + if (signal_pending(current)) { + ErrorCode = -EINTR; + break; + } } if (copy_to_user(GetHealthStatus.HealthStatusBuffer, Controller->V2.HealthStatusBuffer, sizeof(DAC960_V2_HealthStatusBuffer_T))) - return -EFAULT; - return 0; + ErrorCode = -EFAULT; + else + ErrorCode = 0; } + default: + ErrorCode = -ENOTTY; } - return -EINVAL; + unlock_kernel(); + return ErrorCode; } static const struct file_operations DAC960_gam_fops = { .owner = THIS_MODULE, - .ioctl = DAC960_gam_ioctl + .unlocked_ioctl = DAC960_gam_ioctl }; static struct miscdevice DAC960_gam_dev = { -- cgit v0.10.2 From 27f8221af406e43b529a5425bc99c9b1e9bdf521 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 4 Jul 2008 09:30:03 +0200 Subject: block: add blk_queue_update_dma_pad This adds blk_queue_update_dma_pad to prevent LLDs from overwriting the dma pad mask wrongly (we added blk_queue_update_dma_alignment due to the same reason). This also converts libata to use blk_queue_update_dma_pad instead of blk_queue_dma_pad. Signed-off-by: FUJITA Tomonori Cc: Tejun Heo Cc: Bartlomiej Zolnierkiewicz Cc: Thomas Bogendoerfer Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/block/blk-settings.c b/block/blk-settings.c index 8dd8641..dfc7701 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -302,11 +302,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits); * @q: the request queue for the device * @mask: pad mask * - * Set pad mask. Direct IO requests are padded to the mask specified. + * Set dma pad mask. * - * Appending pad buffer to a request modifies ->data_len such that it - * includes the pad buffer. The original requested data length can be - * obtained using blk_rq_raw_data_len(). + * Appending pad buffer to a request modifies the last entry of a + * scatter list such that it includes the pad buffer. **/ void blk_queue_dma_pad(struct request_queue *q, unsigned int mask) { @@ -315,6 +314,23 @@ void blk_queue_dma_pad(struct request_queue *q, unsigned int mask) EXPORT_SYMBOL(blk_queue_dma_pad); /** + * blk_queue_update_dma_pad - update pad mask + * @q: the request queue for the device + * @mask: pad mask + * + * Update dma pad mask. + * + * Appending pad buffer to a request modifies the last entry of a + * scatter list such that it includes the pad buffer. + **/ +void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) +{ + if (mask > q->dma_pad_mask) + q->dma_pad_mask = mask; +} +EXPORT_SYMBOL(blk_queue_update_dma_pad); + +/** * blk_queue_dma_drain - Set up a drain buffer for excess dma. * @q: the request queue for the device * @dma_drain_needed: fn which returns non-zero if drain is necessary diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 57a4364..499ccc6 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -885,7 +885,8 @@ static int ata_scsi_dev_config(struct scsi_device *sdev, /* set the min alignment and padding */ blk_queue_update_dma_alignment(sdev->request_queue, ATA_DMA_PAD_SZ - 1); - blk_queue_dma_pad(sdev->request_queue, ATA_DMA_PAD_SZ - 1); + blk_queue_update_dma_pad(sdev->request_queue, + ATA_DMA_PAD_SZ - 1); /* configure draining */ buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e04c4ac..1ffd8bf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -776,6 +776,7 @@ extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_hardsect_size(struct request_queue *, unsigned short); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); extern void blk_queue_dma_pad(struct request_queue *, unsigned int); +extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size); -- cgit v0.10.2 From 30c00eda73d5db5bd64dd0c370161abd8df5ba4a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 4 Jul 2008 09:31:11 +0200 Subject: block: blk_rq_map_kern uses the bounce buffers for stack buffers blk_rq_map_kern is used for kernel internal I/Os. Some callers use this function with stack buffers but DMA to/from the stack buffers leads to memory corruption on a non-coherent platform. This patch make blk_rq_map_kern uses the bounce buffers if a caller passes a stack buffer (on the all platforms for simplicity). Signed-off-by: FUJITA Tomonori Cc: Bartlomiej Zolnierkiewicz Cc: Thomas Bogendoerfer Cc: Tejun Heo Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/block/blk-map.c b/block/blk-map.c index 813011e..ddd96fb 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -269,6 +269,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, int reading = rq_data_dir(rq) == READ; int do_copy = 0; struct bio *bio; + unsigned long stack_mask = ~(THREAD_SIZE - 1); if (len > (q->max_hw_sectors << 9)) return -EINVAL; @@ -279,6 +280,10 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, alignment = queue_dma_alignment(q) | q->dma_pad_mask; do_copy = ((kaddr & alignment) || (len & alignment)); + if (!((kaddr & stack_mask) ^ + ((unsigned long)current->stack & stack_mask))) + do_copy = 1; + if (do_copy) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else -- cgit v0.10.2 From 62858dacc8dea55c5bdb474ccd8acb0657e23dd0 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 4 Jul 2008 09:31:50 +0200 Subject: scsi: sr avoids useless buffer allocation blk_rq_map_kern can handle the stack buffers correctly (avoid DMA from/to the stack buffers by using the bounce buffer) so we don't need to complicate the code by allocating just 8 bytes. Signed-off-by: FUJITA Tomonori Cc: James Bottomley Cc: Bartlomiej Zolnierkiewicz Cc: Thomas Bogendoerfer Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index c82df8b..27f5bfd 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -673,24 +673,20 @@ fail: static void get_sectorsize(struct scsi_cd *cd) { unsigned char cmd[10]; - unsigned char *buffer; + unsigned char buffer[8]; int the_result, retries = 3; int sector_size; struct request_queue *queue; - buffer = kmalloc(512, GFP_KERNEL | GFP_DMA); - if (!buffer) - goto Enomem; - do { cmd[0] = READ_CAPACITY; memset((void *) &cmd[1], 0, 9); - memset(buffer, 0, 8); + memset(buffer, 0, sizeof(buffer)); /* Do the command and wait.. */ the_result = scsi_execute_req(cd->device, cmd, DMA_FROM_DEVICE, - buffer, 8, NULL, SR_TIMEOUT, - MAX_RETRIES); + buffer, sizeof(buffer), NULL, + SR_TIMEOUT, MAX_RETRIES); retries--; @@ -745,14 +741,8 @@ static void get_sectorsize(struct scsi_cd *cd) queue = cd->device->request_queue; blk_queue_hardsect_size(queue, sector_size); -out: - kfree(buffer); - return; -Enomem: - cd->capacity = 0x1fffff; - cd->device->sector_size = 2048; /* A guess, just in case */ - goto out; + return; } static void get_capabilities(struct scsi_cd *cd) -- cgit v0.10.2 From fce5384755e8e0e56c5609d2972db4702990d592 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 4 Jul 2008 09:33:01 +0200 Subject: cdrom: revert commit 22a9189 (cdrom: use kmalloced buffers instead of buffers on stack) The commit 22a9189fd073db3d03a4cf8b8c098aa207602de1 (cdrom: use kmalloced buffers instead of buffers on stack) is introduced to use kmalloced buffers for packet commands to avoid stack corruption on non coherent platforms. SCSI cdrom uses blk_rq_map_kern, which properly avoids DMA on the stack by using the bounce buffers. IDE cdrom also has the mechnism to avoids DMA on the stack. So we don't need this extra complexitiy in cdrom.c, such as allocating just 8 bytes. The lower layers can handle it. Signed-off-by: FUJITA Tomonori Cc: Thomas Bogendoerfer Cc: Bartlomiej Zolnierkiewicz Cc: Thomas Bogendoerfer Cc: Tejun Heo Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 69f26eb..a5da356 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -461,37 +461,27 @@ int cdrom_get_media_event(struct cdrom_device_info *cdi, struct media_event_desc *med) { struct packet_command cgc; - unsigned char *buffer; - struct event_header *eh; - int ret = 1; - - buffer = kmalloc(8, GFP_KERNEL); - if (!buffer) - return -ENOMEM; + unsigned char buffer[8]; + struct event_header *eh = (struct event_header *) buffer; - eh = (struct event_header *)buffer; - - init_cdrom_command(&cgc, buffer, 8, CGC_DATA_READ); + init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ); cgc.cmd[0] = GPCMD_GET_EVENT_STATUS_NOTIFICATION; cgc.cmd[1] = 1; /* IMMED */ cgc.cmd[4] = 1 << 4; /* media event */ - cgc.cmd[8] = 8; + cgc.cmd[8] = sizeof(buffer); cgc.quiet = 1; if (cdi->ops->generic_packet(cdi, &cgc)) - goto err; + return 1; if (be16_to_cpu(eh->data_len) < sizeof(*med)) - goto err; + return 1; if (eh->nea || eh->notification_class != 0x4) - goto err; + return 1; - memcpy(med, buffer + sizeof(*eh), sizeof(*med)); - ret = 0; -err: - kfree(buffer); - return ret; + memcpy(med, &buffer[sizeof(*eh)], sizeof(*med)); + return 0; } /* @@ -501,82 +491,68 @@ err: static int cdrom_mrw_probe_pc(struct cdrom_device_info *cdi) { struct packet_command cgc; - char *buffer; - int ret = 1; - - buffer = kmalloc(16, GFP_KERNEL); - if (!buffer) - return -ENOMEM; + char buffer[16]; - init_cdrom_command(&cgc, buffer, 16, CGC_DATA_READ); + init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ); cgc.timeout = HZ; cgc.quiet = 1; if (!cdrom_mode_sense(cdi, &cgc, MRW_MODE_PC, 0)) { cdi->mrw_mode_page = MRW_MODE_PC; - ret = 0; + return 0; } else if (!cdrom_mode_sense(cdi, &cgc, MRW_MODE_PC_PRE1, 0)) { cdi->mrw_mode_page = MRW_MODE_PC_PRE1; - ret = 0; + return 0; } - kfree(buffer); - return ret; + + return 1; } static int cdrom_is_mrw(struct cdrom_device_info *cdi, int *write) { struct packet_command cgc; struct mrw_feature_desc *mfd; - unsigned char *buffer; + unsigned char buffer[16]; int ret; *write = 0; - buffer = kmalloc(16, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - init_cdrom_command(&cgc, buffer, 16, CGC_DATA_READ); + init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ); cgc.cmd[0] = GPCMD_GET_CONFIGURATION; cgc.cmd[3] = CDF_MRW; - cgc.cmd[8] = 16; + cgc.cmd[8] = sizeof(buffer); cgc.quiet = 1; if ((ret = cdi->ops->generic_packet(cdi, &cgc))) - goto err; + return ret; mfd = (struct mrw_feature_desc *)&buffer[sizeof(struct feature_header)]; - if (be16_to_cpu(mfd->feature_code) != CDF_MRW) { - ret = 1; - goto err; - } + if (be16_to_cpu(mfd->feature_code) != CDF_MRW) + return 1; *write = mfd->write; if ((ret = cdrom_mrw_probe_pc(cdi))) { *write = 0; + return ret; } -err: - kfree(buffer); - return ret; + + return 0; } static int cdrom_mrw_bgformat(struct cdrom_device_info *cdi, int cont) { struct packet_command cgc; - unsigned char *buffer; + unsigned char buffer[12]; int ret; printk(KERN_INFO "cdrom: %sstarting format\n", cont ? "Re" : ""); - buffer = kmalloc(12, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - /* * FmtData bit set (bit 4), format type is 1 */ - init_cdrom_command(&cgc, buffer, 12, CGC_DATA_WRITE); + init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_WRITE); cgc.cmd[0] = GPCMD_FORMAT_UNIT; cgc.cmd[1] = (1 << 4) | 1; @@ -603,7 +579,6 @@ static int cdrom_mrw_bgformat(struct cdrom_device_info *cdi, int cont) if (ret) printk(KERN_INFO "cdrom: bgformat failed\n"); - kfree(buffer); return ret; } @@ -663,17 +638,16 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space) { struct packet_command cgc; struct mode_page_header *mph; - char *buffer; + char buffer[16]; int ret, offset, size; - buffer = kmalloc(16, GFP_KERNEL); - if (!buffer) - return -ENOMEM; + init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ); - init_cdrom_command(&cgc, buffer, 16, CGC_DATA_READ); + cgc.buffer = buffer; + cgc.buflen = sizeof(buffer); if ((ret = cdrom_mode_sense(cdi, &cgc, cdi->mrw_mode_page, 0))) - goto err; + return ret; mph = (struct mode_page_header *) buffer; offset = be16_to_cpu(mph->desc_length); @@ -683,70 +657,55 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space) cgc.buflen = size; if ((ret = cdrom_mode_select(cdi, &cgc))) - goto err; + return ret; printk(KERN_INFO "cdrom: %s: mrw address space %s selected\n", cdi->name, mrw_address_space[space]); - ret = 0; -err: - kfree(buffer); - return ret; + return 0; } static int cdrom_get_random_writable(struct cdrom_device_info *cdi, struct rwrt_feature_desc *rfd) { struct packet_command cgc; - char *buffer; + char buffer[24]; int ret; - buffer = kmalloc(24, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - init_cdrom_command(&cgc, buffer, 24, CGC_DATA_READ); + init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ); cgc.cmd[0] = GPCMD_GET_CONFIGURATION; /* often 0x46 */ cgc.cmd[3] = CDF_RWRT; /* often 0x0020 */ - cgc.cmd[8] = 24; /* often 0x18 */ + cgc.cmd[8] = sizeof(buffer); /* often 0x18 */ cgc.quiet = 1; if ((ret = cdi->ops->generic_packet(cdi, &cgc))) - goto err; + return ret; memcpy(rfd, &buffer[sizeof(struct feature_header)], sizeof (*rfd)); - ret = 0; -err: - kfree(buffer); - return ret; + return 0; } static int cdrom_has_defect_mgt(struct cdrom_device_info *cdi) { struct packet_command cgc; - char *buffer; + char buffer[16]; __be16 *feature_code; int ret; - buffer = kmalloc(16, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - init_cdrom_command(&cgc, buffer, 16, CGC_DATA_READ); + init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ); cgc.cmd[0] = GPCMD_GET_CONFIGURATION; cgc.cmd[3] = CDF_HWDM; - cgc.cmd[8] = 16; + cgc.cmd[8] = sizeof(buffer); cgc.quiet = 1; if ((ret = cdi->ops->generic_packet(cdi, &cgc))) - goto err; + return ret; feature_code = (__be16 *) &buffer[sizeof(struct feature_header)]; if (be16_to_cpu(*feature_code) == CDF_HWDM) - ret = 0; -err: - kfree(buffer); - return ret; + return 0; + + return 1; } @@ -837,14 +796,10 @@ static int cdrom_mrw_open_write(struct cdrom_device_info *cdi) static int mo_open_write(struct cdrom_device_info *cdi) { struct packet_command cgc; - char *buffer; + char buffer[255]; int ret; - buffer = kmalloc(255, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - init_cdrom_command(&cgc, buffer, 4, CGC_DATA_READ); + init_cdrom_command(&cgc, &buffer, 4, CGC_DATA_READ); cgc.quiet = 1; /* @@ -861,15 +816,10 @@ static int mo_open_write(struct cdrom_device_info *cdi) } /* drive gave us no info, let the user go ahead */ - if (ret) { - ret = 0; - goto err; - } + if (ret) + return 0; - ret = buffer[3] & 0x80; -err: - kfree(buffer); - return ret; + return buffer[3] & 0x80; } static int cdrom_ram_open_write(struct cdrom_device_info *cdi) @@ -892,19 +842,15 @@ static int cdrom_ram_open_write(struct cdrom_device_info *cdi) static void cdrom_mmc3_profile(struct cdrom_device_info *cdi) { struct packet_command cgc; - char *buffer; + char buffer[32]; int ret, mmc3_profile; - buffer = kmalloc(32, GFP_KERNEL); - if (!buffer) - return; - - init_cdrom_command(&cgc, buffer, 32, CGC_DATA_READ); + init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ); cgc.cmd[0] = GPCMD_GET_CONFIGURATION; cgc.cmd[1] = 0; cgc.cmd[2] = cgc.cmd[3] = 0; /* Starting Feature Number */ - cgc.cmd[8] = 32; /* Allocation Length */ + cgc.cmd[8] = sizeof(buffer); /* Allocation Length */ cgc.quiet = 1; if ((ret = cdi->ops->generic_packet(cdi, &cgc))) @@ -913,7 +859,6 @@ static void cdrom_mmc3_profile(struct cdrom_device_info *cdi) mmc3_profile = (buffer[6] << 8) | buffer[7]; cdi->mmc3_profile = mmc3_profile; - kfree(buffer); } static int cdrom_is_dvd_rw(struct cdrom_device_info *cdi) @@ -1628,15 +1573,12 @@ static void setup_send_key(struct packet_command *cgc, unsigned agid, unsigned t static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) { int ret; - u_char *buf; + u_char buf[20]; struct packet_command cgc; struct cdrom_device_ops *cdo = cdi->ops; - rpc_state_t *rpc_state; - - buf = kzalloc(20, GFP_KERNEL); - if (!buf) - return -ENOMEM; + rpc_state_t rpc_state; + memset(buf, 0, sizeof(buf)); init_cdrom_command(&cgc, buf, 0, CGC_DATA_READ); switch (ai->type) { @@ -1647,7 +1589,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) setup_report_key(&cgc, ai->lsa.agid, 0); if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; ai->lsa.agid = buf[7] >> 6; /* Returning data, let host change state */ @@ -1658,7 +1600,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) setup_report_key(&cgc, ai->lsk.agid, 2); if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; copy_key(ai->lsk.key, &buf[4]); /* Returning data, let host change state */ @@ -1669,7 +1611,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) setup_report_key(&cgc, ai->lsc.agid, 1); if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; copy_chal(ai->lsc.chal, &buf[4]); /* Returning data, let host change state */ @@ -1686,7 +1628,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) cgc.cmd[2] = ai->lstk.lba >> 24; if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; ai->lstk.cpm = (buf[4] >> 7) & 1; ai->lstk.cp_sec = (buf[4] >> 6) & 1; @@ -1700,7 +1642,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) setup_report_key(&cgc, ai->lsasf.agid, 5); if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; ai->lsasf.asf = buf[7] & 1; break; @@ -1713,7 +1655,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) copy_chal(&buf[4], ai->hsc.chal); if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; ai->type = DVD_LU_SEND_KEY1; break; @@ -1726,7 +1668,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) if ((ret = cdo->generic_packet(cdi, &cgc))) { ai->type = DVD_AUTH_FAILURE; - goto err; + return ret; } ai->type = DVD_AUTH_ESTABLISHED; break; @@ -1737,23 +1679,24 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) cdinfo(CD_DVD, "entering DVD_INVALIDATE_AGID\n"); setup_report_key(&cgc, ai->lsa.agid, 0x3f); if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; break; /* Get region settings */ case DVD_LU_SEND_RPC_STATE: cdinfo(CD_DVD, "entering DVD_LU_SEND_RPC_STATE\n"); setup_report_key(&cgc, 0, 8); + memset(&rpc_state, 0, sizeof(rpc_state_t)); + cgc.buffer = (char *) &rpc_state; if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; - rpc_state = (rpc_state_t *)buf; - ai->lrpcs.type = rpc_state->type_code; - ai->lrpcs.vra = rpc_state->vra; - ai->lrpcs.ucca = rpc_state->ucca; - ai->lrpcs.region_mask = rpc_state->region_mask; - ai->lrpcs.rpc_scheme = rpc_state->rpc_scheme; + ai->lrpcs.type = rpc_state.type_code; + ai->lrpcs.vra = rpc_state.vra; + ai->lrpcs.ucca = rpc_state.ucca; + ai->lrpcs.region_mask = rpc_state.region_mask; + ai->lrpcs.rpc_scheme = rpc_state.rpc_scheme; break; /* Set region settings */ @@ -1764,23 +1707,20 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) buf[4] = ai->hrpcs.pdrc; if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; break; default: cdinfo(CD_WARNING, "Invalid DVD key ioctl (%d)\n", ai->type); - ret = -ENOTTY; - goto err; + return -ENOTTY; } - ret = 0; -err: - kfree(buf); - return ret; + + return 0; } static int dvd_read_physical(struct cdrom_device_info *cdi, dvd_struct *s) { - unsigned char *buf, *base; + unsigned char buf[21], *base; struct dvd_layer *layer; struct packet_command cgc; struct cdrom_device_ops *cdo = cdi->ops; @@ -1789,11 +1729,7 @@ static int dvd_read_physical(struct cdrom_device_info *cdi, dvd_struct *s) if (layer_num >= DVD_LAYERS) return -EINVAL; - buf = kmalloc(21, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - init_cdrom_command(&cgc, buf, 21, CGC_DATA_READ); + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); cgc.cmd[0] = GPCMD_READ_DVD_STRUCTURE; cgc.cmd[6] = layer_num; cgc.cmd[7] = s->type; @@ -1805,7 +1741,7 @@ static int dvd_read_physical(struct cdrom_device_info *cdi, dvd_struct *s) cgc.quiet = 1; if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; base = &buf[4]; layer = &s->physical.layer[layer_num]; @@ -1829,24 +1765,17 @@ static int dvd_read_physical(struct cdrom_device_info *cdi, dvd_struct *s) layer->end_sector_l0 = base[13] << 16 | base[14] << 8 | base[15]; layer->bca = base[16] >> 7; - ret = 0; -err: - kfree(buf); - return ret; + return 0; } static int dvd_read_copyright(struct cdrom_device_info *cdi, dvd_struct *s) { int ret; - u_char *buf; + u_char buf[8]; struct packet_command cgc; struct cdrom_device_ops *cdo = cdi->ops; - buf = kmalloc(8, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - init_cdrom_command(&cgc, buf, 8, CGC_DATA_READ); + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); cgc.cmd[0] = GPCMD_READ_DVD_STRUCTURE; cgc.cmd[6] = s->copyright.layer_num; cgc.cmd[7] = s->type; @@ -1854,15 +1783,12 @@ static int dvd_read_copyright(struct cdrom_device_info *cdi, dvd_struct *s) cgc.cmd[9] = cgc.buflen & 0xff; if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; s->copyright.cpst = buf[4]; s->copyright.rmi = buf[5]; - ret = 0; -err: - kfree(buf); - return ret; + return 0; } static int dvd_read_disckey(struct cdrom_device_info *cdi, dvd_struct *s) @@ -1894,33 +1820,26 @@ static int dvd_read_disckey(struct cdrom_device_info *cdi, dvd_struct *s) static int dvd_read_bca(struct cdrom_device_info *cdi, dvd_struct *s) { int ret; - u_char *buf; + u_char buf[4 + 188]; struct packet_command cgc; struct cdrom_device_ops *cdo = cdi->ops; - buf = kmalloc(4 + 188, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - init_cdrom_command(&cgc, buf, 4 + 188, CGC_DATA_READ); + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); cgc.cmd[0] = GPCMD_READ_DVD_STRUCTURE; cgc.cmd[7] = s->type; cgc.cmd[9] = cgc.buflen & 0xff; if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; s->bca.len = buf[0] << 8 | buf[1]; if (s->bca.len < 12 || s->bca.len > 188) { cdinfo(CD_WARNING, "Received invalid BCA length (%d)\n", s->bca.len); - ret = -EIO; - goto err; + return -EIO; } memcpy(s->bca.value, &buf[4], s->bca.len); - ret = 0; -err: - kfree(buf); - return ret; + + return 0; } static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s) @@ -2020,13 +1939,9 @@ static int cdrom_read_subchannel(struct cdrom_device_info *cdi, { struct cdrom_device_ops *cdo = cdi->ops; struct packet_command cgc; - char *buffer; + char buffer[32]; int ret; - buffer = kmalloc(32, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - init_cdrom_command(&cgc, buffer, 16, CGC_DATA_READ); cgc.cmd[0] = GPCMD_READ_SUBCHANNEL; cgc.cmd[1] = 2; /* MSF addressing */ @@ -2035,7 +1950,7 @@ static int cdrom_read_subchannel(struct cdrom_device_info *cdi, cgc.cmd[8] = 16; if ((ret = cdo->generic_packet(cdi, &cgc))) - goto err; + return ret; subchnl->cdsc_audiostatus = cgc.buffer[1]; subchnl->cdsc_format = CDROM_MSF; @@ -2050,10 +1965,7 @@ static int cdrom_read_subchannel(struct cdrom_device_info *cdi, subchnl->cdsc_absaddr.msf.second = cgc.buffer[10]; subchnl->cdsc_absaddr.msf.frame = cgc.buffer[11]; - ret = 0; -err: - kfree(buffer); - return ret; + return 0; } /* -- cgit v0.10.2 From 7c0c0b5b19611ac15eec69043cb5588f6cbfbd7b Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Fri, 4 Jul 2008 09:33:17 +0200 Subject: drivers/block/pktcdvd.c: avoid useless memset Avoid the 'memset(...,0, ...)' before calling 'init_cdrom_command' because this function already does it. Signed-off-by: Christophe Jaillet Acked-by: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 85e44d0..45bee91 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2080,7 +2080,6 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, unsigned char buf[64]; int ret; - memset(buf, 0, sizeof(buf)); init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); cgc.sense = &sense; cgc.buflen = pd->mode_offset + 12; @@ -2127,7 +2126,6 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd, unsigned char *cap_buf; int ret, offset; - memset(buf, 0, sizeof(buf)); cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset]; init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN); cgc.sense = &sense; -- cgit v0.10.2 From 8b3d3567f72aa61d5d6f4ce89d289b154e1ea866 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Fri, 4 Jul 2008 09:33:33 +0200 Subject: ramfs: enable splice write Signed-off-by: Octavian Purdila Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 9590b90..78f613c 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = { .mmap = generic_file_mmap, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 0989bc2..52312ec 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = { .aio_write = generic_file_aio_write, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, }; -- cgit v0.10.2 From 32502b8413a77b54b9e19809404109590c32dfb7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 4 Jul 2008 09:35:17 +0200 Subject: splice: fix generic_file_splice_read() race with page invalidation If a page was invalidated during splicing from file to a pipe, then generic_file_splice_read() could return a short or zero count. This manifested itself in rare I/O errors seen on nfs exported fuse filesystems. This is because nfsd uses splice_direct_to_actor() to read files, and fuse uses invalidate_inode_pages2() to invalidate stale data on open. Fix by redoing the page find/create if it was found to be truncated (invalidated). Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index aa5f6f6..3994421 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -379,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, lock_page(page); /* - * page was truncated, stop here. if this isn't the - * first page, we'll just complete what we already - * added + * Page was truncated, or invalidated by the + * filesystem. Redo the find/create, but this time the + * page is kept locked, so there's no chance of another + * race with truncate/invalidate. */ if (!page->mapping) { unlock_page(page); - break; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + + if (!page) { + error = -ENOMEM; + break; + } + page_cache_release(pages[page_nr]); + pages[page_nr] = page; } /* * page was already under io and is now done, great -- cgit v0.10.2