From 88e740f1654bf28565edd528a060695c1f2b5ad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=C3=A1zquez=20Cao?= Date: Mon, 27 Oct 2008 18:44:46 +0900 Subject: block: add queue flag for paravirt frontend drivers As is the case with SSD devices, we do not want to idle in AS/CFQ when the block device is a paravirt front-end driver. This patch adds a flag (QUEUE_FLAG_VIRT) which should be used by front-end drivers such as virtio_blk and xen-blkfront to indicate a paravirtualized device. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Jens Axboe diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 031a315..482e960 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -449,6 +449,7 @@ struct request_queue #define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */ #define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */ #define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ +#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ static inline int queue_is_locked(struct request_queue *q) { -- cgit v0.10.2 From 7d116b626b843c66b17630a62d696e044d049bfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=C3=A1zquez=20Cao?= Date: Mon, 27 Oct 2008 18:45:15 +0900 Subject: virtio_blk: set queue paravirt flag As a paravirt front-end driver, virtio_blk is not a rotational device so we want to avoid idling in AS/CFQ. Tell the block layer about this. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Jens Axboe diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 85d79a0..f151592 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -237,6 +237,8 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_put_disk; } + queue_flag_set_unlocked(QUEUE_FLAG_VIRT, vblk->disk->queue); + if (index < 26) { sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26); } else if (index < (26 + 1) * 26) { -- cgit v0.10.2 From 66d352e1e410dcea22fecb9fa9ec09dd23a62e1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=C3=A1zquez=20Cao?= Date: Mon, 27 Oct 2008 18:45:54 +0900 Subject: xen-blkfront: set queue paravirt flag Xen's blkfront sets noop as the default I/O scheduler at initialization time to avoid elevator overheads such as idling, but with the advent of basic disk profiling capabilities this is not necessary anymore. We should just tell the block layer that we are a paravirt front-end driver and the elevator will automatically make the necessary adjustments. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Jens Axboe diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2d19f0c..918ef725 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -338,18 +338,12 @@ wait: static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) { struct request_queue *rq; - elevator_t *old_e; rq = blk_init_queue(do_blkif_request, &blkif_io_lock); if (rq == NULL) return -1; - old_e = rq->elevator; - if (IS_ERR_VALUE(elevator_init(rq, "noop"))) - printk(KERN_WARNING - "blkfront: Switch elevator failed, use default\n"); - else - elevator_exit(old_e); + queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); /* Hard sector size and max sectors impersonate the equiv. hardware.
*/ blk_queue_hardsect_size(rq, sector_size); -- cgit v0.10.2 From 565e411d764eeda006738dfadbccca79d48381e1 Mon Sep 17 00:00:00 2001 From: "malahal@us.ibm.com" Date: Thu, 30 Oct 2008 08:51:58 +0100 Subject: block: optimizations in blk_rq_timed_out_timer() Now the rq->deadline can't be zero if the request is in the timeout_list, so there is no need to have next_set. There is also no need to access a request's deadline field once blk_rq_timed_out() has been called on it. Signed-off-by: Malahal Naineni Signed-off-by: Jens Axboe diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 69185ea..116bbf3 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -111,7 +111,7 @@ static void blk_rq_timed_out(struct request *req) void blk_rq_timed_out_timer(unsigned long data) { struct request_queue *q = (struct request_queue *) data; - unsigned long flags, uninitialized_var(next), next_set = 0; + unsigned long flags, next = 0; struct request *rq, *tmp; spin_lock_irqsave(q->queue_lock, flags); @@ -126,12 +126,10 @@ void blk_rq_timed_out_timer(unsigned long data) if (blk_mark_rq_complete(rq)) continue; blk_rq_timed_out(rq); + } else { + if (!next || time_after(next, rq->deadline)) + next = rq->deadline; } - if (!next_set) { - next = rq->deadline; - next_set = 1; - } else if (time_after(next, rq->deadline)) - next = rq->deadline; } if (next_set && !list_empty(&q->timeout_list)) -- cgit v0.10.2 From 65d3618ccfe686e8d7b3f01a838d0578182406df Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 Oct 2008 08:53:02 +0100 Subject: block: add comment in blk_rq_timed_out() about why next can not be 0 Signed-off-by: Jens Axboe diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 116bbf3..99c3efc 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -132,7 +132,12 @@ void blk_rq_timed_out_timer(unsigned long data) } } - if (next_set && !list_empty(&q->timeout_list)) + /* + * next can never be 0 here with the list non-empty, since we always + * bump ->deadline to 1 so we can detect if the timer was ever added + * or not. See comment in blk_add_timer() + */ + if (next) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); -- cgit v0.10.2 From 70ed28b92a786f44750ab64117b03d126dd14656 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Nov 2008 14:38:39 +0100 Subject: block: leave the request timeout timer running even on an empty list Sync IO is often done serialized. This means we'll be touching the queue timer for every IO, as opposed to only occasionally like we do for queued IO. Instead of deleting the timer when the last request is removed, just let it continue running. If a new request comes up soon we then don't have to re-add the timer. If no new requests arrive, the timer will expire without side effect later. This improves high iops sync IO by ~1%.
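In sketch form, the change reduces to the following (the _old/_new names are invented here purely for side-by-side comparison and do not appear in the patch; the actual hunks follow):

/* Before: unlinking the last pending request also killed the queue
 * timer, so serialized sync IO paid a del_timer()/mod_timer() pair on
 * every single request. */
static void blk_delete_timer_old(struct request *req)
{
	struct request_queue *q = req->q;

	list_del_init(&req->timeout_list);
	if (list_empty(&q->timeout_list))
		del_timer(&q->timeout);
}

/* After: just unlink the request and leave the timer armed. A later
 * expiry with an empty timeout_list is harmless: per the earlier
 * patch, the handler computes no new deadline (next stays 0) and
 * therefore never re-arms itself. */
static void blk_delete_timer_new(struct request *req)
{
	list_del_init(&req->timeout_list);
}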
Signed-off-by: Jens Axboe diff --git a/block/blk-core.c index 561e8a1..243d18b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -404,6 +404,7 @@ EXPORT_SYMBOL(blk_stop_queue); void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->unplug_timer); + del_timer_sync(&q->timeout); kblockd_flush_work(&q->unplug_work); } EXPORT_SYMBOL(blk_sync_queue); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 99c3efc..a095353 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -73,11 +73,7 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, */ void blk_delete_timer(struct request *req) { - struct request_queue *q = req->q; - list_del_init(&req->timeout_list); - if (list_empty(&q->timeout_list)) - del_timer(&q->timeout); } static void blk_rq_timed_out(struct request *req) -- cgit v0.10.2 From 2b91bafcc0fc545e489e9537a38f487706960ea5 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Mon, 17 Nov 2008 13:10:34 +0100 Subject: scsi-ioctl: use clock_t <> jiffies Convert the timeout ioctl scaling to use the clock_t functions which are much more accurate with some USER_HZ vs HZ combinations. Signed-off-by: Milton Miller Signed-off-by: Jens Axboe diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index d0bb92c..ee9c67d 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -60,7 +60,7 @@ static int scsi_get_bus(struct request_queue *q, int __user *p) static int sg_get_timeout(struct request_queue *q) { - return q->sg_timeout / (HZ / USER_HZ); + return jiffies_to_clock_t(q->sg_timeout); } static int sg_set_timeout(struct request_queue *q, int __user *p) { int timeout, err = get_user(timeout, p); if (!err) - q->sg_timeout = timeout * (HZ / USER_HZ); + q->sg_timeout = clock_t_to_jiffies(timeout); return err; } -- cgit v0.10.2 From 3147c531b6b53fbbf8942689c4aa113c519f5e4e Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 16 Nov 2008 19:04:47 +0100 Subject: cdrom: split mmc_ioctl to lower stack usage Checkstack output: Before: mmc_ioctl: 584 After: mmc_ioctl_dvd_read_struct: 280 mmc_ioctl_cdrom_subchannel: 152 mmc_ioctl_cdrom_read_data: 120 mmc_ioctl_cdrom_volume: 104 mmc_ioctl_cdrom_read_audio: 104 (mmc_ioctl is inlined into cdrom_ioctl - 104 bytes) Signed-off-by: Marcin Slusarz Cc: Bartlomiej Zolnierkiewicz Cc: Andrew Morton Cc: Jens Axboe Signed-off-by: Jens Axboe diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 7d2e91c..b927aa2 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2787,271 +2787,359 @@ static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size) return cdo->generic_packet(cdi, &cgc); } -static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd, - unsigned long arg) -{ - struct cdrom_device_ops *cdo = cdi->ops; - struct packet_command cgc; +static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi, + void __user *arg, + struct packet_command *cgc, + int cmd) +{ struct request_sense sense; - unsigned char buffer[32]; - int ret = 0; - - memset(&cgc, 0, sizeof(cgc)); + struct cdrom_msf msf; + int blocksize = 0, format = 0, lba; + int ret; - /* build a unified command and queue it through - cdo->generic_packet() */ switch (cmd) { case CDROMREADRAW: + blocksize = CD_FRAMESIZE_RAW; + break; case CDROMREADMODE1: - case CDROMREADMODE2: { - struct cdrom_msf msf; - int blocksize = 0, format = 0, lba; - - switch (cmd) { - case CDROMREADRAW: - blocksize =
CD_FRAMESIZE_RAW; - break; - case CDROMREADMODE1: - blocksize = CD_FRAMESIZE; - format = 2; - break; - case CDROMREADMODE2: - blocksize = CD_FRAMESIZE_RAW0; - break; - } - IOCTL_IN(arg, struct cdrom_msf, msf); - lba = msf_to_lba(msf.cdmsf_min0,msf.cdmsf_sec0,msf.cdmsf_frame0); - /* FIXME: we need upper bound checking, too!! */ - if (lba < 0) - return -EINVAL; - cgc.buffer = kmalloc(blocksize, GFP_KERNEL); - if (cgc.buffer == NULL) - return -ENOMEM; - memset(&sense, 0, sizeof(sense)); - cgc.sense = &sense; - cgc.data_direction = CGC_DATA_READ; - ret = cdrom_read_block(cdi, &cgc, lba, 1, format, blocksize); - if (ret && sense.sense_key==0x05 && sense.asc==0x20 && sense.ascq==0x00) { - /* - * SCSI-II devices are not required to support - * READ_CD, so let's try switching block size - */ - /* FIXME: switch back again... */ - if ((ret = cdrom_switch_blocksize(cdi, blocksize))) { - kfree(cgc.buffer); - return ret; - } - cgc.sense = NULL; - ret = cdrom_read_cd(cdi, &cgc, lba, blocksize, 1); - ret |= cdrom_switch_blocksize(cdi, blocksize); - } - if (!ret && copy_to_user((char __user *)arg, cgc.buffer, blocksize)) - ret = -EFAULT; - kfree(cgc.buffer); + blocksize = CD_FRAMESIZE; + format = 2; + break; + case CDROMREADMODE2: + blocksize = CD_FRAMESIZE_RAW0; + break; + } + IOCTL_IN(arg, struct cdrom_msf, msf); + lba = msf_to_lba(msf.cdmsf_min0, msf.cdmsf_sec0, msf.cdmsf_frame0); + /* FIXME: we need upper bound checking, too!! */ + if (lba < 0) + return -EINVAL; + + cgc->buffer = kmalloc(blocksize, GFP_KERNEL); + if (cgc->buffer == NULL) + return -ENOMEM; + + memset(&sense, 0, sizeof(sense)); + cgc->sense = &sense; + cgc->data_direction = CGC_DATA_READ; + ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize); + if (ret && sense.sense_key == 0x05 && + sense.asc == 0x20 && + sense.ascq == 0x00) { + /* + * SCSI-II devices are not required to support + * READ_CD, so let's try switching block size + */ + /* FIXME: switch back again... */ + ret = cdrom_switch_blocksize(cdi, blocksize); + if (ret) + goto out; + cgc->sense = NULL; + ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1); + ret |= cdrom_switch_blocksize(cdi, blocksize); + } + if (!ret && copy_to_user(arg, cgc->buffer, blocksize)) + ret = -EFAULT; +out: + kfree(cgc->buffer); + return ret; +} + +static noinline int mmc_ioctl_cdrom_read_audio(struct cdrom_device_info *cdi, + void __user *arg) +{ + struct cdrom_read_audio ra; + int lba; + + IOCTL_IN(arg, struct cdrom_read_audio, ra); + + if (ra.addr_format == CDROM_MSF) + lba = msf_to_lba(ra.addr.msf.minute, + ra.addr.msf.second, + ra.addr.msf.frame); + else if (ra.addr_format == CDROM_LBA) + lba = ra.addr.lba; + else + return -EINVAL; + + /* FIXME: we need upper bound checking, too!! 
*/ + if (lba < 0 || ra.nframes <= 0 || ra.nframes > CD_FRAMES) + return -EINVAL; + + return cdrom_read_cdda(cdi, ra.buf, lba, ra.nframes); +} + +static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi, + void __user *arg) +{ + int ret; + struct cdrom_subchnl q; + u_char requested, back; + IOCTL_IN(arg, struct cdrom_subchnl, q); + requested = q.cdsc_format; + if (!((requested == CDROM_MSF) || + (requested == CDROM_LBA))) + return -EINVAL; + q.cdsc_format = CDROM_MSF; + ret = cdrom_read_subchannel(cdi, &q, 0); + if (ret) return ret; - } - case CDROMREADAUDIO: { - struct cdrom_read_audio ra; - int lba; - - IOCTL_IN(arg, struct cdrom_read_audio, ra); - - if (ra.addr_format == CDROM_MSF) - lba = msf_to_lba(ra.addr.msf.minute, - ra.addr.msf.second, - ra.addr.msf.frame); - else if (ra.addr_format == CDROM_LBA) - lba = ra.addr.lba; - else - return -EINVAL; + back = q.cdsc_format; /* local copy */ + sanitize_format(&q.cdsc_absaddr, &back, requested); + sanitize_format(&q.cdsc_reladdr, &q.cdsc_format, requested); + IOCTL_OUT(arg, struct cdrom_subchnl, q); + /* cdinfo(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */ + return 0; +} - /* FIXME: we need upper bound checking, too!! */ - if (lba < 0 || ra.nframes <= 0 || ra.nframes > CD_FRAMES) - return -EINVAL; +static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi, + void __user *arg, + struct packet_command *cgc) +{ + struct cdrom_device_ops *cdo = cdi->ops; + struct cdrom_msf msf; + cdinfo(CD_DO_IOCTL, "entering CDROMPLAYMSF\n"); + IOCTL_IN(arg, struct cdrom_msf, msf); + cgc->cmd[0] = GPCMD_PLAY_AUDIO_MSF; + cgc->cmd[3] = msf.cdmsf_min0; + cgc->cmd[4] = msf.cdmsf_sec0; + cgc->cmd[5] = msf.cdmsf_frame0; + cgc->cmd[6] = msf.cdmsf_min1; + cgc->cmd[7] = msf.cdmsf_sec1; + cgc->cmd[8] = msf.cdmsf_frame1; + cgc->data_direction = CGC_DATA_NONE; + return cdo->generic_packet(cdi, cgc); +} - return cdrom_read_cdda(cdi, ra.buf, lba, ra.nframes); - } - case CDROMSUBCHNL: { - struct cdrom_subchnl q; - u_char requested, back; - IOCTL_IN(arg, struct cdrom_subchnl, q); - requested = q.cdsc_format; - if (!((requested == CDROM_MSF) || - (requested == CDROM_LBA))) - return -EINVAL; - q.cdsc_format = CDROM_MSF; - if ((ret = cdrom_read_subchannel(cdi, &q, 0))) - return ret; - back = q.cdsc_format; /* local copy */ - sanitize_format(&q.cdsc_absaddr, &back, requested); - sanitize_format(&q.cdsc_reladdr, &q.cdsc_format, requested); - IOCTL_OUT(arg, struct cdrom_subchnl, q); - /* cdinfo(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */ - return 0; - } - case CDROMPLAYMSF: { - struct cdrom_msf msf; - cdinfo(CD_DO_IOCTL, "entering CDROMPLAYMSF\n"); - IOCTL_IN(arg, struct cdrom_msf, msf); - cgc.cmd[0] = GPCMD_PLAY_AUDIO_MSF; - cgc.cmd[3] = msf.cdmsf_min0; - cgc.cmd[4] = msf.cdmsf_sec0; - cgc.cmd[5] = msf.cdmsf_frame0; - cgc.cmd[6] = msf.cdmsf_min1; - cgc.cmd[7] = msf.cdmsf_sec1; - cgc.cmd[8] = msf.cdmsf_frame1; - cgc.data_direction = CGC_DATA_NONE; - return cdo->generic_packet(cdi, &cgc); - } - case CDROMPLAYBLK: { - struct cdrom_blk blk; - cdinfo(CD_DO_IOCTL, "entering CDROMPLAYBLK\n"); - IOCTL_IN(arg, struct cdrom_blk, blk); - cgc.cmd[0] = GPCMD_PLAY_AUDIO_10; - cgc.cmd[2] = (blk.from >> 24) & 0xff; - cgc.cmd[3] = (blk.from >> 16) & 0xff; - cgc.cmd[4] = (blk.from >> 8) & 0xff; - cgc.cmd[5] = blk.from & 0xff; - cgc.cmd[7] = (blk.len >> 8) & 0xff; - cgc.cmd[8] = blk.len & 0xff; - cgc.data_direction = CGC_DATA_NONE; - return cdo->generic_packet(cdi, &cgc); - } - case CDROMVOLCTRL: - case CDROMVOLREAD: { - struct cdrom_volctrl volctrl; - char 
mask[sizeof(buffer)]; - unsigned short offset; +static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi, + void __user *arg, + struct packet_command *cgc) +{ + struct cdrom_device_ops *cdo = cdi->ops; + struct cdrom_blk blk; + cdinfo(CD_DO_IOCTL, "entering CDROMPLAYBLK\n"); + IOCTL_IN(arg, struct cdrom_blk, blk); + cgc->cmd[0] = GPCMD_PLAY_AUDIO_10; + cgc->cmd[2] = (blk.from >> 24) & 0xff; + cgc->cmd[3] = (blk.from >> 16) & 0xff; + cgc->cmd[4] = (blk.from >> 8) & 0xff; + cgc->cmd[5] = blk.from & 0xff; + cgc->cmd[7] = (blk.len >> 8) & 0xff; + cgc->cmd[8] = blk.len & 0xff; + cgc->data_direction = CGC_DATA_NONE; + return cdo->generic_packet(cdi, cgc); +} - cdinfo(CD_DO_IOCTL, "entering CDROMVOLUME\n"); +static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi, + void __user *arg, + struct packet_command *cgc, + unsigned int cmd) +{ + struct cdrom_volctrl volctrl; + unsigned char buffer[32]; + char mask[sizeof(buffer)]; + unsigned short offset; + int ret; + + cdinfo(CD_DO_IOCTL, "entering CDROMVOLUME\n"); - IOCTL_IN(arg, struct cdrom_volctrl, volctrl); + IOCTL_IN(arg, struct cdrom_volctrl, volctrl); - cgc.buffer = buffer; - cgc.buflen = 24; - if ((ret = cdrom_mode_sense(cdi, &cgc, GPMODE_AUDIO_CTL_PAGE, 0))) - return ret; + cgc->buffer = buffer; + cgc->buflen = 24; + ret = cdrom_mode_sense(cdi, cgc, GPMODE_AUDIO_CTL_PAGE, 0); + if (ret) + return ret; - /* originally the code depended on buffer[1] to determine - how much data is available for transfer. buffer[1] is - unfortunately ambigious and the only reliable way seem - to be to simply skip over the block descriptor... */ - offset = 8 + be16_to_cpu(*(__be16 *)(buffer+6)); - - if (offset + 16 > sizeof(buffer)) - return -E2BIG; - - if (offset + 16 > cgc.buflen) { - cgc.buflen = offset+16; - ret = cdrom_mode_sense(cdi, &cgc, - GPMODE_AUDIO_CTL_PAGE, 0); - if (ret) - return ret; - } + /* originally the code depended on buffer[1] to determine + how much data is available for transfer. buffer[1] is + unfortunately ambigious and the only reliable way seem + to be to simply skip over the block descriptor... */ + offset = 8 + be16_to_cpu(*(__be16 *)(buffer + 6)); + + if (offset + 16 > sizeof(buffer)) + return -E2BIG; + + if (offset + 16 > cgc->buflen) { + cgc->buflen = offset + 16; + ret = cdrom_mode_sense(cdi, cgc, + GPMODE_AUDIO_CTL_PAGE, 0); + if (ret) + return ret; + } - /* sanity check */ - if ((buffer[offset] & 0x3f) != GPMODE_AUDIO_CTL_PAGE || - buffer[offset+1] < 14) - return -EINVAL; + /* sanity check */ + if ((buffer[offset] & 0x3f) != GPMODE_AUDIO_CTL_PAGE || + buffer[offset + 1] < 14) + return -EINVAL; - /* now we have the current volume settings. if it was only - a CDROMVOLREAD, return these values */ - if (cmd == CDROMVOLREAD) { - volctrl.channel0 = buffer[offset+9]; - volctrl.channel1 = buffer[offset+11]; - volctrl.channel2 = buffer[offset+13]; - volctrl.channel3 = buffer[offset+15]; - IOCTL_OUT(arg, struct cdrom_volctrl, volctrl); - return 0; - } + /* now we have the current volume settings. 
if it was only + a CDROMVOLREAD, return these values */ + if (cmd == CDROMVOLREAD) { + volctrl.channel0 = buffer[offset+9]; + volctrl.channel1 = buffer[offset+11]; + volctrl.channel2 = buffer[offset+13]; + volctrl.channel3 = buffer[offset+15]; + IOCTL_OUT(arg, struct cdrom_volctrl, volctrl); + return 0; + } - /* get the volume mask */ - cgc.buffer = mask; - if ((ret = cdrom_mode_sense(cdi, &cgc, - GPMODE_AUDIO_CTL_PAGE, 1))) - return ret; + /* get the volume mask */ + cgc->buffer = mask; + ret = cdrom_mode_sense(cdi, cgc, GPMODE_AUDIO_CTL_PAGE, 1); + if (ret) + return ret; - buffer[offset+9] = volctrl.channel0 & mask[offset+9]; - buffer[offset+11] = volctrl.channel1 & mask[offset+11]; - buffer[offset+13] = volctrl.channel2 & mask[offset+13]; - buffer[offset+15] = volctrl.channel3 & mask[offset+15]; + buffer[offset + 9] = volctrl.channel0 & mask[offset + 9]; + buffer[offset + 11] = volctrl.channel1 & mask[offset + 11]; + buffer[offset + 13] = volctrl.channel2 & mask[offset + 13]; + buffer[offset + 15] = volctrl.channel3 & mask[offset + 15]; - /* set volume */ - cgc.buffer = buffer + offset - 8; - memset(cgc.buffer, 0, 8); - return cdrom_mode_select(cdi, &cgc); - } + /* set volume */ + cgc->buffer = buffer + offset - 8; + memset(cgc->buffer, 0, 8); + return cdrom_mode_select(cdi, cgc); +} - case CDROMSTART: - case CDROMSTOP: { - cdinfo(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n"); - cgc.cmd[0] = GPCMD_START_STOP_UNIT; - cgc.cmd[1] = 1; - cgc.cmd[4] = (cmd == CDROMSTART) ? 1 : 0; - cgc.data_direction = CGC_DATA_NONE; - return cdo->generic_packet(cdi, &cgc); - } +static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi, + struct packet_command *cgc, + int cmd) +{ + struct cdrom_device_ops *cdo = cdi->ops; + cdinfo(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n"); + cgc->cmd[0] = GPCMD_START_STOP_UNIT; + cgc->cmd[1] = 1; + cgc->cmd[4] = (cmd == CDROMSTART) ? 1 : 0; + cgc->data_direction = CGC_DATA_NONE; + return cdo->generic_packet(cdi, cgc); +} - case CDROMPAUSE: - case CDROMRESUME: { - cdinfo(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n"); - cgc.cmd[0] = GPCMD_PAUSE_RESUME; - cgc.cmd[8] = (cmd == CDROMRESUME) ? 1 : 0; - cgc.data_direction = CGC_DATA_NONE; - return cdo->generic_packet(cdi, &cgc); - } +static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi, + struct packet_command *cgc, + int cmd) +{ + struct cdrom_device_ops *cdo = cdi->ops; + cdinfo(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n"); + cgc->cmd[0] = GPCMD_PAUSE_RESUME; + cgc->cmd[8] = (cmd == CDROMRESUME) ? 
1 : 0; + cgc->data_direction = CGC_DATA_NONE; + return cdo->generic_packet(cdi, cgc); +} - case DVD_READ_STRUCT: { - dvd_struct *s; - int size = sizeof(dvd_struct); - if (!CDROM_CAN(CDC_DVD)) - return -ENOSYS; - if ((s = kmalloc(size, GFP_KERNEL)) == NULL) - return -ENOMEM; - cdinfo(CD_DO_IOCTL, "entering DVD_READ_STRUCT\n"); - if (copy_from_user(s, (dvd_struct __user *)arg, size)) { - kfree(s); - return -EFAULT; - } - if ((ret = dvd_read_struct(cdi, s))) { - kfree(s); - return ret; - } - if (copy_to_user((dvd_struct __user *)arg, s, size)) - ret = -EFAULT; +static noinline int mmc_ioctl_dvd_read_struct(struct cdrom_device_info *cdi, + void __user *arg) +{ + int ret; + dvd_struct *s; + int size = sizeof(dvd_struct); + + if (!CDROM_CAN(CDC_DVD)) + return -ENOSYS; + + s = kmalloc(size, GFP_KERNEL); + if (!s) + return -ENOMEM; + + cdinfo(CD_DO_IOCTL, "entering DVD_READ_STRUCT\n"); + if (copy_from_user(s, arg, size)) { kfree(s); + return -EFAULT; + } + + ret = dvd_read_struct(cdi, s); + if (ret) + goto out; + + if (copy_to_user(arg, s, size)) + ret = -EFAULT; +out: + kfree(s); + return ret; +} + +static noinline int mmc_ioctl_dvd_auth(struct cdrom_device_info *cdi, + void __user *arg) +{ + int ret; + dvd_authinfo ai; + if (!CDROM_CAN(CDC_DVD)) + return -ENOSYS; + cdinfo(CD_DO_IOCTL, "entering DVD_AUTH\n"); + IOCTL_IN(arg, dvd_authinfo, ai); + ret = dvd_do_auth(cdi, &ai); + if (ret) return ret; - } + IOCTL_OUT(arg, dvd_authinfo, ai); + return 0; +} - case DVD_AUTH: { - dvd_authinfo ai; - if (!CDROM_CAN(CDC_DVD)) - return -ENOSYS; - cdinfo(CD_DO_IOCTL, "entering DVD_AUTH\n"); - IOCTL_IN(arg, dvd_authinfo, ai); - if ((ret = dvd_do_auth (cdi, &ai))) - return ret; - IOCTL_OUT(arg, dvd_authinfo, ai); - return 0; - } +static noinline int mmc_ioctl_cdrom_next_writable(struct cdrom_device_info *cdi, + void __user *arg) +{ + int ret; + long next = 0; + cdinfo(CD_DO_IOCTL, "entering CDROM_NEXT_WRITABLE\n"); + ret = cdrom_get_next_writable(cdi, &next); + if (ret) + return ret; + IOCTL_OUT(arg, long, next); + return 0; +} - case CDROM_NEXT_WRITABLE: { - long next = 0; - cdinfo(CD_DO_IOCTL, "entering CDROM_NEXT_WRITABLE\n"); - if ((ret = cdrom_get_next_writable(cdi, &next))) - return ret; - IOCTL_OUT(arg, long, next); - return 0; - } - case CDROM_LAST_WRITTEN: { - long last = 0; - cdinfo(CD_DO_IOCTL, "entering CDROM_LAST_WRITTEN\n"); - if ((ret = cdrom_get_last_written(cdi, &last))) - return ret; - IOCTL_OUT(arg, long, last); - return 0; - } - } /* switch */ +static noinline int mmc_ioctl_cdrom_last_written(struct cdrom_device_info *cdi, + void __user *arg) +{ + int ret; + long last = 0; + cdinfo(CD_DO_IOCTL, "entering CDROM_LAST_WRITTEN\n"); + ret = cdrom_get_last_written(cdi, &last); + if (ret) + return ret; + IOCTL_OUT(arg, long, last); + return 0; +} + +static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd, + unsigned long arg) +{ + struct packet_command cgc; + void __user *userptr = (void __user *)arg; + + memset(&cgc, 0, sizeof(cgc)); + + /* build a unified command and queue it through + cdo->generic_packet() */ + switch (cmd) { + case CDROMREADRAW: + case CDROMREADMODE1: + case CDROMREADMODE2: + return mmc_ioctl_cdrom_read_data(cdi, userptr, &cgc, cmd); + case CDROMREADAUDIO: + return mmc_ioctl_cdrom_read_audio(cdi, userptr); + case CDROMSUBCHNL: + return mmc_ioctl_cdrom_subchannel(cdi, userptr); + case CDROMPLAYMSF: + return mmc_ioctl_cdrom_play_msf(cdi, userptr, &cgc); + case CDROMPLAYBLK: + return mmc_ioctl_cdrom_play_blk(cdi, userptr, &cgc); + case CDROMVOLCTRL: + case 
CDROMVOLREAD: + return mmc_ioctl_cdrom_volume(cdi, userptr, &cgc, cmd); + case CDROMSTART: + case CDROMSTOP: + return mmc_ioctl_cdrom_start_stop(cdi, &cgc, cmd); + case CDROMPAUSE: + case CDROMRESUME: + return mmc_ioctl_cdrom_pause_resume(cdi, &cgc, cmd); + case DVD_READ_STRUCT: + return mmc_ioctl_dvd_read_struct(cdi, userptr); + case DVD_AUTH: + return mmc_ioctl_dvd_auth(cdi, userptr); + case CDROM_NEXT_WRITABLE: + return mmc_ioctl_cdrom_next_writable(cdi, userptr); + case CDROM_LAST_WRITTEN: + return mmc_ioctl_cdrom_last_written(cdi, userptr); + } return -ENOTTY; } -- cgit v0.10.2 From d194139c18edb7dae45e159a24f2aa72091033b8 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 16 Nov 2008 19:06:37 +0100 Subject: cdrom: reduce stack usage of mmc_ioctl_dvd_read_struct 1. kmalloc 192 bytes in dvd_read_bca (which is inlined into dvd_read_struct) 2. Pass struct packet_command to all dvd_read_* functions. Checkstack output: Before: mmc_ioctl_dvd_read_struct: 280 After: mmc_ioctl_dvd_read_struct: 56 Signed-off-by: Marcin Slusarz Cc: Bartlomiej Zolnierkiewicz Cc: Andrew Morton Cc: Jens Axboe Signed-off-by: Jens Axboe diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index b927aa2..cceace6 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -1712,29 +1712,30 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) return 0; } -static int dvd_read_physical(struct cdrom_device_info *cdi, dvd_struct *s) +static int dvd_read_physical(struct cdrom_device_info *cdi, dvd_struct *s, + struct packet_command *cgc) { unsigned char buf[21], *base; struct dvd_layer *layer; - struct packet_command cgc; struct cdrom_device_ops *cdo = cdi->ops; int ret, layer_num = s->physical.layer_num; if (layer_num >= DVD_LAYERS) return -EINVAL; - init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_DVD_STRUCTURE; - cgc.cmd[6] = layer_num; - cgc.cmd[7] = s->type; - cgc.cmd[9] = cgc.buflen & 0xff; + init_cdrom_command(cgc, buf, sizeof(buf), CGC_DATA_READ); + cgc->cmd[0] = GPCMD_READ_DVD_STRUCTURE; + cgc->cmd[6] = layer_num; + cgc->cmd[7] = s->type; + cgc->cmd[9] = cgc->buflen & 0xff; /* * refrain from reporting errors on non-existing layers (mainly) */ - cgc.quiet = 1; + cgc->quiet = 1; - if ((ret = cdo->generic_packet(cdi, &cgc))) + ret = cdo->generic_packet(cdi, cgc); + if (ret) return ret; base = &buf[4]; @@ -1762,21 +1763,22 @@ static int dvd_read_physical(struct cdrom_device_info *cdi, dvd_struct *s) return 0; } -static int dvd_read_copyright(struct cdrom_device_info *cdi, dvd_struct *s) +static int dvd_read_copyright(struct cdrom_device_info *cdi, dvd_struct *s, + struct packet_command *cgc) { int ret; u_char buf[8]; - struct packet_command cgc; struct cdrom_device_ops *cdo = cdi->ops; - init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_DVD_STRUCTURE; - cgc.cmd[6] = s->copyright.layer_num; - cgc.cmd[7] = s->type; - cgc.cmd[8] = cgc.buflen >> 8; - cgc.cmd[9] = cgc.buflen & 0xff; + init_cdrom_command(cgc, buf, sizeof(buf), CGC_DATA_READ); + cgc->cmd[0] = GPCMD_READ_DVD_STRUCTURE; + cgc->cmd[6] = s->copyright.layer_num; + cgc->cmd[7] = s->type; + cgc->cmd[8] = cgc->buflen >> 8; + cgc->cmd[9] = cgc->buflen & 0xff; - if ((ret = cdo->generic_packet(cdi, &cgc))) + ret = cdo->generic_packet(cdi, cgc); + if (ret) return ret; s->copyright.cpst = buf[4]; @@ -1785,79 +1787,89 @@ static int dvd_read_copyright(struct cdrom_device_info *cdi, dvd_struct *s) return 0; } -static int dvd_read_disckey(struct cdrom_device_info 
*cdi, dvd_struct *s) +static int dvd_read_disckey(struct cdrom_device_info *cdi, dvd_struct *s, + struct packet_command *cgc) { int ret, size; u_char *buf; - struct packet_command cgc; struct cdrom_device_ops *cdo = cdi->ops; size = sizeof(s->disckey.value) + 4; - if ((buf = kmalloc(size, GFP_KERNEL)) == NULL) + buf = kmalloc(size, GFP_KERNEL); + if (!buf) return -ENOMEM; - init_cdrom_command(&cgc, buf, size, CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_DVD_STRUCTURE; - cgc.cmd[7] = s->type; - cgc.cmd[8] = size >> 8; - cgc.cmd[9] = size & 0xff; - cgc.cmd[10] = s->disckey.agid << 6; + init_cdrom_command(cgc, buf, size, CGC_DATA_READ); + cgc->cmd[0] = GPCMD_READ_DVD_STRUCTURE; + cgc->cmd[7] = s->type; + cgc->cmd[8] = size >> 8; + cgc->cmd[9] = size & 0xff; + cgc->cmd[10] = s->disckey.agid << 6; - if (!(ret = cdo->generic_packet(cdi, &cgc))) + ret = cdo->generic_packet(cdi, cgc); + if (!ret) memcpy(s->disckey.value, &buf[4], sizeof(s->disckey.value)); kfree(buf); return ret; } -static int dvd_read_bca(struct cdrom_device_info *cdi, dvd_struct *s) +static int dvd_read_bca(struct cdrom_device_info *cdi, dvd_struct *s, + struct packet_command *cgc) { - int ret; - u_char buf[4 + 188]; - struct packet_command cgc; + int ret, size = 4 + 188; + u_char *buf; struct cdrom_device_ops *cdo = cdi->ops; - init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_DVD_STRUCTURE; - cgc.cmd[7] = s->type; - cgc.cmd[9] = cgc.buflen & 0xff; + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; - if ((ret = cdo->generic_packet(cdi, &cgc))) - return ret; + init_cdrom_command(cgc, buf, size, CGC_DATA_READ); + cgc->cmd[0] = GPCMD_READ_DVD_STRUCTURE; + cgc->cmd[7] = s->type; + cgc->cmd[9] = cgc->buflen & 0xff; + + ret = cdo->generic_packet(cdi, cgc); + if (ret) + goto out; s->bca.len = buf[0] << 8 | buf[1]; if (s->bca.len < 12 || s->bca.len > 188) { cdinfo(CD_WARNING, "Received invalid BCA length (%d)\n", s->bca.len); - return -EIO; + ret = -EIO; + goto out; } memcpy(s->bca.value, &buf[4], s->bca.len); - - return 0; + ret = 0; +out: + kfree(buf); + return ret; } -static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s) +static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s, + struct packet_command *cgc) { int ret = 0, size; u_char *buf; - struct packet_command cgc; struct cdrom_device_ops *cdo = cdi->ops; size = sizeof(s->manufact.value) + 4; - if ((buf = kmalloc(size, GFP_KERNEL)) == NULL) + buf = kmalloc(size, GFP_KERNEL); + if (!buf) return -ENOMEM; - init_cdrom_command(&cgc, buf, size, CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_DVD_STRUCTURE; - cgc.cmd[7] = s->type; - cgc.cmd[8] = size >> 8; - cgc.cmd[9] = size & 0xff; + init_cdrom_command(cgc, buf, size, CGC_DATA_READ); + cgc->cmd[0] = GPCMD_READ_DVD_STRUCTURE; + cgc->cmd[7] = s->type; + cgc->cmd[8] = size >> 8; + cgc->cmd[9] = size & 0xff; - if ((ret = cdo->generic_packet(cdi, &cgc))) { - kfree(buf); - return ret; - } + ret = cdo->generic_packet(cdi, cgc); + if (ret) + goto out; s->manufact.len = buf[0] << 8 | buf[1]; if (s->manufact.len < 0 || s->manufact.len > 2048) { @@ -1868,27 +1880,29 @@ static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s) memcpy(s->manufact.value, &buf[4], s->manufact.len); } +out: kfree(buf); return ret; } -static int dvd_read_struct(struct cdrom_device_info *cdi, dvd_struct *s) +static int dvd_read_struct(struct cdrom_device_info *cdi, dvd_struct *s, + struct packet_command *cgc) { switch (s->type) { case DVD_STRUCT_PHYSICAL: - return 
dvd_read_physical(cdi, s); + return dvd_read_physical(cdi, s, cgc); case DVD_STRUCT_COPYRIGHT: - return dvd_read_copyright(cdi, s); + return dvd_read_copyright(cdi, s, cgc); case DVD_STRUCT_DISCKEY: - return dvd_read_disckey(cdi, s); + return dvd_read_disckey(cdi, s, cgc); case DVD_STRUCT_BCA: - return dvd_read_bca(cdi, s); + return dvd_read_bca(cdi, s, cgc); case DVD_STRUCT_MANUFACT: - return dvd_read_manufact(cdi, s); + return dvd_read_manufact(cdi, s, cgc); default: cdinfo(CD_WARNING, ": Invalid DVD structure read requested (%d)\n", @@ -3027,7 +3041,8 @@ static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi, } static noinline int mmc_ioctl_dvd_read_struct(struct cdrom_device_info *cdi, - void __user *arg) + void __user *arg, + struct packet_command *cgc) { int ret; dvd_struct *s; @@ -3046,7 +3061,7 @@ static noinline int mmc_ioctl_dvd_read_struct(struct cdrom_device_info *cdi, return -EFAULT; } - ret = dvd_read_struct(cdi, s); + ret = dvd_read_struct(cdi, s, cgc); if (ret) goto out; @@ -3132,7 +3147,7 @@ static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd, case CDROMRESUME: return mmc_ioctl_cdrom_pause_resume(cdi, &cgc, cmd); case DVD_READ_STRUCT: - return mmc_ioctl_dvd_read_struct(cdi, userptr); + return mmc_ioctl_dvd_read_struct(cdi, userptr, &cgc); case DVD_AUTH: return mmc_ioctl_dvd_auth(cdi, userptr); case CDROM_NEXT_WRITABLE: -- cgit v0.10.2 From 7c0990c7ee988aa193abbb7da3faeb9279146dbf Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Wed, 19 Nov 2008 10:20:23 +0100 Subject: Do not free io context when taking recursive faults in do_exit When taking recursive faults in do_exit, if the io_context is not null, exit_io_context() is called. But it might decrement the refcount more than once. It is better to leave this task alone. Signed-off-by: Nikanth Karthikesan Signed-off-by: Jens Axboe diff --git a/kernel/exit.c b/kernel/exit.c index c7422ca..9a213474 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1037,8 +1037,6 @@ NORET_TYPE void do_exit(long code) * task into the wait for ever nirwana as well. */ tsk->flags |= PF_EXITPIDONE; - if (tsk->io_context) - exit_io_context(); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } -- cgit v0.10.2 From 8a3173de4ab4cdacc43675dc5c077f9a5bf17f5f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Nov 2008 09:46:09 +0100 Subject: cciss: switch to using hlist for command list management This both cleans up the code and also helps detect the spurious case of attempting to remove a command from a queue it doesn't belong to. Acked-by: Mike Miller Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 9f7c543..4e07b5c 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -215,31 +215,17 @@ static struct block_device_operations cciss_fops = { /* * Enqueuing and dequeuing functions for cmdlists.
*/ -static inline void addQ(CommandList_struct **Qptr, CommandList_struct *c) +static inline void addQ(struct hlist_head *list, CommandList_struct *c) { - if (*Qptr == NULL) { - *Qptr = c; - c->next = c->prev = c; - } else { - c->prev = (*Qptr)->prev; - c->next = (*Qptr); - (*Qptr)->prev->next = c; - (*Qptr)->prev = c; - } + hlist_add_head(&c->list, list); } -static inline CommandList_struct *removeQ(CommandList_struct **Qptr, - CommandList_struct *c) +static inline void removeQ(CommandList_struct *c) { - if (c && c->next != c) { - if (*Qptr == c) - *Qptr = c->next; - c->prev->next = c->next; - c->next->prev = c->prev; - } else { - *Qptr = NULL; - } - return c; + if (WARN_ON(hlist_unhashed(&c->list))) + return; + + hlist_del_init(&c->list); } #include "cciss_scsi.c" /* For SCSI tape support */ @@ -506,6 +492,7 @@ static CommandList_struct *cmd_alloc(ctlr_info_t *h, int get_from_pool) c->cmdindex = i; } + INIT_HLIST_NODE(&c->list); c->busaddr = (__u32) cmd_dma_handle; temp64.val = (__u64) err_dma_handle; c->ErrDesc.Addr.lower = temp64.val32.lower; @@ -2548,7 +2535,8 @@ static void start_io(ctlr_info_t *h) { CommandList_struct *c; - while ((c = h->reqQ) != NULL) { + while (!hlist_empty(&h->reqQ)) { + c = hlist_entry(h->reqQ.first, CommandList_struct, list); /* can't do anything if fifo is full */ if ((h->access.fifo_full(h))) { printk(KERN_WARNING "cciss: fifo full\n"); @@ -2556,14 +2544,14 @@ static void start_io(ctlr_info_t *h) } /* Get the first entry from the Request Q */ - removeQ(&(h->reqQ), c); + removeQ(c); h->Qdepth--; /* Tell the controller execute command */ h->access.submit_command(h, c); /* Put job onto the completed Q */ - addQ(&(h->cmpQ), c); + addQ(&h->cmpQ, c); } } @@ -2576,7 +2564,7 @@ static inline void resend_cciss_cmd(ctlr_info_t *h, CommandList_struct *c) memset(c->err_info, 0, sizeof(ErrorInfo_struct)); /* add it to software queue and then send it to the controller */ - addQ(&(h->reqQ), c); + addQ(&h->reqQ, c); h->Qdepth++; if (h->Qdepth > h->maxQsinceinit) h->maxQsinceinit = h->Qdepth; @@ -2897,7 +2885,7 @@ static void do_cciss_request(struct request_queue *q) spin_lock_irq(q->queue_lock); - addQ(&(h->reqQ), c); + addQ(&h->reqQ, c); h->Qdepth++; if (h->Qdepth > h->maxQsinceinit) h->maxQsinceinit = h->Qdepth; @@ -2985,16 +2973,12 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id) a = c->busaddr; } else { + struct hlist_node *tmp; + a &= ~3; - if ((c = h->cmpQ) == NULL) { - printk(KERN_WARNING - "cciss: Completion of %08x ignored\n", - a1); - continue; - } - while (c->busaddr != a) { - c = c->next; - if (c == h->cmpQ) + c = NULL; + hlist_for_each_entry(c, tmp, &h->cmpQ, list) { + if (c->busaddr == a) break; } } @@ -3002,8 +2986,8 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id) * If we've found the command, take it off the * completion Q and free it */ - if (c->busaddr == a) { - removeQ(&h->cmpQ, c); + if (c && c->busaddr == a) { + removeQ(c); if (c->cmd_type == CMD_RWREQ) { complete_command(h, c, 0); } else if (c->cmd_type == CMD_IOCTL_PEND) { @@ -3423,6 +3407,8 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, return -1; hba[i]->busy_initializing = 1; + INIT_HLIST_HEAD(&hba[i]->cmpQ); + INIT_HLIST_HEAD(&hba[i]->reqQ); if (cciss_pci_init(hba[i], pdev) != 0) goto clean1; @@ -3730,15 +3716,17 @@ static void fail_all_cmds(unsigned long ctlr) pci_disable_device(h->pdev); /* Make sure it is really dead. 
*/ /* move everything off the request queue onto the completed queue */ - while ((c = h->reqQ) != NULL) { - removeQ(&(h->reqQ), c); + while (!hlist_empty(&h->reqQ)) { + c = hlist_entry(h->reqQ.first, CommandList_struct, list); + removeQ(c); h->Qdepth--; - addQ(&(h->cmpQ), c); + addQ(&h->cmpQ, c); } /* Now, fail everything on the completed queue with a HW error */ - while ((c = h->cmpQ) != NULL) { - removeQ(&h->cmpQ, c); + while (!hlist_empty(&h->cmpQ)) { + c = hlist_entry(h->cmpQ.first, CommandList_struct, list); + removeQ(c); c->err_info->CommandStatus = CMD_HARDWARE_ERR; if (c->cmd_type == CMD_RWREQ) { complete_command(h, c, 0); diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 24a7efa..15e2b84 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -89,8 +89,8 @@ struct ctlr_info struct access_method access; /* queue and queue Info */ - CommandList_struct *reqQ; - CommandList_struct *cmpQ; + struct hlist_head reqQ; + struct hlist_head cmpQ; unsigned int Qdepth; unsigned int maxQsinceinit; unsigned int maxSG; diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h index 43bf559..24e22de 100644 --- a/drivers/block/cciss_cmd.h +++ b/drivers/block/cciss_cmd.h @@ -265,8 +265,7 @@ typedef struct _CommandList_struct { int ctlr; int cmd_type; long cmdindex; - struct _CommandList_struct *prev; - struct _CommandList_struct *next; + struct hlist_node list; struct request * rq; struct completion *waiting; int retry_count; -- cgit v0.10.2 From c6a06f707cc29ea3a47588e4d2cd0bdcfa311a7d Mon Sep 17 00:00:00 2001 From: Qinghuang Feng Date: Mon, 24 Nov 2008 10:43:36 +0100 Subject: block/blk-tag.c: cleanup kernel-doc There is no argument named @tags in blk_init_tags, remove its comment. Signed-off-by: Qinghuang Feng Signed-off-by: Jens Axboe diff --git a/block/blk-tag.c b/block/blk-tag.c index c0d419e..3c518e3 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -158,7 +158,6 @@ fail: /** * blk_init_tags - initialize the tag info for an external tag map * @depth: the maximum queue depth supported - * @tags: the tag to use **/ struct blk_queue_tag *blk_init_tags(int depth) { -- cgit v0.10.2 From 42364690992e592c05f85c76fda4055820b48c1b Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Mon, 24 Nov 2008 10:46:29 +0100 Subject: Documentation: remove reference to ll_rw_blk.c and moved drivers/block/elevator.c drivers/block/ll_rw_blk.c has been split and organized in the block/ directory, and drivers/block/elevator.c has been moved to the block/ directory. Update Documentation/block/biodoc.txt accordingly. Signed-off-by: Nikanth Karthikesan Signed-off-by: Jens Axboe diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 4dbb8be..3c5434c 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -914,7 +914,7 @@ I/O scheduler, a.k.a. elevator, is implemented in two layers. Generic dispatch queue and specific I/O schedulers. Unless stated otherwise, elevator is used to refer to both parts and I/O scheduler to specific I/O schedulers. -Block layer implements generic dispatch queue in ll_rw_blk.c and elevator.c. +Block layer implements generic dispatch queue in block/*.c. The generic dispatch queue is responsible for properly ordering barrier requests, requeueing, handling non-fs requests and all other subtleties. @@ -926,8 +926,8 @@ be built inside the kernel. Each queue can choose different one and can also change to another one dynamically.
A block layer call to the i/o scheduler follows the convention elv_xxx(). This -calls elevator_xxx_fn in the elevator switch (drivers/block/elevator.c). Oh, -xxx and xxx might not match exactly, but use your imagination. If an elevator +calls elevator_xxx_fn in the elevator switch (block/elevator.c). Oh, xxx +and xxx might not match exactly, but use your imagination. If an elevator doesn't implement a function, the switch does nothing or some minimal house keeping work. -- cgit v0.10.2 From 7c239517d9f18427fc2e7ed259fb3b866595f5af Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 25 Nov 2008 09:08:39 +0100 Subject: block: don't take lock on changing ra_pages There's no need to take queue_lock or kernel_lock when modifying bdi->ra_pages. So remove them. Also remove out of date comment for queue_max_sectors_store(). Signed-off-by: Wu Fengguang Signed-off-by: Jens Axboe diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 21e275d..a29cb78 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -88,9 +88,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) unsigned long ra_kb; ssize_t ret = queue_var_store(&ra_kb, page, count); - spin_lock_irq(q->queue_lock); q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); - spin_unlock_irq(q->queue_lock); return ret; } @@ -117,10 +115,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) return -EINVAL; - /* - * Take the queue lock to update the readahead and max_sectors - * values synchronously: - */ + spin_lock_irq(q->queue_lock); q->max_sectors = max_sectors_kb << 1; spin_unlock_irq(q->queue_lock); diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 67eb93c..f87615d 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -774,9 +774,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) bdi = blk_get_backing_dev_info(bdev); if (bdi == NULL) return -ENOTTY; - lock_kernel(); bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; - unlock_kernel(); return 0; case BLKGETSIZE: size = bdev->bd_inode->i_size; diff --git a/block/ioctl.c b/block/ioctl.c index d03985b..0f22e62 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -323,9 +323,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, bdi = blk_get_backing_dev_info(bdev); if (bdi == NULL) return -ENOTTY; - lock_kernel(); bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; - unlock_kernel(); return 0; case BLKBSZSET: /* set the logical block size */ -- cgit v0.10.2 From 08bafc0341f2f7920e9045bc32c40299cac8c21b Mon Sep 17 00:00:00 2001 From: Keith Mannthey Date: Tue, 25 Nov 2008 10:24:35 +0100 Subject: block: Suppress Buffer I/O errors when SCSI REQ_QUIET flag set Allow the scsi request REQ_QUIET flag to be propagated to the buffer file system layer. The basic idea is to pass the flag from the scsi request to the bio (block IO) and then to the buffer layer. The buffer layer can then suppress needless printks. This patch declutters the kernel log by removing the 40-50 (per lun) buffer io error messages seen during a boot in my multipath setup. There is a good chance any real errors will be missed in the "noise" in the logs without this patch.
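Condensed, the propagation chain this patch adds looks like the following (a sketch assembled from the hunks below; surrounding context and error handling trimmed):

/* block/blk-core.c, req_bio_endio(): request -> bio */
if (unlikely(rq->cmd_flags & REQ_QUIET))
	set_bit(BIO_QUIET, &bio->bi_flags);

/* fs/buffer.c, end_bio_bh_io_sync(): bio -> buffer_head */
if (unlikely(test_bit(BIO_QUIET, &bio->bi_flags)))
	set_bit(BH_Quiet, &bh->b_state);

/* fs/buffer.c: the buffer error paths then consult the flag */
static int quiet_error(struct buffer_head *bh)
{
	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
		return 0;	/* not quiet and not rate limited: print */
	return 1;		/* suppress the message */
}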
During boot I see blocks of messages like " __ratelimit: 211 callbacks suppressed Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242847 Buffer I/O error on device sdm, logical block 1 Buffer I/O error on device sdm, logical block 5242878 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242872 " in my logs. My disk environment is multipath fiber channel using the SCSI_DH_RDAC code and multipathd. This topology includes an "active" and "ghost" path for each lun. IOs to the "ghost" path will never complete and the SCSI layer, via the scsi device handler rdac code, quickly returns the IOs to these paths and sets the REQ_QUIET scsi flag to suppress the scsi layer messages. I want to extend the QUIET behavior to include the buffer file system layer to deal with these errors as well. I have been running this patch for a while now on several boxes without issue. A few runs of bonnie++ show no noticeable difference in performance in my setup. Thanks to John Stultz for the quiet_error finalization. Submitted-by: Keith Mannthey Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 243d18b..20e1724 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -153,6 +153,9 @@ static void req_bio_endio(struct request *rq, struct bio *bio, nbytes = bio->bi_size; } + if (unlikely(rq->cmd_flags & REQ_QUIET)) + set_bit(BIO_QUIET, &bio->bi_flags); + bio->bi_size -= nbytes; bio->bi_sector += (nbytes >> 9); diff --git a/fs/buffer.c b/fs/buffer.c index 10179cf..776ae09 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -99,10 +99,18 @@ __clear_page_buffers(struct page *page) page_cache_release(page); } + +static int quiet_error(struct buffer_head *bh) +{ + if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) + return 0; + return 1; +} + + static void buffer_io_error(struct buffer_head *bh) { char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); @@ -144,7 +152,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { + if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { buffer_io_error(bh); printk(KERN_WARNING "lost page write due to " "I/O error on %s\n", @@ -394,7 +402,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { clear_buffer_uptodate(bh); - if (printk_ratelimit()) + if (!quiet_error(bh)) buffer_io_error(bh); SetPageError(page); } @@ -455,7 +463,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (printk_ratelimit()) { + if (!quiet_error(bh)) { buffer_io_error(bh); printk(KERN_WARNING "lost page write due to " "I/O error on %s\n", @@ -2913,6 +2921,9 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) set_bit(BH_Eopnotsupp, &bh->b_state); } + if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) + set_bit(BH_Quiet, &bh->b_state); + bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); bio_put(bio); } diff --git a/include/linux/bio.h b/include/linux/bio.h index 6a64209..cf132bf 100644 ---
a/include/linux/bio.h +++ b/include/linux/bio.h @@ -117,6 +117,7 @@ struct bio { #define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ #define BIO_NULL_MAPPED 9 /* contains invalid user pages */ #define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ +#define BIO_QUIET 11 /* Make BIO Quiet */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 3ce64b9..8605f8a 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -35,6 +35,7 @@ enum bh_state_bits { BH_Ordered, /* ordered write */ BH_Eopnotsupp, /* operation not supported (barrier) */ BH_Unwritten, /* Buffer is allocated on disk but not written */ + BH_Quiet, /* Buffer Error Prinks to be quiet */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities -- cgit v0.10.2 From 64d01dc9e1927e6535627d73f2336c75d1dd3fe2 Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Wed, 3 Dec 2008 12:41:39 +0100 Subject: block: use cancel_work_sync() instead of kblockd_flush_work() After many improvements on kblockd_flush_work, it is now identical to cancel_work_sync, so a direct call to cancel_work_sync is suggested. The only difference is that cancel_work_sync is a GPL-only symbol, so it is no longer available to non-GPL modules. Signed-off-by: Cheng Renquan Cc: Jens Axboe Signed-off-by: Jens Axboe diff --git a/block/as-iosched.c b/block/as-iosched.c index 71f0abb..802b5d0 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -1344,7 +1344,7 @@ static void as_exit_queue(elevator_t *e) struct as_data *ad = e->elevator_data; del_timer_sync(&ad->antic_timer); - kblockd_flush_work(&ad->antic_work); + cancel_work_sync(&ad->antic_work); BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC])); BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC])); diff --git a/block/blk-core.c b/block/blk-core.c index 20e1724..2fdcd0c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -408,7 +408,7 @@ void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->unplug_timer); del_timer_sync(&q->timeout); - kblockd_flush_work(&q->unplug_work); + cancel_work_sync(&q->unplug_work); } EXPORT_SYMBOL(blk_sync_queue); @@ -2147,12 +2147,6 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); -void kblockd_flush_work(struct work_struct *work) -{ - cancel_work_sync(work); -} -EXPORT_SYMBOL(kblockd_flush_work); - int __init blk_dev_init(void) { kblockd_workqueue = create_workqueue("kblockd"); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6a062ee..a2bfec7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2160,7 +2160,7 @@ out_cont: static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) { del_timer_sync(&cfqd->idle_slice_timer); - kblockd_flush_work(&cfqd->unplug_work); + cancel_work_sync(&cfqd->unplug_work); } static void cfq_put_async_queues(struct cfq_data *cfqd) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 482e960..e9bb73f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -978,7 +978,6 @@ static inline void put_dev_sector(Sector p) struct work_struct; int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); -void kblockd_flush_work(struct work_struct *work); #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) -- cgit v0.10.2 From ba744d5e290055d171c68067259fcc1e2721f542 Mon Sep 17 00:00:00 2001 From:
Richard Kennedy Date: Wed, 3 Dec 2008 12:41:40 +0100 Subject: block: reorder struct bio to remove padding on 64bit Remove 8 bytes of padding from struct bio which also removes 16 bytes from struct bio_pair to make it 248 bytes. bio_pair then fits into one fewer cache lines & into a smaller slab. Signed-off-by: Richard Kennedy Signed-off-by: Jens Axboe diff --git a/include/linux/bio.h b/include/linux/bio.h index cf132bf..3ed714e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -90,10 +90,11 @@ struct bio { unsigned int bi_comp_cpu; /* completion CPU */ + atomic_t bi_cnt; /* pin count */ + struct bio_vec *bi_io_vec; /* the actual vec list */ bio_end_io_t *bi_end_io; - atomic_t bi_cnt; /* pin count */ void *bi_private; #if defined(CONFIG_BLK_DEV_INTEGRITY) -- cgit v0.10.2 From 313e42999dbc0f234ca5909a236f78f082cb43b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:02 +0900 Subject: block: reorganize QUEUE_ORDERED_* constants Separate out ordering types (drain, tag) and action masks (preflush, postflush, fua) from visible ordering mode selectors (QUEUE_ORDERED_*). Ordering types are now named QUEUE_ORDERED_BY_* while action masks are named QUEUE_ORDERED_DO_*. This change is necessary to add QUEUE_ORDERED_DO_BAR and make it optional to improve empty barrier implementation. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 6e72d66..1d7adc7 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -24,8 +24,8 @@ int blk_queue_ordered(struct request_queue *q, unsigned ordered, prepare_flush_fn *prepare_flush_fn) { - if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && - prepare_flush_fn == NULL) { + if (!prepare_flush_fn && (ordered & (QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_POSTFLUSH))) { printk(KERN_ERR "%s: prepare_flush_fn required\n", __func__); return -EINVAL; } @@ -134,7 +134,7 @@ static void queue_flush(struct request_queue *q, unsigned which) struct request *rq; rq_end_io_fn *end_io; - if (which == QUEUE_ORDERED_PREFLUSH) { + if (which == QUEUE_ORDERED_DO_PREFLUSH) { rq = &q->pre_flush_rq; end_io = pre_flush_end_io; } else { @@ -167,7 +167,7 @@ static inline struct request *start_ordered(struct request_queue *q, blk_rq_init(q, rq); if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) rq->cmd_flags |= REQ_RW; - if (q->ordered & QUEUE_ORDERED_FUA) + if (q->ordered & QUEUE_ORDERED_DO_FUA) rq->cmd_flags |= REQ_FUA; init_request_from_bio(rq, q->orig_bar_rq->bio); rq->end_io = bar_end_io; @@ -181,20 +181,20 @@ static inline struct request *start_ordered(struct request_queue *q, * there will be no data written between the pre and post flush. * Hence a single flush will suffice.
*/ - if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq)) - queue_flush(q, QUEUE_ORDERED_POSTFLUSH); + if ((q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) && !blk_empty_barrier(rq)) + queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH); else q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); - if (q->ordered & QUEUE_ORDERED_PREFLUSH) { - queue_flush(q, QUEUE_ORDERED_PREFLUSH); + if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) { + queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH); rq = &q->pre_flush_rq; } else q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; - if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) + if ((q->ordered & QUEUE_ORDERED_BY_TAG) || q->in_flight == 0) q->ordseq |= QUEUE_ORDSEQ_DRAIN; else rq = NULL; @@ -237,7 +237,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp) rq != &q->pre_flush_rq && rq != &q->post_flush_rq) return 1; - if (q->ordered & QUEUE_ORDERED_TAG) { + if (q->ordered & QUEUE_ORDERED_BY_TAG) { /* Ordered by tag. Blocking the next barrier is enough. */ if (is_barrier && rq != &q->bar_rq) *rqp = NULL; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e9bb73f..5c92b44 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -523,22 +523,29 @@ enum { * TAG_FLUSH : ordering by tag w/ pre and post flushes * TAG_FUA : ordering by tag w/ pre flush and FUA write */ - QUEUE_ORDERED_NONE = 0x00, - QUEUE_ORDERED_DRAIN = 0x01, - QUEUE_ORDERED_TAG = 0x02, - - QUEUE_ORDERED_PREFLUSH = 0x10, - QUEUE_ORDERED_POSTFLUSH = 0x20, - QUEUE_ORDERED_FUA = 0x40, - - QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, - QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, - QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, - QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, + QUEUE_ORDERED_BY_DRAIN = 0x01, + QUEUE_ORDERED_BY_TAG = 0x02, + QUEUE_ORDERED_DO_PREFLUSH = 0x10, + QUEUE_ORDERED_DO_POSTFLUSH = 0x40, + QUEUE_ORDERED_DO_FUA = 0x80, + + QUEUE_ORDERED_NONE = 0x00, + + QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN, + QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_POSTFLUSH, + QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_FUA, + + QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG, + QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_POSTFLUSH, + QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_FUA, /* * Ordered operation sequence -- cgit v0.10.2 From a7384677b2f4cd40948fd7ce024ba5e1821444ba Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:03 +0900 Subject: block: remove duplicate or unused barrier/discard error paths * Because barrier mode can be changed dynamically, whether barrier is supported or not can be determined only when actually issuing the barrier and there is no point in checking it earlier. Drop barrier support check in generic_make_request() and __make_request(), and update comment around the support check in blk_do_ordered(). * There is no reason to check discard support in both generic_make_request() and __make_request(). Drop the check in __make_request(). While at it, move error action block to the end of the function and add unlikely() to q existence test. 
* Barrier request, be it empty or not, is never passed to low level driver and thus it's meaningless to try to copy back req->sector to bio->bi_sector on error. In addition, the notion of failed sector doesn't make any sense for empty barrier to begin with. Drop the code block from __end_that_request_first(). Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 1d7adc7..43d479a 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -216,8 +216,8 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp) return 1; } else { /* - * This can happen when the queue switches to - * ORDERED_NONE while this request is on it. + * Queue ordering not supported. Terminate + * with prejudice. */ elv_dequeue_request(q, rq); if (__blk_end_request(rq, -EOPNOTSUPP, diff --git a/block/blk-core.c b/block/blk-core.c index 2fdcd0c..b1fd4f5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1139,7 +1139,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) static int __make_request(struct request_queue *q, struct bio *bio) { struct request *req; - int el_ret, nr_sectors, barrier, discard, err; + int el_ret, nr_sectors; const unsigned short prio = bio_prio(bio); const int sync = bio_sync(bio); int rw_flags; @@ -1153,22 +1153,9 @@ static int __make_request(struct request_queue *q, struct bio *bio) */ blk_queue_bounce(q, &bio); - barrier = bio_barrier(bio); - if (unlikely(barrier) && bio_has_data(bio) && - (q->next_ordered == QUEUE_ORDERED_NONE)) { - err = -EOPNOTSUPP; - goto end_io; - } - - discard = bio_discard(bio); - if (unlikely(discard) && !q->prepare_discard_fn) { - err = -EOPNOTSUPP; - goto end_io; - } - spin_lock_irq(q->queue_lock); - if (unlikely(barrier) || elv_queue_empty(q)) + if (unlikely(bio_barrier(bio)) || elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -1262,10 +1249,6 @@ out: __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); return 0; - -end_io: - bio_endio(bio, err); - return 0; } /* @@ -1418,15 +1401,13 @@ static inline void __generic_make_request(struct bio *bio) char b[BDEVNAME_SIZE]; q = bdev_get_queue(bio->bi_bdev); - if (!q) { + if (unlikely(!q)) { printk(KERN_ERR "generic_make_request: Trying to access " "nonexistent block-device %s (%Lu)\n", bdevname(bio->bi_bdev, b), (long long) bio->bi_sector); -end_io: - bio_endio(bio, err); - break; + goto end_io; } if (unlikely(nr_sectors > q->max_hw_sectors)) { @@ -1463,14 +1444,19 @@ end_io: if (bio_check_eod(bio, nr_sectors)) goto end_io; - if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) || - (bio_discard(bio) && !q->prepare_discard_fn)) { + + if (bio_discard(bio) && !q->prepare_discard_fn) { err = -EOPNOTSUPP; goto end_io; } ret = q->make_request_fn(q, bio); } while (ret); + + return; + +end_io: + bio_endio(bio, err); } /* @@ -1720,14 +1706,6 @@ static int __end_that_request_first(struct request *req, int error, while ((bio = req->bio) != NULL) { int nbytes; - /* - * For an empty barrier request, the low level driver must - * store a potential error location in ->sector. We pass - * that back up in ->bi_sector. 
- */ - if (blk_empty_barrier(req)) - bio->bi_sector = req->sector; - if (nr_bytes >= bio->bi_size) { req->bio = bio->bi_next; nbytes = bio->bi_size; -- cgit v0.10.2 From f671620e7d895af221bdfeda751d54fa55ed9546 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:04 +0900 Subject: block: make every barrier action optional In all barrier sequences, the barrier write itself was always assumed to be issued and thus didn't have corresponding control flag. This patch adds QUEUE_ORDERED_DO_BAR and unify action mask handling in start_ordered() such that any barrier action can be skipped. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 43d479a..1efabf8 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -158,19 +158,10 @@ static inline struct request *start_ordered(struct request_queue *q, q->ordered = q->next_ordered; q->ordseq |= QUEUE_ORDSEQ_STARTED; - /* - * Prep proxy barrier request. - */ + /* stash away the original request */ elv_dequeue_request(q, rq); q->orig_bar_rq = rq; - rq = &q->bar_rq; - blk_rq_init(q, rq); - if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) - rq->cmd_flags |= REQ_RW; - if (q->ordered & QUEUE_ORDERED_DO_FUA) - rq->cmd_flags |= REQ_FUA; - init_request_from_bio(rq, q->orig_bar_rq->bio); - rq->end_io = bar_end_io; + rq = NULL; /* * Queue ordered sequence. As we stack them at the head, we @@ -181,12 +172,28 @@ static inline struct request *start_ordered(struct request_queue *q, * there will be no data written between the pre and post flush. * Hence a single flush will suffice. */ - if ((q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) && !blk_empty_barrier(rq)) + if ((q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) && + !blk_empty_barrier(q->orig_bar_rq)) { queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH); - else + rq = &q->post_flush_rq; + } else q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); + if (q->ordered & QUEUE_ORDERED_DO_BAR) { + rq = &q->bar_rq; + + /* initialize proxy request and queue it */ + blk_rq_init(q, rq); + if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) + rq->cmd_flags |= REQ_RW; + if (q->ordered & QUEUE_ORDERED_DO_FUA) + rq->cmd_flags |= REQ_FUA; + init_request_from_bio(rq, q->orig_bar_rq->bio); + rq->end_io = bar_end_io; + + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); + } else + q->ordseq |= QUEUE_ORDSEQ_BAR; if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) { queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH); @@ -194,10 +201,10 @@ static inline struct request *start_ordered(struct request_queue *q, } else q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; - if ((q->ordered & QUEUE_ORDERED_BY_TAG) || q->in_flight == 0) - q->ordseq |= QUEUE_ORDSEQ_DRAIN; - else + if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && q->in_flight) rq = NULL; + else + q->ordseq |= QUEUE_ORDSEQ_DRAIN; return rq; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c92b44..b044267 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -526,12 +526,14 @@ enum { QUEUE_ORDERED_BY_DRAIN = 0x01, QUEUE_ORDERED_BY_TAG = 0x02, QUEUE_ORDERED_DO_PREFLUSH = 0x10, + QUEUE_ORDERED_DO_BAR = 0x20, QUEUE_ORDERED_DO_POSTFLUSH = 0x40, QUEUE_ORDERED_DO_FUA = 0x80, QUEUE_ORDERED_NONE = 0x00, - QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN, + QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN | + QUEUE_ORDERED_DO_BAR, QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | QUEUE_ORDERED_DO_PREFLUSH | QUEUE_ORDERED_DO_POSTFLUSH, @@ -539,7 +541,8 @@ enum { 
QUEUE_ORDERED_DO_PREFLUSH | QUEUE_ORDERED_DO_FUA, - QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG, + QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG | + QUEUE_ORDERED_DO_BAR, QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | QUEUE_ORDERED_DO_PREFLUSH | QUEUE_ORDERED_DO_POSTFLUSH, -- cgit v0.10.2 From 8f11b3e99a1136fcbb67316c3260f085299c0bff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:05 +0900 Subject: block: make barrier completion more robust Barrier completion had the following assumptions. * start_ordered() couldn't finish the whole sequence properly. If all actions are to be skipped, q->ordseq is set correctly but the actual completion was never triggered thus hanging the barrier request. * Drain completion in elv_complete_request() assumed that there's always at least one request in the queue when drain completes. Both assumptions are true but these assumptions need to be removed to improve empty barrier implementation. This patch makes the following changes. * Make start_ordered() use blk_ordered_complete_seq() to mark skipped steps complete and notify __elv_next_request() that it should fetch the next request if the whole barrier has completed inside start_ordered(). * Make drain completion path in elv_complete_request() check whether the queue is empty. Empty queue also indicates drain completion. * While at it, convert 0/1 return from blk_do_ordered() to false/true. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 1efabf8..b03d880 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -88,7 +88,7 @@ unsigned blk_ordered_req_seq(struct request *rq) return QUEUE_ORDSEQ_DONE; } -void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) +bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) { struct request *rq; @@ -99,7 +99,7 @@ void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) q->ordseq |= seq; if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) - return; + return false; /* * Okay, sequence complete. 
@@ -109,6 +109,8 @@ void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq))) BUG(); + + return true; } static void pre_flush_end_io(struct request *rq, int error) @@ -151,9 +153,11 @@ static void queue_flush(struct request_queue *q, unsigned which) elv_insert(q, rq, ELEVATOR_INSERT_FRONT); } -static inline struct request *start_ordered(struct request_queue *q, - struct request *rq) +static inline bool start_ordered(struct request_queue *q, struct request **rqp) { + struct request *rq = *rqp; + unsigned skip = 0; + q->orderr = 0; q->ordered = q->next_ordered; q->ordseq |= QUEUE_ORDSEQ_STARTED; @@ -177,7 +181,7 @@ static inline struct request *start_ordered(struct request_queue *q, queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH); rq = &q->post_flush_rq; } else - q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; + skip |= QUEUE_ORDSEQ_POSTFLUSH; if (q->ordered & QUEUE_ORDERED_DO_BAR) { rq = &q->bar_rq; @@ -193,35 +197,40 @@ static inline struct request *start_ordered(struct request_queue *q, elv_insert(q, rq, ELEVATOR_INSERT_FRONT); } else - q->ordseq |= QUEUE_ORDSEQ_BAR; + skip |= QUEUE_ORDSEQ_BAR; if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) { queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH); rq = &q->pre_flush_rq; } else - q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; + skip |= QUEUE_ORDSEQ_PREFLUSH; if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && q->in_flight) rq = NULL; else - q->ordseq |= QUEUE_ORDSEQ_DRAIN; + skip |= QUEUE_ORDSEQ_DRAIN; + + *rqp = rq; - return rq; + /* + * Complete skipped sequences. If whole sequence is complete, + * return false to tell elevator that this request is gone. + */ + return !blk_ordered_complete_seq(q, skip, 0); } -int blk_do_ordered(struct request_queue *q, struct request **rqp) +bool blk_do_ordered(struct request_queue *q, struct request **rqp) { struct request *rq = *rqp; const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); if (!q->ordseq) { if (!is_barrier) - return 1; + return true; - if (q->next_ordered != QUEUE_ORDERED_NONE) { - *rqp = start_ordered(q, rq); - return 1; - } else { + if (q->next_ordered != QUEUE_ORDERED_NONE) + return start_ordered(q, rqp); + else { /* * Queue ordering not supported. Terminate * with prejudice. @@ -231,7 +240,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp) blk_rq_bytes(rq))) BUG(); *rqp = NULL; - return 0; + return false; } } @@ -242,7 +251,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp) /* Special requests are not subject to ordering rules. */ if (!blk_fs_request(rq) && rq != &q->pre_flush_rq && rq != &q->post_flush_rq) - return 1; + return true; if (q->ordered & QUEUE_ORDERED_BY_TAG) { /* Ordered by tag. Blocking the next barrier is enough. */ @@ -255,7 +264,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp) *rqp = NULL; } - return 1; + return true; } static void bio_end_empty_barrier(struct bio *bio, int err) diff --git a/block/elevator.c b/block/elevator.c index 86836dd..261ffaa 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -944,10 +944,14 @@ void elv_completed_request(struct request_queue *q, struct request *rq) * drained for flush sequence. 
*/ if (unlikely(q->ordseq)) { - struct request *first_rq = list_entry_rq(q->queue_head.next); - if (q->in_flight == 0 && + struct request *next = NULL; + + if (!list_empty(&q->queue_head)) + next = list_entry_rq(q->queue_head.next); + + if (!q->in_flight && blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && - blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) { + (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) { blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); blk_start_queueing(q); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b044267..3c7078e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -866,10 +866,10 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); -extern int blk_do_ordered(struct request_queue *, struct request **); +extern bool blk_do_ordered(struct request_queue *, struct request **); extern unsigned blk_ordered_cur_seq(struct request_queue *); extern unsigned blk_ordered_req_seq(struct request *); -extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int); +extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); -- cgit v0.10.2 From 58eea927d2de43dc6f03d1ba2c46e55854b31540 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:06 +0900 Subject: block: simplify empty barrier implementation Empty barrier required special handling in __elv_next_request() to complete it without letting the low level driver see it. With previous changes, barrier code is now flexible enough to skip the BAR step using the same barrier sequence selection mechanism. Drop the special handling and mask off q->ordered from start_ordered(). Remove blk_empty_barrier() test which now has no user. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/block/blk-barrier.c b/block/blk-barrier.c index b03d880..c63044e 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -162,6 +162,14 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) q->ordered = q->next_ordered; q->ordseq |= QUEUE_ORDSEQ_STARTED; + /* + * For an empty barrier, there's no actual BAR request, which + * in turn makes POSTFLUSH unnecessary. Mask them off. + */ + if (!rq->hard_nr_sectors) + q->ordered &= ~(QUEUE_ORDERED_DO_BAR | + QUEUE_ORDERED_DO_POSTFLUSH); + /* stash away the original request */ elv_dequeue_request(q, rq); q->orig_bar_rq = rq; @@ -171,13 +179,9 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) * Queue ordered sequence. As we stack them at the head, we * need to queue in reverse order. Note that we rely on that * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs - * request gets inbetween ordered sequence. If this request is - * an empty barrier, we don't need to do a postflush ever since - * there will be no data written between the pre and post flush. - * Hence a single flush will suffice. + * request gets inbetween ordered sequence. 
*/ - if ((q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) && - !blk_empty_barrier(q->orig_bar_rq)) { + if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) { queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH); rq = &q->post_flush_rq; } else diff --git a/block/elevator.c b/block/elevator.c index 261ffaa..ff60177 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -755,14 +755,6 @@ struct request *elv_next_request(struct request_queue *q) int ret; while ((rq = __elv_next_request(q)) != NULL) { - /* - * Kill the empty barrier place holder, the driver must - * not ever see it. - */ - if (blk_empty_barrier(rq)) { - __blk_end_request(rq, 0, blk_rq_bytes(rq)); - continue; - } if (!(rq->cmd_flags & REQ_STARTED)) { /* * This is the first time the device driver diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3c7078e..41bbadf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -596,7 +596,6 @@ enum { #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) #define blk_discard_rq(rq) ((rq)->cmd_flags & REQ_DISCARD) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) -#define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors) /* rq->queuelist of dequeued request must be list_empty() */ #define blk_queued_rq(rq) (!list_empty(&(rq)->queuelist)) -- cgit v0.10.2 From a185eb4bc84155fff35b602ce99602c010de9634 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:07 +0900 Subject: block: fix empty barrier on write-through w/ ordered tag An empty barrier on a write-through (or no cache) device w/ ordered tag has no command to execute, and without a command to execute, the ordered tag is never issued to the device, so the ordering is never achieved. Force draining for such cases. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe diff --git a/block/blk-barrier.c b/block/blk-barrier.c index c63044e..8eba4e4 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -166,9 +166,21 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) * For an empty barrier, there's no actual BAR request, which * in turn makes POSTFLUSH unnecessary. Mask them off. */ - if (!rq->hard_nr_sectors) + if (!rq->hard_nr_sectors) { q->ordered &= ~(QUEUE_ORDERED_DO_BAR | QUEUE_ORDERED_DO_POSTFLUSH); + /* + * Empty barrier on a write-through device w/ ordered + * tag has no command to issue and without any command + * to issue, ordering by tag can't be used. Drain + * instead. + */ + if ((q->ordered & QUEUE_ORDERED_BY_TAG) && + !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) { + q->ordered &= ~QUEUE_ORDERED_BY_TAG; + q->ordered |= QUEUE_ORDERED_BY_DRAIN; + } + } /* stash away the original request */ elv_dequeue_request(q, rq); -- cgit v0.10.2 From a31a97381cdf7dceb03b797a8faf9bc8a01c65d1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Oct 2008 13:58:29 +0200 Subject: block: don't use plugging on SSD devices We just want to hand the first bits of IO to the device as fast as possible. Gains a few percent on the IOPS rate.
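Condensed from the hunks below, the plug policy in __make_request() becomes roughly the following sketch (not the full function; blk_queue_nonrot() tests the QUEUE_FLAG_NONROT queue flag):

	/* plug only rotational devices; SSDs want immediate dispatch */
	if (!blk_queue_nonrot(q) && elv_queue_empty(q))
		blk_plug_device(q);
	add_request(q, req);

	/* sync IO, or any IO to a non-rotational queue, unplugs at once */
	if (sync || blk_queue_nonrot(q))
		__generic_unplug_device(q);

__generic_unplug_device() is likewise changed to run ->request_fn() on non-rotational queues even when no plug was pending.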
Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index b1fd4f5..a824e49 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -268,8 +268,7 @@ void __generic_unplug_device(struct request_queue *q) { if (unlikely(blk_queue_stopped(q))) return; - - if (!blk_remove_plug(q)) + if (!blk_remove_plug(q) && !blk_queue_nonrot(q)) return; q->request_fn(q); @@ -1241,11 +1240,11 @@ get_rq: if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || bio_flagged(bio, BIO_CPU_AFFINE)) req->cpu = blk_cpu_to_group(smp_processor_id()); - if (elv_queue_empty(q)) + if (!blk_queue_nonrot(q) && elv_queue_empty(q)) blk_plug_device(q); add_request(q, req); out: - if (sync) + if (sync || blk_queue_nonrot(q)) __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); return 0; -- cgit v0.10.2 From 7ff9345ffac56743b5001561bc2dc1e041b79149 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Dec 2008 11:53:43 +0100 Subject: bio: only mempool back the largest bio_vec slab cache We only very rarely need the mempool backing, so it makes sense to get rid of all but one of the mempools in a bio_set. Keep the largest bio_vec count mempool so we can always honor the largest allocation, and "upgrade" callers that fail. Signed-off-by: Jens Axboe diff --git a/fs/bio.c b/fs/bio.c index df99c88..eb6b4683 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -58,7 +58,8 @@ unsigned int bvec_nr_vecs(unsigned short idx) return bvec_slabs[idx].nr_vecs; } -struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) +struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, + struct bio_set *bs) { struct bio_vec *bvl; @@ -67,60 +68,90 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct * If not, this is a bio_kmalloc() allocation and just do a * kzalloc() for the exact number of vecs right away. */ - if (bs) { + if (!bs) + bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask); + + /* + * see comment near bvec_array define! + */ + switch (nr) { + case 1: + *idx = 0; + break; + case 2 ... 4: + *idx = 1; + break; + case 5 ... 16: + *idx = 2; + break; + case 17 ... 64: + *idx = 3; + break; + case 65 ... 128: + *idx = 4; + break; + case 129 ... BIO_MAX_PAGES: + *idx = 5; + break; + default: + return NULL; + } + + /* + * idx now points to the pool we want to allocate from. only the + * 1-vec entry pool is mempool backed. + */ + if (*idx == BIOVEC_MAX_IDX) { +fallback: + bvl = mempool_alloc(bs->bvec_pool, gfp_mask); + } else { + struct biovec_slab *bvs = bvec_slabs + *idx; + gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + /* - * see comment near bvec_array define! + * Make this allocation restricted and don't dump info on + * allocation failures, since we'll fallback to the mempool + * in case of failure. */ - switch (nr) { - case 1: - *idx = 0; - break; - case 2 ... 4: - *idx = 1; - break; - case 5 ... 16: - *idx = 2; - break; - case 17 ... 64: - *idx = 3; - break; - case 65 ... 128: - *idx = 4; - break; - case 129 ... BIO_MAX_PAGES: - *idx = 5; - break; - default: - return NULL; - } + __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; /* - * idx now points to the pool we want to allocate from + * Try a slab allocation.
If this fails and __GFP_WAIT + * is set, retry with the 1-entry mempool */ - bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); - if (bvl) - memset(bvl, 0, - bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); - } else - bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask); + bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); + if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { + *idx = BIOVEC_MAX_IDX; + goto fallback; + } + } + + if (bvl) + memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); return bvl; } -void bio_free(struct bio *bio, struct bio_set *bio_set) +void bio_free(struct bio *bio, struct bio_set *bs) { if (bio->bi_io_vec) { const int pool_idx = BIO_POOL_IDX(bio); BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS); - mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); + if (pool_idx == BIOVEC_MAX_IDX) + mempool_free(bio->bi_io_vec, bs->bvec_pool); + else { + struct biovec_slab *bvs = bvec_slabs + pool_idx; + + kmem_cache_free(bvs->slab, bio->bi_io_vec); + } } if (bio_integrity(bio)) - bio_integrity_free(bio, bio_set); + bio_integrity_free(bio, bs); - mempool_free(bio, bio_set->bio_pool); + mempool_free(bio, bs->bio_pool); } /* @@ -1346,30 +1377,18 @@ EXPORT_SYMBOL(bio_sector_offset); */ static int biovec_create_pools(struct bio_set *bs, int pool_entries) { - int i; + struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; - for (i = 0; i < BIOVEC_NR_POOLS; i++) { - struct biovec_slab *bp = bvec_slabs + i; - mempool_t **bvp = bs->bvec_pools + i; + bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab); + if (!bs->bvec_pool) + return -ENOMEM; - *bvp = mempool_create_slab_pool(pool_entries, bp->slab); - if (!*bvp) - return -ENOMEM; - } return 0; } static void biovec_free_pools(struct bio_set *bs) { - int i; - - for (i = 0; i < BIOVEC_NR_POOLS; i++) { - mempool_t *bvp = bs->bvec_pools[i]; - - if (bvp) - mempool_destroy(bvp); - } - + mempool_destroy(bs->bvec_pool); } void bioset_free(struct bio_set *bs) diff --git a/include/linux/bio.h b/include/linux/bio.h index 3ed714e..d76e4bf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -397,13 +397,14 @@ static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) */ #define BIO_POOL_SIZE 2 #define BIOVEC_NR_POOLS 6 +#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) struct bio_set { mempool_t *bio_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t *bio_integrity_pool; #endif - mempool_t *bvec_pools[BIOVEC_NR_POOLS]; + mempool_t *bvec_pool; }; struct biovec_slab { -- cgit v0.10.2 From 1b4344986926da324b5cd10b683e5a1a5e1b7db3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 Oct 2008 20:32:58 +0200 Subject: bio: move the slab pointer inside the bio_set In preparation for adding differently sized bios. 
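For reference, a minimal usage sketch of the bio_set API as it stands at this point in the series; the pool sizes, nr_pages and the surrounding driver context are illustrative only, not taken from any particular caller:

	struct bio_set *bs;
	struct bio *bio;

	/* init: private bio/biovec mempools, independent of fs_bio_set */
	bs = bioset_create(16, 16);
	if (!bs)
		return -ENOMEM;

	/* per IO: the bio and its bio_vecs come from this set's pools */
	bio = bio_alloc_bioset(GFP_NOIO, nr_pages, bs);

	/* teardown */
	bioset_free(bs);

The following patches repurpose bioset_create()'s second argument from a bvec pool size into a front-pad size.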
Signed-off-by: Jens Axboe diff --git a/fs/bio.c b/fs/bio.c index eb6b4683..1ab8986 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1404,12 +1404,15 @@ void bioset_free(struct bio_set *bs) struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) { - struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL); + struct bio_set *bs; + bs = kzalloc(sizeof(*bs), GFP_KERNEL); if (!bs) return NULL; - bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab); + bs->bio_slab = bio_slab; + + bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bs->bio_slab); if (!bs->bio_pool) goto bad; diff --git a/include/linux/bio.h b/include/linux/bio.h index d76e4bf..9340098 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -400,6 +400,7 @@ static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) #define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) struct bio_set { + struct kmem_cache *bio_slab; mempool_t *bio_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t *bio_integrity_pool; -- cgit v0.10.2 From bb799ca0202a360fa74d5f17039b9100caebdde7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Dec 2008 15:35:05 +0100 Subject: bio: allow individual slabs in the bio_set Instead of having a global bio slab cache, add a reference to one in each bio_set that is created. This allows for personalized slabs in each bio_set, so that they can have bios of different sizes. This means we can personalize the bios we return. File systems may want to embed the bio inside another structure, to avoid allocating more items (and stuffing them in ->bi_private) after they get a bio. Or we may want to embed a number of bio_vecs directly at the end of a bio, to avoid doing two allocations to return a bio. This is now possible. Signed-off-by: Jens Axboe diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index ce26c84..3326750 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1060,7 +1060,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_page_pool; } - cc->bs = bioset_create(MIN_IOS, MIN_IOS); + cc->bs = bioset_create(MIN_IOS, 0); if (!cc->bs) { ti->error = "Cannot allocate crypt bioset"; goto bad_bs; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 2fd6d44..a343385 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -56,7 +56,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages) if (!client->pool) goto bad; - client->bios = bioset_create(16, 16); + client->bios = bioset_create(16, 0); if (!client->bios) goto bad; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 343094c..421c9f0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1093,7 +1093,7 @@ static struct mapped_device *alloc_dev(int minor) if (!md->tio_pool) goto bad_tio_pool; - md->bs = bioset_create(16, 16); + md->bs = bioset_create(16, 0); if (!md->bs) goto bad_no_bioset; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 19caf7c..77ebc3c 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -111,7 +111,7 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs) && bip->bip_buf != NULL) kfree(bip->bip_buf); - mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); + bvec_free_bs(bs, bip->bip_vec, bip->bip_pool); mempool_free(bip, bs->bio_integrity_pool); bio->bi_integrity = NULL; diff --git a/fs/bio.c b/fs/bio.c index 1ab8986..0146f80 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -31,8 +31,6 @@ DEFINE_TRACE(block_split); -static struct kmem_cache *bio_slab __read_mostly; - static mempool_t *bio_split_pool __read_mostly; /*
@@ -40,9 +38,8 @@ static mempool_t *bio_split_pool __read_mostly; * break badly! cannot be bigger than what you can fit into an * unsigned short */ - #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } -static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { +struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), }; #undef BV @@ -53,11 +50,119 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { */ struct bio_set *fs_bio_set; +/* + * Our slab pool management + */ +struct bio_slab { + struct kmem_cache *slab; + unsigned int slab_ref; + unsigned int slab_size; + char name[8]; +}; +static DEFINE_MUTEX(bio_slab_lock); +static struct bio_slab *bio_slabs; +static unsigned int bio_slab_nr, bio_slab_max; + +static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +{ + unsigned int sz = sizeof(struct bio) + extra_size; + struct kmem_cache *slab = NULL; + struct bio_slab *bslab; + unsigned int i, entry = -1; + + mutex_lock(&bio_slab_lock); + + i = 0; + while (i < bio_slab_nr) { + struct bio_slab *bslab = &bio_slabs[i]; + + if (!bslab->slab && entry == -1) + entry = i; + else if (bslab->slab_size == sz) { + slab = bslab->slab; + bslab->slab_ref++; + break; + } + i++; + } + + if (slab) + goto out_unlock; + + if (bio_slab_nr == bio_slab_max && entry == -1) { + bio_slab_max <<= 1; + bio_slabs = krealloc(bio_slabs, + bio_slab_max * sizeof(struct bio_slab), + GFP_KERNEL); + if (!bio_slabs) + goto out_unlock; + } + if (entry == -1) + entry = bio_slab_nr++; + + bslab = &bio_slabs[entry]; + + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); + slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL); + if (!slab) + goto out_unlock; + + printk("bio: create slab <%s> at %d\n", bslab->name, entry); + bslab->slab = slab; + bslab->slab_ref = 1; + bslab->slab_size = sz; +out_unlock: + mutex_unlock(&bio_slab_lock); + return slab; +} + +static void bio_put_slab(struct bio_set *bs) +{ + struct bio_slab *bslab = NULL; + unsigned int i; + + mutex_lock(&bio_slab_lock); + + for (i = 0; i < bio_slab_nr; i++) { + if (bs->bio_slab == bio_slabs[i].slab) { + bslab = &bio_slabs[i]; + break; + } + } + + if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) + goto out; + + WARN_ON(!bslab->slab_ref); + + if (--bslab->slab_ref) + goto out; + + kmem_cache_destroy(bslab->slab); + bslab->slab = NULL; + +out: + mutex_unlock(&bio_slab_lock); +} + unsigned int bvec_nr_vecs(unsigned short idx) { return bvec_slabs[idx].nr_vecs; } +void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) +{ + BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); + + if (idx == BIOVEC_MAX_IDX) + mempool_free(bv, bs->bvec_pool); + else { + struct biovec_slab *bvs = bvec_slabs + idx; + + kmem_cache_free(bvs->slab, bv); + } +} + struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { @@ -134,24 +239,22 @@ fallback: void bio_free(struct bio *bio, struct bio_set *bs) { - if (bio->bi_io_vec) { - const int pool_idx = BIO_POOL_IDX(bio); + void *p; - BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS); - - if (pool_idx == BIOVEC_MAX_IDX) - mempool_free(bio->bi_io_vec, bs->bvec_pool); - else { - struct biovec_slab *bvs = bvec_slabs + pool_idx; - - kmem_cache_free(bvs->slab, bio->bi_io_vec); - } - } + if (bio->bi_io_vec) + bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); if (bio_integrity(bio)) bio_integrity_free(bio, bs); - mempool_free(bio, bs->bio_pool); + /* + * If we 
have front padding, adjust the bio pointer before freeing + */ + p = bio; + if (bs->front_pad) + p -= bs->front_pad; + + mempool_free(p, bs->bio_pool); } /* @@ -188,16 +291,20 @@ void bio_init(struct bio *bio) * for a &struct bio to become free. If a %NULL @bs is passed in, we will * fall back to just using @kmalloc to allocate the required memory. * - * allocate bio and iovecs from the memory pools specified by the - * bio_set structure, or @kmalloc if none given. + * Note that the caller must set ->bi_destructor on succesful return + * of a bio, to do the appropriate freeing of the bio once the reference + * count drops to zero. **/ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { - struct bio *bio; + struct bio *bio = NULL; + + if (bs) { + void *p = mempool_alloc(bs->bio_pool, gfp_mask); - if (bs) - bio = mempool_alloc(bs->bio_pool, gfp_mask); - else + if (p) + bio = p + bs->front_pad; + } else bio = kmalloc(sizeof(*bio), gfp_mask); if (likely(bio)) { @@ -1398,11 +1505,25 @@ void bioset_free(struct bio_set *bs) bioset_integrity_free(bs); biovec_free_pools(bs); + bio_put_slab(bs); kfree(bs); } -struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) +/** + * bioset_create - Create a bio_set + * @pool_size: Number of bio and bio_vecs to cache in the mempool + * @front_pad: Number of bytes to allocate in front of the returned bio + * + * Description: + * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller + * to ask for a number of bytes to be allocated in front of the bio. + * Front pad allocation is useful for embedding the bio inside + * another structure, to avoid allocating extra data to go with the bio. + * Note that the bio must be embedded at the END of that structure always, + * or things will break badly. 
+ */ +struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) { struct bio_set *bs; @@ -1410,16 +1531,22 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) if (!bs) return NULL; - bs->bio_slab = bio_slab; + bs->front_pad = front_pad; - bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bs->bio_slab); + bs->bio_slab = bio_find_or_create_slab(front_pad); + if (!bs->bio_slab) { + kfree(bs); + return NULL; + } + + bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab); if (!bs->bio_pool) goto bad; - if (bioset_integrity_create(bs, bio_pool_size)) + if (bioset_integrity_create(bs, pool_size)) goto bad; - if (!biovec_create_pools(bs, bvec_pool_size)) + if (!biovec_create_pools(bs, pool_size)) return bs; bad: @@ -1443,12 +1570,16 @@ static void __init biovec_init_slabs(void) static int __init init_bio(void) { - bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + bio_slab_max = 2; + bio_slab_nr = 0; + bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); + if (!bio_slabs) + panic("bio: can't allocate bios\n"); bio_integrity_init_slab(); biovec_init_slabs(); - fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); + fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); if (!fs_bio_set) panic("bio: can't allocate bios\n"); diff --git a/include/linux/bio.h b/include/linux/bio.h index 9340098..4b80d35 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -334,7 +334,7 @@ struct bio_pair { extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); extern void bio_pair_release(struct bio_pair *dbio); -extern struct bio_set *bioset_create(int, int); +extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); extern struct bio *bio_alloc(gfp_t, int); @@ -379,6 +379,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); +extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); /* @@ -401,6 +402,8 @@ static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) struct bio_set { struct kmem_cache *bio_slab; + unsigned int front_pad; + mempool_t *bio_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t *bio_integrity_pool; @@ -415,6 +418,7 @@ struct biovec_slab { }; extern struct bio_set *fs_bio_set; +extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly; /* * a small number of entries is fine, not going to be performance critical. -- cgit v0.10.2 From 392ddc32982a5c661dd90dd49a3cb37f1c68b782 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Dec 2008 12:42:54 +0100 Subject: bio: add support for inlining a number of bio_vecs inside the bio When we go and allocate a bio for IO, we actually do two allocations. One for the bio itself, and one for the bi_io_vec that holds the actual pages we are interested in. This feature inlines a definable number of io vecs inside the bio itself, so we eliminate the bio_vec array allocation for IOs up to a certain size. It defaults to 4 vecs, which is typically 16k of IO.
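In numbers: with the default of 4 inline vecs and 4k pages, 4 x 4k = 16k, so a bio with up to four segments needs only the single bio allocation. Condensed from the patch below, the fast path in bio_alloc_bioset() becomes roughly:

	if (nr_iovecs <= BIO_INLINE_VECS) {
		/* use the bio_vecs embedded at the end of the bio itself */
		idx = 0;
		bvl = bio->bi_inline_vecs;
		nr_iovecs = BIO_INLINE_VECS;
	} else {
		/* bigger IOs still take a separate biovec allocation */
		bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
		nr_iovecs = bvec_nr_vecs(idx);
	}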
Signed-off-by: Jens Axboe diff --git a/fs/bio.c b/fs/bio.c index 0146f80..75e6be1 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -31,6 +31,12 @@ DEFINE_TRACE(block_split); +/* + * Test patch to inline a certain number of bi_io_vec's inside the bio + * itself, to shrink a bio data allocation from two mempool calls to one + */ +#define BIO_INLINE_VECS 4 + static mempool_t *bio_split_pool __read_mostly; /* @@ -241,7 +247,7 @@ void bio_free(struct bio *bio, struct bio_set *bs) { void *p; - if (bio->bi_io_vec) + if (bio_has_allocated_vec(bio)) bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); if (bio_integrity(bio)) @@ -267,7 +273,8 @@ static void bio_fs_destructor(struct bio *bio) static void bio_kmalloc_destructor(struct bio *bio) { - kfree(bio->bi_io_vec); + if (bio_has_allocated_vec(bio)) + kfree(bio->bi_io_vec); kfree(bio); } @@ -314,7 +321,16 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) if (likely(nr_iovecs)) { unsigned long uninitialized_var(idx); - bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + if (nr_iovecs <= BIO_INLINE_VECS) { + idx = 0; + bvl = bio->bi_inline_vecs; + nr_iovecs = BIO_INLINE_VECS; + memset(bvl, 0, BIO_INLINE_VECS * sizeof(*bvl)); + } else { + bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, + bs); + nr_iovecs = bvec_nr_vecs(idx); + } if (unlikely(!bvl)) { if (bs) mempool_free(bio, bs->bio_pool); @@ -324,7 +340,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) goto out; } bio->bi_flags |= idx << BIO_POOL_OFFSET; - bio->bi_max_vecs = bvec_nr_vecs(idx); + bio->bi_max_vecs = nr_iovecs; } bio->bi_io_vec = bvl; } @@ -1525,6 +1541,7 @@ void bioset_free(struct bio_set *bs) */ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) { + unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); struct bio_set *bs; bs = kzalloc(sizeof(*bs), GFP_KERNEL); @@ -1533,7 +1550,7 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) bs->front_pad = front_pad; - bs->bio_slab = bio_find_or_create_slab(front_pad); + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); if (!bs->bio_slab) { kfree(bs); return NULL; diff --git a/include/linux/bio.h b/include/linux/bio.h index 4b80d35..18462c5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -102,6 +102,13 @@ struct bio { #endif bio_destructor_t *bi_destructor; /* destructor */ + + /* + * We can inline a number of vecs at the end of the bio, to avoid + * double allocations for a small number of bio_vecs. This member + * MUST obviously be kept at the very end of the bio. + */ + struct bio_vec bi_inline_vecs[0]; }; /* @@ -213,6 +220,11 @@ static inline void *bio_data(struct bio *bio) return NULL; } +static inline int bio_has_allocated_vec(struct bio *bio) +{ + return bio->bi_io_vec && bio->bi_io_vec != bio->bi_inline_vecs; +} + /* * will die */ -- cgit v0.10.2 From abf137dd7712132ee56d5b3143c2ff61a72a5faa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Dec 2008 08:11:22 +0100 Subject: aio: make the lookup_ioctx() lockless The mm->ioctx_list is currently protected by a reader-writer lock, so we always grab that lock on the read side for doing ioctx lookups. As the workload is extremely reader biased, turn this into an rcu hlist so we can make lookup_ioctx() lockless. Get rid of the rwlock and use a spinlock for providing update side exclusion. There's usually only 1 entry on this list, so it doesn't make sense to look into fancier data structures. 
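The read side then reduces to a plain RCU hlist walk; condensed from the patch below, lookup_ioctx() does:

	rcu_read_lock();
	hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
		if (ctx->user_id == ctx_id && !ctx->dead) {
			get_ioctx(ctx);
			break;
		}
	}
	rcu_read_unlock();

The update side takes mm->ioctx_lock around hlist_add_head_rcu()/hlist_del_rcu(), and the kioctx itself is freed via call_rcu() so a concurrent lockless walker can never step on freed memory.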
Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index ef3635b..0767827 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -263,7 +263,7 @@ int s390_enable_sie(void) /* lets check if we are allowed to replace the mm */ task_lock(tsk); if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || - tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) { + tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) { task_unlock(tsk); return -EINVAL; } @@ -279,7 +279,7 @@ int s390_enable_sie(void) /* Now lets check again if something happened */ task_lock(tsk); if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || - tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) { + tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) { mmput(mm); task_unlock(tsk); return -EINVAL; diff --git a/fs/aio.c b/fs/aio.c index f658441..d6f89d3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -191,6 +191,20 @@ static int aio_setup_ring(struct kioctx *ctx) kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ } while(0) +static void ctx_rcu_free(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + unsigned nr_events = ctx->max_reqs; + + kmem_cache_free(kioctx_cachep, ctx); + + if (nr_events) { + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - nr_events > aio_nr); + aio_nr -= nr_events; + spin_unlock(&aio_nr_lock); + } +} /* __put_ioctx * Called when the last user of an aio context has gone away, @@ -198,8 +212,6 @@ static int aio_setup_ring(struct kioctx *ctx) */ static void __put_ioctx(struct kioctx *ctx) { - unsigned nr_events = ctx->max_reqs; - BUG_ON(ctx->reqs_active); cancel_delayed_work(&ctx->wq); @@ -208,14 +220,7 @@ static void __put_ioctx(struct kioctx *ctx) mmdrop(ctx->mm); ctx->mm = NULL; pr_debug("__put_ioctx: freeing %p\n", ctx); - kmem_cache_free(kioctx_cachep, ctx); - - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); - } + call_rcu(&ctx->rcu_head, ctx_rcu_free); } #define get_ioctx(kioctx) do { \ @@ -235,6 +240,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) { struct mm_struct *mm; struct kioctx *ctx; + int did_sync = 0; /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || @@ -267,21 +273,30 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) goto out_freectx; /* limit the number of system wide aios */ - spin_lock(&aio_nr_lock); - if (aio_nr + ctx->max_reqs > aio_max_nr || - aio_nr + ctx->max_reqs < aio_nr) - ctx->max_reqs = 0; - else - aio_nr += ctx->max_reqs; - spin_unlock(&aio_nr_lock); + do { + spin_lock_bh(&aio_nr_lock); + if (aio_nr + nr_events > aio_max_nr || + aio_nr + nr_events < aio_nr) + ctx->max_reqs = 0; + else + aio_nr += ctx->max_reqs; + spin_unlock_bh(&aio_nr_lock); + if (ctx->max_reqs || did_sync) + break; + + /* wait for rcu callbacks to have completed before giving up */ + synchronize_rcu(); + did_sync = 1; + ctx->max_reqs = nr_events; + } while (1); + if (ctx->max_reqs == 0) goto out_cleanup; /* now link into global list. 
*/ - write_lock(&mm->ioctx_list_lock); - ctx->next = mm->ioctx_list; - mm->ioctx_list = ctx; - write_unlock(&mm->ioctx_list_lock); + spin_lock(&mm->ioctx_lock); + hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); + spin_unlock(&mm->ioctx_lock); dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, current->mm, ctx->ring_info.nr); @@ -375,11 +390,12 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) */ void exit_aio(struct mm_struct *mm) { - struct kioctx *ctx = mm->ioctx_list; - mm->ioctx_list = NULL; - while (ctx) { - struct kioctx *next = ctx->next; - ctx->next = NULL; + struct kioctx *ctx; + + while (!hlist_empty(&mm->ioctx_list)) { + ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); + hlist_del_rcu(&ctx->list); + aio_cancel_all(ctx); wait_for_all_aios(ctx); @@ -394,7 +410,6 @@ void exit_aio(struct mm_struct *mm) atomic_read(&ctx->users), ctx->dead, ctx->reqs_active); put_ioctx(ctx); - ctx = next; } } @@ -555,19 +570,21 @@ int aio_put_req(struct kiocb *req) static struct kioctx *lookup_ioctx(unsigned long ctx_id) { - struct kioctx *ioctx; - struct mm_struct *mm; + struct mm_struct *mm = current->mm; + struct kioctx *ctx = NULL; + struct hlist_node *n; - mm = current->mm; - read_lock(&mm->ioctx_list_lock); - for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) - if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) { - get_ioctx(ioctx); + rcu_read_lock(); + + hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { + if (ctx->user_id == ctx_id && !ctx->dead) { + get_ioctx(ctx); break; } - read_unlock(&mm->ioctx_list_lock); + } - return ioctx; + rcu_read_unlock(); + return ctx; } /* @@ -1215,19 +1232,14 @@ out: static void io_destroy(struct kioctx *ioctx) { struct mm_struct *mm = current->mm; - struct kioctx **tmp; int was_dead; /* delete the entry from the list is someone else hasn't already */ - write_lock(&mm->ioctx_list_lock); + spin_lock(&mm->ioctx_lock); was_dead = ioctx->dead; ioctx->dead = 1; - for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx; - tmp = &(*tmp)->next) - ; - if (*tmp) - *tmp = ioctx->next; - write_unlock(&mm->ioctx_list_lock); + hlist_del_rcu(&ioctx->list); + spin_unlock(&mm->ioctx_lock); dprintk("aio_release(%p)\n", ioctx); if (likely(!was_dead)) diff --git a/include/linux/aio.h b/include/linux/aio.h index f6b8cf9..b16a957 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -183,7 +184,7 @@ struct kioctx { /* This needs improving */ unsigned long user_id; - struct kioctx *next; + struct hlist_node list; wait_queue_head_t wait; @@ -199,6 +200,8 @@ struct kioctx { struct aio_ring_info ring_info; struct delayed_work wq; + + struct rcu_head rcu_head; }; /* prototypes */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fe825471..9cfc9b6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -232,8 +232,9 @@ struct mm_struct { struct core_state *core_state; /* coredumping support */ /* aio bits */ - rwlock_t ioctx_list_lock; /* aio lock */ - struct kioctx *ioctx_list; + spinlock_t ioctx_lock; + struct hlist_head ioctx_list; + #ifdef CONFIG_MM_OWNER /* * "owner" points to a task that is regarded as the canonical diff --git a/kernel/fork.c b/kernel/fork.c index 6144b36..43cbf30 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -415,8 +415,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); - 
rwlock_init(&mm->ioctx_list_lock); - mm->ioctx_list = NULL; + spin_lock_init(&mm->ioctx_lock); + INIT_HLIST_HEAD(&mm->ioctx_list); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; mm_init_owner(mm, p); -- cgit v0.10.2 From b374d18a4bfce705e4a99ae9f501b53e86ecb283 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 31 Oct 2008 10:05:07 +0100 Subject: block: get rid of elevator_t typedef Just use struct elevator_queue everywhere instead. Signed-off-by: Jens Axboe diff --git a/block/as-iosched.c b/block/as-iosched.c index 802b5d0..631f6f4 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -1339,7 +1339,7 @@ static int as_may_queue(struct request_queue *q, int rw) return ret; } -static void as_exit_queue(elevator_t *e) +static void as_exit_queue(struct elevator_queue *e) { struct as_data *ad = e->elevator_data; @@ -1409,7 +1409,7 @@ as_var_store(unsigned long *var, const char *page, size_t count) return count; } -static ssize_t est_time_show(elevator_t *e, char *page) +static ssize_t est_time_show(struct elevator_queue *e, char *page) { struct as_data *ad = e->elevator_data; int pos = 0; @@ -1427,7 +1427,7 @@ static ssize_t est_time_show(elevator_t *e, char *page) } #define SHOW_FUNCTION(__FUNC, __VAR) \ -static ssize_t __FUNC(elevator_t *e, char *page) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct as_data *ad = e->elevator_data; \ return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ @@ -1440,7 +1440,7 @@ SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct as_data *ad = e->elevator_data; \ int ret = as_var_store(__PTR, (page), count); \ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a2bfec7..adaf93a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2178,7 +2178,7 @@ static void cfq_put_async_queues(struct cfq_data *cfqd) cfq_put_queue(cfqd->async_idle_cfqq); } -static void cfq_exit_queue(elevator_t *e) +static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; struct request_queue *q = cfqd->queue; @@ -2288,7 +2288,7 @@ cfq_var_store(unsigned int *var, const char *page, size_t count) } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(elevator_t *e, char *page) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct cfq_data *cfqd = e->elevator_data; \ unsigned int __data = __VAR; \ @@ -2308,7 +2308,7 @@ SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct cfq_data *cfqd = e->elevator_data; \ unsigned int __data; \ diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index fd31117..c4d991d 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -334,7 +334,7 @@ static int deadline_queue_empty(struct request_queue *q) && list_empty(&dd->fifo_list[READ]); } -static void deadline_exit_queue(elevator_t *e) +static void deadline_exit_queue(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; @@ -387,7 +387,7 @@ deadline_var_store(int *var, const char *page, size_t count) } 
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(elevator_t *e, char *page) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct deadline_data *dd = e->elevator_data; \ int __data = __VAR; \ @@ -403,7 +403,7 @@ SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct deadline_data *dd = e->elevator_data; \ int __data; \ diff --git a/block/elevator.c b/block/elevator.c index ff60177..98259ed 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -65,7 +65,7 @@ DEFINE_TRACE(block_rq_issue); static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) { struct request_queue *q = rq->q; - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; if (e->ops->elevator_allow_merge_fn) return e->ops->elevator_allow_merge_fn(q, rq, bio); @@ -208,13 +208,13 @@ __setup("elevator=", elevator_setup); static struct kobj_type elv_ktype; -static elevator_t *elevator_alloc(struct request_queue *q, +static struct elevator_queue *elevator_alloc(struct request_queue *q, struct elevator_type *e) { - elevator_t *eq; + struct elevator_queue *eq; int i; - eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL | __GFP_ZERO, q->node); + eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node); if (unlikely(!eq)) goto err; @@ -240,8 +240,9 @@ err: static void elevator_release(struct kobject *kobj) { - elevator_t *e = container_of(kobj, elevator_t, kobj); + struct elevator_queue *e; + e = container_of(kobj, struct elevator_queue, kobj); elevator_put(e->elevator_type); kfree(e->hash); kfree(e); @@ -297,7 +298,7 @@ int elevator_init(struct request_queue *q, char *name) } EXPORT_SYMBOL(elevator_init); -void elevator_exit(elevator_t *e) +void elevator_exit(struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); if (e->ops->elevator_exit_fn) @@ -311,7 +312,7 @@ EXPORT_SYMBOL(elevator_exit); static void elv_activate_rq(struct request_queue *q, struct request *rq) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; if (e->ops->elevator_activate_req_fn) e->ops->elevator_activate_req_fn(q, rq); @@ -319,7 +320,7 @@ static void elv_activate_rq(struct request_queue *q, struct request *rq) static void elv_deactivate_rq(struct request_queue *q, struct request *rq) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; if (e->ops->elevator_deactivate_req_fn) e->ops->elevator_deactivate_req_fn(q, rq); @@ -338,7 +339,7 @@ static void elv_rqhash_del(struct request_queue *q, struct request *rq) static void elv_rqhash_add(struct request_queue *q, struct request *rq) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; BUG_ON(ELV_ON_HASH(rq)); hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]); @@ -352,7 +353,7 @@ static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)]; struct hlist_node *entry, *next; struct request *rq; @@ -494,7 +495,7 @@ EXPORT_SYMBOL(elv_dispatch_add_tail); int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) { - elevator_t *e = q->elevator; + struct elevator_queue 
*e = q->elevator; struct request *__rq; int ret; @@ -529,7 +530,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) void elv_merged_request(struct request_queue *q, struct request *rq, int type) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; if (e->ops->elevator_merged_fn) e->ops->elevator_merged_fn(q, rq, type); @@ -543,7 +544,7 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type) void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; if (e->ops->elevator_merge_req_fn) e->ops->elevator_merge_req_fn(q, rq, next); @@ -846,7 +847,7 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq) int elv_queue_empty(struct request_queue *q) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; if (!list_empty(&q->queue_head)) return 0; @@ -860,7 +861,7 @@ EXPORT_SYMBOL(elv_queue_empty); struct request *elv_latter_request(struct request_queue *q, struct request *rq) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; if (e->ops->elevator_latter_req_fn) return e->ops->elevator_latter_req_fn(q, rq); @@ -869,7 +870,7 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq) struct request *elv_former_request(struct request_queue *q, struct request *rq) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; if (e->ops->elevator_former_req_fn) return e->ops->elevator_former_req_fn(q, rq); @@ -878,7 +879,7 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; if (e->ops->elevator_set_req_fn) return e->ops->elevator_set_req_fn(q, rq, gfp_mask); @@ -889,7 +890,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) void elv_put_request(struct request_queue *q, struct request *rq) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; if (e->ops->elevator_put_req_fn) e->ops->elevator_put_req_fn(rq); @@ -897,7 +898,7 @@ void elv_put_request(struct request_queue *q, struct request *rq) int elv_may_queue(struct request_queue *q, int rw) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; if (e->ops->elevator_may_queue_fn) return e->ops->elevator_may_queue_fn(q, rw); @@ -920,7 +921,7 @@ EXPORT_SYMBOL(elv_abort_queue); void elv_completed_request(struct request_queue *q, struct request *rq) { - elevator_t *e = q->elevator; + struct elevator_queue *e = q->elevator; /* * request is released from the driver, io must be done @@ -955,13 +956,14 @@ void elv_completed_request(struct request_queue *q, struct request *rq) static ssize_t elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { - elevator_t *e = container_of(kobj, elevator_t, kobj); struct elv_fs_entry *entry = to_elv(attr); + struct elevator_queue *e; ssize_t error; if (!entry->show) return -EIO; + e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); error = e->ops ? 
 	mutex_unlock(&e->sysfs_lock);
@@ -972,13 +974,14 @@ static ssize_t
 elv_attr_store(struct kobject *kobj, struct attribute *attr,
 	       const char *page, size_t length)
 {
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
 	struct elv_fs_entry *entry = to_elv(attr);
+	struct elevator_queue *e;
 	ssize_t error;
 
 	if (!entry->store)
 		return -EIO;
 
+	e = container_of(kobj, struct elevator_queue, kobj);
 	mutex_lock(&e->sysfs_lock);
 	error = e->ops ? entry->store(e, page, length) : -ENOENT;
 	mutex_unlock(&e->sysfs_lock);
@@ -997,7 +1000,7 @@ static struct kobj_type elv_ktype = {
 
 int elv_register_queue(struct request_queue *q)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 	int error;
 
 	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
@@ -1015,7 +1018,7 @@ int elv_register_queue(struct request_queue *q)
 	return error;
 }
 
-static void __elv_unregister_queue(elevator_t *e)
+static void __elv_unregister_queue(struct elevator_queue *e)
 {
 	kobject_uevent(&e->kobj, KOBJ_REMOVE);
 	kobject_del(&e->kobj);
@@ -1078,7 +1081,7 @@ EXPORT_SYMBOL_GPL(elv_unregister);
  */
 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
-	elevator_t *old_elevator, *e;
+	struct elevator_queue *old_elevator, *e;
 	void *data;
 
 	/*
@@ -1184,7 +1187,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 
 ssize_t elv_iosched_show(struct request_queue *q, char *name)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 	struct elevator_type *elv = e->elevator_type;
 	struct elevator_type *__e;
 	int len = 0;
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index c23e029..3a0d369 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -76,7 +76,7 @@ static void *noop_init_queue(struct request_queue *q)
 	return nd;
 }
 
-static void noop_exit_queue(elevator_t *e)
+static void noop_exit_queue(struct elevator_queue *e)
 {
 	struct noop_data *nd = e->elevator_data;
 
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index d3a91ca..0766ce6 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -722,7 +722,7 @@ static int __init nbd_init(void)
 
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = alloc_disk(1 << part_shift);
-		elevator_t *old_e;
+		struct elevator_queue *old_e;
 		if (!disk)
 			goto out;
 		nbd_dev[i].disk = disk;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 41bbadf..7035cec 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -26,7 +26,6 @@ struct scsi_ioctl_command;
 
 struct request_queue;
 struct elevator_queue;
-typedef struct elevator_queue elevator_t;
 struct request_pm_state;
 struct blk_trace;
 struct request;
@@ -313,7 +312,7 @@ struct request_queue
 	 */
 	struct list_head	queue_head;
 	struct request		*last_merge;
-	elevator_t		*elevator;
+	struct elevator_queue	*elevator;
 
 	/*
 	 * the queue request freelist, one for reads and one for writes
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 92f6f63..7a20425 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -28,7 +28,7 @@ typedef void (elevator_activate_req_fn) (struct request_queue *, struct request
 typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
 
 typedef void *(elevator_init_fn) (struct request_queue *);
-typedef void (elevator_exit_fn) (elevator_t *);
+typedef void (elevator_exit_fn) (struct elevator_queue *);
 
 struct elevator_ops
 {
@@ -62,8 +62,8 @@ struct elevator_ops
 struct elv_fs_entry {
 	struct attribute attr;
-	ssize_t (*show)(elevator_t *, char *);
-	ssize_t (*store)(elevator_t *, const char *, size_t);
+	ssize_t (*show)(struct elevator_queue *, char *);
+	ssize_t (*store)(struct elevator_queue *, const char *, size_t);
 };
 
 /*
@@ -130,7 +130,7 @@ extern ssize_t elv_iosched_show(struct request_queue *, char *);
 extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t);
 
 extern int elevator_init(struct request_queue *, char *);
-extern void elevator_exit(elevator_t *);
+extern void elevator_exit(struct elevator_queue *);
 extern int elv_rq_merge_ok(struct request *, struct bio *);
 
 /*
-- cgit v0.10.2

From 31dcfab0ae6eb0340d456931df4d478e17efc65d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 31 Oct 2008 10:06:37 +0100
Subject: nbd: tell the block layer that it is not a rotational device

Then we can get rid of that manual elevator type fiddling.

Signed-off-by: Jens Axboe

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 0766ce6..7bcc1d8 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -722,7 +722,6 @@ static int __init nbd_init(void)
 
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = alloc_disk(1 << part_shift);
-		struct elevator_queue *old_e;
 		if (!disk)
 			goto out;
 		nbd_dev[i].disk = disk;
@@ -736,11 +735,10 @@ static int __init nbd_init(void)
 			put_disk(disk);
 			goto out;
 		}
-		old_e = disk->queue->elevator;
-		if (elevator_init(disk->queue, "deadline") == 0 ||
-		    elevator_init(disk->queue, "noop") == 0) {
-			elevator_exit(old_e);
-		}
+		/*
+		 * Tell the block layer that we are not a rotational device
+		 */
+		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
 	}
 
 	if (register_blkdev(NBD_MAJOR, "nbd")) {
-- cgit v0.10.2

From 30e0dc28bff9dc456cdfc2aae4aca78b8b1a1cec Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 20 Oct 2008 15:44:28 +0200
Subject: cfq-iosched: remove limit of dispatch depth of max 4 times quantum

This basically limits the hardware queue depth to 4*quantum at any
point in time, which is 16 with the default settings. As CFQ uses
other means to shrink the hardware queue when necessary in the first
place, there's really no need for this extra heuristic. Additionally,
it ends up hurting performance in some cases.

Signed-off-by: Jens Axboe

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index adaf93a..ee8a90c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1136,12 +1136,8 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 		if (cfq_class_idle(cfqq))
 			max_dispatch = 1;
 
-		if (cfqq->dispatched >= max_dispatch) {
-			if (cfqd->busy_queues > 1)
-				break;
-			if (cfqq->dispatched >= 4 * max_dispatch)
-				break;
-		}
+		if (cfqq->dispatched >= max_dispatch && cfqd->busy_queues > 1)
+			break;
 
 		if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
 			break;
-- cgit v0.10.2

From a6f23657d3072bde6844055bbc2290e497f33fbc Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 24 Oct 2008 12:52:42 +0200
Subject: block: add one-hit cache for disk partition lookup

disk_map_sector_rcu() returns a partition from a sector offset, which
we use for IO statistics on a per-partition basis. The lookup itself
is an O(N) list lookup, where N is the number of partitions. This
actually hurts performance quite a bit, even for low-numbered
partitions; on higher-numbered partitions it can get pretty bad.

Solve this by adding a one-hit cache for partition lookup. This makes
the lookup O(1) for the case where we do most IO to one partition.
Even for mixed partition workloads, the amortized cost is pretty close
to O(1), since the natural IO batching makes the one-hit cache last
for lots of IOs.
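In outline, the cached lookup amounts to the following (a condensed
sketch of the genhd.c hunk below; sector_in_part() is the helper the
patch introduces, and all names are taken from the patch itself):

	/* fast path: re-check the last partition that matched */
	part = rcu_dereference(ptbl->last_lookup);
	if (part && sector_in_part(part, sector))
		return part;

	/* slow path: O(N) scan, remembering any hit for next time */
	for (i = 1; i < ptbl->len; i++) {
		part = rcu_dereference(ptbl->part[i]);
		if (part && sector_in_part(part, sector)) {
			rcu_assign_pointer(ptbl->last_lookup, part);
			return part;
		}
	}
	return &disk->part0;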
Signed-off-by: Jens Axboe

diff --git a/block/genhd.c b/block/genhd.c
index 2f7feda..d84a7df 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -181,6 +181,12 @@ void disk_part_iter_exit(struct disk_part_iter *piter)
 }
 EXPORT_SYMBOL_GPL(disk_part_iter_exit);
 
+static inline int sector_in_part(struct hd_struct *part, sector_t sector)
+{
+	return part->start_sect <= sector &&
+		sector < part->start_sect + part->nr_sects;
+}
+
 /**
  * disk_map_sector_rcu - map sector to partition
  * @disk: gendisk of interest
@@ -199,16 +205,22 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
 struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 {
 	struct disk_part_tbl *ptbl;
+	struct hd_struct *part;
 	int i;
 
 	ptbl = rcu_dereference(disk->part_tbl);
 
+	part = rcu_dereference(ptbl->last_lookup);
+	if (part && sector_in_part(part, sector))
+		return part;
+
 	for (i = 1; i < ptbl->len; i++) {
-		struct hd_struct *part = rcu_dereference(ptbl->part[i]);
+		part = rcu_dereference(ptbl->part[i]);
 
-		if (part && part->start_sect <= sector &&
-		    sector < part->start_sect + part->nr_sects)
+		if (part && sector_in_part(part, sector)) {
+			rcu_assign_pointer(ptbl->last_lookup, part);
 			return part;
+		}
 	}
 	return &disk->part0;
 }
@@ -888,8 +900,11 @@ static void disk_replace_part_tbl(struct gendisk *disk,
 	struct disk_part_tbl *old_ptbl = disk->part_tbl;
 
 	rcu_assign_pointer(disk->part_tbl, new_ptbl);
-	if (old_ptbl)
+
+	if (old_ptbl) {
+		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
 		call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
+	}
 }
 
 /**
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 3df7742..16948ea 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -126,6 +126,7 @@ struct blk_scsi_cmd_filter {
 struct disk_part_tbl {
 	struct rcu_head rcu_head;
 	int len;
+	struct hd_struct *last_lookup;
 	struct hd_struct *part[];
 };
-- cgit v0.10.2

From 18af8b2ca34b831c32c6fa01e7ce33143c33bb63 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Thu, 4 Dec 2008 08:56:35 +0100
Subject: block: use min_not_zero in blk_queue_stack_limits

Zero is invalid for max_phys_segments, max_hw_segments, and
max_segment_size, so it's better to use min_not_zero instead of min.
min() works today only because commit 0e435ac makes sure that these
values are set to non-zero defaults when a queue is initialized
properly.

With this patch, blk_queue_stack_limits() does almost the same thing
that dm's combine_restrictions_low() does, so I think it's easy to
remove dm's combine_restrictions_low().
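For reference, min_not_zero() treats zero as "no limit set": if either
value is zero it returns the other, otherwise the smaller of the two.
A rough sketch of the semantics (min_not_zero_sketch is a hypothetical
stand-in, not the exact kernel macro):

	static inline unsigned int min_not_zero_sketch(unsigned int x,
						       unsigned int y)
	{
		/* zero means "unset": never let it win the comparison */
		if (x == 0)
			return y;
		if (y == 0)
			return x;
		return min(x, y);
	}

With plain min(), an unset (zero) limit on the bottom queue would
propagate up as a zero limit on the stacked queue; min_not_zero()
keeps the other queue's value instead.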
Signed-off-by: FUJITA Tomonori
Signed-off-by: Jens Axboe

diff --git a/block/blk-settings.c b/block/blk-settings.c
index afa55e1..59fd05d 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -319,9 +319,9 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask);
-	t->max_phys_segments = min(t->max_phys_segments, b->max_phys_segments);
-	t->max_hw_segments = min(t->max_hw_segments, b->max_hw_segments);
-	t->max_segment_size = min(t->max_segment_size, b->max_segment_size);
+	t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments);
+	t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments);
+	t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size);
 	t->hardsect_size = max(t->hardsect_size, b->hardsect_size);
 	if (!t->queue_lock)
 		WARN_ON_ONCE(1);
-- cgit v0.10.2

From 3c18ce71af754cefae75103dbae28817e04b2db4 Mon Sep 17 00:00:00 2001
From: Roel Kluin
Date: Wed, 10 Dec 2008 15:47:33 +0100
Subject: block: make blk_softirq_init() static

Sparse asked whether these could be static.

Signed-off-by: Roel Kluin
Signed-off-by: Jens Axboe

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index e660d26..ce0efc6 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -161,7 +161,7 @@ void blk_complete_request(struct request *req)
 }
 EXPORT_SYMBOL(blk_complete_request);
 
-__init int blk_softirq_init(void)
+static __init int blk_softirq_init(void)
 {
 	int i;
-- cgit v0.10.2

From b3a6ffe16b5cc48abe7db8d04882dc45280eb693 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 12 Dec 2008 09:51:16 +0100
Subject: Get rid of CONFIG_LSF

We have two separate config entries for large devices/files. One is
CONFIG_LBD, which guards just the devices; the other is CONFIG_LSF,
which handles large files. This doesn't make a lot of sense, since you
typically want both or neither. So get rid of CONFIG_LSF and change
the CONFIG_LBD wording to indicate that it covers both.

Acked-by: Jean Delvare
Signed-off-by: Jens Axboe

diff --git a/block/Kconfig b/block/Kconfig
index 290b219..ac0956f 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -24,21 +24,17 @@ menuconfig BLOCK
 if BLOCK
 
 config LBD
-	bool "Support for Large Block Devices"
+	bool "Support for large block devices and files"
 	depends on !64BIT
 	help
-	  Enable block devices of size 2TB and larger.
+	  Enable block devices or files of size 2TB and larger.
 
 	  This option is required to support the full capacity of large
 	  (2TB+) block devices, including RAID, disk, Network Block
 	  Device, Logical Volume Manager (LVM) and loopback.
-
-	  For example, RAID devices are frequently bigger than the capacity
-	  of the largest individual hard drive.
-
-	  This option is not required if you have individual disk drives
-	  which total 2TB+ and you are not aggregating the capacity into
-	  a large block device (e.g. using RAID or LVM).
+
+	  This option also enables support for single files larger than
+	  2TB.
 
 	  If unsure, say N.
 
@@ -58,15 +54,6 @@ config BLK_DEV_IO_TRACE
 
 	  If unsure, say N.
 
-config LSF
-	bool "Support for Large Single Files"
-	depends on !64BIT
-	help
-	  Say Y here if you want to be able to handle very large files (2TB
-	  and larger), otherwise say N.
-
-	  If unsure, say Y.
-
 config BLK_DEV_BSG
 	bool "Block layer SG support v4 (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e4a241c..04158ad 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1721,7 +1721,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
 	/* small i_blocks in vfs inode? */
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * CONFIG_LSF is not enabled implies the inode
+		 * CONFIG_LBD is not enabled implies the inode
 		 * i_block represent total blocks in 512 bytes
 		 * 32 == size of vfs inode i_blocks * 8
 		 */
@@ -1764,7 +1764,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * !has_huge_files or CONFIG_LSF is not enabled
+		 * !has_huge_files or CONFIG_LBD is not enabled
 		 * implies the inode i_block represent total blocks in
 		 * 512 bytes 32 == size of vfs inode i_blocks * 8
 		 */
@@ -2021,13 +2021,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (has_huge_files) {
 		/*
 		 * Large file size enabled file system can only be
-		 * mount if kernel is build with CONFIG_LSF
+		 * mount if kernel is build with CONFIG_LBD
 		 */
 		if (sizeof(root->i_blocks) < sizeof(u64) &&
 		    !(sb->s_flags & MS_RDONLY)) {
 			printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
 					"files cannot be mounted read-write "
-					"without CONFIG_LSF.\n", sb->s_id);
+					"without CONFIG_LBD.\n", sb->s_id);
 			goto failed_mount;
 		}
 	}
diff --git a/include/linux/types.h b/include/linux/types.h
index 1d98330..121f349 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -135,19 +135,14 @@ typedef __s64 int64_t;
  *
  * Linux always considers sectors to be 512 bytes long independently
  * of the devices real block size.
+ *
+ * blkcnt_t is the type of the inode's block count.
  */
 #ifdef CONFIG_LBD
 typedef u64 sector_t;
-#else
-typedef unsigned long sector_t;
-#endif
-
-/*
- * The type of the inode's block count.
- */
-#ifdef CONFIG_LSF
 typedef u64 blkcnt_t;
 #else
+typedef unsigned long sector_t;
 typedef unsigned long blkcnt_t;
 #endif
-- cgit v0.10.2

From 10e5b64415ae9a0ba81dd1f1036d14dfdae9308c Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Fri, 12 Dec 2008 09:52:35 +0100
Subject: alpha: remove dead BIO_VMERGE_BOUNDARY

The block layer dropped the virtual merge feature
(b8b3e16cfe6435d961f6aaebcfd52a1ff2a988c5), so the
BIO_VMERGE_BOUNDARY definition is meaningless now.

Signed-off-by: FUJITA Tomonori
Cc: Richard Henderson
Cc: Ivan Kokshaysky
Signed-off-by: Jens Axboe

diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h
index e971ab0..eda9b90 100644
--- a/arch/alpha/include/asm/io.h
+++ b/arch/alpha/include/asm/io.h
@@ -96,9 +96,6 @@ static inline dma_addr_t __deprecated isa_page_to_bus(struct page *page)
 	return page_to_phys(page);
 }
 
-/* This depends on working iommu. */
-#define BIO_VMERGE_BOUNDARY	(alpha_mv.mv_pci_tbi ? PAGE_SIZE : 0)
-
 /* Maximum PIO space address supported? */
 #define IO_SPACE_LIMIT 0xffff
-- cgit v0.10.2

From 14f2793958bd7878db2dc0309ed4525d6b7f1b72 Mon Sep 17 00:00:00 2001
From: Milan Broz
Date: Fri, 12 Dec 2008 14:48:27 +0100
Subject: loop: Flush possible running bios when loop device is released.

When there are still queued bios and the reference count drops to
zero, the loop device must flush all queued bios. Otherwise the caller
can close the device while some bios are still running, and the later
endio() calls oops when they use a mempool that has already been torn
down.
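The fix below reuses the backing-store switch machinery as a flush
barrier: a switch request carrying no new file is queued behind all
pending bios and waited on, so by the time it completes no earlier bio
can still be in flight. In outline (condensed from the loop.c hunks
further down; all names come from the patch):

	/* flush queued bios, but keep the loop thread running */
	static int loop_flush(struct loop_device *lo)
	{
		/* not yet configured: no thread, nothing to flush */
		if (!lo->lo_thread)
			return 0;

		/* a NULL file turns the switch into a pure flush */
		return loop_switch(lo, NULL);
	}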
The oops happens, for example, when running dm-crypt over loop; here
is a typical backtrace:

Oops: 0000 [#1] PREEMPT SMP
EIP is at mempool_free+0x12/0x6b
...
 crypt_dec_pending+0x50/0x54 [dm_crypt]
 crypt_endio+0x9f/0xa7 [dm_crypt]
 crypt_endio+0x0/0xa7 [dm_crypt]
 bio_endio+0x2b/0x2e
 loop_thread+0x37a/0x3b1
 do_lo_send_aops+0x0/0x165
 autoremove_wake_function+0x0/0x33
 loop_thread+0x0/0x3b1
 kthread+0x3b/0x61
 kthread+0x0/0x61
 kernel_thread_helper+0x7/0x10

(But the crash is reproducible with different dm targets running over
a loop device, too.)

The patch fixes it by flushing the bios in the release call, reusing
the flush mechanism for switching the backing store.

Signed-off-by: Milan Broz
Signed-off-by: Jens Axboe

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index fb06ed6..66cfff7 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -624,20 +624,38 @@ static int loop_switch(struct loop_device *lo, struct file *file)
 }
 
 /*
+ * Helper to flush the IOs in loop, but keeping loop thread running
+ */
+static int loop_flush(struct loop_device *lo)
+{
+	/* loop not yet configured, no running thread, nothing to flush */
+	if (!lo->lo_thread)
+		return 0;
+
+	return loop_switch(lo, NULL);
+}
+
+/*
  * Do the actual switch; called from the BIO completion routine
  */
 static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
 {
 	struct file *file = p->file;
 	struct file *old_file = lo->lo_backing_file;
-	struct address_space *mapping = file->f_mapping;
+	struct address_space *mapping;
+
+	/* if no new file, only flush of queued bios requested */
+	if (!file)
+		goto out;
 
+	mapping = file->f_mapping;
 	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
 	lo->lo_backing_file = file;
 	lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
 		mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
 	lo->old_gfp_mask = mapping_gfp_mask(mapping);
 	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+out:
 	complete(&p->wait);
 }
 
@@ -1345,11 +1363,25 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
 	struct loop_device *lo = disk->private_data;
 
 	mutex_lock(&lo->lo_ctl_mutex);
-	--lo->lo_refcnt;
 
-	if ((lo->lo_flags & LO_FLAGS_AUTOCLEAR) && !lo->lo_refcnt)
+	if (--lo->lo_refcnt)
+		goto out;
+
+	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
+		/*
+		 * In autoclear mode, stop the loop thread
+		 * and remove configuration after last close.
+		 */
 		loop_clr_fd(lo, NULL);
+	} else {
+		/*
+		 * Otherwise keep thread (if running) and config,
+		 * but flush possible ongoing bios in thread.
+		 */
+		loop_flush(lo);
+	}
 
+out:
 	mutex_unlock(&lo->lo_ctl_mutex);
 	return 0;
-- cgit v0.10.2

From 8ae30b895805a6e2bb725b1d78b12daabd7eadfe Mon Sep 17 00:00:00 2001
From: Milan Broz
Date: Fri, 12 Dec 2008 14:50:49 +0100
Subject: loop: Do not call loop_unplug for not configured loop device.

loop_unplug() expects that the mapping is set and that
lo->lo_backing_file is not NULL. Unfortunately loop_set_fd() sets the
request queue's unplug function, but loop_clr_fd() doesn't clear it.

A non-configured loop device can still be opened in some situations.
If the request queue's unplug function is then called, the loop module
oopses because lo_backing_file is missing.

Simple reproducer:

  losetup /dev/loop0 /xxx
  losetup -d /dev/loop0
  dmsetup create x --table "0 1 linear /dev/loop0 0"

EIP is at loop_unplug+0x1d/0x3b
...
Call Trace:
 blk_unplug+0x57/0x5e
 dm_table_unplug_all+0x34/0x77 [dm_mod]
 destroy_inode+0x27/0x38
 generic_delete_inode+0xd5/0xd9
 iput+0x4b/0x4e
 dm_resume+0xca/0xfe [dm_mod]
 dev_suspend+0x143/0x165 [dm_mod]
 dm_ctl_ioctl+0x18e/0x1cf [dm_mod]
 dev_suspend+0x0/0x165 [dm_mod]
 dm_ctl_ioctl+0x0/0x1cf [dm_mod]
 vfs_ioctl+0x22/0x69
 do_vfs_ioctl+0x39d/0x3c7
 trace_hardirqs_on+0xb/0xd
 remove_vma+0x50/0x56
 do_munmap+0x21c/0x237
 sys_ioctl+0x2c/0x45
 sysenter_do_call+0x12/0x31

There are several reports here:
http://www.kerneloops.org/search.php?search=loop_unplug

Fix it by simply clearing the unplug function together with the
removal of the backing file.

Signed-off-by: Milan Broz
Signed-off-by: Jens Axboe

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 66cfff7..edbaac6 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -919,6 +919,7 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 
 	kthread_stop(lo->lo_thread);
 
+	lo->lo_queue->unplug_fn = NULL;
 	lo->lo_backing_file = NULL;
 
 	loop_release_xfer(lo);
-- cgit v0.10.2

From 62c1fe9d9f0a676fce89185b1513f0e5f473c72c Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 15 Dec 2008 21:19:25 +0100
Subject: cfq-iosched: fix race between exiting queue and exiting task

Original patch from Nikanth Karthikesan

When a queue exits, the queue lock is taken and cfq_exit_queue() frees
all the cics associated with the queue. But when a task exits,
cfq_exit_io_context() gets the cics one by one and then locks the
associated queue to call __cfq_exit_single_io_context(). Between
getting a cic from the ioc and locking the queue, the queue might have
exited on another CPU.

Fix this by rechecking the cfq_io_context queue key inside the queue
lock again, and not calling into __cfq_exit_single_io_context() if
somebody beat us to it.

Signed-off-by: Jens Axboe

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ee8a90c..e8525fa 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1314,7 +1314,15 @@ static void cfq_exit_single_io_context(struct io_context *ioc,
 		unsigned long flags;
 
 		spin_lock_irqsave(q->queue_lock, flags);
-		__cfq_exit_single_io_context(cfqd, cic);
+
+		/*
+		 * Ensure we get a fresh copy of the ->key to prevent
+		 * race between exiting task and queue
+		 */
+		smp_read_barrier_depends();
+		if (cic->key)
+			__cfq_exit_single_io_context(cfqd, cic);
+
 		spin_unlock_irqrestore(q->queue_lock, flags);
 	}
 }
-- cgit v0.10.2

From a0ea8622918019ed76c0b85f5d0247809ba05a7c Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron"
Date: Thu, 18 Dec 2008 14:55:51 +0100
Subject: cciss: simplify parameters to deregister_disk function

Simplify the parameters to the deregister_disk() function.

Signed-off-by: Stephen M. Cameron
Signed-off-by: Jens Axboe

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 4e07b5c..01e6938 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -164,7 +164,7 @@ static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 static int cciss_revalidate(struct gendisk *disk);
 static int rebuild_lun_table(ctlr_info_t *h, int first_time);
-static int deregister_disk(struct gendisk *disk, drive_info_struct *drv,
+static int deregister_disk(ctlr_info_t *h, int drv_index,
 			   int clear_all);
 
 static void cciss_read_capacity(int ctlr, int logvol, int withirq,
@@ -1479,8 +1479,7 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time)
 		 * which keeps the interrupt handler from starting
 		 * the queue.
 		 */
-		ret = deregister_disk(h->gendisk[drv_index],
-				      &h->drv[drv_index], 0);
+		ret = deregister_disk(h, drv_index, 0);
 		h->drv[drv_index].busy_configuring = 0;
 	}
 
@@ -1698,8 +1697,7 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time)
 			spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
 			h->drv[i].busy_configuring = 1;
 			spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
-			return_code = deregister_disk(h->gendisk[i],
-						      &h->drv[i], 1);
+			return_code = deregister_disk(h, i, 1);
 			h->drv[i].busy_configuring = 0;
 		}
 	}
@@ -1769,15 +1767,19 @@ mem_msg:
  * the highest_lun should be left unchanged and the LunID
  * should not be cleared.
  */
-static int deregister_disk(struct gendisk *disk, drive_info_struct *drv,
+static int deregister_disk(ctlr_info_t *h, int drv_index,
 			   int clear_all)
 {
 	int i;
-	ctlr_info_t *h = get_host(disk);
+	struct gendisk *disk;
+	drive_info_struct *drv;
 
 	if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
 
+	drv = &h->drv[drv_index];
+	disk = h->gendisk[drv_index];
+
 	/* make sure logical volume is NOT is use */
 	if (clear_all || (h->gendisk[0] == disk)) {
 		if (drv->usage_count > 1)
-- cgit v0.10.2

From f735b5eeb9fcbb001e0cf2a5296b19c4bbaec55f Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 23 Dec 2008 12:44:19 +0100
Subject: bounce: don't rely on a zeroed bio_vec list

__blk_queue_bounce() relies on a zeroed bio_vec list, since it looks
up arbitrary indexes in the allocated bio. The block layer only
guarantees that added entries are valid, so clear memory after alloc.
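The reason the zeroing matters: the bounce code fills bio_vec slots by
direct index rather than through bio_add_page(), so slots it never
fills must already read as zero. A condensed sketch of that pattern
(taken from the hunk below; the follow-up patch removes the old
clearing that the allocator used to do):

	if (!bio) {
		unsigned int cnt = (*bio_orig)->bi_vcnt;

		bio = bio_alloc(GFP_NOIO, cnt);
		/* don't rely on the allocator handing back zeroed vecs */
		memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec));
	}

	to = bio->bi_io_vec + i;	/* indexed directly, not via bio_add_page() */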
Signed-off-by: Jens Axboe

diff --git a/mm/bounce.c b/mm/bounce.c
index bf0cf7c..e590272 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -198,8 +198,13 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 		/*
 		 * irk, bounce it
 		 */
-		if (!bio)
-			bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
+		if (!bio) {
+			unsigned int cnt = (*bio_orig)->bi_vcnt;
+
+			bio = bio_alloc(GFP_NOIO, cnt);
+			memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec));
+		}
 
 		to = bio->bi_io_vec + i;
-- cgit v0.10.2

From d3f761104b097738932afcc310fbbbbfb007ef92 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 23 Dec 2008 12:46:21 +0100
Subject: bio: get rid of bio_vec clearing

We don't need to clear the memory used for adding bio_vec entries,
since nobody should be looking at uninitialized members. Any valid use
should be below bio->bi_vcnt, and members up to that count must be
valid, since they were added through bio_add_page().

Signed-off-by: Jens Axboe

diff --git a/fs/bio.c b/fs/bio.c
index 75e6be1..711cee1 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -180,7 +180,7 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
 	 * kzalloc() for the exact number of vecs right away.
 	 */
 	if (!bs)
-		bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
+		bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask);
 
 	/*
 	 * see comment near bvec_array define!
@@ -237,9 +237,6 @@ fallback:
 		}
 	}
 
-	if (bvl)
-		memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
-
 	return bvl;
 }
 
@@ -325,7 +322,6 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 		idx = 0;
 		bvl = bio->bi_inline_vecs;
 		nr_iovecs = BIO_INLINE_VECS;
-		memset(bvl, 0, BIO_INLINE_VECS * sizeof(*bvl));
 	} else {
 		bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
-- cgit v0.10.2