From eb60fa1066622ddb2278732cf61e0c4544e82c6f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 10 Nov 2008 15:28:59 +0900
Subject: block: fix add_partition() error path

Partition stats structure was not freed on devt allocation failure
path.  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 633f7a0..90bcf136 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -395,7 +395,7 @@ int add_partition(struct gendisk *disk, int partno,
 
 	err = blk_alloc_devt(p, &devt);
 	if (err)
-		goto out_free;
+		goto out_free_stats;
 	pdev->devt = devt;
 
 	/* delay uevent until 'holders' subdir is created */
@@ -426,6 +426,8 @@ int add_partition(struct gendisk *disk, int partno,
 
 	return 0;
 
+out_free_stats:
+	free_part_stats(p);
 out_free:
 	kfree(p);
 	return err;
-- 
cgit v0.10.2


From ba32929a91fe2c0628f5be62d1597b379c8d3062 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 10 Nov 2008 15:29:58 +0900
Subject: block: make add_partition() return pointer to hd_struct

Make add_partition() return pointer to the new hd_struct on success
and ERR_PTR() value on failure.  This change will be used to fix md
autodetection bug.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/ioctl.c b/block/ioctl.c
index c832d63..d03985b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -18,7 +18,6 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	struct disk_part_iter piter;
 	long long start, length;
 	int partno;
-	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -61,10 +60,10 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			disk_part_iter_exit(&piter);
 
 			/* all seems OK */
-			err = add_partition(disk, partno, start, length,
-					    ADDPART_FLAG_NONE);
+			part = add_partition(disk, partno, start, length,
+					     ADDPART_FLAG_NONE);
 			mutex_unlock(&bdev->bd_mutex);
-			return err;
+			return IS_ERR(part) ? PTR_ERR(part) : 0;
 		case BLKPG_DEL_PARTITION:
 			part = disk_get_part(disk, partno);
 			if (!part)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 90bcf136..6330253 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -348,8 +348,8 @@ static ssize_t whole_disk_show(struct device *dev,
 static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
 		   whole_disk_show, NULL);
 
-int add_partition(struct gendisk *disk, int partno,
-		  sector_t start, sector_t len, int flags)
+struct hd_struct *add_partition(struct gendisk *disk, int partno,
+				sector_t start, sector_t len, int flags)
 {
 	struct hd_struct *p;
 	dev_t devt = MKDEV(0, 0);
@@ -361,15 +361,15 @@ int add_partition(struct gendisk *disk, int partno,
 
 	err = disk_expand_part_tbl(disk, partno);
 	if (err)
-		return err;
+		return ERR_PTR(err);
 	ptbl = disk->part_tbl;
 
 	if (ptbl->part[partno])
-		return -EBUSY;
+		return ERR_PTR(-EBUSY);
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
-		return -ENOMEM;
+		return ERR_PTR(-EBUSY);
 
 	if (!init_part_stats(p)) {
 		err = -ENOMEM;
@@ -424,20 +424,20 @@ int add_partition(struct gendisk *disk, int partno,
 	if (!ddev->uevent_suppress)
 		kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
-	return 0;
+	return p;
 
 out_free_stats:
 	free_part_stats(p);
 out_free:
 	kfree(p);
-	return err;
+	return ERR_PTR(err);
 out_del:
 	kobject_put(p->holder_dir);
 	device_del(pdev);
 out_put:
 	put_device(pdev);
 	blk_free_devt(devt);
-	return err;
+	return ERR_PTR(err);
 }
 
 /* Not exported, helper to add_disk(). */
@@ -568,10 +568,11 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 			       disk->disk_name, p, (unsigned long long) size);
 			size = get_capacity(disk) - from;
 		}
-		res = add_partition(disk, p, from, size, state->parts[p].flags);
-		if (res) {
-			printk(KERN_ERR " %s: p%d could not be added: %d\n",
-				disk->disk_name, p, -res);
+		part = add_partition(disk, p, from, size,
+				     state->parts[p].flags);
+		if (IS_ERR(part)) {
+			printk(KERN_ERR " %s: p%d could not be added: %ld\n",
+			       disk->disk_name, p, -PTR_ERR(part));
 			continue;
 		}
 #ifdef CONFIG_BLK_DEV_MD
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index e439e6a..3df7742 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -522,7 +522,9 @@ extern char *disk_name (struct gendisk *hd, int partno, char *buf);
 
 extern int disk_expand_part_tbl(struct gendisk *disk, int target);
 extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
-extern int __must_check add_partition(struct gendisk *, int, sector_t, sector_t, int);
+extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
+						     int partno, sector_t start,
+						     sector_t len, int flags);
 extern void delete_partition(struct gendisk *, int);
 extern void printk_all_partitions(void);
 
-- 
cgit v0.10.2


From 55e8e30c382d25c34f8aafcc78efec948571a941 Mon Sep 17 00:00:00 2001
From: Tejun Heo <teheo@suse.de>
Date: Mon, 10 Nov 2008 15:30:47 +0900
Subject: block/md: fix md autodetection

Block ext devt conversion missed md_autodetect_dev() call in
rescan_partitions() leaving md autodetect unable to see partitions.
Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6330253..6d5b213 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -577,7 +577,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 		}
 #ifdef CONFIG_BLK_DEV_MD
 		if (state->parts[p].flags & ADDPART_FLAG_RAID)
-			md_autodetect_dev(bdev->bd_dev+p);
+			md_autodetect_dev(part_to_dev(part)->devt);
 #endif
 	}
 	kfree(state);
-- 
cgit v0.10.2


From 561ec68e4de7947167937c49c451728e6b19e63b Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Fri, 14 Nov 2008 08:26:30 +0100
Subject: block: fix boot failure with CONFIG_DEBUG_BLOCK_EXT_DEVT=y and nash

We run into system boot failure with kernel 2.6.28-rc. We found it on a
couple of machines, including T61 notebook, nehalem machine, and another
HPC NX6325 notebook.  All the machines use FedoraCore 8 or FedoraCore 9.
With kernel prior to 2.6.28-rc, system boot doesn't fail.

I debug it and locate the root cause. Pls. see
http://bugzilla.kernel.org/show_bug.cgi?id=11899
https://bugzilla.redhat.com/show_bug.cgi?id=471517

As a matter of fact, there are 2 bugs.

1)root=/dev/sda1, system boot randomly fails. Mostly, boot for 5 times
and fails once. nash has a bug. Some of its functions misuse return
value 0.  Sometimes, 0 means timeout and no uevent available. Sometimes,
0 means nash gets an uevent, but the uevent isn't block-related (for
exmaple, usb). If by coincidence, kernel tells nash that uevents are
available, but kernel also set timeout, nash might stops collecting
other uevents in queue if current uevent isn't block-related.  I work
out a patch for nash to fix it.
http://bugzilla.kernel.org/attachment.cgi?id=18858

2) root=LABEL=/, system always can't boot. initrd init reports
switchroot fails. Here is an executation branch of nash when booting:
    (1) nash read /sys/block/sda/dev; Assume major is 8 (on my desktop)
    (2) nash query /proc/devices with the major number; It found line
	"8 sd";
    (3) nash use 'sd' to search its own probe table to find device (DISK)
	type for the device and add it to its own list;
    (4) Later on, it probes all devices in its list to get filesystem
	labels; scsi register "8 sd" always.

When major is 259, nash fails to find the device(DISK) type. I enables
CONFIG_DEBUG_BLOCK_EXT_DEVT=y when compiling kernel, so 259 is picked up
for device /dev/sda1, which causes nash to fail to find device (DISK)
type.

To fixing issue 2), I create a patch for nash and another patch for
kernel.

http://bugzilla.kernel.org/attachment.cgi?id=18859
http://bugzilla.kernel.org/attachment.cgi?id=18837

Below is the patch for kernel 2.6.28-rc4. It registers blkext, a new
block device in proc/devices.

With 2 patches on nash and 1 patch on kernel, I boot my machines for
dozens of times without failure.

Signed-off-by Zhang Yanmin <yanmin.zhang@linux.intel.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/genhd.c b/block/genhd.c
index 4e5e749..27549e4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -768,6 +768,8 @@ static int __init genhd_device_init(void)
 	bdev_map = kobj_map_init(base_probe, &block_class_lock);
 	blk_dev_init();
 
+	register_blkdev(BLOCK_EXT_MAJOR, "blkext");
+
 #ifndef CONFIG_SYSFS_DEPRECATED
 	/* create top-level block dir */
 	block_depr = kobject_create_and_add("block", NULL);
-- 
cgit v0.10.2


From 68aee07f9bad2c830a898cf6d6bfc11ea24efc40 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 14 Nov 2008 09:44:33 +0100
Subject: Release old elevator on change elevator

We should release old elevator when change to use a new one.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index b220c68..2d19f0c 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -338,12 +338,18 @@ wait:
 static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 {
 	struct request_queue *rq;
+	elevator_t *old_e;
 
 	rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
 	if (rq == NULL)
 		return -1;
 
-	elevator_init(rq, "noop");
+	old_e = rq->elevator;
+	if (IS_ERR_VALUE(elevator_init(rq, "noop")))
+		printk(KERN_WARNING
+			"blkfront: Switch elevator failed, use default\n");
+	else
+		elevator_exit(old_e);
 
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
 	blk_queue_hardsect_size(rq, sector_size);
-- 
cgit v0.10.2


From 98ba4031ab2adc8b394295e68aa4c8fe9d5060db Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 14 Nov 2008 10:44:59 +0100
Subject: relay: fix cpu offline problem

relay_open() will close allocated buffers when failed.
but if cpu offlined, some buffer will not be closed.
this patch fixed it.

and did cleanup for relay_reset() too.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/kernel/relay.c b/kernel/relay.c
index 8d13a78..32b0bef 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -400,7 +400,7 @@ void relay_reset(struct rchan *chan)
 	}
 
 	mutex_lock(&relay_channels_mutex);
-	for_each_online_cpu(i)
+	for_each_possible_cpu(i)
 		if (chan->buf[i])
 			__relay_reset(chan->buf[i], 0);
 	mutex_unlock(&relay_channels_mutex);
@@ -611,10 +611,9 @@ struct rchan *relay_open(const char *base_filename,
 	return chan;
 
 free_bufs:
-	for_each_online_cpu(i) {
-		if (!chan->buf[i])
-			break;
-		relay_close_buf(chan->buf[i]);
+	for_each_possible_cpu(i) {
+		if (chan->buf[i])
+			relay_close_buf(chan->buf[i]);
 	}
 
 	kref_put(&chan->kref, relay_destroy_channel);
-- 
cgit v0.10.2


From c26156b2534c75bb3cdedf76f6ad1340971cf5bd Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 18 Nov 2008 15:07:05 +0100
Subject: block: hold extra reference to bio in blk_rq_map_user_iov()

If the size passed in is OK but we end up mapping too many segments,
we call the unmap path directly like from IO completion. But from IO
completion we have an extra reference to the bio, so this error case
goes OOPS when it attempts to free and already free bio.

Fix it by getting an extra reference to the bio before calling the
unmap failure case.

Reported-by: Petr Vandrovec <vandrove@vc.cvut.cz>

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-map.c b/block/blk-map.c
index 4849fa3..0f4b4b8 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -217,6 +217,12 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		return PTR_ERR(bio);
 
 	if (bio->bi_size != len) {
+		/*
+		 * Grab an extra reference to this bio, as bio_unmap_user()
+		 * expects to be able to drop it twice as it happens on the
+		 * normal IO completion path
+		 */
+		bio_get(bio);
 		bio_endio(bio, 0);
 		bio_unmap_user(bio);
 		return -EINVAL;
-- 
cgit v0.10.2