From dd935f44a40f8fb02aff2cc0df2269c92422df1c Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@inktank.com>
Date: Wed, 28 Aug 2013 21:43:09 -0700
Subject: libceph: add function to ensure notifies are complete

Without a way to flush the osd client's notify workqueue, a watch
event that is unregistered could continue receiving callbacks
indefinitely.

Unregistering the event simply means no new notifies are added to the
queue, but there may still be events in the queue that will call the
watch callback for the event. If the queue is flushed after the event
is unregistered, the caller can be sure no more watch callbacks will
occur for the canceled watch.

Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@linaro.org>

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index ce6df39..8f47625 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -335,6 +335,8 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 				  struct ceph_osd_request *req);
 extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
 
+extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
+
 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 			       struct ceph_vino vino,
 			       struct ceph_file_layout *layout,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 1606f74..2b4b32a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2216,6 +2216,17 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
 EXPORT_SYMBOL(ceph_osdc_sync);
 
 /*
+ * Call all pending notify callbacks - for use after a watch is
+ * unregistered, to make sure no more callbacks for it will be invoked
+ */
+extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
+{
+	flush_workqueue(osdc->notify_wq);
+}
+EXPORT_SYMBOL(ceph_osdc_flush_notifies);
+
+
+/*
  * init, shutdown
  */
 int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
-- 
cgit v0.10.2


From 9abc59908e0c5f983aaa91150da32d5b62cf60b7 Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@inktank.com>
Date: Thu, 29 Aug 2013 17:31:03 -0700
Subject: rbd: complete notifies before cleaning up osd_client and rbd_dev

To ensure rbd_dev is not used after it's released, flush all pending
notify callbacks before calling rbd_dev_image_release(). No new
notifies can be added to the queue at this point because the watch has
already be unregistered with the osd_client.

Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
Reviewed-by: Alex Elder <elder@linaro.org>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index fef3687..bf89e34 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -5163,6 +5163,13 @@ static ssize_t rbd_remove(struct bus_type *bus,
 	ret = rbd_dev_header_watch_sync(rbd_dev, false);
 	if (ret)
 		rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
+
+	/*
+	 * flush remaining watch callbacks - these must be complete
+	 * before the osd_client is shutdown
+	 */
+	dout("%s: flushing notifies", __func__);
+	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
 	rbd_dev_image_release(rbd_dev);
 	module_put(THIS_MODULE);
 
-- 
cgit v0.10.2


From 20e0af67ce88c657d0601977b9941a2256afbdaa Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@inktank.com>
Date: Thu, 29 Aug 2013 17:36:03 -0700
Subject: rbd: make rbd_obj_notify_ack() synchronous

The only user of rbd_obj_notify_ack() is rbd_watch_cb(). It used
asynchronously with no tracking of when the notify ack completes, so
it may still be in progress when the osd_client is shut down.  This
results in a BUG() since the osd client assumes no requests are in
flight when it stops. Since all notifies are flushed before the
osd_client is stopped, waiting for the notify ack to complete before
returning from the watch callback ensures there are no notify acks in
flight during shutdown.

Rename rbd_obj_notify_ack() to rbd_obj_notify_ack_sync() to reflect
its new synchronous nature.

Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
Reviewed-by: Alex Elder <elder@linaro.org>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bf89e34..9e5f07f 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2808,7 +2808,7 @@ out_err:
 	obj_request_done_set(obj_request);
 }
 
-static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
+static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
 {
 	struct rbd_obj_request *obj_request;
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
@@ -2823,16 +2823,17 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
 	if (!obj_request->osd_req)
 		goto out;
-	obj_request->callback = rbd_obj_request_put;
 
 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
 					notify_id, 0, 0);
 	rbd_osd_req_format_read(obj_request);
 
 	ret = rbd_obj_request_submit(osdc, obj_request);
-out:
 	if (ret)
-		rbd_obj_request_put(obj_request);
+		goto out;
+	ret = rbd_obj_request_wait(obj_request);
+out:
+	rbd_obj_request_put(obj_request);
 
 	return ret;
 }
@@ -2852,7 +2853,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	if (ret)
 		rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
 
-	rbd_obj_notify_ack(rbd_dev, notify_id);
+	rbd_obj_notify_ack_sync(rbd_dev, notify_id);
 }
 
 /*
-- 
cgit v0.10.2


From 9875201e10496612080e7d164acc8f625c18725c Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@inktank.com>
Date: Thu, 29 Aug 2013 17:26:31 -0700
Subject: rbd: fix use-after free of rbd_dev->disk

Removing a device deallocates the disk, unschedules the watch, and
finally cleans up the rbd_dev structure. rbd_dev_refresh(), called
from the watch callback, updates the disk size and rbd_dev
structure. With no locking between them, rbd_dev_refresh() may use the
device or rbd_dev after they've been freed.

To fix this, check whether RBD_DEV_FLAG_REMOVING is set before
updating the disk size in rbd_dev_refresh(). In order to prevent a
race where rbd_dev_refresh() is already revalidating the disk when
rbd_remove() is called, move the call to rbd_bus_del_dev() after the
watch is unregistered and all notifies are complete. It's safe to
defer deleting this structure because no new requests can be submitted
once the RBD_DEV_FLAG_REMOVING is set, since the device cannot be
opened.

Fixes: http://tracker.ceph.com/issues/5636
Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
Reviewed-by: Alex Elder <elder@linaro.org>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9e5f07f..47c6f9c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3325,6 +3325,31 @@ static void rbd_exists_validate(struct rbd_device *rbd_dev)
 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 }
 
+static void rbd_dev_update_size(struct rbd_device *rbd_dev)
+{
+	sector_t size;
+	bool removing;
+
+	/*
+	 * Don't hold the lock while doing disk operations,
+	 * or lock ordering will conflict with the bdev mutex via:
+	 * rbd_add() -> blkdev_get() -> rbd_open()
+	 */
+	spin_lock_irq(&rbd_dev->lock);
+	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
+	spin_unlock_irq(&rbd_dev->lock);
+	/*
+	 * If the device is being removed, rbd_dev->disk has
+	 * been destroyed, so don't try to update its size
+	 */
+	if (!removing) {
+		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
+		dout("setting size to %llu sectors", (unsigned long long)size);
+		set_capacity(rbd_dev->disk, size);
+		revalidate_disk(rbd_dev->disk);
+	}
+}
+
 static int rbd_dev_refresh(struct rbd_device *rbd_dev)
 {
 	u64 mapping_size;
@@ -3344,12 +3369,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
 	up_write(&rbd_dev->header_rwsem);
 
 	if (mapping_size != rbd_dev->mapping.size) {
-		sector_t size;
-
-		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
-		dout("setting size to %llu sectors", (unsigned long long)size);
-		set_capacity(rbd_dev->disk, size);
-		revalidate_disk(rbd_dev->disk);
+		rbd_dev_update_size(rbd_dev);
 	}
 
 	return ret;
@@ -5160,7 +5180,6 @@ static ssize_t rbd_remove(struct bus_type *bus,
 	if (ret < 0 || already)
 		return ret;
 
-	rbd_bus_del_dev(rbd_dev);
 	ret = rbd_dev_header_watch_sync(rbd_dev, false);
 	if (ret)
 		rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
@@ -5171,6 +5190,13 @@ static ssize_t rbd_remove(struct bus_type *bus,
 	 */
 	dout("%s: flushing notifies", __func__);
 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
+	/*
+	 * Don't free anything from rbd_dev->disk until after all
+	 * notifies are completely processed. Otherwise
+	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
+	 * in a potential use after free of rbd_dev->disk or rbd_dev.
+	 */
+	rbd_bus_del_dev(rbd_dev);
 	rbd_dev_image_release(rbd_dev);
 	module_put(THIS_MODULE);
 
-- 
cgit v0.10.2


From efadc98aab674153709cc357ba565f04e3164fcd Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@inktank.com>
Date: Thu, 29 Aug 2013 19:16:42 -0700
Subject: rbd: ignore unmapped snapshots that no longer exist

This prevents erroring out while adding a device when a snapshot
unrelated to the current mapping is deleted between reading the
snapshot context and reading the snapshot names. If the mapped
snapshot name is not found an error still occurs as usual.

Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
Reviewed-by: Alex Elder <elder@linaro.org>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 47c6f9c..626a713 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4078,8 +4078,13 @@ static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
 
 		snap_id = snapc->snaps[which];
 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
-		if (IS_ERR(snap_name))
-			break;
+		if (IS_ERR(snap_name)) {
+			/* ignore no-longer existing snapshots */
+			if (PTR_ERR(snap_name) == -ENOENT)
+				continue;
+			else
+				break;
+		}
 		found = !strcmp(name, snap_name);
 		kfree(snap_name);
 	}
-- 
cgit v0.10.2


From da6a6b63978d45f9ae582d1f362f182012da3a22 Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@inktank.com>
Date: Wed, 4 Sep 2013 17:57:31 -0700
Subject: rbd: fix error handling from rbd_snap_name()

rbd_snap_name() calls rbd_dev_v{1,2}_snap_name() depending on the
format of the image. The format 1 version returns NULL on error, which
is handled by the caller. The format 2 version returns an ERR_PTR,
which the caller of rbd_snap_name() does not expect.

Fortunately this is unlikely to occur in practice because
rbd_snap_id_by_name() is called before rbd_snap_name(). This would hit
similar errors to rbd_snap_name() (like the snapshot not existing) and
return early, so rbd_snap_name() would not hit an error unless the
snapshot was removed between the two calls or memory was exhausted.

Use an ERR_PTR in rbd_dev_v1_snap_name() so that the specific error
can be propagated, and it is consistent with rbd_dev_v2_snap_name().
Handle the ERR_PTR in the only rbd_snap_name() caller.

Suggested-by: Alex Elder <alex.elder@linaro.org>
Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
Reviewed-by: Alex Elder <elder@linaro.org>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 626a713..2f00778 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -927,12 +927,14 @@ static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
 					u64 snap_id)
 {
 	u32 which;
+	const char *snap_name;
 
 	which = rbd_dev_snap_index(rbd_dev, snap_id);
 	if (which == BAD_SNAP_INDEX)
-		return NULL;
+		return ERR_PTR(-ENOENT);
 
-	return _rbd_dev_v1_snap_name(rbd_dev, which);
+	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
+	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
 }
 
 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
@@ -4163,8 +4165,8 @@ static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
 	/* Look up the snapshot name, and make a copy */
 
 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
-	if (!snap_name) {
-		ret = -ENOMEM;
+	if (IS_ERR(snap_name)) {
+		ret = PTR_ERR(snap_name);
 		goto out_err;
 	}
 
-- 
cgit v0.10.2


From 9c89d62948c4740e379a7e0085dd8d7c1561f53f Mon Sep 17 00:00:00 2001
From: Milosz Tanski <milosz@adfin.com>
Date: Mon, 9 Sep 2013 14:28:57 -0400
Subject: fscache: check consistency does not decrement refcount

__fscache_check_consistency() does not decrement the count of operations
active after it finishes in the success case. This leads to a hung tasks on
cookie de-registration (commonly in inode eviction).

INFO: task kworker/1:2:4214 blocked for more than 120 seconds.
kworker/1:2     D ffff880443513fc0     0  4214      2 0x00000000
Workqueue: ceph-msgr con_work [libceph]
  ...
Call Trace:
 [<ffffffff81569fc6>] ? _raw_spin_unlock_irqrestore+0x16/0x20
 [<ffffffffa0016570>] ? fscache_wait_bit_interruptible+0x30/0x30 [fscache]
 [<ffffffff81568d09>] schedule+0x29/0x70
 [<ffffffffa001657e>] fscache_wait_atomic_t+0xe/0x20 [fscache]
 [<ffffffff815665cf>] out_of_line_wait_on_atomic_t+0x9f/0xe0
 [<ffffffff81083560>] ? autoremove_wake_function+0x40/0x40
 [<ffffffffa0015a9c>] __fscache_relinquish_cookie+0x15c/0x310 [fscache]
 [<ffffffffa00a4fae>] ceph_fscache_unregister_inode_cookie+0x3e/0x50 [ceph]
 [<ffffffffa007e373>] ceph_destroy_inode+0x33/0x200 [ceph]
 [<ffffffff811c13ae>] ? __fsnotify_inode_delete+0xe/0x10
 [<ffffffff8119ba1c>] destroy_inode+0x3c/0x70
 [<ffffffff8119bb69>] evict+0x119/0x1b0

Signed-off-by: Milosz Tanski <milosz@adfin.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Sage Weil <sage@inktank.com>

diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 318e843..b2a86e3 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -586,7 +586,8 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 
 	fscache_operation_init(op, NULL, NULL);
 	op->flags = FSCACHE_OP_MYTHREAD |
-		(1 << FSCACHE_OP_WAITING);
+		(1 << FSCACHE_OP_WAITING) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
 
 	spin_lock(&cookie->lock);
 
-- 
cgit v0.10.2