From 48fec5d0a504dfbb302cb1dd24ebb0b82a46cce9 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 1 Jul 2015 16:27:46 +0800
Subject: ceph: EIO all operations after forced umount

This patch makes try_get_cap_refs() and __do_request() check
if the file system was forced umount, and return -EIO if it was.
This patch also adds a helper function to drops dirty caps and
wakes up blocking operation.

Signed-off-by: Yan, Zheng <zyan@redhat.com>

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 890c509..1594f2c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -717,7 +717,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
 	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
 		pr_warn("writepage_start %p on forced umount\n", inode);
 		return -EIO; /* we're in a forced umount, don't write! */
 	}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ddd5e94..27b5668 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2413,6 +2413,14 @@ again:
 			goto out_unlock;
 		}
 
+		if (!__ceph_is_any_caps(ci) &&
+		    ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+			dout("get_cap_refs %p forced umount\n", inode);
+			*err = -EIO;
+			ret = 1;
+			goto out_unlock;
+		}
+
 		dout("get_cap_refs %p have %s needed %s\n", inode,
 		     ceph_cap_string(have), ceph_cap_string(need));
 	}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6aa07af..ddc471c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2107,7 +2107,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	msg = create_request_message(mdsc, req, mds, drop_cap_releases);
 	if (IS_ERR(msg)) {
 		req->r_err = PTR_ERR(msg);
-		complete_request(mdsc, req);
 		return PTR_ERR(msg);
 	}
 	req->r_request = msg;
@@ -2135,7 +2134,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
 {
 	struct ceph_mds_session *session = NULL;
 	int mds = -1;
-	int err = -EAGAIN;
+	int err = 0;
 
 	if (req->r_err || req->r_got_result) {
 		if (req->r_aborted)
@@ -2149,6 +2148,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
 		err = -EIO;
 		goto finish;
 	}
+	if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+		dout("do_request forced umount\n");
+		err = -EIO;
+		goto finish;
+	}
 
 	put_request_session(req);
 
@@ -2196,13 +2200,15 @@ static int __do_request(struct ceph_mds_client *mdsc,
 
 out_session:
 	ceph_put_mds_session(session);
+finish:
+	if (err) {
+		dout("__do_request early error %d\n", err);
+		req->r_err = err;
+		complete_request(mdsc, req);
+		__unregister_request(mdsc, req);
+	}
 out:
 	return err;
-
-finish:
-	req->r_err = err;
-	complete_request(mdsc, req);
-	goto out;
 }
 
 /*
@@ -2289,8 +2295,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 
 	if (req->r_err) {
 		err = req->r_err;
-		__unregister_request(mdsc, req);
-		dout("do_request early error %d\n", err);
 		goto out;
 	}
 
@@ -3555,7 +3559,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
 	u64 want_tid, want_flush, want_snap;
 
-	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+	if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
 		return;
 
 	dout("sync\n");
@@ -3584,7 +3588,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
  */
 static bool done_closing_sessions(struct ceph_mds_client *mdsc)
 {
-	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+	if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
 		return true;
 	return atomic_read(&mdsc->num_sessions) == 0;
 }
@@ -3643,6 +3647,34 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	dout("stopped\n");
 }
 
+void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
+{
+	struct ceph_mds_session *session;
+	int mds;
+
+	dout("force umount\n");
+
+	mutex_lock(&mdsc->mutex);
+	for (mds = 0; mds < mdsc->max_sessions; mds++) {
+		session = __ceph_lookup_mds_session(mdsc, mds);
+		if (!session)
+			continue;
+		mutex_unlock(&mdsc->mutex);
+		mutex_lock(&session->s_mutex);
+		__close_session(mdsc, session);
+		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
+			cleanup_session_requests(mdsc, session);
+			remove_session_caps(session);
+		}
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+		mutex_lock(&mdsc->mutex);
+		kick_requests(mdsc, mds);
+	}
+	__wake_requests(mdsc, &mdsc->waiting_for_map);
+	mutex_unlock(&mdsc->mutex);
+}
+
 static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
 	dout("stop\n");
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 762757e..f575eaf 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -366,6 +366,7 @@ extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
 
 extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
 extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d1c833c..69e6e19 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -708,6 +708,7 @@ static void ceph_umount_begin(struct super_block *sb)
 	if (!fsc)
 		return;
 	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+	ceph_mdsc_force_umount(fsc->mdsc);
 	return;
 }
 
-- 
cgit v0.10.2


From a341d4df87487ae68189e0be869c39a2b0cb9aaa Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 1 Jul 2015 17:03:23 +0800
Subject: ceph: invalidate dirty pages after forced umount

After forced umount, ceph_writepages_start() skips flushing dirty
pages. To make sure inode's reference count get dropped to zero,
we need to invalidate dirty pages.

Signed-off-by: Yan, Zheng <zyan@redhat.com>

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1594f2c..9893335 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -719,6 +719,8 @@ static int ceph_writepages_start(struct address_space *mapping,
 
 	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
 		pr_warn("writepage_start %p on forced umount\n", inode);
+		truncate_pagecache(inode, 0);
+		mapping_set_error(mapping, -EIO);
 		return -EIO; /* we're in a forced umount, don't write! */
 	}
 	if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
-- 
cgit v0.10.2


From b79b23682a1649f30960fb5bd920ba46c89a1b14 Mon Sep 17 00:00:00 2001
From: Nicholas Krause <xerofoify@gmail.com>
Date: Sun, 5 Jul 2015 06:34:05 +0000
Subject: libceph: remove the unused macro AES_KEY_SIZE

This removes the no longer used macro AES_KEY_SIZE as no functions use
this macro anymore and thus this macro can be removed due it no longer
being required.

Signed-off-by: Nicholas Krause <xerofoify@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 790fe89..4440edc 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -79,10 +79,6 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
 	return 0;
 }
 
-
-
-#define AES_KEY_SIZE 16
-
 static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
 {
 	return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
-- 
cgit v0.10.2


From d920ff6fc7c1ec3d7bd80432bff5575c0ebe426c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Canet?= <benoit.canet@nodalink.com>
Date: Thu, 25 Jun 2015 21:02:57 +0200
Subject: libceph: Avoid holding the zero page on ceph_msgr_slab_init errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ceph_msgr_slab_init may fail due to a temporary ENOMEM.

Delay a bit the initialization of zero_page in ceph_msgr_init and
reorder its cleanup in _ceph_msgr_exit so it's done in reverse
order of setup.

BUG_ON() will not suffer to be postponed in case it is triggered.

Signed-off-by: Benoît Canet <benoit.canet@nodalink.com>
Reviewed-by: Alex Elder <elder@linaro.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index e3be1d2..0f9ea60 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -276,22 +276,22 @@ static void _ceph_msgr_exit(void)
 		ceph_msgr_wq = NULL;
 	}
 
-	ceph_msgr_slab_exit();
-
 	BUG_ON(zero_page == NULL);
 	page_cache_release(zero_page);
 	zero_page = NULL;
+
+	ceph_msgr_slab_exit();
 }
 
 int ceph_msgr_init(void)
 {
+	if (ceph_msgr_slab_init())
+		return -ENOMEM;
+
 	BUG_ON(zero_page != NULL);
 	zero_page = ZERO_PAGE(0);
 	page_cache_get(zero_page);
 
-	if (ceph_msgr_slab_init())
-		return -ENOMEM;
-
 	/*
 	 * The number of active work items is limited by the number of
 	 * connections, so leave @max_active at default.
-- 
cgit v0.10.2


From 6893162215d7bf08a4273247ec1fc7dedee5135c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 3 Jul 2015 15:44:41 +0300
Subject: libceph: rename con_work() to ceph_con_workfn()

Even though it's static, con_work(), being a work func, shows up in
various stacktraces a lot.  Prefix it with ceph_.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 0f9ea60..101ab62 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -176,7 +176,7 @@ static struct lock_class_key socket_class;
 
 static void queue_con(struct ceph_connection *con);
 static void cancel_con(struct ceph_connection *con);
-static void con_work(struct work_struct *);
+static void ceph_con_workfn(struct work_struct *);
 static void con_fault(struct ceph_connection *con);
 
 /*
@@ -749,7 +749,7 @@ void ceph_con_init(struct ceph_connection *con, void *private,
 	mutex_init(&con->mutex);
 	INIT_LIST_HEAD(&con->out_queue);
 	INIT_LIST_HEAD(&con->out_sent);
-	INIT_DELAYED_WORK(&con->work, con_work);
+	INIT_DELAYED_WORK(&con->work, ceph_con_workfn);
 
 	con->state = CON_STATE_CLOSED;
 }
@@ -2799,7 +2799,7 @@ static void con_fault_finish(struct ceph_connection *con)
 /*
  * Do some work on a connection.  Drop a connection ref when we're done.
  */
-static void con_work(struct work_struct *work)
+static void ceph_con_workfn(struct work_struct *work)
 {
 	struct ceph_connection *con = container_of(work, struct ceph_connection,
 						   work.work);
-- 
cgit v0.10.2


From 23078637e05460428f803be7d0f46908df8a970a Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 20 Jul 2015 10:14:06 +0800
Subject: ceph: fix queuing inode to mdsdir's snaprealm

During MDS failovers, MClientSnap message may cause kclient to move
some inodes from root directory's snaprealm to mdsdir's snaprealm
and queue snapshots for these inodes. For a FS has never created any
snapshot, both root directory's snaprealm and mdsdir's snaprealm
share the same snapshot contexts (both are ceph_empty_snapc). This
confuses ceph_put_wrbuffer_cap_refs(), make it unable to distinguish
snapshot buffers from head buffers.

The fix is do not use ceph_empty_snapc as snaprealm's cached context.

Signed-off-by: Yan, Zheng <zyan@redhat.com>

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 233d906a..4aa7122 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -338,12 +338,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 		return 0;
 	}
 
-	if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
-		ceph_get_snap_context(ceph_empty_snapc);
-		snapc = ceph_empty_snapc;
-		goto done;
-	}
-
 	/* alloc new snap context */
 	err = -ENOMEM;
 	if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
@@ -381,7 +375,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 	     realm->ino, realm, snapc, snapc->seq,
 	     (unsigned int) snapc->num_snaps);
 
-done:
 	ceph_put_snap_context(realm->cached_context);
 	realm->cached_context = snapc;
 	return 0;
-- 
cgit v0.10.2


From 1550d34e5626a20a2e12c73bdc1e6e217a0ba897 Mon Sep 17 00:00:00 2001
From: Brad Hubbard <bhubbard@redhat.com>
Date: Tue, 18 Aug 2015 10:18:53 +0800
Subject: ceph: remove redundant test of head->safe and silence static analysis
 warnings

Signed-off-by: Brad Hubbard <bhubbard@redhat.com>
Signed-off-by: Yan, Zheng <zyan@redhat.com>

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ddc471c..d4eaa84 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2415,7 +2415,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 		mutex_unlock(&mdsc->mutex);
 		goto out;
 	}
-	if (req->r_got_safe && !head->safe) {
+	if (req->r_got_safe) {
 		pr_warn("got unsafe after safe on %llu from mds%d\n",
 			   tid, mds);
 		mutex_unlock(&mdsc->mutex);
-- 
cgit v0.10.2


From a43137f7b0f1467cf3005b6ff6574d978642d247 Mon Sep 17 00:00:00 2001
From: Jianpeng Ma <jianpeng.ma@intel.com>
Date: Tue, 18 Aug 2015 10:23:50 +0800
Subject: ceph: remove the useless judgement

err != 0 is already handled. So skip this.

Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
Signed-off-by: Yan, Zheng <zyan@redhat.com>

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8b79d87..2ebcbd4 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -279,7 +279,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	if (err)
 		goto out_req;
 
-	if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+	if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
 		err = ceph_handle_notrace_create(dir, dentry);
 
 	if (d_unhashed(dentry)) {
-- 
cgit v0.10.2


From e36d571d70c7f46b20c28d81025fd5fc044a8e22 Mon Sep 17 00:00:00 2001
From: Jianpeng Ma <jianpeng.ma@intel.com>
Date: Tue, 18 Aug 2015 10:25:35 +0800
Subject: ceph: no need to get parent inode in ceph_open

parent inode is needed in creating new inode case.  For ceph_open,
the target inode already exists.

Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
Signed-off-by: Yan, Zheng <zyan@redhat.com>

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2ebcbd4..90ec110 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -136,7 +136,6 @@ int ceph_open(struct inode *inode, struct file *file)
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_file_info *cf = file->private_data;
-	struct inode *parent_inode = NULL;
 	int err;
 	int flags, fmode, wanted;
 
@@ -210,10 +209,7 @@ int ceph_open(struct inode *inode, struct file *file)
 	ihold(inode);
 
 	req->r_num_caps = 1;
-	if (flags & O_CREAT)
-		parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry);
-	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
-	iput(parent_inode);
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
 	if (!err)
 		err = ceph_init_file(inode, file, req->r_fmode);
 	ceph_mdsc_put_request(req);
-- 
cgit v0.10.2


From 5fdb1389e1399d6801a8c5d10952ef4153039fb2 Mon Sep 17 00:00:00 2001
From: Jianpeng Ma <jianpeng.ma@intel.com>
Date: Tue, 18 Aug 2015 10:30:38 +0800
Subject: ceph: cleanup use of ceph_msg_get

Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
Signed-off-by: Yan, Zheng <zyan@redhat.com>

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d4eaa84..51cb02d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2524,8 +2524,7 @@ out_err:
 		if (err) {
 			req->r_err = err;
 		} else {
-			req->r_reply = msg;
-			ceph_msg_get(msg);
+			req->r_reply =  ceph_msg_get(msg);
 			req->r_got_result = true;
 		}
 	} else {
-- 
cgit v0.10.2


From 6dd74e44dc1df85f125982a8d6591bc4a76c9f5d Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Fri, 28 Aug 2015 17:59:35 +0800
Subject: libceph: set 'exists' flag for newly up osd

Signed-off-by: Yan, Zheng <zyan@redhat.com>
Reviewed-by: Sage Weil <sage@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 4a31258..7d8f581 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1300,7 +1300,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		ceph_decode_addr(&addr);
 		pr_info("osd%d up\n", osd);
 		BUG_ON(osd >= map->max_osd);
-		map->osd_state[osd] |= CEPH_OSD_UP;
+		map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
 		map->osd_addr[osd] = addr;
 	}
 
-- 
cgit v0.10.2


From 3ebe138ac642a195c7f2efdb918f464734421fd6 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 31 Aug 2015 15:21:39 +0300
Subject: rbd: fix double free on rbd_dev->header_name

If rbd_dev_image_probe() in rbd_dev_probe_parent() fails, header_name
is freed twice: once in rbd_dev_probe_parent() and then in its caller
rbd_dev_image_probe() (rbd_dev_image_probe() is called recursively to
handle parent images).

rbd_dev_probe_parent() is responsible for probing the parent, so it
shouldn't muck with clone's fields.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bc67a93..324bf35 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -5201,7 +5201,6 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
 out_err:
 	if (parent) {
 		rbd_dev_unparent(rbd_dev);
-		kfree(rbd_dev->header_name);
 		rbd_dev_destroy(parent);
 	} else {
 		rbd_put_client(rbdc);
-- 
cgit v0.10.2


From d194cd1dd1be61249b08e5461ae8a9c05d1072c9 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 31 Aug 2015 18:22:10 +0300
Subject: rbd: plug rbd_dev->header.object_prefix memory leak

Need to free object_prefix when rbd_dev_v2_snap_context() fails, but
only if this is the first time we are reading in the header.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 324bf35..69d03aa 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4720,7 +4720,10 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
 	}
 
 	ret = rbd_dev_v2_snap_context(rbd_dev);
-	dout("rbd_dev_v2_snap_context returned %d\n", ret);
+	if (ret && first_time) {
+		kfree(rbd_dev->header.object_prefix);
+		rbd_dev->header.object_prefix = NULL;
+	}
 
 	return ret;
 }
-- 
cgit v0.10.2


From 8b9558aab853e98ba6e3fee0dd8545544966958c Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Tue, 1 Sep 2015 17:19:38 +0800
Subject: libceph: use keepalive2 to verify the mon session is alive

Signed-off-by: Yan, Zheng <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 9ebee53..397c5cd 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -46,6 +46,7 @@ struct ceph_options {
 	unsigned long mount_timeout;		/* jiffies */
 	unsigned long osd_idle_ttl;		/* jiffies */
 	unsigned long osd_keepalive_timeout;	/* jiffies */
+	unsigned long monc_ping_timeout;	/* jiffies */
 
 	/*
 	 * any type that can't be simply compared or doesn't need need
@@ -66,6 +67,7 @@ struct ceph_options {
 #define CEPH_MOUNT_TIMEOUT_DEFAULT	msecs_to_jiffies(60 * 1000)
 #define CEPH_OSD_KEEPALIVE_DEFAULT	msecs_to_jiffies(5 * 1000)
 #define CEPH_OSD_IDLE_TTL_DEFAULT	msecs_to_jiffies(60 * 1000)
+#define CEPH_MONC_PING_TIMEOUT_DEFAULT	msecs_to_jiffies(30 * 1000)
 
 #define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
 #define CEPH_MSG_MAX_MIDDLE_LEN	(16*1024*1024)
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 3775327..7e1252e 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -248,6 +248,8 @@ struct ceph_connection {
 	int in_base_pos;     /* bytes read */
 	__le64 in_temp_ack;  /* for reading an ack */
 
+	struct timespec last_keepalive_ack;
+
 	struct delayed_work work;	    /* send|recv work */
 	unsigned long       delay;          /* current delay interval */
 };
@@ -285,6 +287,8 @@ extern void ceph_msg_revoke(struct ceph_msg *msg);
 extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
 
 extern void ceph_con_keepalive(struct ceph_connection *con);
+extern bool ceph_con_keepalive_expired(struct ceph_connection *con,
+				       unsigned long interval);
 
 extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
 				size_t length, size_t alignment);
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
index 1c18872..0fe2656 100644
--- a/include/linux/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
@@ -84,10 +84,12 @@ struct ceph_entity_inst {
 #define CEPH_MSGR_TAG_MSG           7  /* message */
 #define CEPH_MSGR_TAG_ACK           8  /* message ack */
 #define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
-#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
+#define CEPH_MSGR_TAG_BADPROTOVER   10 /* bad protocol version */
 #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
 #define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
 #define CEPH_MSGR_TAG_SEQ           13 /* 64-bit int follows with seen seq number */
+#define CEPH_MSGR_TAG_KEEPALIVE2    14 /* keepalive2 byte + ceph_timespec */
+#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */
 
 
 /*
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index f30329f..3f56eef 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -357,6 +357,7 @@ ceph_parse_options(char *options, const char *dev_name,
 	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
 	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
 	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+	opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT;
 
 	/* get mon ip(s) */
 	/* ip1[:port1][,ip2[:port2]...] */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 101ab62..36757d4 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -163,6 +163,7 @@ static struct kmem_cache	*ceph_msg_data_cache;
 static char tag_msg = CEPH_MSGR_TAG_MSG;
 static char tag_ack = CEPH_MSGR_TAG_ACK;
 static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
 
 #ifdef CONFIG_LOCKDEP
 static struct lock_class_key socket_class;
@@ -1351,7 +1352,15 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 {
 	dout("prepare_write_keepalive %p\n", con);
 	con_out_kvec_reset(con);
-	con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
+	if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
+		struct timespec ts = CURRENT_TIME;
+		struct ceph_timespec ceph_ts;
+		ceph_encode_timespec(&ceph_ts, &ts);
+		con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
+		con_out_kvec_add(con, sizeof(ceph_ts), &ceph_ts);
+	} else {
+		con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
+	}
 	con_flag_set(con, CON_FLAG_WRITE_PENDING);
 }
 
@@ -1625,6 +1634,12 @@ static void prepare_read_tag(struct ceph_connection *con)
 	con->in_tag = CEPH_MSGR_TAG_READY;
 }
 
+static void prepare_read_keepalive_ack(struct ceph_connection *con)
+{
+	dout("prepare_read_keepalive_ack %p\n", con);
+	con->in_base_pos = 0;
+}
+
 /*
  * Prepare to read a message.
  */
@@ -2457,6 +2472,17 @@ static void process_message(struct ceph_connection *con)
 	mutex_lock(&con->mutex);
 }
 
+static int read_keepalive_ack(struct ceph_connection *con)
+{
+	struct ceph_timespec ceph_ts;
+	size_t size = sizeof(ceph_ts);
+	int ret = read_partial(con, size, size, &ceph_ts);
+	if (ret <= 0)
+		return ret;
+	ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts);
+	prepare_read_tag(con);
+	return 1;
+}
 
 /*
  * Write something to the socket.  Called in a worker thread when the
@@ -2526,6 +2552,10 @@ more_kvec:
 
 do_next:
 	if (con->state == CON_STATE_OPEN) {
+		if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
+			prepare_write_keepalive(con);
+			goto more;
+		}
 		/* is anything else pending? */
 		if (!list_empty(&con->out_queue)) {
 			prepare_write_message(con);
@@ -2535,10 +2565,6 @@ do_next:
 			prepare_write_ack(con);
 			goto more;
 		}
-		if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
-			prepare_write_keepalive(con);
-			goto more;
-		}
 	}
 
 	/* Nothing to do! */
@@ -2641,6 +2667,9 @@ more:
 		case CEPH_MSGR_TAG_ACK:
 			prepare_read_ack(con);
 			break;
+		case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+			prepare_read_keepalive_ack(con);
+			break;
 		case CEPH_MSGR_TAG_CLOSE:
 			con_close_socket(con);
 			con->state = CON_STATE_CLOSED;
@@ -2684,6 +2713,12 @@ more:
 		process_ack(con);
 		goto more;
 	}
+	if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+		ret = read_keepalive_ack(con);
+		if (ret <= 0)
+			goto out;
+		goto more;
+	}
 
 out:
 	dout("try_read done on %p ret %d\n", con, ret);
@@ -3101,6 +3136,20 @@ void ceph_con_keepalive(struct ceph_connection *con)
 }
 EXPORT_SYMBOL(ceph_con_keepalive);
 
+bool ceph_con_keepalive_expired(struct ceph_connection *con,
+			       unsigned long interval)
+{
+	if (interval > 0 &&
+	    (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+		struct timespec now = CURRENT_TIME;
+		struct timespec ts;
+		jiffies_to_timespec(interval, &ts);
+		ts = timespec_add(con->last_keepalive_ack, ts);
+		return timespec_compare(&now, &ts) >= 0;
+	}
+	return false;
+}
+
 static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
 {
 	struct ceph_msg_data *data;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 9d6ff12..edda016 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -149,6 +149,10 @@ static int __open_session(struct ceph_mon_client *monc)
 			      CEPH_ENTITY_TYPE_MON, monc->cur_mon,
 			      &monc->monmap->mon_inst[monc->cur_mon].addr);
 
+		/* send an initial keepalive to ensure our timestamp is
+		 * valid by the time we are in an OPENED state */
+		ceph_con_keepalive(&monc->con);
+
 		/* initiatiate authentication handshake */
 		ret = ceph_auth_build_hello(monc->auth,
 					    monc->m_auth->front.iov_base,
@@ -170,14 +174,19 @@ static bool __sub_expired(struct ceph_mon_client *monc)
  */
 static void __schedule_delayed(struct ceph_mon_client *monc)
 {
-	unsigned int delay;
+	struct ceph_options *opt = monc->client->options;
+	unsigned long delay;
 
-	if (monc->cur_mon < 0 || __sub_expired(monc))
+	if (monc->cur_mon < 0 || __sub_expired(monc)) {
 		delay = 10 * HZ;
-	else
+	} else {
 		delay = 20 * HZ;
-	dout("__schedule_delayed after %u\n", delay);
-	schedule_delayed_work(&monc->delayed_work, delay);
+		if (opt->monc_ping_timeout > 0)
+			delay = min(delay, opt->monc_ping_timeout / 3);
+	}
+	dout("__schedule_delayed after %lu\n", delay);
+	schedule_delayed_work(&monc->delayed_work,
+			      round_jiffies_relative(delay));
 }
 
 /*
@@ -743,11 +752,23 @@ static void delayed_work(struct work_struct *work)
 		__close_session(monc);
 		__open_session(monc);  /* continue hunting */
 	} else {
-		ceph_con_keepalive(&monc->con);
+		struct ceph_options *opt = monc->client->options;
+		int is_auth = ceph_auth_is_authenticated(monc->auth);
+		if (ceph_con_keepalive_expired(&monc->con,
+					       opt->monc_ping_timeout)) {
+			dout("monc keepalive timeout\n");
+			is_auth = 0;
+			__close_session(monc);
+			monc->hunting = true;
+			__open_session(monc);
+		}
 
-		__validate_auth(monc);
+		if (!monc->hunting) {
+			ceph_con_keepalive(&monc->con);
+			__validate_auth(monc);
+		}
 
-		if (ceph_auth_is_authenticated(monc->auth))
+		if (is_auth)
 			__send_subscribe(monc);
 	}
 	__schedule_delayed(monc);
-- 
cgit v0.10.2


From d15f9d694b77fe5e4ea12b3031ecaa13b5aa2b10 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 2 Sep 2015 11:37:09 +0300
Subject: libceph: check data_len in ->alloc_msg()

Only ->alloc_msg() should check data_len of the incoming message
against the preallocated ceph_msg, doing it in the messenger is not
right.  The contract is that either ->alloc_msg() returns a ceph_msg
which will fit all of the portions of the incoming message, or it
returns NULL and possibly sets skip, signaling whether NULL is due to
an -ENOMEM.  ->alloc_msg() should be the only place where we make the
skip/no-skip decision.

I stumbled upon this while looking at con/osd ref counting.  Right now,
if we get a non-extent message with a larger data portion than we are
prepared for, ->alloc_msg() returns a ceph_msg, and then, when we skip
it in the messenger, we don't put the con/osd ref acquired in
ceph_con_in_msg_alloc() (which is normally put in process_message()),
so this also fixes a memory leak.

An existing BUG_ON in ceph_msg_data_cursor_init() ensures we don't
corrupt random memory should a buggy ->alloc_msg() return an unfit
ceph_msg.

While at it, I changed the "unknown tid" dout() to a pr_warn() to make
sure all skips are seen and unified format strings.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 36757d4..525f454 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2337,13 +2337,6 @@ static int read_partial_message(struct ceph_connection *con)
 			return ret;
 
 		BUG_ON(!con->in_msg ^ skip);
-		if (con->in_msg && data_len > con->in_msg->data_length) {
-			pr_warn("%s skipping long message (%u > %zd)\n",
-				__func__, data_len, con->in_msg->data_length);
-			ceph_msg_put(con->in_msg);
-			con->in_msg = NULL;
-			skip = 1;
-		}
 		if (skip) {
 			/* skip this message */
 			dout("alloc_msg said skip message\n");
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 5003367..80b94e3 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2817,8 +2817,9 @@ out:
 }
 
 /*
- * lookup and return message for incoming reply.  set up reply message
- * pages.
+ * Lookup and return message for incoming reply.  Don't try to do
+ * anything about a larger than preallocated data portion of the
+ * message at the moment - for now, just skip the message.
  */
 static struct ceph_msg *get_reply(struct ceph_connection *con,
 				  struct ceph_msg_header *hdr,
@@ -2836,10 +2837,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	mutex_lock(&osdc->request_mutex);
 	req = __lookup_request(osdc, tid);
 	if (!req) {
-		*skip = 1;
+		pr_warn("%s osd%d tid %llu unknown, skipping\n",
+			__func__, osd->o_osd, tid);
 		m = NULL;
-		dout("get_reply unknown tid %llu from osd%d\n", tid,
-		     osd->o_osd);
+		*skip = 1;
 		goto out;
 	}
 
@@ -2849,10 +2850,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	ceph_msg_revoke_incoming(req->r_reply);
 
 	if (front_len > req->r_reply->front_alloc_len) {
-		pr_warn("get_reply front %d > preallocated %d (%u#%llu)\n",
-			front_len, req->r_reply->front_alloc_len,
-			(unsigned int)con->peer_name.type,
-			le64_to_cpu(con->peer_name.num));
+		pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
+			__func__, osd->o_osd, req->r_tid, front_len,
+			req->r_reply->front_alloc_len);
 		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
 				 false);
 		if (!m)
@@ -2860,37 +2860,22 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 		ceph_msg_put(req->r_reply);
 		req->r_reply = m;
 	}
-	m = ceph_msg_get(req->r_reply);
-
-	if (data_len > 0) {
-		struct ceph_osd_data *osd_data;
 
-		/*
-		 * XXX This is assuming there is only one op containing
-		 * XXX page data.  Probably OK for reads, but this
-		 * XXX ought to be done more generally.
-		 */
-		osd_data = osd_req_op_extent_osd_data(req, 0);
-		if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
-			if (osd_data->pages &&
-				unlikely(osd_data->length < data_len)) {
-
-				pr_warn("tid %lld reply has %d bytes we had only %llu bytes ready\n",
-					tid, data_len, osd_data->length);
-				*skip = 1;
-				ceph_msg_put(m);
-				m = NULL;
-				goto out;
-			}
-		}
+	if (data_len > req->r_reply->data_length) {
+		pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
+			__func__, osd->o_osd, req->r_tid, data_len,
+			req->r_reply->data_length);
+		m = NULL;
+		*skip = 1;
+		goto out;
 	}
-	*skip = 0;
+
+	m = ceph_msg_get(req->r_reply);
 	dout("get_reply tid %lld %p\n", tid, m);
 
 out:
 	mutex_unlock(&osdc->request_mutex);
 	return m;
-
 }
 
 static struct ceph_msg *alloc_msg(struct ceph_connection *con,
-- 
cgit v0.10.2


From 55b0b31cbc09f80db384671e22cdc94b2aa26b29 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 7 Sep 2015 11:35:01 +0800
Subject: ceph: get inode size for each append write

Signed-off-by: Yan, Zheng <zyan@redhat.com>

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 90ec110..0c62868 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -952,6 +952,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
 
+	if (iocb->ki_flags & IOCB_APPEND) {
+		err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+		if (err < 0)
+			goto out;
+	}
+
 	err = generic_write_checks(iocb, from);
 	if (err <= 0)
 		goto out;
-- 
cgit v0.10.2


From 438386853d4c0c48fe73bf05a7d61c70ca5a3bfb Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 7 Sep 2015 15:46:24 +0800
Subject: ceph: improve readahead for file holes

When readahead encounters file holes, osd reply returns error -ENOENT,
finish_read() skips adding pages to the the page cache. So readahead
does not work for file holes. The fix is adding zero pages to the
page cache when -ENOENT is returned.

Signed-off-by: Yan, Zheng <zyan@redhat.com>

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9893335..6471e28 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 	for (i = 0; i < num_pages; i++) {
 		struct page *page = osd_data->pages[i];
 
-		if (rc < 0)
+		if (rc < 0 && rc != ENOENT)
 			goto unlock;
 		if (bytes < (int)PAGE_CACHE_SIZE) {
 			/* zero (remainder of) page */
-- 
cgit v0.10.2