From d1c338a509cea5378df59629ad47382810c38623 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Sun, 19 Aug 2012 12:29:16 -0700
Subject: libceph: delay debugfs initialization until we learn global_id

The debugfs directory includes the cluster fsid and our unique global_id.
We need to delay the initialization of the debug entry until we have
learned both the fsid and our global_id from the monitor or else the
second client can't create its debugfs entry and will fail (and multiple
client instances aren't properly reflected in debugfs).

Reported by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fb962ef..6d59006 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -201,6 +201,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 	int err = -ENOMEM;
 
 	dout("ceph_fs_debugfs_init\n");
+	BUG_ON(!fsc->client->debugfs_dir);
 	fsc->debugfs_congestion_kb =
 		debugfs_create_file("writeback_congestion_kb",
 				    0600,
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 69e38db..a802029 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -84,7 +84,6 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
 			return -1;
 		}
 	} else {
-		pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
 		memcpy(&client->fsid, fsid, sizeof(*fsid));
 	}
 	return 0;
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 54b531a..38b5dc1 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -189,6 +189,9 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 	snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
 		 client->monc.auth->global_id);
 
+	dout("ceph_debugfs_client_init %p %s\n", client, name);
+
+	BUG_ON(client->debugfs_dir);
 	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
 	if (!client->debugfs_dir)
 		goto out;
@@ -234,6 +237,7 @@ out:
 
 void ceph_debugfs_client_cleanup(struct ceph_client *client)
 {
+	dout("ceph_debugfs_client_cleanup %p\n", client);
 	debugfs_remove(client->debugfs_osdmap);
 	debugfs_remove(client->debugfs_monmap);
 	debugfs_remove(client->osdc.debugfs_file);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 105d533..900ea0f 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -311,6 +311,17 @@ int ceph_monc_open_session(struct ceph_mon_client *monc)
 EXPORT_SYMBOL(ceph_monc_open_session);
 
 /*
+ * We require the fsid and global_id in order to initialize our
+ * debugfs dir.
+ */
+static bool have_debugfs_info(struct ceph_mon_client *monc)
+{
+	dout("have_debugfs_info fsid %d globalid %lld\n",
+	     (int)monc->client->have_fsid, monc->auth->global_id);
+	return monc->client->have_fsid && monc->auth->global_id > 0;
+}
+
+/*
  * The monitor responds with mount ack indicate mount success.  The
  * included client ticket allows the client to talk to MDSs and OSDs.
  */
@@ -320,9 +331,12 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 	struct ceph_client *client = monc->client;
 	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
 	void *p, *end;
+	int had_debugfs_info, init_debugfs = 0;
 
 	mutex_lock(&monc->mutex);
 
+	had_debugfs_info = have_debugfs_info(monc);
+
 	dout("handle_monmap\n");
 	p = msg->front.iov_base;
 	end = p + msg->front.iov_len;
@@ -344,12 +358,22 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 
 	if (!client->have_fsid) {
 		client->have_fsid = true;
+		if (!had_debugfs_info && have_debugfs_info(monc)) {
+			pr_info("client%lld fsid %pU\n",
+				ceph_client_id(monc->client),
+				&monc->client->fsid);
+			init_debugfs = 1;
+		}
 		mutex_unlock(&monc->mutex);
-		/*
-		 * do debugfs initialization without mutex to avoid
-		 * creating a locking dependency
-		 */
-		ceph_debugfs_client_init(client);
+
+		if (init_debugfs) {
+			/*
+			 * do debugfs initialization without mutex to avoid
+			 * creating a locking dependency
+			 */
+			ceph_debugfs_client_init(monc->client);
+		}
+
 		goto out_unlocked;
 	}
 out:
@@ -865,8 +889,10 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 {
 	int ret;
 	int was_auth = 0;
+	int had_debugfs_info, init_debugfs = 0;
 
 	mutex_lock(&monc->mutex);
+	had_debugfs_info = have_debugfs_info(monc);
 	if (monc->auth->ops)
 		was_auth = monc->auth->ops->is_authenticated(monc->auth);
 	monc->pending_auth = 0;
@@ -889,7 +915,22 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 		__send_subscribe(monc);
 		__resend_generic_request(monc);
 	}
+
+	if (!had_debugfs_info && have_debugfs_info(monc)) {
+		pr_info("client%lld fsid %pU\n",
+			ceph_client_id(monc->client),
+			&monc->client->fsid);
+		init_debugfs = 1;
+	}
 	mutex_unlock(&monc->mutex);
+
+	if (init_debugfs) {
+		/*
+		 * do debugfs initialization without mutex to avoid
+		 * creating a locking dependency
+		 */
+		ceph_debugfs_client_init(monc->client);
+	}
 }
 
 static int __validate_auth(struct ceph_mon_client *monc)
-- 
cgit v0.10.2


From 6c5e50fa614fea5325a2973be06f7ec6f1055316 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Tue, 21 Aug 2012 15:55:25 -0700
Subject: ceph: tolerate (and warn on) extraneous dentry from mds

If the MDS gives us a dentry and we weren't prepared to handle it,
WARN_ON_ONCE instead of crashing.

Reported-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9fff9f3..4b5762e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -992,11 +992,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	if (rinfo->head->is_dentry) {
 		struct inode *dir = req->r_locked_dir;
 
-		err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
-				 session, req->r_request_started, -1,
-				 &req->r_caps_reservation);
-		if (err < 0)
-			return err;
+		if (dir) {
+			err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
+					 session, req->r_request_started, -1,
+					 &req->r_caps_reservation);
+			if (err < 0)
+				return err;
+		} else {
+			WARN_ON_ONCE(1);
+		}
 	}
 
 	/*
@@ -1004,6 +1008,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	 * will have trouble splicing in the virtual snapdir later
 	 */
 	if (rinfo->head->is_dentry && !req->r_aborted &&
+	    req->r_locked_dir &&
 	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
 					       fsc->mount_options->snapdir_name,
 					       req->r_dentry->d_name.len))) {
-- 
cgit v0.10.2


From 6d4221b53707486dfad3f5bfe568d2ce7f4c9863 Mon Sep 17 00:00:00 2001
From: Jim Schutt <jaschut@sandia.gov>
Date: Fri, 10 Aug 2012 10:37:38 -0700
Subject: libceph: avoid truncation due to racing banners

Because the Ceph client messenger uses a non-blocking connect, it is
possible for the sending of the client banner to race with the
arrival of the banner sent by the peer.

When ceph_sock_state_change() notices the connect has completed, it
schedules work to process the socket via con_work().  During this
time the peer is writing its banner, and arrival of the peer banner
races with con_work().

If con_work() calls try_read() before the peer banner arrives, there
is nothing for it to do, after which con_work() calls try_write() to
send the client's banner.  In this case Ceph's protocol negotiation
can complete succesfully.

The server-side messenger immediately sends its banner and addresses
after accepting a connect request, *before* actually attempting to
read or verify the banner from the client.  As a result, it is
possible for the banner from the server to arrive before con_work()
calls try_read().  If that happens, try_read() will read the banner
and prepare protocol negotiation info via prepare_write_connect().
prepare_write_connect() calls con_out_kvec_reset(), which discards
the as-yet-unsent client banner.  Next, con_work() calls
try_write(), which sends the protocol negotiation info rather than
the banner that the peer is expecting.

The result is that the peer sees an invalid banner, and the client
reports "negotiation failed".

Fix this by moving con_out_kvec_reset() out of
prepare_write_connect() to its callers at all locations except the
one where the banner might still need to be sent.

[elder@inktak.com: added note about server-side behavior]

Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Alex Elder <elder@inktank.com>

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index b979675..24c5eea 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -915,7 +915,6 @@ static int prepare_write_connect(struct ceph_connection *con)
 	con->out_connect.authorizer_len = auth ?
 		cpu_to_le32(auth->authorizer_buf_len) : 0;
 
-	con_out_kvec_reset(con);
 	con_out_kvec_add(con, sizeof (con->out_connect),
 					&con->out_connect);
 	if (auth && auth->authorizer_buf_len)
@@ -1557,6 +1556,7 @@ static int process_connect(struct ceph_connection *con)
 			return -1;
 		}
 		con->auth_retry = 1;
+		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -1577,6 +1577,7 @@ static int process_connect(struct ceph_connection *con)
 		       ENTITY_NAME(con->peer_name),
 		       ceph_pr_addr(&con->peer_addr.in_addr));
 		reset_connection(con);
+		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -1601,6 +1602,7 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->out_connect.connect_seq),
 		     le32_to_cpu(con->in_reply.connect_seq));
 		con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
+		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -1617,6 +1619,7 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->in_reply.global_seq));
 		get_global_seq(con->msgr,
 			       le32_to_cpu(con->in_reply.global_seq));
+		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
@@ -2135,7 +2138,11 @@ more:
 		BUG_ON(con->state != CON_STATE_CONNECTING);
 		con->state = CON_STATE_NEGOTIATING;
 
-		/* Banner is good, exchange connection info */
+		/*
+		 * Received banner is good, exchange connection info.
+		 * Do not reset out_kvec, as sending our banner raced
+		 * with receiving peer banner after connect completed.
+		 */
 		ret = prepare_write_connect(con);
 		if (ret < 0)
 			goto out;
-- 
cgit v0.10.2


From 45f2e081f573526977abfa781a12728f83e9641f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Tue, 21 Aug 2012 12:11:51 -0700
Subject: ceph: avoid divide by zero in __validate_layout()

If "l->stripe_unit" is zero the the mod on the next line will cause a
divide by zero bug.  This comes from the copy_from_user() in
ceph_ioctl_set_layout_policy().  Passing 0 is valid, though (it means
"do not change") so avoid the % check in that case.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8e3fb69..1396ceb 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -42,7 +42,8 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
 	/* validate striping parameters */
 	if ((l->object_size & ~PAGE_MASK) ||
 	    (l->stripe_unit & ~PAGE_MASK) ||
-	    ((unsigned)l->object_size % (unsigned)l->stripe_unit))
+	    (l->stripe_unit != 0 &&
+	     ((unsigned)l->object_size % (unsigned)l->stripe_unit)))
 		return -EINVAL;
 
 	/* make sure it's a valid data pool */
-- 
cgit v0.10.2