From 79aec9844de339531f05b019644ccaf5dd777144 Mon Sep 17 00:00:00 2001
From: Sam Lang <sam.lang@inktank.com>
Date: Wed, 19 Dec 2012 09:44:23 -1000
Subject: ceph: Check for err on mds request in atomic_open

The error returned by ceph_mdsc_do_request includes errors sending the
request, errors on timeout, or any errors coming from the mds.  If
ceph_mdsc_do_request returns an error, the reply struct will most likely
be bogus.  We need to bail out and propogate the error instead of
overwriting it.

Signed-off-by: Sam Lang <sam.lang@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d415096..2c71cbd 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	err = ceph_mdsc_do_request(mdsc,
 				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
 				   req);
+	if (err)
+		goto out_err;
+
 	err = ceph_handle_snapdir(req, dentry, err);
 	if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
 		err = ceph_handle_notrace_create(dir, dentry);
-- 
cgit v0.10.2


From 6e8575faa8fa680d59404a4d58d12190667be815 Mon Sep 17 00:00:00 2001
From: Sam Lang <sam.lang@inktank.com>
Date: Fri, 28 Dec 2012 09:56:46 -0800
Subject: ceph: Check for created flag in response from mds

The mds now sends back a created inode if the create request
performed the create.  If the file already existed, no inode is
returned in the reply.  This allows ceph to set the created flag
in atomic_open so that permissions are properly checked in the case
that the file wasn't created by the create call to the mds.

To ensure compability with previous kernels, a feature for sending
back the inode in the create reply was added, so that the mds will
only send back the inode if the client indicates it supports the
feature.

Signed-off-by: Sam Lang <sam.lang@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2c71cbd..22b5b71 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -266,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 		err = finish_no_open(file, dn);
 	} else {
 		dout("atomic_open finish_open on dn %p\n", dn);
+		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
+			*opened |= FILE_CREATED;
+		}
 		err = finish_open(file, dentry, ceph_open, opened);
 	}
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9165eb8..d958420 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -233,6 +233,30 @@ bad:
 }
 
 /*
+ * parse create results
+ */
+static int parse_reply_info_create(void **p, void *end,
+				  struct ceph_mds_reply_info_parsed *info,
+				  int features)
+{
+	if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+		if (*p == end) {
+			info->has_create_ino = false;
+		} else {
+			info->has_create_ino = true;
+			info->ino = ceph_decode_64(p);
+		}
+	}
+
+	if (unlikely(*p != end))
+		goto bad;
+	return 0;
+
+bad:
+	return -EIO;
+}
+
+/*
  * parse extra results
  */
 static int parse_reply_info_extra(void **p, void *end,
@@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end,
 {
 	if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
 		return parse_reply_info_filelock(p, end, info, features);
-	else
+	else if (info->head->op == CEPH_MDS_OP_READDIR)
 		return parse_reply_info_dir(p, end, info, features);
+	else if (info->head->op == CEPH_MDS_OP_CREATE)
+		return parse_reply_info_create(p, end, info, features);
+	else
+		return -EIO;
 }
 
 /*
@@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	mutex_lock(&req->r_fill_mutex);
 	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
 	if (err == 0) {
-		if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
+		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
+				    req->r_op == CEPH_MDS_OP_LSSNAP) &&
 		    rinfo->dir_nr)
 			ceph_readdir_prepopulate(req, req->r_session);
 		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index dd26846..567f7c6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed {
 			struct ceph_mds_reply_info_in *dir_in;
 			u8                            dir_complete, dir_end;
 		};
+
+		/* for create results */
+		struct {
+			bool has_create_ino;
+			u64 ino;
+		};
 	};
 
 	/* encoded blob describing snapshot contexts for certain
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index dad579b..6b7c6ac 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -14,13 +14,16 @@
 #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
 /* bits 8-17 defined by user-space; not supported yet here */
 #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
+/* bits 19-25 defined by user-space; not supported yet here */
+#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
 
 /*
  * Features supported.
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
 	(CEPH_FEATURE_NOSRCADDR |	 \
-	 CEPH_FEATURE_CRUSH_TUNABLES)
+	 CEPH_FEATURE_CRUSH_TUNABLES |   \
+	 CEPH_FEATURE_REPLY_CREATE_INODE)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
 	(CEPH_FEATURE_NOSRCADDR)
-- 
cgit v0.10.2


From a41bad1a9b9f9982eb9b451165724c5f81096683 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 30 Nov 2012 13:49:51 +0800
Subject: ceph: re-calculate truncate_size for strip object

Otherwise osd may truncate the object to larger size.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index eb9a444..267f183 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -76,8 +76,16 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 		     orig_len - *plen, off, *plen);
 
 	if (op_has_extent(op->op)) {
+		u32 osize = le32_to_cpu(layout->fl_object_size);
 		op->extent.offset = objoff;
 		op->extent.length = objlen;
+		if (op->extent.truncate_size <= off - objoff) {
+			op->extent.truncate_size = 0;
+		} else {
+			op->extent.truncate_size -= off - objoff;
+			if (op->extent.truncate_size > osize)
+				op->extent.truncate_size = osize;
+		}
 	}
 	req->r_num_pages = calc_pages_for(off, *plen);
 	req->r_page_alignment = off & ~PAGE_MASK;
-- 
cgit v0.10.2


From 8a92a119b292012a9bd920b908c3e9f1c512291d Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 4 Jan 2013 14:28:07 +0800
Subject: ceph: move dirty inode to migrating list when clearing auth caps

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a1d9bb3..a9fe2d5 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -611,8 +611,16 @@ retry:
 
 	if (flags & CEPH_CAP_FLAG_AUTH)
 		ci->i_auth_cap = cap;
-	else if (ci->i_auth_cap == cap)
+	else if (ci->i_auth_cap == cap) {
 		ci->i_auth_cap = NULL;
+		spin_lock(&mdsc->cap_dirty_lock);
+		if (!list_empty(&ci->i_dirty_item)) {
+			dout(" moving %p to cap_dirty_migrating\n", inode);
+			list_move(&ci->i_dirty_item,
+				  &mdsc->cap_dirty_migrating);
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
+	}
 
 	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
 	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
-- 
cgit v0.10.2


From 395c312b9c535d57db122cbb5b7292223561d0b8 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 4 Jan 2013 14:37:57 +0800
Subject: ceph: allow revoking duplicated caps issued by non-auth MDS

Allow revoking duplicated caps issued by non-auth MDS if these caps
are also issued by auth MDS.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a9fe2d5..76b1923 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1468,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
-	int file_wanted, used;
+	int file_wanted, used, cap_used;
 	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
 	int issued, implemented, want, retain, revoking, flushing = 0;
 	int mds = -1;   /* keep track of how far we've gone through i_caps list
@@ -1571,9 +1571,14 @@ retry_locked:
 
 		/* NOTE: no side-effects allowed, until we take s_mutex */
 
+		cap_used = used;
+		if (ci->i_auth_cap && cap != ci->i_auth_cap)
+			cap_used &= ~ci->i_auth_cap->issued;
+
 		revoking = cap->implemented & ~cap->issued;
-		dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
+		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
 		     cap->mds, cap, ceph_cap_string(cap->issued),
+		     ceph_cap_string(cap_used),
 		     ceph_cap_string(cap->implemented),
 		     ceph_cap_string(revoking));
 
@@ -1601,7 +1606,7 @@ retry_locked:
 		}
 
 		/* completed revocation? going down and there are no caps? */
-		if (revoking && (revoking & used) == 0) {
+		if (revoking && (revoking & cap_used) == 0) {
 			dout("completed revocation of %s\n",
 			     ceph_cap_string(cap->implemented & ~cap->issued));
 			goto ack;
@@ -1678,8 +1683,8 @@ ack:
 		sent++;
 
 		/* __send_cap drops i_ceph_lock */
-		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
-				      retain, flushing, NULL);
+		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
+				      want, retain, flushing, NULL);
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
 	}
 
-- 
cgit v0.10.2


From 66f58691c5c820283dd7e4d6fe8649033ed43ceb Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 4 Jan 2013 14:45:18 +0800
Subject: ceph: allocate cap_release message when receiving cap import

When client wants to release an imported cap, it's possible there
is no reserved cap_release message in corresponding mds session.
so __queue_cap_release causes kernel panic.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 76b1923..40b5bbe 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2833,6 +2833,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
 	     (unsigned)seq);
 
+	if (op == CEPH_CAP_OP_IMPORT)
+		ceph_add_cap_releases(mdsc, session);
+
 	/* lookup ino */
 	inode = ceph_find_inode(sb, vino);
 	ci = ceph_inode(inode);
-- 
cgit v0.10.2


From 390306c38dd43908f7f7730229999790a773d1d5 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Fri, 4 Jan 2013 15:30:10 +0800
Subject: ceph: check mds_wanted for imported cap

The MDS may have incorrect wanted caps after importing caps. So the
client should check the value mds has and send cap update if necessary.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Sage Weil <sage@inktank.com>

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 40b5bbe..1e1e020 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2429,7 +2429,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		dout("mds wanted %s -> %s\n",
 		     ceph_cap_string(le32_to_cpu(grant->wanted)),
 		     ceph_cap_string(wanted));
-		grant->wanted = cpu_to_le32(wanted);
+		/* imported cap may not have correct mds_wanted */
+		if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
+			check_caps = 1;
 	}
 
 	cap->seq = seq;
-- 
cgit v0.10.2


From 1604f488ac2dcce33c8218e75a000e8c5fb57e61 Mon Sep 17 00:00:00 2001
From: Jim Schutt <jaschut@sandia.gov>
Date: Fri, 30 Nov 2012 09:15:25 -0700
Subject: libceph: for chooseleaf rules, retry CRUSH map descent from root if
 leaf is failed

Add libceph support for a new CRUSH tunable recently added to Ceph servers.

Consider the CRUSH rule
  step chooseleaf firstn 0 type <node_type>

This rule means that <n> replicas will be chosen in a manner such that
each chosen leaf's branch will contain a unique instance of <node_type>.

When an object is re-replicated after a leaf failure, if the CRUSH map uses
a chooseleaf rule the remapped replica ends up under the <node_type> bucket
that held the failed leaf.  This causes uneven data distribution across the
storage cluster, to the point that when all the leaves but one fail under a
particular <node_type> bucket, that remaining leaf holds all the data from
its failed peers.

This behavior also limits the number of peers that can participate in the
re-replication of the data held by the failed leaf, which increases the
time required to re-replicate after a failure.

For a chooseleaf CRUSH rule, the tree descent has two steps: call them the
inner and outer descents.

If the tree descent down to <node_type> is the outer descent, and the descent
from <node_type> down to a leaf is the inner descent, the issue is that a
down leaf is detected on the inner descent, so only the inner descent is
retried.

In order to disperse re-replicated data as widely as possible across a
storage cluster after a failure, we want to retry the outer descent. So,
fix up crush_choose() to allow the inner descent to return immediately on
choosing a failed leaf.  Wire this up as a new CRUSH tunable.

Note that after this change, for a chooseleaf rule, if the primary OSD
in a placement group has failed, choosing a replacement may result in
one of the other OSDs in the PG colliding with the new primary.  This
requires that OSD's data for that PG to need moving as well.  This
seems unavoidable but should be relatively rare.

This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc.

Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Sage Weil <sage@inktank.com>

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 6b7c6ac..2160aab 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -14,7 +14,9 @@
 #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
 /* bits 8-17 defined by user-space; not supported yet here */
 #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
-/* bits 19-25 defined by user-space; not supported yet here */
+/* bits 19-24 defined by user-space; not supported yet here */
+#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25)
+/* bit 26 defined by user-space; not supported yet here */
 #define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
 
 /*
@@ -22,7 +24,8 @@
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
 	(CEPH_FEATURE_NOSRCADDR |	 \
-	 CEPH_FEATURE_CRUSH_TUNABLES |   \
+	 CEPH_FEATURE_CRUSH_TUNABLES |	  \
+	 CEPH_FEATURE_CRUSH_TUNABLES2 |   \
 	 CEPH_FEATURE_REPLY_CREATE_INODE)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 25baa28..6a1101f 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -162,6 +162,8 @@ struct crush_map {
 	__u32 choose_local_fallback_tries;
 	/* choose attempts before giving up */ 
 	__u32 choose_total_tries;
+	/* attempt chooseleaf inner descent once; on failure retry outer descent */
+	__u32 chooseleaf_descend_once;
 };
 
 
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 35fce75..96c8a58 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -287,6 +287,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
  * @outpos: our position in that vector
  * @firstn: true if choosing "first n" items, false if choosing "indep"
  * @recurse_to_leaf: true if we want one device under each item of given type
+ * @descend_once: true if we should only try one descent before giving up
  * @out2: second output vector for leaf items (if @recurse_to_leaf)
  */
 static int crush_choose(const struct crush_map *map,
@@ -295,7 +296,7 @@ static int crush_choose(const struct crush_map *map,
 			int x, int numrep, int type,
 			int *out, int outpos,
 			int firstn, int recurse_to_leaf,
-			int *out2)
+			int descend_once, int *out2)
 {
 	int rep;
 	unsigned int ftotal, flocal;
@@ -399,6 +400,7 @@ static int crush_choose(const struct crush_map *map,
 							 x, outpos+1, 0,
 							 out2, outpos,
 							 firstn, 0,
+							 map->chooseleaf_descend_once,
 							 NULL) <= outpos)
 							/* didn't get leaf */
 							reject = 1;
@@ -422,7 +424,10 @@ reject:
 					ftotal++;
 					flocal++;
 
-					if (collide && flocal <= map->choose_local_tries)
+					if (reject && descend_once)
+						/* let outer call try again */
+						skip_rep = 1;
+					else if (collide && flocal <= map->choose_local_tries)
 						/* retry locally a few times */
 						retry_bucket = 1;
 					else if (map->choose_local_fallback_tries > 0 &&
@@ -485,6 +490,7 @@ int crush_do_rule(const struct crush_map *map,
 	int i, j;
 	int numrep;
 	int firstn;
+	const int descend_once = 0;
 
 	if ((__u32)ruleno >= map->max_rules) {
 		dprintk(" bad ruleno %d\n", ruleno);
@@ -544,7 +550,8 @@ int crush_do_rule(const struct crush_map *map,
 						      curstep->arg2,
 						      o+osize, j,
 						      firstn,
-						      recurse_to_leaf, c+osize);
+						      recurse_to_leaf,
+						      descend_once, c+osize);
 			}
 
 			if (recurse_to_leaf)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index de73214..ca05871 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -170,6 +170,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
         c->choose_local_tries = 2;
         c->choose_local_fallback_tries = 5;
         c->choose_total_tries = 19;
+	c->chooseleaf_descend_once = 0;
 
 	ceph_decode_need(p, end, 4*sizeof(u32), bad);
 	magic = ceph_decode_32(p);
@@ -336,6 +337,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
         dout("crush decode tunable choose_total_tries = %d",
              c->choose_total_tries);
 
+	ceph_decode_need(p, end, sizeof(u32), done);
+	c->chooseleaf_descend_once = ceph_decode_32(p);
+	dout("crush decode tunable chooseleaf_descend_once = %d",
+	     c->chooseleaf_descend_once);
+
 done:
 	dout("crush_decode success\n");
 	return c;
-- 
cgit v0.10.2


From 7d7c1f6136bac00174842f845babe7fb3483724e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Tue, 15 Jan 2013 18:49:09 -0800
Subject: crush: avoid recursion if we have already collided

This saves us some cycles, but does not affect the placement result at
all.

This corresponds to ceph.git commit 4abb53d4f.

Signed-off-by: Sage Weil <sage@inktank.com>

diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 96c8a58..cbd06a9 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -392,7 +392,7 @@ static int crush_choose(const struct crush_map *map,
 				}
 
 				reject = 0;
-				if (recurse_to_leaf) {
+				if (!collide && recurse_to_leaf) {
 					if (item < 0) {
 						if (crush_choose(map,
 							 map->buckets[-1-item],
-- 
cgit v0.10.2


From c3acb18196cf3d7d3db6a5121c1bc674c3fba31f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 7 Dec 2012 09:57:58 -0600
Subject: libceph: reformat __reset_osd()

Reformat __reset_osd() into three distinct blocks of code
handling the three return cases.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 267f183..eade41b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -747,31 +747,35 @@ static void remove_old_osds(struct ceph_osd_client *osdc)
  */
 static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 {
-	struct ceph_osd_request *req;
-	int ret = 0;
+	struct ceph_entity_addr *peer_addr;
 
 	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
 	if (list_empty(&osd->o_requests) &&
 	    list_empty(&osd->o_linger_requests)) {
 		__remove_osd(osdc, osd);
-		ret = -ENODEV;
-	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
-			  &osd->o_con.peer_addr,
-			  sizeof(osd->o_con.peer_addr)) == 0 &&
-		   !ceph_con_opened(&osd->o_con)) {
+
+		return -ENODEV;
+	}
+
+	peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
+	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
+			!ceph_con_opened(&osd->o_con)) {
+		struct ceph_osd_request *req;
+
 		dout(" osd addr hasn't changed and connection never opened,"
 		     " letting msgr retry");
 		/* touch each r_stamp for handle_timeout()'s benfit */
 		list_for_each_entry(req, &osd->o_requests, r_osd_item)
 			req->r_stamp = jiffies;
-		ret = -EAGAIN;
-	} else {
-		ceph_con_close(&osd->o_con);
-		ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
-			      &osdc->osdmap->osd_addr[osd->o_osd]);
-		osd->o_incarnation++;
+
+		return -EAGAIN;
 	}
-	return ret;
+
+	ceph_con_close(&osd->o_con);
+	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
+	osd->o_incarnation++;
+
+	return 0;
 }
 
 static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
-- 
cgit v0.10.2


From c66c6e0c0b04ce4a152fe02c562dd0752665d580 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 08:39:26 -0500
Subject: rbd: document rbd_spec structure

I promised Josh I would document whether there were any restrictions
needed for accessing fields of an rbd_spec structure.  This adds a
big block of comments that documents the structure and how it is
used--including the fact that we don't attempt to synchronize access
to it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 89576a0..128978c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -119,7 +119,26 @@ struct rbd_image_header {
  * An rbd image specification.
  *
  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
- * identify an image.
+ * identify an image.  Each rbd_dev structure includes a pointer to
+ * an rbd_spec structure that encapsulates this identity.
+ *
+ * Each of the id's in an rbd_spec has an associated name.  For a
+ * user-mapped image, the names are supplied and the id's associated
+ * with them are looked up.  For a layered image, a parent image is
+ * defined by the tuple, and the names are looked up.
+ *
+ * An rbd_dev structure contains a parent_spec pointer which is
+ * non-null if the image it represents is a child in a layered
+ * image.  This pointer will refer to the rbd_spec structure used
+ * by the parent rbd_dev for its own identity (i.e., the structure
+ * is shared between the parent and child).
+ *
+ * Since these structures are populated once, during the discovery
+ * phase of image construction, they are effectively immutable so
+ * we make no effort to synchronize access to them.
+ *
+ * Note that code herein does not assume the image name is known (it
+ * could be a null pointer).
  */
 struct rbd_spec {
 	u64		pool_id;
-- 
cgit v0.10.2


From 69e7a02f633ba7de0aefa87f3f0e3b31e953b09a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 08:39:26 -0500
Subject: rbd: kill rbd_spec->image_name_len

There may have been a benefit to hanging on to the length of an
image name before, but there is really none now.  The only time it's
used is when probing for rbd images, so we can just compute the
length then.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 128978c..a002984 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -147,7 +147,6 @@ struct rbd_spec {
 	char		*image_id;
 	size_t		image_id_len;
 	char		*image_name;
-	size_t		image_name_len;
 
 	u64		snap_id;
 	char		*snap_name;
@@ -2563,15 +2562,15 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
 
 	rbd_assert(!rbd_dev->spec->image_name);
 
-	image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
+	len = strlen(rbd_dev->spec->image_id);
+	image_id_size = sizeof (__le32) + len;
 	image_id = kmalloc(image_id_size, GFP_KERNEL);
 	if (!image_id)
 		return NULL;
 
 	p = image_id;
 	end = (char *) image_id + image_id_size;
-	ceph_encode_string(&p, end, rbd_dev->spec->image_id,
-				(u32) rbd_dev->spec->image_id_len);
+	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
 
 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
 	reply_buf = kmalloc(size, GFP_KERNEL);
@@ -2631,14 +2630,12 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
 	/* Fetch the image name; tolerate failure here */
 
 	name = rbd_dev_image_name(rbd_dev);
-	if (name) {
-		rbd_dev->spec->image_name_len = strlen(name);
+	if (name)
 		rbd_dev->spec->image_name = (char *) name;
-	} else {
+	else
 		pr_warning(RBD_DRV_NAME "%d "
 			"unable to get image name for image id %s\n",
 			rbd_dev->major, rbd_dev->spec->image_id);
-	}
 
 	/* Look up the snapshot name. */
 
@@ -3252,7 +3249,7 @@ static int rbd_add_parse_args(const char *buf,
 	if (!*spec->pool_name)
 		goto out_err;	/* Missing pool name */
 
-	spec->image_name = dup_token(&buf, &spec->image_name_len);
+	spec->image_name = dup_token(&buf, NULL);
 	if (!spec->image_name)
 		goto out_mem;
 	if (!*spec->image_name)
@@ -3342,7 +3339,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 	 * First, see if the format 2 image id file exists, and if
 	 * so, get the image's persistent id from it.
 	 */
-	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
+	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
 	object_name = kmalloc(size, GFP_NOIO);
 	if (!object_name)
 		return -ENOMEM;
@@ -3400,7 +3397,7 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
 
 	/* Record the header object name for this rbd image. */
 
-	size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
+	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
 	if (!rbd_dev->header_name) {
 		ret = -ENOMEM;
-- 
cgit v0.10.2


From 979ed480a2722ad8d9f87054635158f652a1241e Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 08:39:26 -0500
Subject: rbd: kill rbd_spec->image_id_len

There is no real benefit to keeping the length of an image id, so
get rid of it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a002984..e01dbb1 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -145,7 +145,6 @@ struct rbd_spec {
 	char		*pool_name;
 
 	char		*image_id;
-	size_t		image_id_len;
 	char		*image_name;
 
 	u64		snap_id;
@@ -2492,7 +2491,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	void *end;
 	char *image_id;
 	u64 overlap;
-	size_t len = 0;
 	int ret;
 
 	parent_spec = rbd_spec_alloc();
@@ -2526,13 +2524,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	if (parent_spec->pool_id == CEPH_NOPOOL)
 		goto out;	/* No parent?  No problem. */
 
-	image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
+	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
 	if (IS_ERR(image_id)) {
 		ret = PTR_ERR(image_id);
 		goto out_err;
 	}
 	parent_spec->image_id = image_id;
-	parent_spec->image_id_len = len;
 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
 	ceph_decode_64_safe(&p, end, overlap, out_err);
 
@@ -3368,8 +3365,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 	p = response;
 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
 						p + RBD_IMAGE_ID_LEN_MAX,
-						&rbd_dev->spec->image_id_len,
-						GFP_NOIO);
+						NULL, GFP_NOIO);
 	if (IS_ERR(rbd_dev->spec->image_id)) {
 		ret = PTR_ERR(rbd_dev->spec->image_id);
 		rbd_dev->spec->image_id = NULL;
@@ -3393,7 +3389,6 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
 	if (!rbd_dev->spec->image_id)
 		return -ENOMEM;
-	rbd_dev->spec->image_id_len = 0;
 
 	/* Record the header object name for this rbd image. */
 
@@ -3443,7 +3438,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
 	 * Image id was filled in by the caller.  Record the header
 	 * object name for this rbd image.
 	 */
-	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
+	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
 	if (!rbd_dev->header_name)
 		return -ENOMEM;
-- 
cgit v0.10.2


From 4caf35f9ecdca1feef1d2e5e223b78e52ffbea87 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 08:39:27 -0500
Subject: rbd: use kmemdup()

This replaces two kmalloc()/memcpy() combinations with a single
call to kmemdup().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e01dbb1..d97611e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3151,11 +3151,9 @@ static inline char *dup_token(const char **buf, size_t *lenp)
 	size_t len;
 
 	len = next_token(buf);
-	dup = kmalloc(len + 1, GFP_KERNEL);
+	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
 	if (!dup)
 		return NULL;
-
-	memcpy(dup, *buf, len);
 	*(dup + len) = '\0';
 	*buf += len;
 
@@ -3264,10 +3262,9 @@ static int rbd_add_parse_args(const char *buf,
 		ret = -ENAMETOOLONG;
 		goto out_err;
 	}
-	spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
+	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
 	if (!spec->snap_name)
 		goto out_mem;
-	memcpy(spec->snap_name, buf, len);
 	*(spec->snap_name + len) = '\0';
 
 	/* Initialize all rbd options to the defaults */
-- 
cgit v0.10.2


From dd5f049dbdf973d9bceebef1fd73647a5ede6732 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 1 Nov 2012 08:39:27 -0500
Subject: ceph: define ceph_encode_8_safe()

It's kind of a silly macro, but ceph_encode_8_safe() is the only one
missing from an otherwise pretty complete set.  It's not used, but
neither are a couple of the others in this set.

While in there, insert some whitespace to tidy up the alignment of
the line-terminating backslashes in some of the macro definitions.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 4bbf2db..cd679f2 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -52,10 +52,10 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
 	return end >= *p && n <= end - *p;
 }
 
-#define ceph_decode_need(p, end, n, bad)		\
-	do {						\
-		if (!likely(ceph_has_room(p, end, n)))	\
-			goto bad;			\
+#define ceph_decode_need(p, end, n, bad)			\
+	do {							\
+		if (!likely(ceph_has_room(p, end, n)))		\
+			goto bad;				\
 	} while (0)
 
 #define ceph_decode_64_safe(p, end, v, bad)			\
@@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
  *
  * There are two possible failures:
  *   - converting the string would require accessing memory at or
- *     beyond the "end" pointer provided (-E
- *   - memory could not be allocated for the result
+ *     beyond the "end" pointer provided (-ERANGE)
+ *   - memory could not be allocated for the result (-ENOMEM)
  */
 static inline char *ceph_extract_encoded_string(void **p, void *end,
 						size_t *lenp, gfp_t gfp)
@@ -217,10 +217,10 @@ static inline void ceph_encode_string(void **p, void *end,
 	*p += len;
 }
 
-#define ceph_encode_need(p, end, n, bad)		\
-	do {						\
-		if (!likely(ceph_has_room(p, end, n)))	\
-			goto bad;			\
+#define ceph_encode_need(p, end, n, bad)			\
+	do {							\
+		if (!likely(ceph_has_room(p, end, n)))		\
+			goto bad;				\
 	} while (0)
 
 #define ceph_encode_64_safe(p, end, v, bad)			\
@@ -231,12 +231,17 @@ static inline void ceph_encode_string(void **p, void *end,
 #define ceph_encode_32_safe(p, end, v, bad)			\
 	do {							\
 		ceph_encode_need(p, end, sizeof(u32), bad);	\
-		ceph_encode_32(p, v);			\
+		ceph_encode_32(p, v);				\
 	} while (0)
 #define ceph_encode_16_safe(p, end, v, bad)			\
 	do {							\
 		ceph_encode_need(p, end, sizeof(u16), bad);	\
-		ceph_encode_16(p, v);			\
+		ceph_encode_16(p, v);				\
+	} while (0)
+#define ceph_encode_8_safe(p, end, v, bad)			\
+	do {							\
+		ceph_encode_need(p, end, sizeof(u8), bad);	\
+		ceph_encode_8(p, v);				\
 	} while (0)
 
 #define ceph_encode_copy_safe(p, end, pv, n, bad)		\
-- 
cgit v0.10.2


From 06ecc6cbf7a60cd5abd9fd2bda8ae69b395c2be3 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 10:17:15 -0500
Subject: rbd: define and use rbd_warn()

Define a new function rbd_warn() that produces a boilerplate warning
message, identifying in the resulting message the affected rbd
device in the best way available.  Use it in a few places that now
use pr_warning().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d97611e..635b81d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -294,6 +294,33 @@ static struct device rbd_root_dev = {
 	.release =      rbd_root_dev_release,
 };
 
+static __printf(2, 3)
+void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	if (!rbd_dev)
+		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
+	else if (rbd_dev->disk)
+		printk(KERN_WARNING "%s: %s: %pV\n",
+			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
+	else if (rbd_dev->spec && rbd_dev->spec->image_name)
+		printk(KERN_WARNING "%s: image %s: %pV\n",
+			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
+	else if (rbd_dev->spec && rbd_dev->spec->image_id)
+		printk(KERN_WARNING "%s: id %s: %pV\n",
+			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
+	else	/* punt */
+		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
+			RBD_DRV_NAME, rbd_dev, &vaf);
+	va_end(args);
+}
+
 #ifdef RBD_DEBUG
 #define rbd_assert(expr)						\
 		if (unlikely(!(expr))) {				\
@@ -1403,8 +1430,8 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 		(unsigned int) opcode);
 	rc = rbd_dev_refresh(rbd_dev, &hver);
 	if (rc)
-		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
-			   " update snaps: %d\n", rbd_dev->major, rc);
+		rbd_warn(rbd_dev, "got notification but failed to "
+			   " update snaps: %d\n", rc);
 
 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
 }
@@ -1767,15 +1794,13 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
 			goto out_err;
 		if (WARN_ON((size_t) ret < size)) {
 			ret = -ENXIO;
-			pr_warning("short header read for image %s"
-					" (want %zd got %d)\n",
-				rbd_dev->spec->image_name, size, ret);
+			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
+				size, ret);
 			goto out_err;
 		}
 		if (!rbd_dev_ondisk_valid(ondisk)) {
 			ret = -ENXIO;
-			pr_warning("invalid header for image %s\n",
-				rbd_dev->spec->image_name);
+			rbd_warn(rbd_dev, "invalid header");
 			goto out_err;
 		}
 
@@ -2630,9 +2655,7 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
 	if (name)
 		rbd_dev->spec->image_name = (char *) name;
 	else
-		pr_warning(RBD_DRV_NAME "%d "
-			"unable to get image name for image id %s\n",
-			rbd_dev->major, rbd_dev->spec->image_id);
+		rbd_warn(rbd_dev, "unable to get image name");
 
 	/* Look up the snapshot name. */
 
-- 
cgit v0.10.2


From 4fb5d671399e83d3875593db2f56d5b57fcb104f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 10:17:15 -0500
Subject: rbd: add warning messages for missing arguments

Tell the user (via dmesg) what was wrong with the arguments provided
via /sys/bus/rbd/add.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 635b81d..31da8c5 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3244,8 +3244,10 @@ static int rbd_add_parse_args(const char *buf,
 	/* The first four tokens are required */
 
 	len = next_token(&buf);
-	if (!len)
-		return -EINVAL;	/* Missing monitor address(es) */
+	if (!len) {
+		rbd_warn(NULL, "no monitor address(es) provided");
+		return -EINVAL;
+	}
 	mon_addrs = buf;
 	mon_addrs_size = len + 1;
 	buf += len;
@@ -3254,8 +3256,10 @@ static int rbd_add_parse_args(const char *buf,
 	options = dup_token(&buf, NULL);
 	if (!options)
 		return -ENOMEM;
-	if (!*options)
-		goto out_err;	/* Missing options */
+	if (!*options) {
+		rbd_warn(NULL, "no options provided");
+		goto out_err;
+	}
 
 	spec = rbd_spec_alloc();
 	if (!spec)
@@ -3264,14 +3268,18 @@ static int rbd_add_parse_args(const char *buf,
 	spec->pool_name = dup_token(&buf, NULL);
 	if (!spec->pool_name)
 		goto out_mem;
-	if (!*spec->pool_name)
-		goto out_err;	/* Missing pool name */
+	if (!*spec->pool_name) {
+		rbd_warn(NULL, "no pool name provided");
+		goto out_err;
+	}
 
 	spec->image_name = dup_token(&buf, NULL);
 	if (!spec->image_name)
 		goto out_mem;
-	if (!*spec->image_name)
-		goto out_err;	/* Missing image name */
+	if (!*spec->image_name) {
+		rbd_warn(NULL, "no image name provided");
+		goto out_err;
+	}
 
 	/*
 	 * Snapshot name is optional; default is to use "-"
-- 
cgit v0.10.2


From f5400b7a0e78a53edce8960a079aa022640849a1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 10:17:15 -0500
Subject: rbd: add a warning in bio_chain_clone_range()

Add a warning in bio_chain_clone_range() to help a user determine
what exactly might have led to a failure.  There is only one; please
say something if you disagree with the following reasoning.

There are three places this can return abnormally:
    - Initially, if there is nothing to clone.  It turns out that
      right now this cannot happen anyway.  The test is in place
      because the code below it doesn't work if those conditions
      don't hold.  As such they could be assertions but since I can
      return a null to indicate an error I just do that instead.
      I have not added a warning here because it won't happen.
    - While processing bio's, if none remain but there are supposed
      to be more bytes to clone.  Here I have added a warning.
    - If bio_clone_range() returns a null pointer.  That function
      will have already produced a warning (at least the first
      time, via WARN_ON_ONCE()) to distinguish the cause of the
      error.  The only exception is memory exhaustion, and I'd
      rather not pepper the code with warnings in all those spots.
      So no warning is added in that place.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 31da8c5..ce6c0cb 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -993,8 +993,10 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
 		unsigned int bi_size;
 		struct bio *bio;
 
-		if (!bi)
+		if (!bi) {
+			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
 			goto out_err;	/* EINVAL; ran out of bio's */
+		}
 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
 		if (!bio)
-- 
cgit v0.10.2


From 935dc89f3e29e2ef1d7c89778cdb9f37ff36e94b Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 1 Nov 2012 10:17:15 -0500
Subject: rbd: add warnings to rbd_dev_probe_update_spec()

Josh suggested adding warnings to this function to help users
diagnose problems.

Other than memory allocatino errors, there are two places where
errors can be returned.  Both represent problems that should
have been caught earlier, and as such might well have been
handled with BUG_ON() calls.  But if either ever did manage to
happen, it will be reported.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ce6c0cb..530a121 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2644,8 +2644,11 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
-	if (!name)
-		return -EIO;	/* pool id too large (>= 2^31) */
+	if (!name) {
+		rbd_warn(rbd_dev, "there is no pool with id %llu",
+			rbd_dev->spec->pool_id);	/* Really a BUG() */
+		return -EIO;
+	}
 
 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
 	if (!rbd_dev->spec->pool_name)
@@ -2663,6 +2666,8 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
 
 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
 	if (!name) {
+		rbd_warn(rbd_dev, "no snapshot with id %llu",
+			rbd_dev->spec->snap_id);	/* Really a BUG() */
 		ret = -EIO;
 		goto out_err;
 	}
-- 
cgit v0.10.2


From 725afc97c91cd5f71a015143da5095d20cd668b9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 8 Nov 2012 08:01:39 -0600
Subject: rbd: standardize rbd_request variable names

There are two names used for items of rbd_request structure type:
"req" and "req_data".  The former name is also used to represent
items of pointers to struct ceph_osd_request.

Change all variables that have these names so they are instead
called "rbd_req" consistently.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 530a121..0091fa4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1088,10 +1088,12 @@ static void rbd_coll_end_req_index(struct request *rq,
 	spin_unlock_irq(q->queue_lock);
 }
 
-static void rbd_coll_end_req(struct rbd_request *req,
+static void rbd_coll_end_req(struct rbd_request *rbd_req,
 			     int ret, u64 len)
 {
-	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
+	rbd_coll_end_req_index(rbd_req->rq,
+				rbd_req->coll, rbd_req->coll_index,
+				ret, len);
 }
 
 /*
@@ -1119,12 +1121,12 @@ static int rbd_do_request(struct request *rq,
 	int ret;
 	u64 bno;
 	struct timespec mtime = CURRENT_TIME;
-	struct rbd_request *req_data;
+	struct rbd_request *rbd_req;
 	struct ceph_osd_request_head *reqhead;
 	struct ceph_osd_client *osdc;
 
-	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
-	if (!req_data) {
+	rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO);
+	if (!rbd_req) {
 		if (coll)
 			rbd_coll_end_req_index(rq, coll, coll_index,
 					       -ENOMEM, len);
@@ -1132,8 +1134,8 @@ static int rbd_do_request(struct request *rq,
 	}
 
 	if (coll) {
-		req_data->coll = coll;
-		req_data->coll_index = coll_index;
+		rbd_req->coll = coll;
+		rbd_req->coll_index = coll_index;
 	}
 
 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
@@ -1150,12 +1152,12 @@ static int rbd_do_request(struct request *rq,
 
 	req->r_callback = rbd_cb;
 
-	req_data->rq = rq;
-	req_data->bio = bio;
-	req_data->pages = pages;
-	req_data->len = len;
+	rbd_req->rq = rq;
+	rbd_req->bio = bio;
+	rbd_req->pages = pages;
+	rbd_req->len = len;
 
-	req->r_priv = req_data;
+	req->r_priv = rbd_req;
 
 	reqhead = req->r_request->front.iov_base;
 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
@@ -1200,11 +1202,11 @@ static int rbd_do_request(struct request *rq,
 	return ret;
 
 done_err:
-	bio_chain_put(req_data->bio);
+	bio_chain_put(rbd_req->bio);
 	ceph_osdc_put_request(req);
 done_pages:
-	rbd_coll_end_req(req_data, ret, len);
-	kfree(req_data);
+	rbd_coll_end_req(rbd_req, ret, len);
+	kfree(rbd_req);
 	return ret;
 }
 
@@ -1213,7 +1215,7 @@ done_pages:
  */
 static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
 {
-	struct rbd_request *req_data = req->r_priv;
+	struct rbd_request *rbd_req = req->r_priv;
 	struct ceph_osd_reply_head *replyhead;
 	struct ceph_osd_op *op;
 	__s32 rc;
@@ -1232,20 +1234,20 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
 		(unsigned long long) bytes, read_op, (int) rc);
 
 	if (rc == -ENOENT && read_op) {
-		zero_bio_chain(req_data->bio, 0);
+		zero_bio_chain(rbd_req->bio, 0);
 		rc = 0;
-	} else if (rc == 0 && read_op && bytes < req_data->len) {
-		zero_bio_chain(req_data->bio, bytes);
-		bytes = req_data->len;
+	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
+		zero_bio_chain(rbd_req->bio, bytes);
+		bytes = rbd_req->len;
 	}
 
-	rbd_coll_end_req(req_data, rc, bytes);
+	rbd_coll_end_req(rbd_req, rc, bytes);
 
-	if (req_data->bio)
-		bio_chain_put(req_data->bio);
+	if (rbd_req->bio)
+		bio_chain_put(rbd_req->bio);
 
 	ceph_osdc_put_request(req);
-	kfree(req_data);
+	kfree(rbd_req);
 }
 
 static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
-- 
cgit v0.10.2


From 5f29ddd4f0954ad6c84e28b934773f128840f064 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 8 Nov 2012 08:01:39 -0600
Subject: rbd: standardize ceph_osd_request variable names

There are spots where a ceph_osds_request pointer variable is given
the name "req".  Since we're dealing with (at least) three types of
requests (block layer, rbd, and osd), I find this slightly
distracting.

Change such instances to use "osd_req" consistently to make the
abstraction represented a little more obvious.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0091fa4..85131de 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1111,12 +1111,12 @@ static int rbd_do_request(struct request *rq,
 			  struct ceph_osd_req_op *ops,
 			  struct rbd_req_coll *coll,
 			  int coll_index,
-			  void (*rbd_cb)(struct ceph_osd_request *req,
-					 struct ceph_msg *msg),
+			  void (*rbd_cb)(struct ceph_osd_request *,
+					 struct ceph_msg *),
 			  struct ceph_osd_request **linger_req,
 			  u64 *ver)
 {
-	struct ceph_osd_request *req;
+	struct ceph_osd_request *osd_req;
 	struct ceph_file_layout *layout;
 	int ret;
 	u64 bno;
@@ -1143,67 +1143,68 @@ static int rbd_do_request(struct request *rq,
 		(unsigned long long) len, coll, coll_index);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
+	osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
 					false, GFP_NOIO, pages, bio);
-	if (!req) {
+	if (!osd_req) {
 		ret = -ENOMEM;
 		goto done_pages;
 	}
 
-	req->r_callback = rbd_cb;
+	osd_req->r_callback = rbd_cb;
 
 	rbd_req->rq = rq;
 	rbd_req->bio = bio;
 	rbd_req->pages = pages;
 	rbd_req->len = len;
 
-	req->r_priv = rbd_req;
+	osd_req->r_priv = rbd_req;
 
-	reqhead = req->r_request->front.iov_base;
+	reqhead = osd_req->r_request->front.iov_base;
 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
 
-	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
-	req->r_oid_len = strlen(req->r_oid);
+	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
+	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
-	layout = &req->r_file_layout;
+	layout = &osd_req->r_file_layout;
 	memset(layout, 0, sizeof(*layout));
 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 	layout->fl_stripe_count = cpu_to_le32(1);
 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
 	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
-				   req, ops);
+				   osd_req, ops);
 	rbd_assert(ret == 0);
 
-	ceph_osdc_build_request(req, ofs, &len,
+	ceph_osdc_build_request(osd_req, ofs, &len,
 				ops,
 				snapc,
 				&mtime,
-				req->r_oid, req->r_oid_len);
+				osd_req->r_oid, osd_req->r_oid_len);
 
 	if (linger_req) {
-		ceph_osdc_set_request_linger(osdc, req);
-		*linger_req = req;
+		ceph_osdc_set_request_linger(osdc, osd_req);
+		*linger_req = osd_req;
 	}
 
-	ret = ceph_osdc_start_request(osdc, req, false);
+	ret = ceph_osdc_start_request(osdc, osd_req, false);
 	if (ret < 0)
 		goto done_err;
 
 	if (!rbd_cb) {
-		ret = ceph_osdc_wait_request(osdc, req);
+		u64 version;
+
+		ret = ceph_osdc_wait_request(osdc, osd_req);
+		version = le64_to_cpu(osd_req->r_reassert_version.version);
 		if (ver)
-			*ver = le64_to_cpu(req->r_reassert_version.version);
-		dout("reassert_ver=%llu\n",
-			(unsigned long long)
-				le64_to_cpu(req->r_reassert_version.version));
-		ceph_osdc_put_request(req);
+			*ver = version;
+		dout("reassert_ver=%llu\n", (unsigned long long) version);
+		ceph_osdc_put_request(osd_req);
 	}
 	return ret;
 
 done_err:
 	bio_chain_put(rbd_req->bio);
-	ceph_osdc_put_request(req);
+	ceph_osdc_put_request(osd_req);
 done_pages:
 	rbd_coll_end_req(rbd_req, ret, len);
 	kfree(rbd_req);
@@ -1213,9 +1214,9 @@ done_pages:
 /*
  * Ceph osd op callback
  */
-static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
 {
-	struct rbd_request *rbd_req = req->r_priv;
+	struct rbd_request *rbd_req = osd_req->r_priv;
 	struct ceph_osd_reply_head *replyhead;
 	struct ceph_osd_op *op;
 	__s32 rc;
@@ -1246,13 +1247,14 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
 	if (rbd_req->bio)
 		bio_chain_put(rbd_req->bio);
 
-	ceph_osdc_put_request(req);
+	ceph_osdc_put_request(osd_req);
 	kfree(rbd_req);
 }
 
-static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
+				struct ceph_msg *msg)
 {
-	ceph_osdc_put_request(req);
+	ceph_osdc_put_request(osd_req);
 }
 
 /*
-- 
cgit v0.10.2


From 8986cb37b1cf1f54b35f062f0a12dc68dd89f311 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 8 Nov 2012 08:01:39 -0600
Subject: rbd: be picky about osd request status type

The result field in a ceph osd reply header is a signed 32-bit type,
but rbd code often casually uses int to represent it.

The following changes the types of variables that handle this result
value to be "s32" instead of "int" to be completely explicit about
it.  Only at the point we pass that result to __blk_end_request()
does the type get converted to the plain old int defined for that
interface.

There is almost certainly no binary impact of this change, but I
prefer to show the exact size and signedness of the value since we
know it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 85131de..8d93b6a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -171,7 +171,7 @@ struct rbd_client {
  */
 struct rbd_req_status {
 	int done;
-	int rc;
+	s32 rc;
 	u64 bytes;
 };
 
@@ -1053,13 +1053,13 @@ static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
 static void rbd_coll_end_req_index(struct request *rq,
 				   struct rbd_req_coll *coll,
 				   int index,
-				   int ret, u64 len)
+				   s32 ret, u64 len)
 {
 	struct request_queue *q;
 	int min, max, i;
 
 	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
-	     coll, index, ret, (unsigned long long) len);
+	     coll, index, (int)ret, (unsigned long long)len);
 
 	if (!rq)
 		return;
@@ -1080,7 +1080,7 @@ static void rbd_coll_end_req_index(struct request *rq,
 		max++;
 
 	for (i = min; i<max; i++) {
-		__blk_end_request(rq, coll->status[i].rc,
+		__blk_end_request(rq, (int)coll->status[i].rc,
 				  coll->status[i].bytes);
 		coll->num_done++;
 		kref_put(&coll->kref, rbd_coll_release);
@@ -1089,7 +1089,7 @@ static void rbd_coll_end_req_index(struct request *rq,
 }
 
 static void rbd_coll_end_req(struct rbd_request *rbd_req,
-			     int ret, u64 len)
+			     s32 ret, u64 len)
 {
 	rbd_coll_end_req_index(rbd_req->rq,
 				rbd_req->coll, rbd_req->coll_index,
@@ -1129,7 +1129,7 @@ static int rbd_do_request(struct request *rq,
 	if (!rbd_req) {
 		if (coll)
 			rbd_coll_end_req_index(rq, coll, coll_index,
-					       -ENOMEM, len);
+					       (s32)-ENOMEM, len);
 		return -ENOMEM;
 	}
 
@@ -1206,7 +1206,7 @@ done_err:
 	bio_chain_put(rbd_req->bio);
 	ceph_osdc_put_request(osd_req);
 done_pages:
-	rbd_coll_end_req(rbd_req, ret, len);
+	rbd_coll_end_req(rbd_req, (s32)ret, len);
 	kfree(rbd_req);
 	return ret;
 }
@@ -1219,7 +1219,7 @@ static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
 	struct rbd_request *rbd_req = osd_req->r_priv;
 	struct ceph_osd_reply_head *replyhead;
 	struct ceph_osd_op *op;
-	__s32 rc;
+	s32 rc;
 	u64 bytes;
 	int read_op;
 
@@ -1227,14 +1227,14 @@ static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
 	replyhead = msg->front.iov_base;
 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
 	op = (void *)(replyhead + 1);
-	rc = le32_to_cpu(replyhead->result);
+	rc = (s32)le32_to_cpu(replyhead->result);
 	bytes = le64_to_cpu(op->extent.length);
 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
 
 	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
 		(unsigned long long) bytes, read_op, (int) rc);
 
-	if (rc == -ENOENT && read_op) {
+	if (rc == (s32)-ENOENT && read_op) {
 		zero_bio_chain(rbd_req->bio, 0);
 		rc = 0;
 	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
@@ -1679,7 +1679,8 @@ static void rbd_rq_fn(struct request_queue *q)
 						bio_chain, coll, cur_seg);
 			else
 				rbd_coll_end_req_index(rq, coll, cur_seg,
-						       -ENOMEM, chain_size);
+						       (s32)-ENOMEM,
+						       chain_size);
 			size -= chain_size;
 			ofs += chain_size;
 
-- 
cgit v0.10.2


From 8295cda7ceceb7b25f9a12cd21bbfb6206e28a6d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 8 Nov 2012 08:01:39 -0600
Subject: rbd: encapsulate handling for a single request

In rbd_rq_fn(), requests are fetched from the block layer and each
request is processed, looping through the request's list of bio's
until they've all been consumed.

Separate the handling for a single request into its own function to
make it a bit easier to see what's going on.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8d93b6a..738d1e4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1583,6 +1583,64 @@ static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
 	return coll;
 }
 
+static int rbd_dev_do_request(struct request *rq,
+				struct rbd_device *rbd_dev,
+				struct ceph_snap_context *snapc,
+				u64 ofs, unsigned int size,
+				struct bio *bio_chain)
+{
+	int num_segs;
+	struct rbd_req_coll *coll;
+	unsigned int bio_offset;
+	int cur_seg = 0;
+
+	dout("%s 0x%x bytes at 0x%llx\n",
+		rq_data_dir(rq) == WRITE ? "write" : "read",
+		size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
+
+	num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
+	if (num_segs <= 0)
+		return num_segs;
+
+	coll = rbd_alloc_coll(num_segs);
+	if (!coll)
+		return -ENOMEM;
+
+	bio_offset = 0;
+	do {
+		u64 limit = rbd_segment_length(rbd_dev, ofs, size);
+		unsigned int clone_size;
+		struct bio *bio_clone;
+
+		BUG_ON(limit > (u64)UINT_MAX);
+		clone_size = (unsigned int)limit;
+		dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);
+
+		kref_get(&coll->kref);
+
+		/* Pass a cloned bio chain via an osd request */
+
+		bio_clone = bio_chain_clone_range(&bio_chain,
+					&bio_offset, clone_size,
+					GFP_ATOMIC);
+		if (bio_clone)
+			(void)rbd_do_op(rq, rbd_dev, snapc,
+					ofs, clone_size,
+					bio_clone, coll, cur_seg);
+		else
+			rbd_coll_end_req_index(rq, coll, cur_seg,
+						(s32)-ENOMEM,
+						clone_size);
+		size -= clone_size;
+		ofs += clone_size;
+
+		cur_seg++;
+	} while (size > 0);
+	kref_put(&coll->kref, rbd_coll_release);
+
+	return 0;
+}
+
 /*
  * block device queue callback
  */
@@ -1596,10 +1654,8 @@ static void rbd_rq_fn(struct request_queue *q)
 		bool do_write;
 		unsigned int size;
 		u64 ofs;
-		int num_segs, cur_seg = 0;
-		struct rbd_req_coll *coll;
 		struct ceph_snap_context *snapc;
-		unsigned int bio_offset;
+		int result;
 
 		dout("fetched request\n");
 
@@ -1637,60 +1693,11 @@ static void rbd_rq_fn(struct request_queue *q)
 		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
 		bio = rq->bio;
 
-		dout("%s 0x%x bytes at 0x%llx\n",
-		     do_write ? "write" : "read",
-		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
-
-		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
-		if (num_segs <= 0) {
-			spin_lock_irq(q->queue_lock);
-			__blk_end_request_all(rq, num_segs);
-			ceph_put_snap_context(snapc);
-			continue;
-		}
-		coll = rbd_alloc_coll(num_segs);
-		if (!coll) {
-			spin_lock_irq(q->queue_lock);
-			__blk_end_request_all(rq, -ENOMEM);
-			ceph_put_snap_context(snapc);
-			continue;
-		}
-
-		bio_offset = 0;
-		do {
-			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
-			unsigned int chain_size;
-			struct bio *bio_chain;
-
-			BUG_ON(limit > (u64) UINT_MAX);
-			chain_size = (unsigned int) limit;
-			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
-
-			kref_get(&coll->kref);
-
-			/* Pass a cloned bio chain via an osd request */
-
-			bio_chain = bio_chain_clone_range(&bio,
-						&bio_offset, chain_size,
-						GFP_ATOMIC);
-			if (bio_chain)
-				(void) rbd_do_op(rq, rbd_dev, snapc,
-						ofs, chain_size,
-						bio_chain, coll, cur_seg);
-			else
-				rbd_coll_end_req_index(rq, coll, cur_seg,
-						       (s32)-ENOMEM,
-						       chain_size);
-			size -= chain_size;
-			ofs += chain_size;
-
-			cur_seg++;
-		} while (size > 0);
-		kref_put(&coll->kref, rbd_coll_release);
-
-		spin_lock_irq(q->queue_lock);
-
+		result = rbd_dev_do_request(rq, rbd_dev, snapc, ofs, size, bio);
 		ceph_put_snap_context(snapc);
+		spin_lock_irq(q->queue_lock);
+		if (!size || result < 0)
+			__blk_end_request_all(rq, result);
 	}
 }
 
-- 
cgit v0.10.2


From cd323ac0eb433b14cbb270bfc5a82f308f2662de Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 8 Nov 2012 08:01:39 -0600
Subject: rbd: end request on error in rbd_do_request() caller

Only one of the three callers of rbd_do_request() provide a
collection structure to aggregate status.

If an error occurs in rbd_do_request(), have the caller
take care of calling rbd_coll_end_req() if necessary in
that one spot.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 738d1e4..21468d0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1126,12 +1126,8 @@ static int rbd_do_request(struct request *rq,
 	struct ceph_osd_client *osdc;
 
 	rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO);
-	if (!rbd_req) {
-		if (coll)
-			rbd_coll_end_req_index(rq, coll, coll_index,
-					       (s32)-ENOMEM, len);
+	if (!rbd_req)
 		return -ENOMEM;
-	}
 
 	if (coll) {
 		rbd_req->coll = coll;
@@ -1206,7 +1202,6 @@ done_err:
 	bio_chain_put(rbd_req->bio);
 	ceph_osdc_put_request(osd_req);
 done_pages:
-	rbd_coll_end_req(rbd_req, (s32)ret, len);
 	kfree(rbd_req);
 	return ret;
 }
@@ -1359,7 +1354,9 @@ static int rbd_do_op(struct request *rq,
 			     ops,
 			     coll, coll_index,
 			     rbd_req_cb, 0, NULL);
-
+	if (ret < 0)
+		rbd_coll_end_req_index(rq, coll, coll_index,
+					(s32)ret, seg_len);
 	rbd_destroy_ops(ops);
 done:
 	kfree(seg_name);
-- 
cgit v0.10.2


From b395e8b5b8f06399e3fe3ee016c9cf41ff665efc Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 8 Nov 2012 08:01:39 -0600
Subject: rbd: a little more cleanup of rbd_rq_fn()

Now that a big hunk in the middle of rbd_rq_fn() has been moved
into its own routine we can simplify it a little more.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 21468d0..9d49e5b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1644,53 +1644,51 @@ static int rbd_dev_do_request(struct request *rq,
 static void rbd_rq_fn(struct request_queue *q)
 {
 	struct rbd_device *rbd_dev = q->queuedata;
+	bool read_only = rbd_dev->mapping.read_only;
 	struct request *rq;
 
 	while ((rq = blk_fetch_request(q))) {
-		struct bio *bio;
-		bool do_write;
-		unsigned int size;
-		u64 ofs;
-		struct ceph_snap_context *snapc;
+		struct ceph_snap_context *snapc = NULL;
+		unsigned int size = 0;
 		int result;
 
 		dout("fetched request\n");
 
-		/* filter out block requests we don't understand */
+		/* Filter out block requests we don't understand */
+
 		if ((rq->cmd_type != REQ_TYPE_FS)) {
 			__blk_end_request_all(rq, 0);
 			continue;
 		}
+		spin_unlock_irq(q->queue_lock);
 
-		/* deduce our operation (read, write) */
-		do_write = (rq_data_dir(rq) == WRITE);
-		if (do_write && rbd_dev->mapping.read_only) {
-			__blk_end_request_all(rq, -EROFS);
-			continue;
-		}
+		/* Stop writes to a read-only device */
 
-		spin_unlock_irq(q->queue_lock);
+		result = -EROFS;
+		if (read_only && rq_data_dir(rq) == WRITE)
+			goto out_end_request;
+
+		/* Grab a reference to the snapshot context */
 
 		down_read(&rbd_dev->header_rwsem);
+		if (rbd_dev->exists) {
+			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+			rbd_assert(snapc != NULL);
+		}
+		up_read(&rbd_dev->header_rwsem);
 
-		if (!rbd_dev->exists) {
+		if (!snapc) {
 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
-			up_read(&rbd_dev->header_rwsem);
 			dout("request for non-existent snapshot");
-			spin_lock_irq(q->queue_lock);
-			__blk_end_request_all(rq, -ENXIO);
-			continue;
+			result = -ENXIO;
+			goto out_end_request;
 		}
 
-		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
-
-		up_read(&rbd_dev->header_rwsem);
-
 		size = blk_rq_bytes(rq);
-		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
-		bio = rq->bio;
-
-		result = rbd_dev_do_request(rq, rbd_dev, snapc, ofs, size, bio);
+		result = rbd_dev_do_request(rq, rbd_dev, snapc,
+				blk_rq_pos(rq) * SECTOR_SIZE,
+				size, rq->bio);
+out_end_request:
 		ceph_put_snap_context(snapc);
 		spin_lock_irq(q->queue_lock);
 		if (!size || result < 0)
-- 
cgit v0.10.2


From d78b650a595e23e5a115d332e3c37e019baf7703 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:15 -0600
Subject: rbd: make exists flag atomic

The rbd_device->exists field can be updated asynchronously, changing
from set to clear if a mapped snapshot disappears from the base
image's snapshot context.

Currently, value of the "exists" flag is only read and modified
under protection of the header semaphore, but that will change with
the next patch.  Making it atomic ensures this won't be a problem
because the a the non-existence of device will be immediately known.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9d49e5b..2846536 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -229,7 +229,7 @@ struct rbd_device {
 	spinlock_t		lock;		/* queue lock */
 
 	struct rbd_image_header	header;
-	bool                    exists;
+	atomic_t		exists;
 	struct rbd_spec		*spec;
 
 	char			*header_name;
@@ -751,7 +751,7 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 			goto done;
 		rbd_dev->mapping.read_only = true;
 	}
-	rbd_dev->exists = true;
+	atomic_set(&rbd_dev->exists, 1);
 done:
 	return ret;
 }
@@ -1671,7 +1671,7 @@ static void rbd_rq_fn(struct request_queue *q)
 		/* Grab a reference to the snapshot context */
 
 		down_read(&rbd_dev->header_rwsem);
-		if (rbd_dev->exists) {
+		if (atomic_read(&rbd_dev->exists)) {
 			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
 			rbd_assert(snapc != NULL);
 		}
@@ -2294,6 +2294,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 		return NULL;
 
 	spin_lock_init(&rbd_dev->lock);
+	atomic_set(&rbd_dev->exists, 0);
 	INIT_LIST_HEAD(&rbd_dev->node);
 	INIT_LIST_HEAD(&rbd_dev->snaps);
 	init_rwsem(&rbd_dev->header_rwsem);
@@ -2918,7 +2919,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 			/* Existing snapshot not in the new snap context */
 
 			if (rbd_dev->spec->snap_id == snap->id)
-				rbd_dev->exists = false;
+				atomic_set(&rbd_dev->exists, 0);
 			rbd_remove_snap_dev(snap);
 			dout("%ssnap id %llu has been removed\n",
 				rbd_dev->spec->snap_id == snap->id ?
-- 
cgit v0.10.2


From a7b4c65f4f15aa657b09d13da8f45ba0b72ec0df Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:15 -0600
Subject: rbd: only get snap context for write requests

Right now we get the snapshot context for an rbd image (under
protection of the header semaphore) for every request processed.

There's no need to get the snap context if we're doing a read,
so avoid doing so in that case.

Note that we no longer need to hold the header semaphore to
check the rbd_dev's existence flag.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 2846536..0fbe9ad 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1331,7 +1331,7 @@ static int rbd_do_op(struct request *rq,
 	} else {
 		opcode = CEPH_OSD_OP_READ;
 		flags = CEPH_OSD_FLAG_READ;
-		snapc = NULL;
+		rbd_assert(!snapc);
 		snapid = rbd_dev->spec->snap_id;
 		payload_len = 0;
 	}
@@ -1662,22 +1662,25 @@ static void rbd_rq_fn(struct request_queue *q)
 		}
 		spin_unlock_irq(q->queue_lock);
 
-		/* Stop writes to a read-only device */
-
-		result = -EROFS;
-		if (read_only && rq_data_dir(rq) == WRITE)
-			goto out_end_request;
-
-		/* Grab a reference to the snapshot context */
-
-		down_read(&rbd_dev->header_rwsem);
-		if (atomic_read(&rbd_dev->exists)) {
+		/* Write requests need a reference to the snapshot context */
+
+		if (rq_data_dir(rq) == WRITE) {
+			result = -EROFS;
+			if (read_only) /* Can't write to a read-only device */
+				goto out_end_request;
+
+			/*
+			 * Note that each osd request will take its
+			 * own reference to the snapshot context
+			 * supplied.  The reference we take here
+			 * just guarantees the one we provide stays
+			 * valid.
+			 */
+			down_read(&rbd_dev->header_rwsem);
 			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+			up_read(&rbd_dev->header_rwsem);
 			rbd_assert(snapc != NULL);
-		}
-		up_read(&rbd_dev->header_rwsem);
-
-		if (!snapc) {
+		} else if (!atomic_read(&rbd_dev->exists)) {
 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
 			dout("request for non-existent snapshot");
 			result = -ENXIO;
@@ -1689,7 +1692,8 @@ static void rbd_rq_fn(struct request_queue *q)
 				blk_rq_pos(rq) * SECTOR_SIZE,
 				size, rq->bio);
 out_end_request:
-		ceph_put_snap_context(snapc);
+		if (snapc)
+			ceph_put_snap_context(snapc);
 		spin_lock_irq(q->queue_lock);
 		if (!size || result < 0)
 			__blk_end_request_all(rq, result);
-- 
cgit v0.10.2


From 0ec8ce87f3bb5e4a561190f5320934e003405b6f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:15 -0600
Subject: rbd: separate layout init

Pull a block of code that initializes the layout structure in an osd
request into its own function so it can be reused.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0fbe9ad..d4e93a2 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -54,6 +54,7 @@
 
 /* It might be useful to have this defined elsewhere too */
 
+#define	U32_MAX	((u32) (~0U))
 #define	U64_MAX	((u64) (~0ULL))
 
 #define RBD_DRV_NAME "rbd"
@@ -1096,6 +1097,16 @@ static void rbd_coll_end_req(struct rbd_request *rbd_req,
 				ret, len);
 }
 
+static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id)
+{
+	memset(layout, 0, sizeof (*layout));
+	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	layout->fl_stripe_count = cpu_to_le32(1);
+	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	rbd_assert(pool_id <= (u64) U32_MAX);
+	layout->fl_pg_pool = cpu_to_le32((u32) pool_id);
+}
+
 /*
  * Send ceph osd request
  */
@@ -1117,7 +1128,6 @@ static int rbd_do_request(struct request *rq,
 			  u64 *ver)
 {
 	struct ceph_osd_request *osd_req;
-	struct ceph_file_layout *layout;
 	int ret;
 	u64 bno;
 	struct timespec mtime = CURRENT_TIME;
@@ -1161,14 +1171,9 @@ static int rbd_do_request(struct request *rq,
 	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
-	layout = &osd_req->r_file_layout;
-	memset(layout, 0, sizeof(*layout));
-	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_stripe_count = cpu_to_le32(1);
-	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
-	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
-				   osd_req, ops);
+	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
+	ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout,
+				snapid, ofs, &len, &bno, osd_req, ops);
 	rbd_assert(ret == 0);
 
 	ceph_osdc_build_request(osd_req, ofs, &len,
-- 
cgit v0.10.2


From af77f26caa35a95af09d1dac5c513b3901de7e37 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:15 -0600
Subject: rbd: drop oid parameters from ceph_osdc_build_request()

The last two parameters to ceph_osd_build_request() describe the
object id, but the values passed always come from the osd request
structure whose address is also provided.  Get rid of those last
two parameters.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d4e93a2..9a701ef 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1176,11 +1176,7 @@ static int rbd_do_request(struct request *rq,
 				snapid, ofs, &len, &bno, osd_req, ops);
 	rbd_assert(ret == 0);
 
-	ceph_osdc_build_request(osd_req, ofs, &len,
-				ops,
-				snapc,
-				&mtime,
-				osd_req->r_oid, osd_req->r_oid_len);
+	ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime);
 
 	if (linger_req) {
 		ceph_osdc_set_request_linger(osdc, osd_req);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index d9b880e..f2e5d2c 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -227,9 +227,7 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req,
 				    u64 off, u64 *plen,
 				    struct ceph_osd_req_op *src_ops,
 				    struct ceph_snap_context *snapc,
-				    struct timespec *mtime,
-				    const char *oid,
-				    int oid_len);
+				    struct timespec *mtime);
 
 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 				      struct ceph_file_layout *layout,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index eade41b..7d38327 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -376,9 +376,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 			     u64 off, u64 *plen,
 			     struct ceph_osd_req_op *src_ops,
 			     struct ceph_snap_context *snapc,
-			     struct timespec *mtime,
-			     const char *oid,
-			     int oid_len)
+			     struct timespec *mtime)
 {
 	struct ceph_msg *msg = req->r_request;
 	struct ceph_osd_request_head *head;
@@ -405,9 +403,9 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 
 
 	/* fill in oid */
-	head->object_len = cpu_to_le32(oid_len);
-	memcpy(p, oid, oid_len);
-	p += oid_len;
+	head->object_len = cpu_to_le32(req->r_oid_len);
+	memcpy(p, req->r_oid, req->r_oid_len);
+	p += req->r_oid_len;
 
 	src_op = src_ops;
 	while (src_op->op) {
@@ -506,8 +504,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 
 	ceph_osdc_build_request(req, off, plen, ops,
 				snapc,
-				mtime,
-				req->r_oid, req->r_oid_len);
+				mtime);
 
 	return req;
 }
-- 
cgit v0.10.2


From 4775618d9255c0c135580bbee8bee6815f8194cf Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:15 -0600
Subject: rbd: drop snapid parameter from rbd_req_sync_read()

There is only one caller of rbd_req_sync_read(), and it passes
CEPH_NOSNAP as the snapshot id argument.  Delete that parameter
and just use CEPH_NOSNAP within the function.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9a701ef..07e0644 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1368,7 +1368,6 @@ done:
  * Request sync osd read
  */
 static int rbd_req_sync_read(struct rbd_device *rbd_dev,
-			  u64 snapid,
 			  const char *object_name,
 			  u64 ofs, u64 len,
 			  char *buf,
@@ -1382,7 +1381,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 		return -ENOMEM;
 
 	ret = rbd_req_sync_op(rbd_dev, NULL,
-			       snapid,
+			       CEPH_NOSNAP,
 			       CEPH_OSD_FLAG_READ,
 			       ops, object_name, ofs, len, buf, NULL, ver);
 	rbd_destroy_ops(ops);
@@ -1799,8 +1798,7 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
 		if (!ondisk)
 			return ERR_PTR(-ENOMEM);
 
-		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
-				       rbd_dev->header_name,
+		ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name,
 				       0, size,
 				       (char *) ondisk, version);
 
-- 
cgit v0.10.2


From 07b2391fbbcefdecbc2f16321f8e454802e0b926 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:16 -0600
Subject: rbd: drop flags parameter from rbd_req_sync_exec()

All callers of rbd_req_sync_exec() pass CEPH_OSD_FLAG_READ as their
flags argument.  Delete that parameter and use CEPH_OSD_FLAG_READ
within the function.  If we find a need to support write operations
we can add it back again.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 07e0644..5d48bcd 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1524,7 +1524,6 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 			     size_t outbound_size,
 			     char *inbound,
 			     size_t inbound_size,
-			     int flags,
 			     u64 *ver)
 {
 	struct ceph_osd_req_op *ops;
@@ -1555,8 +1554,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	ops[0].cls.indata_len = outbound_size;
 
 	ret = rbd_req_sync_op(rbd_dev, NULL,
-			       CEPH_NOSNAP,
-			       flags, ops,
+			       CEPH_NOSNAP, CEPH_OSD_FLAG_READ, ops,
 			       object_name, 0, inbound_size, inbound,
 			       NULL, ver);
 
@@ -2418,8 +2416,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_size",
 				(char *) &snapid, sizeof (snapid),
-				(char *) &size_buf, sizeof (size_buf),
-				CEPH_OSD_FLAG_READ, NULL);
+				(char *) &size_buf, sizeof (size_buf), NULL);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		return ret;
@@ -2454,8 +2451,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_object_prefix",
 				NULL, 0,
-				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
-				CEPH_OSD_FLAG_READ, NULL);
+				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
@@ -2494,7 +2490,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 				"rbd", "get_features",
 				(char *) &snapid, sizeof (snapid),
 				(char *) &features_buf, sizeof (features_buf),
-				CEPH_OSD_FLAG_READ, NULL);
+				NULL);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		return ret;
@@ -2549,8 +2545,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_parent",
 				(char *) &snapid, sizeof (snapid),
-				(char *) reply_buf, size,
-				CEPH_OSD_FLAG_READ, NULL);
+				(char *) reply_buf, size, NULL);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out_err;
@@ -2615,8 +2610,7 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
 	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
 				"rbd", "dir_get_name",
 				image_id, image_id_size,
-				(char *) reply_buf, size,
-				CEPH_OSD_FLAG_READ, NULL);
+				(char *) reply_buf, size, NULL);
 	if (ret < 0)
 		goto out;
 	p = reply_buf;
@@ -2722,8 +2716,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_snapcontext",
 				NULL, 0,
-				reply_buf, size,
-				CEPH_OSD_FLAG_READ, ver);
+				reply_buf, size, ver);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
@@ -2792,8 +2785,7 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_snapshot_name",
 				(char *) &snap_id, sizeof (snap_id),
-				reply_buf, size,
-				CEPH_OSD_FLAG_READ, NULL);
+				reply_buf, size, NULL);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
@@ -3401,8 +3393,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 	ret = rbd_req_sync_exec(rbd_dev, object_name,
 				"rbd", "get_id",
 				NULL, 0,
-				response, RBD_IMAGE_ID_LEN_MAX,
-				CEPH_OSD_FLAG_READ, NULL);
+				response, RBD_IMAGE_ID_LEN_MAX, NULL);
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
-- 
cgit v0.10.2


From 25704ac9de30ac3e73c123e7b2734f7ca744c8d8 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 08:43:16 -0600
Subject: rbd: kill rbd_req_sync_op() snapc and snapid parameters

The snapc and snapid parameters to rbd_req_sync_op() always take
the values NULL and CEPH_NOSNAP, respectively.  So just get rid
of them and use those values where needed.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 5d48bcd..85c5852 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1257,8 +1257,6 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
  * Do a synchronous ceph osd operation
  */
 static int rbd_req_sync_op(struct rbd_device *rbd_dev,
-			   struct ceph_snap_context *snapc,
-			   u64 snapid,
 			   int flags,
 			   struct ceph_osd_req_op *ops,
 			   const char *object_name,
@@ -1278,7 +1276,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
-	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
+	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
 			  object_name, ofs, inbound_size, NULL,
 			  pages, num_pages,
 			  flags,
@@ -1380,9 +1378,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 	if (!ops)
 		return -ENOMEM;
 
-	ret = rbd_req_sync_op(rbd_dev, NULL,
-			       CEPH_NOSNAP,
-			       CEPH_OSD_FLAG_READ,
+	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
 			       ops, object_name, ofs, len, buf, NULL, ver);
 	rbd_destroy_ops(ops);
 
@@ -1461,8 +1457,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
 	ops[0].watch.flag = 1;
 
-	ret = rbd_req_sync_op(rbd_dev, NULL,
-			      CEPH_NOSNAP,
+	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
 			      rbd_dev->header_name,
@@ -1499,8 +1494,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
 	ops[0].watch.flag = 0;
 
-	ret = rbd_req_sync_op(rbd_dev, NULL,
-			      CEPH_NOSNAP,
+	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
 			      rbd_dev->header_name,
@@ -1553,8 +1547,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	ops[0].cls.indata = outbound;
 	ops[0].cls.indata_len = outbound_size;
 
-	ret = rbd_req_sync_op(rbd_dev, NULL,
-			       CEPH_NOSNAP, CEPH_OSD_FLAG_READ, ops,
+	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, ops,
 			       object_name, 0, inbound_size, inbound,
 			       NULL, ver);
 
-- 
cgit v0.10.2


From 7c3d22cf16f1bbcb37a73e88338c042bb49ff112 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 9 Nov 2012 12:50:10 -0600
Subject: rbd: don't bother setting snapid in rbd_do_request()

For some reason, the snapid field of the osd request header is
explicitly set to CEPH_NOSNAP in rbd_do_request().  Just a few lines
later--with no code that would access this field in between--a call
is made to ceph_calc_raw_layout() passing the snapid provided to
rbd_do_request(), which encodes the snapid value it is provided into
that field instead.

In other words, there is no need to fill in CEPH_NOSNAP, and doing
so suggests it might be necessary.  Don't do that any more.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 85c5852..54bd9fc 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1132,7 +1132,6 @@ static int rbd_do_request(struct request *rq,
 	u64 bno;
 	struct timespec mtime = CURRENT_TIME;
 	struct rbd_request *rbd_req;
-	struct ceph_osd_request_head *reqhead;
 	struct ceph_osd_client *osdc;
 
 	rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO);
@@ -1165,9 +1164,6 @@ static int rbd_do_request(struct request *rq,
 
 	osd_req->r_priv = rbd_req;
 
-	reqhead = osd_req->r_request->front.iov_base;
-	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
-
 	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
-- 
cgit v0.10.2


From c885837f7d4f8c4f5cb2a744cc6929bc078e9dc0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: libceph: always allow trail in osd request

An osd request structure contains an optional trail portion, which
if present will contain data to be passed in the payload portion of
the message containing the request.  The trail field is a
ceph_pagelist pointer, and if null it indicates there is no trail.

A ceph_pagelist structure contains a length field, and it can
legitimately hold value 0.  Make use of this to change the
interpretation of the "trail" of an osd request so that every osd
request has trailing data, it just might have length 0.

This means we change the r_trail field in a ceph_osd_request
structure from a pointer to a structure that is always initialized.

Note that in ceph_osdc_start_request(), the trail pointer (or now
address of that structure) is assigned to a ceph message's trail
field.  Here's why that's still OK (looking at net/ceph/messenger.c):
    - What would have resulted in a null pointer previously will now
      refer to a 0-length page list.  That message trail pointer
      is used in two functions, write_partial_msg_pages() and
      out_msg_pos_next().
    - In write_partial_msg_pages(), a null page list pointer is
      handled the same as a message with 0-length trail, and both
      result in a "in_trail" variable set to false.  The trail
      pointer is only used if in_trail is true.
    - The only other place the message trail pointer is used is
      out_msg_pos_next().  That function is only called by
      write_partial_msg_pages() and only touches the trail pointer
      if the in_trail value it is passed is true.
Therefore a null ceph_msg->trail pointer is equivalent to a non-null
pointer referring to a 0-length page list structure.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index f2e5d2c..61562c7 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -10,6 +10,7 @@
 #include <linux/ceph/osdmap.h>
 #include <linux/ceph/messenger.h>
 #include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
 
 /* 
  * Maximum object name size 
@@ -22,7 +23,6 @@ struct ceph_snap_context;
 struct ceph_osd_request;
 struct ceph_osd_client;
 struct ceph_authorizer;
-struct ceph_pagelist;
 
 /*
  * completion callback for async writepages
@@ -95,7 +95,7 @@ struct ceph_osd_request {
 	struct bio       *r_bio;	      /* instead of pages */
 #endif
 
-	struct ceph_pagelist *r_trail;	      /* trailing part of the data */
+	struct ceph_pagelist r_trail;	      /* trailing part of the data */
 };
 
 struct ceph_osd_event {
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 7d38327..2be50d8 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -171,10 +171,7 @@ void ceph_osdc_release_request(struct kref *kref)
 		bio_put(req->r_bio);
 #endif
 	ceph_put_snap_context(req->r_snapc);
-	if (req->r_trail) {
-		ceph_pagelist_release(req->r_trail);
-		kfree(req->r_trail);
-	}
+	ceph_pagelist_release(&req->r_trail);
 	if (req->r_mempool)
 		mempool_free(req, req->r_osdc->req_mempool);
 	else
@@ -208,8 +205,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
-	int needs_trail;
-	int num_op = get_num_ops(ops, &needs_trail);
+	int num_op = get_num_ops(ops, NULL);
 	size_t msg_size = sizeof(struct ceph_osd_request_head);
 
 	msg_size += num_op*sizeof(struct ceph_osd_op);
@@ -252,15 +248,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	}
 	req->r_reply = msg;
 
-	/* allocate space for the trailing data */
-	if (needs_trail) {
-		req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
-		if (!req->r_trail) {
-			ceph_osdc_put_request(req);
-			return NULL;
-		}
-		ceph_pagelist_init(req->r_trail);
-	}
+	ceph_pagelist_init(&req->r_trail);
 
 	/* create request message; allow space for oid */
 	msg_size += MAX_OBJ_NAME_SIZE;
@@ -312,29 +300,25 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 	case CEPH_OSD_OP_GETXATTR:
 	case CEPH_OSD_OP_SETXATTR:
 	case CEPH_OSD_OP_CMPXATTR:
-		BUG_ON(!req->r_trail);
-
 		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
 		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
 		dst->xattr.cmp_op = src->xattr.cmp_op;
 		dst->xattr.cmp_mode = src->xattr.cmp_mode;
-		ceph_pagelist_append(req->r_trail, src->xattr.name,
+		ceph_pagelist_append(&req->r_trail, src->xattr.name,
 				     src->xattr.name_len);
-		ceph_pagelist_append(req->r_trail, src->xattr.val,
+		ceph_pagelist_append(&req->r_trail, src->xattr.val,
 				     src->xattr.value_len);
 		break;
 	case CEPH_OSD_OP_CALL:
-		BUG_ON(!req->r_trail);
-
 		dst->cls.class_len = src->cls.class_len;
 		dst->cls.method_len = src->cls.method_len;
 		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
 
-		ceph_pagelist_append(req->r_trail, src->cls.class_name,
+		ceph_pagelist_append(&req->r_trail, src->cls.class_name,
 				     src->cls.class_len);
-		ceph_pagelist_append(req->r_trail, src->cls.method_name,
+		ceph_pagelist_append(&req->r_trail, src->cls.method_name,
 				     src->cls.method_len);
-		ceph_pagelist_append(req->r_trail, src->cls.indata,
+		ceph_pagelist_append(&req->r_trail, src->cls.indata,
 				     src->cls.indata_len);
 		break;
 	case CEPH_OSD_OP_ROLLBACK:
@@ -347,11 +331,9 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 			__le32 prot_ver = cpu_to_le32(src->watch.prot_ver);
 			__le32 timeout = cpu_to_le32(src->watch.timeout);
 
-			BUG_ON(!req->r_trail);
-
-			ceph_pagelist_append(req->r_trail,
+			ceph_pagelist_append(&req->r_trail,
 						&prot_ver, sizeof(prot_ver));
-			ceph_pagelist_append(req->r_trail,
+			ceph_pagelist_append(&req->r_trail,
 						&timeout, sizeof(timeout));
 		}
 	case CEPH_OSD_OP_NOTIFY_ACK:
@@ -414,8 +396,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 		op++;
 	}
 
-	if (req->r_trail)
-		data_len += req->r_trail->length;
+	data_len += req->r_trail.length;
 
 	if (snapc) {
 		head->snap_seq = cpu_to_le64(snapc->seq);
@@ -1715,7 +1696,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 #ifdef CONFIG_BLOCK
 	req->r_request->bio = req->r_bio;
 #endif
-	req->r_request->trail = req->r_trail;
+	req->r_request->trail = &req->r_trail;
 
 	register_request(osdc, req);
 
-- 
cgit v0.10.2


From 5b9d1b1cd46aa6c8abf891f25c15aee31538da7e Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: libceph: kill op_needs_trail()

Since every osd message is now prepared to include trailing data,
there's no need to check ahead of time whether any operations will
make use of the trail portion of the message.

We can drop the second argument to get_num_ops(), and as a result we
can also get rid of op_needs_trail() which is no longer used.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2be50d8..37d43d5 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -32,20 +32,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
 static void __send_request(struct ceph_osd_client *osdc,
 			   struct ceph_osd_request *req);
 
-static int op_needs_trail(int op)
-{
-	switch (op) {
-	case CEPH_OSD_OP_GETXATTR:
-	case CEPH_OSD_OP_SETXATTR:
-	case CEPH_OSD_OP_CMPXATTR:
-	case CEPH_OSD_OP_CALL:
-	case CEPH_OSD_OP_NOTIFY:
-		return 1;
-	default:
-		return 0;
-	}
-}
-
 static int op_has_extent(int op)
 {
 	return (op == CEPH_OSD_OP_READ ||
@@ -179,17 +165,12 @@ void ceph_osdc_release_request(struct kref *kref)
 }
 EXPORT_SYMBOL(ceph_osdc_release_request);
 
-static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
+static int get_num_ops(struct ceph_osd_req_op *ops)
 {
 	int i = 0;
 
-	if (needs_trail)
-		*needs_trail = 0;
-	while (ops[i].op) {
-		if (needs_trail && op_needs_trail(ops[i].op))
-			*needs_trail = 1;
+	while (ops[i].op)
 		i++;
-	}
 
 	return i;
 }
@@ -205,7 +186,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
-	int num_op = get_num_ops(ops, NULL);
+	int num_op = get_num_ops(ops);
 	size_t msg_size = sizeof(struct ceph_osd_request_head);
 
 	msg_size += num_op*sizeof(struct ceph_osd_op);
@@ -365,7 +346,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	struct ceph_osd_req_op *src_op;
 	struct ceph_osd_op *op;
 	void *p;
-	int num_op = get_num_ops(src_ops, NULL);
+	int num_op = get_num_ops(src_ops);
 	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
 	int flags = req->r_flags;
 	u64 data_len = 0;
-- 
cgit v0.10.2


From 0120be3c60d46d6d55f4bf7a3d654cc705eb0c54 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 09:38:19 -0600
Subject: libceph: pass length to ceph_osdc_build_request()

The len argument to ceph_osdc_build_request() is set up to be
passed by address, but that function never updates its value
so there's no need to do this.  Tighten up the interface by
passing the length directly.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 54bd9fc..c1b135b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1172,7 +1172,7 @@ static int rbd_do_request(struct request *rq,
 				snapid, ofs, &len, &bno, osd_req, ops);
 	rbd_assert(ret == 0);
 
-	ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime);
+	ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, &mtime);
 
 	if (linger_req) {
 		ceph_osdc_set_request_linger(osdc, osd_req);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 61562c7..4bfb458 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -224,7 +224,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
 					       struct bio *bio);
 
 extern void ceph_osdc_build_request(struct ceph_osd_request *req,
-				    u64 off, u64 *plen,
+				    u64 off, u64 len,
 				    struct ceph_osd_req_op *src_ops,
 				    struct ceph_snap_context *snapc,
 				    struct timespec *mtime);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 37d43d5..e29a3ed 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -336,7 +336,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
  *
  */
 void ceph_osdc_build_request(struct ceph_osd_request *req,
-			     u64 off, u64 *plen,
+			     u64 off, u64 len,
 			     struct ceph_osd_req_op *src_ops,
 			     struct ceph_snap_context *snapc,
 			     struct timespec *mtime)
@@ -390,7 +390,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 
 	if (flags & CEPH_OSD_FLAG_WRITE) {
 		req->r_request->hdr.data_off = cpu_to_le16(off);
-		req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
+		req->r_request->hdr.data_len = cpu_to_le32(len + data_len);
 	} else if (data_len) {
 		req->r_request->hdr.data_off = 0;
 		req->r_request->hdr.data_len = cpu_to_le32(data_len);
@@ -464,7 +464,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	req->r_num_pages = calc_pages_for(page_align, *plen);
 	req->r_page_alignment = page_align;
 
-	ceph_osdc_build_request(req, off, plen, ops,
+	ceph_osdc_build_request(req, off, *plen, ops,
 				snapc,
 				mtime);
 
-- 
cgit v0.10.2


From e8afad656cbcd06d02a7bacd4b318fa0e2907de0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 09:38:19 -0600
Subject: libceph: pass length to ceph_calc_file_object_mapping()

ceph_calc_file_object_mapping() takes (among other things) a "file"
offset and length, and based on the layout, determines the object
number ("bno") backing the affected portion of the file's data and
the offset into that object where the desired range begins.  It also
computes the size that should be used for the request--either the
amount requested or something less if that would exceed the end of
the object.

This patch changes the input length parameter in this function so it
is used only for input.  That is, the argument will be passed by
value rather than by address, so the value provided won't get
updated by the function.

The value would only get updated if the length would surpass the
current object, and in that case the value it got updated to would
be exactly that returned in *oxlen.

Only one of the two callers is affected by this change.  Update
ceph_calc_raw_layout() so it records any updated value.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 36549a4..3b22150 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -194,7 +194,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 		return -EFAULT;
 
 	down_read(&osdc->map_sem);
-	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
 					  &dl.object_no, &dl.object_offset,
 					  &olen);
 	if (r < 0)
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 5ea57ba..1f653e2 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -110,7 +110,7 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
 
 /* calculate mapping of a file extent to an object */
 extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-					 u64 off, u64 *plen,
+					 u64 off, u64 len,
 					 u64 *bno, u64 *oxoff, u64 *oxlen);
 
 /* calculate mapping of object to a placement group */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index e29a3ed..47e5f5b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -53,13 +53,15 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 	reqhead->snapid = cpu_to_le64(snapid);
 
 	/* object extent? */
-	r = ceph_calc_file_object_mapping(layout, off, plen, bno,
+	r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
 					  &objoff, &objlen);
 	if (r < 0)
 		return r;
-	if (*plen < orig_len)
+	if (objlen < orig_len) {
+		*plen = objlen;
 		dout(" skipping last %llu, final file extent %llu~%llu\n",
 		     orig_len - *plen, off, *plen);
+	}
 
 	if (op_has_extent(op->op)) {
 		u32 osize = le32_to_cpu(layout->fl_object_size);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index ca05871..369f03b 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1016,7 +1016,7 @@ bad:
  * pass a stride back to the caller.
  */
 int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-				   u64 off, u64 *plen,
+				   u64 off, u64 len,
 				   u64 *ono,
 				   u64 *oxoff, u64 *oxlen)
 {
@@ -1027,7 +1027,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 	u32 su_per_object;
 	u64 t, su_offset;
 
-	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
+	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, len,
 	     osize, su);
 	if (su == 0 || sc == 0)
 		goto invalid;
@@ -1060,11 +1060,10 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 
 	/*
 	 * Calculate the length of the extent being written to the selected
-	 * object. This is the minimum of the full length requested (plen) or
+	 * object. This is the minimum of the full length requested (len) or
 	 * the remainder of the current stripe being written to.
 	 */
-	*oxlen = min_t(u64, *plen, su - su_offset);
-	*plen = *oxlen;
+	*oxlen = min_t(u64, len, su - su_offset);
 
 	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
 	return 0;
-- 
cgit v0.10.2


From 4d6b250bf18d44571d69a0f4afec4b6a1969729f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: libceph: drop snapid in ceph_calc_raw_layout()

A snapshot id must be provided to ceph_calc_raw_layout() even though
it is not needed at all for calculating the layout.

Where the snapshot id *is* needed is when building the request
message for an osd operation.

Drop the snapid parameter from ceph_calc_raw_layout() and pass
that value instead in ceph_osdc_build_request().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c1b135b..fa37186 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1169,10 +1169,10 @@ static int rbd_do_request(struct request *rq,
 
 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
 	ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout,
-				snapid, ofs, &len, &bno, osd_req, ops);
+				ofs, &len, &bno, osd_req, ops);
 	rbd_assert(ret == 0);
 
-	ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, &mtime);
+	ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime);
 
 	if (linger_req) {
 		ceph_osdc_set_request_linger(osdc, osd_req);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 4bfb458..0e82a0a 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -209,7 +209,6 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 
 extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 			struct ceph_file_layout *layout,
-			u64 snapid,
 			u64 off, u64 *plen, u64 *bno,
 			struct ceph_osd_request *req,
 			struct ceph_osd_req_op *op);
@@ -227,6 +226,7 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req,
 				    u64 off, u64 len,
 				    struct ceph_osd_req_op *src_ops,
 				    struct ceph_snap_context *snapc,
+				    u64 snap_id,
 				    struct timespec *mtime);
 
 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 47e5f5b..b5a4b28 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -40,18 +40,14 @@ static int op_has_extent(int op)
 
 int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 			struct ceph_file_layout *layout,
-			u64 snapid,
 			u64 off, u64 *plen, u64 *bno,
 			struct ceph_osd_request *req,
 			struct ceph_osd_req_op *op)
 {
-	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
 	u64 orig_len = *plen;
 	u64 objoff, objlen;    /* extent in object */
 	int r;
 
-	reqhead->snapid = cpu_to_le64(snapid);
-
 	/* object extent? */
 	r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
 					  &objoff, &objlen);
@@ -121,8 +117,7 @@ static int calc_layout(struct ceph_osd_client *osdc,
 	u64 bno;
 	int r;
 
-	r = ceph_calc_raw_layout(osdc, layout, vino.snap, off,
-				 plen, &bno, req, op);
+	r = ceph_calc_raw_layout(osdc, layout, off, plen, &bno, req, op);
 	if (r < 0)
 		return r;
 
@@ -340,7 +335,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 void ceph_osdc_build_request(struct ceph_osd_request *req,
 			     u64 off, u64 len,
 			     struct ceph_osd_req_op *src_ops,
-			     struct ceph_snap_context *snapc,
+			     struct ceph_snap_context *snapc, u64 snap_id,
 			     struct timespec *mtime)
 {
 	struct ceph_msg *msg = req->r_request;
@@ -355,6 +350,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	int i;
 
 	head = msg->front.iov_base;
+	head->snapid = cpu_to_le64(snap_id);
 	op = (void *)(head + 1);
 	p = (void *)(op + num_op);
 
@@ -466,9 +462,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	req->r_num_pages = calc_pages_for(page_align, *plen);
 	req->r_page_alignment = page_align;
 
-	ceph_osdc_build_request(req, off, *plen, ops,
-				snapc,
-				mtime);
+	ceph_osdc_build_request(req, off, *plen, ops, snapc, vino.snap, mtime);
 
 	return req;
 }
-- 
cgit v0.10.2


From e75b45cf36565fd8ba206a9d80f670a86e61ba2f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:14 -0600
Subject: libceph: drop osdc from ceph_calc_raw_layout()

The osdc parameter to ceph_calc_raw_layout() is not used, so get rid
of it.  Consequently, the corresponding parameter in calc_layout()
becomes unused, so get rid of that as well.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index fa37186..ac8fd38 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1168,7 +1168,7 @@ static int rbd_do_request(struct request *rq,
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
-	ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout,
+	ret = ceph_calc_raw_layout(&osd_req->r_file_layout,
 				ofs, &len, &bno, osd_req, ops);
 	rbd_assert(ret == 0);
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 0e82a0a..fe3a6e8 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -207,8 +207,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 				 struct ceph_msg *msg);
 
-extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
-			struct ceph_file_layout *layout,
+extern int ceph_calc_raw_layout(struct ceph_file_layout *layout,
 			u64 off, u64 *plen, u64 *bno,
 			struct ceph_osd_request *req,
 			struct ceph_osd_req_op *op);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b5a4b28..cd9c283 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -38,8 +38,7 @@ static int op_has_extent(int op)
 		op == CEPH_OSD_OP_WRITE);
 }
 
-int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
-			struct ceph_file_layout *layout,
+int ceph_calc_raw_layout(struct ceph_file_layout *layout,
 			u64 off, u64 *plen, u64 *bno,
 			struct ceph_osd_request *req,
 			struct ceph_osd_req_op *op)
@@ -107,8 +106,7 @@ EXPORT_SYMBOL(ceph_calc_raw_layout);
  *
  * fill osd op in request message.
  */
-static int calc_layout(struct ceph_osd_client *osdc,
-		       struct ceph_vino vino,
+static int calc_layout(struct ceph_vino vino,
 		       struct ceph_file_layout *layout,
 		       u64 off, u64 *plen,
 		       struct ceph_osd_request *req,
@@ -117,7 +115,7 @@ static int calc_layout(struct ceph_osd_client *osdc,
 	u64 bno;
 	int r;
 
-	r = ceph_calc_raw_layout(osdc, layout, off, plen, &bno, req, op);
+	r = ceph_calc_raw_layout(layout, off, plen, &bno, req, op);
 	if (r < 0)
 		return r;
 
@@ -452,7 +450,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		return ERR_PTR(-ENOMEM);
 
 	/* calculate max write size */
-	r = calc_layout(osdc, vino, layout, off, plen, req, ops);
+	r = calc_layout(vino, layout, off, plen, req, ops);
 	if (r < 0)
 		return ERR_PTR(r);
 	req->r_file_layout = *layout;  /* keep a copy */
-- 
cgit v0.10.2


From d178a9e74006e80f568d87e29f2a68f14fc7cbb1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: libceph: don't set flags in ceph_osdc_alloc_request()

The only thing ceph_osdc_alloc_request() really does with the
flags value it is passed is assign it to the newly-created
osd request structure.  Do that in the caller instead.

Both callers subsequently call ceph_osdc_build_request(), so have
that function (instead of ceph_osdc_alloc_request()) issue a warning
if a request comes through with neither the read nor write flags set.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ac8fd38..bdbaa4c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1148,13 +1148,14 @@ static int rbd_do_request(struct request *rq,
 		(unsigned long long) len, coll, coll_index);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
+	osd_req = ceph_osdc_alloc_request(osdc, snapc, ops,
 					false, GFP_NOIO, pages, bio);
 	if (!osd_req) {
 		ret = -ENOMEM;
 		goto done_pages;
 	}
 
+	osd_req->r_flags = flags;
 	osd_req->r_callback = rbd_cb;
 
 	rbd_req->rq = rq;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index fe3a6e8..6ddda5b 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -213,7 +213,6 @@ extern int ceph_calc_raw_layout(struct ceph_file_layout *layout,
 			struct ceph_osd_req_op *op);
 
 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
-					       int flags,
 					       struct ceph_snap_context *snapc,
 					       struct ceph_osd_req_op *ops,
 					       bool use_mempool,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index cd9c283..77ce1ed 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -171,7 +171,6 @@ static int get_num_ops(struct ceph_osd_req_op *ops)
 }
 
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
-					       int flags,
 					       struct ceph_snap_context *snapc,
 					       struct ceph_osd_req_op *ops,
 					       bool use_mempool,
@@ -208,10 +207,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	INIT_LIST_HEAD(&req->r_req_lru_item);
 	INIT_LIST_HEAD(&req->r_osd_item);
 
-	req->r_flags = flags;
-
-	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
-
 	/* create reply message */
 	if (use_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
@@ -347,6 +342,8 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	u64 data_len = 0;
 	int i;
 
+	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
 	head = msg->front.iov_base;
 	head->snapid = cpu_to_le64(snap_id);
 	op = (void *)(head + 1);
@@ -442,12 +439,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	} else
 		ops[1].op = 0;
 
-	req = ceph_osdc_alloc_request(osdc, flags,
-					 snapc, ops,
+	req = ceph_osdc_alloc_request(osdc, snapc, ops,
 					 use_mempool,
 					 GFP_NOFS, NULL, NULL);
 	if (!req)
 		return ERR_PTR(-ENOMEM);
+	req->r_flags = flags;
 
 	/* calculate max write size */
 	r = calc_layout(vino, layout, off, plen, req, ops);
-- 
cgit v0.10.2


From 54a5400721da7fa5a16cea151aade5bdfee74111 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: libceph: don't set pages or bio in ceph_osdc_alloc_request()

Only one of the two callers of ceph_osdc_alloc_request() provides
page or bio data for its payload.  And essentially all that function
was doing with those arguments was assigning them to fields in the
osd request structure.

Simplify ceph_osdc_alloc_request() by having the caller take care of
making those assignments

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bdbaa4c..d1445df 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1148,14 +1148,18 @@ static int rbd_do_request(struct request *rq,
 		(unsigned long long) len, coll, coll_index);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	osd_req = ceph_osdc_alloc_request(osdc, snapc, ops,
-					false, GFP_NOIO, pages, bio);
+	osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO);
 	if (!osd_req) {
 		ret = -ENOMEM;
 		goto done_pages;
 	}
 
 	osd_req->r_flags = flags;
+	osd_req->r_pages = pages;
+	if (bio) {
+		osd_req->r_bio = bio;
+		bio_get(osd_req->r_bio);
+	}
 	osd_req->r_callback = rbd_cb;
 
 	rbd_req->rq = rq;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 6ddda5b..75f56d3 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -216,9 +216,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
 					       struct ceph_snap_context *snapc,
 					       struct ceph_osd_req_op *ops,
 					       bool use_mempool,
-					       gfp_t gfp_flags,
-					       struct page **pages,
-					       struct bio *bio);
+					       gfp_t gfp_flags);
 
 extern void ceph_osdc_build_request(struct ceph_osd_request *req,
 				    u64 off, u64 len,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 77ce1ed..bdc3bb1 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -174,9 +174,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
 					       struct ceph_osd_req_op *ops,
 					       bool use_mempool,
-					       gfp_t gfp_flags,
-					       struct page **pages,
-					       struct bio *bio)
+					       gfp_t gfp_flags)
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
@@ -237,13 +235,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	memset(msg->front.iov_base, 0, msg->front.iov_len);
 
 	req->r_request = msg;
-	req->r_pages = pages;
-#ifdef CONFIG_BLOCK
-	if (bio) {
-		req->r_bio = bio;
-		bio_get(req->r_bio);
-	}
-#endif
 
 	return req;
 }
@@ -439,9 +430,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	} else
 		ops[1].op = 0;
 
-	req = ceph_osdc_alloc_request(osdc, snapc, ops,
-					 use_mempool,
-					 GFP_NOFS, NULL, NULL);
+	req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS);
 	if (!req)
 		return ERR_PTR(-ENOMEM);
 	req->r_flags = flags;
-- 
cgit v0.10.2


From d07c09589f533db9ab500ac38151bc9f3a4d0648 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: rbd: pass num_op with ops array

Add a num_op parameter to rbd_do_request() and rbd_req_sync_op() to
indicate the number of entries in the array.  The callers of these
functions always know how many entries are in the array, so just
pass that information down.

This is in anticipation of eliminating the extra zero-filled entry
in these ops arrays.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d1445df..b6872d3 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1119,6 +1119,7 @@ static int rbd_do_request(struct request *rq,
 			  struct page **pages,
 			  int num_pages,
 			  int flags,
+			  unsigned int num_op,
 			  struct ceph_osd_req_op *ops,
 			  struct rbd_req_coll *coll,
 			  int coll_index,
@@ -1259,6 +1260,7 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
  */
 static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			   int flags,
+			   unsigned int num_op,
 			   struct ceph_osd_req_op *ops,
 			   const char *object_name,
 			   u64 ofs, u64 inbound_size,
@@ -1281,7 +1283,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			  object_name, ofs, inbound_size, NULL,
 			  pages, num_pages,
 			  flags,
-			  ops,
+			  num_op, ops,
 			  NULL, 0,
 			  NULL,
 			  linger_req, ver);
@@ -1351,7 +1353,7 @@ static int rbd_do_op(struct request *rq,
 			     bio,
 			     NULL, 0,
 			     flags,
-			     ops,
+			     1, ops,
 			     coll, coll_index,
 			     rbd_req_cb, 0, NULL);
 	if (ret < 0)
@@ -1380,7 +1382,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 		return -ENOMEM;
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-			       ops, object_name, ofs, len, buf, NULL, ver);
+			       1, ops, object_name, ofs, len, buf, NULL, ver);
 	rbd_destroy_ops(ops);
 
 	return ret;
@@ -1408,7 +1410,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 			  rbd_dev->header_name, 0, 0, NULL,
 			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
-			  ops,
+			  1, ops,
 			  NULL, 0,
 			  rbd_simple_req_cb, 0, NULL);
 
@@ -1460,7 +1462,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      ops,
+			      1, ops,
 			      rbd_dev->header_name,
 			      0, 0, NULL,
 			      &rbd_dev->watch_request, NULL);
@@ -1497,7 +1499,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      ops,
+			      1, ops,
 			      rbd_dev->header_name,
 			      0, 0, NULL, NULL, NULL);
 
@@ -1548,7 +1550,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	ops[0].cls.indata = outbound;
 	ops[0].cls.indata_len = outbound_size;
 
-	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, ops,
+	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, ops,
 			       object_name, 0, inbound_size, inbound,
 			       NULL, ver);
 
-- 
cgit v0.10.2


From ae7ca4a35b1f5df86e2c32b2cfc01a8d528c7b8c Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: libceph: pass num_op with ops

Both ceph_osdc_alloc_request() and ceph_osdc_build_request() are
provided an array of ceph osd request operations.  Rather than just
passing the number of operations in the array, the caller is
required append an additional zeroed operation structure to signal
the end of the array.

All callers know the number of operations at the time these
functions are called, so drop the silly zero entry and supply that
number directly.  As a result, get_num_ops() is no longer needed.
This also means that ceph_osdc_alloc_request() never uses its ops
argument, so that can be dropped.

Also rbd_create_rw_ops() no longer needs to add one to reserve room
for the additional op.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b6872d3..88de8cc 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1026,12 +1026,12 @@ out_err:
 /*
  * helpers for osd request op vectors.
  */
-static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
+static struct ceph_osd_req_op *rbd_create_rw_ops(int num_op,
 					int opcode, u32 payload_len)
 {
 	struct ceph_osd_req_op *ops;
 
-	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
+	ops = kzalloc(num_op * sizeof (*ops), GFP_NOIO);
 	if (!ops)
 		return NULL;
 
@@ -1149,7 +1149,7 @@ static int rbd_do_request(struct request *rq,
 		(unsigned long long) len, coll, coll_index);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO);
+	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO);
 	if (!osd_req) {
 		ret = -ENOMEM;
 		goto done_pages;
@@ -1178,7 +1178,8 @@ static int rbd_do_request(struct request *rq,
 				ofs, &len, &bno, osd_req, ops);
 	rbd_assert(ret == 0);
 
-	ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime);
+	ceph_osdc_build_request(osd_req, ofs, len, num_op, ops,
+				snapc, snapid, &mtime);
 
 	if (linger_req) {
 		ceph_osdc_set_request_linger(osdc, osd_req);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 75f56d3..2b04d05 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -214,12 +214,13 @@ extern int ceph_calc_raw_layout(struct ceph_file_layout *layout,
 
 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
-					       struct ceph_osd_req_op *ops,
+					       unsigned int num_op,
 					       bool use_mempool,
 					       gfp_t gfp_flags);
 
 extern void ceph_osdc_build_request(struct ceph_osd_request *req,
 				    u64 off, u64 len,
+				    unsigned int num_op,
 				    struct ceph_osd_req_op *src_ops,
 				    struct ceph_snap_context *snapc,
 				    u64 snap_id,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index bdc3bb1..500ae8b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -160,25 +160,14 @@ void ceph_osdc_release_request(struct kref *kref)
 }
 EXPORT_SYMBOL(ceph_osdc_release_request);
 
-static int get_num_ops(struct ceph_osd_req_op *ops)
-{
-	int i = 0;
-
-	while (ops[i].op)
-		i++;
-
-	return i;
-}
-
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
-					       struct ceph_osd_req_op *ops,
+					       unsigned int num_op,
 					       bool use_mempool,
 					       gfp_t gfp_flags)
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
-	int num_op = get_num_ops(ops);
 	size_t msg_size = sizeof(struct ceph_osd_request_head);
 
 	msg_size += num_op*sizeof(struct ceph_osd_op);
@@ -317,7 +306,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
  *
  */
 void ceph_osdc_build_request(struct ceph_osd_request *req,
-			     u64 off, u64 len,
+			     u64 off, u64 len, unsigned int num_op,
 			     struct ceph_osd_req_op *src_ops,
 			     struct ceph_snap_context *snapc, u64 snap_id,
 			     struct timespec *mtime)
@@ -327,7 +316,6 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	struct ceph_osd_req_op *src_op;
 	struct ceph_osd_op *op;
 	void *p;
-	int num_op = get_num_ops(src_ops);
 	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
 	int flags = req->r_flags;
 	u64 data_len = 0;
@@ -346,20 +334,17 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	head->flags = cpu_to_le32(flags);
 	if (flags & CEPH_OSD_FLAG_WRITE)
 		ceph_encode_timespec(&head->mtime, mtime);
+	BUG_ON(num_op > (unsigned int) ((u16) -1));
 	head->num_ops = cpu_to_le16(num_op);
 
-
 	/* fill in oid */
 	head->object_len = cpu_to_le32(req->r_oid_len);
 	memcpy(p, req->r_oid, req->r_oid_len);
 	p += req->r_oid_len;
 
 	src_op = src_ops;
-	while (src_op->op) {
-		osd_req_encode_op(req, op, src_op);
-		src_op++;
-		op++;
-	}
+	while (num_op--)
+		osd_req_encode_op(req, op++, src_op++);
 
 	data_len += req->r_trail.length;
 
@@ -414,23 +399,24 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 					       bool use_mempool, int num_reply,
 					       int page_align)
 {
-	struct ceph_osd_req_op ops[3];
+	struct ceph_osd_req_op ops[2];
 	struct ceph_osd_request *req;
+	unsigned int num_op = 1;
 	int r;
 
+	memset(&ops, 0, sizeof ops);
+
 	ops[0].op = opcode;
 	ops[0].extent.truncate_seq = truncate_seq;
 	ops[0].extent.truncate_size = truncate_size;
-	ops[0].payload_len = 0;
 
 	if (do_sync) {
 		ops[1].op = CEPH_OSD_OP_STARTSYNC;
-		ops[1].payload_len = 0;
-		ops[2].op = 0;
-	} else
-		ops[1].op = 0;
+		num_op++;
+	}
 
-	req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS);
+	req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool,
+					GFP_NOFS);
 	if (!req)
 		return ERR_PTR(-ENOMEM);
 	req->r_flags = flags;
@@ -446,7 +432,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	req->r_num_pages = calc_pages_for(page_align, *plen);
 	req->r_page_alignment = page_align;
 
-	ceph_osdc_build_request(req, off, *plen, ops, snapc, vino.snap, mtime);
+	ceph_osdc_build_request(req, off, *plen, num_op, ops,
+				snapc, vino.snap, mtime);
 
 	return req;
 }
-- 
cgit v0.10.2


From 139b4318ad93ae4370d88882ff89b42dcbfaaab1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: rbd: there is really only one op

Throughout the rbd code there are spots where it appears we can
handle an osd request containing more than one osd request op.

But that is only the way it appears.  In fact, currently only one
operation at a time can be supported, and supporting more than
one will require much more than fleshing out the support that's
there now.

This patch changes names to make it perfectly clear that anywhere
we're dealing with a block of ops, we're in fact dealing with
exactly one of them.  We'll be able to simplify some things as
a result.

When multiple op support is implemented, we can update things again
accordingly.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 88de8cc..cc8924d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1023,32 +1023,26 @@ out_err:
 	return NULL;
 }
 
-/*
- * helpers for osd request op vectors.
- */
-static struct ceph_osd_req_op *rbd_create_rw_ops(int num_op,
-					int opcode, u32 payload_len)
+static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len)
 {
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 
-	ops = kzalloc(num_op * sizeof (*ops), GFP_NOIO);
-	if (!ops)
+	op = kzalloc(sizeof (*op), GFP_NOIO);
+	if (!op)
 		return NULL;
-
-	ops[0].op = opcode;
-
 	/*
 	 * op extent offset and length will be set later on
 	 * in calc_raw_layout()
 	 */
-	ops[0].payload_len = payload_len;
+	op->op = opcode;
+	op->payload_len = payload_len;
 
-	return ops;
+	return op;
 }
 
-static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+static void rbd_destroy_op(struct ceph_osd_req_op *op)
 {
-	kfree(ops);
+	kfree(op);
 }
 
 static void rbd_coll_end_req_index(struct request *rq,
@@ -1314,7 +1308,7 @@ static int rbd_do_op(struct request *rq,
 	u64 seg_ofs;
 	u64 seg_len;
 	int ret;
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 	u32 payload_len;
 	int opcode;
 	int flags;
@@ -1340,8 +1334,8 @@ static int rbd_do_op(struct request *rq,
 	}
 
 	ret = -ENOMEM;
-	ops = rbd_create_rw_ops(1, opcode, payload_len);
-	if (!ops)
+	op = rbd_create_rw_op(opcode, payload_len);
+	if (!op)
 		goto done;
 
 	/* we've taken care of segment sizes earlier when we
@@ -1354,13 +1348,13 @@ static int rbd_do_op(struct request *rq,
 			     bio,
 			     NULL, 0,
 			     flags,
-			     1, ops,
+			     1, op,
 			     coll, coll_index,
 			     rbd_req_cb, 0, NULL);
 	if (ret < 0)
 		rbd_coll_end_req_index(rq, coll, coll_index,
 					(s32)ret, seg_len);
-	rbd_destroy_ops(ops);
+	rbd_destroy_op(op);
 done:
 	kfree(seg_name);
 	return ret;
@@ -1375,16 +1369,16 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 			  char *buf,
 			  u64 *ver)
 {
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 	int ret;
 
-	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
-	if (!ops)
+	op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0);
+	if (!op)
 		return -ENOMEM;
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-			       1, ops, object_name, ofs, len, buf, NULL, ver);
-	rbd_destroy_ops(ops);
+			       1, op, object_name, ofs, len, buf, NULL, ver);
+	rbd_destroy_op(op);
 
 	return ret;
 }
@@ -1396,26 +1390,26 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 				   u64 ver,
 				   u64 notify_id)
 {
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 	int ret;
 
-	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
-	if (!ops)
+	op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0);
+	if (!op)
 		return -ENOMEM;
 
-	ops[0].watch.ver = cpu_to_le64(ver);
-	ops[0].watch.cookie = notify_id;
-	ops[0].watch.flag = 0;
+	op->watch.ver = cpu_to_le64(ver);
+	op->watch.cookie = notify_id;
+	op->watch.flag = 0;
 
 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
 			  rbd_dev->header_name, 0, 0, NULL,
 			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
-			  1, ops,
+			  1, op,
 			  NULL, 0,
 			  rbd_simple_req_cb, 0, NULL);
 
-	rbd_destroy_ops(ops);
+	rbd_destroy_op(op);
 	return ret;
 }
 
@@ -1444,12 +1438,12 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
  */
 static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 {
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	int ret;
 
-	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
-	if (!ops)
+	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
+	if (!op)
 		return -ENOMEM;
 
 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
@@ -1457,13 +1451,13 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 	if (ret < 0)
 		goto fail;
 
-	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
-	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-	ops[0].watch.flag = 1;
+	op->watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
+	op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
+	op->watch.flag = 1;
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      1, ops,
+			      1, op,
 			      rbd_dev->header_name,
 			      0, 0, NULL,
 			      &rbd_dev->watch_request, NULL);
@@ -1471,14 +1465,14 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 	if (ret < 0)
 		goto fail_event;
 
-	rbd_destroy_ops(ops);
+	rbd_destroy_op(op);
 	return 0;
 
 fail_event:
 	ceph_osdc_cancel_event(rbd_dev->watch_event);
 	rbd_dev->watch_event = NULL;
 fail:
-	rbd_destroy_ops(ops);
+	rbd_destroy_op(op);
 	return ret;
 }
 
@@ -1487,25 +1481,25 @@ fail:
  */
 static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
 {
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 	int ret;
 
-	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
-	if (!ops)
+	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
+	if (!op)
 		return -ENOMEM;
 
-	ops[0].watch.ver = 0;
-	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-	ops[0].watch.flag = 0;
+	op->watch.ver = 0;
+	op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
+	op->watch.flag = 0;
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      1, ops,
+			      1, op,
 			      rbd_dev->header_name,
 			      0, 0, NULL, NULL, NULL);
 
 
-	rbd_destroy_ops(ops);
+	rbd_destroy_op(op);
 	ceph_osdc_cancel_event(rbd_dev->watch_event);
 	rbd_dev->watch_event = NULL;
 	return ret;
@@ -1524,7 +1518,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 			     size_t inbound_size,
 			     u64 *ver)
 {
-	struct ceph_osd_req_op *ops;
+	struct ceph_osd_req_op *op;
 	int class_name_len = strlen(class_name);
 	int method_name_len = strlen(method_name);
 	int payload_size;
@@ -1539,23 +1533,23 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	 * operation.
 	 */
 	payload_size = class_name_len + method_name_len + outbound_size;
-	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
-	if (!ops)
+	op = rbd_create_rw_op(CEPH_OSD_OP_CALL, payload_size);
+	if (!op)
 		return -ENOMEM;
 
-	ops[0].cls.class_name = class_name;
-	ops[0].cls.class_len = (__u8) class_name_len;
-	ops[0].cls.method_name = method_name;
-	ops[0].cls.method_len = (__u8) method_name_len;
-	ops[0].cls.argc = 0;
-	ops[0].cls.indata = outbound;
-	ops[0].cls.indata_len = outbound_size;
+	op->cls.class_name = class_name;
+	op->cls.class_len = (__u8) class_name_len;
+	op->cls.method_name = method_name;
+	op->cls.method_len = (__u8) method_name_len;
+	op->cls.argc = 0;
+	op->cls.indata = outbound;
+	op->cls.indata_len = outbound_size;
 
-	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, ops,
+	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, op,
 			       object_name, 0, inbound_size, inbound,
 			       NULL, ver);
 
-	rbd_destroy_ops(ops);
+	rbd_destroy_op(op);
 
 	dout("cls_exec returned %d\n", ret);
 	return ret;
-- 
cgit v0.10.2


From 30573d680355ca0de4db2113b9080cd078ac726f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: rbd: assume single op in a request

We now know that every of rbd_req_sync_op() passes an array of
exactly one operation, as evidenced by all callers passing 1 as its
num_op argument.  So get rid of that argument, assuming a single op.

Similarly, we now know that all callers of rbd_do_request() pass 1
as the num_op value, so that parameter can be eliminated as well.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index cc8924d..7ce178f 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1113,8 +1113,7 @@ static int rbd_do_request(struct request *rq,
 			  struct page **pages,
 			  int num_pages,
 			  int flags,
-			  unsigned int num_op,
-			  struct ceph_osd_req_op *ops,
+			  struct ceph_osd_req_op *op,
 			  struct rbd_req_coll *coll,
 			  int coll_index,
 			  void (*rbd_cb)(struct ceph_osd_request *,
@@ -1143,7 +1142,7 @@ static int rbd_do_request(struct request *rq,
 		(unsigned long long) len, coll, coll_index);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO);
+	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
 	if (!osd_req) {
 		ret = -ENOMEM;
 		goto done_pages;
@@ -1169,10 +1168,10 @@ static int rbd_do_request(struct request *rq,
 
 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
 	ret = ceph_calc_raw_layout(&osd_req->r_file_layout,
-				ofs, &len, &bno, osd_req, ops);
+				ofs, &len, &bno, osd_req, op);
 	rbd_assert(ret == 0);
 
-	ceph_osdc_build_request(osd_req, ofs, len, num_op, ops,
+	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
 				snapc, snapid, &mtime);
 
 	if (linger_req) {
@@ -1255,8 +1254,7 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
  */
 static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			   int flags,
-			   unsigned int num_op,
-			   struct ceph_osd_req_op *ops,
+			   struct ceph_osd_req_op *op,
 			   const char *object_name,
 			   u64 ofs, u64 inbound_size,
 			   char *inbound,
@@ -1267,7 +1265,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 	struct page **pages;
 	int num_pages;
 
-	rbd_assert(ops != NULL);
+	rbd_assert(op != NULL);
 
 	num_pages = calc_pages_for(ofs, inbound_size);
 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
@@ -1278,7 +1276,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			  object_name, ofs, inbound_size, NULL,
 			  pages, num_pages,
 			  flags,
-			  num_op, ops,
+			  op,
 			  NULL, 0,
 			  NULL,
 			  linger_req, ver);
@@ -1348,7 +1346,7 @@ static int rbd_do_op(struct request *rq,
 			     bio,
 			     NULL, 0,
 			     flags,
-			     1, op,
+			     op,
 			     coll, coll_index,
 			     rbd_req_cb, 0, NULL);
 	if (ret < 0)
@@ -1377,7 +1375,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 		return -ENOMEM;
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-			       1, op, object_name, ofs, len, buf, NULL, ver);
+			       op, object_name, ofs, len, buf, NULL, ver);
 	rbd_destroy_op(op);
 
 	return ret;
@@ -1405,7 +1403,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 			  rbd_dev->header_name, 0, 0, NULL,
 			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
-			  1, op,
+			  op,
 			  NULL, 0,
 			  rbd_simple_req_cb, 0, NULL);
 
@@ -1457,7 +1455,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      1, op,
+			      op,
 			      rbd_dev->header_name,
 			      0, 0, NULL,
 			      &rbd_dev->watch_request, NULL);
@@ -1494,7 +1492,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      1, op,
+			      op,
 			      rbd_dev->header_name,
 			      0, 0, NULL, NULL, NULL);
 
@@ -1545,7 +1543,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	op->cls.indata = outbound;
 	op->cls.indata_len = outbound_size;
 
-	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, op,
+	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
 			       object_name, 0, inbound_size, inbound,
 			       NULL, ver);
 
-- 
cgit v0.10.2


From 2b5fc648af5eec2f4fe984cb6b926214e02c5cf4 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 09:38:20 -0600
Subject: rbd: kill ceph_osd_req_op->flags

The flags field of struct ceph_osd_req_op is never used, so just get
rid of it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 2b04d05..69287cc 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -157,7 +157,6 @@ struct ceph_osd_client {
 
 struct ceph_osd_req_op {
 	u16 op;           /* CEPH_OSD_OP_* */
-	u32 flags;        /* CEPH_OSD_FLAG_* */
 	union {
 		struct {
 			u64 offset, length;
-- 
cgit v0.10.2


From 0829661863fb5c8031c1c5c119693ea157517783 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 12:25:18 -0600
Subject: rbd: pull in ceph_calc_raw_layout()

This is the first in a series of patches aimed at eliminating
the use of ceph_calc_raw_layout() by rbd.

It simply pulls in a copy of that function and renames it
rbd_calc_raw_layout().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 7ce178f..eee0d36 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1101,6 +1101,40 @@ static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id)
 	layout->fl_pg_pool = cpu_to_le32((u32) pool_id);
 }
 
+int rbd_calc_raw_layout(struct ceph_file_layout *layout,
+			u64 off, u64 *plen, u64 *bno,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op)
+{
+	u64 orig_len = *plen;
+	u64 objoff, objlen;    /* extent in object */
+	int r;
+
+	/* object extent? */
+	r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
+					  &objoff, &objlen);
+	if (r < 0)
+		return r;
+	if (objlen < orig_len) {
+		*plen = objlen;
+		dout(" skipping last %llu, final file extent %llu~%llu\n",
+		     orig_len - *plen, off, *plen);
+	}
+
+	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
+		op->extent.offset = objoff;
+		op->extent.length = objlen;
+	}
+	req->r_num_pages = calc_pages_for(off, *plen);
+	req->r_page_alignment = off & ~PAGE_MASK;
+	if (op->op == CEPH_OSD_OP_WRITE)
+		op->payload_len = *plen;
+
+	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
+	     *bno, objoff, objlen, req->r_num_pages);
+	return 0;
+}
+
 /*
  * Send ceph osd request
  */
@@ -1167,7 +1201,7 @@ static int rbd_do_request(struct request *rq,
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
-	ret = ceph_calc_raw_layout(&osd_req->r_file_layout,
+	ret = rbd_calc_raw_layout(&osd_req->r_file_layout,
 				ofs, &len, &bno, osd_req, op);
 	rbd_assert(ret == 0);
 
-- 
cgit v0.10.2


From e01e79273b251dbb35ff2522a688229b09481923 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 12:25:18 -0600
Subject: rbd: open code rbd_calc_raw_layout()

This patch gets rid of rbd_calc_raw_layout() by simply open coding
it in its one caller.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index eee0d36..f5f4e4a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1032,7 +1032,7 @@ static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len)
 		return NULL;
 	/*
 	 * op extent offset and length will be set later on
-	 * in calc_raw_layout()
+	 * after ceph_calc_file_object_mapping().
 	 */
 	op->op = opcode;
 	op->payload_len = payload_len;
@@ -1101,40 +1101,6 @@ static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id)
 	layout->fl_pg_pool = cpu_to_le32((u32) pool_id);
 }
 
-int rbd_calc_raw_layout(struct ceph_file_layout *layout,
-			u64 off, u64 *plen, u64 *bno,
-			struct ceph_osd_request *req,
-			struct ceph_osd_req_op *op)
-{
-	u64 orig_len = *plen;
-	u64 objoff, objlen;    /* extent in object */
-	int r;
-
-	/* object extent? */
-	r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
-					  &objoff, &objlen);
-	if (r < 0)
-		return r;
-	if (objlen < orig_len) {
-		*plen = objlen;
-		dout(" skipping last %llu, final file extent %llu~%llu\n",
-		     orig_len - *plen, off, *plen);
-	}
-
-	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
-		op->extent.offset = objoff;
-		op->extent.length = objlen;
-	}
-	req->r_num_pages = calc_pages_for(off, *plen);
-	req->r_page_alignment = off & ~PAGE_MASK;
-	if (op->op == CEPH_OSD_OP_WRITE)
-		op->payload_len = *plen;
-
-	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
-	     *bno, objoff, objlen, req->r_num_pages);
-	return 0;
-}
-
 /*
  * Send ceph osd request
  */
@@ -1158,6 +1124,8 @@ static int rbd_do_request(struct request *rq,
 	struct ceph_osd_request *osd_req;
 	int ret;
 	u64 bno;
+	u64 obj_off = 0;
+	u64 obj_len = 0;
 	struct timespec mtime = CURRENT_TIME;
 	struct rbd_request *rbd_req;
 	struct ceph_osd_client *osdc;
@@ -1201,9 +1169,22 @@ static int rbd_do_request(struct request *rq,
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
-	ret = rbd_calc_raw_layout(&osd_req->r_file_layout,
-				ofs, &len, &bno, osd_req, op);
+	ret = ceph_calc_file_object_mapping(&osd_req->r_file_layout, ofs, len,
+						&bno, &obj_off, &obj_len);
 	rbd_assert(ret == 0);
+	if (obj_len < len) {
+		dout(" skipping last %llu, final file extent %llu~%llu\n",
+		     len - obj_len, ofs, obj_len);
+		len = obj_len;
+	}
+	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
+		op->extent.offset = obj_off;
+		op->extent.length = obj_len;
+		if (op->op == CEPH_OSD_OP_WRITE)
+			op->payload_len = obj_len;
+	}
+	osd_req->r_num_pages = calc_pages_for(ofs, len);
+	osd_req->r_page_alignment = ofs & ~PAGE_MASK;
 
 	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
 				snapc, snapid, &mtime);
-- 
cgit v0.10.2


From 47dba7ba2623b088cbbe1ac0aaa1a034f3249b6d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 12:25:19 -0600
Subject: rbd: don't bother calculating file mapping

When rbd_do_request() has a request to process it initializes a ceph
file layout structure and uses it to compute offsets and limits for
the range of the request using ceph_calc_file_object_mapping().

The layout used is fixed, and is based on RBD_MAX_OBJ_ORDER (30).
It sets the layout's object size and stripe unit to be 1 GB (2^30),
and sets the stripe count to be 1.

The job of ceph_calc_file_object_mapping() is to determine which
of a sequence of objects will contain data covered by range, and
within that object, at what offset the range starts.  It also
truncates the length of the range at the end of the selected object
if necessary.

This is needed for ceph fs, but for rbd it really serves no purpose.
It does its own blocking of images into objects, echo of which is
(1 << obj_order) in size, and as a result it ignores the "bno"
value returned by ceph_calc_file_object_mapping().  In addition,
by the point a request has reached this function, it is already
destined for a single rbd object, and its length will not exceed
that object's extent.  Because of this, and because the mapping will
result in blocking up the range using an integer multiple of the
image's object order, ceph_calc_file_object_mapping() will never
change the offset or length values defined by the request.

In other words, this call is a big no-op for rbd data requests.

There is one exception.  We read the header object using this
function, and in that case we will not have already limited the
request size.  However, the header is a single object (not a file or
rbd image), and should not be broken into pieces anyway.  So in fact
we should *not* be calling ceph_calc_file_object_mapping() when
operating on the header object.

So...

Don't call ceph_calc_file_object_mapping() in rbd_do_request(),
because useless for image data and incorrect to do sofor the image
header.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index f5f4e4a..1c0192c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1123,9 +1123,6 @@ static int rbd_do_request(struct request *rq,
 {
 	struct ceph_osd_request *osd_req;
 	int ret;
-	u64 bno;
-	u64 obj_off = 0;
-	u64 obj_len = 0;
 	struct timespec mtime = CURRENT_TIME;
 	struct rbd_request *rbd_req;
 	struct ceph_osd_client *osdc;
@@ -1169,19 +1166,12 @@ static int rbd_do_request(struct request *rq,
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
-	ret = ceph_calc_file_object_mapping(&osd_req->r_file_layout, ofs, len,
-						&bno, &obj_off, &obj_len);
-	rbd_assert(ret == 0);
-	if (obj_len < len) {
-		dout(" skipping last %llu, final file extent %llu~%llu\n",
-		     len - obj_len, ofs, obj_len);
-		len = obj_len;
-	}
+
 	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
-		op->extent.offset = obj_off;
-		op->extent.length = obj_len;
+		op->extent.offset = ofs;
+		op->extent.length = len;
 		if (op->op == CEPH_OSD_OP_WRITE)
-			op->payload_len = obj_len;
+			op->payload_len = len;
 	}
 	osd_req->r_num_pages = calc_pages_for(ofs, len);
 	osd_req->r_page_alignment = ofs & ~PAGE_MASK;
-- 
cgit v0.10.2


From 0903e875caa93e1fb231dd66c69b118dbdad25cb Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 14 Nov 2012 12:25:19 -0600
Subject: rbd: use a common layout for each device

Each osd message includes a layout structure, and for rbd it is
always the same (at least for osd's in a given pool).

Initialize a layout structure when an rbd_dev gets created and just
copy that into osd requests for the rbd image.

Replace an assertion that was done when initializing the layout
structures with code that catches and handles anything that would
trigger the assertion as soon as it is identified.  This precludes
that (bad) condition from ever occurring.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 1c0192c..d2a6e95 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -235,6 +235,8 @@ struct rbd_device {
 
 	char			*header_name;
 
+	struct ceph_file_layout	layout;
+
 	struct ceph_osd_event   *watch_event;
 	struct ceph_osd_request *watch_request;
 
@@ -1091,16 +1093,6 @@ static void rbd_coll_end_req(struct rbd_request *rbd_req,
 				ret, len);
 }
 
-static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id)
-{
-	memset(layout, 0, sizeof (*layout));
-	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_stripe_count = cpu_to_le32(1);
-	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	rbd_assert(pool_id <= (u64) U32_MAX);
-	layout->fl_pg_pool = cpu_to_le32((u32) pool_id);
-}
-
 /*
  * Send ceph osd request
  */
@@ -1165,7 +1157,7 @@ static int rbd_do_request(struct request *rq,
 	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
-	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
+	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
 
 	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
 		op->extent.offset = ofs;
@@ -2297,6 +2289,13 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 	rbd_dev->spec = spec;
 	rbd_dev->rbd_client = rbdc;
 
+	/* Initialize the layout used for all rbd requests */
+
+	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
+	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
+
 	return rbd_dev;
 }
 
@@ -2551,6 +2550,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	if (parent_spec->pool_id == CEPH_NOPOOL)
 		goto out;	/* No parent?  No problem. */
 
+	/* The ceph file layout needs to fit pool id in 32 bits */
+
+	ret = -EIO;
+	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
+		goto out;
+
 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
 	if (IS_ERR(image_id)) {
 		ret = PTR_ERR(image_id);
@@ -3680,6 +3685,13 @@ static ssize_t rbd_add(struct bus_type *bus,
 		goto err_out_client;
 	spec->pool_id = (u64) rc;
 
+	/* The ceph file layout needs to fit pool id in 32 bits */
+
+	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
+		rc = -EIO;
+		goto err_out_client;
+	}
+
 	rbd_dev = rbd_dev_create(rbdc, spec);
 	if (!rbd_dev)
 		goto err_out_client;
-- 
cgit v0.10.2


From 907703d050df92979b3848ee42f88d5c9c6c13fe Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 13 Nov 2012 21:11:15 -0600
Subject: rbd: combine rbd sync watch/unwatch functions

The rbd_req_sync_watch() and rbd_req_sync_unwatch() functions are
nearly identical.  Combine them into a single function with a flag
indicating whether a watch is to be initiated or torn down.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d2a6e95..9d21fcd 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1429,74 +1429,48 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 }
 
 /*
- * Request sync osd watch
+ * Request sync osd watch/unwatch.  The value of "start" determines
+ * whether a watch request is being initiated or torn down.
  */
-static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
+static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 {
 	struct ceph_osd_req_op *op;
-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_osd_request **linger_req = NULL;
+	__le64 version = 0;
 	int ret;
 
 	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
 	if (!op)
 		return -ENOMEM;
 
-	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
-				     (void *)rbd_dev, &rbd_dev->watch_event);
-	if (ret < 0)
-		goto fail;
-
-	op->watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
-	op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-	op->watch.flag = 1;
-
-	ret = rbd_req_sync_op(rbd_dev,
-			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      op,
-			      rbd_dev->header_name,
-			      0, 0, NULL,
-			      &rbd_dev->watch_request, NULL);
-
-	if (ret < 0)
-		goto fail_event;
-
-	rbd_destroy_op(op);
-	return 0;
-
-fail_event:
-	ceph_osdc_cancel_event(rbd_dev->watch_event);
-	rbd_dev->watch_event = NULL;
-fail:
-	rbd_destroy_op(op);
-	return ret;
-}
-
-/*
- * Request sync osd unwatch
- */
-static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
-{
-	struct ceph_osd_req_op *op;
-	int ret;
+	if (start) {
+		struct ceph_osd_client *osdc;
 
-	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
-	if (!op)
-		return -ENOMEM;
+		osdc = &rbd_dev->rbd_client->client->osdc;
+		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
+						&rbd_dev->watch_event);
+		if (ret < 0)
+			goto done;
+		version = cpu_to_le64(rbd_dev->header.obj_version);
+		linger_req = &rbd_dev->watch_request;
+	}
 
-	op->watch.ver = 0;
+	op->watch.ver = version;
 	op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-	op->watch.flag = 0;
+	op->watch.flag = (u8) start ? 1 : 0;
 
 	ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			      op,
-			      rbd_dev->header_name,
-			      0, 0, NULL, NULL, NULL);
-
+			      op, rbd_dev->header_name,
+			      0, 0, NULL, linger_req, NULL);
 
+	if (!start || ret < 0) {
+		ceph_osdc_cancel_event(rbd_dev->watch_event);
+		rbd_dev->watch_event = NULL;
+	}
+done:
 	rbd_destroy_op(op);
-	ceph_osdc_cancel_event(rbd_dev->watch_event);
-	rbd_dev->watch_event = NULL;
+
 	return ret;
 }
 
@@ -3033,7 +3007,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
 	int ret, rc;
 
 	do {
-		ret = rbd_req_sync_watch(rbd_dev);
+		ret = rbd_req_sync_watch(rbd_dev, 1);
 		if (ret == -ERANGE) {
 			rc = rbd_dev_refresh(rbd_dev, NULL);
 			if (rc < 0)
@@ -3752,8 +3726,7 @@ static void rbd_dev_release(struct device *dev)
 						    rbd_dev->watch_request);
 	}
 	if (rbd_dev->watch_event)
-		rbd_req_sync_unwatch(rbd_dev);
-
+		rbd_req_sync_watch(rbd_dev, 0);
 
 	/* clean up and free blkdev */
 	rbd_free_disk(rbd_dev);
-- 
cgit v0.10.2


From 2e53c6c379b65372df21f4d6019f6eb63af81384 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 30 Nov 2012 09:59:47 -0600
Subject: rbd: don't leak rbd_req on synchronous requests

When rbd_do_request() is called it allocates and populates an
rbd_req structure to hold information about the osd request to be
sent.  This is done for the benefit of the callback function (in
particular, rbd_req_cb()), which uses this in processing when
the request completes.

Synchronous requests provide no callback function, in which case
rbd_do_request() waits for the request to complete before returning.
This case is not handling the needed free of the rbd_req structure
like it should, so it is getting leaked.

Note however that the synchronous case has no need for the rbd_req
structure at all.  So rather than simply freeing this structure for
synchronous requests, just don't allocate it to begin with.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9d21fcd..28b6236 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1113,20 +1113,11 @@ static int rbd_do_request(struct request *rq,
 			  struct ceph_osd_request **linger_req,
 			  u64 *ver)
 {
+	struct ceph_osd_client *osdc;
 	struct ceph_osd_request *osd_req;
-	int ret;
+	struct rbd_request *rbd_req = NULL;
 	struct timespec mtime = CURRENT_TIME;
-	struct rbd_request *rbd_req;
-	struct ceph_osd_client *osdc;
-
-	rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO);
-	if (!rbd_req)
-		return -ENOMEM;
-
-	if (coll) {
-		rbd_req->coll = coll;
-		rbd_req->coll_index = coll_index;
-	}
+	int ret;
 
 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
 		object_name, (unsigned long long) ofs,
@@ -1134,10 +1125,8 @@ static int rbd_do_request(struct request *rq,
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
-	if (!osd_req) {
-		ret = -ENOMEM;
-		goto done_pages;
-	}
+	if (!osd_req)
+		return -ENOMEM;
 
 	osd_req->r_flags = flags;
 	osd_req->r_pages = pages;
@@ -1145,13 +1134,22 @@ static int rbd_do_request(struct request *rq,
 		osd_req->r_bio = bio;
 		bio_get(osd_req->r_bio);
 	}
-	osd_req->r_callback = rbd_cb;
 
-	rbd_req->rq = rq;
-	rbd_req->bio = bio;
-	rbd_req->pages = pages;
-	rbd_req->len = len;
+	if (rbd_cb) {
+		ret = -ENOMEM;
+		rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
+		if (!rbd_req)
+			goto done_osd_req;
+
+		rbd_req->rq = rq;
+		rbd_req->bio = bio;
+		rbd_req->pages = pages;
+		rbd_req->len = len;
+		rbd_req->coll = coll;
+		rbd_req->coll_index = coll ? coll_index : 0;
+	}
 
+	osd_req->r_callback = rbd_cb;
 	osd_req->r_priv = rbd_req;
 
 	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
@@ -1193,10 +1191,12 @@ static int rbd_do_request(struct request *rq,
 	return ret;
 
 done_err:
-	bio_chain_put(rbd_req->bio);
-	ceph_osdc_put_request(osd_req);
-done_pages:
+	if (bio)
+		bio_chain_put(osd_req->r_bio);
 	kfree(rbd_req);
+done_osd_req:
+	ceph_osdc_put_request(osd_req);
+
 	return ret;
 }
 
-- 
cgit v0.10.2


From 1821665749a3d7e26ad470c63fc2c46990dc6041 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 30 Nov 2012 09:59:47 -0600
Subject: rbd: don't leak rbd_req for rbd_req_sync_notify_ack()

When rbd_req_sync_notify_ack() calls rbd_do_request() it supplies
rbd_simple_req_cb() as its callback function.  Because the callback
is supplied, an rbd_req structure gets allocated and populated so it
can be used by the callback.  However rbd_simple_req_cb() is not
freeing (or even using) the rbd_req structure, so it's getting
leaked.

Since rbd_simple_req_cb() has no need for the rbd_req structure,
just avoid allocating one for this case.  Of the three calls to
rbd_do_request(), only the one from rbd_do_op() needs the rbd_req
structure, and that call can be distinguished from the other two
because it supplies a non-null rbd_collection pointer.

So fix this leak by only allocating the rbd_req structure if a
non-null "coll" value is provided to rbd_do_request().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 28b6236..619d680 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1135,7 +1135,7 @@ static int rbd_do_request(struct request *rq,
 		bio_get(osd_req->r_bio);
 	}
 
-	if (rbd_cb) {
+	if (coll) {
 		ret = -ENOMEM;
 		rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
 		if (!rbd_req)
@@ -1146,7 +1146,7 @@ static int rbd_do_request(struct request *rq,
 		rbd_req->pages = pages;
 		rbd_req->len = len;
 		rbd_req->coll = coll;
-		rbd_req->coll_index = coll ? coll_index : 0;
+		rbd_req->coll_index = coll_index;
 	}
 
 	osd_req->r_callback = rbd_cb;
-- 
cgit v0.10.2


From c561191813e232aa52022532751855ff5c9fa319 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 19 Nov 2012 22:55:21 -0600
Subject: rbd: don't assign extent info in rbd_do_request()

In rbd_do_request() there's a sort of last-minute assignment of the
extent offset and length and payload length for read and write
operations.  Move those assignments into the caller (in those spots
that might initiate read or write operations)

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 619d680..c6917b1 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1156,13 +1156,6 @@ static int rbd_do_request(struct request *rq,
 	osd_req->r_oid_len = strlen(osd_req->r_oid);
 
 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
-
-	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
-		op->extent.offset = ofs;
-		op->extent.length = len;
-		if (op->op == CEPH_OSD_OP_WRITE)
-			op->payload_len = len;
-	}
 	osd_req->r_num_pages = calc_pages_for(ofs, len);
 	osd_req->r_page_alignment = ofs & ~PAGE_MASK;
 
@@ -1269,6 +1262,13 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
+	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
+		op->extent.offset = ofs;
+		op->extent.length = inbound_size;
+		if (op->op == CEPH_OSD_OP_WRITE)
+			op->payload_len = inbound_size;
+	}
+
 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
 			  object_name, ofs, inbound_size, NULL,
 			  pages, num_pages,
@@ -1332,6 +1332,9 @@ static int rbd_do_op(struct request *rq,
 	op = rbd_create_rw_op(opcode, payload_len);
 	if (!op)
 		goto done;
+	op->extent.offset = seg_ofs;
+	op->extent.length = seg_len;
+	op->payload_len = payload_len;
 
 	/* we've taken care of segment sizes earlier when we
 	   cloned the bios. We should never have a segment
-- 
cgit v0.10.2


From 8d23bf29095e5fab84535035e7a27c4920812c44 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 19 Nov 2012 22:55:21 -0600
Subject: rbd: don't assign extent info in rbd_req_sync_op()

Move the assignment of the extent offset and length and payload
length out of rbd_req_sync_op() and into its caller in the one spot
where a read (and note--no write) operation might be initiated.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c6917b1..235cda0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1025,19 +1025,15 @@ out_err:
 	return NULL;
 }
 
-static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len)
+static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len)
 {
 	struct ceph_osd_req_op *op;
 
 	op = kzalloc(sizeof (*op), GFP_NOIO);
 	if (!op)
 		return NULL;
-	/*
-	 * op extent offset and length will be set later on
-	 * after ceph_calc_file_object_mapping().
-	 */
+
 	op->op = opcode;
-	op->payload_len = payload_len;
 
 	return op;
 }
@@ -1047,6 +1043,42 @@ static void rbd_destroy_op(struct ceph_osd_req_op *op)
 	kfree(op);
 }
 
+struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
+{
+	struct ceph_osd_req_op *op;
+	va_list args;
+
+	op = kzalloc(sizeof (*op), GFP_NOIO);
+	if (!op)
+		return NULL;
+	op->op = opcode;
+	va_start(args, opcode);
+	switch (opcode) {
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_WRITE:
+		/* rbd_osd_req_op_create(READ, offset, length) */
+		/* rbd_osd_req_op_create(WRITE, offset, length) */
+		op->extent.offset = va_arg(args, u64);
+		op->extent.length = va_arg(args, u64);
+		if (opcode == CEPH_OSD_OP_WRITE)
+			op->payload_len = op->extent.length;
+		break;
+	default:
+		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
+		kfree(op);
+		op = NULL;
+		break;
+	}
+	va_end(args);
+
+	return op;
+}
+
+static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
+{
+	kfree(op);
+}
+
 static void rbd_coll_end_req_index(struct request *rq,
 				   struct rbd_req_coll *coll,
 				   int index,
@@ -1262,13 +1294,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
-	if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
-		op->extent.offset = ofs;
-		op->extent.length = inbound_size;
-		if (op->op == CEPH_OSD_OP_WRITE)
-			op->payload_len = inbound_size;
-	}
-
 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
 			  object_name, ofs, inbound_size, NULL,
 			  pages, num_pages,
@@ -1304,7 +1329,6 @@ static int rbd_do_op(struct request *rq,
 	u64 seg_len;
 	int ret;
 	struct ceph_osd_req_op *op;
-	u32 payload_len;
 	int opcode;
 	int flags;
 	u64 snapid;
@@ -1319,22 +1343,17 @@ static int rbd_do_op(struct request *rq,
 		opcode = CEPH_OSD_OP_WRITE;
 		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
 		snapid = CEPH_NOSNAP;
-		payload_len = seg_len;
 	} else {
 		opcode = CEPH_OSD_OP_READ;
 		flags = CEPH_OSD_FLAG_READ;
 		rbd_assert(!snapc);
 		snapid = rbd_dev->spec->snap_id;
-		payload_len = 0;
 	}
 
 	ret = -ENOMEM;
-	op = rbd_create_rw_op(opcode, payload_len);
+	op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len);
 	if (!op)
 		goto done;
-	op->extent.offset = seg_ofs;
-	op->extent.length = seg_len;
-	op->payload_len = payload_len;
 
 	/* we've taken care of segment sizes earlier when we
 	   cloned the bios. We should never have a segment
@@ -1352,7 +1371,7 @@ static int rbd_do_op(struct request *rq,
 	if (ret < 0)
 		rbd_coll_end_req_index(rq, coll, coll_index,
 					(s32)ret, seg_len);
-	rbd_destroy_op(op);
+	rbd_osd_req_op_destroy(op);
 done:
 	kfree(seg_name);
 	return ret;
@@ -1370,13 +1389,13 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 	struct ceph_osd_req_op *op;
 	int ret;
 
-	op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0);
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
 	if (!op)
 		return -ENOMEM;
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
 			       op, object_name, ofs, len, buf, NULL, ver);
-	rbd_destroy_op(op);
+	rbd_osd_req_op_destroy(op);
 
 	return ret;
 }
@@ -1391,7 +1410,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 	struct ceph_osd_req_op *op;
 	int ret;
 
-	op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0);
+	op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0);
 	if (!op)
 		return -ENOMEM;
 
@@ -1442,7 +1461,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 	__le64 version = 0;
 	int ret;
 
-	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
+	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0);
 	if (!op)
 		return -ENOMEM;
 
@@ -1505,9 +1524,10 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	 * operation.
 	 */
 	payload_size = class_name_len + method_name_len + outbound_size;
-	op = rbd_create_rw_op(CEPH_OSD_OP_CALL, payload_size);
+	op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0);
 	if (!op)
 		return -ENOMEM;
+	op->payload_len = payload_size;
 
 	op->cls.class_name = class_name;
 	op->cls.class_len = (__u8) class_name_len;
-- 
cgit v0.10.2


From 2647ba38100765298fc67ce1ec5d32e80d9fe046 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 19 Nov 2012 22:55:21 -0600
Subject: rbd: move call osd op setup into rbd_osd_req_op_create()

Move the initialization of the CEPH_OSD_OP_CALL operation into
rbd_osd_req_op_create().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 235cda0..760be82 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -52,10 +52,12 @@
 #define	SECTOR_SHIFT	9
 #define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
 
-/* It might be useful to have this defined elsewhere too */
+/* It might be useful to have these defined elsewhere */
 
-#define	U32_MAX	((u32) (~0U))
-#define	U64_MAX	((u64) (~0ULL))
+#define	U8_MAX	((u8)	(~0U))
+#define	U16_MAX	((u16)	(~0U))
+#define	U32_MAX	((u32)	(~0U))
+#define	U64_MAX	((u64)	(~0ULL))
 
 #define RBD_DRV_NAME "rbd"
 #define RBD_DRV_NAME_LONG "rbd (rados block device)"
@@ -1047,6 +1049,7 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 {
 	struct ceph_osd_req_op *op;
 	va_list args;
+	size_t size;
 
 	op = kzalloc(sizeof (*op), GFP_NOIO);
 	if (!op)
@@ -1063,6 +1066,27 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 		if (opcode == CEPH_OSD_OP_WRITE)
 			op->payload_len = op->extent.length;
 		break;
+	case CEPH_OSD_OP_CALL:
+		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
+		op->cls.class_name = va_arg(args, char *);
+		size = strlen(op->cls.class_name);
+		rbd_assert(size <= (size_t) U8_MAX);
+		op->cls.class_len = size;
+		op->payload_len = size;
+
+		op->cls.method_name = va_arg(args, char *);
+		size = strlen(op->cls.method_name);
+		rbd_assert(size <= (size_t) U8_MAX);
+		op->cls.method_len = size;
+		op->payload_len += size;
+
+		op->cls.argc = 0;
+		op->cls.indata = va_arg(args, void *);
+		size = va_arg(args, size_t);
+		rbd_assert(size <= (size_t) U32_MAX);
+		op->cls.indata_len = (u32) size;
+		op->payload_len += size;
+		break;
 	default:
 		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
 		kfree(op);
@@ -1510,9 +1534,6 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 			     u64 *ver)
 {
 	struct ceph_osd_req_op *op;
-	int class_name_len = strlen(class_name);
-	int method_name_len = strlen(method_name);
-	int payload_size;
 	int ret;
 
 	/*
@@ -1523,25 +1544,16 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 	 * the perspective of the server side) in the OSD request
 	 * operation.
 	 */
-	payload_size = class_name_len + method_name_len + outbound_size;
-	op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0);
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
+					method_name, outbound, outbound_size);
 	if (!op)
 		return -ENOMEM;
-	op->payload_len = payload_size;
-
-	op->cls.class_name = class_name;
-	op->cls.class_len = (__u8) class_name_len;
-	op->cls.method_name = method_name;
-	op->cls.method_len = (__u8) method_name_len;
-	op->cls.argc = 0;
-	op->cls.indata = outbound;
-	op->cls.indata_len = outbound_size;
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
 			       object_name, 0, inbound_size, inbound,
 			       NULL, ver);
 
-	rbd_destroy_op(op);
+	rbd_osd_req_op_destroy(op);
 
 	dout("cls_exec returned %d\n", ret);
 	return ret;
-- 
cgit v0.10.2


From 5efea49a98d1a3b3a7301d3a17f826ad4c31b290 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Mon, 19 Nov 2012 22:55:21 -0600
Subject: rbd: move remaining osd op setup into rbd_osd_req_op_create()

The two remaining osd ops used by rbd are CEPH_OSD_OP_WATCH and
CEPH_OSD_OP_NOTIFY_ACK.  Move the setup of those operations into
rbd_osd_req_op_create(), and get rid of rbd_create_rw_op() and
rbd_destroy_op().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 760be82..3802a78 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1027,24 +1027,6 @@ out_err:
 	return NULL;
 }
 
-static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len)
-{
-	struct ceph_osd_req_op *op;
-
-	op = kzalloc(sizeof (*op), GFP_NOIO);
-	if (!op)
-		return NULL;
-
-	op->op = opcode;
-
-	return op;
-}
-
-static void rbd_destroy_op(struct ceph_osd_req_op *op)
-{
-	kfree(op);
-}
-
 struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 {
 	struct ceph_osd_req_op *op;
@@ -1087,6 +1069,16 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 		op->cls.indata_len = (u32) size;
 		op->payload_len += size;
 		break;
+	case CEPH_OSD_OP_NOTIFY_ACK:
+	case CEPH_OSD_OP_WATCH:
+		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
+		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
+		op->watch.cookie = va_arg(args, u64);
+		op->watch.ver = va_arg(args, u64);
+		op->watch.ver = cpu_to_le64(op->watch.ver);
+		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
+			op->watch.flag = (u8) 1;
+		break;
 	default:
 		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
 		kfree(op);
@@ -1434,14 +1426,10 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 	struct ceph_osd_req_op *op;
 	int ret;
 
-	op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0);
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
 	if (!op)
 		return -ENOMEM;
 
-	op->watch.ver = cpu_to_le64(ver);
-	op->watch.cookie = notify_id;
-	op->watch.flag = 0;
-
 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
 			  rbd_dev->header_name, 0, 0, NULL,
 			  NULL, 0,
@@ -1450,7 +1438,8 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 			  NULL, 0,
 			  rbd_simple_req_cb, 0, NULL);
 
-	rbd_destroy_op(op);
+	rbd_osd_req_op_destroy(op);
+
 	return ret;
 }
 
@@ -1480,14 +1469,9 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
  */
 static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 {
-	struct ceph_osd_req_op *op;
 	struct ceph_osd_request **linger_req = NULL;
-	__le64 version = 0;
-	int ret;
-
-	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0);
-	if (!op)
-		return -ENOMEM;
+	struct ceph_osd_req_op *op;
+	int ret = 0;
 
 	if (start) {
 		struct ceph_osd_client *osdc;
@@ -1496,26 +1480,28 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
 						&rbd_dev->watch_event);
 		if (ret < 0)
-			goto done;
-		version = cpu_to_le64(rbd_dev->header.obj_version);
+			return ret;
 		linger_req = &rbd_dev->watch_request;
+	} else {
+		rbd_assert(rbd_dev->watch_request != NULL);
 	}
 
-	op->watch.ver = version;
-	op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-	op->watch.flag = (u8) start ? 1 : 0;
-
-	ret = rbd_req_sync_op(rbd_dev,
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
+				rbd_dev->watch_event->cookie,
+				rbd_dev->header.obj_version, start);
+	if (op)
+		ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      op, rbd_dev->header_name,
 			      0, 0, NULL, linger_req, NULL);
 
-	if (!start || ret < 0) {
+	/* Cancel the event if we're tearing down, or on error */
+
+	if (!start || !op || ret < 0) {
 		ceph_osdc_cancel_event(rbd_dev->watch_event);
 		rbd_dev->watch_event = NULL;
 	}
-done:
-	rbd_destroy_op(op);
+	rbd_osd_req_op_destroy(op);
 
 	return ret;
 }
-- 
cgit v0.10.2


From 8b84de7940b69fd7326946ba244621aa5fc412e0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Tue, 20 Nov 2012 14:17:17 -0600
Subject: rbd: assign watch request more directly

Both rbd_req_sync_op() and rbd_do_request() have a "linger"
parameter, which is the address of a pointer that should refer to
the osd request structure used to issue a request to an osd.

Only one case ever supplies a non-null "linger" argument: an
CEPH_OSD_OP_WATCH start.  And in that one case it is assigned
&rbd_dev->watch_request.

Within rbd_do_request() (where the assignment ultimately gets made)
we know the rbd_dev and therefore its watch_request field.  We
also know whether the op being sent is CEPH_OSD_OP_WATCH start.

Stop opaquely passing down the "linger" pointer, and instead just
assign the value directly inside rbd_do_request() when it's needed.

This makes it unnecessary for rbd_req_sync_watch() to make
arrangements to hold a value that's not available until a
bit later.  This more clearly separates setting up a watch
request from submitting it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 3802a78..a3a11c3 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1158,7 +1158,6 @@ static int rbd_do_request(struct request *rq,
 			  int coll_index,
 			  void (*rbd_cb)(struct ceph_osd_request *,
 					 struct ceph_msg *),
-			  struct ceph_osd_request **linger_req,
 			  u64 *ver)
 {
 	struct ceph_osd_client *osdc;
@@ -1210,9 +1209,9 @@ static int rbd_do_request(struct request *rq,
 	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
 				snapc, snapid, &mtime);
 
-	if (linger_req) {
+	if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
 		ceph_osdc_set_request_linger(osdc, osd_req);
-		*linger_req = osd_req;
+		rbd_dev->watch_request = osd_req;
 	}
 
 	ret = ceph_osdc_start_request(osdc, osd_req, false);
@@ -1296,7 +1295,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			   const char *object_name,
 			   u64 ofs, u64 inbound_size,
 			   char *inbound,
-			   struct ceph_osd_request **linger_req,
 			   u64 *ver)
 {
 	int ret;
@@ -1317,7 +1315,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			  op,
 			  NULL, 0,
 			  NULL,
-			  linger_req, ver);
+			  ver);
 	if (ret < 0)
 		goto done;
 
@@ -1383,7 +1381,7 @@ static int rbd_do_op(struct request *rq,
 			     flags,
 			     op,
 			     coll, coll_index,
-			     rbd_req_cb, 0, NULL);
+			     rbd_req_cb, NULL);
 	if (ret < 0)
 		rbd_coll_end_req_index(rq, coll, coll_index,
 					(s32)ret, seg_len);
@@ -1410,7 +1408,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 		return -ENOMEM;
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-			       op, object_name, ofs, len, buf, NULL, ver);
+			       op, object_name, ofs, len, buf, ver);
 	rbd_osd_req_op_destroy(op);
 
 	return ret;
@@ -1436,7 +1434,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 			  CEPH_OSD_FLAG_READ,
 			  op,
 			  NULL, 0,
-			  rbd_simple_req_cb, 0, NULL);
+			  rbd_simple_req_cb, NULL);
 
 	rbd_osd_req_op_destroy(op);
 
@@ -1469,7 +1467,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
  */
 static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 {
-	struct ceph_osd_request **linger_req = NULL;
 	struct ceph_osd_req_op *op;
 	int ret = 0;
 
@@ -1481,7 +1478,6 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 						&rbd_dev->watch_event);
 		if (ret < 0)
 			return ret;
-		linger_req = &rbd_dev->watch_request;
 	} else {
 		rbd_assert(rbd_dev->watch_request != NULL);
 	}
@@ -1493,7 +1489,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 		ret = rbd_req_sync_op(rbd_dev,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      op, rbd_dev->header_name,
-			      0, 0, NULL, linger_req, NULL);
+			      0, 0, NULL, NULL);
 
 	/* Cancel the event if we're tearing down, or on error */
 
@@ -1537,7 +1533,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
 
 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
 			       object_name, 0, inbound_size, inbound,
-			       NULL, ver);
+			       ver);
 
 	rbd_osd_req_op_destroy(op);
 
-- 
cgit v0.10.2


From e0b49868d3629708eda593b6739cb78f33ab238a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 9 Jan 2013 14:44:18 -0600
Subject: rbd: fix type of snap_id in rbd_dev_v2_snap_info()

The type of the snap_id local variable is defined with the
wrong byte order.  Fix that.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a3a11c3..007b726 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2802,7 +2802,7 @@ out:
 static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
 		u64 *snap_size, u64 *snap_features)
 {
-	__le64 snap_id;
+	u64 snap_id;
 	u8 order;
 	int ret;
 
-- 
cgit v0.10.2


From 98571b5aa776d4a69eadd7d4e5c9d4e69365ab9a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Sun, 20 Jan 2013 14:44:42 -0600
Subject: rbd: small changes

A few very minor changes to the rbd code:
    - RBD_MAX_OPT_LEN is unused, so get rid of it
    - Consolidate rbd options definitions
    - Make rbd_segment_name() return pointer to const char

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 007b726..4ed0741 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -69,7 +69,6 @@
 			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
 
 #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
-#define RBD_MAX_OPT_LEN		1024
 
 #define RBD_SNAP_HEAD_NAME	"-"
 
@@ -96,8 +95,6 @@
 #define DEV_NAME_LEN		32
 #define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
 
-#define RBD_READ_ONLY_DEFAULT		false
-
 /*
  * block device image metadata (in-memory version)
  */
@@ -156,10 +153,6 @@ struct rbd_spec {
 	struct kref	kref;
 };
 
-struct rbd_options {
-	bool	read_only;
-};
-
 /*
  * an instance of the client.  multiple devices may share an rbd client.
  */
@@ -475,6 +468,12 @@ static match_table_t rbd_opts_tokens = {
 	{-1, NULL}
 };
 
+struct rbd_options {
+	bool	read_only;
+};
+
+#define RBD_READ_ONLY_DEFAULT	false
+
 static int parse_rbd_opts_token(char *c, void *private)
 {
 	struct rbd_options *rbd_opts = private;
@@ -773,7 +772,7 @@ static void rbd_header_free(struct rbd_image_header *header)
 	header->snapc = NULL;
 }
 
-static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
+static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
 {
 	char *name;
 	u64 segment;
@@ -1338,7 +1337,7 @@ static int rbd_do_op(struct request *rq,
 		     struct rbd_req_coll *coll,
 		     int coll_index)
 {
-	char *seg_name;
+	const char *seg_name;
 	u64 seg_ofs;
 	u64 seg_len;
 	int ret;
-- 
cgit v0.10.2


From 38901e0f240b634467fb31743365af80a006be33 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 10 Jan 2013 12:56:58 -0600
Subject: rbd: check for overflow in rbd_get_num_segments()

The return type of rbd_get_num_segments() is int, but the values it
operates on are u64.  Although it's not likely, there's no guarantee
the result won't exceed what can be respresented in an int.  The
function is already designed to return -ERANGE on error, so just add
this possible overflow as another reason to return that.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4ed0741..58d01e3 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -820,6 +820,7 @@ static int rbd_get_num_segments(struct rbd_image_header *header,
 {
 	u64 start_seg;
 	u64 end_seg;
+	u64 result;
 
 	if (!len)
 		return 0;
@@ -829,7 +830,11 @@ static int rbd_get_num_segments(struct rbd_image_header *header,
 	start_seg = ofs >> header->obj_order;
 	end_seg = (ofs + len - 1) >> header->obj_order;
 
-	return end_seg - start_seg + 1;
+	result = end_seg - start_seg + 1;
+	if (result > (u64) INT_MAX)
+		return -ERANGE;
+
+	return (int) result;
 }
 
 /*
-- 
cgit v0.10.2


From c04306471ad93f1daf60771a0373316d4c3494ae Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Fri, 18 Jan 2013 12:31:09 -0600
Subject: rbd: don't retry setting up header watch

When an rbd image is initially mapped a watch event is registered so
we can do something if the header object changes.

The code that does this currently loops if initiating the watch
request results in an ERANGE error.  The osds will never return
ERANGE, so there's no reason to do this loop, so get rid of it.

This resolves:
    http://tracker.newdream.net/issues/3860

Note that the problem this loop was intended to solve is a race
between collecting image header information and setting up the watch
on the header object.  The real fix for that problem is described
here:
    http://tracker.newdream.net/issues/3871

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 58d01e3..6689363 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1474,6 +1474,9 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 	struct ceph_osd_req_op *op;
 	int ret = 0;
 
+	rbd_assert(start ^ !!rbd_dev->watch_event);
+	rbd_assert(start ^ !!rbd_dev->watch_request);
+
 	if (start) {
 		struct ceph_osd_client *osdc;
 
@@ -1482,8 +1485,6 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 						&rbd_dev->watch_event);
 		if (ret < 0)
 			return ret;
-	} else {
-		rbd_assert(rbd_dev->watch_request != NULL);
 	}
 
 	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
@@ -3023,22 +3024,6 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
 	device_unregister(&rbd_dev->dev);
 }
 
-static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
-{
-	int ret, rc;
-
-	do {
-		ret = rbd_req_sync_watch(rbd_dev, 1);
-		if (ret == -ERANGE) {
-			rc = rbd_dev_refresh(rbd_dev, NULL);
-			if (rc < 0)
-				return rc;
-		}
-	} while (ret == -ERANGE);
-
-	return ret;
-}
-
 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
 
 /*
@@ -3584,7 +3569,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
 	if (ret)
 		goto err_out_bus;
 
-	ret = rbd_init_watch_dev(rbd_dev);
+	ret = rbd_req_sync_watch(rbd_dev, 1);
 	if (ret)
 		goto err_out_bus;
 
-- 
cgit v0.10.2


From 1ec3911dbd19076bcdfe5540096ff67f91a6ec02 Mon Sep 17 00:00:00 2001
From: Cong Ding <dinggnu@gmail.com>
Date: Fri, 25 Jan 2013 17:48:59 -0600
Subject: libceph: fix undefined behavior when using snprintf()

The variable "str" is used as both the source and destination in
function snprintf(), which is undefined behavior based on C11. The
original description in C11 is:
	"If copying takes place between objects that
	overlap, the behavior is undefined."

And, the function of ceph_osdmap_state_str() is to return the osdmap
state, so it should return "doesn't exist" when all the conditions
are not satisfied. I fix it in this patch.

[elder@inktank.com: shortened the commit message]

Signed-off-by: Cong Ding <dinggnu@gmail.com>
Reviewed-by: Alex Elder <elder@inktank.com>

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 369f03b..3c61e216 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -13,26 +13,18 @@
 
 char *ceph_osdmap_state_str(char *str, int len, int state)
 {
-	int flag = 0;
-
 	if (!len)
-		goto done;
-
-	*str = '\0';
-	if (state) {
-		if (state & CEPH_OSD_EXISTS) {
-			snprintf(str, len, "exists");
-			flag = 1;
-		}
-		if (state & CEPH_OSD_UP) {
-			snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
-				 "up");
-			flag = 1;
-		}
-	} else {
+		return str;
+
+	if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
+		snprintf(str, len, "exists, up");
+	else if (state & CEPH_OSD_EXISTS)
+		snprintf(str, len, "exists");
+	else if (state & CEPH_OSD_UP)
+		snprintf(str, len, "up");
+	else
 		snprintf(str, len, "doesn't exist");
-	}
-done:
+
 	return str;
 }
 
-- 
cgit v0.10.2