From 98bdb0aa007ff7e8e0061936d8d0e210faf2e655 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 25 Jan 2011 08:17:48 -0800 Subject: libceph: fix socket read error handling If we get EAGAIN when trying to read from the socket, it is not an error. Return 0 from the helper in this case to simplify the error handling cases in the caller (indirectly, try_read). Fix try_read to pass any error to it's caller (con_work) instead of almost always returning 0. This let's us respond to things like socket disconnects. Signed-off-by: Sage Weil diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index dff633d..d95576d 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -252,8 +252,12 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) { struct kvec iov = {buf, len}; struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; - return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); + r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; } /* @@ -1821,19 +1825,17 @@ more: dout("try_read connecting\n"); ret = read_partial_banner(con); if (ret <= 0) - goto done; - if (process_banner(con) < 0) { - ret = -1; goto out; - } + ret = process_banner(con); + if (ret < 0) + goto out; } ret = read_partial_connect(con); if (ret <= 0) - goto done; - if (process_connect(con) < 0) { - ret = -1; goto out; - } + ret = process_connect(con); + if (ret < 0) + goto out; goto more; } @@ -1848,7 +1850,7 @@ more: dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); ret = ceph_tcp_recvmsg(con->sock, buf, skip); if (ret <= 0) - goto done; + goto out; con->in_base_pos += ret; if (con->in_base_pos) goto more; @@ -1859,7 +1861,7 @@ more: */ ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); if (ret <= 0) - goto done; + goto out; dout("try_read got tag %d\n", (int)con->in_tag); switch (con->in_tag) { case CEPH_MSGR_TAG_MSG: @@ -1870,7 +1872,7 @@ more: break; case CEPH_MSGR_TAG_CLOSE: set_bit(CLOSED, &con->state); /* fixme */ - goto done; + goto out; default: goto bad_tag; } @@ -1882,13 +1884,12 @@ more: case -EBADMSG: con->error_msg = "bad crc"; ret = -EIO; - goto out; + break; case -EIO: con->error_msg = "io error"; - goto out; - default: - goto done; + break; } + goto out; } if (con->in_tag == CEPH_MSGR_TAG_READY) goto more; @@ -1898,15 +1899,13 @@ more: if (con->in_tag == CEPH_MSGR_TAG_ACK) { ret = read_partial_ack(con); if (ret <= 0) - goto done; + goto out; process_ack(con); goto more; } -done: - ret = 0; out: - dout("try_read done on %p\n", con); + dout("try_read done on %p ret %d\n", con, ret); return ret; bad_tag: -- cgit v0.10.2 From 42961d2333a1855c649fa3790e258ab4f0fa66a4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 25 Jan 2011 08:19:34 -0800 Subject: libceph: fix socket write error handling Pass errors from writing to the socket up the stack. If we get -EAGAIN, return 0 from the helper to simplify the callers' checks. Signed-off-by: Sage Weil diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d95576d..35b36b8 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -268,13 +268,17 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, size_t kvlen, size_t len, int more) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; if (more) msg.msg_flags |= MSG_MORE; else msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - return kernel_sendmsg(sock, &msg, iov, kvlen, len); + r = kernel_sendmsg(sock, &msg, iov, kvlen, len); + if (r == -EAGAIN) + r = 0; + return r; } @@ -851,6 +855,8 @@ static int write_partial_msg_pages(struct ceph_connection *con) (msg->pages || msg->pagelist || msg->bio || in_trail)) kunmap(page); + if (ret == -EAGAIN) + ret = 0; if (ret <= 0) goto out; @@ -1741,16 +1747,12 @@ more_kvec: if (con->out_skip) { ret = write_partial_skip(con); if (ret <= 0) - goto done; - if (ret < 0) { - dout("try_write write_partial_skip err %d\n", ret); - goto done; - } + goto out; } if (con->out_kvec_left) { ret = write_partial_kvec(con); if (ret <= 0) - goto done; + goto out; } /* msg pages? */ @@ -1765,11 +1767,11 @@ more_kvec: if (ret == 1) goto more_kvec; /* we need to send the footer, too! */ if (ret == 0) - goto done; + goto out; if (ret < 0) { dout("try_write write_partial_msg_pages err %d\n", ret); - goto done; + goto out; } } @@ -1793,10 +1795,9 @@ do_next: /* Nothing to do! */ clear_bit(WRITE_PENDING, &con->state); dout("try_write nothing else to write.\n"); -done: ret = 0; out: - dout("try_write done on %p\n", con); + dout("try_write done on %p ret %d\n", con, ret); return ret; } -- cgit v0.10.2 From e8e1ba96b207deba1339b09983f8b29f92cb1497 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Feb 2011 20:45:58 -0800 Subject: ceph: queue cap_snaps once per realm We were forming a dirty list, and then queueing cap_snaps for each realm _and_ its children, regardless of whether the children were already in the dirty list. This meant we did it twice for some realms. Which in turn meant we corrupted mdsc->snap_flush_list when the cap_snap was re-added to the list it was already on, and could trigger an infinite loop. We were also using recursion to do reach all the children, a no-no when stack is limited. Instead, (re)queue any children on the dirty list, avoiding processing anything twice and avoiding any recursion. Signed-off-by: Sage Weil diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 39c243a..f40b913 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -584,10 +584,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) if (lastinode) iput(lastinode); - dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino); - list_for_each_entry(child, &realm->children, child_item) - queue_realm_cap_snaps(child); + list_for_each_entry(child, &realm->children, child_item) { + dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", + realm, realm->ino, child, child->ino); + list_del_init(&child->dirty_item); + list_add(&child->dirty_item, &realm->dirty_item); + } + list_del_init(&realm->dirty_item); dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); } @@ -683,7 +687,9 @@ more: * queue cap snaps _after_ we've built the new snap contexts, * so that i_head_snapc can be set appropriately. */ - list_for_each_entry(realm, &dirty_realms, dirty_item) { + while (!list_empty(&dirty_realms)) { + realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, + dirty_item); queue_realm_cap_snaps(realm); } -- cgit v0.10.2 From 97d79b403ef03f729883246208ef5d8a2ebc4d68 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Tue, 18 Jan 2011 13:37:28 -0800 Subject: ceph: keep reference to parent inode on ceph_dentry When creating a new dentry we now hold a reference to the parent inode in the ceph_dentry. This is required due to the new RCU changes from 949854d0, which set dentry->d_parent to NULL in d_kill before calling the ->release() callback. If/when that behavior is changed, we can revert this hack. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 562f988..6bfaa6a 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -60,6 +60,7 @@ int ceph_init_dentry(struct dentry *dentry) } di->dentry = dentry; di->lease_session = NULL; + di->parent_inode = igrab(dentry->d_parent->d_inode); dentry->d_fsdata = di; dentry->d_time = jiffies; ceph_dentry_lru_add(dentry); @@ -1025,7 +1026,7 @@ static void ceph_dentry_release(struct dentry *dentry) u64 snapid = CEPH_NOSNAP; if (!IS_ROOT(dentry)) { - parent_inode = dentry->d_parent->d_inode; + parent_inode = di->parent_inode; if (parent_inode) snapid = ceph_snap(parent_inode); } @@ -1050,6 +1051,8 @@ static void ceph_dentry_release(struct dentry *dentry) kmem_cache_free(ceph_dentry_cachep, di); dentry->d_fsdata = NULL; } + if (parent_inode) + iput(parent_inode); } static int ceph_snapdir_d_revalidate(struct dentry *dentry, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6e08266..c01aa64 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -207,6 +207,7 @@ struct ceph_dentry_info { struct dentry *dentry; u64 time; u64 offset; + struct inode *parent_inode; }; struct ceph_inode_xattrs_info { -- cgit v0.10.2