summary refs log tree commit diff
path: root/net/rds
diff options
context:
space:
mode:
Diffstat (limited to 'net/rds')
-rw-r--r-- net/rds/af_rds.c 43
-rw-r--r-- net/rds/bind.c 4
-rw-r--r-- net/rds/connection.c 17
-rw-r--r-- net/rds/ib_cm.c 13
-rw-r--r-- net/rds/rds.h 6
-rw-r--r-- net/rds/tcp_connect.c 1
-rw-r--r-- net/rds/tcp_listen.c 46
-rw-r--r-- net/rds/transport.c 21
8 files changed, 141 insertions, 10 deletions
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 1044337..2ad9032 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -270,6 +270,28 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
return ret;
}
+static int rds_set_transport(struct rds_sock *rs, char __user *optval,
+ int optlen)
+{
+ int t_type; /* transport type copied in from optval */
+ /* Attach rs to the transport type given in optval, if rs has none yet. */
+ if (rs->rs_transport)
+ return -EOPNOTSUPP; /* previously attached to transport */
+
+ if (optlen != sizeof(int)) /* option value must be exactly one int */
+ return -EINVAL;
+
+ if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type)))
+ return -EFAULT; /* copy_from_user() reported a failure */
+
+ if (t_type < 0 || t_type >= RDS_TRANS_COUNT)
+ return -EINVAL; /* outside the range [0, RDS_TRANS_COUNT) */
+
+ rs->rs_transport = rds_trans_get(t_type);
+
+ return rs->rs_transport ? 0 : -ENOPROTOOPT; /* NULL: rds_trans_get() found nothing */
+}
+
static int rds_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
@@ -300,6 +322,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
case RDS_CONG_MONITOR:
ret = rds_cong_monitor(rs, optval, optlen);
break;
+ case SO_RDS_TRANSPORT:
+ lock_sock(sock->sk);
+ ret = rds_set_transport(rs, optval, optlen);
+ release_sock(sock->sk);
+ break;
default:
ret = -ENOPROTOOPT;
}
@@ -312,6 +339,7 @@ static int rds_getsockopt(struct socket *sock, int level, int optname,
{
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
int ret = -ENOPROTOOPT, len;
+ int trans;
if (level != SOL_RDS)
goto out;
@@ -337,6 +365,19 @@ static int rds_getsockopt(struct socket *sock, int level, int optname,
else
ret = 0;
break;
+ case SO_RDS_TRANSPORT:
+ if (len < sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
+ trans = (rs->rs_transport ? rs->rs_transport->t_type :
+ RDS_TRANS_NONE); /* unbound */
+ if (put_user(trans, (int __user *)optval) ||
+ put_user(sizeof(int), optlen))
+ ret = -EFAULT;
+ else
+ ret = 0;
+ break;
default:
break;
}
@@ -440,7 +481,7 @@ static int rds_create(struct net *net, struct socket *sock, int protocol,
if (sock->type != SOCK_SEQPACKET || protocol)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
+ sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern);
if (!sk)
return -ENOMEM;
diff --git a/net/rds/bind.c b/net/rds/bind.c
index a2e6562..4ebd29c 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -181,6 +181,10 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (ret)
goto out;
+ if (rs->rs_transport) { /* previously bound */
+ ret = 0;
+ goto out;
+ }
trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 14f04139..da6da57 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -126,7 +126,10 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *loop_trans;
unsigned long flags;
int ret;
+ struct rds_transport *otrans = trans;
+ if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
+ goto new_conn;
rcu_read_lock();
conn = rds_conn_lookup(head, laddr, faddr, trans);
if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
@@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
if (conn)
goto out;
+new_conn:
conn = kmem_cache_zalloc(rds_conn_slab, gfp);
if (!conn) {
conn = ERR_PTR(-ENOMEM);
@@ -230,13 +234,22 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
/* Creating normal conn */
struct rds_connection *found;
- found = rds_conn_lookup(head, laddr, faddr, trans);
+ if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
+ found = NULL;
+ else
+ found = rds_conn_lookup(head, laddr, faddr, trans);
if (found) {
trans->conn_free(conn->c_transport_data);
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
- hlist_add_head_rcu(&conn->c_hash_node, head);
+ if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
+ (otrans->t_type != RDS_TRANS_TCP)) {
+ /* Only the active side should be added to
+ * reconnect list for TCP.
+ */
+ hlist_add_head_rcu(&conn->c_hash_node, head);
+ }
rds_cong_add_conn(conn);
rds_conn_count++;
}
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 31b74f5..8a09ee7 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -183,8 +183,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
- if (dp && dp->dp_ack_seq)
- rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+ if (dp) {
+ /* dp structure start is not guaranteed to be 8 bytes aligned.
+ * Since dp_ack_seq is 64-bit extended load operations can be
+ * used so go through get_unaligned to avoid unaligned errors.
+ */
+ __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
+
+ if (dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
+ NULL);
+ }
rds_connect_complete(conn);
}
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0d41155..a33fb4a 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -408,11 +408,6 @@ struct rds_notifier {
* should try hard not to block.
*/
-#define RDS_TRANS_IB 0
-#define RDS_TRANS_IWARP 1
-#define RDS_TRANS_TCP 2
-#define RDS_TRANS_COUNT 3
-
struct rds_transport {
char t_name[TRANSNAMSIZ];
struct list_head t_item;
@@ -803,6 +798,7 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr);
void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
+struct rds_transport *rds_trans_get(int t_type);
int rds_trans_init(void);
void rds_trans_exit(void);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index f9f564a..973109c7 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk)
case TCP_ESTABLISHED:
rds_connect_complete(conn);
break;
+ case TCP_CLOSE_WAIT:
case TCP_CLOSE:
rds_conn_drop(conn);
default:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 23ab4dcd..0da49e3 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work);
static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
static struct socket *rds_tcp_listen_sock;
+static int rds_tcp_keepalive(struct socket *sock)
+{
+ /* values below based on xs_udp_default_timeout */
+ int keepidle = 5; /* send a probe 'keepidle' secs after last data */
+ int keepcnt = 5; /* number of unack'ed probes before declaring dead */
+ int keepalive = 1; /* value passed for SO_KEEPALIVE */
+ int ret = 0; /* always overwritten by the first kernel_setsockopt() below */
+ /* Set SO_KEEPALIVE on sock, then the probe count, idle time and interval. */
+ ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *)&keepalive, sizeof(keepalive));
+ if (ret < 0)
+ goto bail; /* stop at the first option that fails */
+
+ ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
+ (char *)&keepcnt, sizeof(keepcnt));
+ if (ret < 0)
+ goto bail;
+
+ ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
+ (char *)&keepidle, sizeof(keepidle));
+ if (ret < 0)
+ goto bail;
+
+ /* KEEPINTVL is the interval between successive probes. We follow
+ * the model in xs_tcp_finish_connecting() and re-use keepidle.
+ */
+ ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
+ (char *)&keepidle, sizeof(keepidle));
+bail:
+ return ret; /* result of the last kernel_setsockopt() call made */
+}
+
static int rds_tcp_accept_one(struct socket *sock)
{
struct socket *new_sock = NULL;
struct rds_connection *conn;
int ret;
struct inet_sock *inet;
+ struct rds_tcp_connection *rs_tcp;
ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
@@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock)
if (ret < 0)
goto out;
+ ret = rds_tcp_keepalive(new_sock);
+ if (ret < 0)
+ goto out;
+
rds_tcp_tune(new_sock);
inet = inet_sk(new_sock->sk);
@@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock)
ret = PTR_ERR(conn);
goto out;
}
+ /* An incoming SYN request came in, and TCP just accepted it.
+ * We always create a new conn for listen side of TCP, and do not
+ * add it to the c_hash_list.
+ *
+ * If the client reboots, this conn will need to be cleaned up.
+ * rds_tcp_state_change() will do that cleanup
+ */
+ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
+ WARN_ON(!rs_tcp || rs_tcp->t_sock);
/*
* see the comment above rds_queue_delayed_reconnect()
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 7f2ac4f..8b4a6cd 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -101,6 +101,27 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr)
return ret;
}
+struct rds_transport *rds_trans_get(int t_type)
+{
+ struct rds_transport *ret = NULL; /* stays NULL when nothing matches */
+ struct rds_transport *trans;
+ unsigned int i;
+ /* Scan transports[] for an entry whose t_type equals the argument. */
+ down_read(&rds_trans_sem); /* held for reading across the whole scan */
+ for (i = 0; i < RDS_TRANS_COUNT; i++) {
+ trans = transports[i];
+ /* skip NULL slots, other types, and owners try_module_get() refuses */
+ if (trans && trans->t_type == t_type &&
+ (!trans->t_owner || try_module_get(trans->t_owner))) {
+ ret = trans;
+ break; /* first match wins */
+ }
+ }
+ up_read(&rds_trans_sem);
+
+ return ret; /* matching transport, or NULL */
+}
+
/*
* This returns the number of stats entries in the snapshot and only
* copies them using the iter if there is enough space for them. The