Diffstat (limited to 'drivers/staging/lustre/lnet/klnds')
7 files changed, 184 insertions, 319 deletions
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
index 4f5978b..c7a5d49 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
@@ -128,6 +128,7 @@ static int kiblnd_msgtype2size(int type)
 static int kiblnd_unpack_rd(struct kib_msg *msg, int flip)
 {
 	struct kib_rdma_desc *rd;
+	int msg_size;
 	int nob;
 	int n;
 	int i;
@@ -146,12 +147,6 @@ static int kiblnd_unpack_rd(struct kib_msg *msg, int flip)
 
 	n = rd->rd_nfrags;
 
-	if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) {
-		CERROR("Bad nfrags: %d, should be 0 < n <= %d\n",
-		       n, IBLND_MAX_RDMA_FRAGS);
-		return 1;
-	}
-
 	nob = offsetof(struct kib_msg, ibm_u) +
 	      kiblnd_rd_msg_size(rd, msg->ibm_type, n);
 
@@ -161,6 +156,13 @@ static int kiblnd_unpack_rd(struct kib_msg *msg, int flip)
 		return 1;
 	}
 
+	msg_size = kiblnd_rd_size(rd);
+	if (msg_size <= 0 || msg_size > LNET_MAX_PAYLOAD) {
+		CERROR("Bad msg_size: %d, should be 0 < n <= %d\n",
+		       msg_size, LNET_MAX_PAYLOAD);
+		return 1;
+	}
+
 	if (!flip)
 		return 0;
 
@@ -618,7 +620,7 @@ static int kiblnd_get_completion_vector(struct kib_conn *conn, int cpt)
 }
 
 struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, struct rdma_cm_id *cmid,
-					int state, int version)
+				    int state, int version)
 {
 	/*
 	 * CAVEAT EMPTOR:
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
index 078a0c3..1457697 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
@@ -113,8 +113,9 @@ extern struct kib_tunables kiblnd_tunables;
 #define IBLND_OOB_CAPABLE(v)	((v) != IBLND_MSG_VERSION_1)
 #define IBLND_OOB_MSGS(v)	(IBLND_OOB_CAPABLE(v) ? 2 : 0)
 
-#define IBLND_MSG_SIZE		(4 << 10)	/* max size of queued messages (inc hdr) */
-#define IBLND_MAX_RDMA_FRAGS	LNET_MAX_IOV	/* max # of fragments supported */
+#define IBLND_FRAG_SHIFT	(PAGE_SHIFT - 12)	/* frag size on wire is in 4K units */
+#define IBLND_MSG_SIZE		(4 << 10)		/* max size of queued messages (inc hdr) */
+#define IBLND_MAX_RDMA_FRAGS	(LNET_MAX_PAYLOAD >> 12)/* max # of fragments supported in 4K size */
 
 /************************/
 /* derived constants... */
@@ -133,8 +134,8 @@ extern struct kib_tunables kiblnd_tunables;
 /* WRs and CQEs (per connection) */
 #define IBLND_RECV_WRS(c)	IBLND_RX_MSGS(c)
 #define IBLND_SEND_WRS(c) \
-	((c->ibc_max_frags + 1) * kiblnd_concurrent_sends(c->ibc_version, \
-							  c->ibc_peer->ibp_ni))
+	(((c->ibc_max_frags + 1) << IBLND_FRAG_SHIFT) * \
+	 kiblnd_concurrent_sends(c->ibc_version, c->ibc_peer->ibp_ni))
 #define IBLND_CQ_ENTRIES(c)	(IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c))
 
 struct kib_hca_dev;
@@ -582,6 +583,8 @@ struct kib_peer {
 	unsigned short ibp_connecting;
 	/* reconnect this peer later */
 	unsigned short ibp_reconnecting:1;
+	/* counter of how many times we triggered a conn race */
+	unsigned char ibp_races;
 	/* # consecutive reconnection attempts to this peer */
 	unsigned int ibp_reconnected;
 	/* errno on closing this peer */
@@ -607,14 +610,14 @@ kiblnd_cfg_rdma_frags(struct lnet_ni *ni)
 	tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
 	mod = tunables->lnd_map_on_demand;
-	return mod ? mod : IBLND_MAX_RDMA_FRAGS;
+	return mod ? mod : IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT;
 }
 
 static inline int
 kiblnd_rdma_frags(int version, struct lnet_ni *ni)
 {
 	return version == IBLND_MSG_VERSION_1 ?
-	       IBLND_MAX_RDMA_FRAGS :
+	       (IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT) :
 	       kiblnd_cfg_rdma_frags(ni);
 }
 
@@ -1034,5 +1037,4 @@ int kiblnd_post_rx(struct kib_rx *rx, int credit);
 int kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
 int kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
-		unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov,
-		unsigned int offset, unsigned int mlen, unsigned int rlen);
+		struct iov_iter *to, unsigned int rlen);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 596a697..b27de88 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -36,16 +36,19 @@
 
 #include "o2iblnd.h"
 
+#define MAX_CONN_RACES_BEFORE_ABORT 20
+
 static void kiblnd_peer_alive(struct kib_peer *peer);
 static void kiblnd_peer_connect_failed(struct kib_peer *peer, int active, int error);
-static void kiblnd_check_sends(struct kib_conn *conn);
 static void kiblnd_init_tx_msg(lnet_ni_t *ni, struct kib_tx *tx,
-			       int type, int body_nob);
+			       int type, int body_nob);
 static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
-			    int resid, struct kib_rdma_desc *dstrd, __u64 dstcookie);
+			    int resid, struct kib_rdma_desc *dstrd,
+			    __u64 dstcookie);
 static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn);
 static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn);
 static void kiblnd_unmap_tx(lnet_ni_t *ni, struct kib_tx *tx);
+static void kiblnd_check_sends_locked(struct kib_conn *conn);
 
 static void
 kiblnd_tx_done(lnet_ni_t *ni, struct kib_tx *tx)
@@ -211,9 +214,9 @@ kiblnd_post_rx(struct kib_rx *rx, int credit)
 		conn->ibc_outstanding_credits++;
 	else
 		conn->ibc_reserved_credits++;
+	kiblnd_check_sends_locked(conn);
 	spin_unlock(&conn->ibc_lock);
 
-	kiblnd_check_sends(conn);
 out:
 	kiblnd_conn_decref(conn);
 	return rc;
@@ -344,8 +347,8 @@ kiblnd_handle_rx(struct kib_rx *rx)
 		    !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
 			conn->ibc_outstanding_credits++;
 
+		kiblnd_check_sends_locked(conn);
 		spin_unlock(&conn->ibc_lock);
-		kiblnd_check_sends(conn);
 	}
 
 	switch (msg->ibm_type) {
@@ -648,7 +651,7 @@ static int kiblnd_map_tx(lnet_ni_t *ni, struct kib_tx *tx, struct kib_rdma_desc
 
 static int
 kiblnd_setup_rd_iov(lnet_ni_t *ni, struct kib_tx *tx, struct kib_rdma_desc *rd,
-		    unsigned int niov, struct kvec *iov, int offset, int nob)
+		    unsigned int niov, const struct kvec *iov, int offset, int nob)
 {
 	struct kib_net *net = ni->ni_data;
 	struct page *page;
@@ -705,7 +708,7 @@ kiblnd_setup_rd_iov(lnet_ni_t *ni, struct kib_tx *tx, struct kib_rdma_desc *rd,
 
 static int
 kiblnd_setup_rd_kiov(lnet_ni_t *ni, struct kib_tx *tx, struct kib_rdma_desc *rd,
-		     int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+		     int nkiov, const lnet_kiov_t *kiov, int offset, int nob)
 {
 	struct kib_net *net = ni->ni_data;
 	struct scatterlist *sg;
@@ -717,8 +720,8 @@ kiblnd_setup_rd_kiov(lnet_ni_t *ni, struct kib_tx *tx, struct kib_rdma_desc *rd,
 	LASSERT(nkiov > 0);
 	LASSERT(net);
 
-	while (offset >= kiov->kiov_len) {
-		offset -= kiov->kiov_len;
+	while (offset >= kiov->bv_len) {
+		offset -= kiov->bv_len;
 		nkiov--;
 		kiov++;
 		LASSERT(nkiov > 0);
@@ -728,10 +731,10 @@ kiblnd_setup_rd_kiov(lnet_ni_t *ni, struct kib_tx *tx, struct kib_rdma_desc *rd,
 	do {
 		LASSERT(nkiov > 0);
 
-		fragnob = min((int)(kiov->kiov_len - offset), nob);
+		fragnob = min((int)(kiov->bv_len - offset), nob);
 
-		sg_set_page(sg, kiov->kiov_page, fragnob,
-			    kiov->kiov_offset + offset);
+		sg_set_page(sg, kiov->bv_page, fragnob,
+			    kiov->bv_offset + offset);
 		sg = sg_next(sg);
 		if (!sg) {
 			CERROR("lacking enough sg entries to map tx\n");
@@ -761,7 +764,6 @@ kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit)
 	LASSERT(tx->tx_queued);
 	/* We rely on this for QP sizing */
 	LASSERT(tx->tx_nwrq > 0);
-	LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags);
 
 	LASSERT(!credit || credit == 1);
 	LASSERT(conn->ibc_outstanding_credits >= 0);
@@ -800,7 +802,7 @@ kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit)
 		    conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
 			/*
 			 * OK to drop when posted enough NOOPs, since
-			 * kiblnd_check_sends will queue NOOP again when
+			 * kiblnd_check_sends_locked will queue NOOP again when
 			 * posted NOOPs complete
 			 */
 			spin_unlock(&conn->ibc_lock);
@@ -905,7 +907,7 @@ kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit)
 }
 
 static void
-kiblnd_check_sends(struct kib_conn *conn)
+kiblnd_check_sends_locked(struct kib_conn *conn)
 {
 	int ver = conn->ibc_version;
 	lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
@@ -918,8 +920,6 @@ kiblnd_check_sends(struct kib_conn *conn)
 		return;
 	}
 
-	spin_lock(&conn->ibc_lock);
-
 	LASSERT(conn->ibc_nsends_posted <= kiblnd_concurrent_sends(ver, ni));
 	LASSERT(!IBLND_OOB_CAPABLE(ver) ||
 		conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
@@ -969,8 +969,6 @@
 		if (kiblnd_post_tx_locked(conn, tx, credit))
 			break;
 	}
-
-	spin_unlock(&conn->ibc_lock);
 }
 
 static void
@@ -1016,16 +1014,11 @@ kiblnd_tx_complete(struct kib_tx *tx, int status)
 	if (idle)
 		list_del(&tx->tx_list);
 
-	kiblnd_conn_addref(conn); /* 1 ref for me.... */
-
+	kiblnd_check_sends_locked(conn);
 	spin_unlock(&conn->ibc_lock);
 
 	if (idle)
 		kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);
-
-	kiblnd_check_sends(conn);
-
-	kiblnd_conn_decref(conn); /* ...until here */
 }
 
 static void
@@ -1078,6 +1071,15 @@ kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
 	LASSERT(type == IBLND_MSG_GET_DONE ||
 		type == IBLND_MSG_PUT_DONE);
 
+	if (kiblnd_rd_size(srcrd) > conn->ibc_max_frags << PAGE_SHIFT) {
+		CERROR("RDMA is too large for peer %s (%d), src size: %d dst size: %d\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+		       conn->ibc_max_frags << PAGE_SHIFT,
+		       kiblnd_rd_size(srcrd), kiblnd_rd_size(dstrd));
+		rc = -EMSGSIZE;
+		goto too_big;
+	}
+
 	while (resid > 0) {
 		if (srcidx >= srcrd->rd_nfrags) {
 			CERROR("Src buffer exhausted: %d frags\n", srcidx);
@@ -1091,10 +1093,10 @@ kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
 			break;
 		}
 
-		if (tx->tx_nwrq >= conn->ibc_max_frags) {
+		if (tx->tx_nwrq >= IBLND_MAX_RDMA_FRAGS) {
 			CERROR("RDMA has too many fragments for peer %s (%d), src idx/frags: %d/%d dst idx/frags: %d/%d\n",
 			       libcfs_nid2str(conn->ibc_peer->ibp_nid),
-			       conn->ibc_max_frags,
+			       IBLND_MAX_RDMA_FRAGS,
 			       srcidx, srcrd->rd_nfrags,
 			       dstidx, dstrd->rd_nfrags);
 			rc = -EMSGSIZE;
@@ -1132,7 +1134,7 @@ kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
 		wrq++;
 		sge++;
 	}
-
+too_big:
 	if (rc < 0) /* no RDMA if completing with failure */
 		tx->tx_nwrq = 0;
 
@@ -1204,9 +1206,8 @@ kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn)
 {
 	spin_lock(&conn->ibc_lock);
 	kiblnd_queue_tx_locked(tx, conn);
+	kiblnd_check_sends_locked(conn);
 	spin_unlock(&conn->ibc_lock);
-
-	kiblnd_check_sends(conn);
 }
 
 static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
@@ -1499,6 +1500,7 @@ kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 	lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
 	unsigned int payload_offset = lntmsg->msg_offset;
 	unsigned int payload_nob = lntmsg->msg_len;
+	struct iov_iter from;
 	struct kib_msg *ibmsg;
 	struct kib_rdma_desc *rd;
 	struct kib_tx *tx;
@@ -1518,6 +1520,17 @@ kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 	/* payload is either all vaddrs or all pages */
 	LASSERT(!(payload_kiov && payload_iov));
 
+	if (payload_kiov)
+		iov_iter_bvec(&from, ITER_BVEC | WRITE,
+			      payload_kiov, payload_niov,
+			      payload_nob + payload_offset);
+	else
+		iov_iter_kvec(&from, ITER_KVEC | WRITE,
+			      payload_iov, payload_niov,
+			      payload_nob + payload_offset);
+
+	iov_iter_advance(&from, payload_offset);
+
 	switch (type) {
 	default:
 		LBUG();
@@ -1637,17 +1650,8 @@ kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 	ibmsg = tx->tx_msg;
 	ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
 
-	if (payload_kiov)
-		lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg,
-				    offsetof(struct kib_msg, ibm_u.immediate.ibim_payload),
-				    payload_niov, payload_kiov,
-				    payload_offset, payload_nob);
-	else
-		lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg,
-				   offsetof(struct kib_msg, ibm_u.immediate.ibim_payload),
-				   payload_niov, payload_iov,
-				   payload_offset, payload_nob);
-
+	copy_from_iter(&ibmsg->ibm_u.immediate.ibim_payload, IBLND_MSG_SIZE,
+		       &from);
 	nob = offsetof(struct kib_immediate_msg, ibim_payload[payload_nob]);
 	kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
 
@@ -1719,8 +1723,7 @@ kiblnd_reply(lnet_ni_t *ni, struct kib_rx *rx, lnet_msg_t *lntmsg)
 
 int
 kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
-	    unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov,
-	    unsigned int offset, unsigned int mlen, unsigned int rlen)
+	    struct iov_iter *to, unsigned int rlen)
 {
 	struct kib_rx *rx = private;
 	struct kib_msg *rxmsg = rx->rx_msg;
@@ -1730,10 +1733,9 @@ kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
 	int post_credit = IBLND_POSTRX_PEER_CREDIT;
 	int rc = 0;
 
-	LASSERT(mlen <= rlen);
+	LASSERT(iov_iter_count(to) <= rlen);
 	LASSERT(!in_interrupt());
 	/* Either all pages or all vaddrs */
-	LASSERT(!(kiov && iov));
 
 	switch (rxmsg->ibm_type) {
 	default:
@@ -1749,16 +1751,8 @@ kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
 			break;
 		}
 
-		if (kiov)
-			lnet_copy_flat2kiov(niov, kiov, offset,
-					    IBLND_MSG_SIZE, rxmsg,
-					    offsetof(struct kib_msg, ibm_u.immediate.ibim_payload),
-					    mlen);
-		else
-			lnet_copy_flat2iov(niov, iov, offset,
-					   IBLND_MSG_SIZE, rxmsg,
-					   offsetof(struct kib_msg, ibm_u.immediate.ibim_payload),
-					   mlen);
+		copy_to_iter(&rxmsg->ibm_u.immediate.ibim_payload,
+			     IBLND_MSG_SIZE, to);
 		lnet_finalize(ni, lntmsg, 0);
 		break;
 
@@ -1766,7 +1760,7 @@ kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
 		struct kib_msg *txmsg;
 		struct kib_rdma_desc *rd;
 
-		if (!mlen) {
+		if (!iov_iter_count(to)) {
 			lnet_finalize(ni, lntmsg, 0);
 			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
 					       rxmsg->ibm_u.putreq.ibprm_cookie);
@@ -1784,12 +1778,16 @@ kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
 
 		txmsg = tx->tx_msg;
 		rd = &txmsg->ibm_u.putack.ibpam_rd;
-		if (!kiov)
+		if (!(to->type & ITER_BVEC))
 			rc = kiblnd_setup_rd_iov(ni, tx, rd,
-						 niov, iov, offset, mlen);
+						 to->nr_segs, to->kvec,
+						 to->iov_offset,
+						 iov_iter_count(to));
 		else
 			rc = kiblnd_setup_rd_kiov(ni, tx, rd,
-						  niov, kiov, offset, mlen);
+						  to->nr_segs, to->bvec,
+						  to->iov_offset,
+						  iov_iter_count(to));
 		if (rc) {
 			CERROR("Can't setup PUT sink for %s: %d\n",
 			       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
@@ -2183,14 +2181,11 @@ kiblnd_connreq_done(struct kib_conn *conn, int status)
 		return;
 	}
 
-	/**
-	 * refcount taken by cmid is not reliable after I released the glock
-	 * because this connection is visible to other threads now, another
-	 * thread can find and close this connection right after I released
-	 * the glock, if kiblnd_cm_callback for RDMA_CM_EVENT_DISCONNECTED is
-	 * called, it can release the connection refcount taken by cmid.
-	 * It means the connection could be destroyed before I finish my
-	 * operations on it.
+	/*
+	 * +1 ref for myself, this connection is visible to other threads
+	 * now, refcount of peer:ibp_conns can be released by connection
+	 * close from either a different thread, or the calling of
+	 * kiblnd_check_sends_locked() below. See bz21911 for details.
 	 */
 	kiblnd_conn_addref(conn);
 	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
@@ -2202,10 +2197,9 @@ kiblnd_connreq_done(struct kib_conn *conn, int status)
 			kiblnd_queue_tx_locked(tx, conn);
 		}
+		kiblnd_check_sends_locked(conn);
 		spin_unlock(&conn->ibc_lock);
 
-		kiblnd_check_sends(conn);
-
 		/* schedule blocked rxs */
 		kiblnd_handle_early_rxs(conn);
 
@@ -2240,6 +2234,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
 	struct kib_rej rej;
 	int version = IBLND_MSG_VERSION;
 	unsigned long flags;
+	int max_frags;
 	int rc;
 	struct sockaddr_in *peer_addr;
 
@@ -2346,22 +2341,20 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
 		goto failed;
 	}
 
-	if (reqmsg->ibm_u.connparams.ibcp_max_frags >
-	    kiblnd_rdma_frags(version, ni)) {
-		CWARN("Can't accept conn from %s (version %x): max_frags %d too large (%d wanted)\n",
-		      libcfs_nid2str(nid), version,
-		      reqmsg->ibm_u.connparams.ibcp_max_frags,
+	max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT;
+	if (max_frags > kiblnd_rdma_frags(version, ni)) {
+		CWARN("Can't accept conn from %s (version %x): max message size %d is too large (%d wanted)\n",
+		      libcfs_nid2str(nid), version, max_frags,
 		      kiblnd_rdma_frags(version, ni));
 
 		if (version >= IBLND_MSG_VERSION)
 			rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
 
 		goto failed;
-	} else if (reqmsg->ibm_u.connparams.ibcp_max_frags <
-		   kiblnd_rdma_frags(version, ni) && !net->ibn_fmr_ps) {
-		CWARN("Can't accept conn from %s (version %x): max_frags %d incompatible without FMR pool (%d wanted)\n",
-		      libcfs_nid2str(nid), version,
-		      reqmsg->ibm_u.connparams.ibcp_max_frags,
+	} else if (max_frags < kiblnd_rdma_frags(version, ni) &&
+		   !net->ibn_fmr_ps) {
+		CWARN("Can't accept conn from %s (version %x): max message size %d incompatible without FMR pool (%d wanted)\n",
+		      libcfs_nid2str(nid), version, max_frags,
 		      kiblnd_rdma_frags(version, ni));
 
 		if (version == IBLND_MSG_VERSION)
@@ -2387,7 +2380,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
 	}
 
 	/* We have validated the peer's parameters so use those */
-	peer->ibp_max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags;
+	peer->ibp_max_frags = max_frags;
 	peer->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth;
 
 	write_lock_irqsave(g_lock, flags);
@@ -2419,23 +2412,37 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
 			goto failed;
 		}
 
-		/* tie-break connection race in favour of the higher NID */
+		/*
+		 * Tie-break connection race in favour of the higher NID.
+		 * If we keep running into a race condition multiple times,
+		 * we have to assume that the connection attempt with the
+		 * higher NID is stuck in a connecting state and will never
+		 * recover. As such, we pass through this if-block and let
+		 * the lower NID connection win so we can move forward.
+		 */
 		if (peer2->ibp_connecting &&
-		    nid < ni->ni_nid) {
+		    nid < ni->ni_nid && peer2->ibp_races <
+		    MAX_CONN_RACES_BEFORE_ABORT) {
+			peer2->ibp_races++;
 			write_unlock_irqrestore(g_lock, flags);
 
-			CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid));
+			CDEBUG(D_NET, "Conn race %s\n",
+			       libcfs_nid2str(peer2->ibp_nid));
 
 			kiblnd_peer_decref(peer);
 			rej.ibr_why = IBLND_REJECT_CONN_RACE;
 			goto failed;
 		}
-
+		if (peer2->ibp_races >= MAX_CONN_RACES_BEFORE_ABORT)
+			CNETERR("Conn race %s: unresolved after %d attempts, letting lower NID win\n",
+				libcfs_nid2str(peer2->ibp_nid),
+				MAX_CONN_RACES_BEFORE_ABORT);
 		/**
 		 * passive connection is allowed even this peer is waiting for
 		 * reconnection.
 		 */
 		peer2->ibp_reconnecting = 0;
+		peer2->ibp_races = 0;
 		peer2->ibp_accepting++;
 		kiblnd_peer_addref(peer2);
 
@@ -2494,7 +2501,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
 	kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
 			sizeof(ackmsg->ibm_u.connparams));
 	ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth;
-	ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags;
+	ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT;
 	ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
 
 	kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
@@ -2526,9 +2533,9 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
 
  failed:
 	if (ni) {
-		lnet_ni_decref(ni);
 		rej.ibr_cp.ibcp_queue_depth = kiblnd_msg_queue_size(version, ni);
 		rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni);
+		lnet_ni_decref(ni);
 	}
 
 	rej.ibr_version = version;
@@ -2556,7 +2563,7 @@ kiblnd_check_reconnect(struct kib_conn *conn, int version,
 
 	if (cp) {
 		msg_size = cp->ibcp_max_msg_size;
-		frag_num = cp->ibcp_max_frags;
+		frag_num = cp->ibcp_max_frags << IBLND_FRAG_SHIFT;
 		queue_dep = cp->ibcp_queue_depth;
 	}
 
@@ -2821,11 +2828,11 @@ kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob)
 		goto failed;
 	}
 
-	if (msg->ibm_u.connparams.ibcp_max_frags >
+	if ((msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT) >
 	    conn->ibc_max_frags) {
 		CERROR("%s has incompatible max_frags %d (<=%d wanted)\n",
 		       libcfs_nid2str(peer->ibp_nid),
-		       msg->ibm_u.connparams.ibcp_max_frags,
+		       msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT,
 		       conn->ibc_max_frags);
 		rc = -EPROTO;
 		goto failed;
@@ -2859,7 +2866,7 @@ kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob)
 	conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth;
 	conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth;
 	conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth;
-	conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags;
+	conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT;
 
 	LASSERT(conn->ibc_credits + conn->ibc_reserved_credits +
 		IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn));
@@ -2916,7 +2923,7 @@ kiblnd_active_connect(struct rdma_cm_id *cmid)
 	memset(msg, 0, sizeof(*msg));
 	kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
 	msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth;
-	msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags;
+	msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT;
 	msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
 
 	kiblnd_pack_msg(peer->ibp_ni, msg, version,
@@ -3233,7 +3240,11 @@ kiblnd_check_conns(int idx)
 	 */
 	list_for_each_entry_safe(conn, temp, &checksends, ibc_connd_list) {
 		list_del(&conn->ibc_connd_list);
-		kiblnd_check_sends(conn);
+
+		spin_lock(&conn->ibc_lock);
+		kiblnd_check_sends_locked(conn);
+		spin_unlock(&conn->ibc_lock);
+
 		kiblnd_conn_decref(conn);
 	}
 }
@@ -3419,6 +3430,12 @@ kiblnd_qp_event(struct ib_event *event, void *arg)
 	case IB_EVENT_COMM_EST:
 		CDEBUG(D_NET, "%s established\n",
 		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		/*
+		 * We received a packet but connection isn't established
+		 * probably handshake packet was lost, so free to
+		 * force make connection established
+		 */
+		rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST);
 		return;
 
 	default:
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
index 07ec540..cbc9a9c 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
@@ -1468,11 +1468,6 @@ ksocknal_close_conn_locked(struct ksock_conn *conn, int error)
 
 		conn->ksnc_route = NULL;
 
-#if 0		/* irrelevant with only eager routes */
-		/* make route least favourite */
-		list_del(&route->ksnr_list);
-		list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
-#endif
 		ksocknal_route_decref(route); /* drop conn's ref on route */
 	}
 
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
index a56632b..e6ca0cf 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
@@ -86,8 +86,6 @@ struct ksock_sched { /* per scheduler state */
 	int kss_nconns;				/* # connections assigned to
						 * this scheduler */
 	struct ksock_sched_info *kss_info;	/* owner of it */
-	struct page *kss_rx_scratch_pgs[LNET_MAX_IOV];
-	struct kvec kss_scratch_iov[LNET_MAX_IOV];
 };
 
 struct ksock_sched_info {
@@ -616,9 +614,7 @@ void ksocknal_shutdown(lnet_ni_t *ni);
 int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
 int ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
 int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
-		  int delayed, unsigned int niov,
-		  struct kvec *iov, lnet_kiov_t *kiov,
-		  unsigned int offset, unsigned int mlen, unsigned int rlen);
+		  int delayed, struct iov_iter *to, unsigned int rlen);
 int ksocknal_accept(lnet_ni_t *ni, struct socket *sock);
 
 int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port);
@@ -635,7 +631,7 @@ int ksocknal_close_peer_conns_locked(struct ksock_peer *peer,
 int ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why);
 int ksocknal_close_matching_conns(lnet_process_id_t id, __u32 ipaddr);
 struct ksock_conn *ksocknal_find_conn_locked(struct ksock_peer *peer,
-					     struct ksock_tx *tx, int nonblk);
+					     struct ksock_tx *tx, int nonblk);
 
 int ksocknal_launch_packet(lnet_ni_t *ni, struct ksock_tx *tx,
			   lnet_process_id_t id);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
index 303576d..c1c6f60 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
@@ -35,8 +35,8 @@ ksocknal_alloc_tx(int type, int size)
 	spin_lock(&ksocknal_data.ksnd_tx_lock);
 
 	if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
-		tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. \
-				next, struct ksock_tx, tx_list);
+		tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next,
+				struct ksock_tx, tx_list);
 		LASSERT(tx->tx_desc_size == size);
 		list_del(&tx->tx_list);
 	}
@@ -164,13 +164,13 @@ ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx)
 	do {
 		LASSERT(tx->tx_nkiov > 0);
 
-		if (nob < (int)kiov->kiov_len) {
-			kiov->kiov_offset += nob;
-			kiov->kiov_len -= nob;
+		if (nob < (int)kiov->bv_len) {
+			kiov->bv_offset += nob;
+			kiov->bv_len -= nob;
 			return rc;
 		}
 
-		nob -= (int)kiov->kiov_len;
+		nob -= (int)kiov->bv_len;
 		tx->tx_kiov = ++kiov;
 		tx->tx_nkiov--;
 	} while (nob);
@@ -326,13 +326,13 @@ ksocknal_recv_kiov(struct ksock_conn *conn)
 	do {
 		LASSERT(conn->ksnc_rx_nkiov > 0);
 
-		if (nob < (int)kiov->kiov_len) {
-			kiov->kiov_offset += nob;
-			kiov->kiov_len -= nob;
+		if (nob < (int)kiov->bv_len) {
+			kiov->bv_offset += nob;
+			kiov->bv_len -= nob;
 			return -EAGAIN;
 		}
 
-		nob -= kiov->kiov_len;
+		nob -= kiov->bv_len;
 		conn->ksnc_rx_kiov = ++kiov;
 		conn->ksnc_rx_nkiov--;
 	} while (nob);
@@ -1325,39 +1325,36 @@ ksocknal_process_receive(struct ksock_conn *conn)
 
 int
 ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
-	      unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov,
-	      unsigned int offset, unsigned int mlen, unsigned int rlen)
+	      struct iov_iter *to, unsigned int rlen)
 {
 	struct ksock_conn *conn = private;
 	struct ksock_sched *sched = conn->ksnc_scheduler;
 
-	LASSERT(mlen <= rlen);
-	LASSERT(niov <= LNET_MAX_IOV);
+	LASSERT(iov_iter_count(to) <= rlen);
+	LASSERT(to->nr_segs <= LNET_MAX_IOV);
 
 	conn->ksnc_cookie = msg;
-	conn->ksnc_rx_nob_wanted = mlen;
+	conn->ksnc_rx_nob_wanted = iov_iter_count(to);
 	conn->ksnc_rx_nob_left = rlen;
 
-	if (!mlen || iov) {
+	if (to->type & ITER_KVEC) {
 		conn->ksnc_rx_nkiov = 0;
 		conn->ksnc_rx_kiov = NULL;
 		conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
 		conn->ksnc_rx_niov =
 			lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov,
-					 niov, iov, offset, mlen);
+					 to->nr_segs, to->kvec,
+					 to->iov_offset, iov_iter_count(to));
 	} else {
 		conn->ksnc_rx_niov = 0;
 		conn->ksnc_rx_iov = NULL;
 		conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
 		conn->ksnc_rx_nkiov =
 			lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov,
-					  niov, kiov, offset, mlen);
+					  to->nr_segs, to->bvec,
+					  to->iov_offset, iov_iter_count(to));
 	}
 
-	LASSERT(mlen ==
-		lnet_iov_nob(conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
-		lnet_kiov_nob(conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
-
 	LASSERT(conn->ksnc_rx_scheduled);
 
 	spin_lock_bh(&sched->kss_lock);
@@ -2008,13 +2005,6 @@ ksocknal_connect(struct ksock_route *route)
 		list_splice_init(&peer->ksnp_tx_queue, &zombies);
 	}
 
-#if 0		/* irrelevant with only eager routes */
-	if (!route->ksnr_deleted) {
-		/* make this route least-favourite for re-selection */
-		list_del(&route->ksnr_list);
-		list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
-	}
-#endif
 	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
 
 	ksocknal_peer_failed(peer);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
index 6a17757..6c95e98 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
@@ -73,9 +73,9 @@ ksocknal_lib_zc_capable(struct ksock_conn *conn)
 int
 ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx)
 {
+	struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
 	struct socket *sock = conn->ksnc_sock;
-	int nob;
-	int rc;
+	int nob, i;
 
 	if (*ksocknal_tunables.ksnd_enable_csum &&	 /* checksum enabled */
 	    conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */
@@ -83,34 +83,16 @@ ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx)
 	    !tx->tx_msg.ksm_csum)			 /* not checksummed */
 		ksocknal_lib_csum_tx(tx);
 
-	/*
-	 * NB we can't trust socket ops to either consume our iovs
-	 * or leave them alone.
-	 */
-	{
-#if SOCKNAL_SINGLE_FRAG_TX
-		struct kvec scratch;
-		struct kvec *scratchiov = &scratch;
-		unsigned int niov = 1;
-#else
-		struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
-		unsigned int niov = tx->tx_niov;
-#endif
-		struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
-		int i;
+	for (nob = i = 0; i < tx->tx_niov; i++)
+		nob += tx->tx_iov[i].iov_len;
 
-		for (nob = i = 0; i < niov; i++) {
-			scratchiov[i] = tx->tx_iov[i];
-			nob += scratchiov[i].iov_len;
-		}
+	if (!list_empty(&conn->ksnc_tx_queue) ||
+	    nob < tx->tx_resid)
+		msg.msg_flags |= MSG_MORE;
 
-		if (!list_empty(&conn->ksnc_tx_queue) ||
-		    nob < tx->tx_resid)
-			msg.msg_flags |= MSG_MORE;
-
-		rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob);
-	}
-	return rc;
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC,
+		      tx->tx_iov, tx->tx_niov, nob);
+	return sock_sendmsg(sock, &msg);
 }
 
 int
@@ -124,20 +106,16 @@ ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx)
 	/* Not NOOP message */
 	LASSERT(tx->tx_lnetmsg);
 
-	/*
-	 * NB we can't trust socket ops to either consume our iovs
-	 * or leave them alone.
-	 */
 	if (tx->tx_msg.ksm_zc_cookies[0]) {
 		/* Zero copy is enabled */
 		struct sock *sk = sock->sk;
-		struct page *page = kiov->kiov_page;
-		int offset = kiov->kiov_offset;
-		int fragsize = kiov->kiov_len;
+		struct page *page = kiov->bv_page;
+		int offset = kiov->bv_offset;
+		int fragsize = kiov->bv_len;
 		int msgflg = MSG_DONTWAIT;
 
 		CDEBUG(D_NET, "page %p + offset %x for %d\n",
-		       page, offset, kiov->kiov_len);
+		       page, offset, kiov->bv_len);
 
 		if (!list_empty(&conn->ksnc_tx_queue) ||
 		    fragsize < tx->tx_resid)
@@ -150,34 +128,19 @@ ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx)
 			rc = tcp_sendpage(sk, page, offset, fragsize, msgflg);
 		}
 	} else {
-#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
-		struct kvec scratch;
-		struct kvec *scratchiov = &scratch;
-		unsigned int niov = 1;
-#else
-#ifdef CONFIG_HIGHMEM
-#warning "XXX risk of kmap deadlock on multiple frags..."
-#endif
-		struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
-		unsigned int niov = tx->tx_nkiov;
-#endif
 		struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
 		int i;
 
-		for (nob = i = 0; i < niov; i++) {
-			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
-						 kiov[i].kiov_offset;
-			nob += scratchiov[i].iov_len = kiov[i].kiov_len;
-		}
+		for (nob = i = 0; i < tx->tx_nkiov; i++)
+			nob += kiov[i].bv_len;
 
 		if (!list_empty(&conn->ksnc_tx_queue) ||
 		    nob < tx->tx_resid)
 			msg.msg_flags |= MSG_MORE;
 
-		rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob);
-
-		for (i = 0; i < niov; i++)
-			kunmap(kiov[i].kiov_page);
+		iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC,
+			      kiov, tx->tx_nkiov, nob);
+		rc = sock_sendmsg(sock, &msg);
 	}
 	return rc;
 }
@@ -201,14 +164,7 @@ ksocknal_lib_eager_ack(struct ksock_conn *conn)
 int
 ksocknal_lib_recv_iov(struct ksock_conn *conn)
 {
-#if SOCKNAL_SINGLE_FRAG_RX
-	struct kvec scratch;
-	struct kvec *scratchiov = &scratch;
-	unsigned int niov = 1;
-#else
-	struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
 	unsigned int niov = conn->ksnc_rx_niov;
-#endif
 	struct kvec *iov = conn->ksnc_rx_iov;
 	struct msghdr msg = {
 		.msg_flags = 0
@@ -220,20 +176,15 @@ ksocknal_lib_recv_iov(struct ksock_conn *conn)
 	int sum;
 	__u32 saved_csum;
 
-	/*
-	 * NB we can't trust socket ops to either consume our iovs
-	 * or leave them alone.
-	 */
 	LASSERT(niov > 0);
 
-	for (nob = i = 0; i < niov; i++) {
-		scratchiov[i] = iov[i];
-		nob += scratchiov[i].iov_len;
-	}
+	for (nob = i = 0; i < niov; i++)
+		nob += iov[i].iov_len;
+
 	LASSERT(nob <= conn->ksnc_rx_nob_wanted);
 
-	rc = kernel_recvmsg(conn->ksnc_sock, &msg, scratchiov, niov, nob,
-			    MSG_DONTWAIT);
+	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, niov, nob);
+	rc = sock_recvmsg(conn->ksnc_sock, &msg, MSG_DONTWAIT);
 
 	saved_csum = 0;
 	if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
@@ -259,67 +210,10 @@ ksocknal_lib_recv_iov(struct ksock_conn *conn)
 	return rc;
 }
 
-static void
-ksocknal_lib_kiov_vunmap(void *addr)
-{
-	if (!addr)
-		return;
-
-	vunmap(addr);
-}
-
-static void *
-ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
-		       struct kvec *iov, struct page **pages)
-{
-	void *addr;
-	int nob;
-	int i;
-
-	if (!*ksocknal_tunables.ksnd_zc_recv || !pages)
-		return NULL;
-
-	LASSERT(niov <= LNET_MAX_IOV);
-
-	if (niov < 2 ||
-	    niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
-		return NULL;
-
-	for (nob = i = 0; i < niov; i++) {
-		if ((kiov[i].kiov_offset && i > 0) ||
-		    (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_SIZE && i < niov - 1))
-			return NULL;
-
-		pages[i] = kiov[i].kiov_page;
-		nob += kiov[i].kiov_len;
-	}
-
-	addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
-	if (!addr)
-		return NULL;
-
-	iov->iov_base = addr + kiov[0].kiov_offset;
-	iov->iov_len = nob;
-
-	return addr;
-}
-
 int
 ksocknal_lib_recv_kiov(struct ksock_conn *conn)
 {
-#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
-	struct kvec scratch;
-	struct kvec *scratchiov = &scratch;
-	struct page **pages = NULL;
-	unsigned int niov = 1;
-#else
-#ifdef CONFIG_HIGHMEM
-#warning "XXX risk of kmap deadlock on multiple frags..."
-#endif
-	struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
-	struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs;
 	unsigned int niov = conn->ksnc_rx_nkiov;
-#endif
 	lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
 	struct msghdr msg = {
 		.msg_flags = 0
@@ -328,63 +222,32 @@ ksocknal_lib_recv_kiov(struct ksock_conn *conn)
 	int i;
 	int rc;
 	void *base;
-	void *addr;
 	int sum;
 	int fragnob;
-	int n;
-
-	/*
-	 * NB we can't trust socket ops to either consume our iovs
-	 * or leave them alone.
-	 */
-	addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages);
-	if (addr) {
-		nob = scratchiov[0].iov_len;
-		n = 1;
-	} else {
-		for (nob = i = 0; i < niov; i++) {
-			nob += scratchiov[i].iov_len = kiov[i].kiov_len;
-			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
-						 kiov[i].kiov_offset;
-		}
-		n = niov;
-	}
 
+	for (nob = i = 0; i < niov; i++)
+		nob += kiov[i].bv_len;
+
 	LASSERT(nob <= conn->ksnc_rx_nob_wanted);
 
-	rc = kernel_recvmsg(conn->ksnc_sock, &msg, (struct kvec *)scratchiov,
-			    n, nob, MSG_DONTWAIT);
+	iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, kiov, niov, nob);
+	rc = sock_recvmsg(conn->ksnc_sock, &msg, MSG_DONTWAIT);
 
 	if (conn->ksnc_msg.ksm_csum) {
 		for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
 			LASSERT(i < niov);
 
-			/*
-			 * Dang! have to kmap again because I have nowhere to
-			 * stash the mapped address.  But by doing it while the
-			 * page is still mapped, the kernel just bumps the map
-			 * count and returns me the address it stashed.
-			 */
-			base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
-			fragnob = kiov[i].kiov_len;
+			base = kmap(kiov[i].bv_page) + kiov[i].bv_offset;
+			fragnob = kiov[i].bv_len;
 			if (fragnob > sum)
 				fragnob = sum;
 
 			conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
 							   base, fragnob);
-			kunmap(kiov[i].kiov_page);
+			kunmap(kiov[i].bv_page);
 		}
 	}
-
-	if (addr) {
-		ksocknal_lib_kiov_vunmap(addr);
-	} else {
-		for (i = 0; i < niov; i++)
-			kunmap(kiov[i].kiov_page);
-	}
-
 	return rc;
 }
 
@@ -406,12 +269,12 @@ ksocknal_lib_csum_tx(struct ksock_tx *tx)
 
 	if (tx->tx_kiov) {
 		for (i = 0; i < tx->tx_nkiov; i++) {
-			base = kmap(tx->tx_kiov[i].kiov_page) +
-			       tx->tx_kiov[i].kiov_offset;
+			base = kmap(tx->tx_kiov[i].bv_page) +
+			       tx->tx_kiov[i].bv_offset;
 
-			csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len);
+			csum = ksocknal_csum(csum, base, tx->tx_kiov[i].bv_len);
 
-			kunmap(tx->tx_kiov[i].kiov_page);
+			kunmap(tx->tx_kiov[i].bv_page);
 		}
 	} else {
 		for (i = 1; i < tx->tx_niov; i++)
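
Note on the iov_iter conversion above: ksocknal_recv() and kiblnd_recv() now take a single struct iov_iter in place of the old niov/iov/kiov/offset/mlen argument bundle, and the socklnd library paths build msg.msg_iter and call sock_sendmsg()/sock_recvmsg() directly, which is what allows the per-scheduler kss_scratch_iov/kss_rx_scratch_pgs buffers and the kmap()/vmap() copies to be deleted. A minimal sketch of the receive-side pattern, using the same 4.10-era iov_iter calls the patch uses (lnd_recv_demo() and its parameters are illustrative, not from the patch):

	#include <linux/uio.h>
	#include <linux/net.h>

	/* Point msg.msg_iter at the destination page fragments and let the
	 * socket layer walk them; no scratch kvec and no kmap() per page.
	 */
	static int lnd_recv_demo(struct socket *sock, struct bio_vec *frags,
				 unsigned int nfrags, size_t nob)
	{
		struct msghdr msg = { .msg_flags = 0 };

		/* READ: data flows from the socket into the pages */
		iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, frags, nfrags, nob);
		return sock_recvmsg(sock, &msg, MSG_DONTWAIT);
	}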
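Note on IBLND_FRAG_SHIFT: ibcp_max_frags is now exchanged on the wire in 4K units rather than PAGE_SIZE units, so peers whose page sizes differ (for example 4K x86 against 64K PowerPC) negotiate the same maximum RDMA size. Every fragment count is shifted by IBLND_FRAG_SHIFT (PAGE_SHIFT - 12) as it crosses the wire boundary; the patch open-codes the shifts, but the conversion amounts to the following (helper names are illustrative, not from the patch):

	/* Wire values are 4K fragments, local values are PAGE_SIZE fragments.
	 * With 4K pages IBLND_FRAG_SHIFT is 0 and both helpers are no-ops;
	 * with 64K pages one local fragment becomes 16 wire fragments.
	 */
	static inline int kiblnd_frags_to_wire(int nfrags)
	{
		return nfrags << IBLND_FRAG_SHIFT;	/* pages -> 4K units */
	}

	static inline int kiblnd_frags_from_wire(int nfrags)
	{
		return nfrags >> IBLND_FRAG_SHIFT;	/* 4K units -> pages */
	}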
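Note on the kiblnd_check_sends() to kiblnd_check_sends_locked() rename: callers now kick the send queue while still holding ibc_lock instead of letting the helper drop and retake it, which also removes the temporary connection refcount kiblnd_tx_complete() previously needed. The calling convention, as kiblnd_queue_tx() now uses it:

	spin_lock(&conn->ibc_lock);
	kiblnd_queue_tx_locked(tx, conn);	/* queue under ibc_lock... */
	kiblnd_check_sends_locked(conn);	/* ...and post sends before dropping it */
	spin_unlock(&conn->ibc_lock);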