diff options
Diffstat (limited to 'net')
146 files changed, 2280 insertions, 1361 deletions
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index bced8c0..7bc2208 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -108,9 +108,7 @@ struct p9_poll_wait { * @unsent_req_list: accounting for requests that haven't been sent * @req: current request being processed (if any) * @tmp_buf: temporary buffer to read in header - * @rsize: amount to read for current frame - * @rpos: read position in current frame - * @rbuf: current read buffer + * @rc: temporary fcall for reading current frame * @wpos: write position for current frame * @wsize: amount of data to write for current frame * @wbuf: current write buffer @@ -131,9 +129,7 @@ struct p9_conn { struct list_head unsent_req_list; struct p9_req_t *req; char tmp_buf[7]; - int rsize; - int rpos; - char *rbuf; + struct p9_fcall rc; int wpos; int wsize; char *wbuf; @@ -305,69 +301,77 @@ static void p9_read_work(struct work_struct *work) if (m->err < 0) return; - p9_debug(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos); + p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset); - if (!m->rbuf) { - m->rbuf = m->tmp_buf; - m->rpos = 0; - m->rsize = 7; /* start by reading header */ + if (!m->rc.sdata) { + m->rc.sdata = m->tmp_buf; + m->rc.offset = 0; + m->rc.capacity = 7; /* start by reading header */ } clear_bit(Rpending, &m->wsched); - p9_debug(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n", - m, m->rpos, m->rsize, m->rsize-m->rpos); - err = p9_fd_read(m->client, m->rbuf + m->rpos, - m->rsize - m->rpos); + p9_debug(P9_DEBUG_TRANS, "read mux %p pos %zd size: %zd = %zd\n", + m, m->rc.offset, m->rc.capacity, + m->rc.capacity - m->rc.offset); + err = p9_fd_read(m->client, m->rc.sdata + m->rc.offset, + m->rc.capacity - m->rc.offset); p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err); - if (err == -EAGAIN) { + if (err == -EAGAIN) goto end_clear; - } if (err <= 0) goto error; - m->rpos += err; + m->rc.offset += err; - if ((!m->req) && (m->rpos == m->rsize)) { /* header read in */ - u16 tag; + /* header read in */ + if ((!m->req) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new header\n"); - n = le32_to_cpu(*(__le32 *) m->rbuf); /* read packet size */ - if (n >= m->client->msize) { + err = p9_parse_header(&m->rc, NULL, NULL, NULL, 0); + if (err) { + p9_debug(P9_DEBUG_ERROR, + "error parsing header: %d\n", err); + goto error; + } + + if (m->rc.size >= m->client->msize) { p9_debug(P9_DEBUG_ERROR, - "requested packet size too big: %d\n", n); + "requested packet size too big: %d\n", + m->rc.size); err = -EIO; goto error; } - tag = le16_to_cpu(*(__le16 *) (m->rbuf+5)); /* read tag */ p9_debug(P9_DEBUG_TRANS, - "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag); + "mux %p pkt: size: %d bytes tag: %d\n", + m, m->rc.size, m->rc.tag); - m->req = p9_tag_lookup(m->client, tag); + m->req = p9_tag_lookup(m->client, m->rc.tag); if (!m->req || (m->req->status != REQ_STATUS_SENT)) { p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", - tag); + m->rc.tag); err = -EIO; goto error; } if (m->req->rc == NULL) { - m->req->rc = kmalloc(sizeof(struct p9_fcall) + - m->client->msize, GFP_NOFS); - if (!m->req->rc) { - m->req = NULL; - err = -ENOMEM; - goto error; - } + p9_debug(P9_DEBUG_ERROR, + "No recv fcall for tag %d (req %p), disconnecting!\n", + m->rc.tag, m->req); + m->req = NULL; + err = -EIO; + goto error; } - m->rbuf = (char *)m->req->rc + sizeof(struct p9_fcall); - memcpy(m->rbuf, m->tmp_buf, m->rsize); - m->rsize = n; + m->rc.sdata = (char *)m->req->rc + sizeof(struct p9_fcall); + memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity); + m->rc.capacity = m->rc.size; } - /* not an else because some packets (like clunk) have no payload */ - if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */ + /* packet is read in + * not an else because some packets (like clunk) have no payload + */ + if ((m->req) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new packet\n"); spin_lock(&m->client->lock); if (m->req->status != REQ_STATUS_ERROR) @@ -375,9 +379,9 @@ static void p9_read_work(struct work_struct *work) list_del(&m->req->req_list); spin_unlock(&m->client->lock); p9_client_cb(m->client, m->req, status); - m->rbuf = NULL; - m->rpos = 0; - m->rsize = 0; + m->rc.sdata = NULL; + m->rc.offset = 0; + m->rc.capacity = 0; m->req = NULL; } diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 199bc76..4acb1d5 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -658,7 +658,7 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args) mutex_unlock(&virtio_9p_lock); if (!found) { - pr_err("no channels available\n"); + pr_err("no channels available for device %s\n", devname); return ret; } diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index d5871ac..f066781 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1625,7 +1625,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) rt = atrtr_find(&at_hint); } - err = ENETUNREACH; + err = -ENETUNREACH; if (!rt) goto out; diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index d5d71ac..c24c481 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -127,21 +127,17 @@ batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw) } /* finally deinitialize the claim */ -static void batadv_claim_free_rcu(struct rcu_head *rcu) +static void batadv_claim_release(struct batadv_bla_claim *claim) { - struct batadv_bla_claim *claim; - - claim = container_of(rcu, struct batadv_bla_claim, rcu); - batadv_backbone_gw_free_ref(claim->backbone_gw); - kfree(claim); + kfree_rcu(claim, rcu); } /* free a claim, call claim_free_rcu if its the last reference */ static void batadv_claim_free_ref(struct batadv_bla_claim *claim) { if (atomic_dec_and_test(&claim->refcount)) - call_rcu(&claim->rcu, batadv_claim_free_rcu); + batadv_claim_release(claim); } /** diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index e6c8382..ccf70be 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -527,11 +527,12 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, * gets dereferenced. */ spin_lock_bh(&bat_priv->gw.list_lock); - hlist_del_init_rcu(&gw_node->list); + if (!hlist_unhashed(&gw_node->list)) { + hlist_del_init_rcu(&gw_node->list); + batadv_gw_node_free_ref(gw_node); + } spin_unlock_bh(&bat_priv->gw.list_lock); - batadv_gw_node_free_ref(gw_node); - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 01acccc..57f71071 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -76,6 +76,28 @@ out: } /** + * batadv_mutual_parents - check if two devices are each others parent + * @dev1: 1st net_device + * @dev2: 2nd net_device + * + * veth devices come in pairs and each is the parent of the other! + * + * Return: true if the devices are each others parent, otherwise false + */ +static bool batadv_mutual_parents(const struct net_device *dev1, + const struct net_device *dev2) +{ + int dev1_parent_iflink = dev_get_iflink(dev1); + int dev2_parent_iflink = dev_get_iflink(dev2); + + if (!dev1_parent_iflink || !dev2_parent_iflink) + return false; + + return (dev1_parent_iflink == dev2->ifindex) && + (dev2_parent_iflink == dev1->ifindex); +} + +/** * batadv_is_on_batman_iface - check if a device is a batman iface descendant * @net_dev: the device to check * @@ -108,6 +130,9 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) if (WARN(!parent_dev, "Cannot find parent device")) return false; + if (batadv_mutual_parents(net_dev, parent_dev)) + return false; + ret = batadv_is_on_batman_iface(parent_dev); return ret; diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 5a31420..7b12ea8 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -75,18 +75,6 @@ batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface) call_rcu(&hard_iface->rcu, batadv_hardif_free_rcu); } -/** - * batadv_hardif_free_ref_now - decrement the hard interface refcounter and - * possibly free it (without rcu callback) - * @hard_iface: the hard interface to free - */ -static inline void -batadv_hardif_free_ref_now(struct batadv_hard_iface *hard_iface) -{ - if (atomic_dec_and_test(&hard_iface->refcount)) - batadv_hardif_free_rcu(&hard_iface->rcu); -} - static inline struct batadv_hard_iface * batadv_primary_if_get_selected(struct batadv_priv *bat_priv) { diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index c98b0ab..cc63b44 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -203,28 +203,25 @@ void batadv_nc_init_orig(struct batadv_orig_node *orig_node) } /** - * batadv_nc_node_free_rcu - rcu callback to free an nc node and remove - * its refcount on the orig_node - * @rcu: rcu pointer of the nc node + * batadv_nc_node_release - release nc_node from lists and queue for free after + * rcu grace period + * @nc_node: the nc node to free */ -static void batadv_nc_node_free_rcu(struct rcu_head *rcu) +static void batadv_nc_node_release(struct batadv_nc_node *nc_node) { - struct batadv_nc_node *nc_node; - - nc_node = container_of(rcu, struct batadv_nc_node, rcu); batadv_orig_node_free_ref(nc_node->orig_node); - kfree(nc_node); + kfree_rcu(nc_node, rcu); } /** - * batadv_nc_node_free_ref - decrements the nc node refcounter and possibly - * frees it + * batadv_nc_node_free_ref - decrement the nc node refcounter and possibly + * release it * @nc_node: the nc node to free */ static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node) { if (atomic_dec_and_test(&nc_node->refcount)) - call_rcu(&nc_node->rcu, batadv_nc_node_free_rcu); + batadv_nc_node_release(nc_node); } /** diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index ae6d18c..fe578f7 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -163,148 +163,101 @@ err: } /** - * batadv_neigh_ifinfo_free_rcu - free the neigh_ifinfo object - * @rcu: rcu pointer of the neigh_ifinfo object - */ -static void batadv_neigh_ifinfo_free_rcu(struct rcu_head *rcu) -{ - struct batadv_neigh_ifinfo *neigh_ifinfo; - - neigh_ifinfo = container_of(rcu, struct batadv_neigh_ifinfo, rcu); - - if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT) - batadv_hardif_free_ref_now(neigh_ifinfo->if_outgoing); - - kfree(neigh_ifinfo); -} - -/** - * batadv_neigh_ifinfo_free_now - decrement the refcounter and possibly free - * the neigh_ifinfo (without rcu callback) + * batadv_neigh_ifinfo_release - release neigh_ifinfo from lists and queue for + * free after rcu grace period * @neigh_ifinfo: the neigh_ifinfo object to release */ static void -batadv_neigh_ifinfo_free_ref_now(struct batadv_neigh_ifinfo *neigh_ifinfo) +batadv_neigh_ifinfo_release(struct batadv_neigh_ifinfo *neigh_ifinfo) { - if (atomic_dec_and_test(&neigh_ifinfo->refcount)) - batadv_neigh_ifinfo_free_rcu(&neigh_ifinfo->rcu); + if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT) + batadv_hardif_free_ref(neigh_ifinfo->if_outgoing); + + kfree_rcu(neigh_ifinfo, rcu); } /** - * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly free + * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly release * the neigh_ifinfo * @neigh_ifinfo: the neigh_ifinfo object to release */ void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo) { if (atomic_dec_and_test(&neigh_ifinfo->refcount)) - call_rcu(&neigh_ifinfo->rcu, batadv_neigh_ifinfo_free_rcu); + batadv_neigh_ifinfo_release(neigh_ifinfo); } /** - * batadv_hardif_neigh_free_rcu - free the hardif neigh_node - * @rcu: rcu pointer of the neigh_node - */ -static void batadv_hardif_neigh_free_rcu(struct rcu_head *rcu) -{ - struct batadv_hardif_neigh_node *hardif_neigh; - - hardif_neigh = container_of(rcu, struct batadv_hardif_neigh_node, rcu); - - batadv_hardif_free_ref_now(hardif_neigh->if_incoming); - kfree(hardif_neigh); -} - -/** - * batadv_hardif_neigh_free_now - decrement the hardif neighbors refcounter - * and possibly free it (without rcu callback) + * batadv_hardif_neigh_release - release hardif neigh node from lists and + * queue for free after rcu grace period * @hardif_neigh: hardif neigh neighbor to free */ static void -batadv_hardif_neigh_free_now(struct batadv_hardif_neigh_node *hardif_neigh) +batadv_hardif_neigh_release(struct batadv_hardif_neigh_node *hardif_neigh) { - if (atomic_dec_and_test(&hardif_neigh->refcount)) { - spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); - hlist_del_init_rcu(&hardif_neigh->list); - spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); + spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); + hlist_del_init_rcu(&hardif_neigh->list); + spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); - batadv_hardif_neigh_free_rcu(&hardif_neigh->rcu); - } + batadv_hardif_free_ref(hardif_neigh->if_incoming); + kfree_rcu(hardif_neigh, rcu); } /** * batadv_hardif_neigh_free_ref - decrement the hardif neighbors refcounter - * and possibly free it + * and possibly release it * @hardif_neigh: hardif neigh neighbor to free */ void batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh) { - if (atomic_dec_and_test(&hardif_neigh->refcount)) { - spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); - hlist_del_init_rcu(&hardif_neigh->list); - spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); - - call_rcu(&hardif_neigh->rcu, batadv_hardif_neigh_free_rcu); - } + if (atomic_dec_and_test(&hardif_neigh->refcount)) + batadv_hardif_neigh_release(hardif_neigh); } /** - * batadv_neigh_node_free_rcu - free the neigh_node - * @rcu: rcu pointer of the neigh_node + * batadv_neigh_node_release - release neigh_node from lists and queue for + * free after rcu grace period + * @neigh_node: neigh neighbor to free */ -static void batadv_neigh_node_free_rcu(struct rcu_head *rcu) +static void batadv_neigh_node_release(struct batadv_neigh_node *neigh_node) { struct hlist_node *node_tmp; - struct batadv_neigh_node *neigh_node; struct batadv_hardif_neigh_node *hardif_neigh; struct batadv_neigh_ifinfo *neigh_ifinfo; struct batadv_algo_ops *bao; - neigh_node = container_of(rcu, struct batadv_neigh_node, rcu); bao = neigh_node->orig_node->bat_priv->bat_algo_ops; hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh_node->ifinfo_list, list) { - batadv_neigh_ifinfo_free_ref_now(neigh_ifinfo); + batadv_neigh_ifinfo_free_ref(neigh_ifinfo); } hardif_neigh = batadv_hardif_neigh_get(neigh_node->if_incoming, neigh_node->addr); if (hardif_neigh) { /* batadv_hardif_neigh_get() increases refcount too */ - batadv_hardif_neigh_free_now(hardif_neigh); - batadv_hardif_neigh_free_now(hardif_neigh); + batadv_hardif_neigh_free_ref(hardif_neigh); + batadv_hardif_neigh_free_ref(hardif_neigh); } if (bao->bat_neigh_free) bao->bat_neigh_free(neigh_node); - batadv_hardif_free_ref_now(neigh_node->if_incoming); + batadv_hardif_free_ref(neigh_node->if_incoming); - kfree(neigh_node); -} - -/** - * batadv_neigh_node_free_ref_now - decrement the neighbors refcounter - * and possibly free it (without rcu callback) - * @neigh_node: neigh neighbor to free - */ -static void -batadv_neigh_node_free_ref_now(struct batadv_neigh_node *neigh_node) -{ - if (atomic_dec_and_test(&neigh_node->refcount)) - batadv_neigh_node_free_rcu(&neigh_node->rcu); + kfree_rcu(neigh_node, rcu); } /** * batadv_neigh_node_free_ref - decrement the neighbors refcounter - * and possibly free it + * and possibly release it * @neigh_node: neigh neighbor to free */ void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node) { if (atomic_dec_and_test(&neigh_node->refcount)) - call_rcu(&neigh_node->rcu, batadv_neigh_node_free_rcu); + batadv_neigh_node_release(neigh_node); } /** @@ -733,108 +686,99 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) } /** - * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object - * @rcu: rcu pointer of the orig_ifinfo object + * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for + * free after rcu grace period + * @orig_ifinfo: the orig_ifinfo object to release */ -static void batadv_orig_ifinfo_free_rcu(struct rcu_head *rcu) +static void batadv_orig_ifinfo_release(struct batadv_orig_ifinfo *orig_ifinfo) { - struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *router; - orig_ifinfo = container_of(rcu, struct batadv_orig_ifinfo, rcu); - if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) - batadv_hardif_free_ref_now(orig_ifinfo->if_outgoing); + batadv_hardif_free_ref(orig_ifinfo->if_outgoing); /* this is the last reference to this object */ router = rcu_dereference_protected(orig_ifinfo->router, true); if (router) - batadv_neigh_node_free_ref_now(router); - kfree(orig_ifinfo); + batadv_neigh_node_free_ref(router); + + kfree_rcu(orig_ifinfo, rcu); } /** - * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free - * the orig_ifinfo (without rcu callback) + * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly release + * the orig_ifinfo * @orig_ifinfo: the orig_ifinfo object to release */ -static void -batadv_orig_ifinfo_free_ref_now(struct batadv_orig_ifinfo *orig_ifinfo) +void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo) { if (atomic_dec_and_test(&orig_ifinfo->refcount)) - batadv_orig_ifinfo_free_rcu(&orig_ifinfo->rcu); + batadv_orig_ifinfo_release(orig_ifinfo); } /** - * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free - * the orig_ifinfo - * @orig_ifinfo: the orig_ifinfo object to release + * batadv_orig_node_free_rcu - free the orig_node + * @rcu: rcu pointer of the orig_node */ -void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo) +static void batadv_orig_node_free_rcu(struct rcu_head *rcu) { - if (atomic_dec_and_test(&orig_ifinfo->refcount)) - call_rcu(&orig_ifinfo->rcu, batadv_orig_ifinfo_free_rcu); + struct batadv_orig_node *orig_node; + + orig_node = container_of(rcu, struct batadv_orig_node, rcu); + + batadv_mcast_purge_orig(orig_node); + + batadv_frag_purge_orig(orig_node, NULL); + + if (orig_node->bat_priv->bat_algo_ops->bat_orig_free) + orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node); + + kfree(orig_node->tt_buff); + kfree(orig_node); } -static void batadv_orig_node_free_rcu(struct rcu_head *rcu) +/** + * batadv_orig_node_release - release orig_node from lists and queue for + * free after rcu grace period + * @orig_node: the orig node to free + */ +static void batadv_orig_node_release(struct batadv_orig_node *orig_node) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; - struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; - orig_node = container_of(rcu, struct batadv_orig_node, rcu); - spin_lock_bh(&orig_node->neigh_list_lock); /* for all neighbors towards this originator ... */ hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { hlist_del_rcu(&neigh_node->list); - batadv_neigh_node_free_ref_now(neigh_node); + batadv_neigh_node_free_ref(neigh_node); } hlist_for_each_entry_safe(orig_ifinfo, node_tmp, &orig_node->ifinfo_list, list) { hlist_del_rcu(&orig_ifinfo->list); - batadv_orig_ifinfo_free_ref_now(orig_ifinfo); + batadv_orig_ifinfo_free_ref(orig_ifinfo); } spin_unlock_bh(&orig_node->neigh_list_lock); - batadv_mcast_purge_orig(orig_node); - /* Free nc_nodes */ batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); - batadv_frag_purge_orig(orig_node, NULL); - - if (orig_node->bat_priv->bat_algo_ops->bat_orig_free) - orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node); - - kfree(orig_node->tt_buff); - kfree(orig_node); + call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } /** * batadv_orig_node_free_ref - decrement the orig node refcounter and possibly - * schedule an rcu callback for freeing it + * release it * @orig_node: the orig node to free */ void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node) { if (atomic_dec_and_test(&orig_node->refcount)) - call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); -} - -/** - * batadv_orig_node_free_ref_now - decrement the orig node refcounter and - * possibly free it (without rcu callback) - * @orig_node: the orig node to free - */ -void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node) -{ - if (atomic_dec_and_test(&orig_node->refcount)) - batadv_orig_node_free_rcu(&orig_node->rcu); + batadv_orig_node_release(orig_node); } void batadv_originator_free(struct batadv_priv *bat_priv) diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 2955775..cf07304 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -38,7 +38,6 @@ int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node); -void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_hardif_neigh_node * diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index a22080c..0e80fd1 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -240,20 +240,6 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, return count; } -static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) -{ - struct batadv_tt_orig_list_entry *orig_entry; - - orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); - - /* We are in an rcu callback here, therefore we cannot use - * batadv_orig_node_free_ref() and its call_rcu(): - * An rcu_barrier() wouldn't wait for that to finish - */ - batadv_orig_node_free_ref_now(orig_entry->orig_node); - kfree(orig_entry); -} - /** * batadv_tt_local_size_mod - change the size by v of the local table identified * by vid @@ -317,9 +303,11 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, if (atomic_add_return(v, &vlan->tt.num_entries) == 0) { spin_lock_bh(&orig_node->vlan_list_lock); - hlist_del_init_rcu(&vlan->list); + if (!hlist_unhashed(&vlan->list)) { + hlist_del_init_rcu(&vlan->list); + batadv_orig_node_vlan_free_ref(vlan); + } spin_unlock_bh(&orig_node->vlan_list_lock); - batadv_orig_node_vlan_free_ref(vlan); } batadv_orig_node_vlan_free_ref(vlan); @@ -349,13 +337,25 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, batadv_tt_global_size_mod(orig_node, vid, -1); } +/** + * batadv_tt_orig_list_entry_release - release tt orig entry from lists and + * queue for free after rcu grace period + * @orig_entry: tt orig entry to be free'd + */ +static void +batadv_tt_orig_list_entry_release(struct batadv_tt_orig_list_entry *orig_entry) +{ + batadv_orig_node_free_ref(orig_entry->orig_node); + kfree_rcu(orig_entry, rcu); +} + static void batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry) { if (!atomic_dec_and_test(&orig_entry->refcount)) return; - call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu); + batadv_tt_orig_list_entry_release(orig_entry); } /** diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index d040365..8a4cc2f 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -307,6 +307,9 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, /* check that it's our buffer */ if (lowpan_is_ipv6(*skb_network_header(skb))) { + /* Pull off the 1-byte of 6lowpan header. */ + skb_pull(skb, 1); + /* Copy the packet so that the IPv6 header is * properly aligned. */ @@ -317,6 +320,7 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; + local_skb->dev = dev; skb_set_transport_header(local_skb, sizeof(struct ipv6hdr)); @@ -335,6 +339,8 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, if (!local_skb) goto drop; + local_skb->dev = dev; + ret = iphc_decompress(local_skb, dev, chan); if (ret < 0) { kfree_skb(local_skb); @@ -343,7 +349,6 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; - local_skb->dev = dev; if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 47bcef754..883c821 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -4112,8 +4112,10 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, break; } - *req_complete = bt_cb(skb)->hci.req_complete; - *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; + if (bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) + *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; + else + *req_complete = bt_cb(skb)->hci.req_complete; kfree_skb(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 41b5f38..c78ee2d 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -688,21 +688,29 @@ static u8 update_white_list(struct hci_request *req) * command to remove it from the controller. */ list_for_each_entry(b, &hdev->le_white_list, list) { - struct hci_cp_le_del_from_white_list cp; + /* If the device is neither in pend_le_conns nor + * pend_le_reports then remove it from the whitelist. + */ + if (!hci_pend_le_action_lookup(&hdev->pend_le_conns, + &b->bdaddr, b->bdaddr_type) && + !hci_pend_le_action_lookup(&hdev->pend_le_reports, + &b->bdaddr, b->bdaddr_type)) { + struct hci_cp_le_del_from_white_list cp; + + cp.bdaddr_type = b->bdaddr_type; + bacpy(&cp.bdaddr, &b->bdaddr); - if (hci_pend_le_action_lookup(&hdev->pend_le_conns, - &b->bdaddr, b->bdaddr_type) || - hci_pend_le_action_lookup(&hdev->pend_le_reports, - &b->bdaddr, b->bdaddr_type)) { - white_list_entries++; + hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, + sizeof(cp), &cp); continue; } - cp.bdaddr_type = b->bdaddr_type; - bacpy(&cp.bdaddr, &b->bdaddr); + if (hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) { + /* White list can not be used with RPAs */ + return 0x00; + } - hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, - sizeof(cp), &cp); + white_list_entries++; } /* Since all no longer valid white list entries have been diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 39a5149..eb4f5f2 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -197,10 +197,20 @@ int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm) chan->sport = psm; err = 0; } else { - u16 p; + u16 p, start, end, incr; + + if (chan->src_type == BDADDR_BREDR) { + start = L2CAP_PSM_DYN_START; + end = L2CAP_PSM_AUTO_END; + incr = 2; + } else { + start = L2CAP_PSM_LE_DYN_START; + end = L2CAP_PSM_LE_DYN_END; + incr = 1; + } err = -EINVAL; - for (p = 0x1001; p < 0x1100; p += 2) + for (p = start; p <= end; p += incr) if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src)) { chan->psm = cpu_to_le16(p); chan->sport = cpu_to_le16(p); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 1bb5515..e4cae72 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -58,7 +58,7 @@ static int l2cap_validate_bredr_psm(u16 psm) return -EINVAL; /* Restrict usage of well-known PSMs */ - if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) + if (psm < L2CAP_PSM_DYN_START && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; return 0; @@ -67,11 +67,11 @@ static int l2cap_validate_bredr_psm(u16 psm) static int l2cap_validate_le_psm(u16 psm) { /* Valid LE_PSM ranges are defined only until 0x00ff */ - if (psm > 0x00ff) + if (psm > L2CAP_PSM_LE_DYN_END) return -EINVAL; /* Restrict fixed, SIG assigned PSM values to CAP_NET_BIND_SERVICE */ - if (psm <= 0x007f && !capable(CAP_NET_BIND_SERVICE)) + if (psm < L2CAP_PSM_LE_DYN_START && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; return 0; @@ -125,6 +125,9 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) goto done; } + bacpy(&chan->src, &la.l2_bdaddr); + chan->src_type = la.l2_bdaddr_type; + if (la.l2_cid) err = l2cap_add_scid(chan, __le16_to_cpu(la.l2_cid)); else @@ -156,9 +159,6 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) break; } - bacpy(&chan->src, &la.l2_bdaddr); - chan->src_type = la.l2_bdaddr_type; - if (chan->psm && bdaddr_type_is_le(chan->src_type)) chan->mode = L2CAP_MODE_LE_FLOWCTL; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index ffed8a1..4b175df 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1072,22 +1072,6 @@ static void smp_notify_keys(struct l2cap_conn *conn) hcon->dst_type = smp->remote_irk->addr_type; queue_work(hdev->workqueue, &conn->id_addr_update_work); } - - /* When receiving an indentity resolving key for - * a remote device that does not use a resolvable - * private address, just remove the key so that - * it is possible to use the controller white - * list for scanning. - * - * Userspace will have been told to not store - * this key at this point. So it is safe to - * just remove it. - */ - if (!bacmp(&smp->remote_irk->rpa, BDADDR_ANY)) { - list_del_rcu(&smp->remote_irk->list); - kfree_rcu(smp->remote_irk, rcu); - smp->remote_irk = NULL; - } } if (smp->csrk) { diff --git a/net/bridge/br.c b/net/bridge/br.c index a1abe49..3addc05 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -121,6 +121,7 @@ static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; +/* called with RTNL */ static int br_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -130,7 +131,6 @@ static int br_switchdev_event(struct notifier_block *unused, struct switchdev_notifier_fdb_info *fdb_info; int err = NOTIFY_DONE; - rtnl_lock(); p = br_port_get_rtnl(dev); if (!p) goto out; @@ -155,7 +155,6 @@ static int br_switchdev_event(struct notifier_block *unused, } out: - rtnl_unlock(); return err; } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 5e88d3e..2c8095a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -28,6 +28,8 @@ const struct nf_br_ops __rcu *nf_br_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_br_ops); +static struct lock_class_key bridge_netdev_addr_lock_key; + /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -87,6 +89,11 @@ out: return NETDEV_TX_OK; } +static void br_set_lockdep_class(struct net_device *dev) +{ + lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); +} + static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -99,6 +106,7 @@ static int br_dev_init(struct net_device *dev) err = br_vlan_init(br); if (err) free_percpu(br->stats); + br_set_lockdep_class(dev); return err; } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 82e3e97..dcea4f4 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -723,6 +723,8 @@ int br_fdb_dump(struct sk_buff *skb, struct net_bridge_fdb_entry *f; hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { + int err; + if (idx < cb->args[0]) goto skip; @@ -741,12 +743,15 @@ int br_fdb_dump(struct sk_buff *skb, if (!filter_dev && f->dst) goto skip; - if (fdb_fill_info(skb, br, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI) < 0) + err = fdb_fill_info(skb, br, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI); + if (err < 0) { + cb->args[1] = err; break; + } skip: ++idx; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 30e105f..74c278e 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -425,8 +425,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, mp = br_mdb_ip_get(mdb, group); if (!mp) { mp = br_multicast_new_group(br, port, group); - err = PTR_ERR(mp); - if (IS_ERR(mp)) + err = PTR_ERR_OR_ZERO(mp); + if (err) return err; } diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index 61d7617..b82440e 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -159,7 +159,7 @@ static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt) tmppkt = NULL; /* Verify that length is correct */ - err = EPROTO; + err = -EPROTO; if (rfml->pdu_size != cfpkt_getlen(pkt) - RFM_HEAD_SIZE + 1) goto out; } diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 10d87753..9e43a31 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -152,7 +152,6 @@ static int process_one_ticket(struct ceph_auth_client *ac, void *ticket_buf = NULL; void *tp, *tpend; void **ptp; - struct ceph_timespec new_validity; struct ceph_crypto_key new_session_key; struct ceph_buffer *new_ticket_blob; unsigned long new_expires, new_renew_after; @@ -193,8 +192,8 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; - ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); - ceph_decode_timespec(&validity, &new_validity); + ceph_decode_timespec(&validity, dp); + dp += sizeof(struct ceph_timespec); new_expires = get_seconds() + validity.tv_sec; new_renew_after = new_expires - (validity.tv_sec / 4); dout(" expires=%lu renew_after=%lu\n", new_expires, @@ -233,10 +232,10 @@ static int process_one_ticket(struct ceph_auth_client *ac, ceph_buffer_put(th->ticket_blob); th->session_key = new_session_key; th->ticket_blob = new_ticket_blob; - th->validity = new_validity; th->secret_id = new_secret_id; th->expires = new_expires; th->renew_after = new_renew_after; + th->have_key = true; dout(" got ticket service %d (%s) secret_id %lld len %d\n", type, ceph_entity_type_name(type), th->secret_id, (int)th->ticket_blob->vec.iov_len); @@ -384,6 +383,24 @@ bad: return -ERANGE; } +static bool need_key(struct ceph_x_ticket_handler *th) +{ + if (!th->have_key) + return true; + + return get_seconds() >= th->renew_after; +} + +static bool have_key(struct ceph_x_ticket_handler *th) +{ + if (th->have_key) { + if (get_seconds() >= th->expires) + th->have_key = false; + } + + return th->have_key; +} + static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) { int want = ac->want_keys; @@ -402,20 +419,18 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) continue; th = get_ticket_handler(ac, service); - if (IS_ERR(th)) { *pneed |= service; continue; } - if (get_seconds() >= th->renew_after) + if (need_key(th)) *pneed |= service; - if (get_seconds() >= th->expires) + if (!have_key(th)) xi->have_keys &= ~service; } } - static int ceph_x_build_request(struct ceph_auth_client *ac, void *buf, void *end) { @@ -667,14 +682,26 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) ac->private = NULL; } -static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, - int peer_type) +static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type) { struct ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); if (!IS_ERR(th)) - memset(&th->validity, 0, sizeof(th->validity)); + th->have_key = false; +} + +static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, + int peer_type) +{ + /* + * We are to invalidate a service ticket in the hopes of + * getting a new, hopefully more valid, one. But, we won't get + * it unless our AUTH ticket is good, so invalidate AUTH ticket + * as well, just in case. + */ + invalidate_ticket(ac, peer_type); + invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH); } static int calcu_signature(struct ceph_x_authorizer *au, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index e8b7c69..40b1a3c 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -16,7 +16,7 @@ struct ceph_x_ticket_handler { unsigned int service; struct ceph_crypto_key session_key; - struct ceph_timespec validity; + bool have_key; u64 secret_id; struct ceph_buffer *ticket_blob; diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 393bfb2..5fcfb98 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map, * @local_retries: localized retries * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @stable: stable mode starts rep=0 in the recursive call for all replicas * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) * @parent_r: r value passed from the parent @@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int local_fallback_retries, int recurse_to_leaf, unsigned int vary_r, + unsigned int stable, int *out2, int parent_r) { @@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map, int collide, reject; int count = out_size; - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n", recurse_to_leaf ? "_LEAF" : "", bucket->id, x, outpos, numrep, tries, recurse_tries, local_retries, local_fallback_retries, - parent_r); + parent_r, stable); - for (rep = outpos; rep < numrep && count > 0 ; rep++) { + for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) { /* keep trying until we get a non-out, non-colliding item */ ftotal = 0; skip_rep = 0; @@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map, if (crush_choose_firstn(map, map->buckets[-1-item], weight, weight_max, - x, outpos+1, 0, + x, stable ? 1 : outpos+1, 0, out2, outpos, count, recurse_tries, 0, local_retries, local_fallback_retries, 0, vary_r, + stable, NULL, sub_r) <= outpos) /* didn't get leaf */ @@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map, int choose_local_fallback_retries = map->choose_local_fallback_tries; int vary_r = map->chooseleaf_vary_r; + int stable = map->chooseleaf_stable; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -835,7 +839,8 @@ int crush_do_rule(const struct crush_map *map, case CRUSH_RULE_TAKE: if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) || - (-1-curstep->arg1 < map->max_buckets && + (-1-curstep->arg1 >= 0 && + -1-curstep->arg1 < map->max_buckets && map->buckets[-1-curstep->arg1])) { w[0] = curstep->arg1; wsize = 1; @@ -869,6 +874,11 @@ int crush_do_rule(const struct crush_map *map, vary_r = curstep->arg1; break; + case CRUSH_RULE_SET_CHOOSELEAF_STABLE: + if (curstep->arg1 >= 0) + stable = curstep->arg1; + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; @@ -888,6 +898,7 @@ int crush_do_rule(const struct crush_map *map, osize = 0; for (i = 0; i < wsize; i++) { + int bno; /* * see CRUSH_N, CRUSH_N_MINUS macros. * basically, numrep <= 0 means relative to @@ -900,6 +911,13 @@ int crush_do_rule(const struct crush_map *map, continue; } j = 0; + /* make sure bucket id is valid */ + bno = -1 - w[i]; + if (bno < 0 || bno >= map->max_buckets) { + /* w[i] is probably CRUSH_ITEM_NONE */ + dprintk(" bad w[i] %d\n", w[i]); + continue; + } if (firstn) { int recurse_tries; if (choose_leaf_tries) @@ -911,7 +929,7 @@ int crush_do_rule(const struct crush_map *map, recurse_tries = choose_tries; osize += crush_choose_firstn( map, - map->buckets[-1-w[i]], + map->buckets[bno], weight, weight_max, x, numrep, curstep->arg2, @@ -923,6 +941,7 @@ int crush_do_rule(const struct crush_map *map, choose_local_fallback_retries, recurse_to_leaf, vary_r, + stable, c+osize, 0); } else { @@ -930,7 +949,7 @@ int crush_do_rule(const struct crush_map *map, numrep : (result_max-osize)); crush_choose_indep( map, - map->buckets[-1-w[i]], + map->buckets[bno], weight, weight_max, x, out_size, numrep, curstep->arg2, diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9981039..9cfedf5 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -23,9 +23,6 @@ #include <linux/ceph/pagelist.h> #include <linux/export.h> -#define list_entry_next(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - /* * Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable @@ -672,6 +669,8 @@ static void reset_connection(struct ceph_connection *con) } con->in_seq = 0; con->in_seq_acked = 0; + + con->out_skip = 0; } /* @@ -771,6 +770,8 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) static void con_out_kvec_reset(struct ceph_connection *con) { + BUG_ON(con->out_skip); + con->out_kvec_left = 0; con->out_kvec_bytes = 0; con->out_kvec_cur = &con->out_kvec[0]; @@ -779,9 +780,9 @@ static void con_out_kvec_reset(struct ceph_connection *con) static void con_out_kvec_add(struct ceph_connection *con, size_t size, void *data) { - int index; + int index = con->out_kvec_left; - index = con->out_kvec_left; + BUG_ON(con->out_skip); BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); con->out_kvec[index].iov_len = size; @@ -790,6 +791,27 @@ static void con_out_kvec_add(struct ceph_connection *con, con->out_kvec_bytes += size; } +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. + */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int off = con->out_kvec_cur - con->out_kvec; + int skip = 0; + + if (con->out_kvec_bytes > 0) { + skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; + BUG_ON(con->out_kvec_bytes < skip); + BUG_ON(!con->out_kvec_left); + con->out_kvec_bytes -= skip; + con->out_kvec_left--; + } + + return skip; +} + #ifdef CONFIG_BLOCK /* @@ -1042,7 +1064,7 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); - cursor->page = list_entry_next(cursor->page, lru); + cursor->page = list_next_entry(cursor->page, lru); cursor->last_piece = cursor->resid <= PAGE_SIZE; return true; @@ -1166,7 +1188,7 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, if (!cursor->resid && cursor->total_resid) { WARN_ON(!cursor->last_piece); BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); - cursor->data = list_entry_next(cursor->data, links); + cursor->data = list_next_entry(cursor->data, links); __ceph_msg_data_cursor_init(cursor); new_piece = true; } @@ -1197,7 +1219,6 @@ static void prepare_write_message_footer(struct ceph_connection *con) m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); - con->out_kvec_is_msg = true; con->out_kvec[v].iov_base = &m->footer; if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { if (con->ops->sign_message) @@ -1225,7 +1246,6 @@ static void prepare_write_message(struct ceph_connection *con) u32 crc; con_out_kvec_reset(con); - con->out_kvec_is_msg = true; con->out_msg_done = false; /* Sneak an ack in there first? If we can get it into the same @@ -1265,18 +1285,19 @@ static void prepare_write_message(struct ceph_connection *con) /* tag + hdr + front + middle */ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); if (m->middle) con_out_kvec_add(con, m->middle->vec.iov_len, m->middle->vec.iov_base); - /* fill in crc (except data pages), footer */ + /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - con->out_msg->footer.flags = 0; + memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); + /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); con->out_msg->footer.front_crc = cpu_to_le32(crc); if (m->middle) { @@ -1288,6 +1309,7 @@ static void prepare_write_message(struct ceph_connection *con) dout("%s front_crc %u middle_crc %u\n", __func__, le32_to_cpu(con->out_msg->footer.front_crc), le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; /* is there a data payload? */ con->out_msg->footer.data_crc = 0; @@ -1492,7 +1514,6 @@ static int write_partial_kvec(struct ceph_connection *con) } } con->out_kvec_left = 0; - con->out_kvec_is_msg = false; ret = 1; out: dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, @@ -1584,6 +1605,7 @@ static int write_partial_skip(struct ceph_connection *con) { int ret; + dout("%s %p %d left\n", __func__, con, con->out_skip); while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); @@ -2506,13 +2528,13 @@ more: more_kvec: /* kvec data queued? */ - if (con->out_skip) { - ret = write_partial_skip(con); + if (con->out_kvec_left) { + ret = write_partial_kvec(con); if (ret <= 0) goto out; } - if (con->out_kvec_left) { - ret = write_partial_kvec(con); + if (con->out_skip) { + ret = write_partial_skip(con); if (ret <= 0) goto out; } @@ -2805,13 +2827,17 @@ static bool con_backoff(struct ceph_connection *con) static void con_fault_finish(struct ceph_connection *con) { + dout("%s %p\n", __func__, con); + /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ - if (con->auth_retry && con->ops->invalidate_authorizer) { - dout("calling invalidate_authorizer()\n"); - con->ops->invalidate_authorizer(con); + if (con->auth_retry) { + dout("auth_retry %d, invalidating\n", con->auth_retry); + if (con->ops->invalidate_authorizer) + con->ops->invalidate_authorizer(con); + con->auth_retry = 0; } if (con->ops->fault) @@ -3050,16 +3076,31 @@ void ceph_msg_revoke(struct ceph_msg *msg) ceph_msg_put(msg); } if (con->out_msg == msg) { - dout("%s %p msg %p - was sending\n", __func__, con, msg); - con->out_msg = NULL; - if (con->out_kvec_is_msg) { - con->out_skip = con->out_kvec_bytes; - con->out_kvec_is_msg = false; + BUG_ON(con->out_skip); + /* footer */ + if (con->out_msg_done) { + con->out_skip += con_out_kvec_skip(con); + } else { + BUG_ON(!msg->data_length); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) + con->out_skip += sizeof(msg->footer); + else + con->out_skip += sizeof(msg->old_footer); } + /* data, middle, front */ + if (msg->data_length) + con->out_skip += msg->cursor.total_resid; + if (msg->middle) + con->out_skip += con_out_kvec_skip(con); + con->out_skip += con_out_kvec_skip(con); + + dout("%s %p msg %p - was sending, will write %d skip %d\n", + __func__, con, msg, con->out_kvec_bytes, con->out_skip); msg->hdr.seq = 0; - + con->out_msg = NULL; ceph_msg_put(msg); } + mutex_unlock(&con->mutex); } @@ -3361,9 +3402,7 @@ static void ceph_msg_free(struct ceph_msg *m) static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); - LIST_HEAD(data); - struct list_head *links; - struct list_head *next; + struct ceph_msg_data *data, *next; dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); @@ -3376,12 +3415,8 @@ static void ceph_msg_release(struct kref *kref) m->middle = NULL; } - list_splice_init(&m->data, &data); - list_for_each_safe(links, next, &data) { - struct ceph_msg_data *data; - - data = list_entry(links, struct ceph_msg_data, links); - list_del_init(links); + list_for_each_entry_safe(data, next, &m->data, links) { + list_del_init(&data->links); ceph_msg_data_destroy(data); } m->data_length = 0; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index edda016..de85ddd 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -364,10 +364,6 @@ static bool have_debugfs_info(struct ceph_mon_client *monc) return monc->client->have_fsid && monc->auth->global_id > 0; } -/* - * The monitor responds with mount ack indicate mount success. The - * included client ticket allows the client to talk to MDSs and OSDs. - */ static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *msg) { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f8f2359..3534e12 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) u32 osdmap_epoch; int already_completed; u32 bytes; + u8 decode_redir; unsigned int i; tid = le64_to_cpu(msg->hdr.tid); @@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) p += 8 + 4; /* skip replay_version */ p += 8; /* skip user_version */ + if (le16_to_cpu(msg->hdr.version) >= 7) + ceph_decode_8_safe(&p, end, decode_redir, bad_put); + else + decode_redir = 1; + } else { + decode_redir = 0; + } + + if (decode_redir) { err = ceph_redirect_decode(&p, end, &redir); if (err) goto bad_put; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7d8f581..243574c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end) c->choose_local_tries = ceph_decode_32(p); c->choose_local_fallback_tries = ceph_decode_32(p); c->choose_total_tries = ceph_decode_32(p); - dout("crush decode tunable choose_local_tries = %d", + dout("crush decode tunable choose_local_tries = %d\n", c->choose_local_tries); - dout("crush decode tunable choose_local_fallback_tries = %d", + dout("crush decode tunable choose_local_fallback_tries = %d\n", c->choose_local_fallback_tries); - dout("crush decode tunable choose_total_tries = %d", + dout("crush decode tunable choose_total_tries = %d\n", c->choose_total_tries); ceph_decode_need(p, end, sizeof(u32), done); c->chooseleaf_descend_once = ceph_decode_32(p); - dout("crush decode tunable chooseleaf_descend_once = %d", + dout("crush decode tunable chooseleaf_descend_once = %d\n", c->chooseleaf_descend_once); ceph_decode_need(p, end, sizeof(u8), done); c->chooseleaf_vary_r = ceph_decode_8(p); - dout("crush decode tunable chooseleaf_vary_r = %d", + dout("crush decode tunable chooseleaf_vary_r = %d\n", c->chooseleaf_vary_r); + /* skip straw_calc_version, allowed_bucket_algs */ + ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done); + *p += sizeof(u8) + sizeof(u32); + + ceph_decode_need(p, end, sizeof(u8), done); + c->chooseleaf_stable = ceph_decode_8(p); + dout("crush decode tunable chooseleaf_stable = %d\n", + c->chooseleaf_stable); + done: dout("crush_decode success\n"); return c; diff --git a/net/core/dev.c b/net/core/dev.c index 0ca95d5..0ef061b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2695,6 +2695,8 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) * * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. + * + * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. */ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) @@ -2709,6 +2711,9 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, return ERR_PTR(err); } + BUILD_BUG_ON(SKB_SGO_CB_OFFSET + + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); SKB_GSO_CB(skb)->encap_level = 0; @@ -4346,6 +4351,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_metadata_dst_cmp(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); @@ -4543,10 +4549,12 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) break; case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { + skb_dst_drop(skb); kmem_cache_free(skbuff_head_cache, skb); - else + } else { __kfree_skb(skb); + } break; case GRO_HELD: @@ -5371,12 +5379,12 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; - lower = list_entry((*iter)->next, struct netdev_adjacent, list); + lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; - *iter = &lower->list; + *iter = lower->list.next; return lower->dev; } @@ -7414,8 +7422,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); - if (!dev->tx_queue_len) + if (!dev->tx_queue_len) { dev->priv_flags |= IFF_NO_QUEUE; + dev->tx_queue_len = 1; + } dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; diff --git a/net/core/filter.c b/net/core/filter.c index 94d2620..bba502f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1752,7 +1752,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; - if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6))) + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -1776,7 +1776,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY; + info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; @@ -1787,6 +1787,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) sizeof(from->remote_ipv6)); } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; } return 0; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d79699c..12e7003 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -208,7 +208,6 @@ ip: case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; - __be32 flow_label; ipv6: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); @@ -230,8 +229,12 @@ ipv6: key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - flow_label = ip6_flowlabel(iph); - if (flow_label) { + if ((dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL) || + (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && + ip6_flowlabel(iph)) { + __be32 flow_label = ip6_flowlabel(iph); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, @@ -396,6 +399,13 @@ ip_proto_again: goto out_bad; proto = eth->h_proto; nhoff += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + hlen = nhoff; } key_control->flags |= FLOW_DIS_ENCAPSULATION; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d735e85..8261d95 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2911,6 +2911,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc); out: netif_addr_unlock_bh(dev); + cb->args[1] = err; return idx; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); @@ -2944,6 +2945,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } + cb->args[1] = 0; for_each_netdev(net, dev) { if (brport_idx && (dev->ifindex != brport_idx)) continue; @@ -2971,12 +2973,16 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, idx); } + if (cb->args[1] == -EMSGSIZE) + break; if (dev->netdev_ops->ndo_fdb_dump) idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, idx); else idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); + if (cb->args[1] == -EMSGSIZE) + break; cops = NULL; } diff --git a/net/core/scm.c b/net/core/scm.c index 14596fb..2696aef 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) *fplp = fpl; fpl->count = 0; fpl->max = SCM_MAX_FD; + fpl->user = NULL; } fpp = &fpl->fp[fpl->count]; @@ -107,6 +108,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) *fpp++ = file; fpl->count++; } + + if (!fpl->user) + fpl->user = get_uid(current_user()); + return num; } @@ -119,6 +124,7 @@ void __scm_destroy(struct scm_cookie *scm) scm->fp = NULL; for (i=fpl->count-1; i>=0; i--) fput(fpl->fp[i]); + free_uid(fpl->user); kfree(fpl); } } @@ -336,6 +342,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); new_fpl->max = new_fpl->count; + new_fpl->user = get_uid(fpl->user); } return new_fpl; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b2df375..8616d11 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,8 @@ struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; +int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; +EXPORT_SYMBOL(sysctl_max_skb_frags); /** * skb_panic - private function for out-of-line support @@ -2946,6 +2948,24 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page, EXPORT_SYMBOL_GPL(skb_append_pagefrags); /** + * skb_push_rcsum - push skb and update receive checksum + * @skb: buffer to update + * @len: length of data pulled + * + * This function performs an skb_push on the packet and updates + * the CHECKSUM_COMPLETE checksum. It should be used on + * receive path processing instead of skb_push unless you know + * that the checksum difference is zero (e.g., a valid IP header) + * or you are setting ip_summed to CHECKSUM_NONE. + */ +static unsigned char *skb_push_rcsum(struct sk_buff *skb, unsigned len) +{ + skb_push(skb, len); + skb_postpush_rcsum(skb, skb->data, len); + return skb->data; +} + +/** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update * @len: length of data pulled @@ -4082,9 +4102,9 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, if (!pskb_may_pull(skb_chk, offset)) goto err; - __skb_pull(skb_chk, offset); + skb_pull_rcsum(skb_chk, offset); ret = skb_chkf(skb_chk); - __skb_push(skb_chk, offset); + skb_push_rcsum(skb_chk, offset); if (ret) goto err; diff --git a/net/core/sock.c b/net/core/sock.c index 5127023..6c1c8bc9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -195,44 +195,6 @@ bool sk_net_capable(const struct sock *sk, int cap) } EXPORT_SYMBOL(sk_net_capable); - -#ifdef CONFIG_MEMCG_KMEM -int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - struct proto *proto; - int ret = 0; - - mutex_lock(&proto_list_mutex); - list_for_each_entry(proto, &proto_list, node) { - if (proto->init_cgroup) { - ret = proto->init_cgroup(memcg, ss); - if (ret) - goto out; - } - } - - mutex_unlock(&proto_list_mutex); - return ret; -out: - list_for_each_entry_continue_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); - return ret; -} - -void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) -{ - struct proto *proto; - - mutex_lock(&proto_list_mutex); - list_for_each_entry_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); -} -#endif - /* * Each address family might have different locking rules, so we have * one slock key per address family: @@ -240,11 +202,6 @@ void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -#if defined(CONFIG_MEMCG_KMEM) -struct static_key memcg_socket_limit_enabled; -EXPORT_SYMBOL(memcg_socket_limit_enabled); -#endif - /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket @@ -1507,12 +1464,6 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); -static void sk_update_clone(const struct sock *sk, struct sock *newsk) -{ - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - sock_update_memcg(newsk); -} - /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone @@ -1607,7 +1558,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; - sk_update_clone(sk, newsk); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + sock_update_memcg(newsk); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); @@ -2089,27 +2041,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); long allocated; - int parent_status = UNDER_LIMIT; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = sk_memory_allocated_add(sk, amt, &parent_status); + allocated = sk_memory_allocated_add(sk, amt); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg && + !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) + goto suppress_allocation; /* Under limit. */ - if (parent_status == UNDER_LIMIT && - allocated <= sk_prot_mem_limits(sk, 0)) { + if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } - /* Under pressure. (we or our parents) */ - if ((parent_status > SOFT_LIMIT) || - allocated > sk_prot_mem_limits(sk, 1)) + /* Under pressure. */ + if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); - /* Over hard limit (we or our parents) */ - if ((parent_status == OVER_LIMIT) || - (allocated > sk_prot_mem_limits(sk, 2))) + /* Over hard limit. */ + if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* guarantee minimum buffer size under pressure */ @@ -2158,6 +2110,9 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); + return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -2173,6 +2128,9 @@ void __sk_mem_reclaim(struct sock *sk, int amount) sk_memory_allocated_sub(sk, amount); sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); + if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 1df98c5..e92b759 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -93,10 +93,17 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) * @sk2: Socket belonging to the existing reuseport group. * May return ENOMEM and not add socket to group under memory pressure. */ -int reuseport_add_sock(struct sock *sk, const struct sock *sk2) +int reuseport_add_sock(struct sock *sk, struct sock *sk2) { struct sock_reuseport *reuse; + if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { + int err = reuseport_alloc(sk2); + + if (err) + return err; + } + spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)), diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 95b6139..a6beb7b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -26,6 +26,7 @@ static int zero = 0; static int one = 1; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; +static int max_skb_frags = MAX_SKB_FRAGS; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -392,6 +393,15 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "max_skb_frags", + .data = &sysctl_max_skb_frags, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &max_skb_frags, + }, { } }; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 5684e14..902d606 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -824,26 +824,26 @@ lookup: if (sk->sk_state == DCCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; - if (likely(sk->sk_state == DCCP_LISTEN)) { - nsk = dccp_check_req(sk, skb, req); - } else { + if (unlikely(sk->sk_state != DCCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { dccp_v4_ctl_send_reset(sk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 9c6d050..b8608b7 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -691,26 +691,26 @@ lookup: if (sk->sk_state == DCCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; - if (likely(sk->sk_state == DCCP_LISTEN)) { - nsk = dccp_check_req(sk, skb, req); - } else { + if (unlikely(sk->sk_state != DCCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { dccp_v6_ctl_send_reset(sk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 40b9ca7..ab24521 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1194,7 +1194,6 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, if (ret) { netdev_err(master, "error %d registering interface %s\n", ret, slave_dev->name); - phy_disconnect(p->phy); ds->ports[port] = NULL; free_netdev(slave_dev); return ret; @@ -1205,6 +1204,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); + unregister_netdev(slave_dev); free_netdev(slave_dev); return ret; } diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index c229205..7758247 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -353,6 +353,7 @@ config INET_ESP select CRYPTO_CBC select CRYPTO_SHA1 select CRYPTO_DES + select CRYPTO_ECHAINIV ---help--- Support for IPsec ESP. diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index c29809f..62c049b 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -56,7 +56,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o -obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index cebd9d3..f6303b1 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1847,7 +1847,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) goto errout; - err = EINVAL; + err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) goto errout; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 744e593..d07fc07 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head) if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); - else if (n->tn_bits <= TNODE_KMALLOC_MAX) - kfree(n); else - vfree(n); + kvfree(n); } #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) @@ -1396,9 +1394,10 @@ found: struct fib_info *fi = fa->fa_info; int nhsel, err; - if ((index >= (1ul << fa->fa_slen)) && - ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH))) - continue; + if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) { + if (index >= (1ul << fa->fa_slen)) + continue; + } if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) continue; if (fi->fib_dead) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 46b9c88..6414891 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -789,14 +789,16 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, reqsk_put(req); } -void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, - struct sock *child) +struct sock *inet_csk_reqsk_queue_add(struct sock *sk, + struct request_sock *req, + struct sock *child) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; spin_lock(&queue->rskq_lock); if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_child_forget(sk, req, child); + child = NULL; } else { req->sk = child; req->dl_next = NULL; @@ -808,6 +810,7 @@ void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, sk_acceptq_added(sk); } spin_unlock(&queue->rskq_lock); + return child; } EXPORT_SYMBOL(inet_csk_reqsk_queue_add); @@ -817,11 +820,8 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, if (own_req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - inet_csk_reqsk_queue_add(sk, req, child); - /* Warning: caller must not call reqsk_put(req); - * child stole last reference on it. - */ - return child; + if (inet_csk_reqsk_queue_add(sk, req, child)) + return child; } /* Too bad, another child took ownership of the request, undo. */ bh_unlock_sock(child); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 8bb8e7a..6029157 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -361,13 +361,20 @@ struct sock *inet_diag_find_one_icsk(struct net *net, req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) - else if (req->sdiag_family == AF_INET6) - sk = inet6_lookup(net, hashinfo, - (struct in6_addr *)req->id.idiag_dst, - req->id.idiag_dport, - (struct in6_addr *)req->id.idiag_src, - req->id.idiag_sport, - req->id.idiag_if); + else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3], + req->id.idiag_dport, req->id.idiag_src[3], + req->id.idiag_sport, req->id.idiag_if); + else + sk = inet6_lookup(net, hashinfo, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); + } #endif else return ERR_PTR(-EINVAL); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 3f00810..187c6fc 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -661,6 +661,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) struct ipq *qp; IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); + skb_orphan(skb); /* Lookup (or create) queue header */ qp = ip_find(net, ip_hdr(skb), user, vif); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7c51c4e..41ba68d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1054,8 +1054,9 @@ static const struct net_device_ops gre_tap_netdev_ops = { static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); - dev->netdev_ops = &gre_tap_netdev_ops; - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + dev->netdev_ops = &gre_tap_netdev_ops; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, gre_tap_net_id); } @@ -1240,6 +1241,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, err = ipgre_newlink(net, dev, tb, NULL); if (err < 0) goto out; + + /* openvswitch users expect packet sizes to be unrestricted, + * so set the largest MTU we can. + */ + err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false); + if (err) + goto out; + return dev; out: free_netdev(dev); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index b1209b6..d77eb0c 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -316,7 +316,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) { + if (sysctl_ip_early_demux && + !skb_dst(skb) && + !skb->sk && + !ip_is_fragment(iph)) { const struct net_protocol *ipprot; int protocol = iph->protocol; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 512a447..565bf64 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -239,6 +239,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, * from host network stack. */ features = netif_skb_features(skb); + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); @@ -1235,13 +1236,16 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (!skb) return -EINVAL; - cork->length += size; if ((size + skb->len > mtu) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { + if (skb->ip_summed != CHECKSUM_PARTIAL) + return -EOPNOTSUPP; + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; } + cork->length += size; while (size > 0) { if (skb_is_gso(skb)) { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5f73a7c..a501242 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -249,6 +249,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + + /* Our caller is responsible for freeing ipc->opt */ err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); if (err) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c7bd72e..336e689 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -661,6 +661,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ @@ -758,7 +760,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); dst_link_failure(skb); } else tunnel->err_count = 0; @@ -943,17 +944,31 @@ done: } EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); -int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); + int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen; - if (new_mtu < 68 || - new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen) + if (new_mtu < 68) return -EINVAL; + + if (new_mtu > max_mtu) { + if (strict) + return -EINVAL; + + new_mtu = max_mtu; + } + dev->mtu = new_mtu; return 0; } +EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); + +int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + return __ip_tunnel_change_mtu(dev, new_mtu, true); +} EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); static void ip_tunnel_dev_free(struct net_device *dev) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 67f7c9d..2ed9dd2 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -143,7 +143,11 @@ static char dhcp_client_identifier[253] __initdata; /* Persistent data: */ +#ifdef IPCONFIG_DYNAMIC static int ic_proto_used; /* Protocol used, if any */ +#else +#define ic_proto_used 0 +#endif static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */ static u8 ic_domain[64]; /* DNS (not NIS) domain name */ diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 6fb869f6..a04dee5 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -27,8 +27,6 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, { int err; - skb_orphan(skb); - local_bh_disable(); err = ip_defrag(net, skb, user); local_bh_enable(); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index c117b21..d3a2716 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -746,8 +746,10 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); return err; + } if (ipc.opt) free = 1; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bc35f18..7113bae 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -547,8 +547,10 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(net, msg, &ipc, false); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); goto out; + } if (ipc.opt) free = 1; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 85f184e..02c6229 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; +static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; /* * Interface to generic destination cache. */ @@ -755,7 +756,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, - 0, 0); + 0, jiffies + ip_rt_gc_timeout); } if (kill_route) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -1556,6 +1557,36 @@ static void ip_handle_martian_source(struct net_device *dev, #endif } +static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash; + struct fib_nh_exception *fnhe, __rcu **fnhe_p; + u32 hval = fnhe_hashfun(daddr); + + spin_lock_bh(&fnhe_lock); + + hash = rcu_dereference_protected(nh->nh_exceptions, + lockdep_is_held(&fnhe_lock)); + hash += hval; + + fnhe_p = &hash->chain; + fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); + while (fnhe) { + if (fnhe->fnhe_daddr == daddr) { + rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( + fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); + fnhe_flush_routes(fnhe); + kfree_rcu(fnhe, rcu); + break; + } + fnhe_p = &fnhe->fnhe_next; + fnhe = rcu_dereference_protected(fnhe->fnhe_next, + lockdep_is_held(&fnhe_lock)); + } + + spin_unlock_bh(&fnhe_lock); +} + /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1609,11 +1640,20 @@ static int __mkroute_input(struct sk_buff *skb, fnhe = find_exception(&FIB_RES_NH(*res), daddr); if (do_cache) { - if (fnhe) + if (fnhe) { rth = rcu_dereference(fnhe->fnhe_rth_input); - else - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + if (rth && rth->dst.expires && + time_after(jiffies, rth->dst.expires)) { + ip_del_fnhe(&FIB_RES_NH(*res), daddr); + fnhe = NULL; + } else { + goto rt_cache; + } + } + + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); +rt_cache: if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; @@ -2014,19 +2054,29 @@ static struct rtable *__mkroute_output(const struct fib_result *res, struct fib_nh *nh = &FIB_RES_NH(*res); fnhe = find_exception(nh, fl4->daddr); - if (fnhe) + if (fnhe) { prth = &fnhe->fnhe_rth_output; - else { - if (unlikely(fl4->flowi4_flags & - FLOWI_FLAG_KNOWN_NH && - !(nh->nh_gw && - nh->nh_scope == RT_SCOPE_LINK))) { - do_cache = false; - goto add; + rth = rcu_dereference(*prth); + if (rth && rth->dst.expires && + time_after(jiffies, rth->dst.expires)) { + ip_del_fnhe(nh, fl4->daddr); + fnhe = NULL; + } else { + goto rt_cache; } - prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); } + + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; + } + prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); rth = rcu_dereference(*prth); + +rt_cache: if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; @@ -2569,7 +2619,6 @@ void ip_rt_multicast_event(struct in_device *in_dev) } #ifdef CONFIG_SYSCTL -static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 46ce410..4d367b4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -24,7 +24,6 @@ #include <net/cipso_ipv4.h> #include <net/inet_frag.h> #include <net/ping.h> -#include <net/tcp_memcontrol.h> static int zero; static int one = 1; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7bb1b09..483ffdf 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,6 +279,7 @@ #include <asm/uaccess.h> #include <asm/ioctls.h> +#include <asm/unaligned.h> #include <net/busy_poll.h> int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; @@ -422,7 +423,8 @@ void tcp_init_sock(struct sock *sk) sk->sk_rcvbuf = sysctl_tcp_rmem[1]; local_bh_disable(); - sock_update_memcg(sk); + if (mem_cgroup_sockets_enabled) + sock_update_memcg(sk); sk_sockets_allocated_inc(sk); local_bh_enable(); } @@ -938,7 +940,7 @@ new_segment: i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= MAX_SKB_FRAGS) { + if (!can_coalesce && i >= sysctl_max_skb_frags) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1211,7 +1213,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i == MAX_SKB_FRAGS || !sg) { + if (i == sysctl_max_skb_frags || !sg) { tcp_mark_push(tp, skb); goto new_segment; } @@ -2637,6 +2639,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; unsigned int start; + u64 rate64; u32 rate; memset(info, 0, sizeof(*info)); @@ -2702,15 +2705,17 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_total_retrans = tp->total_retrans; rate = READ_ONCE(sk->sk_pacing_rate); - info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL; + rate64 = rate != ~0U ? rate : ~0ULL; + put_unaligned(rate64, &info->tcpi_pacing_rate); rate = READ_ONCE(sk->sk_max_pacing_rate); - info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL; + rate64 = rate != ~0U ? rate : ~0ULL; + put_unaligned(rate64, &info->tcpi_max_pacing_rate); do { start = u64_stats_fetch_begin_irq(&tp->syncp); - info->tcpi_bytes_acked = tp->bytes_acked; - info->tcpi_bytes_received = tp->bytes_received; + put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked); + put_unaligned(tp->bytes_received, &info->tcpi_bytes_received); } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; @@ -2945,7 +2950,7 @@ static void __tcp_alloc_md5sig_pool(void) struct crypto_hash *hash; hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR_OR_NULL(hash)) + if (IS_ERR(hash)) return; per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0003d40..3b2c8e9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2164,8 +2164,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt, oldcnt; - int err; + int cnt, oldcnt, lost; unsigned int mss; /* Use SACK to deduce losses of new sequences sent during recovery */ const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; @@ -2205,9 +2204,10 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; mss = tcp_skb_mss(skb); - err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, - mss, GFP_ATOMIC); - if (err < 0) + /* If needed, chop off the prefix to mark as lost. */ + lost = (packets - oldcnt) * mss; + if (lost < skb->len && + tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) break; cnt = packets; } @@ -2366,8 +2366,6 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) tp->snd_ssthresh = tp->prior_ssthresh; tcp_ecn_withdraw_cwr(tp); } - } else { - tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } tp->snd_cwnd_stamp = tcp_time_stamp; tp->undo_marker = 0; @@ -2898,7 +2896,10 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; struct rtt_meas *m = tcp_sk(sk)->rtt_min; - struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now }; + struct rtt_meas rttm = { + .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1), + .ts = now, + }; u32 elapsed; /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 65947c1..487ac67 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -73,7 +73,6 @@ #include <net/timewait_sock.h> #include <net/xfrm.h> #include <net/secure_seq.h> -#include <net/tcp_memcontrol.h> #include <net/busy_poll.h> #include <linux/inet.h> @@ -312,7 +311,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk) /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ -void tcp_req_err(struct sock *sk, u32 seq) +void tcp_req_err(struct sock *sk, u32 seq, bool abort) { struct request_sock *req = inet_reqsk(sk); struct net *net = sock_net(sk); @@ -324,7 +323,7 @@ void tcp_req_err(struct sock *sk, u32 seq) if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - } else { + } else if (abort) { /* * Still in SYN_RECV, just remove it silently. * There is no good way to pass the error to the newly @@ -384,7 +383,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } seq = ntohl(th->seq); if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq); + return tcp_req_err(sk, seq, + type == ICMP_PARAMETERPROB || + type == ICMP_TIME_EXCEEDED || + (type == ICMP_DEST_UNREACH && + (code == ICMP_NET_UNREACH || + code == ICMP_HOST_UNREACH))); bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -708,7 +712,8 @@ release_sk1: outside socket context is ugly, certainly. What can I do? */ -static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, +static void tcp_v4_send_ack(struct net *net, + struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int reply_flags, u8 tos) @@ -723,7 +728,6 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ]; } rep; struct ip_reply_arg arg; - struct net *net = dev_net(skb_dst(skb)->dev); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -785,7 +789,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v4_send_ack(sock_net(sk), skb, + tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, @@ -804,8 +809,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? - tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, + u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : + tcp_sk(sk)->snd_nxt; + + tcp_v4_send_ack(sock_net(sk), skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, @@ -1590,28 +1597,30 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; - if (tcp_v4_inbound_md5_hash(sk, skb)) - goto discard_and_relse; - if (likely(sk->sk_state == TCP_LISTEN)) { - nsk = tcp_check_req(sk, skb, req, false); - } else { + if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { + reqsk_put(req); + goto discard_it; + } + if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } @@ -1818,7 +1827,9 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); - sock_release_memcg(sk); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + sock_release_memcg(sk); } EXPORT_SYMBOL(tcp_v4_destroy_sock); @@ -2342,11 +2353,6 @@ struct proto tcp_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_MEMCG_KMEM - .init_cgroup = tcp_init_cgroup, - .destroy_cgroup = tcp_destroy_cgroup, - .proto_cgroup = tcp_proto_cgroup, -#endif .diag_destroy = tcp_abort, }; EXPORT_SYMBOL(tcp_prot); diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c deleted file mode 100644 index 2379c1b..0000000 --- a/net/ipv4/tcp_memcontrol.c +++ /dev/null @@ -1,233 +0,0 @@ -#include <net/tcp.h> -#include <net/tcp_memcontrol.h> -#include <net/sock.h> -#include <net/ip.h> -#include <linux/nsproxy.h> -#include <linux/memcontrol.h> -#include <linux/module.h> - -int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - /* - * The root cgroup does not use page_counters, but rather, - * rely on the data already collected by the network - * subsystem - */ - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct page_counter *counter_parent = NULL; - struct cg_proto *cg_proto, *parent_cg; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return 0; - - cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0]; - cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1]; - cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2]; - cg_proto->memory_pressure = 0; - cg_proto->memcg = memcg; - - parent_cg = tcp_prot.proto_cgroup(parent); - if (parent_cg) - counter_parent = &parent_cg->memory_allocated; - - page_counter_init(&cg_proto->memory_allocated, counter_parent); - percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); - - return 0; -} -EXPORT_SYMBOL(tcp_init_cgroup); - -void tcp_destroy_cgroup(struct mem_cgroup *memcg) -{ - struct cg_proto *cg_proto; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return; - - percpu_counter_destroy(&cg_proto->sockets_allocated); - - if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) - static_key_slow_dec(&memcg_socket_limit_enabled); - -} -EXPORT_SYMBOL(tcp_destroy_cgroup); - -static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) -{ - struct cg_proto *cg_proto; - int i; - int ret; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return -EINVAL; - - ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages); - if (ret) - return ret; - - for (i = 0; i < 3; i++) - cg_proto->sysctl_mem[i] = min_t(long, nr_pages, - sysctl_tcp_mem[i]); - - if (nr_pages == PAGE_COUNTER_MAX) - clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - else { - /* - * The active bit needs to be written after the static_key - * update. This is what guarantees that the socket activation - * function is the last one to run. See sock_update_memcg() for - * details, and note that we don't mark any socket as belonging - * to this memcg until that flag is up. - * - * We need to do this, because static_keys will span multiple - * sites, but we can't control their order. If we mark a socket - * as accounted, but the accounting functions are not patched in - * yet, we'll lose accounting. - * - * We never race with the readers in sock_update_memcg(), - * because when this value change, the code to process it is not - * patched in yet. - * - * The activated bit is used to guarantee that no two writers - * will do the update in the same memcg. Without that, we can't - * properly shutdown the static key. - */ - if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) - static_key_slow_inc(&memcg_socket_limit_enabled); - set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - } - - return 0; -} - -enum { - RES_USAGE, - RES_LIMIT, - RES_MAX_USAGE, - RES_FAILCNT, -}; - -static DEFINE_MUTEX(tcp_limit_mutex); - -static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long nr_pages; - int ret = 0; - - buf = strstrip(buf); - - switch (of_cft(of)->private) { - case RES_LIMIT: - /* see memcontrol.c */ - ret = page_counter_memparse(buf, "-1", &nr_pages); - if (ret) - break; - mutex_lock(&tcp_limit_mutex); - ret = tcp_update_limit(memcg, nr_pages); - mutex_unlock(&tcp_limit_mutex); - break; - default: - ret = -EINVAL; - break; - } - return ret ?: nbytes; -} - -static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg); - u64 val; - - switch (cft->private) { - case RES_LIMIT: - if (!cg_proto) - return PAGE_COUNTER_MAX; - val = cg_proto->memory_allocated.limit; - val *= PAGE_SIZE; - break; - case RES_USAGE: - if (!cg_proto) - val = atomic_long_read(&tcp_memory_allocated); - else - val = page_counter_read(&cg_proto->memory_allocated); - val *= PAGE_SIZE; - break; - case RES_FAILCNT: - if (!cg_proto) - return 0; - val = cg_proto->memory_allocated.failcnt; - break; - case RES_MAX_USAGE: - if (!cg_proto) - return 0; - val = cg_proto->memory_allocated.watermark; - val *= PAGE_SIZE; - break; - default: - BUG(); - } - return val; -} - -static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg; - struct cg_proto *cg_proto; - - memcg = mem_cgroup_from_css(of_css(of)); - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return nbytes; - - switch (of_cft(of)->private) { - case RES_MAX_USAGE: - page_counter_reset_watermark(&cg_proto->memory_allocated); - break; - case RES_FAILCNT: - cg_proto->memory_allocated.failcnt = 0; - break; - } - - return nbytes; -} - -static struct cftype tcp_files[] = { - { - .name = "kmem.tcp.limit_in_bytes", - .write = tcp_cgroup_write, - .read_u64 = tcp_cgroup_read, - .private = RES_LIMIT, - }, - { - .name = "kmem.tcp.usage_in_bytes", - .read_u64 = tcp_cgroup_read, - .private = RES_USAGE, - }, - { - .name = "kmem.tcp.failcnt", - .private = RES_FAILCNT, - .write = tcp_cgroup_reset, - .read_u64 = tcp_cgroup_read, - }, - { - .name = "kmem.tcp.max_usage_in_bytes", - .private = RES_MAX_USAGE, - .write = tcp_cgroup_reset, - .read_u64 = tcp_cgroup_read, - }, - { } /* terminate */ -}; - -static int __init tcp_memcontrol_init(void) -{ - WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files)); - return 0; -} -__initcall(tcp_memcontrol_init); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c8cbc2b..a726d78 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -550,7 +550,7 @@ reset: */ if (crtt > tp->srtt_us) { /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ - crtt /= 8 * USEC_PER_MSEC; + crtt /= 8 * USEC_PER_SEC / HZ; inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); } else if (tp->srtt_us == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 412a920..fda379c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2813,13 +2813,16 @@ begin_fwd: */ void sk_forced_mem_schedule(struct sock *sk, int size) { - int amt, status; + int amt; if (size <= sk->sk_forward_alloc) return; amt = sk_mem_pages(size); sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - sk_memory_allocated_add(sk, amt, &status); + sk_memory_allocated_add(sk, amt); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_charge_skmem(sk->sk_memcg, amt); } /* Send a FIN. The caller locks the socket for us. diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index dc45b53..95d2f19 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -499,6 +499,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct sock *sk, *result; struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; + bool select_ok = true; u32 hash = 0; begin: @@ -512,14 +513,18 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (sk2) { - result = sk2; - goto found; + if (select_ok) { + struct sock *sk2; + + sk2 = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (sk2) { + result = sk2; + select_ok = false; + goto found; + } } matches = 1; } @@ -563,6 +568,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; + bool select_ok = true; u32 hash = 0; rcu_read_lock(); @@ -601,14 +607,18 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, skb, + if (select_ok) { + struct sock *sk2; + + sk2 = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - goto found; + if (sk2) { + result = sk2; + select_ok = false; + goto found; + } } matches = 1; } @@ -1038,8 +1048,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc, sk->sk_family == AF_INET6); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); return err; + } if (ipc.opt) free = 1; connected = 0; diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 0ec0881..96599d1 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -89,6 +89,8 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb uh->source = src_port; uh->len = htons(skb->len); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + udp_set_csum(nocheck, skb, src, dst, skb->len); iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index bb7dabe..40c8975 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -69,6 +69,7 @@ config INET6_ESP select CRYPTO_CBC select CRYPTO_SHA1 select CRYPTO_DES + select CRYPTO_ECHAINIV ---help--- Support for IPsec ESP. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 38eedde..bdd7eac 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -583,7 +583,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) goto errout; - err = EINVAL; + err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) goto errout; @@ -3538,6 +3538,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; + bool notify = false; addrconf_join_solict(dev, &ifp->addr); @@ -3583,7 +3584,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ - ipv6_ifa_notify(RTM_NEWADDR, ifp); + notify = true; } } @@ -3591,6 +3592,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) out: spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); + if (notify) + ipv6_ifa_notify(RTM_NEWADDR, ifp); } static void addrconf_dad_start(struct inet6_ifaddr *ifp) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 517c55b..4281621 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -162,6 +162,9 @@ ipv4_connected: fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; + if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; + if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) fl6.flowi6_oif = np->mcast_oif; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 1f9ebe3..dc2db4f 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -540,12 +540,13 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; - (sfl = rcu_dereference(*sflp)) != NULL; + (sfl = rcu_dereference_protected(*sflp, + lockdep_is_held(&ip6_sk_fl_lock))) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; - *sflp = rcu_dereference(sfl->next); + *sflp = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f37f18b..c0d4dc1 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -777,6 +777,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) __u32 mtu; int err; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; @@ -1512,6 +1514,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->destructor = ip6gre_dev_free; dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; } static int ip6gre_newlink(struct net *src_net, struct net_device *dev, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 23de98f..a163102 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -909,6 +909,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, struct rt6_info *rt; #endif int err; + int flags = 0; /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, @@ -940,10 +941,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, dst_release(*dst); *dst = NULL; } + + if (fl6->flowi6_oif) + flags |= RT6_LOOKUP_F_IFACE; } if (!*dst) - *dst = ip6_route_output(net, sk, fl6); + *dst = ip6_route_output_flags(net, sk, fl6, flags); err = (*dst)->error; if (err) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 137fca4..6c5dfec 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1180,6 +1180,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + tproto = ACCESS_ONCE(t->parms.proto); if (tproto != IPPROTO_IPIP && tproto != 0) return -1; diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 31ba7ca..051b6a6 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -21,6 +21,10 @@ #include <net/ipv6.h> #include <net/netfilter/ipv6/nf_nat_masquerade.h> +#define MAX_WORK_COUNT 16 + +static atomic_t v6_worker_count; + unsigned int nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, const struct net_device *out) @@ -78,14 +82,78 @@ static struct notifier_block masq_dev_notifier = { .notifier_call = masq_device_event, }; +struct masq_dev_work { + struct work_struct work; + struct net *net; + int ifindex; +}; + +static void iterate_cleanup_work(struct work_struct *work) +{ + struct masq_dev_work *w; + long index; + + w = container_of(work, struct masq_dev_work, work); + + index = w->ifindex; + nf_ct_iterate_cleanup(w->net, device_cmp, (void *)index, 0, 0); + + put_net(w->net); + kfree(w); + atomic_dec(&v6_worker_count); + module_put(THIS_MODULE); +} + +/* ipv6 inet notifier is an atomic notifier, i.e. we cannot + * schedule. + * + * Unfortunately, nf_ct_iterate_cleanup can run for a long + * time if there are lots of conntracks and the system + * handles high softirq load, so it frequently calls cond_resched + * while iterating the conntrack table. + * + * So we defer nf_ct_iterate_cleanup walk to the system workqueue. + * + * As we can have 'a lot' of inet_events (depending on amount + * of ipv6 addresses being deleted), we also need to add an upper + * limit to the number of queued work items. + */ static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; - struct netdev_notifier_info info; + const struct net_device *dev; + struct masq_dev_work *w; + struct net *net; + + if (event != NETDEV_DOWN || + atomic_read(&v6_worker_count) >= MAX_WORK_COUNT) + return NOTIFY_DONE; + + dev = ifa->idev->dev; + net = maybe_get_net(dev_net(dev)); + if (!net) + return NOTIFY_DONE; - netdev_notifier_info_init(&info, ifa->idev->dev); - return masq_device_event(this, event, &info); + if (!try_module_get(THIS_MODULE)) + goto err_module; + + w = kmalloc(sizeof(*w), GFP_ATOMIC); + if (w) { + atomic_inc(&v6_worker_count); + + INIT_WORK(&w->work, iterate_cleanup_work); + w->ifindex = dev->ifindex; + w->net = net; + schedule_work(&w->work); + + return NOTIFY_DONE; + } + + module_put(THIS_MODULE); + err_module: + put_net(net); + return NOTIFY_DONE; } static struct notifier_block masq_inet_notifier = { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3c8834b..ed44663 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1183,11 +1183,10 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); } -struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, - struct flowi6 *fl6) +struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, + struct flowi6 *fl6, int flags) { struct dst_entry *dst; - int flags = 0; bool any_src; dst = l3mdev_rt6_dst_by_oif(net, fl6); @@ -1208,7 +1207,7 @@ struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); } -EXPORT_SYMBOL(ip6_route_output); +EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index e794ef6..2066d1c 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -201,14 +201,14 @@ static int ipip6_tunnel_create(struct net_device *dev) if ((__force u16)t->parms.i_flags & SIT_ISATAP) dev->priv_flags |= IFF_ISATAP; + dev->rtnl_link_ops = &sit_link_ops; + err = register_netdevice(dev); if (err < 0) goto out; ipip6_tunnel_clone_6rd(dev, sitn); - dev->rtnl_link_ops = &sit_link_ops; - dev_hold(dev); ipip6_tunnel_link(sitn, t); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index db9f1c3..5c8c842 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -61,7 +61,6 @@ #include <net/timewait_sock.h> #include <net/inet_common.h> #include <net/secure_seq.h> -#include <net/tcp_memcontrol.h> #include <net/busy_poll.h> #include <linux/proc_fs.h> @@ -328,6 +327,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct tcp_sock *tp; __u32 seq, snd_una; struct sock *sk; + bool fatal; int err; sk = __inet6_lookup_established(net, &tcp_hashinfo, @@ -346,8 +346,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } seq = ntohl(th->seq); + fatal = icmpv6_err_convert(type, code, &err); if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq); + return tcp_req_err(sk, seq, fatal); bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) @@ -401,7 +402,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - icmpv6_err_convert(type, code, &err); /* Might be for an request_sock */ switch (sk->sk_state) { @@ -1387,7 +1387,7 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; tcp_v6_fill_cb(skb, hdr, th); @@ -1395,24 +1395,24 @@ process: reqsk_put(req); goto discard_it; } - if (likely(sk->sk_state == TCP_LISTEN)) { - nsk = tcp_check_req(sk, skb, req, false); - } else { + if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); tcp_v6_restore_cb(skb); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v6_send_reset(nsk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } @@ -1889,9 +1889,6 @@ struct proto tcpv6_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_MEMCG_KMEM - .proto_cgroup = tcp_proto_cgroup, -#endif .clear_sk = tcp_v6_clear_sk, .diag_destroy = tcp_abort, }; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5d2c2af..22e28a4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -257,6 +257,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, struct sock *sk, *result; struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; + bool select_ok = true; u32 hash = 0; begin: @@ -270,14 +271,18 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (sk2) { - result = sk2; - goto found; + if (select_ok) { + struct sock *sk2; + + sk2 = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (sk2) { + result = sk2; + select_ok = false; + goto found; + } } matches = 1; } @@ -321,6 +326,7 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; + bool select_ok = true; u32 hash = 0; rcu_read_lock(); @@ -358,14 +364,18 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, skb, + if (select_ok) { + struct sock *sk2; + + sk2 = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - goto found; + if (sk2) { + result = sk2; + select_ok = false; + goto found; + } } matches = 1; } diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index f7fbdba..372855e 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -23,7 +23,7 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) struct ipv6hdr *inner_iph = ipipv6_hdr(skb); if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) - IP6_ECN_set_ce(inner_iph); + IP6_ECN_set_ce(skb, inner_iph); } /* Add encapsulation header. diff --git a/net/irda/ircomm/ircomm_param.c b/net/irda/ircomm/ircomm_param.c index 3c4caa6..5728e76 100644 --- a/net/irda/ircomm/ircomm_param.c +++ b/net/irda/ircomm/ircomm_param.c @@ -134,11 +134,10 @@ int ircomm_param_request(struct ircomm_tty_cb *self, __u8 pi, int flush) return -1; } skb_put(skb, count); + pr_debug("%s(), skb->len=%d\n", __func__, skb->len); spin_unlock_irqrestore(&self->spinlock, flags); - pr_debug("%s(), skb->len=%d\n", __func__ , skb->len); - if (flush) { /* ircomm_tty_do_softint will take care of the rest */ schedule_work(&self->tqueue); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index ef50a94..fc3598a 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -708,6 +708,9 @@ static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr, if (!addr || addr->sa_family != AF_IUCV) return -EINVAL; + if (addr_len < sizeof(struct sockaddr_iucv)) + return -EINVAL; + lock_sock(sk); if (sk->sk_state != IUCV_OPEN) { err = -EBADFD; diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index f93c5be..2caaa84 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -124,8 +124,13 @@ static int l2tp_tunnel_notify(struct genl_family *family, ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, cmd); - if (ret >= 0) - return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + return ret; + } nlmsg_free(msg); @@ -147,8 +152,13 @@ static int l2tp_session_notify(struct genl_family *family, ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, session, cmd); - if (ret >= 0) - return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + return ret; + } nlmsg_free(msg); diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index abbdff0..3e24d0d 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -91,7 +91,7 @@ static const struct file_operations reset_ops = { }; #endif -static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { +static const char *hw_flag_names[] = { #define FLAG(F) [IEEE80211_HW_##F] = #F FLAG(HAS_RATE_CONTROL), FLAG(RX_INCLUDES_FCS), @@ -126,9 +126,6 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { FLAG(SUPPORTS_AMSDU_IN_AMPDU), FLAG(BEACON_TX_STATUS), FLAG(NEEDS_UNIQUE_STA_ADDR), - - /* keep last for the build bug below */ - (void *)0x1 #undef FLAG }; @@ -148,7 +145,7 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, /* fail compilation if somebody adds or removes * a flag without updating the name array above */ - BUILD_BUG_ON(hw_flag_names[NUM_IEEE80211_HW_FLAGS] != (void *)0x1); + BUILD_BUG_ON(ARRAY_SIZE(hw_flag_names) != NUM_IEEE80211_HW_FLAGS); for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) { if (test_bit(i, local->hw.flags)) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 8c067e6..95e757c 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -891,7 +891,7 @@ config NETFILTER_XT_TARGET_TEE depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DUP_IPV4 - select NF_DUP_IPV6 if IP6_NF_IPTABLES != n + select NF_DUP_IPV6 if IPV6 ---help--- This option adds a "TEE" target with which a packet can be cloned and this clone be rerouted to another nexthop. diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 43d8c98..f0f688d 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -164,8 +164,6 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (e.cidr == 0) - return -EINVAL; if (adt == IPSET_TEST) e.cidr = HOST_MASK; @@ -377,8 +375,6 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (e.cidr == 0) - return -EINVAL; if (adt == IPSET_TEST) e.cidr = HOST_MASK; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3cb3cb8..f60b4fd 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -66,6 +66,21 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks); __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); +static __read_mostly spinlock_t nf_conntrack_locks_all_lock; +static __read_mostly bool nf_conntrack_locks_all; + +void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) +{ + spin_lock(lock); + while (unlikely(nf_conntrack_locks_all)) { + spin_unlock(lock); + spin_lock(&nf_conntrack_locks_all_lock); + spin_unlock(&nf_conntrack_locks_all_lock); + spin_lock(lock); + } +} +EXPORT_SYMBOL_GPL(nf_conntrack_lock); + static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) { h1 %= CONNTRACK_LOCKS; @@ -82,12 +97,12 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, h1 %= CONNTRACK_LOCKS; h2 %= CONNTRACK_LOCKS; if (h1 <= h2) { - spin_lock(&nf_conntrack_locks[h1]); + nf_conntrack_lock(&nf_conntrack_locks[h1]); if (h1 != h2) spin_lock_nested(&nf_conntrack_locks[h2], SINGLE_DEPTH_NESTING); } else { - spin_lock(&nf_conntrack_locks[h2]); + nf_conntrack_lock(&nf_conntrack_locks[h2]); spin_lock_nested(&nf_conntrack_locks[h1], SINGLE_DEPTH_NESTING); } @@ -102,16 +117,19 @@ static void nf_conntrack_all_lock(void) { int i; - for (i = 0; i < CONNTRACK_LOCKS; i++) - spin_lock_nested(&nf_conntrack_locks[i], i); + spin_lock(&nf_conntrack_locks_all_lock); + nf_conntrack_locks_all = true; + + for (i = 0; i < CONNTRACK_LOCKS; i++) { + spin_lock(&nf_conntrack_locks[i]); + spin_unlock(&nf_conntrack_locks[i]); + } } static void nf_conntrack_all_unlock(void) { - int i; - - for (i = 0; i < CONNTRACK_LOCKS; i++) - spin_unlock(&nf_conntrack_locks[i]); + nf_conntrack_locks_all = false; + spin_unlock(&nf_conntrack_locks_all_lock); } unsigned int nf_conntrack_htable_size __read_mostly; @@ -757,7 +775,7 @@ restart: hash = hash_bucket(_hash, net); for (; i < net->ct.htable_size; i++) { lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; - spin_lock(lockp); + nf_conntrack_lock(lockp); if (read_seqcount_retry(&net->ct.generation, sequence)) { spin_unlock(lockp); goto restart; @@ -1382,7 +1400,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), for (; *bucket < net->ct.htable_size; (*bucket)++) { lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); - spin_lock(lockp); + nf_conntrack_lock(lockp); if (*bucket < net->ct.htable_size) { hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) @@ -1394,6 +1412,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), } spin_unlock(lockp); local_bh_enable(); + cond_resched(); } for_each_possible_cpu(cpu) { @@ -1406,6 +1425,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), set_bit(IPS_DYING_BIT, &ct->status); } spin_unlock_bh(&pcpu->lock); + cond_resched(); } return NULL; found: @@ -1422,6 +1442,8 @@ void nf_ct_iterate_cleanup(struct net *net, struct nf_conn *ct; unsigned int bucket = 0; + might_sleep(); + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) @@ -1430,6 +1452,7 @@ void nf_ct_iterate_cleanup(struct net *net, /* ... else the timer will get him soon. */ nf_ct_put(ct); + cond_resched(); } } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index bd9d315..3b40ec5 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -425,7 +425,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, } local_bh_disable(); for (i = 0; i < net->ct.htable_size; i++) { - spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); if (i < net->ct.htable_size) { hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) unhelp(h, me); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index dbb1bb3..355e855 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -840,7 +840,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { restart: lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; - spin_lock(lockp); + nf_conntrack_lock(lockp); if (cb->args[0] >= net->ct.htable_size) { spin_unlock(lockp); goto out; diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index b6605e0..5eefe4a 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -224,12 +224,12 @@ static int __init nf_tables_netdev_init(void) nft_register_chain_type(&nft_filter_chain_netdev); ret = register_pernet_subsys(&nf_tables_netdev_net_ops); - if (ret < 0) + if (ret < 0) { nft_unregister_chain_type(&nft_filter_chain_netdev); - + return ret; + } register_netdevice_notifier(&nf_tables_netdev_notifier); - - return ret; + return 0; } static void __exit nf_tables_netdev_exit(void) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index a7ba233..857ae89 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -311,14 +311,14 @@ replay: #endif { nfnl_unlock(subsys_id); - netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP); return kfree_skb(skb); } } if (!ss->commit || !ss->abort) { nfnl_unlock(subsys_id); - netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP); return kfree_skb(skb); } @@ -328,10 +328,12 @@ replay: nlh = nlmsg_hdr(skb); err = 0; - if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) || - skb->len < nlh->nlmsg_len) { - err = -EINVAL; - goto ack; + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < nlh->nlmsg_len || + nlmsg_len(nlh) < sizeof(struct nfgenmsg)) { + nfnl_err_reset(&err_list); + status |= NFNL_BATCH_FAILURE; + goto done; } /* Only requests are handled by the kernel */ @@ -406,7 +408,7 @@ ack: * pointing to the batch header. */ nfnl_err_reset(&err_list); - netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM); + netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM); status |= NFNL_BATCH_FAILURE; goto done; } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 5d010f2..2671b9d 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -307,7 +307,7 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) local_bh_disable(); for (i = 0; i < net->ct.htable_size; i++) { - spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); if (i < net->ct.htable_size) { hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) untimeout(h, timeout); diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 383c171..b78c28b 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -46,16 +46,14 @@ static void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { - src64 = get_unaligned_be64(&src[i]); - src64 = be64_to_cpu((__force __be64)src64); + src64 = get_unaligned((u64 *)&src[i]); put_unaligned_be64(src64, &dst[i]); } break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 8; i++) { src64 = get_unaligned_be64(&src[i]); - src64 = (__force u64)cpu_to_be64(src64); - put_unaligned_be64(src64, &dst[i]); + put_unaligned(src64, (u64 *)&dst[i]); } break; } diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index c7808fc..c9743f7 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -100,7 +100,7 @@ static int nft_counter_init(const struct nft_ctx *ctx, cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu); if (cpu_stats == NULL) - return ENOMEM; + return -ENOMEM; preempt_disable(); this_cpu = this_cpu_ptr(cpu_stats); @@ -138,7 +138,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu, GFP_ATOMIC); if (cpu_stats == NULL) - return ENOMEM; + return -ENOMEM; preempt_disable(); this_cpu = this_cpu_ptr(cpu_stats); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index a0eb216..d4a4619 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -127,6 +127,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, NF_CT_LABELS_MAX_SIZE - size); return; } +#endif case NFT_CT_BYTES: /* fallthrough */ case NFT_CT_PKTS: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); @@ -138,7 +139,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr, memcpy(dest, &count, sizeof(count)); return; } -#endif default: break; } diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index b7c43de..e118397 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -228,7 +228,7 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); u8 nexthdr; - __be16 frag_off; + __be16 frag_off, oldlen, newlen; int tcphoff; int ret; @@ -244,7 +244,12 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) return NF_DROP; if (ret > 0) { ipv6h = ipv6_hdr(skb); - ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) + ret); + oldlen = ipv6h->payload_len; + newlen = htons(ntohs(oldlen) + ret); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(csum_sub(skb->csum, oldlen), + newlen); + ipv6h->payload_len = newlen; } return XT_CONTINUE; } diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 3eff7b6..6e57a39 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -38,7 +38,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -#if IS_ENABLED(CONFIG_NF_DUP_IPV6) +#if IS_ENABLED(CONFIG_IPV6) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -131,7 +131,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .destroy = tee_tg_destroy, .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_NF_DUP_IPV6) +#if IS_ENABLED(CONFIG_IPV6) { .name = "TEE", .revision = 1, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 81dc1bb..f1ffb34 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2831,7 +2831,8 @@ static int netlink_dump(struct sock *sk) * reasonable static buffer based on the expected largest dump of a * single netdev. The outcome is MSG_TRUNC error. */ - skb_reserve(skb, skb_tailroom(skb) - alloc_size); + if (!netlink_rx_is_mmaped(sk)) + skb_reserve(skb, skb_tailroom(skb) - alloc_size); netlink_skb_set_owner_r(skb, sk); len = cb->dump(skb, cb); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c88d0f2..2d59df5 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1160,17 +1160,26 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *acts, struct sw_flow_key *key) { - int level = this_cpu_read(exec_actions_level); - int err; + static const int ovs_recursion_limit = 5; + int err, level; + + level = __this_cpu_inc_return(exec_actions_level); + if (unlikely(level > ovs_recursion_limit)) { + net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", + ovs_dp_name(dp)); + kfree_skb(skb); + err = -ENETDOWN; + goto out; + } - this_cpu_inc(exec_actions_level); err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); - if (!level) + if (level == 1) process_deferred_actions(dp); - this_cpu_dec(exec_actions_level); +out: + __this_cpu_dec(exec_actions_level); return err; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 91a8b00..deadfda 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -336,12 +336,10 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, unsigned short gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; - struct ovs_skb_cb ovs_cb; int err; - ovs_cb = *OVS_CB(skb); + BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); - *OVS_CB(skb) = ovs_cb; if (IS_ERR(segs)) return PTR_ERR(segs); if (segs == NULL) @@ -359,7 +357,6 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, /* Queue all of the segments. */ skb = segs; do { - *OVS_CB(skb) = ovs_cb; if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 1605691..5eb7694 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -90,7 +90,9 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) int err; struct vxlan_config conf = { .no_share = true, - .flags = VXLAN_F_COLLECT_METADATA, + .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX, + /* Don't restrict the packets that can be sent by MTU */ + .mtu = IP_MAX_MTU, }; if (!options) { diff --git a/net/rds/ib.c b/net/rds/ib.c index f222885..9481d55 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -122,44 +122,34 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) static void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; - struct ib_device_attr *dev_attr; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) return; - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - rdsdebug("Query device failed for %s\n", device->name); - goto free_attr; - } - rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, ibdev_to_node(device)); if (!rds_ibdev) - goto free_attr; + return; spin_lock_init(&rds_ibdev->spinlock); atomic_set(&rds_ibdev->refcount, 1); INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); - rds_ibdev->max_wrs = dev_attr->max_qp_wr; - rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); + rds_ibdev->max_wrs = device->attrs.max_qp_wr; + rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); - rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; - rds_ibdev->max_1m_fmrs = dev_attr->max_mr ? - min_t(unsigned int, (dev_attr->max_mr / 2), + rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; + rds_ibdev->max_1m_fmrs = device->attrs.max_mr ? + min_t(unsigned int, (device->attrs.max_mr / 2), rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size; - rds_ibdev->max_8k_fmrs = dev_attr->max_mr ? - min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE), + rds_ibdev->max_8k_fmrs = device->attrs.max_mr ? + min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE), rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size; - rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; - rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; + rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device); @@ -183,7 +173,7 @@ static void rds_ib_add_one(struct ib_device *device) } rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n", - dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, + device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs, rds_ibdev->max_8k_fmrs); @@ -202,8 +192,6 @@ static void rds_ib_add_one(struct ib_device *device) put_dev: rds_ib_dev_put(rds_ibdev); -free_attr: - kfree(dev_attr); } /* diff --git a/net/rds/iw.c b/net/rds/iw.c index 576f182..f4a9fff 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -60,30 +60,20 @@ LIST_HEAD(iw_nodev_conns); static void rds_iw_add_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; - struct ib_device_attr *dev_attr; /* Only handle iwarp devices */ if (device->node_type != RDMA_NODE_RNIC) return; - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - rdsdebug("Query device failed for %s\n", device->name); - goto free_attr; - } - rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); if (!rds_iwdev) - goto free_attr; + return; spin_lock_init(&rds_iwdev->spinlock); - rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); - rds_iwdev->max_wrs = dev_attr->max_qp_wr; - rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); + rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); + rds_iwdev->max_wrs = device->attrs.max_qp_wr; + rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE); rds_iwdev->dev = device; rds_iwdev->pd = ib_alloc_pd(device); @@ -111,8 +101,7 @@ static void rds_iw_add_one(struct ib_device *device) list_add_tail(&rds_iwdev->list, &rds_iw_devices); ib_set_client_data(device, &rds_iw_client, rds_iwdev); - - goto free_attr; + return; err_mr: if (rds_iwdev->mr) @@ -121,8 +110,6 @@ err_pd: ib_dealloc_pd(rds_iwdev->pd); free_dev: kfree(rds_iwdev); -free_attr: - kfree(dev_attr); } static void rds_iw_remove_one(struct ib_device *device, void *client_data) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b5c2cf2..af1acf0 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1852,6 +1852,7 @@ reset: } tp = old_tp; + protocol = tc_skb_protocol(skb); goto reclassify; #endif } diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index f26bdea..a1cd778 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -403,6 +403,8 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) if (len <= cl->deficit) { cl->deficit -= len; skb = qdisc_dequeue_peeked(cl->qdisc); + if (unlikely(skb == NULL)) + goto out; if (cl->qdisc->q.qlen == 0) list_del(&cl->alist); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 52838ea..2522a61 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -333,7 +333,7 @@ struct sctp_association *sctp_endpoint_lookup_assoc( if (!ep->base.bind_addr.port) goto out; t = sctp_epaddr_lookup_transport(ep, paddr); - if (!t || t->asoc->temp) + if (!t) goto out; *transport = t; diff --git a/net/sctp/input.c b/net/sctp/input.c index d9a6e66..49d2cc7 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -784,6 +784,7 @@ hit: /* rhashtable for transport */ struct sctp_hash_cmp_arg { + const struct sctp_endpoint *ep; const union sctp_addr *laddr; const union sctp_addr *paddr; const struct net *net; @@ -797,15 +798,20 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, struct sctp_association *asoc = t->asoc; const struct net *net = x->net; - if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port)) - return 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) return 1; if (!net_eq(sock_net(asoc->base.sk), net)) return 1; - if (!sctp_bind_addr_match(&asoc->base.bind_addr, - x->laddr, sctp_sk(asoc->base.sk))) - return 1; + if (x->ep) { + if (x->ep != asoc->ep) + return 1; + } else { + if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port)) + return 1; + if (!sctp_bind_addr_match(&asoc->base.bind_addr, + x->laddr, sctp_sk(asoc->base.sk))) + return 1; + } return 0; } @@ -832,9 +838,11 @@ static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed) const struct sctp_hash_cmp_arg *x = data; const union sctp_addr *paddr = x->paddr; const struct net *net = x->net; - u16 lport = x->laddr->v4.sin_port; + u16 lport; u32 addr; + lport = x->ep ? htons(x->ep->base.bind_addr.port) : + x->laddr->v4.sin_port; if (paddr->sa.sa_family == AF_INET6) addr = jhash(&paddr->v6.sin6_addr, 16, seed); else @@ -864,12 +872,12 @@ void sctp_transport_hashtable_destroy(void) void sctp_hash_transport(struct sctp_transport *t) { - struct sctp_sockaddr_entry *addr; struct sctp_hash_cmp_arg arg; - addr = list_entry(t->asoc->base.bind_addr.address_list.next, - struct sctp_sockaddr_entry, list); - arg.laddr = &addr->a; + if (t->asoc->temp) + return; + + arg.ep = t->asoc->ep; arg.paddr = &t->ipaddr; arg.net = sock_net(t->asoc->base.sk); @@ -881,6 +889,9 @@ reinsert: void sctp_unhash_transport(struct sctp_transport *t) { + if (t->asoc->temp) + return; + rhashtable_remove_fast(&sctp_transport_hashtable, &t->node, sctp_hash_params); } @@ -891,6 +902,7 @@ struct sctp_transport *sctp_addrs_lookup_transport( const union sctp_addr *paddr) { struct sctp_hash_cmp_arg arg = { + .ep = NULL, .laddr = laddr, .paddr = paddr, .net = net, @@ -904,13 +916,15 @@ struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr) { - struct sctp_sockaddr_entry *addr; struct net *net = sock_net(ep->base.sk); + struct sctp_hash_cmp_arg arg = { + .ep = ep, + .paddr = paddr, + .net = net, + }; - addr = list_entry(ep->base.bind_addr.address_list.next, - struct sctp_sockaddr_entry, list); - - return sctp_addrs_lookup_transport(net, &addr->a, paddr); + return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg, + sctp_hash_params); } /* Look up an association. */ @@ -921,15 +935,22 @@ static struct sctp_association *__sctp_lookup_association( struct sctp_transport **pt) { struct sctp_transport *t; + struct sctp_association *asoc = NULL; + rcu_read_lock(); t = sctp_addrs_lookup_transport(net, local, peer); - if (!t || t->dead || t->asoc->temp) - return NULL; + if (!t || !sctp_transport_hold(t)) + goto out; - sctp_association_hold(t->asoc); + asoc = t->asoc; + sctp_association_hold(asoc); *pt = t; - return t->asoc; + sctp_transport_put(t); + +out: + rcu_read_unlock(); + return asoc; } /* Look up an association. protected by RCU read lock */ @@ -941,9 +962,7 @@ struct sctp_association *sctp_lookup_association(struct net *net, { struct sctp_association *asoc; - rcu_read_lock(); asoc = __sctp_lookup_association(net, laddr, paddr, transportp); - rcu_read_unlock(); return asoc; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ec52912..ce46f1c 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -526,6 +526,8 @@ static int sctp_v6_cmp_addr(const union sctp_addr *addr1, } return 0; } + if (addr1->v6.sin6_port != addr2->v6.sin6_port) + return 0; if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr)) return 0; /* If this is a linklocal address, compare the scope_id. */ diff --git a/net/sctp/proc.c b/net/sctp/proc.c index dfa7eec..963dffc 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -165,8 +165,6 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa list_for_each_entry_rcu(transport, &assoc->peer.transport_addr_list, transports) { addr = &transport->ipaddr; - if (transport->dead) - continue; af = sctp_get_af_specific(addr->sa.sa_family); if (af->cmp_addr(addr, primary)) { @@ -310,7 +308,7 @@ static struct sctp_transport *sctp_transport_get_next(struct seq_file *seq) static struct sctp_transport *sctp_transport_get_idx(struct seq_file *seq, loff_t pos) { - void *obj; + void *obj = SEQ_START_TOKEN; while (pos && (obj = sctp_transport_get_next(seq)) && !IS_ERR(obj)) pos--; @@ -347,7 +345,7 @@ static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos) if (err) return ERR_PTR(err); - return *pos ? sctp_transport_get_idx(seq, *pos) : SEQ_START_TOKEN; + return sctp_transport_get_idx(seq, *pos); } static void sctp_assocs_seq_stop(struct seq_file *seq, void *v) @@ -380,6 +378,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) } transport = (struct sctp_transport *)v; + if (!sctp_transport_hold(transport)) + return 0; assoc = transport->asoc; epb = &assoc->base; sk = epb->sk; @@ -412,6 +412,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) sk->sk_rcvbuf); seq_printf(seq, "\n"); + sctp_transport_put(transport); + return 0; } @@ -462,7 +464,7 @@ static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos) if (err) return ERR_PTR(err); - return *pos ? sctp_transport_get_idx(seq, *pos) : SEQ_START_TOKEN; + return sctp_transport_get_idx(seq, *pos); } static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -480,7 +482,7 @@ static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v) static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) { struct sctp_association *assoc; - struct sctp_transport *tsp; + struct sctp_transport *transport, *tsp; if (v == SEQ_START_TOKEN) { seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX " @@ -488,13 +490,13 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) return 0; } - tsp = (struct sctp_transport *)v; - assoc = tsp->asoc; + transport = (struct sctp_transport *)v; + if (!sctp_transport_hold(transport)) + return 0; + assoc = transport->asoc; list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list, transports) { - if (tsp->dead) - continue; /* * The remote address (ADDR) */ @@ -544,6 +546,8 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } + sctp_transport_put(transport); + return 0; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index ab0d538..1099e99 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -60,6 +60,8 @@ #include <net/inet_common.h> #include <net/inet_ecn.h> +#define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024) + /* Global data structures. */ struct sctp_globals sctp_globals __read_mostly; @@ -1355,6 +1357,8 @@ static __init int sctp_init(void) unsigned long limit; int max_share; int order; + int num_entries; + int max_entry_order; sock_skb_cb_check_size(sizeof(struct sctp_ulpevent)); @@ -1407,14 +1411,24 @@ static __init int sctp_init(void) /* Size and allocate the association hash table. * The methodology is similar to that of the tcp hash tables. + * Though not identical. Start by getting a goal size */ if (totalram_pages >= (128 * 1024)) goal = totalram_pages >> (22 - PAGE_SHIFT); else goal = totalram_pages >> (24 - PAGE_SHIFT); - for (order = 0; (1UL << order) < goal; order++) - ; + /* Then compute the page order for said goal */ + order = get_order(goal); + + /* Now compute the required page order for the maximum sized table we + * want to create + */ + max_entry_order = get_order(MAX_SCTP_PORT_HASH_ENTRIES * + sizeof(struct sctp_bind_hashbucket)); + + /* Limit the page order by that maximum hash table size */ + order = min(order, max_entry_order); /* Allocate and initialize the endpoint hash table. */ sctp_ep_hashsize = 64; @@ -1430,20 +1444,35 @@ static __init int sctp_init(void) INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain); } - /* Allocate and initialize the SCTP port hash table. */ + /* Allocate and initialize the SCTP port hash table. + * Note that order is initalized to start at the max sized + * table we want to support. If we can't get that many pages + * reduce the order and try again + */ do { - sctp_port_hashsize = (1UL << order) * PAGE_SIZE / - sizeof(struct sctp_bind_hashbucket); - if ((sctp_port_hashsize > (64 * 1024)) && order > 0) - continue; sctp_port_hashtable = (struct sctp_bind_hashbucket *) __get_free_pages(GFP_KERNEL | __GFP_NOWARN, order); } while (!sctp_port_hashtable && --order > 0); + if (!sctp_port_hashtable) { pr_err("Failed bind hash alloc\n"); status = -ENOMEM; goto err_bhash_alloc; } + + /* Now compute the number of entries that will fit in the + * port hash space we allocated + */ + num_entries = (1UL << order) * PAGE_SIZE / + sizeof(struct sctp_bind_hashbucket); + + /* And finish by rounding it down to the nearest power of two + * this wastes some memory of course, but its needed because + * the hash function operates based on the assumption that + * that the number of entries is a power of two + */ + sctp_port_hashsize = rounddown_pow_of_two(num_entries); + for (i = 0; i < sctp_port_hashsize; i++) { spin_lock_init(&sctp_port_hashtable[i].lock); INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); @@ -1452,7 +1481,8 @@ static __init int sctp_init(void) if (sctp_transport_hashtable_init()) goto err_thash_alloc; - pr_info("Hash tables configured (bind %d)\n", sctp_port_hashsize); + pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize, + num_entries); sctp_sysctl_register(); diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 2e21384..b5327bb 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -259,12 +259,6 @@ void sctp_generate_t3_rtx_event(unsigned long peer) goto out_unlock; } - /* Is this transport really dead and just waiting around for - * the timer to let go of the reference? - */ - if (transport->dead) - goto out_unlock; - /* Run through the state machine. */ error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_T3_RTX), @@ -380,12 +374,6 @@ void sctp_generate_heartbeat_event(unsigned long data) goto out_unlock; } - /* Is this structure just waiting around for us to actually - * get destroyed? - */ - if (transport->dead) - goto out_unlock; - error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT), asoc->state, asoc->ep, asoc, diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9bb80ec..e878da0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5538,6 +5538,7 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, struct sctp_hmac_algo_param *hmacs; __u16 data_len = 0; u32 num_idents; + int i; if (!ep->auth_enable) return -EACCES; @@ -5555,8 +5556,12 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, return -EFAULT; if (put_user(num_idents, &p->shmac_num_idents)) return -EFAULT; - if (copy_to_user(p->shmac_idents, hmacs->hmac_ids, data_len)) - return -EFAULT; + for (i = 0; i < num_idents; i++) { + __u16 hmacid = ntohs(hmacs->hmac_ids[i]); + + if (copy_to_user(&p->shmac_idents[i], &hmacid, sizeof(__u16))) + return -EFAULT; + } return 0; } @@ -6636,6 +6641,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_SACK_IMMEDIATELY | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; @@ -6659,6 +6665,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) if (cmsgs->sinfo->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_SACK_IMMEDIATELY | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index aab9e3f..a431c14 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -132,8 +132,6 @@ fail: */ void sctp_transport_free(struct sctp_transport *transport) { - transport->dead = 1; - /* Try to delete the heartbeat timer. */ if (del_timer(&transport->hb_timer)) sctp_transport_put(transport); @@ -169,7 +167,7 @@ static void sctp_transport_destroy_rcu(struct rcu_head *head) */ static void sctp_transport_destroy(struct sctp_transport *transport) { - if (unlikely(!transport->dead)) { + if (unlikely(atomic_read(&transport->refcnt))) { WARN(1, "Attempt to destroy undead transport %p!\n", transport); return; } @@ -296,9 +294,9 @@ void sctp_transport_route(struct sctp_transport *transport, } /* Hold a reference to a transport. */ -void sctp_transport_hold(struct sctp_transport *transport) +int sctp_transport_hold(struct sctp_transport *transport) { - atomic_inc(&transport->refcnt); + return atomic_add_unless(&transport->refcnt, 1, 0); } /* Release a reference to a transport and clean up diff --git a/net/socket.c b/net/socket.c index 91c2de6..c044d1e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -294,7 +294,7 @@ static int init_inodecache(void) 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), init_once); if (sock_inode_cachep == NULL) return -ENOMEM; diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 59eeed4..f0c6a8c 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -326,6 +326,9 @@ int gssp_accept_sec_context_upcall(struct net *net, if (data->found_creds && client_name.data != NULL) { char *c; + data->creds.cr_raw_principal = kstrndup(client_name.data, + client_name.len, GFP_KERNEL); + data->creds.cr_principal = kstrndup(client_name.data, client_name.len, GFP_KERNEL); if (data->creds.cr_principal) { diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 5e4f815..2b32fd6 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -771,7 +771,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (count == 0) return 0; - mutex_lock(&inode->i_mutex); /* protect against multiple concurrent + inode_lock(inode); /* protect against multiple concurrent * readers on this file */ again: spin_lock(&queue_lock); @@ -784,7 +784,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, } if (rp->q.list.next == &cd->queue) { spin_unlock(&queue_lock); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; } @@ -838,7 +838,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, } if (err == -EAGAIN) goto again; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err ? err : count; } @@ -909,9 +909,9 @@ static ssize_t cache_write(struct file *filp, const char __user *buf, if (!cd->cache_parse) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = cache_downcall(mapping, buf, count, cd); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return ret; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 23608eb..b7f2104 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1217,6 +1217,7 @@ static int rpc_anyaddr(int family, struct sockaddr *buf, size_t buflen) return -EINVAL; memcpy(buf, &rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); + break; default: dprintk("RPC: %s: address family not supported\n", __func__); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index d81186d..31789ef 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -172,7 +172,7 @@ rpc_close_pipes(struct inode *inode) int need_release; LIST_HEAD(free_list); - mutex_lock(&inode->i_mutex); + inode_lock(inode); spin_lock(&pipe->lock); need_release = pipe->nreaders != 0 || pipe->nwriters != 0; pipe->nreaders = 0; @@ -188,7 +188,7 @@ rpc_close_pipes(struct inode *inode) cancel_delayed_work_sync(&pipe->queue_timeout); rpc_inode_setowner(inode, NULL); RPC_I(inode)->pipe = NULL; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } static struct inode * @@ -221,7 +221,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) int first_open; int res = -ENXIO; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -237,7 +237,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) pipe->nwriters++; res = 0; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -248,7 +248,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) struct rpc_pipe_msg *msg; int last_close; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -278,7 +278,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) if (last_close && pipe->ops->release_pipe) pipe->ops->release_pipe(inode); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } @@ -290,7 +290,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) struct rpc_pipe_msg *msg; int res = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { res = -EPIPE; @@ -322,7 +322,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) pipe->ops->destroy_msg(msg); } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -332,11 +332,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of struct inode *inode = file_inode(filp); int res; - mutex_lock(&inode->i_mutex); + inode_lock(inode); res = -EPIPE; if (RPC_I(inode)->pipe != NULL) res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -349,12 +349,12 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) poll_wait(filp, &rpci->waitq, wait); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (rpci->pipe == NULL) mask |= POLLERR | POLLHUP; else if (filp->private_data || !list_empty(&rpci->pipe->pipe)) mask |= POLLIN | POLLRDNORM; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return mask; } @@ -367,10 +367,10 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FIONREAD: - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -EPIPE; } spin_lock(&pipe->lock); @@ -381,7 +381,7 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) len += msg->len - msg->copied; } spin_unlock(&pipe->lock); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return put_user(len, (int __user *)arg); default: return -EINVAL; @@ -617,9 +617,9 @@ int rpc_rmdir(struct dentry *dentry) parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmdir(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -701,9 +701,9 @@ static void rpc_depopulate(struct dentry *parent, { struct inode *dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(dir, I_MUTEX_CHILD); __rpc_depopulate(parent, files, start, eof); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); } static int rpc_populate(struct dentry *parent, @@ -715,7 +715,7 @@ static int rpc_populate(struct dentry *parent, struct dentry *dentry; int i, err; - mutex_lock(&dir->i_mutex); + inode_lock(dir); for (i = start; i < eof; i++) { dentry = __rpc_lookup_create_exclusive(parent, files[i].name); err = PTR_ERR(dentry); @@ -739,11 +739,11 @@ static int rpc_populate(struct dentry *parent, if (err != 0) goto out_bad; } - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return 0; out_bad: __rpc_depopulate(parent, files, start, eof); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); printk(KERN_WARNING "%s: %s failed to populate directory %pd\n", __FILE__, __func__, parent); return err; @@ -757,7 +757,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, struct inode *dir = d_inode(parent); int error; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; @@ -770,7 +770,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, goto err_rmdir; } out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; err_rmdir: __rpc_rmdir(dir, dentry); @@ -788,11 +788,11 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); if (depopulate != NULL) depopulate(dentry); error = __rpc_rmdir(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -828,7 +828,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (pipe->ops->downcall == NULL) umode &= ~S_IWUGO; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; @@ -837,7 +837,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (err) goto out_err; out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; out_err: dentry = ERR_PTR(err); @@ -865,9 +865,9 @@ rpc_unlink(struct dentry *dentry) parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmpipe(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -1500,7 +1500,7 @@ int register_rpc_pipefs(void) rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", sizeof(struct rpc_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!rpc_inode_cachep) return -ENOMEM; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index a6cbb21..7422f28 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -10,11 +10,13 @@ #include <linux/kthread.h> #include <linux/slab.h> #include <net/sock.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/xprt.h> #include <linux/module.h> +#include <linux/netdevice.h> #include <trace/events/sunrpc.h> #define RPCDBG_FACILITY RPCDBG_SVCXPRT @@ -938,6 +940,49 @@ static void svc_age_temp_xprts(unsigned long closure) mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } +/* Close temporary transports whose xpt_local matches server_addr immediately + * instead of waiting for them to be picked up by the timer. + * + * This is meant to be called from a notifier_block that runs when an ip + * address is deleted. + */ +void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) +{ + struct svc_xprt *xprt; + struct svc_sock *svsk; + struct socket *sock; + struct list_head *le, *next; + LIST_HEAD(to_be_closed); + struct linger no_linger = { + .l_onoff = 1, + .l_linger = 0, + }; + + spin_lock_bh(&serv->sv_lock); + list_for_each_safe(le, next, &serv->sv_tempsocks) { + xprt = list_entry(le, struct svc_xprt, xpt_list); + if (rpc_cmp_addr(server_addr, (struct sockaddr *) + &xprt->xpt_local)) { + dprintk("svc_age_temp_xprts_now: found %p\n", xprt); + list_move(le, &to_be_closed); + } + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_closed)) { + le = to_be_closed.next; + list_del_init(le); + xprt = list_entry(le, struct svc_xprt, xpt_list); + dprintk("svc_age_temp_xprts_now: closing %p\n", xprt); + svsk = container_of(xprt, struct svc_sock, sk_xprt); + sock = svsk->sk_sock; + kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&no_linger, sizeof(no_linger)); + svc_close_xprt(xprt); + } +} +EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now); + static void call_xpt_users(struct svc_xprt *xprt) { struct svc_xpt_user *u; diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 79c0f34..69841db 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -55,6 +55,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) spin_unlock(&authtab_lock); rqstp->rq_auth_slack = 0; + init_svc_cred(&rqstp->rq_cred); rqstp->rq_authop = aops; return aops->accept(rqstp, authp); @@ -63,6 +64,7 @@ EXPORT_SYMBOL_GPL(svc_authenticate); int svc_set_client(struct svc_rqst *rqstp) { + rqstp->rq_client = NULL; return rqstp->rq_authop->set_client(rqstp); } EXPORT_SYMBOL_GPL(svc_set_client); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 621ca7b..dfacdc9 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -728,10 +728,6 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) struct kvec *resv = &rqstp->rq_res.head[0]; struct svc_cred *cred = &rqstp->rq_cred; - cred->cr_group_info = NULL; - cred->cr_principal = NULL; - rqstp->rq_client = NULL; - if (argv->iov_len < 3*4) return SVC_GARBAGE; @@ -794,10 +790,6 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) u32 slen, i; int len = argv->iov_len; - cred->cr_group_info = NULL; - cred->cr_principal = NULL; - rqstp->rq_client = NULL; - if ((len -= 3*4) < 0) return SVC_GARBAGE; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2e98f4a..37edea6 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1425,3 +1425,4 @@ void xprt_put(struct rpc_xprt *xprt) if (atomic_dec_and_test(&xprt->count)) xprt_destroy(xprt); } +EXPORT_SYMBOL_GPL(xprt_put); diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 33f99d3..dc9f3b5 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o \ fmr_ops.o frwr_ops.o physical_ops.o \ - svc_rdma.o svc_rdma_transport.o \ + svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 2dcb44f..cc1251d 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -15,7 +15,7 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -#define RPCRDMA_BACKCHANNEL_DEBUG +#undef RPCRDMA_BACKCHANNEL_DEBUG static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) @@ -42,8 +42,8 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, size_t size; req = rpcrdma_create_req(r_xprt); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); req->rl_backchannel = true; size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); @@ -84,9 +84,7 @@ out_fail: static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, unsigned int count) { - struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - unsigned long flags; int rc = 0; while (count--) { @@ -98,9 +96,7 @@ static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, break; } - spin_lock_irqsave(&buffers->rb_lock, flags); - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - spin_unlock_irqrestore(&buffers->rb_lock, flags); + rpcrdma_recv_buffer_put(rep); } return rc; @@ -140,6 +136,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) __func__); goto out_free; } + dprintk("RPC: %s: new rqst %p\n", __func__, rqst); rqst->rq_xprt = &r_xprt->rx_xprt; INIT_LIST_HEAD(&rqst->rq_list); @@ -220,12 +217,14 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) rpclen = rqst->rq_svec[0].iov_len; +#ifdef RPCRDMA_BACKCHANNEL_DEBUG pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n", __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf)); pr_info("RPC: %s: RPC/RDMA: %*ph\n", __func__, (int)RPCRDMA_HDRLEN_MIN, headerp); pr_info("RPC: %s: RPC: %*ph\n", __func__, (int)rpclen, rqst->rq_svec[0].iov_base); +#endif req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN; @@ -269,6 +268,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) { struct rpc_xprt *xprt = rqst->rq_xprt; + dprintk("RPC: %s: freeing rqst %p (req %p)\n", + __func__, rqst, rpcr_to_rdmar(rqst)); + smp_mb__before_atomic(); WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); @@ -333,9 +335,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpc_rqst, rq_bc_pa_list); list_del(&rqst->rq_bc_pa_list); spin_unlock(&xprt->bc_pa_lock); -#ifdef RPCRDMA_BACKCHANNEL_DEBUG - pr_info("RPC: %s: using rqst %p\n", __func__, rqst); -#endif + dprintk("RPC: %s: using rqst %p\n", __func__, rqst); /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; @@ -355,10 +355,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, * direction reply. */ req = rpcr_to_rdmar(rqst); -#ifdef RPCRDMA_BACKCHANNEL_DEBUG - pr_info("RPC: %s: attaching rep %p to req %p\n", + dprintk("RPC: %s: attaching rep %p to req %p\n", __func__, rep, req); -#endif req->rl_reply = rep; /* Defeat the retransmit detection logic in send_request */ diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index f1e8daf..c14f3a4 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -179,6 +179,69 @@ out_maperr: return rc; } +static void +__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + struct rpcrdma_mw *mw = seg->rl_mw; + int nsegs = seg->mr_nsegs; + + seg->rl_mw = NULL; + + while (nsegs--) + rpcrdma_unmap_one(device, seg++); + + rpcrdma_put_mw(r_xprt, mw); +} + +/* Invalidate all memory regions that were registered for "req". + * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + */ +static void +fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct rpcrdma_mr_seg *seg; + unsigned int i, nchunks; + struct rpcrdma_mw *mw; + LIST_HEAD(unmap_list); + int rc; + + dprintk("RPC: %s: req %p\n", __func__, req); + + /* ORDER: Invalidate all of the req's MRs first + * + * ib_unmap_fmr() is slow, so use a single call instead + * of one call per mapped MR. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + mw = seg->rl_mw; + + list_add(&mw->r.fmr.fmr->list, &unmap_list); + + i += seg->mr_nsegs; + } + rc = ib_unmap_fmr(&unmap_list); + if (rc) + pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); + + /* ORDER: Now DMA unmap all of the req's MRs, and return + * them to the free MW list. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + __fmr_dma_unmap(r_xprt, seg); + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + } + + req->rl_nchunks = 0; +} + /* Use the ib_unmap_fmr() verb to prevent further remote * access via RDMA READ or RDMA WRITE. */ @@ -231,6 +294,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, + .ro_unmap_sync = fmr_op_unmap_sync, .ro_unmap = fmr_op_unmap, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 88cf9e7..e165673 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -190,12 +190,11 @@ static int frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { - struct ib_device_attr *devattr = &ia->ri_devattr; int depth, delta; ia->ri_max_frmr_depth = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - devattr->max_fast_reg_page_list_len); + ia->ri_device->attrs.max_fast_reg_page_list_len); dprintk("RPC: %s: device's max FR page list len = %u\n", __func__, ia->ri_max_frmr_depth); @@ -222,8 +221,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, } ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { - cdata->max_requests = devattr->max_qp_wr / depth; + if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) { + cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth; if (!cdata->max_requests) return -EINVAL; ep->rep_attr.cap.max_send_wr = cdata->max_requests * @@ -245,12 +244,14 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); } -/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */ +/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs + * to be reset. + * + * WARNING: Only wr_id and status are reliable at this point + */ static void -frwr_sendcompletion(struct ib_wc *wc) +__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_mw *r) { - struct rpcrdma_mw *r; - if (likely(wc->status == IB_WC_SUCCESS)) return; @@ -261,9 +262,23 @@ frwr_sendcompletion(struct ib_wc *wc) else pr_warn("RPC: %s: frmr %p error, status %s (%d)\n", __func__, r, ib_wc_status_msg(wc->status), wc->status); + r->r.frmr.fr_state = FRMR_IS_STALE; } +static void +frwr_sendcompletion(struct ib_wc *wc) +{ + struct rpcrdma_mw *r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + struct rpcrdma_frmr *f = &r->r.frmr; + + if (unlikely(wc->status != IB_WC_SUCCESS)) + __frwr_sendcompletion_flush(wc, r); + + if (f->fr_waiter) + complete(&f->fr_linv_done); +} + static int frwr_op_init(struct rpcrdma_xprt *r_xprt) { @@ -319,7 +334,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct rpcrdma_mw *mw; struct rpcrdma_frmr *frmr; struct ib_mr *mr; - struct ib_reg_wr reg_wr; + struct ib_reg_wr *reg_wr; struct ib_send_wr *bad_wr; int rc, i, n, dma_nents; u8 key; @@ -335,7 +350,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, } while (mw->r.frmr.fr_state != FRMR_IS_INVALID); frmr = &mw->r.frmr; frmr->fr_state = FRMR_IS_VALID; + frmr->fr_waiter = false; mr = frmr->fr_mr; + reg_wr = &frmr->fr_regwr; if (nsegs > ia->ri_max_frmr_depth) nsegs = ia->ri_max_frmr_depth; @@ -381,19 +398,19 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); - reg_wr.wr.next = NULL; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.wr_id = (uintptr_t)mw; - reg_wr.wr.num_sge = 0; - reg_wr.wr.send_flags = 0; - reg_wr.mr = mr; - reg_wr.key = mr->rkey; - reg_wr.access = writing ? - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ; + reg_wr->wr.next = NULL; + reg_wr->wr.opcode = IB_WR_REG_MR; + reg_wr->wr.wr_id = (uintptr_t)mw; + reg_wr->wr.num_sge = 0; + reg_wr->wr.send_flags = 0; + reg_wr->mr = mr; + reg_wr->key = mr->rkey; + reg_wr->access = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, ®_wr.wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, ®_wr->wr, &bad_wr); if (rc) goto out_senderr; @@ -413,6 +430,116 @@ out_senderr: return rc; } +static struct ib_send_wr * +__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_mw *mw = seg->rl_mw; + struct rpcrdma_frmr *f = &mw->r.frmr; + struct ib_send_wr *invalidate_wr; + + f->fr_waiter = false; + f->fr_state = FRMR_IS_INVALID; + invalidate_wr = &f->fr_invwr; + + memset(invalidate_wr, 0, sizeof(*invalidate_wr)); + invalidate_wr->wr_id = (unsigned long)(void *)mw; + invalidate_wr->opcode = IB_WR_LOCAL_INV; + invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey; + + return invalidate_wr; +} + +static void +__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int rc) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + struct rpcrdma_mw *mw = seg->rl_mw; + struct rpcrdma_frmr *f = &mw->r.frmr; + + seg->rl_mw = NULL; + + ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir); + + if (!rc) + rpcrdma_put_mw(r_xprt, mw); + else + __frwr_queue_recovery(mw); +} + +/* Invalidate all memory regions that were registered for "req". + * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + */ +static void +frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg; + unsigned int i, nchunks; + struct rpcrdma_frmr *f; + int rc; + + dprintk("RPC: %s: req %p\n", __func__, req); + + /* ORDER: Invalidate all of the req's MRs first + * + * Chain the LOCAL_INV Work Requests and post them with + * a single ib_post_send() call. + */ + invalidate_wrs = pos = prev = NULL; + seg = NULL; + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + pos = __frwr_prepare_linv_wr(seg); + + if (!invalidate_wrs) + invalidate_wrs = pos; + else + prev->next = pos; + prev = pos; + + i += seg->mr_nsegs; + } + f = &seg->rl_mw->r.frmr; + + /* Strong send queue ordering guarantees that when the + * last WR in the chain completes, all WRs in the chain + * are complete. + */ + f->fr_invwr.send_flags = IB_SEND_SIGNALED; + f->fr_waiter = true; + init_completion(&f->fr_linv_done); + INIT_CQCOUNT(&r_xprt->rx_ep); + + /* Transport disconnect drains the receive CQ before it + * replaces the QP. The RPC reply handler won't call us + * unless ri_id->qp is a valid pointer. + */ + rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); + if (rc) + pr_warn("%s: ib_post_send failed %i\n", __func__, rc); + + wait_for_completion(&f->fr_linv_done); + + /* ORDER: Now DMA unmap all of the req's MRs, and return + * them to the free MW list. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + __frwr_dma_unmap(r_xprt, seg, rc); + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + } + + req->rl_nchunks = 0; +} + /* Post a LOCAL_INV Work Request to prevent further remote access * via RDMA READ or RDMA WRITE. */ @@ -423,23 +550,24 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_mw *mw = seg1->rl_mw; struct rpcrdma_frmr *frmr = &mw->r.frmr; - struct ib_send_wr invalidate_wr, *bad_wr; + struct ib_send_wr *invalidate_wr, *bad_wr; int rc, nsegs = seg->mr_nsegs; dprintk("RPC: %s: FRMR %p\n", __func__, mw); seg1->rl_mw = NULL; frmr->fr_state = FRMR_IS_INVALID; + invalidate_wr = &mw->r.frmr.fr_invwr; - memset(&invalidate_wr, 0, sizeof(invalidate_wr)); - invalidate_wr.wr_id = (unsigned long)(void *)mw; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey; + memset(invalidate_wr, 0, sizeof(*invalidate_wr)); + invalidate_wr->wr_id = (uintptr_t)mw; + invalidate_wr->opcode = IB_WR_LOCAL_INV; + invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); read_lock(&ia->ri_qplock); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr); read_unlock(&ia->ri_qplock); if (rc) goto out_err; @@ -471,6 +599,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, + .ro_unmap_sync = frwr_op_unmap_sync, .ro_unmap = frwr_op_unmap, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 617b76f..dbb302e 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -83,6 +83,18 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) return 1; } +/* DMA unmap all memory regions that were mapped for "req". + */ +static void +physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + unsigned int i; + + for (i = 0; req->rl_nchunks; --req->rl_nchunks) + rpcrdma_unmap_one(device, &req->rl_segments[i++]); +} + static void physical_op_destroy(struct rpcrdma_buffer *buf) { @@ -90,6 +102,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, + .ro_unmap_sync = physical_op_unmap_sync, .ro_unmap = physical_op_unmap, .ro_open = physical_op_open, .ro_maxpages = physical_op_maxpages, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c10d969..0f28f2d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -804,6 +804,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (req->rl_reply) goto out_duplicate; + /* Sanity checking has passed. We are now committed + * to complete this transaction. + */ + list_del_init(&rqst->rq_list); + spin_unlock_bh(&xprt->transport_lock); dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" " RPC request 0x%p xid 0x%08x\n", __func__, rep, req, rqst, @@ -888,12 +893,23 @@ badheader: break; } + /* Invalidate and flush the data payloads before waking the + * waiting application. This guarantees the memory region is + * properly fenced from the server before the application + * accesses the data. It also ensures proper send flow + * control: waking the next RPC waits until this RPC has + * relinquished all its Send Queue entries. + */ + if (req->rl_nchunks) + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); + credits = be32_to_cpu(headerp->rm_credit); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_buf.rb_max_requests) credits = r_xprt->rx_buf.rb_max_requests; + spin_lock_bh(&xprt->transport_lock); cwnd = xprt->cwnd; xprt->cwnd = credits << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 1b7051b..c846ca9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -55,6 +55,7 @@ unsigned int svcrdma_ord = RPCRDMA_ORD; static unsigned int min_ord = 1; static unsigned int max_ord = 4096; unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; +unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; static unsigned int min_max_requests = 4; static unsigned int max_max_requests = 16384; unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; @@ -71,10 +72,6 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; -/* Temporary NFS request map and context caches */ -struct kmem_cache *svc_rdma_map_cachep; -struct kmem_cache *svc_rdma_ctxt_cachep; - struct workqueue_struct *svc_rdma_wq; /* @@ -243,17 +240,16 @@ void svc_rdma_cleanup(void) svc_unreg_xprt_class(&svc_rdma_bc_class); #endif svc_unreg_xprt_class(&svc_rdma_class); - kmem_cache_destroy(svc_rdma_map_cachep); - kmem_cache_destroy(svc_rdma_ctxt_cachep); } int svc_rdma_init(void) { dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); - dprintk("\tmax_requests : %d\n", svcrdma_max_requests); - dprintk("\tsq_depth : %d\n", + dprintk("\tmax_requests : %u\n", svcrdma_max_requests); + dprintk("\tsq_depth : %u\n", svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); + dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); @@ -264,39 +260,10 @@ int svc_rdma_init(void) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); - /* Create the temporary map cache */ - svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache", - sizeof(struct svc_rdma_req_map), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!svc_rdma_map_cachep) { - printk(KERN_INFO "Could not allocate map cache.\n"); - goto err0; - } - - /* Create the temporary context cache */ - svc_rdma_ctxt_cachep = - kmem_cache_create("svc_rdma_ctxt_cache", - sizeof(struct svc_rdma_op_ctxt), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!svc_rdma_ctxt_cachep) { - printk(KERN_INFO "Could not allocate WR ctxt cache.\n"); - goto err1; - } - /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); #if defined(CONFIG_SUNRPC_BACKCHANNEL) svc_reg_xprt_class(&svc_rdma_bc_class); #endif return 0; - err1: - kmem_cache_destroy(svc_rdma_map_cachep); - err0: - unregister_sysctl_table(svcrdma_table_header); - destroy_workqueue(svc_rdma_wq); - return -ENOMEM; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c new file mode 100644 index 0000000..65a7c23 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * + * Support for backward direction RPCs on RPC/RDMA (server-side). + */ + +#include <linux/sunrpc/svc_rdma.h> +#include "xprt_rdma.h" + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +#undef SVCRDMA_BACKCHANNEL_DEBUG + +int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, + struct xdr_buf *rcvbuf) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct kvec *dst, *src = &rcvbuf->head[0]; + struct rpc_rqst *req; + unsigned long cwnd; + u32 credits; + size_t len; + __be32 xid; + __be32 *p; + int ret; + + p = (__be32 *)src->iov_base; + len = src->iov_len; + xid = rmsgp->rm_xid; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: xid=%08x, length=%zu\n", + __func__, be32_to_cpu(xid), len); + pr_info("%s: RPC/RDMA: %*ph\n", + __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp); + pr_info("%s: RPC: %*ph\n", + __func__, (int)len, p); +#endif + + ret = -EAGAIN; + if (src->iov_len < 24) + goto out_shortreply; + + spin_lock_bh(&xprt->transport_lock); + req = xprt_lookup_rqst(xprt, xid); + if (!req) + goto out_notfound; + + dst = &req->rq_private_buf.head[0]; + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); + if (dst->iov_len < len) + goto out_unlock; + memcpy(dst->iov_base, p, len); + + credits = be32_to_cpu(rmsgp->rm_credit); + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > r_xprt->rx_buf.rb_bc_max_requests) + credits = r_xprt->rx_buf.rb_bc_max_requests; + + cwnd = xprt->cwnd; + xprt->cwnd = credits << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(req->rq_task); + + ret = 0; + xprt_complete_rqst(req->rq_task, rcvbuf->len); + rcvbuf->len = 0; + +out_unlock: + spin_unlock_bh(&xprt->transport_lock); +out: + return ret; + +out_shortreply: + dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n", + xprt, src->iov_len); + goto out; + +out_notfound: + dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n", + xprt, be32_to_cpu(xid)); + + goto out_unlock; +} + +/* Send a backwards direction RPC call. + * + * Caller holds the connection's mutex and has already marshaled + * the RPC/RDMA request. + * + * This is similar to svc_rdma_reply, but takes an rpc_rqst + * instead, does not support chunks, and avoids blocking memory + * allocation. + * + * XXX: There is still an opportunity to block in svc_rdma_send() + * if there are no SQ entries to post the Send. This may occur if + * the adapter has a small maximum SQ depth. + */ +static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, + struct rpc_rqst *rqst) +{ + struct xdr_buf *sndbuf = &rqst->rq_snd_buf; + struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_req_map *vec; + struct ib_send_wr send_wr; + int ret; + + vec = svc_rdma_get_req_map(rdma); + ret = svc_rdma_map_xdr(rdma, sndbuf, vec); + if (ret) + goto out_err; + + /* Post a recv buffer to handle the reply for this request. */ + ret = svc_rdma_post_recv(rdma, GFP_NOIO); + if (ret) { + pr_err("svcrdma: Failed to post bc receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", rdma); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + ret = -ENOTCONN; + goto out_err; + } + + ctxt = svc_rdma_get_context(rdma); + ctxt->pages[0] = virt_to_page(rqst->rq_buffer); + ctxt->count = 1; + + ctxt->wr_op = IB_WR_SEND; + ctxt->direction = DMA_TO_DEVICE; + ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; + ctxt->sge[0].length = sndbuf->len; + ctxt->sge[0].addr = + ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0, + sndbuf->len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) { + ret = -EIO; + goto out_unmap; + } + atomic_inc(&rdma->sc_dma_used); + + memset(&send_wr, 0, sizeof(send_wr)); + send_wr.wr_id = (unsigned long)ctxt; + send_wr.sg_list = ctxt->sge; + send_wr.num_sge = 1; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = svc_rdma_send(rdma, &send_wr); + if (ret) { + ret = -EIO; + goto out_unmap; + } + +out_err: + svc_rdma_put_req_map(rdma, vec); + dprintk("svcrdma: %s returns %d\n", __func__, ret); + return ret; + +out_unmap: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + goto out_err; +} + +/* Server-side transport endpoint wants a whole page for its send + * buffer. The client RPC code constructs the RPC header in this + * buffer before it invokes ->send_request. + * + * Returns NULL if there was a temporary allocation failure. + */ +static void * +xprt_rdma_bc_allocate(struct rpc_task *task, size_t size) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + struct page *page; + + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + + /* Prevent an infinite loop: try to make this case work */ + if (size > PAGE_SIZE) + WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", + size); + + page = alloc_page(RPCRDMA_DEF_GFP); + if (!page) + return NULL; + + return page_address(page); +} + +static void +xprt_rdma_bc_free(void *buffer) +{ + /* No-op: ctxt and page have already been freed. */ +} + +static int +rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; + int rc; + + /* Space in the send buffer for an RPC/RDMA header is reserved + * via xprt->tsh_size. + */ + headerp->rm_xid = rqst->rq_xid; + headerp->rm_vers = rpcrdma_version; + headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + headerp->rm_type = rdma_msg; + headerp->rm_body.rm_chunks[0] = xdr_zero; + headerp->rm_body.rm_chunks[1] = xdr_zero; + headerp->rm_body.rm_chunks[2] = xdr_zero; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); +#endif + + rc = svc_rdma_bc_sendto(rdma, rqst); + if (rc) + goto drop_connection; + return rc; + +drop_connection: + dprintk("svcrdma: failed to send bc call\n"); + xprt_disconnect_done(xprt); + return -ENOTCONN; +} + +/* Send an RPC call on the passive end of a transport + * connection. + */ +static int +xprt_rdma_bc_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + int ret; + + dprintk("svcrdma: sending bc call with xid: %08x\n", + be32_to_cpu(rqst->rq_xid)); + + if (!mutex_trylock(&sxprt->xpt_mutex)) { + rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL); + if (!mutex_trylock(&sxprt->xpt_mutex)) + return -EAGAIN; + rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task); + } + + ret = -ENOTCONN; + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) + ret = rpcrdma_bc_send_request(rdma, rqst); + + mutex_unlock(&sxprt->xpt_mutex); + + if (ret < 0) + return ret; + return 0; +} + +static void +xprt_rdma_bc_close(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); +} + +static void +xprt_rdma_bc_put(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + + xprt_free(xprt); + module_put(THIS_MODULE); +} + +static struct rpc_xprt_ops xprt_rdma_bc_procs = { + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, + .alloc_slot = xprt_alloc_slot, + .release_request = xprt_release_rqst_cong, + .buf_alloc = xprt_rdma_bc_allocate, + .buf_free = xprt_rdma_bc_free, + .send_request = xprt_rdma_bc_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = xprt_rdma_bc_close, + .destroy = xprt_rdma_bc_put, + .print_stats = xprt_rdma_print_stats +}; + +static const struct rpc_timeout xprt_rdma_bc_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, +}; + +/* It shouldn't matter if the number of backchannel session slots + * doesn't match the number of RPC/RDMA credits. That just means + * one or the other will have extra slots that aren't used. + */ +static struct rpc_xprt * +xprt_setup_rdma_bc(struct xprt_create *args) +{ + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(*new_xprt), + RPCRDMA_MAX_BC_REQUESTS, + RPCRDMA_MAX_BC_REQUESTS); + if (!xprt) { + dprintk("RPC: %s: couldn't allocate rpc_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + xprt->timeout = &xprt_rdma_bc_timeout; + xprt_set_bound(xprt); + xprt_set_connected(xprt); + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + + xprt->prot = XPRT_TRANSPORT_BC_RDMA; + xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32); + xprt->ops = &xprt_rdma_bc_procs; + + memcpy(&xprt->addr, args->dstaddr, args->addrlen); + xprt->addrlen = args->addrlen; + xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr); + xprt->resvport = 0; + + xprt->max_payload = xprt_rdma_max_inline_read; + + new_xprt = rpcx_to_rdmax(xprt); + new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs; + + xprt_get(xprt); + args->bc_xprt->xpt_bc_xprt = xprt; + xprt->bc_xprt = args->bc_xprt; + + if (!try_module_get(THIS_MODULE)) + goto out_fail; + + /* Final put for backchannel xprt is in __svc_rdma_free */ + xprt_get(xprt); + return xprt; + +out_fail: + xprt_rdma_free_addresses(xprt); + args->bc_xprt->xpt_bc_xprt = NULL; + xprt_put(xprt); + xprt_free(xprt); + return ERR_PTR(-EINVAL); +} + +struct xprt_class xprt_rdma_bc = { + .list = LIST_HEAD_INIT(xprt_rdma_bc.list), + .name = "rdma backchannel", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_BC_RDMA, + .setup = xprt_setup_rdma_bc, +}; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index ff4f01e..c8b8a8b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -144,6 +144,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; head->arg.page_len += len; + head->arg.len += len; if (!pg_off) head->count++; @@ -160,8 +161,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, goto err; atomic_inc(&xprt->sc_dma_used); - /* The lkey here is either a local dma lkey or a dma_mr lkey */ - ctxt->sge[pno].lkey = xprt->sc_dma_lkey; + ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[pno].length = len; ctxt->count++; @@ -567,6 +567,38 @@ static int rdma_read_complete(struct svc_rqst *rqstp, return ret; } +/* By convention, backchannel calls arrive via rdma_msg type + * messages, and never populate the chunk lists. This makes + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. + */ +static bool +svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp) +{ + __be32 *p = (__be32 *)rmsgp; + + if (!xprt->xpt_bc_xprt) + return false; + + if (rmsgp->rm_type != rdma_msg) + return false; + if (rmsgp->rm_body.rm_chunks[0] != xdr_zero) + return false; + if (rmsgp->rm_body.rm_chunks[1] != xdr_zero) + return false; + if (rmsgp->rm_body.rm_chunks[2] != xdr_zero) + return false; + + /* sanity */ + if (p[7] != rmsgp->rm_xid) + return false; + /* call direction */ + if (p[8] == cpu_to_be32(RPC_CALL)) + return false; + + return true; +} + /* * Set up the rqstp thread context to point to the RQ buffer. If * necessary, pull additional data from the client with an RDMA_READ @@ -632,6 +664,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto close_out; } + if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp, + &rqstp->rq_arg); + svc_rdma_put_context(ctxt, 0); + if (ret) + goto repost; + return ret; + } + /* Read read-list data. */ ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); if (ret > 0) { @@ -668,4 +709,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) set_bit(XPT_CLOSE, &xprt->xpt_flags); defer: return 0; + +repost: + ret = svc_rdma_post_recv(rdma_xprt, GFP_KERNEL); + if (ret) { + pr_err("svcrdma: could not post a receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", rdma_xprt); + set_bit(XPT_CLOSE, &rdma_xprt->sc_xprt.xpt_flags); + ret = -ENOTCONN; + } + return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 969a1ab..df57f3c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -50,9 +50,9 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -static int map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) +int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) { int sge_no; u32 sge_bytes; @@ -62,7 +62,7 @@ static int map_xdr(struct svcxprt_rdma *xprt, if (xdr->len != (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) { - pr_err("svcrdma: map_xdr: XDR buffer length error\n"); + pr_err("svcrdma: %s: XDR buffer length error\n", __func__); return -EIO; } @@ -97,9 +97,9 @@ static int map_xdr(struct svcxprt_rdma *xprt, sge_no++; } - dprintk("svcrdma: map_xdr: sge_no %d page_no %d " + dprintk("svcrdma: %s: sge_no %d page_no %d " "page_base %u page_len %u head_len %zu tail_len %zu\n", - sge_no, page_no, xdr->page_base, xdr->page_len, + __func__, sge_no, page_no, xdr->page_base, xdr->page_len, xdr->head[0].iov_len, xdr->tail[0].iov_len); vec->count = sge_no; @@ -265,7 +265,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge[sge_no].addr)) goto err; atomic_inc(&xprt->sc_dma_used); - sge[sge_no].lkey = xprt->sc_dma_lkey; + sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count++; sge_off = 0; sge_no++; @@ -465,7 +465,7 @@ static int send_reply(struct svcxprt_rdma *rdma, int ret; /* Post a recv buffer to handle another request. */ - ret = svc_rdma_post_recv(rdma); + ret = svc_rdma_post_recv(rdma, GFP_KERNEL); if (ret) { printk(KERN_INFO "svcrdma: could not post a receive buffer, err=%d." @@ -480,7 +480,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->count = 1; /* Prepare the SGE for the RPCRDMA Header */ - ctxt->sge[0].lkey = rdma->sc_dma_lkey; + ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, @@ -504,7 +504,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[sge_no].addr)) goto err; atomic_inc(&rdma->sc_dma_used); - ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } if (byte_count != 0) { @@ -591,14 +591,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) /* Build an req vec for the XDR */ ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; - vec = svc_rdma_get_req_map(); - ret = map_xdr(rdma, &rqstp->rq_res, vec); + vec = svc_rdma_get_req_map(rdma); + ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec); if (ret) goto err0; inline_bytes = rqstp->rq_res.len; /* Create the RDMA response header */ - res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + ret = -ENOMEM; + res_page = alloc_page(GFP_KERNEL); + if (!res_page) + goto err0; rdma_resp = page_address(res_page); reply_ary = svc_rdma_get_reply_array(rdma_argp); if (reply_ary) @@ -630,14 +633,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, inline_bytes); - svc_rdma_put_req_map(vec); + svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: send_reply returns %d\n", ret); return ret; err1: put_page(res_page); err0: - svc_rdma_put_req_map(vec); + svc_rdma_put_req_map(rdma, vec); svc_rdma_put_context(ctxt, 0); return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b348b4a..5763825 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -153,18 +153,76 @@ static void svc_rdma_bc_free(struct svc_xprt *xprt) } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, + gfp_t flags) { struct svc_rdma_op_ctxt *ctxt; - ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, - GFP_KERNEL | __GFP_NOFAIL); - ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->dto_q); + ctxt = kmalloc(sizeof(*ctxt), flags); + if (ctxt) { + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->free); + INIT_LIST_HEAD(&ctxt->dto_q); + } + return ctxt; +} + +static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) +{ + unsigned int i; + + /* Each RPC/RDMA credit can consume a number of send + * and receive WQEs. One ctxt is allocated for each. + */ + i = xprt->sc_sq_depth + xprt->sc_rq_depth; + + while (i--) { + struct svc_rdma_op_ctxt *ctxt; + + ctxt = alloc_ctxt(xprt, GFP_KERNEL); + if (!ctxt) { + dprintk("svcrdma: No memory for RDMA ctxt\n"); + return false; + } + list_add(&ctxt->free, &xprt->sc_ctxts); + } + return true; +} + +struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt = NULL; + + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used++; + if (list_empty(&xprt->sc_ctxts)) + goto out_empty; + + ctxt = list_first_entry(&xprt->sc_ctxts, + struct svc_rdma_op_ctxt, free); + list_del_init(&ctxt->free); + spin_unlock_bh(&xprt->sc_ctxt_lock); + +out: ctxt->count = 0; ctxt->frmr = NULL; - atomic_inc(&xprt->sc_ctxt_used); return ctxt; + +out_empty: + /* Either pre-allocation missed the mark, or send + * queue accounting is broken. + */ + spin_unlock_bh(&xprt->sc_ctxt_lock); + + ctxt = alloc_ctxt(xprt, GFP_NOIO); + if (ctxt) + goto out; + + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used--; + spin_unlock_bh(&xprt->sc_ctxt_lock); + WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); + return NULL; } void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) @@ -174,11 +232,11 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { /* * Unmap the DMA addr in the SGE if the lkey matches - * the sc_dma_lkey, otherwise, ignore it since it is + * the local_dma_lkey, otherwise, ignore it since it is * an FRMR lkey and will be unmapped later when the * last WR that uses it completes. */ - if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { + if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) { atomic_dec(&xprt->sc_dma_used); ib_dma_unmap_page(xprt->sc_cm_id->device, ctxt->sge[i].addr, @@ -190,35 +248,108 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) { - struct svcxprt_rdma *xprt; + struct svcxprt_rdma *xprt = ctxt->xprt; int i; - xprt = ctxt->xprt; if (free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - kmem_cache_free(svc_rdma_ctxt_cachep, ctxt); - atomic_dec(&xprt->sc_ctxt_used); + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used--; + list_add(&ctxt->free, &xprt->sc_ctxts); + spin_unlock_bh(&xprt->sc_ctxt_lock); } -/* - * Temporary NFS req mappings are shared across all transport - * instances. These are short lived and should be bounded by the number - * of concurrent server threads * depth of the SQ. - */ -struct svc_rdma_req_map *svc_rdma_get_req_map(void) +static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) +{ + while (!list_empty(&xprt->sc_ctxts)) { + struct svc_rdma_op_ctxt *ctxt; + + ctxt = list_first_entry(&xprt->sc_ctxts, + struct svc_rdma_op_ctxt, free); + list_del(&ctxt->free); + kfree(ctxt); + } +} + +static struct svc_rdma_req_map *alloc_req_map(gfp_t flags) { struct svc_rdma_req_map *map; - map = kmem_cache_alloc(svc_rdma_map_cachep, - GFP_KERNEL | __GFP_NOFAIL); + + map = kmalloc(sizeof(*map), flags); + if (map) + INIT_LIST_HEAD(&map->free); + return map; +} + +static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt) +{ + unsigned int i; + + /* One for each receive buffer on this connection. */ + i = xprt->sc_max_requests; + + while (i--) { + struct svc_rdma_req_map *map; + + map = alloc_req_map(GFP_KERNEL); + if (!map) { + dprintk("svcrdma: No memory for request map\n"); + return false; + } + list_add(&map->free, &xprt->sc_maps); + } + return true; +} + +struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_req_map *map = NULL; + + spin_lock(&xprt->sc_map_lock); + if (list_empty(&xprt->sc_maps)) + goto out_empty; + + map = list_first_entry(&xprt->sc_maps, + struct svc_rdma_req_map, free); + list_del_init(&map->free); + spin_unlock(&xprt->sc_map_lock); + +out: map->count = 0; return map; + +out_empty: + spin_unlock(&xprt->sc_map_lock); + + /* Pre-allocation amount was incorrect */ + map = alloc_req_map(GFP_NOIO); + if (map) + goto out; + + WARN_ONCE(1, "svcrdma: empty request map list?\n"); + return NULL; +} + +void svc_rdma_put_req_map(struct svcxprt_rdma *xprt, + struct svc_rdma_req_map *map) +{ + spin_lock(&xprt->sc_map_lock); + list_add(&map->free, &xprt->sc_maps); + spin_unlock(&xprt->sc_map_lock); } -void svc_rdma_put_req_map(struct svc_rdma_req_map *map) +static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt) { - kmem_cache_free(svc_rdma_map_cachep, map); + while (!list_empty(&xprt->sc_maps)) { + struct svc_rdma_req_map *map; + + map = list_first_entry(&xprt->sc_maps, + struct svc_rdma_req_map, free); + list_del(&map->free); + kfree(map); + } } /* ib_cq event handler */ @@ -386,46 +517,44 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) static void process_context(struct svcxprt_rdma *xprt, struct svc_rdma_op_ctxt *ctxt) { + struct svc_rdma_op_ctxt *read_hdr; + int free_pages = 0; + svc_rdma_unmap_dma(ctxt); switch (ctxt->wr_op) { case IB_WR_SEND: - if (ctxt->frmr) - pr_err("svcrdma: SEND: ctxt->frmr != NULL\n"); - svc_rdma_put_context(ctxt, 1); + free_pages = 1; break; case IB_WR_RDMA_WRITE: - if (ctxt->frmr) - pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n"); - svc_rdma_put_context(ctxt, 0); break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: svc_rdma_put_frmr(xprt, ctxt->frmr); - if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { - struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; - if (read_hdr) { - spin_lock_bh(&xprt->sc_rq_dto_lock); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - list_add_tail(&read_hdr->dto_q, - &xprt->sc_read_complete_q); - spin_unlock_bh(&xprt->sc_rq_dto_lock); - } else { - pr_err("svcrdma: ctxt->read_hdr == NULL\n"); - } - svc_xprt_enqueue(&xprt->sc_xprt); - } + + if (!test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) + break; + + read_hdr = ctxt->read_hdr; svc_rdma_put_context(ctxt, 0); - break; + + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + return; default: - printk(KERN_ERR "svcrdma: unexpected completion type, " - "opcode=%d\n", - ctxt->wr_op); + dprintk("svcrdma: unexpected completion opcode=%d\n", + ctxt->wr_op); break; } + + svc_rdma_put_context(ctxt, free_pages); } /* @@ -523,19 +652,15 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); + INIT_LIST_HEAD(&cma_xprt->sc_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_maps); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); spin_lock_init(&cma_xprt->sc_frmr_q_lock); - - cma_xprt->sc_ord = svcrdma_ord; - - cma_xprt->sc_max_req_size = svcrdma_max_req_size; - cma_xprt->sc_max_requests = svcrdma_max_requests; - cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; - atomic_set(&cma_xprt->sc_sq_count, 0); - atomic_set(&cma_xprt->sc_ctxt_used, 0); + spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_map_lock); if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); @@ -543,7 +668,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return cma_xprt; } -int svc_rdma_post_recv(struct svcxprt_rdma *xprt) +int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) { struct ib_recv_wr recv_wr, *bad_recv_wr; struct svc_rdma_op_ctxt *ctxt; @@ -561,7 +686,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err_put_ctxt; } - page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + page = alloc_page(flags); + if (!page) + goto err_put_ctxt; ctxt->pages[sge_no] = page; pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, @@ -571,7 +698,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) atomic_inc(&xprt->sc_dma_used); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; - ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; + ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count = sge_no + 1; buflen += PAGE_SIZE; } @@ -886,11 +1013,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_cq_init_attr cq_attr = {}; struct ib_qp_init_attr qp_attr; - struct ib_device_attr devattr; - int uninitialized_var(dma_mr_acc); - int need_dma_mr = 0; - int ret; - int i; + struct ib_device *dev; + unsigned int i; + int ret = 0; listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); clear_bit(XPT_CONN, &xprt->xpt_flags); @@ -910,37 +1035,42 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", newxprt, newxprt->sc_cm_id); - ret = ib_query_device(newxprt->sc_cm_id->device, &devattr); - if (ret) { - dprintk("svcrdma: could not query device attributes on " - "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret); - goto errout; - } + dev = newxprt->sc_cm_id->device; /* Qualify the transport resource defaults with the * capabilities of this particular device */ - newxprt->sc_max_sge = min((size_t)devattr.max_sge, + newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge, (size_t)RPCSVC_MAXPAGES); - newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd, + newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd, RPCSVC_MAXPAGES); - newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, - (size_t)svcrdma_max_requests); - newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; + newxprt->sc_max_req_size = svcrdma_max_req_size; + newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, + svcrdma_max_requests); + newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, + svcrdma_max_bc_requests); + newxprt->sc_rq_depth = newxprt->sc_max_requests + + newxprt->sc_max_bc_requests; + newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; + + if (!svc_rdma_prealloc_ctxts(newxprt)) + goto errout; + if (!svc_rdma_prealloc_maps(newxprt)) + goto errout; /* * Limit ORD based on client limit, local device limit, and * configured svcrdma limit. */ - newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord); + newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord); newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); - newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); + newxprt->sc_pd = ib_alloc_pd(dev); if (IS_ERR(newxprt->sc_pd)) { dprintk("svcrdma: error creating PD for connect request\n"); goto errout; } cq_attr.cqe = newxprt->sc_sq_depth; - newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, + newxprt->sc_sq_cq = ib_create_cq(dev, sq_comp_handler, cq_event_handler, newxprt, @@ -949,8 +1079,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } - cq_attr.cqe = newxprt->sc_max_requests; - newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, + cq_attr.cqe = newxprt->sc_rq_depth; + newxprt->sc_rq_cq = ib_create_cq(dev, rq_comp_handler, cq_event_handler, newxprt, @@ -964,7 +1094,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; - qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; + qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; @@ -978,7 +1108,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) " cap.max_send_sge = %d\n" " cap.max_recv_sge = %d\n", newxprt->sc_cm_id, newxprt->sc_pd, - newxprt->sc_cm_id->device, newxprt->sc_pd->device, + dev, newxprt->sc_pd->device, qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr, qp_attr.cap.max_send_sge, @@ -1014,9 +1144,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * of an RDMA_READ. IB does not. */ newxprt->sc_reader = rdma_read_chunk_lcl; - if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { newxprt->sc_frmr_pg_list_len = - devattr.max_fast_reg_page_list_len; + dev->attrs.max_fast_reg_page_list_len; newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; newxprt->sc_reader = rdma_read_chunk_frmr; } @@ -1024,44 +1154,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* * Determine if a DMA MR is required and if so, what privs are required */ - if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num) && - !rdma_ib_or_roce(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num)) + if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) && + !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) goto errout; - if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) || - !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { - need_dma_mr = 1; - dma_mr_acc = IB_ACCESS_LOCAL_WRITE; - if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num) && - !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) - dma_mr_acc |= IB_ACCESS_REMOTE_WRITE; - } - - if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num)) + if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num)) newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; - /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ - if (need_dma_mr) { - /* Register all of physical memory */ - newxprt->sc_phys_mr = - ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); - if (IS_ERR(newxprt->sc_phys_mr)) { - dprintk("svcrdma: Failed to create DMA MR ret=%d\n", - ret); - goto errout; - } - newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; - } else - newxprt->sc_dma_lkey = - newxprt->sc_cm_id->device->local_dma_lkey; - /* Post receive buffers */ - for (i = 0; i < newxprt->sc_max_requests; i++) { - ret = svc_rdma_post_recv(newxprt); + for (i = 0; i < newxprt->sc_rq_depth; i++) { + ret = svc_rdma_post_recv(newxprt, GFP_KERNEL); if (ret) { dprintk("svcrdma: failure posting receive buffers\n"); goto errout; @@ -1160,12 +1262,14 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); - dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + struct svc_xprt *xprt = &rdma->sc_xprt; + + dprintk("svcrdma: %s(%p)\n", __func__, rdma); /* We should only be called from kref_put */ - if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0) + if (atomic_read(&xprt->xpt_ref.refcount) != 0) pr_err("svcrdma: sc_xprt still in use? (%d)\n", - atomic_read(&rdma->sc_xprt.xpt_ref.refcount)); + atomic_read(&xprt->xpt_ref.refcount)); /* * Destroy queued, but not processed read completions. Note @@ -1193,15 +1297,22 @@ static void __svc_rdma_free(struct work_struct *work) } /* Warn if we leaked a resource or under-referenced */ - if (atomic_read(&rdma->sc_ctxt_used) != 0) + if (rdma->sc_ctxt_used != 0) pr_err("svcrdma: ctxt still in use? (%d)\n", - atomic_read(&rdma->sc_ctxt_used)); + rdma->sc_ctxt_used); if (atomic_read(&rdma->sc_dma_used) != 0) pr_err("svcrdma: dma still in use? (%d)\n", atomic_read(&rdma->sc_dma_used)); - /* De-allocate fastreg mr */ + /* Final put of backchannel client transport */ + if (xprt->xpt_bc_xprt) { + xprt_put(xprt->xpt_bc_xprt); + xprt->xpt_bc_xprt = NULL; + } + rdma_dealloc_frmr_q(rdma); + svc_rdma_destroy_ctxts(rdma); + svc_rdma_destroy_maps(rdma); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -1213,9 +1324,6 @@ static void __svc_rdma_free(struct work_struct *work) if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) ib_destroy_cq(rdma->sc_rq_cq); - if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr)) - ib_dereg_mr(rdma->sc_phys_mr); - if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) ib_dealloc_pd(rdma->sc_pd); @@ -1321,7 +1429,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, int length; int ret; - p = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + p = alloc_page(GFP_KERNEL); + if (!p) + return; va = page_address(p); /* XDR encode error */ @@ -1341,7 +1451,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, return; } atomic_inc(&xprt->sc_dma_used); - ctxt->sge[0].lkey = xprt->sc_dma_lkey; + ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[0].length = length; /* Prepare SEND WR */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8c545f7..b1b009f 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -63,7 +63,7 @@ */ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; -static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; +unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; @@ -143,12 +143,7 @@ static struct ctl_table sunrpc_table[] = { #endif -#define RPCRDMA_BIND_TO (60U * HZ) -#define RPCRDMA_INIT_REEST_TO (5U * HZ) -#define RPCRDMA_MAX_REEST_TO (30U * HZ) -#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) - -static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ +static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */ static void xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) @@ -174,7 +169,7 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; } -static void +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) { char buf[128]; @@ -203,7 +198,7 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; } -static void +void xprt_rdma_free_addresses(struct rpc_xprt *xprt) { unsigned int i; @@ -499,7 +494,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) if (req == NULL) return NULL; - flags = GFP_NOIO | __GFP_NOWARN; + flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; @@ -576,6 +571,9 @@ xprt_rdma_free(void *buffer) rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); req = rb->rg_owner; + if (req->rl_backchannel) + return; + r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); @@ -639,7 +637,7 @@ drop_connection: return -ENOTCONN; /* implies disconnect */ } -static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); long idle_time = 0; @@ -740,6 +738,11 @@ void xprt_rdma_cleanup(void) rpcrdma_destroy_wq(); frwr_destroy_recovery_wq(); + + rc = xprt_unregister_transport(&xprt_rdma_bc); + if (rc) + dprintk("RPC: %s: xprt_unregister(bc) returned %i\n", + __func__, rc); } int xprt_rdma_init(void) @@ -763,6 +766,14 @@ int xprt_rdma_init(void) return rc; } + rc = xprt_register_transport(&xprt_rdma_bc); + if (rc) { + xprt_unregister_transport(&xprt_rdma); + rpcrdma_destroy_wq(); + frwr_destroy_recovery_wq(); + return rc; + } + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); dprintk("Defaults:\n"); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index eadd1655..878f1bf 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -462,7 +462,6 @@ int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { struct rpcrdma_ia *ia = &xprt->rx_ia; - struct ib_device_attr *devattr = &ia->ri_devattr; int rc; ia->ri_dma_mr = NULL; @@ -482,16 +481,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) goto out2; } - rc = ib_query_device(ia->ri_device, devattr); - if (rc) { - dprintk("RPC: %s: ib_query_device failed %d\n", - __func__, rc); - goto out3; - } - if (memreg == RPCRDMA_FRMR) { - if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) || - (devattr->max_fast_reg_page_list_len == 0)) { + if (!(ia->ri_device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS) || + (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; @@ -566,24 +559,23 @@ int rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { - struct ib_device_attr *devattr = &ia->ri_devattr; struct ib_cq *sendcq, *recvcq; struct ib_cq_init_attr cq_attr = {}; unsigned int max_qp_wr; int rc, err; - if (devattr->max_sge < RPCRDMA_MAX_IOVS) { + if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) { dprintk("RPC: %s: insufficient sge's available\n", __func__); return -ENOMEM; } - if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) { + if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) { dprintk("RPC: %s: insufficient wqe's available\n", __func__); return -ENOMEM; } - max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS; + max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS; /* check provider's send/recv wr limits */ if (cdata->max_requests > max_qp_wr) @@ -616,10 +608,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* set trigger for requesting send completion */ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; - if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS) - ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS; - else if (ep->rep_cqinit <= 2) - ep->rep_cqinit = 0; + if (ep->rep_cqinit <= 2) + ep->rep_cqinit = 0; /* always signal? */ INIT_CQCOUNT(ep); init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); @@ -670,11 +660,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; - if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ ep->rep_remote_cma.responder_resources = 32; else ep->rep_remote_cma.responder_resources = - devattr->max_qp_rd_atom; + ia->ri_device->attrs.max_qp_rd_atom; ep->rep_remote_cma.retry_count = 7; ep->rep_remote_cma.flow_control = 0; @@ -852,10 +842,11 @@ retry: if (extras) { rc = rpcrdma_ep_post_extra_recv(r_xprt, extras); - if (rc) + if (rc) { pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n", __func__, rc); rc = 0; + } } } @@ -1337,15 +1328,14 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_rep *rep; - unsigned long flags; int rc; while (count--) { - spin_lock_irqsave(&buffers->rb_lock, flags); + spin_lock(&buffers->rb_lock); if (list_empty(&buffers->rb_recv_bufs)) goto out_reqbuf; rep = rpcrdma_buffer_get_rep_locked(buffers); - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_unlock(&buffers->rb_lock); rc = rpcrdma_ep_post_recv(ia, ep, rep); if (rc) @@ -1355,7 +1345,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) return 0; out_reqbuf: - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_unlock(&buffers->rb_lock); pr_warn("%s: no extra receive buffers\n", __func__); return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index ac7f8d4..38fe11b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -55,6 +55,11 @@ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ +#define RPCRDMA_BIND_TO (60U * HZ) +#define RPCRDMA_INIT_REEST_TO (5U * HZ) +#define RPCRDMA_MAX_REEST_TO (30U * HZ) +#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) + /* * Interface Adapter -- one per transport instance */ @@ -68,7 +73,6 @@ struct rpcrdma_ia { struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; - struct ib_device_attr ri_devattr; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; }; @@ -88,12 +92,6 @@ struct rpcrdma_ep { struct delayed_work rep_connect_worker; }; -/* - * Force a signaled SEND Work Request every so often, - * in case the provider needs to do some housekeeping. - */ -#define RPCRDMA_MAX_UNSIGNALED_SENDS (32) - #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) @@ -148,6 +146,8 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } +#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) + /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -207,6 +207,12 @@ struct rpcrdma_frmr { enum rpcrdma_frmr_state fr_state; struct work_struct fr_work; struct rpcrdma_xprt *fr_xprt; + bool fr_waiter; + struct completion fr_linv_done;; + union { + struct ib_reg_wr fr_regwr; + struct ib_send_wr fr_invwr; + }; }; struct rpcrdma_fmr { @@ -309,6 +315,8 @@ struct rpcrdma_buffer { u32 rb_bc_srv_max_requests; spinlock_t rb_reqslock; /* protect rb_allreqs */ struct list_head rb_allreqs; + + u32 rb_bc_max_requests; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -364,6 +372,8 @@ struct rpcrdma_xprt; struct rpcrdma_memreg_ops { int (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); + void (*ro_unmap_sync)(struct rpcrdma_xprt *, + struct rpcrdma_req *); int (*ro_unmap)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *); int (*ro_open)(struct rpcrdma_ia *, @@ -514,6 +524,10 @@ int rpcrdma_marshal_req(struct rpc_rqst *); /* RPC/RDMA module init - xprtrdma/transport.c */ +extern unsigned int xprt_rdma_max_inline_read; +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); +void xprt_rdma_free_addresses(struct rpc_xprt *xprt); +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); @@ -529,11 +543,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *); void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* Temporary NFS request map cache. Created in svc_rdma.c */ -extern struct kmem_cache *svc_rdma_map_cachep; -/* WR context cache. Created in svc_rdma.c */ -extern struct kmem_cache *svc_rdma_ctxt_cachep; -/* Workqueue created in svc_rdma.c */ -extern struct workqueue_struct *svc_rdma_wq; +extern struct xprt_class xprt_rdma_bc; #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2ffaf6a..fde2138 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -398,7 +398,6 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, if (unlikely(!sock)) return -ENOTSOCK; - clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags); if (base != 0) { addr = NULL; addrlen = 0; @@ -442,7 +441,6 @@ static void xs_nospace_callback(struct rpc_task *task) struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); transport->inet->sk_write_pending--; - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } /** @@ -467,20 +465,11 @@ static int xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { - if (test_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags)) { - /* - * Notify TCP that we're limited by the application - * window size - */ - set_bit(SOCK_NOSPACE, &transport->sock->flags); - sk->sk_write_pending++; - /* ...and wait for more buffer space */ - xprt_wait_for_buffer_space(task, xs_nospace_callback); - } - } else { - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); + /* wait for more buffer space */ + sk->sk_write_pending++; + xprt_wait_for_buffer_space(task, xs_nospace_callback); + } else ret = -ENOTCONN; - } spin_unlock_bh(&xprt->transport_lock); @@ -616,9 +605,6 @@ process_status: case -EAGAIN: status = xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ENETUNREACH: case -ENOBUFS: case -EPIPE: @@ -626,7 +612,10 @@ process_status: case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } return status; @@ -706,16 +695,16 @@ static int xs_tcp_send_request(struct rpc_task *task) case -EAGAIN: status = xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ECONNRESET: case -ECONNREFUSED: case -ENOTCONN: case -EADDRINUSE: case -ENOBUFS: case -EPIPE: - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } return status; @@ -1609,19 +1598,23 @@ static void xs_tcp_state_change(struct sock *sk) static void xs_write_space(struct sock *sk) { - struct socket *sock; + struct socket_wq *wq; struct rpc_xprt *xprt; - if (unlikely(!(sock = sk->sk_socket))) + if (!sk->sk_socket) return; - clear_bit(SOCK_NOSPACE, &sock->flags); + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (unlikely(!(xprt = xprt_from_sock(sk)))) return; - if (test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags) == 0) - return; + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) + goto out; xprt_write_space(xprt); +out: + rcu_read_unlock(); } /** @@ -1907,18 +1900,6 @@ static inline void xs_reclassify_socket(int family, struct socket *sock) } } #else -static inline void xs_reclassify_socketu(struct socket *sock) -{ -} - -static inline void xs_reclassify_socket4(struct socket *sock) -{ -} - -static inline void xs_reclassify_socket6(struct socket *sock) -{ -} - static inline void xs_reclassify_socket(int family, struct socket *sock) { } @@ -2008,7 +1989,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) "transport socket (%d).\n", -status); goto out; } - xs_reclassify_socketu(sock); + xs_reclassify_socket(AF_LOCAL, sock); dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index ebc661d..8b5833c 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -20,6 +20,7 @@ #include <linux/list.h> #include <linux/workqueue.h> #include <linux/if_vlan.h> +#include <linux/rtnetlink.h> #include <net/ip_fib.h> #include <net/switchdev.h> @@ -567,7 +568,6 @@ int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj, } EXPORT_SYMBOL_GPL(switchdev_port_obj_dump); -static DEFINE_MUTEX(switchdev_mutex); static RAW_NOTIFIER_HEAD(switchdev_notif_chain); /** @@ -582,9 +582,9 @@ int register_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&switchdev_mutex); + rtnl_lock(); err = raw_notifier_chain_register(&switchdev_notif_chain, nb); - mutex_unlock(&switchdev_mutex); + rtnl_unlock(); return err; } EXPORT_SYMBOL_GPL(register_switchdev_notifier); @@ -600,9 +600,9 @@ int unregister_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&switchdev_mutex); + rtnl_lock(); err = raw_notifier_chain_unregister(&switchdev_notif_chain, nb); - mutex_unlock(&switchdev_mutex); + rtnl_unlock(); return err; } EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); @@ -616,16 +616,17 @@ EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); * Call all network notifier blocks. This should be called by driver * when it needs to propagate hardware event. * Return values are same as for atomic_notifier_call_chain(). + * rtnl_lock must be held. */ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info) { int err; + ASSERT_RTNL(); + info->dev = dev; - mutex_lock(&switchdev_mutex); err = raw_notifier_call_chain(&switchdev_notif_chain, val, info); - mutex_unlock(&switchdev_mutex); return err; } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); @@ -1092,8 +1093,11 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, .cb = cb, .idx = idx, }; + int err; - switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb); + err = switchdev_port_obj_dump(dev, &dump.fdb.obj, + switchdev_port_fdb_dump_cb); + cb->args[1] = err; return dump.idx; } EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); diff --git a/net/tipc/link.c b/net/tipc/link.c index 0c2944f..347cdc9 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1973,8 +1973,10 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_LINK_GET); - if (!hdr) + if (!hdr) { + tipc_bcast_unlock(net); return -EMSGSIZE; + } attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK); if (!attrs) diff --git a/net/tipc/node.c b/net/tipc/node.c index fa97d96..9d7a16f 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -346,12 +346,6 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) skb_queue_head_init(&n->bc_entry.inputq2); for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); - hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); - list_for_each_entry_rcu(temp_node, &tn->node_list, list) { - if (n->addr < temp_node->addr) - break; - } - list_add_tail_rcu(&n->list, &temp_node->list); n->state = SELF_DOWN_PEER_LEAVING; n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; @@ -372,6 +366,12 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) tipc_node_get(n); setup_timer(&n->timer, tipc_node_timeout, (unsigned long)n); n->keepalive_intv = U32_MAX; + hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + if (n->addr < temp_node->addr) + break; + } + list_add_tail_rcu(&n->list, &temp_node->list); exit: spin_unlock_bh(&tn->node_list_lock); return n; diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 350cca3..69ee2ee 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -289,15 +289,14 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, struct sockaddr_tipc *addr, void *usr_data, void *buf, size_t len) { - struct tipc_subscriber *subscriber = usr_data; + struct tipc_subscriber *subscrb = usr_data; struct tipc_subscription *sub = NULL; struct tipc_net *tn = net_generic(net, tipc_net_id); - tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscriber, &sub); - if (sub) - tipc_nametbl_subscribe(sub); - else - tipc_conn_terminate(tn->topsrv, subscriber->conid); + if (tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscrb, &sub)) + return tipc_conn_terminate(tn->topsrv, subscrb->conid); + + tipc_nametbl_subscribe(sub); } /* Handle one request to establish a new subscriber */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c5bf5ef..f75f847 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1496,7 +1496,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) UNIXCB(skb).fp = NULL; for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->fp[i]); + unix_notinflight(scm->fp->user, scm->fp->fp[i]); } static void unix_destruct_scm(struct sk_buff *skb) @@ -1561,7 +1561,7 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) return -ENOMEM; for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); + unix_inflight(scm->fp->user, scm->fp->fp[i]); return max_level; } @@ -1781,7 +1781,12 @@ restart_locked: goto out_unlock; } - if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + /* other == sk && unix_peer(other) != sk if + * - unix_peer(sk) == NULL, destination address bound to sk + * - unix_peer(sk) == sk by time of get but disconnected before lock + */ + if (other != sk && + unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); @@ -2277,13 +2282,15 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) size_t size = state->size; unsigned int last_len; - err = -EINVAL; - if (sk->sk_state != TCP_ESTABLISHED) + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { + err = -EINVAL; goto out; + } - err = -EOPNOTSUPP; - if (flags & MSG_OOB) + if (unlikely(flags & MSG_OOB)) { + err = -EOPNOTSUPP; goto out; + } target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); @@ -2305,6 +2312,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) bool drop_skb; struct sk_buff *skb, *last; +redo: unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; @@ -2329,9 +2337,11 @@ again: goto unlock; unix_state_unlock(sk); - err = -EAGAIN; - if (!timeo) + if (!timeo) { + err = -EAGAIN; break; + } + mutex_unlock(&u->readlock); timeo = unix_stream_data_wait(sk, timeo, last, @@ -2339,11 +2349,12 @@ again: if (signal_pending(current)) { err = sock_intr_errno(timeo); + scm_destroy(&scm); goto out; } mutex_lock(&u->readlock); - continue; + goto redo; unlock: unix_state_unlock(sk); break; diff --git a/net/unix/diag.c b/net/unix/diag.c index c512f64..4d96797 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -220,7 +220,7 @@ done: return skb->len; } -static struct sock *unix_lookup_by_ino(int ino) +static struct sock *unix_lookup_by_ino(unsigned int ino) { int i; struct sock *sk; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8fcdc22..6a0d485 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -116,7 +116,7 @@ struct sock *unix_get_socket(struct file *filp) * descriptor if it is for an AF_UNIX socket. */ -void unix_inflight(struct file *fp) +void unix_inflight(struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); @@ -133,11 +133,11 @@ void unix_inflight(struct file *fp) } unix_tot_inflight++; } - fp->f_cred->user->unix_inflight++; + user->unix_inflight++; spin_unlock(&unix_gc_lock); } -void unix_notinflight(struct file *fp) +void unix_notinflight(struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); @@ -152,7 +152,7 @@ void unix_notinflight(struct file *fp) list_del_init(&u->link); unix_tot_inflight--; } - fp->f_cred->user->unix_inflight--; + user->unix_inflight--; spin_unlock(&unix_gc_lock); } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7fd1220..bbe65dc 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1557,8 +1557,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) goto out; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - while (total_written < len) { ssize_t written; @@ -1578,7 +1576,9 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_wait; release_sock(sk); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); timeout = schedule_timeout(timeout); + finish_wait(sk_sleep(sk), &wait); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); @@ -1588,8 +1588,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_wait; } - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } /* These checks occur both as part of and after the loop @@ -1635,7 +1633,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, out_wait: if (total_written > 0) err = total_written; - finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; @@ -1716,7 +1713,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (err < 0) goto out; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (1) { s64 ready = vsock_stream_has_data(vsk); @@ -1727,7 +1723,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, */ err = -ENOMEM; - goto out_wait; + goto out; } else if (ready > 0) { ssize_t read; @@ -1750,7 +1746,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, vsk, target, read, !(flags & MSG_PEEK), &recv_data); if (err < 0) - goto out_wait; + goto out; if (read >= target || flags & MSG_PEEK) break; @@ -1773,7 +1769,9 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, break; release_sock(sk); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); timeout = schedule_timeout(timeout); + finish_wait(sk_sleep(sk), &wait); lock_sock(sk); if (signal_pending(current)) { @@ -1783,9 +1781,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, err = -EAGAIN; break; } - - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } } @@ -1816,8 +1811,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, err = copied; } -out_wait: - finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index cc3676e..ff4a91f 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -167,6 +167,8 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb { struct sk_buff *segs; + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET); segs = skb_gso_segment(skb, 0); kfree_skb(skb); if (IS_ERR(segs)) |