diff options
Diffstat (limited to 'net')
110 files changed, 2487 insertions, 1129 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 98a30a5..59555f0 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -443,7 +443,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { - flgs = vlandev->flags; + flgs = dev_get_flags(vlandev); if (flgs & IFF_UP) continue; diff --git a/net/9p/protocol.c b/net/9p/protocol.c index e9d0f0c..16d2875 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -275,7 +275,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, } break; case 'R':{ - int16_t *nwqid = va_arg(ap, int16_t *); + uint16_t *nwqid = va_arg(ap, uint16_t *); struct p9_qid **wqids = va_arg(ap, struct p9_qid **); @@ -440,7 +440,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, stbuf->n_gid, stbuf->n_muid); } break; case 'V':{ - int32_t count = va_arg(ap, int32_t); + uint32_t count = va_arg(ap, uint32_t); struct iov_iter *from = va_arg(ap, struct iov_iter *); errcode = p9pdu_writef(pdu, proto_version, "d", @@ -471,7 +471,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, } break; case 'R':{ - int16_t nwqid = va_arg(ap, int); + uint16_t nwqid = va_arg(ap, int); struct p9_qid *wqids = va_arg(ap, struct p9_qid *); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 3e3d82d..bced8c0 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -734,6 +734,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts) opts->port = P9_PORT; opts->rfd = ~0; opts->wfd = ~0; + opts->privport = 0; if (!params) return 0; @@ -1013,7 +1014,6 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) { int err; struct p9_fd_opts opts; - struct p9_trans_fd *p; parse_opts(args, &opts); @@ -1026,7 +1026,6 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) if (err < 0) return err; - p = (struct p9_trans_fd *) client->trans; p9_conn_create(client); return 0; diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 14ad43b..3533d2a 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -139,6 +139,7 @@ struct p9_rdma_opts { int sq_depth; int rq_depth; long timeout; + int privport; }; /* @@ -146,7 +147,10 @@ struct p9_rdma_opts { */ enum { /* Options that take integer arguments */ - Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err, + Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, + /* Options that take no argument */ + Opt_privport, + Opt_err, }; static match_table_t tokens = { @@ -154,6 +158,7 @@ static match_table_t tokens = { {Opt_sq_depth, "sq=%u"}, {Opt_rq_depth, "rq=%u"}, {Opt_timeout, "timeout=%u"}, + {Opt_privport, "privport"}, {Opt_err, NULL}, }; @@ -175,6 +180,7 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts) opts->sq_depth = P9_RDMA_SQ_DEPTH; opts->rq_depth = P9_RDMA_RQ_DEPTH; opts->timeout = P9_RDMA_TIMEOUT; + opts->privport = 0; if (!params) return 0; @@ -193,13 +199,13 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts) if (!*p) continue; token = match_token(p, tokens, args); - if (token == Opt_err) - continue; - r = match_int(&args[0], &option); - if (r < 0) { - p9_debug(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); - continue; + if ((token != Opt_err) && (token != Opt_privport)) { + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + continue; + } } switch (token) { case Opt_port: @@ -214,6 +220,9 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts) case Opt_timeout: opts->timeout = option; break; + case Opt_privport: + opts->privport = 1; + break; default: continue; } @@ -607,6 +616,23 @@ static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req) return 0; } +static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma) +{ + struct sockaddr_in cl = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + int port, err = -EINVAL; + + for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) { + cl.sin_port = htons((ushort)port); + err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl); + if (err != -EADDRINUSE) + break; + } + return err; +} + /** * trans_create_rdma - Transport method for creating atransport instance * @client: client instance @@ -642,6 +668,16 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) /* Associate the client with the transport */ client->trans = rdma; + /* Bind to a privileged port if we need to */ + if (opts.privport) { + err = p9_rdma_bind_privport(rdma); + if (err < 0) { + pr_err("%s (%d): problem binding to privport: %d\n", + __func__, task_pid_nr(current), -err); + goto error; + } + } + /* Resolve the server's address */ rdma->addr.sin_family = AF_INET; rdma->addr.sin_addr.s_addr = in_aton(addr); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index e62bcbb..9dd49ca 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -525,7 +525,10 @@ static ssize_t p9_mount_tag_show(struct device *dev, vdev = dev_to_virtio(dev); chan = vdev->priv; - return snprintf(buf, chan->tag_len + 1, "%s", chan->tag); + memcpy(buf, chan->tag, chan->tag_len); + buf[chan->tag_len] = 0; + + return chan->tag_len + 1; } static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 476709b..c4802f3 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1557,7 +1557,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) { BT_DBG("%s %p", hdev->name, hdev); - if (!hci_dev_test_flag(hdev, HCI_UNREGISTER)) { + if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && + test_bit(HCI_UP, &hdev->flags)) { /* Execute vendor specific shutdown routine */ if (hdev->shutdown) hdev->shutdown(hdev); @@ -2853,9 +2854,11 @@ static void le_scan_disable_work_complete(struct hci_dev *hdev, u8 status, * state. If we were running both LE and BR/EDR inquiry * simultaneously, and BR/EDR inquiry is already * finished, stop discovery, otherwise BR/EDR inquiry - * will stop discovery when finished. + * will stop discovery when finished. If we will resolve + * remote device name, do not change discovery state. */ - if (!test_bit(HCI_INQUIRY, &hdev->flags)) + if (!test_bit(HCI_INQUIRY, &hdev->flags) && + hdev->discovery.state != DISCOVERY_RESOLVING) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); } else { diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index a05b9db..9070dfd 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1313,7 +1313,8 @@ int hidp_connection_add(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock) { - u32 valid_flags = 0; + u32 valid_flags = BIT(HIDP_VIRTUAL_CABLE_UNPLUG) | + BIT(HIDP_BOOT_PROTOCOL_MODE); struct hidp_session *session; struct l2cap_conn *conn; struct l2cap_chan *chan; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 4096089..e29ad70 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -170,7 +170,7 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb, struct br_port_msg *bpm; struct nlattr *nest, *nest2; - nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI); + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 4b6722f..a3abe6e 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1072,7 +1072,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, err = br_ip6_multicast_add_group(br, port, &grec->grec_mca, vid); - if (!err) + if (err) break; } diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index ab55e24..60ddfbe 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -37,10 +37,6 @@ #include <net/route.h> #include <net/netfilter/br_netfilter.h> -#if IS_ENABLED(CONFIG_NF_CONNTRACK) -#include <net/netfilter/nf_conntrack.h> -#endif - #include <asm/uaccess.h> #include "br_private.h" #ifdef CONFIG_SYSCTL @@ -350,24 +346,15 @@ free_skb: return 0; } -static bool dnat_took_place(const struct sk_buff *skb) +static bool daddr_was_changed(const struct sk_buff *skb, + const struct nf_bridge_info *nf_bridge) { -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || nf_ct_is_untracked(ct)) - return false; - - return test_bit(IPS_DST_NAT_BIT, &ct->status); -#else - return false; -#endif + return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; } /* This requires some explaining. If DNAT has taken place, * we will need to fix up the destination Ethernet address. + * This is also true when SNAT takes place (for the reply direction). * * There are two cases to consider: * 1. The packet was DNAT'ed to a device in the same bridge @@ -421,7 +408,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) nf_bridge->pkt_otherhost = false; } nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; - if (dnat_took_place(skb)) { + if (daddr_was_changed(skb, nf_bridge)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -632,6 +619,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { + struct nf_bridge_info *nf_bridge; struct net_bridge_port *p; struct net_bridge *br; __u32 len = nf_bridge_encap_header_len(skb); @@ -669,6 +657,9 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, if (!setup_pre_routing(skb)) return NF_DROP; + nf_bridge = nf_bridge_info_get(skb); + nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; + skb->protocol = htons(ETH_P_IP); NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 0e4ddb8..4b5c236 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -394,7 +394,7 @@ errout: * Dump information about all ports, in response to GETLINK */ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, - struct net_device *dev, u32 filter_mask) + struct net_device *dev, u32 filter_mask, int nlflags) { struct net_bridge_port *port = br_port_get_rtnl(dev); @@ -402,7 +402,7 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) return 0; - return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, NLM_F_MULTI, + return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags, filter_mask, dev); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 6ca0251..3362c29 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -828,7 +828,7 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port); int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, - u32 filter_mask); + u32 filter_mask, int nlflags); #ifdef CONFIG_SYSFS /* br_sysfs_if.c */ diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 4fcaa67..7caf7fa 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -97,7 +97,9 @@ static void br_forward_delay_timer_expired(unsigned long arg) netif_carrier_on(br->dev); } br_log_state(p); + rcu_read_lock(); br_ifinfo_notify(RTM_NEWLINK, p); + rcu_read_unlock(); spin_unlock(&br->lock); } diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 91180a7..24c7c96 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1117,6 +1117,8 @@ static int do_replace(struct net *net, const void __user *user, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; tmp.name[sizeof(tmp.name) - 1] = 0; @@ -2159,6 +2161,8 @@ static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; memcpy(repl, &tmp, offsetof(struct ebt_replace, hook_entry)); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index ec56550..79e8f71 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -490,6 +490,43 @@ out: } EXPORT_SYMBOL(ceph_parse_options); +int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) +{ + struct ceph_options *opt = client->options; + size_t pos = m->count; + + if (opt->name) + seq_printf(m, "name=%s,", opt->name); + if (opt->key) + seq_puts(m, "secret=<hidden>,"); + + if (opt->flags & CEPH_OPT_FSID) + seq_printf(m, "fsid=%pU,", &opt->fsid); + if (opt->flags & CEPH_OPT_NOSHARE) + seq_puts(m, "noshare,"); + if (opt->flags & CEPH_OPT_NOCRC) + seq_puts(m, "nocrc,"); + if (opt->flags & CEPH_OPT_NOMSGAUTH) + seq_puts(m, "nocephx_require_signatures,"); + if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) + seq_puts(m, "notcp_nodelay,"); + + if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, "mount_timeout=%d,", opt->mount_timeout); + if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl); + if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, "osdkeepalivetimeout=%d,", + opt->osd_keepalive_timeout); + + /* drop redundant comma */ + if (m->count != pos) + m->count--; + + return 0; +} +EXPORT_SYMBOL(ceph_print_client_options); + u64 ceph_client_id(struct ceph_client *client) { return client->monc.auth->global_id; diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 16bc199d..9d84ce4 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -17,6 +17,7 @@ const char *crush_bucket_alg_name(int alg) case CRUSH_BUCKET_LIST: return "list"; case CRUSH_BUCKET_TREE: return "tree"; case CRUSH_BUCKET_STRAW: return "straw"; + case CRUSH_BUCKET_STRAW2: return "straw2"; default: return "unknown"; } } @@ -40,6 +41,8 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)]; case CRUSH_BUCKET_STRAW: return ((struct crush_bucket_straw *)b)->item_weights[p]; + case CRUSH_BUCKET_STRAW2: + return ((struct crush_bucket_straw2 *)b)->item_weights[p]; } return 0; } @@ -77,6 +80,14 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) kfree(b); } +void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) +{ + kfree(b->item_weights); + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + void crush_destroy_bucket(struct crush_bucket *b) { switch (b->alg) { @@ -92,6 +103,9 @@ void crush_destroy_bucket(struct crush_bucket *b) case CRUSH_BUCKET_STRAW: crush_destroy_bucket_straw((struct crush_bucket_straw *)b); break; + case CRUSH_BUCKET_STRAW2: + crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b); + break; } } diff --git a/net/ceph/crush/crush_ln_table.h b/net/ceph/crush/crush_ln_table.h new file mode 100644 index 0000000..6192c7f --- /dev/null +++ b/net/ceph/crush/crush_ln_table.h @@ -0,0 +1,166 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Intel Corporation All Rights Reserved + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#if defined(__linux__) +#include <linux/types.h> +#elif defined(__FreeBSD__) +#include <sys/types.h> +#endif + +#ifndef CEPH_CRUSH_LN_H +#define CEPH_CRUSH_LN_H + + +// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0) +// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0) + +static int64_t __RH_LH_tbl[128*2+2] = { + 0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll, + 0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all, + 0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll, + 0x0000f4898d5f85bcll, 0x000010eb389fa29fll, 0x0000f2b9d6480f2cll, 0x000013aa2fdd27f1ll, + 0x0000f0f0f0f0f0f1ll, 0x00001663f6fac913ll, 0x0000ef2eb71fc435ll, 0x00001918a16e4633ll, + 0x0000ed7303b5cc0fll, 0x00001bc84240adabll, 0x0000ebbdb2a5c162ll, 0x00001e72ec117fa5ll, + 0x0000ea0ea0ea0ea1ll, 0x00002118b119b4f3ll, 0x0000e865ac7b7604ll, 0x000023b9a32eaa56ll, + 0x0000e6c2b4481cd9ll, 0x00002655d3c4f15cll, 0x0000e525982af70dll, 0x000028ed53f307eell, + 0x0000e38e38e38e39ll, 0x00002b803473f7adll, 0x0000e1fc780e1fc8ll, 0x00002e0e85a9de04ll, + 0x0000e070381c0e08ll, 0x0000309857a05e07ll, 0x0000dee95c4ca038ll, 0x0000331dba0efce1ll, + 0x0000dd67c8a60dd7ll, 0x0000359ebc5b69d9ll, 0x0000dbeb61eed19dll, 0x0000381b6d9bb29bll, + 0x0000da740da740dbll, 0x00003a93dc9864b2ll, 0x0000d901b2036407ll, 0x00003d0817ce9cd4ll, + 0x0000d79435e50d7all, 0x00003f782d7204d0ll, 0x0000d62b80d62b81ll, 0x000041e42b6ec0c0ll, + 0x0000d4c77b03531ell, 0x0000444c1f6b4c2dll, 0x0000d3680d3680d4ll, 0x000046b016ca47c1ll, + 0x0000d20d20d20d21ll, 0x000049101eac381cll, 0x0000d0b69fcbd259ll, 0x00004b6c43f1366all, + 0x0000cf6474a8819fll, 0x00004dc4933a9337ll, 0x0000ce168a772509ll, 0x0000501918ec6c11ll, + 0x0000cccccccccccdll, 0x00005269e12f346ell, 0x0000cb8727c065c4ll, 0x000054b6f7f1325all, + 0x0000ca4587e6b750ll, 0x0000570068e7ef5all, 0x0000c907da4e8712ll, 0x000059463f919deell, + 0x0000c7ce0c7ce0c8ll, 0x00005b8887367433ll, 0x0000c6980c6980c7ll, 0x00005dc74ae9fbecll, + 0x0000c565c87b5f9ell, 0x00006002958c5871ll, 0x0000c4372f855d83ll, 0x0000623a71cb82c8ll, + 0x0000c30c30c30c31ll, 0x0000646eea247c5cll, 0x0000c1e4bbd595f7ll, 0x000066a008e4788cll, + 0x0000c0c0c0c0c0c1ll, 0x000068cdd829fd81ll, 0x0000bfa02fe80bfbll, 0x00006af861e5fc7dll, + 0x0000be82fa0be830ll, 0x00006d1fafdce20all, 0x0000bd6910470767ll, 0x00006f43cba79e40ll, + 0x0000bc52640bc527ll, 0x00007164beb4a56dll, 0x0000bb3ee721a54ell, 0x000073829248e961ll, + 0x0000ba2e8ba2e8bbll, 0x0000759d4f80cba8ll, 0x0000b92143fa36f6ll, 0x000077b4ff5108d9ll, + 0x0000b81702e05c0cll, 0x000079c9aa879d53ll, 0x0000b70fbb5a19bfll, 0x00007bdb59cca388ll, + 0x0000b60b60b60b61ll, 0x00007dea15a32c1bll, 0x0000b509e68a9b95ll, 0x00007ff5e66a0ffell, + 0x0000b40b40b40b41ll, 0x000081fed45cbccbll, 0x0000b30f63528918ll, 0x00008404e793fb81ll, + 0x0000b21642c8590cll, 0x000086082806b1d5ll, 0x0000b11fd3b80b12ll, 0x000088089d8a9e47ll, + 0x0000b02c0b02c0b1ll, 0x00008a064fd50f2all, 0x0000af3addc680b0ll, 0x00008c01467b94bbll, + 0x0000ae4c415c9883ll, 0x00008df988f4ae80ll, 0x0000ad602b580ad7ll, 0x00008fef1e987409ll, + 0x0000ac7691840ac8ll, 0x000091e20ea1393ell, 0x0000ab8f69e2835all, 0x000093d2602c2e5fll, + 0x0000aaaaaaaaaaabll, 0x000095c01a39fbd6ll, 0x0000a9c84a47a080ll, 0x000097ab43af59f9ll, + 0x0000a8e83f5717c1ll, 0x00009993e355a4e5ll, 0x0000a80a80a80a81ll, 0x00009b79ffdb6c8bll, + 0x0000a72f0539782all, 0x00009d5d9fd5010bll, 0x0000a655c4392d7cll, 0x00009f3ec9bcfb80ll, + 0x0000a57eb50295fbll, 0x0000a11d83f4c355ll, 0x0000a4a9cf1d9684ll, 0x0000a2f9d4c51039ll, + 0x0000a3d70a3d70a4ll, 0x0000a4d3c25e68dcll, 0x0000a3065e3fae7dll, 0x0000a6ab52d99e76ll, + 0x0000a237c32b16d0ll, 0x0000a8808c384547ll, 0x0000a16b312ea8fdll, 0x0000aa5374652a1cll, + 0x0000a0a0a0a0a0a1ll, 0x0000ac241134c4e9ll, 0x00009fd809fd80a0ll, 0x0000adf26865a8a1ll, + 0x00009f1165e72549ll, 0x0000afbe7fa0f04dll, 0x00009e4cad23dd60ll, 0x0000b1885c7aa982ll, + 0x00009d89d89d89d9ll, 0x0000b35004723c46ll, 0x00009cc8e160c3fcll, 0x0000b5157cf2d078ll, + 0x00009c09c09c09c1ll, 0x0000b6d8cb53b0call, 0x00009b4c6f9ef03bll, 0x0000b899f4d8ab63ll, + 0x00009a90e7d95bc7ll, 0x0000ba58feb2703all, 0x000099d722dabde6ll, 0x0000bc15edfeed32ll, + 0x0000991f1a515886ll, 0x0000bdd0c7c9a817ll, 0x00009868c809868dll, 0x0000bf89910c1678ll, + 0x000097b425ed097cll, 0x0000c1404eadf383ll, 0x000097012e025c05ll, 0x0000c2f5058593d9ll, + 0x0000964fda6c0965ll, 0x0000c4a7ba58377cll, 0x000095a02568095bll, 0x0000c65871da59ddll, + 0x000094f2094f2095ll, 0x0000c80730b00016ll, 0x0000944580944581ll, 0x0000c9b3fb6d0559ll, + 0x0000939a85c4093all, 0x0000cb5ed69565afll, 0x000092f113840498ll, 0x0000cd07c69d8702ll, + 0x0000924924924925ll, 0x0000ceaecfea8085ll, 0x000091a2b3c4d5e7ll, 0x0000d053f6d26089ll, + 0x000090fdbc090fdcll, 0x0000d1f73f9c70c0ll, 0x0000905a38633e07ll, 0x0000d398ae817906ll, + 0x00008fb823ee08fcll, 0x0000d53847ac00a6ll, 0x00008f1779d9fdc4ll, 0x0000d6d60f388e41ll, + 0x00008e78356d1409ll, 0x0000d8720935e643ll, 0x00008dda5202376all, 0x0000da0c39a54804ll, + 0x00008d3dcb08d3ddll, 0x0000dba4a47aa996ll, 0x00008ca29c046515ll, 0x0000dd3b4d9cf24bll, + 0x00008c08c08c08c1ll, 0x0000ded038e633f3ll, 0x00008b70344a139cll, 0x0000e0636a23e2eell, + 0x00008ad8f2fba939ll, 0x0000e1f4e5170d02ll, 0x00008a42f870566all, 0x0000e384ad748f0ell, + 0x000089ae4089ae41ll, 0x0000e512c6e54998ll, 0x0000891ac73ae982ll, 0x0000e69f35065448ll, + 0x0000888888888889ll, 0x0000e829fb693044ll, 0x000087f78087f781ll, 0x0000e9b31d93f98ell, + 0x00008767ab5f34e5ll, 0x0000eb3a9f019750ll, 0x000086d905447a35ll, 0x0000ecc08321eb30ll, + 0x0000864b8a7de6d2ll, 0x0000ee44cd59ffabll, 0x000085bf37612cefll, 0x0000efc781043579ll, + 0x0000853408534086ll, 0x0000f148a170700all, 0x000084a9f9c8084bll, 0x0000f2c831e44116ll, + 0x0000842108421085ll, 0x0000f446359b1353ll, 0x0000839930523fbfll, 0x0000f5c2afc65447ll, + 0x000083126e978d50ll, 0x0000f73da38d9d4all, 0x0000828cbfbeb9a1ll, 0x0000f8b7140edbb1ll, + 0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll, + 0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll, + 0x0000800000000000ll, 0x0000ffff00000000ll, + }; + + + // LL_tbl[k] = 2^48*log2(1.0+k/2^15); +static int64_t __LL_tbl[256] = { + 0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull, + 0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull, + 0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull, + 0x00000023e5bbb2b2ull, 0x00000026c81c83e4ull, 0x00000029aa7790f0ull, 0x0000002c8cccd9edull, + 0x0000002f6f1c5ef2ull, 0x0000003251662017ull, 0x0000003533aa1d71ull, 0x0000003815e8571aull, + 0x0000003af820cd26ull, 0x0000003dda537faeull, 0x00000040bc806ec8ull, 0x000000439ea79a8cull, + 0x0000004680c90310ull, 0x0000004962e4a86cull, 0x0000004c44fa8ab6ull, 0x0000004f270aaa06ull, + 0x0000005209150672ull, 0x00000054eb19a013ull, 0x00000057cd1876fdull, 0x0000005aaf118b4aull, + 0x0000005d9104dd0full, 0x0000006072f26c64ull, 0x0000006354da3960ull, 0x0000006636bc441aull, + 0x0000006918988ca8ull, 0x0000006bfa6f1322ull, 0x0000006edc3fd79full, 0x00000071be0ada35ull, + 0x000000749fd01afdull, 0x00000077818f9a0cull, 0x0000007a6349577aull, 0x0000007d44fd535eull, + 0x0000008026ab8dceull, 0x00000083085406e3ull, 0x00000085e9f6beb2ull, 0x00000088cb93b552ull, + 0x0000008bad2aeadcull, 0x0000008e8ebc5f65ull, 0x0000009170481305ull, 0x0000009451ce05d3ull, + 0x00000097334e37e5ull, 0x0000009a14c8a953ull, 0x0000009cf63d5a33ull, 0x0000009fd7ac4a9dull, + 0x000000a2b07f3458ull, 0x000000a59a78ea6aull, 0x000000a87bd699fbull, 0x000000ab5d2e8970ull, + 0x000000ae3e80b8e3ull, 0x000000b11fcd2869ull, 0x000000b40113d818ull, 0x000000b6e254c80aull, + 0x000000b9c38ff853ull, 0x000000bca4c5690cull, 0x000000bf85f51a4aull, 0x000000c2671f0c26ull, + 0x000000c548433eb6ull, 0x000000c82961b211ull, 0x000000cb0a7a664dull, 0x000000cdeb8d5b82ull, + 0x000000d0cc9a91c8ull, 0x000000d3ada20933ull, 0x000000d68ea3c1ddull, 0x000000d96f9fbbdbull, + 0x000000dc5095f744ull, 0x000000df31867430ull, 0x000000e2127132b5ull, 0x000000e4f35632eaull, + 0x000000e7d43574e6ull, 0x000000eab50ef8c1ull, 0x000000ed95e2be90ull, 0x000000f076b0c66cull, + 0x000000f35779106aull, 0x000000f6383b9ca2ull, 0x000000f918f86b2aull, 0x000000fbf9af7c1aull, + 0x000000feda60cf88ull, 0x00000101bb0c658cull, 0x000001049bb23e3cull, 0x000001077c5259afull, + 0x0000010a5cecb7fcull, 0x0000010d3d81593aull, 0x000001101e103d7full, 0x00000112fe9964e4ull, + 0x00000115df1ccf7eull, 0x00000118bf9a7d64ull, 0x0000011ba0126eadull, 0x0000011e8084a371ull, + 0x0000012160f11bc6ull, 0x000001244157d7c3ull, 0x0000012721b8d77full, 0x0000012a02141b10ull, + 0x0000012ce269a28eull, 0x0000012fc2b96e0full, 0x00000132a3037daaull, 0x000001358347d177ull, + 0x000001386386698cull, 0x0000013b43bf45ffull, 0x0000013e23f266e9ull, 0x00000141041fcc5eull, + 0x00000143e4477678ull, 0x00000146c469654bull, 0x00000149a48598f0ull, 0x0000014c849c117cull, + 0x0000014f64accf08ull, 0x0000015244b7d1a9ull, 0x0000015524bd1976ull, 0x0000015804bca687ull, + 0x0000015ae4b678f2ull, 0x0000015dc4aa90ceull, 0x00000160a498ee31ull, 0x0000016384819134ull, + 0x00000166646479ecull, 0x000001694441a870ull, 0x0000016c24191cd7ull, 0x0000016df6ca19bdull, + 0x00000171e3b6d7aaull, 0x00000174c37d1e44ull, 0x00000177a33dab1cull, 0x0000017a82f87e49ull, + 0x0000017d62ad97e2ull, 0x00000180425cf7feull, 0x00000182b07f3458ull, 0x0000018601aa8c19ull, + 0x00000188e148c046ull, 0x0000018bc0e13b52ull, 0x0000018ea073fd52ull, 0x000001918001065dull, + 0x000001945f88568bull, 0x000001973f09edf2ull, 0x0000019a1e85ccaaull, 0x0000019cfdfbf2c8ull, + 0x0000019fdd6c6063ull, 0x000001a2bcd71593ull, 0x000001a59c3c126eull, 0x000001a87b9b570bull, + 0x000001ab5af4e380ull, 0x000001ae3a48b7e5ull, 0x000001b11996d450ull, 0x000001b3f8df38d9ull, + 0x000001b6d821e595ull, 0x000001b9b75eda9bull, 0x000001bc96961803ull, 0x000001bf75c79de3ull, + 0x000001c254f36c51ull, 0x000001c534198365ull, 0x000001c81339e336ull, 0x000001caf2548bd9ull, + 0x000001cdd1697d67ull, 0x000001d0b078b7f5ull, 0x000001d38f823b9aull, 0x000001d66e86086dull, + 0x000001d94d841e86ull, 0x000001dc2c7c7df9ull, 0x000001df0b6f26dfull, 0x000001e1ea5c194eull, + 0x000001e4c943555dull, 0x000001e7a824db23ull, 0x000001ea8700aab5ull, 0x000001ed65d6c42bull, + 0x000001f044a7279dull, 0x000001f32371d51full, 0x000001f60236cccaull, 0x000001f8e0f60eb3ull, + 0x000001fbbfaf9af3ull, 0x000001fe9e63719eull, 0x000002017d1192ccull, 0x000002045bb9fe94ull, + 0x000002073a5cb50dull, 0x00000209c06e6212ull, 0x0000020cf791026aull, 0x0000020fd622997cull, + 0x00000212b07f3458ull, 0x000002159334a8d8ull, 0x0000021871b52150ull, 0x0000021b502fe517ull, + 0x0000021d6a73a78full, 0x000002210d144eeeull, 0x00000223eb7df52cull, 0x00000226c9e1e713ull, + 0x00000229a84024bbull, 0x0000022c23679b4eull, 0x0000022f64eb83a8ull, 0x000002324338a51bull, + 0x00000235218012a9ull, 0x00000237ffc1cc69ull, 0x0000023a2c3b0ea4ull, 0x0000023d13ee805bull, + 0x0000024035e9221full, 0x00000243788faf25ull, 0x0000024656b4e735ull, 0x00000247ed646bfeull, + 0x0000024c12ee3d98ull, 0x0000024ef1025c1aull, 0x00000251cf10c799ull, 0x0000025492644d65ull, + 0x000002578b1c85eeull, 0x0000025a6919d8f0ull, 0x0000025d13ee805bull, 0x0000026025036716ull, + 0x0000026296453882ull, 0x00000265e0d62b53ull, 0x00000268beb701f3ull, 0x0000026b9c92265eull, + 0x0000026d32f798a9ull, 0x00000271583758ebull, 0x000002743601673bull, 0x0000027713c5c3b0ull, + 0x00000279f1846e5full, 0x0000027ccf3d6761ull, 0x0000027e6580aecbull, 0x000002828a9e44b3ull, + 0x0000028568462932ull, 0x00000287bdbf5255ull, 0x0000028b2384de4aull, 0x0000028d13ee805bull, + 0x0000029035e9221full, 0x0000029296453882ull, 0x0000029699bdfb61ull, 0x0000029902a37aabull, + 0x0000029c54b864c9ull, 0x0000029deabd1083ull, 0x000002a20f9c0bb5ull, 0x000002a4c7605d61ull, + 0x000002a7bdbf5255ull, 0x000002a96056dafcull, 0x000002ac3daf14efull, 0x000002af1b019ecaull, + 0x000002b296453882ull, 0x000002b5d022d80full, 0x000002b8fa471cb3ull, 0x000002ba9012e713ull, + 0x000002bd6d4901ccull, 0x000002c04a796cf6ull, 0x000002c327a428a6ull, 0x000002c61a5e8f4cull, + 0x000002c8e1e891f6ull, 0x000002cbbf023fc2ull, 0x000002ce9c163e6eull, 0x000002d179248e13ull, + 0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull, +}; + + + + +#endif diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index a1ef53c..5b47736 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -20,7 +20,7 @@ #include <linux/crush/crush.h> #include <linux/crush/hash.h> -#include <linux/crush/mapper.h> +#include "crush_ln_table.h" /* * Implement the core CRUSH mapping algorithm. @@ -238,6 +238,102 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket, return bucket->h.items[high]; } +// compute 2^44*log2(input+1) +uint64_t crush_ln(unsigned xin) +{ + unsigned x=xin, x1; + int iexpon, index1, index2; + uint64_t RH, LH, LL, xl64, result; + + x++; + + // normalize input + iexpon = 15; + while(!(x&0x18000)) { x<<=1; iexpon--; } + + index1 = (x>>8)<<1; + // RH ~ 2^56/index1 + RH = __RH_LH_tbl[index1 - 256]; + // LH ~ 2^48 * log2(index1/256) + LH = __RH_LH_tbl[index1 + 1 - 256]; + + // RH*x ~ 2^48 * (2^15 + xf), xf<2^8 + xl64 = (int64_t)x * RH; + xl64 >>= 48; + x1 = xl64; + + result = iexpon; + result <<= (12 + 32); + + index2 = x1 & 0xff; + // LL ~ 2^48*log2(1.0+index2/2^15) + LL = __LL_tbl[index2]; + + LH = LH + LL; + + LH >>= (48-12 - 32); + result += LH; + + return result; +} + + +/* + * straw2 + * + * for reference, see: + * + * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables + * + */ + +static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, + int x, int r) +{ + unsigned i, high = 0; + unsigned u; + unsigned w; + __s64 ln, draw, high_draw = 0; + + for (i = 0; i < bucket->h.size; i++) { + w = bucket->item_weights[i]; + if (w) { + u = crush_hash32_3(bucket->h.hash, x, + bucket->h.items[i], r); + u &= 0xffff; + + /* + * for some reason slightly less than 0x10000 produces + * a slightly more accurate distribution... probably a + * rounding effect. + * + * the natural log lookup table maps [0,0xffff] + * (corresponding to real numbers [1/0x10000, 1] to + * [0, 0xffffffffffff] (corresponding to real numbers + * [-11.090355,0]). + */ + ln = crush_ln(u) - 0x1000000000000ll; + + /* + * divide by 16.16 fixed-point weight. note + * that the ln value is negative, so a larger + * weight means a larger (less negative) value + * for draw. + */ + draw = div64_s64(ln, w); + } else { + draw = S64_MIN; + } + + if (i == 0 || draw > high_draw) { + high = i; + high_draw = draw; + } + } + return bucket->h.items[high]; +} + + static int crush_bucket_choose(struct crush_bucket *in, int x, int r) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); @@ -255,12 +351,16 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) case CRUSH_BUCKET_STRAW: return bucket_straw_choose((struct crush_bucket_straw *)in, x, r); + case CRUSH_BUCKET_STRAW2: + return bucket_straw2_choose((struct crush_bucket_straw2 *)in, + x, r); default: dprintk("unknown bucket %d alg %d\n", in->id, in->alg); return in->items[0]; } } + /* * true if device is marked "out" (failed, fully offloaded) * of the cluster @@ -290,6 +390,7 @@ static int is_out(const struct crush_map *map, * @type: the type of item to choose * @out: pointer to output vector * @outpos: our position in that vector + * @out_size: size of the out vector * @tries: number of attempts to make * @recurse_tries: number of attempts to have recursive chooseleaf make * @local_retries: localized retries @@ -304,6 +405,7 @@ static int crush_choose_firstn(const struct crush_map *map, const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, + int out_size, unsigned int tries, unsigned int recurse_tries, unsigned int local_retries, @@ -322,6 +424,7 @@ static int crush_choose_firstn(const struct crush_map *map, int item = 0; int itemtype; int collide, reject; + int count = out_size; dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", recurse_to_leaf ? "_LEAF" : "", @@ -329,7 +432,7 @@ static int crush_choose_firstn(const struct crush_map *map, tries, recurse_tries, local_retries, local_fallback_retries, parent_r); - for (rep = outpos; rep < numrep; rep++) { + for (rep = outpos; rep < numrep && count > 0 ; rep++) { /* keep trying until we get a non-out, non-colliding item */ ftotal = 0; skip_rep = 0; @@ -403,7 +506,7 @@ static int crush_choose_firstn(const struct crush_map *map, map->buckets[-1-item], weight, weight_max, x, outpos+1, 0, - out2, outpos, + out2, outpos, count, recurse_tries, 0, local_retries, local_fallback_retries, @@ -463,6 +566,7 @@ reject: dprintk("CHOOSE got %d\n", item); out[outpos] = item; outpos++; + count--; } dprintk("CHOOSE returns %d\n", outpos); @@ -654,6 +758,7 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; + int out_size; /* * the original choose_total_tries value was off by one (it * counted "retries" and not "tries"). add one. @@ -761,6 +866,7 @@ int crush_do_rule(const struct crush_map *map, x, numrep, curstep->arg2, o+osize, j, + result_max-osize, choose_tries, recurse_tries, choose_local_retries, @@ -770,11 +876,13 @@ int crush_do_rule(const struct crush_map *map, c+osize, 0); } else { + out_size = ((numrep < (result_max-osize)) ? + numrep : (result_max-osize)); crush_choose_indep( map, map->buckets[-1-w[i]], weight, weight_max, - x, numrep, numrep, + x, out_size, numrep, curstep->arg2, o+osize, j, choose_tries, @@ -783,7 +891,7 @@ int crush_do_rule(const struct crush_map *map, recurse_to_leaf, c+osize, 0); - osize += numrep; + osize += out_size; } } diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 14d9995..593dc2e 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -22,6 +22,7 @@ * .../monmap - current monmap * .../osdc - active osd requests * .../monc - mon client state + * .../client_options - libceph-only (i.e. not rbd or cephfs) options * .../dentry_lru - dump contents of dentry lru * .../caps - expose cap (reservation) stats * .../bdi - symlink to ../../bdi/something @@ -177,10 +178,24 @@ static int osdc_show(struct seq_file *s, void *pp) return 0; } +static int client_options_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = s->private; + int ret; + + ret = ceph_print_client_options(s, client); + if (ret) + return ret; + + seq_putc(s, '\n'); + return 0; +} + CEPH_DEFINE_SHOW_FUNC(monmap_show) CEPH_DEFINE_SHOW_FUNC(osdmap_show) CEPH_DEFINE_SHOW_FUNC(monc_show) CEPH_DEFINE_SHOW_FUNC(osdc_show) +CEPH_DEFINE_SHOW_FUNC(client_options_show) int ceph_debugfs_init(void) { @@ -242,6 +257,14 @@ int ceph_debugfs_client_init(struct ceph_client *client) if (!client->debugfs_osdmap) goto out; + client->debugfs_options = debugfs_create_file("client_options", + 0600, + client->debugfs_dir, + client, + &client_options_show_fops); + if (!client->debugfs_options) + goto out; + return 0; out: @@ -252,6 +275,7 @@ out: void ceph_debugfs_client_cleanup(struct ceph_client *client) { dout("ceph_debugfs_client_cleanup %p\n", client); + debugfs_remove(client->debugfs_options); debugfs_remove(client->debugfs_osdmap); debugfs_remove(client->debugfs_monmap); debugfs_remove(client->osdc.debugfs_file); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index a9f4ae4..967080a 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -505,8 +505,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) pr_err("connect %s error %d\n", ceph_pr_addr(&con->peer_addr.in_addr), ret); sock_release(sock); - con->error_msg = "connect error"; - return ret; } @@ -2145,12 +2143,10 @@ static int process_connect(struct ceph_connection *con) * to WAIT. This shouldn't happen if we are the * client. */ - pr_err("process_connect got WAIT as client\n"); con->error_msg = "protocol error, got WAIT as client"; return -1; default: - pr_err("connect protocol error, will retry\n"); con->error_msg = "protocol error, garbage tag during connect"; return -1; } @@ -2282,8 +2278,7 @@ static int read_partial_message(struct ceph_connection *con) crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); if (cpu_to_le32(crc) != con->in_hdr.crc) { - pr_err("read_partial_message bad hdr " - " crc %u != expected %u\n", + pr_err("read_partial_message bad hdr crc %u != expected %u\n", crc, con->in_hdr.crc); return -EBADMSG; } @@ -2313,7 +2308,7 @@ static int read_partial_message(struct ceph_connection *con) pr_err("read_partial_message bad seq %lld expected %lld\n", seq, con->in_seq + 1); con->error_msg = "bad message sequence # for incoming message"; - return -EBADMSG; + return -EBADE; } /* allocate message? */ @@ -2660,6 +2655,8 @@ more: switch (ret) { case -EBADMSG: con->error_msg = "bad crc"; + /* fall through */ + case -EBADE: ret = -EIO; break; case -EIO: @@ -2838,7 +2835,8 @@ static void con_work(struct work_struct *work) if (ret < 0) { if (ret == -EAGAIN) continue; - con->error_msg = "socket error on read"; + if (!con->error_msg) + con->error_msg = "socket error on read"; fault = true; break; } @@ -2847,7 +2845,8 @@ static void con_work(struct work_struct *work) if (ret < 0) { if (ret == -EAGAIN) continue; - con->error_msg = "socket error on write"; + if (!con->error_msg) + con->error_msg = "socket error on write"; fault = true; } @@ -2869,11 +2868,13 @@ static void con_work(struct work_struct *work) */ static void con_fault(struct ceph_connection *con) { - pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); dout("fault %p state %lu to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); + pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); + con->error_msg = NULL; + WARN_ON(con->state != CON_STATE_CONNECTING && con->state != CON_STATE_NEGOTIATING && con->state != CON_STATE_OPEN); @@ -3295,8 +3296,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) */ if (*skip) return 0; - con->error_msg = "error allocating memory for incoming message"; + con->error_msg = "error allocating memory for incoming message"; return -ENOMEM; } memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index b8c3fde..1579669 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -122,6 +122,22 @@ bad: return -EINVAL; } +static int crush_decode_straw2_bucket(void **p, void *end, + struct crush_bucket_straw2 *b) +{ + int j; + dout("crush_decode_straw2_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) + b->item_weights[j] = ceph_decode_32(p); + return 0; +bad: + return -EINVAL; +} + static int skip_name_map(void **p, void *end) { int len; @@ -204,6 +220,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end) case CRUSH_BUCKET_STRAW: size = sizeof(struct crush_bucket_straw); break; + case CRUSH_BUCKET_STRAW2: + size = sizeof(struct crush_bucket_straw2); + break; default: err = -EINVAL; goto bad; @@ -261,6 +280,12 @@ static struct crush_map *crush_decode(void *pbyval, void *end) if (err < 0) goto bad; break; + case CRUSH_BUCKET_STRAW2: + err = crush_decode_straw2_bucket(p, end, + (struct crush_bucket_straw2 *)b); + if (err < 0) + goto bad; + break; } } diff --git a/net/core/dev.c b/net/core/dev.c index 1796cef5..2c1c67f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3079,7 +3079,7 @@ static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { - if (next_cpu != RPS_NO_CPU) { + if (next_cpu < nr_cpu_ids) { #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; @@ -3184,7 +3184,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: - * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. @@ -3192,14 +3192,14 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && - (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || + (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } - if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; @@ -3240,14 +3240,14 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - int cpu; + unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = ACCESS_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu != RPS_NO_CPU && + if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - rflow->last_qtail) < (int)(10 * flow_table->mask))) @@ -5209,7 +5209,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) return -EBUSY; - if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper)) + if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 1d00b89..1347e11 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -359,7 +359,15 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) int err; struct ethtool_cmd cmd; - err = __ethtool_get_settings(dev, &cmd); + if (!dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + cmd.cmd = ETHTOOL_GSET; + + err = dev->ethtool_ops->get_settings(dev, &cmd); if (err < 0) return err; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 78fc04a..572af00 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -601,7 +601,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) } err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_GETNSID, net, peer, -1); + RTM_NEWNSID, net, peer, -1); if (err < 0) goto err_out; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 358d52a..8de3682 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2416,6 +2416,9 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, { struct sk_buff *skb; + if (dev->reg_state != NETREG_REGISTERED) + return; + skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); if (skb) rtmsg_ifinfo_send(skb, dev, flags); @@ -2854,7 +2857,7 @@ static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, - u32 flags, u32 mask) + u32 flags, u32 mask, int nlflags) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; @@ -2863,7 +2866,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); - nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI); + nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); if (nlh == NULL) return -EMSGSIZE; @@ -2969,7 +2972,8 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { if (idx >= cb->args[0] && br_dev->netdev_ops->ndo_bridge_getlink( - skb, portid, seq, dev, filter_mask) < 0) + skb, portid, seq, dev, filter_mask, + NLM_F_MULTI) < 0) break; idx++; } @@ -2977,7 +2981,8 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) if (ops->ndo_bridge_getlink) { if (idx >= cb->args[0] && ops->ndo_bridge_getlink(skb, portid, seq, dev, - filter_mask) < 0) + filter_mask, + NLM_F_MULTI) < 0) break; idx++; } @@ -3018,7 +3023,7 @@ static int rtnl_bridge_notify(struct net_device *dev) goto errout; } - err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); + err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0); if (err < 0) goto errout; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d1967da..3cfff2a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -280,13 +280,14 @@ nodata: EXPORT_SYMBOL(__alloc_skb); /** - * build_skb - build a network buffer + * __build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of fragment, or 0 if head was kmalloced + * @frag_size: size of data, or 0 if head was kmalloced * * Allocate a new &sk_buff. Caller provides space holding head and * skb_shared_info. @data must have been allocated by kmalloc() only if - * @frag_size is 0, otherwise data should come from the page allocator. + * @frag_size is 0, otherwise data should come from the page allocator + * or vmalloc() * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : @@ -297,7 +298,7 @@ EXPORT_SYMBOL(__alloc_skb); * before giving packet to stack. * RX rings only contains data buffers, not full skbs. */ -struct sk_buff *build_skb(void *data, unsigned int frag_size) +struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct skb_shared_info *shinfo; struct sk_buff *skb; @@ -311,7 +312,6 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) memset(skb, 0, offsetof(struct sk_buff, tail)); skb->truesize = SKB_TRUESIZE(size); - skb->head_frag = frag_size != 0; atomic_set(&skb->users, 1); skb->head = data; skb->data = data; @@ -328,6 +328,23 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) return skb; } + +/* build_skb() is wrapper over __build_skb(), that specifically + * takes care of skb->head and skb->pfmemalloc + * This means that if @frag_size is not zero, then @data must be backed + * by a page fragment, not kmalloc() or vmalloc() + */ +struct sk_buff *build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb = __build_skb(data, frag_size); + + if (skb && frag_size) { + skb->head_frag = 1; + if (virt_to_head_page(data)->pfmemalloc) + skb->pfmemalloc = 1; + } + return skb; +} EXPORT_SYMBOL(build_skb); struct netdev_alloc_cache { @@ -348,7 +365,8 @@ static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, gfp_t gfp = gfp_mask; if (order) { - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC; page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); nc->frag.size = PAGE_SIZE << (page ? order : 0); } diff --git a/net/core/sock.c b/net/core/sock.c index e891bcf..292f422 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1474,8 +1474,8 @@ void sk_release_kernel(struct sock *sk) return; sock_hold(sk); - sock_net_set(sk, get_net(&init_net)); sock_release(sk->sk_socket); + sock_net_set(sk, get_net(&init_net)); sock_put(sk); } EXPORT_SYMBOL(sk_release_kernel); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 2b4f21d..ccf4c56 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -453,7 +453,8 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) iph->saddr, iph->daddr); if (req) { nsk = dccp_check_req(sk, skb, req); - reqsk_put(req); + if (!nsk) + reqsk_put(req); return nsk; } nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 9d05510..5165571 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -301,7 +301,8 @@ static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) &iph->daddr, inet6_iif(skb)); if (req) { nsk = dccp_check_req(sk, skb, req); - reqsk_put(req); + if (!nsk) + reqsk_put(req); return nsk; } nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo, diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 5f56666..30addee 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -186,8 +186,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, if (child == NULL) goto listen_overflow; - inet_csk_reqsk_queue_unlink(sk, req); - inet_csk_reqsk_queue_removed(sk, req); + inet_csk_reqsk_queue_drop(sk, req); inet_csk_reqsk_queue_add(sk, req, child); out: return child; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 079a224..e6f6cc3 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -633,7 +633,7 @@ static int dsa_of_probe(struct device *dev) if (cd->sw_addr > PHY_MAX_ADDR) continue; - if (!of_property_read_u32(np, "eeprom-length", &eeprom_len)) + if (!of_property_read_u32(child, "eeprom-length", &eeprom_len)) cd->eeprom_len = eeprom_len; for_each_available_child_of_node(child, port) { diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile index 05dab29..4adfd4d 100644 --- a/net/ieee802154/Makefile +++ b/net/ieee802154/Makefile @@ -3,7 +3,9 @@ obj-$(CONFIG_IEEE802154_SOCKET) += ieee802154_socket.o obj-y += 6lowpan/ ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \ - header_ops.o sysfs.o nl802154.o + header_ops.o sysfs.o nl802154.o trace.o ieee802154_socket-y := socket.o +CFLAGS_trace.o := -I$(src) + ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 1b9d25f6..346c666 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -175,6 +175,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) int rc = -ENOBUFS; struct net_device *dev; int type = __IEEE802154_DEV_INVALID; + unsigned char name_assign_type; pr_debug("%s\n", __func__); @@ -190,8 +191,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0') return -EINVAL; /* phy name should be null-terminated */ + name_assign_type = NET_NAME_USER; } else { devname = "wpan%d"; + name_assign_type = NET_NAME_ENUM; } if (strlen(devname) >= IFNAMSIZ) @@ -221,7 +224,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) } dev = rdev_add_virtual_intf_deprecated(wpan_phy_to_rdev(phy), devname, - type); + name_assign_type, type); if (IS_ERR(dev)) { rc = PTR_ERR(dev); goto nla_put_failure; diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index a4daf91..f3c12f6 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -589,7 +589,7 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info) return rdev_add_virtual_intf(rdev, nla_data(info->attrs[NL802154_ATTR_IFNAME]), - type, extended_addr); + NET_NAME_USER, type, extended_addr); } static int nl802154_del_interface(struct sk_buff *skb, struct genl_info *info) diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h index 7c46732..7b5a9dd 100644 --- a/net/ieee802154/rdev-ops.h +++ b/net/ieee802154/rdev-ops.h @@ -4,13 +4,16 @@ #include <net/cfg802154.h> #include "core.h" +#include "trace.h" static inline struct net_device * rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, - const char *name, int type) + const char *name, + unsigned char name_assign_type, + int type) { return rdev->ops->add_virtual_intf_deprecated(&rdev->wpan_phy, name, - type); + name_assign_type, type); } static inline void @@ -22,75 +25,131 @@ rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, static inline int rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name, + unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { - return rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, type, + int ret; + + trace_802154_rdev_add_virtual_intf(&rdev->wpan_phy, name, type, extended_addr); + ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, + name_assign_type, type, + extended_addr); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_del_virtual_intf(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { - return rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev); + int ret; + + trace_802154_rdev_del_virtual_intf(&rdev->wpan_phy, wpan_dev); + ret = rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_channel(struct cfg802154_registered_device *rdev, u8 page, u8 channel) { - return rdev->ops->set_channel(&rdev->wpan_phy, page, channel); + int ret; + + trace_802154_rdev_set_channel(&rdev->wpan_phy, page, channel); + ret = rdev->ops->set_channel(&rdev->wpan_phy, page, channel); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_cca_mode(struct cfg802154_registered_device *rdev, const struct wpan_phy_cca *cca) { - return rdev->ops->set_cca_mode(&rdev->wpan_phy, cca); + int ret; + + trace_802154_rdev_set_cca_mode(&rdev->wpan_phy, cca); + ret = rdev->ops->set_cca_mode(&rdev->wpan_phy, cca); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_pan_id(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le16 pan_id) { - return rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); + int ret; + + trace_802154_rdev_set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); + ret = rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_short_addr(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le16 short_addr) { - return rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); + int ret; + + trace_802154_rdev_set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); + ret = rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_backoff_exponent(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u8 min_be, u8 max_be) { - return rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev, + int ret; + + trace_802154_rdev_set_backoff_exponent(&rdev->wpan_phy, wpan_dev, min_be, max_be); + ret = rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev, + min_be, max_be); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_max_csma_backoffs(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u8 max_csma_backoffs) { - return rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev, - max_csma_backoffs); + int ret; + + trace_802154_rdev_set_csma_backoffs(&rdev->wpan_phy, wpan_dev, + max_csma_backoffs); + ret = rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev, + max_csma_backoffs); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_max_frame_retries(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, s8 max_frame_retries) { - return rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev, + int ret; + + trace_802154_rdev_set_max_frame_retries(&rdev->wpan_phy, wpan_dev, max_frame_retries); + ret = rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev, + max_frame_retries); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_lbt_mode(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, bool mode) { - return rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); + int ret; + + trace_802154_rdev_set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); + ret = rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } #endif /* __CFG802154_RDEV_OPS */ diff --git a/net/ieee802154/trace.c b/net/ieee802154/trace.c new file mode 100644 index 0000000..95f997f --- /dev/null +++ b/net/ieee802154/trace.c @@ -0,0 +1,7 @@ +#include <linux/module.h> + +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "trace.h" + +#endif diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h new file mode 100644 index 0000000..5ac25eb --- /dev/null +++ b/net/ieee802154/trace.h @@ -0,0 +1,247 @@ +/* Based on net/wireless/tracing.h */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cfg802154 + +#if !defined(__RDEV_CFG802154_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ) +#define __RDEV_CFG802154_OPS_TRACE + +#include <linux/tracepoint.h> + +#include <net/cfg802154.h> + +#define MAXNAME 32 +#define WPAN_PHY_ENTRY __array(char, wpan_phy_name, MAXNAME) +#define WPAN_PHY_ASSIGN strlcpy(__entry->wpan_phy_name, \ + wpan_phy_name(wpan_phy), \ + MAXNAME) +#define WPAN_PHY_PR_FMT "%s" +#define WPAN_PHY_PR_ARG __entry->wpan_phy_name + +#define WPAN_DEV_ENTRY __field(u32, identifier) +#define WPAN_DEV_ASSIGN (__entry->identifier) = (!IS_ERR_OR_NULL(wpan_dev) \ + ? wpan_dev->identifier : 0) +#define WPAN_DEV_PR_FMT "wpan_dev(%u)" +#define WPAN_DEV_PR_ARG (__entry->identifier) + +#define WPAN_CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \ + __field(enum nl802154_cca_opts, cca_opt) +#define WPAN_CCA_ASSIGN \ + do { \ + (__entry->cca_mode) = cca->mode; \ + (__entry->cca_opt) = cca->opt; \ + } while (0) +#define WPAN_CCA_PR_FMT "cca_mode: %d, cca_opt: %d" +#define WPAN_CCA_PR_ARG __entry->cca_mode, __entry->cca_opt + +#define BOOL_TO_STR(bo) (bo) ? "true" : "false" + +/************************************************************* + * rdev->ops traces * + *************************************************************/ + +TRACE_EVENT(802154_rdev_add_virtual_intf, + TP_PROTO(struct wpan_phy *wpan_phy, char *name, + enum nl802154_iftype type, __le64 extended_addr), + TP_ARGS(wpan_phy, name, type, extended_addr), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __string(vir_intf_name, name ? name : "<noname>") + __field(enum nl802154_iftype, type) + __field(__le64, extended_addr) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __assign_str(vir_intf_name, name ? name : "<noname>"); + __entry->type = type; + __entry->extended_addr = extended_addr; + ), + TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, ea %llx", + WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type, + __le64_to_cpu(__entry->extended_addr)) +); + +TRACE_EVENT(802154_rdev_del_virtual_intf, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev), + TP_ARGS(wpan_phy, wpan_dev), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT, WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG) +); + +TRACE_EVENT(802154_rdev_set_channel, + TP_PROTO(struct wpan_phy *wpan_phy, u8 page, u8 channel), + TP_ARGS(wpan_phy, page, channel), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __field(u8, page) + __field(u8, channel) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __entry->page = page; + __entry->channel = channel; + ), + TP_printk(WPAN_PHY_PR_FMT ", page: %d, channel: %d", WPAN_PHY_PR_ARG, + __entry->page, __entry->channel) +); + +TRACE_EVENT(802154_rdev_set_cca_mode, + TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca), + TP_ARGS(wpan_phy, cca), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_CCA_ENTRY + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_CCA_ASSIGN; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_CCA_PR_FMT, WPAN_PHY_PR_ARG, + WPAN_CCA_PR_ARG) +); + +DECLARE_EVENT_CLASS(802154_le16_template, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le16 le16arg), + TP_ARGS(wpan_phy, wpan_dev, le16arg), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(__le16, le16arg) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->le16arg = le16arg; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", pan id: 0x%04x", + WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, + __le16_to_cpu(__entry->le16arg)) +); + +DEFINE_EVENT(802154_le16_template, 802154_rdev_set_pan_id, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le16 le16arg), + TP_ARGS(wpan_phy, wpan_dev, le16arg) +); + +DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le16 le16arg), + TP_ARGS(wpan_phy, wpan_dev, le16arg), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", sa: 0x%04x", + WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, + __le16_to_cpu(__entry->le16arg)) +); + +TRACE_EVENT(802154_rdev_set_backoff_exponent, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + u8 min_be, u8 max_be), + TP_ARGS(wpan_phy, wpan_dev, min_be, max_be), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(u8, min_be) + __field(u8, max_be) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->min_be = min_be; + __entry->max_be = max_be; + ), + + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", min be: %d, max_be: %d", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, __entry->min_be, __entry->max_be) +); + +TRACE_EVENT(802154_rdev_set_csma_backoffs, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + u8 max_csma_backoffs), + TP_ARGS(wpan_phy, wpan_dev, max_csma_backoffs), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(u8, max_csma_backoffs) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->max_csma_backoffs = max_csma_backoffs; + ), + + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", max csma backoffs: %d", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, __entry->max_csma_backoffs) +); + +TRACE_EVENT(802154_rdev_set_max_frame_retries, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + s8 max_frame_retries), + TP_ARGS(wpan_phy, wpan_dev, max_frame_retries), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(s8, max_frame_retries) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->max_frame_retries = max_frame_retries; + ), + + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", max frame retries: %d", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, __entry->max_frame_retries) +); + +TRACE_EVENT(802154_rdev_set_lbt_mode, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + bool mode), + TP_ARGS(wpan_phy, wpan_dev, mode), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(bool, mode) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->mode = mode; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", lbt mode: %s", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode)) +); + +TRACE_EVENT(802154_rdev_return_int, + TP_PROTO(struct wpan_phy *wpan_phy, int ret), + TP_ARGS(wpan_phy, ret), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __field(int, ret) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __entry->ret = ret; + ), + TP_printk(WPAN_PHY_PR_FMT ", returned: %d", WPAN_PHY_PR_ARG, + __entry->ret) +); + +#endif /* !__RDEV_CFG802154_OPS_TRACE || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e13fcc6..09b62e1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1164,6 +1164,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; + new_fa->tb_id = tb->tb_id; err = netdev_switch_fib_ipv4_add(key, plen, fi, new_fa->fa_tos, @@ -1764,7 +1765,7 @@ void fib_table_flush_external(struct fib_table *tb) /* record local slen */ slen = fa->fa_slen; - if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL)) + if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD)) continue; netdev_switch_fib_ipv4_del(n->key, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5c3dd62..8976ca4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -564,6 +564,40 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); +/* return true if req was found in the syn_table[] */ +static bool reqsk_queue_unlink(struct request_sock_queue *queue, + struct request_sock *req) +{ + struct listen_sock *lopt = queue->listen_opt; + struct request_sock **prev; + bool found = false; + + spin_lock(&queue->syn_wait_lock); + + for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; + prev = &(*prev)->dl_next) { + if (*prev == req) { + *prev = req->dl_next; + found = true; + break; + } + } + + spin_unlock(&queue->syn_wait_lock); + if (del_timer(&req->rsk_timer)) + reqsk_put(req); + return found; +} + +void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) +{ + if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) { + reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); + reqsk_put(req); + } +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); + static void reqsk_timer_handler(unsigned long data) { struct request_sock *req = (struct request_sock *)data; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index bb77ebd..4d32262 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -224,14 +224,16 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, handler->idiag_get_info(sk, r, info); if (sk->sk_state < TCP_TIME_WAIT) { - int err = 0; + union tcp_cc_info info; + size_t sz = 0; + int attr; rcu_read_lock(); ca_ops = READ_ONCE(icsk->icsk_ca_ops); if (ca_ops && ca_ops->get_info) - err = ca_ops->get_info(sk, ext, skb); + sz = ca_ops->get_info(sk, ext, &attr, &info); rcu_read_unlock(); - if (err < 0) + if (sz && nla_put(skb, attr, sz, &info) < 0) goto errout; } diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 939992c..3674484 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -82,6 +82,9 @@ int ip_forward(struct sk_buff *skb) if (skb->pkt_type != PACKET_HOST) goto drop; + if (unlikely(skb->sk)) + goto drop; + if (skb_warn_if_lro(skb)) goto drop; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 13bfe84..a612007 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1075,6 +1075,9 @@ static int do_replace(struct net *net, const void __user *user, /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); @@ -1499,6 +1502,9 @@ static int compat_do_replace(struct net *net, void __user *user, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c69db7f..2d0e265 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1262,6 +1262,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); @@ -1809,6 +1812,9 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index a93f260..05ff44b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -158,6 +158,7 @@ void ping_unhash(struct sock *sk) if (sk_hashed(sk)) { write_lock_bh(&ping_table.lock); hlist_nulls_del(&sk->sk_nulls_node); + sk_nulls_node_init(&sk->sk_nulls_node); sock_put(sk); isk->inet_num = 0; isk->inet_sport = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a78540f..f45f2a1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -902,6 +902,10 @@ static int ip_error(struct sk_buff *skb) bool send; int code; + /* IP on this device is disabled. */ + if (!in_dev) + goto out; + net = dev_net(rt->dst.dev); if (!IN_DEV_FORWARD(in_dev)) { switch (rt->dst.error) { @@ -962,10 +966,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (dst_metric_locked(dst, RTAX_MTU)) return; - if (dst->dev->mtu < mtu) - return; - - if (rt->rt_pmtu && rt->rt_pmtu < mtu) + if (ipv4_mtu(dst) < mtu) return; if (mtu < ip_rt_min_pmtu) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 59c8a02..f1377f2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -252,6 +252,7 @@ #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> +#include <linux/inet_diag.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/skbuff.h> @@ -401,6 +402,7 @@ void tcp_init_sock(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; + u64_stats_init(&tp->syncp); tp->reordering = sysctl_tcp_reordering; tcp_enable_early_retrans(tp); @@ -520,8 +522,10 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Race breaker. If space is freed after * wspace test but before the flags are set, - * IO signal will be lost. + * IO signal will be lost. Memory barrier + * pairs with the input side. */ + smp_mb__after_atomic(); if (sk_stream_is_writeable(sk)) mask |= POLLOUT | POLLWRNORM; } @@ -2590,11 +2594,12 @@ EXPORT_SYMBOL(compat_tcp_setsockopt); #endif /* Return information about state of tcp endpoint in API format. */ -void tcp_get_info(const struct sock *sk, struct tcp_info *info) +void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; + unsigned int start; u32 rate; memset(info, 0, sizeof(*info)); @@ -2661,6 +2666,12 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) rate = READ_ONCE(sk->sk_max_pacing_rate); info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL; + + do { + start = u64_stats_fetch_begin_irq(&tp->syncp); + info->tcpi_bytes_acked = tp->bytes_acked; + info->tcpi_bytes_received = tp->bytes_received; + } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2732,6 +2743,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; } + case TCP_CC_INFO: { + const struct tcp_congestion_ops *ca_ops; + union tcp_cc_info info; + size_t sz = 0; + int attr; + + if (get_user(len, optlen)) + return -EFAULT; + + ca_ops = icsk->icsk_ca_ops; + if (ca_ops && ca_ops->get_info) + sz = ca_ops->get_info(sk, ~0U, &attr, &info); + + len = min_t(unsigned int, len, sz); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + return 0; + } case TCP_QUICKACK: val = !icsk->icsk_ack.pingpong; break; diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 4376016..4c41c12 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -277,7 +277,8 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) } } -static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct dctcp *ca = inet_csk_ca(sk); @@ -286,18 +287,17 @@ static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) */ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcp_dctcp_info info; - - memset(&info, 0, sizeof(info)); + memset(info, 0, sizeof(struct tcp_dctcp_info)); if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { - info.dctcp_enabled = 1; - info.dctcp_ce_state = (u16) ca->ce_state; - info.dctcp_alpha = ca->dctcp_alpha; - info.dctcp_ab_ecn = ca->acked_bytes_ecn; - info.dctcp_ab_tot = ca->acked_bytes_total; + info->dctcp.dctcp_enabled = 1; + info->dctcp.dctcp_ce_state = (u16) ca->ce_state; + info->dctcp.dctcp_alpha = ca->dctcp_alpha; + info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn; + info->dctcp.dctcp_ab_tot = ca->acked_bytes_total; } - return nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); + *attr = INET_DIAG_DCTCPINFO; + return sizeof(*info); } return 0; } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index e3d87ac..46b087a 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -206,6 +206,11 @@ static bool tcp_fastopen_create_child(struct sock *sk, skb_set_owner_r(skb2, child); __skb_queue_tail(&child->sk_receive_queue, skb2); tp->syn_data_acked = 1; + + /* u64_stats_update_begin(&tp->syncp) not needed here, + * as we certainly are not changing upper 32bit value (0) + */ + tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1; } else { end_seq = TCP_SKB_CB(skb)->seq + 1; } diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 67476f0..f71002e 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -300,24 +300,25 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) } /* Extract info for Tcp socket info provided via netlink. */ -static int tcp_illinois_info(struct sock *sk, u32 ext, struct sk_buff *skb) +static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct illinois *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = 1, - .tcpv_rttcnt = ca->cnt_rtt, - .tcpv_minrtt = ca->base_rtt, - }; + info->vegas.tcpv_enabled = 1; + info->vegas.tcpv_rttcnt = ca->cnt_rtt; + info->vegas.tcpv_minrtt = ca->base_rtt; + info->vegas.tcpv_rtt = 0; - if (info.tcpv_rttcnt > 0) { + if (info->vegas.tcpv_rttcnt > 0) { u64 t = ca->sum_rtt; - do_div(t, info.tcpv_rttcnt); - info.tcpv_rtt = t; + do_div(t, info->vegas.tcpv_rttcnt); + info->vegas.tcpv_rtt = t; } - return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } return 0; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a7ef679..c9ab964 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1820,14 +1820,12 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - tcp_mark_lost_retrans(sk); - - tcp_verify_left_out(tp); - if ((state.reord < tp->fackets_out) && ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); + tcp_mark_lost_retrans(sk); + tcp_verify_left_out(tp); out: #if FASTRETRANS_DEBUG > 0 @@ -2700,16 +2698,21 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); + if ((flag & FLAG_SND_UNA_ADVANCED) && + tcp_try_undo_loss(sk, false)) + return; + if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ /* Step 3.b. A timeout is spurious if not all data are * lost, i.e., never-retransmitted data are (s)acked. */ - if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED)) + if ((flag & FLAG_ORIG_SACK_ACKED) && + tcp_try_undo_loss(sk, true)) return; - if (after(tp->snd_nxt, tp->high_seq) && - (flag & FLAG_DATA_SACKED || is_dupack)) { - tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ + if (after(tp->snd_nxt, tp->high_seq)) { + if (flag & FLAG_DATA_SACKED || is_dupack) + tp->frto = 0; /* Step 3.a. loss was real */ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { tp->high_seq = tp->snd_nxt; __tcp_push_pending_frames(sk, tcp_current_mss(sk), @@ -2734,8 +2737,6 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } - if (tcp_try_undo_loss(sk, false)) - return; tcp_xmit_retransmit_queue(sk); } @@ -3280,6 +3281,28 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp, (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); } +/* If we update tp->snd_una, also update tp->bytes_acked */ +static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) +{ + u32 delta = ack - tp->snd_una; + + u64_stats_update_begin(&tp->syncp); + tp->bytes_acked += delta; + u64_stats_update_end(&tp->syncp); + tp->snd_una = ack; +} + +/* If we update tp->rcv_nxt, also update tp->bytes_received */ +static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) +{ + u32 delta = seq - tp->rcv_nxt; + + u64_stats_update_begin(&tp->syncp); + tp->bytes_received += delta; + u64_stats_update_end(&tp->syncp); + tp->rcv_nxt = seq; +} + /* Update our send window. * * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 @@ -3315,7 +3338,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 } } - tp->snd_una = ack; + tcp_snd_una_update(tp, ack); return flag; } @@ -3497,7 +3520,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * Note, we use the fact that SND.UNA>=SND.WL2. */ tcp_update_wl(tp, ack_seq); - tp->snd_una = ack; + tcp_snd_una_update(tp, ack); flag |= FLAG_WIN_UPDATE; tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); @@ -4236,7 +4259,7 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -4404,7 +4427,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; - tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { __skb_queue_tail(&sk->sk_receive_queue, skb); skb_set_owner_r(skb, sk); @@ -4497,7 +4520,7 @@ queue_and_out: eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -4845,6 +4868,8 @@ static void tcp_check_space(struct sock *sk) { if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); + /* pairs with tcp_poll() */ + smp_mb__after_atomic(); if (sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) tcp_new_space(sk); @@ -5243,7 +5268,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); __skb_pull(skb, tcp_header_len); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); eaten = 1; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3571f2b..fc1c658 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1348,7 +1348,8 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); if (req) { nsk = tcp_check_req(sk, skb, req, false); - reqsk_put(req); + if (!nsk) + reqsk_put(req); return nsk; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 63d6311..b5732a5 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -300,7 +300,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; - tw->tw_flowlabel = np->flow_label >> 12; + tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); tw->tw_ipv6only = sk->sk_ipv6only; } #endif @@ -755,10 +755,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; - inet_csk_reqsk_queue_unlink(sk, req); - inet_csk_reqsk_queue_removed(sk, req); - + inet_csk_reqsk_queue_drop(sk, req); inet_csk_reqsk_queue_add(sk, req, child); + /* Warning: caller must not call reqsk_put(req); + * child stole last reference on it. + */ return child; listen_overflow: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8c8d7e0..a369e8a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2812,39 +2812,65 @@ begin_fwd: } } -/* Send a fin. The caller locks the socket for us. This cannot be - * allowed to fail queueing a FIN frame under any circumstances. +/* We allow to exceed memory limits for FIN packets to expedite + * connection tear down and (memory) recovery. + * Otherwise tcp_send_fin() could be tempted to either delay FIN + * or even be forced to close flow without any FIN. + */ +static void sk_forced_wmem_schedule(struct sock *sk, int size) +{ + int amt, status; + + if (size <= sk->sk_forward_alloc) + return; + amt = sk_mem_pages(size); + sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + sk_memory_allocated_add(sk, amt, &status); +} + +/* Send a FIN. The caller locks the socket for us. + * We should try to send a FIN packet really hard, but eventually give up. */ void tcp_send_fin(struct sock *sk) { + struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = tcp_write_queue_tail(sk); - int mss_now; - /* Optimization, tack on the FIN if we have a queue of - * unsent frames. But be careful about outgoing SACKS - * and IP options. + /* Optimization, tack on the FIN if we have one skb in write queue and + * this skb was not yet sent, or we are under memory pressure. + * Note: in the latter case, FIN packet will be sent after a timeout, + * as TCP stack thinks it has already been transmitted. */ - mss_now = tcp_current_mss(sk); - - if (tcp_send_head(sk)) { - TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; - TCP_SKB_CB(skb)->end_seq++; + if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) { +coalesce: + TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; + TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; + if (!tcp_send_head(sk)) { + /* This means tskb was already sent. + * Pretend we included the FIN on previous transmit. + * We need to set tp->snd_nxt to the value it would have + * if FIN had been sent. This is because retransmit path + * does not change tp->snd_nxt. + */ + tp->snd_nxt++; + return; + } } else { - /* Socket is locked, keep trying until memory is available. */ - for (;;) { - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); - if (skb) - break; - yield(); + skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); + if (unlikely(!skb)) { + if (tskb) + goto coalesce; + return; } + skb_reserve(skb, MAX_TCP_HEADER); + sk_forced_wmem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } - __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); + __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } /* We get here when a process closes a file descriptor (either due to diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index c71a1b8..a6cea1d 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -286,18 +286,19 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) } /* Extract info for Tcp socket info provided via netlink. */ -int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct vegas *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = ca->doing_vegas_now, - .tcpv_rttcnt = ca->cntRTT, - .tcpv_rtt = ca->baseRTT, - .tcpv_minrtt = ca->minRTT, - }; - - return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + info->vegas.tcpv_enabled = ca->doing_vegas_now, + info->vegas.tcpv_rttcnt = ca->cntRTT, + info->vegas.tcpv_rtt = ca->baseRTT, + info->vegas.tcpv_minrtt = ca->minRTT, + + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } return 0; } diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index e8a6b33..ef9da53 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -19,6 +19,7 @@ void tcp_vegas_init(struct sock *sk); void tcp_vegas_state(struct sock *sk, u8 ca_state); void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); -int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); +size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info); #endif /* __TCP_VEGAS_H */ diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index b3c57cc..c10732e 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -256,18 +256,19 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) } /* Extract info for Tcp socket info provided via netlink. */ -static int tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb) +static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct westwood *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = 1, - .tcpv_rtt = jiffies_to_usecs(ca->rtt), - .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), - }; + info->vegas.tcpv_enabled = 1; + info->vegas.tcpv_rttcnt = 0; + info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt), + info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), - return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } return 0; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 96dbfff..bde57b1 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -693,6 +693,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, { struct rt6_info *iter = NULL; struct rt6_info **ins; + struct rt6_info **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -716,8 +717,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; if (replace) { - found++; - break; + if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { + found++; + break; + } + if (rt_can_ecmp) + fallback_ins = fallback_ins ?: ins; + goto next_iter; } if (iter->dst.dev == rt->dst.dev && @@ -753,9 +759,17 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (iter->rt6i_metric > rt->rt6i_metric) break; +next_iter: ins = &iter->dst.rt6_next; } + if (fallback_ins && !found) { + /* No ECMP-able route found, replace first non-ECMP one */ + ins = fallback_ins; + iter = *ins; + found++; + } + /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; @@ -815,6 +829,8 @@ add: } } else { + int nsiblings; + if (!found) { if (add) goto add; @@ -835,8 +851,27 @@ add: info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } + nsiblings = iter->rt6i_nsiblings; fib6_purge_rt(iter, fn, info->nl_net); rt6_release(iter); + + if (nsiblings) { + /* Replacing an ECMP route, remove all siblings */ + ins = &rt->dst.rt6_next; + iter = *ins; + while (iter) { + if (rt6_qualify_for_ecmp(iter)) { + *ins = iter->dst.rt6_next; + fib6_purge_rt(iter, fn, info->nl_net); + rt6_release(iter); + nsiblings--; + } else { + ins = &iter->dst.rt6_next; + } + iter = *ins; + } + WARN_ON(nsiblings != 0); + } } return 0; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b5e6cc1..a38d3ac 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1246,7 +1246,6 @@ static void ip6gre_tunnel_setup(struct net_device *dev) static int ip6gre_tunnel_init(struct net_device *dev) { struct ip6_tnl *tunnel; - int i; tunnel = netdev_priv(dev); @@ -1260,16 +1259,10 @@ static int ip6gre_tunnel_init(struct net_device *dev) if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; - dev->tstats = alloc_percpu(struct pcpu_sw_netstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - for_each_possible_cpu(i) { - struct pcpu_sw_netstats *ip6gre_tunnel_stats; - ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i); - u64_stats_init(&ip6gre_tunnel_stats->syncp); - } - return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7fde1f2..bc09cb9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -886,22 +886,45 @@ static int ip6_dst_lookup_tail(struct sock *sk, #endif int err; - if (!*dst) - *dst = ip6_route_output(net, sk, fl6); - - err = (*dst)->error; - if (err) - goto out_err_release; + /* The correct way to handle this would be to do + * ip6_route_get_saddr, and then ip6_route_output; however, + * the route-specific preferred source forces the + * ip6_route_output call _before_ ip6_route_get_saddr. + * + * In source specific routing (no src=any default route), + * ip6_route_output will fail given src=any saddr, though, so + * that's why we try it again later. + */ + if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { + struct rt6_info *rt; + bool had_dst = *dst != NULL; - if (ipv6_addr_any(&fl6->saddr)) { - struct rt6_info *rt = (struct rt6_info *) *dst; + if (!had_dst) + *dst = ip6_route_output(net, sk, fl6); + rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; err = ip6_route_get_saddr(net, rt, &fl6->daddr, sk ? inet6_sk(sk)->srcprefs : 0, &fl6->saddr); if (err) goto out_err_release; + + /* If we had an erroneous initial result, pretend it + * never existed and let the SA-enabled version take + * over. + */ + if (!had_dst && (*dst)->error) { + dst_release(*dst); + *dst = NULL; + } } + if (!*dst) + *dst = ip6_route_output(net, sk, fl6); + + err = (*dst)->error; + if (err) + goto out_err_release; + #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* * Here if the dst entry we've looked up @@ -1277,8 +1300,10 @@ emsgsize: /* If this is the first and only packet and device * supports checksum offloading, let's use it. + * Use transhdrlen, same as IPv4, because partial + * sums only work when transhdrlen is set. */ - if (!skb && sk->sk_protocol == IPPROTO_UDP && + if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && length + fragheaderlen < mtu && rt->dst.dev->features & NETIF_F_V6_CSUM && !exthdrlen) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 1a732a1..62f5b0d 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1275,6 +1275,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); @@ -1822,6 +1825,9 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5c48293..c73ae50 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2245,9 +2245,10 @@ int ip6_route_get_saddr(struct net *net, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt); + struct inet6_dev *idev = + rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; int err = 0; - if (rt->rt6i_prefsrc.plen) + if (rt && rt->rt6i_prefsrc.plen) *saddr = rt->rt6i_prefsrc.addr; else err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, @@ -2503,9 +2504,9 @@ static int ip6_route_multipath(struct fib6_config *cfg, int add) int attrlen; int err = 0, last_err = 0; + remaining = cfg->fc_mp_len; beginning: rtnh = (struct rtnexthop *)cfg->fc_mp; - remaining = cfg->fc_mp_len; /* Parse a Multipath Entry */ while (rtnh_ok(rtnh, remaining)) { @@ -2535,15 +2536,19 @@ beginning: * next hops that have been already added. */ add = 0; + remaining = cfg->fc_mp_len - remaining; goto beginning; } } /* Because each route is added like a single route we remove - * this flag after the first nexthop (if there is a collision, - * we have already fail to add the first nexthop: - * fib6_add_rt2node() has reject it). + * these flags after the first nexthop: if there is a collision, + * we have already failed to add the first nexthop: + * fib6_add_rt2node() has rejected it; when replacing, old + * nexthops have been replaced by first new, the rest should + * be added to it. */ - cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL; + cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | + NLM_F_REPLACE); rtnh = rtnh_next(rtnh, &remaining); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ad51df8..3adffb3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -914,7 +914,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_tclass, (tw->tw_flowlabel << 12)); + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel)); inet_twsk_put(tw); } @@ -946,7 +946,8 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); if (req) { nsk = tcp_check_req(sk, skb, req, false); - reqsk_put(req); + if (!nsk) + reqsk_put(req); return nsk; } nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3477c91..c2ec416 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -731,7 +731,9 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) || + (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && + !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))) return false; if (!inet6_mc_check(sk, loc_addr, rmt_addr)) return false; diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index 5d9f68c..70be9c7 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -22,13 +22,14 @@ static struct net_device * ieee802154_add_iface_deprecated(struct wpan_phy *wpan_phy, - const char *name, int type) + const char *name, + unsigned char name_assign_type, int type) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); struct net_device *dev; rtnl_lock(); - dev = ieee802154_if_add(local, name, type, + dev = ieee802154_if_add(local, name, name_assign_type, type, cpu_to_le64(0x0000000000000000ULL)); rtnl_unlock(); @@ -45,12 +46,14 @@ static void ieee802154_del_iface_deprecated(struct wpan_phy *wpan_phy, static int ieee802154_add_iface(struct wpan_phy *phy, const char *name, + unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { struct ieee802154_local *local = wpan_phy_priv(phy); struct net_device *err; - err = ieee802154_if_add(local, name, type, extended_addr); + err = ieee802154_if_add(local, name, name_assign_type, type, + extended_addr); return PTR_ERR_OR_ZERO(err); } diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index bebd70f..127ba18 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -182,7 +182,8 @@ void ieee802154_iface_exit(void); void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata); struct net_device * ieee802154_if_add(struct ieee802154_local *local, const char *name, - enum nl802154_iftype type, __le64 extended_addr); + unsigned char name_assign_type, enum nl802154_iftype type, + __le64 extended_addr); void ieee802154_remove_interfaces(struct ieee802154_local *local); #endif /* __IEEE802154_I_H */ diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 38b56f9..91b75ab 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -522,7 +522,8 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, struct net_device * ieee802154_if_add(struct ieee802154_local *local, const char *name, - enum nl802154_iftype type, __le64 extended_addr) + unsigned char name_assign_type, enum nl802154_iftype type, + __le64 extended_addr) { struct net_device *ndev = NULL; struct ieee802154_sub_if_data *sdata = NULL; @@ -531,7 +532,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, ASSERT_RTNL(); ndev = alloc_netdev(sizeof(*sdata) + local->hw.vif_data_size, name, - NET_NAME_UNKNOWN, ieee802154_if_setup); + name_assign_type, ieee802154_if_setup); if (!ndev) return ERR_PTR(-ENOMEM); diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index dcf7395..5b2be12 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -134,7 +134,7 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template) for (i = 0; i < ARRAY_SIZE(key->tfm); i++) { key->tfm[i] = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC); - if (!key->tfm[i]) + if (IS_ERR(key->tfm[i])) goto err_tfm; if (crypto_aead_setkey(key->tfm[i], template->key, IEEE802154_LLSEC_KEY_SIZE)) @@ -144,7 +144,7 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template) } key->tfm0 = crypto_alloc_blkcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC); - if (!key->tfm0) + if (IS_ERR(key->tfm0)) goto err_tfm; if (crypto_blkcipher_setkey(key->tfm0, template->key, diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 8500378..08cb32d 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -161,18 +161,21 @@ int ieee802154_register_hw(struct ieee802154_hw *hw) rtnl_lock(); - dev = ieee802154_if_add(local, "wpan%d", NL802154_IFTYPE_NODE, + dev = ieee802154_if_add(local, "wpan%d", NET_NAME_ENUM, + NL802154_IFTYPE_NODE, cpu_to_le64(0x0000000000000000ULL)); if (IS_ERR(dev)) { rtnl_unlock(); rc = PTR_ERR(dev); - goto out_wq; + goto out_phy; } rtnl_unlock(); return 0; +out_phy: + wpan_phy_unregister(local->phy); out_wq: destroy_workqueue(local->workqueue); out: diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index db8a2ea..7b3f732 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -53,6 +53,11 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) return rt; } +static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) +{ + return rcu_dereference_rtnl(dev->mpls_ptr); +} + static bool mpls_output_possible(const struct net_device *dev) { return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); @@ -136,6 +141,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, struct mpls_route *rt; struct mpls_entry_decoded dec; struct net_device *out_dev; + struct mpls_dev *mdev; unsigned int hh_len; unsigned int new_header_size; unsigned int mtu; @@ -143,6 +149,10 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Careful this entire function runs inside of an rcu critical section */ + mdev = mpls_dev_get(dev); + if (!mdev || !mdev->input_enabled) + goto drop; + if (skb->pkt_type != PACKET_HOST) goto drop; @@ -352,9 +362,9 @@ static int mpls_route_add(struct mpls_route_config *cfg) if (!dev) goto errout; - /* For now just support ethernet devices */ + /* Ensure this is a supported device */ err = -EINVAL; - if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK)) + if (!mpls_dev_get(dev)) goto errout; err = -EINVAL; @@ -428,10 +438,89 @@ errout: return err; } +#define MPLS_PERDEV_SYSCTL_OFFSET(field) \ + (&((struct mpls_dev *)0)->field) + +static const struct ctl_table mpls_dev_table[] = { + { + .procname = "input", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), + }, + { } +}; + +static int mpls_dev_sysctl_register(struct net_device *dev, + struct mpls_dev *mdev) +{ + char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; + struct ctl_table *table; + int i; + + table = kmemdup(&mpls_dev_table, sizeof(mpls_dev_table), GFP_KERNEL); + if (!table) + goto out; + + /* Table data contains only offsets relative to the base of + * the mdev at this point, so make them absolute. + */ + for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) + table[i].data = (char *)mdev + (uintptr_t)table[i].data; + + snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); + + mdev->sysctl = register_net_sysctl(dev_net(dev), path, table); + if (!mdev->sysctl) + goto free; + + return 0; + +free: + kfree(table); +out: + return -ENOBUFS; +} + +static void mpls_dev_sysctl_unregister(struct mpls_dev *mdev) +{ + struct ctl_table *table; + + table = mdev->sysctl->ctl_table_arg; + unregister_net_sysctl_table(mdev->sysctl); + kfree(table); +} + +static struct mpls_dev *mpls_add_dev(struct net_device *dev) +{ + struct mpls_dev *mdev; + int err = -ENOMEM; + + ASSERT_RTNL(); + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) + return ERR_PTR(err); + + err = mpls_dev_sysctl_register(dev, mdev); + if (err) + goto free; + + rcu_assign_pointer(dev->mpls_ptr, mdev); + + return mdev; + +free: + kfree(mdev); + return ERR_PTR(err); +} + static void mpls_ifdown(struct net_device *dev) { struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); + struct mpls_dev *mdev; unsigned index; platform_label = rtnl_dereference(net->mpls.platform_label); @@ -443,14 +532,35 @@ static void mpls_ifdown(struct net_device *dev) continue; rt->rt_dev = NULL; } + + mdev = mpls_dev_get(dev); + if (!mdev) + return; + + mpls_dev_sysctl_unregister(mdev); + + RCU_INIT_POINTER(dev->mpls_ptr, NULL); + + kfree(mdev); } static int mpls_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct mpls_dev *mdev; switch(event) { + case NETDEV_REGISTER: + /* For now just support ethernet devices */ + if ((dev->type == ARPHRD_ETHER) || + (dev->type == ARPHRD_LOOPBACK)) { + mdev = mpls_add_dev(dev); + if (IS_ERR(mdev)) + return notifier_from_errno(PTR_ERR(mdev)); + } + break; + case NETDEV_UNREGISTER: mpls_ifdown(dev); break; @@ -536,6 +646,15 @@ int nla_get_labels(const struct nlattr *nla, if ((dec.bos != bos) || dec.ttl || dec.tc) return -EINVAL; + switch (dec.label) { + case MPLS_LABEL_IMPLNULL: + /* RFC3032: This is a label that an LSR may + * assign and distribute, but which never + * actually appears in the encapsulation. + */ + return -EINVAL; + } + label[i] = dec.label; } *labels = nla_labels; @@ -816,7 +935,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) } /* In case the predefined labels need to be populated */ - if (limit > LABEL_IPV4_EXPLICIT_NULL) { + if (limit > MPLS_LABEL_IPV4NULL) { struct net_device *lo = net->loopback_dev; rt0 = mpls_rt_alloc(lo->addr_len); if (!rt0) @@ -826,7 +945,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) rt0->rt_via_table = NEIGH_LINK_TABLE; memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len); } - if (limit > LABEL_IPV6_EXPLICIT_NULL) { + if (limit > MPLS_LABEL_IPV6NULL) { struct net_device *lo = net->loopback_dev; rt2 = mpls_rt_alloc(lo->addr_len); if (!rt2) @@ -854,15 +973,15 @@ static int resize_platform_label_table(struct net *net, size_t limit) memcpy(labels, old, cp_size); /* If needed set the predefined labels */ - if ((old_limit <= LABEL_IPV6_EXPLICIT_NULL) && - (limit > LABEL_IPV6_EXPLICIT_NULL)) { - RCU_INIT_POINTER(labels[LABEL_IPV6_EXPLICIT_NULL], rt2); + if ((old_limit <= MPLS_LABEL_IPV6NULL) && + (limit > MPLS_LABEL_IPV6NULL)) { + RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6NULL], rt2); rt2 = NULL; } - if ((old_limit <= LABEL_IPV4_EXPLICIT_NULL) && - (limit > LABEL_IPV4_EXPLICIT_NULL)) { - RCU_INIT_POINTER(labels[LABEL_IPV4_EXPLICIT_NULL], rt0); + if ((old_limit <= MPLS_LABEL_IPV4NULL) && + (limit > MPLS_LABEL_IPV4NULL)) { + RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4NULL], rt0); rt0 = NULL; } @@ -912,7 +1031,7 @@ static int mpls_platform_labels(struct ctl_table *table, int write, return ret; } -static struct ctl_table mpls_table[] = { +static const struct ctl_table mpls_table[] = { { .procname = "platform_labels", .data = NULL, diff --git a/net/mpls/internal.h b/net/mpls/internal.h index fb6de92..b064c34 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -1,16 +1,6 @@ #ifndef MPLS_INTERNAL_H #define MPLS_INTERNAL_H -#define LABEL_IPV4_EXPLICIT_NULL 0 /* RFC3032 */ -#define LABEL_ROUTER_ALERT_LABEL 1 /* RFC3032 */ -#define LABEL_IPV6_EXPLICIT_NULL 2 /* RFC3032 */ -#define LABEL_IMPLICIT_NULL 3 /* RFC3032 */ -#define LABEL_ENTROPY_INDICATOR 7 /* RFC6790 */ -#define LABEL_GAL 13 /* RFC5586 */ -#define LABEL_OAM_ALERT 14 /* RFC3429 */ -#define LABEL_EXTENSION 15 /* RFC7274 */ - - struct mpls_shim_hdr { __be32 label_stack_entry; }; @@ -22,6 +12,12 @@ struct mpls_entry_decoded { u8 bos; }; +struct mpls_dev { + int input_enabled; + + struct ctl_table_header *sysctl; +}; + struct sk_buff; static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f70e34a..a0f3e6a3 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -863,6 +863,7 @@ config NETFILTER_XT_TARGET_TPROXY depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED depends on (IPV6 || IPV6=n) + depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES @@ -1356,6 +1357,7 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK depends on (IPV6 || IPV6=n) + depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4953267..285eae3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3823,6 +3823,9 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); ip_vs_stop_estimator(net, &ipvs->tot_stats); + + if (!net_eq(net, &init_net)) + kfree(ipvs->sysctl_tbl); } #else diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 5caa0c4..70383de 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -202,7 +202,7 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW - * sLA -> sTW Last ACK detected. + * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ @@ -261,7 +261,7 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW - * sLA -> sTW Last ACK detected. + * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ @@ -906,6 +906,7 @@ static int tcp_packet(struct nf_conn *ct, 1 : ct->proto.tcp.last_win; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = ct->proto.tcp.last_wscale; + ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = ct->proto.tcp.last_flags; memset(&ct->proto.tcp.seen[dir], 0, @@ -923,7 +924,9 @@ static int tcp_packet(struct nf_conn *ct, * may be in sync but we are not. In that case, we annotate * the TCP options and let the packet go through. If it is a * valid SYN packet, the server will reply with a SYN/ACK, and - * then we'll get in sync. Otherwise, the server ignores it. */ + * then we'll get in sync. Otherwise, the server potentially + * responds with a challenge ACK if implementing RFC5961. + */ if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { struct ip_ct_tcp_state seen = {}; @@ -939,6 +942,13 @@ static int tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_flags |= IP_CT_TCP_FLAG_SACK_PERM; } + /* Mark the potential for RFC5961 challenge ACK, + * this pose a special problem for LAST_ACK state + * as ACK is intrepretated as ACKing last FIN. + */ + if (old_state == TCP_CONNTRACK_LAST_ACK) + ct->proto.tcp.last_flags |= + IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) @@ -970,6 +980,25 @@ static int tcp_packet(struct nf_conn *ct, nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; + case TCP_CONNTRACK_TIME_WAIT: + /* RFC5961 compliance cause stack to send "challenge-ACK" + * e.g. in response to spurious SYNs. Conntrack MUST + * not believe this ACK is acking last FIN. + */ + if (old_state == TCP_CONNTRACK_LAST_ACK && + index == TCP_ACK_SET && + ct->proto.tcp.last_dir != dir && + ct->proto.tcp.last_index == TCP_SYN_SET && + (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) { + /* Detected RFC5961 challenge ACK */ + ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: challenge-ACK ignored "); + return NF_ACCEPT; /* Don't change state */ + } + break; case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 78af83b..34ded09 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4340,7 +4340,6 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, case NFT_CONTINUE: case NFT_BREAK: case NFT_RETURN: - desc->len = sizeof(data->verdict); break; case NFT_JUMP: case NFT_GOTO: @@ -4355,10 +4354,10 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, chain->use++; data->verdict.chain = chain; - desc->len = sizeof(data); break; } + desc->len = sizeof(data->verdict); desc->type = NFT_DATA_VERDICT; return 0; } @@ -4473,9 +4472,9 @@ EXPORT_SYMBOL_GPL(nft_data_init); */ void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) { - switch (type) { - case NFT_DATA_VALUE: + if (type < NFT_DATA_VERDICT) return; + switch (type) { case NFT_DATA_VERDICT: return nft_verdict_uninit(data); default: diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 3ad9126..4ef1fae 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -1073,7 +1073,13 @@ static struct pernet_operations nfnl_log_net_ops = { static int __init nfnetlink_log_init(void) { - int status = -ENOMEM; + int status; + + status = register_pernet_subsys(&nfnl_log_net_ops); + if (status < 0) { + pr_err("failed to register pernet ops\n"); + goto out; + } netlink_register_notifier(&nfulnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfulnl_subsys); @@ -1088,28 +1094,23 @@ static int __init nfnetlink_log_init(void) goto cleanup_subsys; } - status = register_pernet_subsys(&nfnl_log_net_ops); - if (status < 0) { - pr_err("failed to register pernet ops\n"); - goto cleanup_logger; - } return status; -cleanup_logger: - nf_log_unregister(&nfulnl_logger); cleanup_subsys: nfnetlink_subsys_unregister(&nfulnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfulnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_log_net_ops); +out: return status; } static void __exit nfnetlink_log_fini(void) { - unregister_pernet_subsys(&nfnl_log_net_ops); nf_log_unregister(&nfulnl_logger); nfnetlink_subsys_unregister(&nfulnl_subsys); netlink_unregister_notifier(&nfulnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_log_net_ops); } MODULE_DESCRIPTION("netfilter userspace logging"); diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 0b98c74..11c7682 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -1317,7 +1317,13 @@ static struct pernet_operations nfnl_queue_net_ops = { static int __init nfnetlink_queue_init(void) { - int status = -ENOMEM; + int status; + + status = register_pernet_subsys(&nfnl_queue_net_ops); + if (status < 0) { + pr_err("nf_queue: failed to register pernet ops\n"); + goto out; + } netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); @@ -1326,19 +1332,13 @@ static int __init nfnetlink_queue_init(void) goto cleanup_netlink_notifier; } - status = register_pernet_subsys(&nfnl_queue_net_ops); - if (status < 0) { - pr_err("nf_queue: failed to register pernet ops\n"); - goto cleanup_subsys; - } register_netdevice_notifier(&nfqnl_dev_notifier); nf_register_queue_handler(&nfqh); return status; -cleanup_subsys: - nfnetlink_subsys_unregister(&nfqnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); +out: return status; } @@ -1346,9 +1346,9 @@ static void __exit nfnetlink_queue_fini(void) { nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); - unregister_pernet_subsys(&nfnl_queue_net_ops); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_queue_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 57d3e1a..0522fc9 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -63,6 +63,8 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; + default: + break; } return 0; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 62cabee..635dbba 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -108,6 +108,8 @@ static int nft_reject_inet_dump(struct sk_buff *skb, if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; + default: + break; } return 0; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 19909d0..bf6e766 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -89,7 +89,7 @@ static inline int netlink_is_kernel(struct sock *sk) return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; } -struct netlink_table *nl_table; +struct netlink_table *nl_table __read_mostly; EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); @@ -1081,6 +1081,7 @@ static int netlink_insert(struct sock *sk, u32 portid) if (err) { if (err == -EEXIST) err = -EADDRINUSE; + nlk_sk(sk)->portid = 0; sock_put(sk); } @@ -1629,13 +1630,11 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size, if (data == NULL) return NULL; - skb = build_skb(data, size); + skb = __build_skb(data, size); if (skb == NULL) vfree(data); - else { - skb->head_frag = 0; + else skb->destructor = netlink_skb_destructor; - } return skb; } @@ -3141,7 +3140,6 @@ static const struct rhashtable_params netlink_rhashtable_params = { .key_len = netlink_compare_arg_len, .obj_hashfn = netlink_hash, .obj_cmpfn = netlink_compare, - .max_size = 65536, .automatic_shrinking = true, }; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5102c3c..b5989c6 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2311,11 +2311,14 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll), - 0, &err); + !need_wait, &err); - if (unlikely(skb == NULL)) + if (unlikely(skb == NULL)) { + /* we assume the socket was initially writeable ... */ + if (likely(len_sum > 0)) + err = len_sum; goto out_status; - + } tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, addr, hlen); if (tp_len > dev->mtu + dev->hard_header_len) { diff --git a/net/rds/connection.c b/net/rds/connection.c index 14f04139..da6da57 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -126,7 +126,10 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *loop_trans; unsigned long flags; int ret; + struct rds_transport *otrans = trans; + if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) + goto new_conn; rcu_read_lock(); conn = rds_conn_lookup(head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && @@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, if (conn) goto out; +new_conn: conn = kmem_cache_zalloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); @@ -230,13 +234,22 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, /* Creating normal conn */ struct rds_connection *found; - found = rds_conn_lookup(head, laddr, faddr, trans); + if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) + found = NULL; + else + found = rds_conn_lookup(head, laddr, faddr, trans); if (found) { trans->conn_free(conn->c_transport_data); kmem_cache_free(rds_conn_slab, conn); conn = found; } else { - hlist_add_head_rcu(&conn->c_hash_node, head); + if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) || + (otrans->t_type != RDS_TRANS_TCP)) { + /* Only the active side should be added to + * reconnect list for TCP. + */ + hlist_add_head_rcu(&conn->c_hash_node, head); + } rds_cong_add_conn(conn); rds_conn_count++; } diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 31b74f5..8a09ee7 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -183,8 +183,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ - if (dp && dp->dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + if (dp) { + /* dp structure start is not guaranteed to be 8 bytes aligned. + * Since dp_ack_seq is 64-bit extended load operations can be + * used so go through get_unaligned to avoid unaligned errors. + */ + __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq); + + if (dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq), + NULL); + } rds_connect_complete(conn); } diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index f9f564a..973109c7 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk) case TCP_ESTABLISHED: rds_connect_complete(conn); break; + case TCP_CLOSE_WAIT: case TCP_CLOSE: rds_conn_drop(conn); default: diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 23ab4dcd..0da49e3 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work); static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); static struct socket *rds_tcp_listen_sock; +static int rds_tcp_keepalive(struct socket *sock) +{ + /* values below based on xs_udp_default_timeout */ + int keepidle = 5; /* send a probe 'keepidle' secs after last data */ + int keepcnt = 5; /* number of unack'ed probes before declaring dead */ + int keepalive = 1; + int ret = 0; + + ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&keepalive, sizeof(keepalive)); + if (ret < 0) + goto bail; + + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, + (char *)&keepcnt, sizeof(keepcnt)); + if (ret < 0) + goto bail; + + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, + (char *)&keepidle, sizeof(keepidle)); + if (ret < 0) + goto bail; + + /* KEEPINTVL is the interval between successive probes. We follow + * the model in xs_tcp_finish_connecting() and re-use keepidle. + */ + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, + (char *)&keepidle, sizeof(keepidle)); +bail: + return ret; +} + static int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; struct rds_connection *conn; int ret; struct inet_sock *inet; + struct rds_tcp_connection *rs_tcp; ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, &new_sock); @@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock) if (ret < 0) goto out; + ret = rds_tcp_keepalive(new_sock); + if (ret < 0) + goto out; + rds_tcp_tune(new_sock); inet = inet_sk(new_sock->sk); @@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock) ret = PTR_ERR(conn); goto out; } + /* An incoming SYN request came in, and TCP just accepted it. + * We always create a new conn for listen side of TCP, and do not + * add it to the c_hash_list. + * + * If the client reboots, this conn will need to be cleaned up. + * rds_tcp_state_change() will do that cleanup + */ + rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; + WARN_ON(!rs_tcp || rs_tcp->t_sock); /* * see the comment above rds_queue_delayed_reconnect() diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 8e47251..295d14b 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -63,7 +63,6 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, skb->mark = c->mark; /* using overlimits stats to count how many packets marked */ ca->tcf_qstats.overlimits++; - nf_ct_put(c); goto out; } @@ -82,7 +81,6 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, nf_ct_put(c); out: - skb->nfct = NULL; spin_unlock(&ca->tcf_lock); return ca->tcf_action; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8b0470e..a75864d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -81,6 +81,11 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) struct tcf_proto_ops *t; int rc = -ENOENT; + /* Wait for outstanding call_rcu()s, if any, from a + * tcf_proto_ops's destroy() handler. + */ + rcu_barrier(); + write_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) { if (t == ops) { @@ -308,12 +313,11 @@ replay: case RTM_DELTFILTER: err = tp->ops->delete(tp, fh); if (err == 0) { - tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); - if (tcf_destroy(tp, false)) { - struct tcf_proto *next = rtnl_dereference(tp->next); + struct tcf_proto *next = rtnl_dereference(tp->next); + tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); + if (tcf_destroy(tp, false)) RCU_INIT_POINTER(*back, next); - } } goto errout; case RTM_GETTFILTER: diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index de28f8e..7a0bdb1 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -164,7 +164,7 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt) sch->limit = DEFAULT_CODEL_LIMIT; - codel_params_init(&q->params); + codel_params_init(&q->params, sch); codel_vars_init(&q->vars); codel_stats_init(&q->stats); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 1e52dec..c244c45b 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -391,7 +391,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) q->perturbation = prandom_u32(); INIT_LIST_HEAD(&q->new_flows); INIT_LIST_HEAD(&q->old_flows); - codel_params_init(&q->cparams); + codel_params_init(&q->cparams, sch); codel_stats_init(&q->cstats); q->cparams.ecn = true; diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index a4ca451..634529e 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -229,7 +229,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; } - if (q->backlog + qdisc_pkt_len(skb) <= q->limit) { + if (gred_backlog(t, q, sch) + qdisc_pkt_len(skb) <= q->limit) { q->backlog += qdisc_pkt_len(skb); return qdisc_enqueue_tail(skb, sch); } @@ -553,7 +553,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opt.limit = q->limit; opt.DP = q->DP; - opt.backlog = q->backlog; + opt.backlog = gred_backlog(table, q, sch); opt.prio = q->prio; opt.qth_min = q->parms.qth_min >> q->parms.Wlog; opt.qth_max = q->parms.qth_max >> q->parms.Wlog; diff --git a/net/socket.c b/net/socket.c index 3e33959..884e329 100644 --- a/net/socket.c +++ b/net/socket.c @@ -312,7 +312,7 @@ static const struct super_operations sockfs_ops = { static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]", - dentry->d_inode->i_ino); + d_inode(dentry)->i_ino); } static const struct dentry_operations sockfs_dentry_operations = { @@ -375,7 +375,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) &socket_file_ops); if (unlikely(IS_ERR(file))) { /* drop dentry, keep inode */ - ihold(path.dentry->d_inode); + ihold(d_inode(path.dentry)); path_put(&path); return file; } @@ -497,7 +497,7 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, ssize_t len; ssize_t used = 0; - len = security_inode_listsecurity(dentry->d_inode, buffer, size); + len = security_inode_listsecurity(d_inode(dentry), buffer, size); if (len < 0) return len; used += len; diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 1ec19f6..eeeba5a 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -793,20 +793,26 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, { u32 value_follows; int err; + struct page *scratch; + + scratch = alloc_page(GFP_KERNEL); + if (!scratch) + return -ENOMEM; + xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE); /* res->status */ err = gssx_dec_status(xdr, &res->status); if (err) - return err; + goto out_free; /* res->context_handle */ err = gssx_dec_bool(xdr, &value_follows); if (err) - return err; + goto out_free; if (value_follows) { err = gssx_dec_ctx(xdr, res->context_handle); if (err) - return err; + goto out_free; } else { res->context_handle = NULL; } @@ -814,11 +820,11 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, /* res->output_token */ err = gssx_dec_bool(xdr, &value_follows); if (err) - return err; + goto out_free; if (value_follows) { err = gssx_dec_buffer(xdr, res->output_token); if (err) - return err; + goto out_free; } else { res->output_token = NULL; } @@ -826,14 +832,17 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, /* res->delegated_cred_handle */ err = gssx_dec_bool(xdr, &value_follows); if (err) - return err; + goto out_free; if (value_follows) { /* we do not support upcall servers sending this data. */ - return -EINVAL; + err = -EINVAL; + goto out_free; } /* res->options */ err = gssx_dec_option_array(xdr, &res->options); +out_free: + __free_page(scratch); return err; } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 2d12b76..d81186d 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -94,7 +94,7 @@ rpc_timeout_upcall_queue(struct work_struct *work) } dentry = dget(pipe->dentry); spin_unlock(&pipe->lock); - rpc_purge_list(dentry ? &RPC_I(dentry->d_inode)->waitq : NULL, + rpc_purge_list(dentry ? &RPC_I(d_inode(dentry))->waitq : NULL, &free_list, destroy_msg, -ETIMEDOUT); dput(dentry); } @@ -152,7 +152,7 @@ rpc_queue_upcall(struct rpc_pipe *pipe, struct rpc_pipe_msg *msg) dentry = dget(pipe->dentry); spin_unlock(&pipe->lock); if (dentry) { - wake_up(&RPC_I(dentry->d_inode)->waitq); + wake_up(&RPC_I(d_inode(dentry))->waitq); dput(dentry); } return res; @@ -591,7 +591,7 @@ static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry, err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private); if (err) return err; - rpci = RPC_I(dentry->d_inode); + rpci = RPC_I(d_inode(dentry)); rpci->private = private; rpci->pipe = pipe; fsnotify_create(dir, dentry); @@ -616,7 +616,7 @@ int rpc_rmdir(struct dentry *dentry) int error; parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); error = __rpc_rmdir(dir, dentry); mutex_unlock(&dir->i_mutex); @@ -638,7 +638,7 @@ static int __rpc_unlink(struct inode *dir, struct dentry *dentry) static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); rpc_close_pipes(inode); return __rpc_unlink(dir, dentry); @@ -654,7 +654,7 @@ static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent, if (!dentry) return ERR_PTR(-ENOMEM); } - if (dentry->d_inode == NULL) + if (d_really_is_negative(dentry)) return dentry; dput(dentry); return ERR_PTR(-EEXIST); @@ -667,7 +667,7 @@ static void __rpc_depopulate(struct dentry *parent, const struct rpc_filelist *files, int start, int eof) { - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct dentry *dentry; struct qstr name; int i; @@ -679,9 +679,9 @@ static void __rpc_depopulate(struct dentry *parent, if (dentry == NULL) continue; - if (dentry->d_inode == NULL) + if (d_really_is_negative(dentry)) goto next; - switch (dentry->d_inode->i_mode & S_IFMT) { + switch (d_inode(dentry)->i_mode & S_IFMT) { default: BUG(); case S_IFREG: @@ -699,7 +699,7 @@ static void rpc_depopulate(struct dentry *parent, const struct rpc_filelist *files, int start, int eof) { - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD); __rpc_depopulate(parent, files, start, eof); @@ -711,7 +711,7 @@ static int rpc_populate(struct dentry *parent, int start, int eof, void *private) { - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct dentry *dentry; int i, err; @@ -754,7 +754,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, int (*populate)(struct dentry *, void *), void *args_populate) { struct dentry *dentry; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); int error; mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); @@ -787,7 +787,7 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, int error; parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); if (depopulate != NULL) depopulate(dentry); @@ -819,7 +819,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, void *private, struct rpc_pipe *pipe) { struct dentry *dentry; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR; int err; @@ -864,7 +864,7 @@ rpc_unlink(struct dentry *dentry) int error = 0; parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); error = __rpc_rmpipe(dir, dentry); mutex_unlock(&dir->i_mutex); @@ -1375,7 +1375,7 @@ rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry) struct dentry *clnt_dir = pipe_dentry->d_parent; struct dentry *gssd_dir = clnt_dir->d_parent; - __rpc_rmpipe(clnt_dir->d_inode, pipe_dentry); + __rpc_rmpipe(d_inode(clnt_dir), pipe_dentry); __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1); __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1); dput(pipe_dentry); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b91fd9c..337ca85 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -89,8 +89,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) if (!task->tk_timeout) return; - dprintk("RPC: %5u setting alarm for %lu ms\n", - task->tk_pid, task->tk_timeout * 1000 / HZ); + dprintk("RPC: %5u setting alarm for %u ms\n", + task->tk_pid, jiffies_to_msecs(task->tk_timeout)); task->u.tk_wait.expires = jiffies + task->tk_timeout; if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires)) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 9949722..1d4fe24 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -326,6 +326,15 @@ out_unlock: xprt_clear_locked(xprt); } +static void xprt_task_clear_bytes_sent(struct rpc_task *task) +{ + if (task != NULL) { + struct rpc_rqst *req = task->tk_rqstp; + if (req != NULL) + req->rq_bytes_sent = 0; + } +} + /** * xprt_release_xprt - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting @@ -336,11 +345,7 @@ out_unlock: void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } + xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } @@ -358,11 +363,7 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } + xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } @@ -700,6 +701,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; + xprt_task_clear_bytes_sent(task); xprt->snd_task = cookie; ret = true; out: diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index da5136f..579f72b 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o -xprtrdma-y := transport.o rpc_rdma.o verbs.o +xprtrdma-y := transport.o rpc_rdma.o verbs.o \ + fmr_ops.o frwr_ops.o physical_ops.o obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c new file mode 100644 index 0000000..302d4eb --- /dev/null +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Memory Regions (FMR). + * Referred to sometimes as MTHCAFMR mode. + * + * FMR uses synchronous memory registration and deregistration. + * FMR registration is known to be fast, but FMR deregistration + * can take tens of usecs to complete. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* Maximum scatter/gather per FMR */ +#define RPCRDMA_MAX_FMR_SGES (64) + +static int +fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + return 0; +} + +/* FMR mode conveys up to 64 pages of payload per chunk segment. + */ +static size_t +fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES); +} + +static int +fmr_op_init(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; + struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + struct rpcrdma_mw *r; + int i, rc; + + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initializing %d FMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->r.fmr)) + goto out_fmr_err; + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + return 0; + +out_fmr_err: + rc = PTR_ERR(r->r.fmr); + dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); + kfree(r); + return rc; +} + +/* Use the ib_map_phys_fmr() verb to register a memory region + * for remote access via RDMA READ or RDMA WRITE. + */ +static int +fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_device *device = ia->ri_id->device; + enum dma_data_direction direction = rpcrdma_data_dir(writing); + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->rl_mw; + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff, i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > RPCRDMA_MAX_FMR_SGES) + nsegs = RPCRDMA_MAX_FMR_SGES; + for (i = 0; i < nsegs;) { + rpcrdma_map_one(device, seg, direction); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + + rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma); + if (rc) + goto out_maperr; + + seg1->mr_rkey = mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + return i; + +out_maperr: + dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", + __func__, len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(device, --seg); + return rc; +} + +/* Use the ib_unmap_fmr() verb to prevent further remote + * access via RDMA READ or RDMA WRITE. + */ +static int +fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_device *device; + int rc, nsegs = seg->mr_nsegs; + LIST_HEAD(l); + + list_add(&seg1->rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + read_lock(&ia->ri_qplock); + device = ia->ri_id->device; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(device, seg++); + read_unlock(&ia->ri_qplock); + if (rc) + goto out_err; + return nsegs; + +out_err: + dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc); + return nsegs; +} + +/* After a disconnect, unmap all FMRs. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_fmr_external(). + */ +static void +fmr_op_reset(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_mw *r; + LIST_HEAD(list); + int rc; + + list_for_each_entry(r, &buf->rb_all, mw_all) + list_add(&r->r.fmr->list, &list); + + rc = ib_unmap_fmr(&list); + if (rc) + dprintk("RPC: %s: ib_unmap_fmr failed %i\n", + __func__, rc); +} + +static void +fmr_op_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", + __func__, rc); + kfree(r); + } +} + +const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { + .ro_map = fmr_op_map, + .ro_unmap = fmr_op_unmap, + .ro_open = fmr_op_open, + .ro_maxpages = fmr_op_maxpages, + .ro_init = fmr_op_init, + .ro_reset = fmr_op_reset, + .ro_destroy = fmr_op_destroy, + .ro_displayname = "fmr", +}; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c new file mode 100644 index 0000000..dff0481 --- /dev/null +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Registration Work + * Requests (FRWR). Also referred to sometimes as FRMR mode. + * + * FRWR features ordered asynchronous registration and deregistration + * of arbitrarily sized memory regions. This is the fastest and safest + * but most complex memory registration mode. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +static int +__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, + unsigned int depth) +{ + struct rpcrdma_frmr *f = &r->r.frmr; + int rc; + + f->fr_mr = ib_alloc_fast_reg_mr(pd, depth); + if (IS_ERR(f->fr_mr)) + goto out_mr_err; + f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth); + if (IS_ERR(f->fr_pgl)) + goto out_list_err; + return 0; + +out_mr_err: + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n", + __func__, rc); + return rc; + +out_list_err: + rc = PTR_ERR(f->fr_pgl); + dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n", + __func__, rc); + ib_dereg_mr(f->fr_mr); + return rc; +} + +static void +__frwr_release(struct rpcrdma_mw *r) +{ + int rc; + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr status %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); +} + +static int +frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + struct ib_device_attr *devattr = &ia->ri_devattr; + int depth, delta; + + ia->ri_max_frmr_depth = + min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + devattr->max_fast_reg_page_list_len); + dprintk("RPC: %s: device's max FR page list len = %u\n", + __func__, ia->ri_max_frmr_depth); + + /* Add room for frmr register and invalidate WRs. + * 1. FRMR reg WR for head + * 2. FRMR invalidate WR for head + * 3. N FRMR reg WRs for pagelist + * 4. N FRMR invalidate WRs for pagelist + * 5. FRMR reg WR for tail + * 6. FRMR invalidate WR for tail + * 7. The RDMA_SEND WR + */ + depth = 7; + + /* Calculate N if the device max FRMR depth is smaller than + * RPCRDMA_MAX_DATA_SEGS. + */ + if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth; + do { + depth += 2; /* FRMR reg + invalidate */ + delta -= ia->ri_max_frmr_depth; + } while (delta > 0); + } + + ep->rep_attr.cap.max_send_wr *= depth; + if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { + cdata->max_requests = devattr->max_qp_wr / depth; + if (!cdata->max_requests) + return -EINVAL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * + depth; + } + + return 0; +} + +/* FRWR mode conveys a list of pages per chunk segment. The + * maximum length of that list is the FRWR page list depth. + */ +static size_t +frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); +} + +/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */ +static void +frwr_sendcompletion(struct ib_wc *wc) +{ + struct rpcrdma_mw *r; + + if (likely(wc->status == IB_WC_SUCCESS)) + return; + + /* WARNING: Only wr_id and status are reliable at this point */ + r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + dprintk("RPC: %s: frmr %p (stale), status %d\n", + __func__, r, wc->status); + r->r.frmr.fr_state = FRMR_IS_STALE; +} + +static int +frwr_op_init(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_device *device = r_xprt->rx_ia.ri_id->device; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + int i; + + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i); + + while (i--) { + struct rpcrdma_mw *r; + int rc; + + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return -ENOMEM; + + rc = __frwr_init(r, pd, device, depth); + if (rc) { + kfree(r); + return rc; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + r->mw_sendcompletion = frwr_sendcompletion; + } + + return 0; +} + +/* Post a FAST_REG Work Request to register a memory region + * for remote access via RDMA READ or RDMA WRITE. + */ +static int +frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_device *device = ia->ri_id->device; + enum dma_data_direction direction = rpcrdma_data_dir(writing); + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->rl_mw; + struct rpcrdma_frmr *frmr = &mw->r.frmr; + struct ib_mr *mr = frmr->fr_mr; + struct ib_send_wr fastreg_wr, *bad_wr; + u8 key; + int len, pageoff; + int i, rc; + int seg_len; + u64 pa; + int page_no; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > ia->ri_max_frmr_depth) + nsegs = ia->ri_max_frmr_depth; + for (page_no = i = 0; i < nsegs;) { + rpcrdma_map_one(device, seg, direction); + pa = seg->mr_dma; + for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { + frmr->fr_pgl->page_list[page_no++] = pa; + pa += PAGE_SIZE; + } + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", + __func__, mw, i, len); + + frmr->fr_state = FRMR_IS_VALID; + + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = (unsigned long)(void *)mw; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff; + fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.page_list_len = page_no; + fastreg_wr.wr.fast_reg.length = len; + fastreg_wr.wr.fast_reg.access_flags = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + fastreg_wr.wr.fast_reg.rkey = mr->rkey; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); + if (rc) + goto out_senderr; + + seg1->mr_rkey = mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + return i; + +out_senderr: + dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + ib_update_fast_reg_key(mr, --key); + frmr->fr_state = FRMR_IS_INVALID; + while (i--) + rpcrdma_unmap_one(device, --seg); + return rc; +} + +/* Post a LOCAL_INV Work Request to prevent further remote access + * via RDMA READ or RDMA WRITE. + */ +static int +frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc, nsegs = seg->mr_nsegs; + struct ib_device *device; + + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; + + memset(&invalidate_wr, 0, sizeof(invalidate_wr)); + invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + read_lock(&ia->ri_qplock); + device = ia->ri_id->device; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(device, seg++); + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); + if (rc) + goto out_err; + return nsegs; + +out_err: + /* Force rpcrdma_buffer_get() to retry */ + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; + dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + return nsegs; +} + +/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in + * an unusable state. Find FRMRs in this state and dereg / reg + * each. FRMRs that are VALID and attached to an rpcrdma_req are + * also torn down. + * + * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_frmr_external(). + */ +static void +frwr_op_reset(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_device *device = r_xprt->rx_ia.ri_id->device; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + struct rpcrdma_mw *r; + int rc; + + list_for_each_entry(r, &buf->rb_all, mw_all) { + if (r->r.frmr.fr_state == FRMR_IS_INVALID) + continue; + + __frwr_release(r); + rc = __frwr_init(r, pd, device, depth); + if (rc) { + dprintk("RPC: %s: mw %p left %s\n", + __func__, r, + (r->r.frmr.fr_state == FRMR_IS_STALE ? + "stale" : "valid")); + continue; + } + + r->r.frmr.fr_state = FRMR_IS_INVALID; + } +} + +static void +frwr_op_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + __frwr_release(r); + kfree(r); + } +} + +const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { + .ro_map = frwr_op_map, + .ro_unmap = frwr_op_unmap, + .ro_open = frwr_op_open, + .ro_maxpages = frwr_op_maxpages, + .ro_init = frwr_op_init, + .ro_reset = frwr_op_reset, + .ro_destroy = frwr_op_destroy, + .ro_displayname = "frwr", +}; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c new file mode 100644 index 0000000..ba518af --- /dev/null +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* No-op chunk preparation. All client memory is pre-registered. + * Sometimes referred to as ALLPHYSICAL mode. + * + * Physical registration is simple because all client memory is + * pre-registered and never deregistered. This mode is good for + * adapter bring up, but is considered not safe: the server is + * trusted not to abuse its access to client memory not involved + * in RDMA I/O. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +static int +physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + return 0; +} + +/* PHYSICAL memory registration conveys one page per chunk segment. + */ +static size_t +physical_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt)); +} + +static int +physical_op_init(struct rpcrdma_xprt *r_xprt) +{ + return 0; +} + +/* The client's physical memory is already exposed for + * remote access via RDMA READ or RDMA WRITE. + */ +static int +physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + rpcrdma_map_one(ia->ri_id->device, seg, + rpcrdma_data_dir(writing)); + seg->mr_rkey = ia->ri_bind_mem->rkey; + seg->mr_base = seg->mr_dma; + seg->mr_nsegs = 1; + return 1; +} + +/* Unmap a memory region, but leave it registered. + */ +static int +physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + read_lock(&ia->ri_qplock); + rpcrdma_unmap_one(ia->ri_id->device, seg); + read_unlock(&ia->ri_qplock); + + return 1; +} + +static void +physical_op_reset(struct rpcrdma_xprt *r_xprt) +{ +} + +static void +physical_op_destroy(struct rpcrdma_buffer *buf) +{ +} + +const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { + .ro_map = physical_op_map, + .ro_unmap = physical_op_unmap, + .ro_open = physical_op_open, + .ro_maxpages = physical_op_maxpages, + .ro_init = physical_op_init, + .ro_reset = physical_op_reset, + .ro_destroy = physical_op_destroy, + .ro_displayname = "physical", +}; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 91ffde8..2c53ea9 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -53,6 +53,14 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static const char transfertypes[][12] = { "pure inline", /* no chunks */ @@ -179,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, struct rpcrdma_write_array *warray = NULL; struct rpcrdma_write_chunk *cur_wchunk = NULL; __be32 *iptr = headerp->rm_body.rm_chunks; + int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); if (type == rpcrdma_readch || type == rpcrdma_areadch) { /* a read chunk - server will RDMA Read our memory */ @@ -201,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, if (nsegs < 0) return nsegs; + map = r_xprt->rx_ia.ri_ops->ro_map; do { - n = rpcrdma_register_external(seg, nsegs, - cur_wchunk != NULL, r_xprt); + n = map(r_xprt, seg, nsegs, cur_wchunk != NULL); if (n <= 0) goto out; if (cur_rchunk) { /* read */ @@ -275,34 +284,13 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, return (unsigned char *)iptr - (unsigned char *)headerp; out: - if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { - for (pos = 0; nchunks--;) - pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt); - } - return n; -} + if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) + return n; -/* - * Marshal chunks. This routine returns the header length - * consumed by marshaling. - * - * Returns positive RPC/RDMA header size, or negative errno. - */ - -ssize_t -rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) -{ - struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf); - - if (req->rl_rtype != rpcrdma_noch) - result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, - headerp, req->rl_rtype); - else if (req->rl_wtype != rpcrdma_noch) - result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, - headerp, req->rl_wtype); - return result; + for (pos = 0; nchunks--;) + pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[pos]); + return n; } /* @@ -397,6 +385,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) char *base; size_t rpclen, padlen; ssize_t hdrlen; + enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; /* @@ -433,13 +422,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * into pages; otherwise use reply chunks. */ if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - req->rl_wtype = rpcrdma_noch; + wtype = rpcrdma_noch; else if (rqst->rq_rcv_buf.page_len == 0) - req->rl_wtype = rpcrdma_replych; + wtype = rpcrdma_replych; else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) - req->rl_wtype = rpcrdma_writech; + wtype = rpcrdma_writech; else - req->rl_wtype = rpcrdma_replych; + wtype = rpcrdma_replych; /* * Chunks needed for arguments? @@ -456,16 +445,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * TBD check NFSv4 setacl */ if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) - req->rl_rtype = rpcrdma_noch; + rtype = rpcrdma_noch; else if (rqst->rq_snd_buf.page_len == 0) - req->rl_rtype = rpcrdma_areadch; + rtype = rpcrdma_areadch; else - req->rl_rtype = rpcrdma_readch; + rtype = rpcrdma_readch; /* The following simplification is not true forever */ - if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) - req->rl_wtype = rpcrdma_noch; - if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { + if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) + wtype = rpcrdma_noch; + if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { dprintk("RPC: %s: cannot marshal multiple chunk lists\n", __func__); return -EIO; @@ -479,7 +468,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * When padding is in use and applies to the transfer, insert * it and change the message type. */ - if (req->rl_rtype == rpcrdma_noch) { + if (rtype == rpcrdma_noch) { padlen = rpcrdma_inline_pullup(rqst, RPCRDMA_INLINE_PAD_VALUE(rqst)); @@ -494,7 +483,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (req->rl_wtype != rpcrdma_noch) { + if (wtype != rpcrdma_noch) { dprintk("RPC: %s: invalid chunk list\n", __func__); return -EIO; @@ -515,18 +504,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. */ - if (req->rl_wtype == rpcrdma_noch) - req->rl_wtype = rpcrdma_replych; + if (wtype == rpcrdma_noch) + wtype = rpcrdma_replych; } } - hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); + if (rtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, + headerp, rtype); + wtype = rtype; /* simplify dprintk */ + + } else if (wtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, + headerp, wtype); + } if (hdrlen < 0) return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[wtype], hdrlen, rpclen, padlen, headerp, base, rdmab_lkey(req->rl_rdmabuf)); /* diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 2e192ba..54f23b1 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -157,12 +157,47 @@ static struct ctl_table sunrpc_table[] = { static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ static void +xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + char buf[20]; + + snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA; +} + +static void +xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + char buf[40]; + + snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; +} + +static void xprt_rdma_format_addresses(struct rpc_xprt *xprt) { struct sockaddr *sap = (struct sockaddr *) &rpcx_to_rdmad(xprt).addr; - struct sockaddr_in *sin = (struct sockaddr_in *)sap; - char buf[64]; + char buf[128]; + + switch (sap->sa_family) { + case AF_INET: + xprt_rdma_format_addresses4(xprt, sap); + break; + case AF_INET6: + xprt_rdma_format_addresses6(xprt, sap); + break; + default: + pr_err("rpcrdma: Unrecognized address family\n"); + return; + } (void)rpc_ntop(sap, buf, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); @@ -170,16 +205,10 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt) snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); - xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; - - snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); - xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); - snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); - /* netid */ - xprt->address_strings[RPC_DISPLAY_NETID] = "rdma"; + xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; } static void @@ -377,7 +406,10 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_connect_worker); xprt_rdma_format_addresses(xprt); - xprt->max_payload = rpcrdma_max_payload(new_xprt); + xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); + if (xprt->max_payload == 0) + goto out4; + xprt->max_payload <<= PAGE_SHIFT; dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", __func__, xprt->max_payload); @@ -552,8 +584,8 @@ xprt_rdma_free(void *buffer) for (i = 0; req->rl_nchunks;) { --req->rl_nchunks; - i += rpcrdma_deregister_external( - &req->rl_segments[i], r_xprt); + i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[i]); } rpcrdma_buffer_put(req); @@ -579,10 +611,7 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; - if (req->rl_niovs == 0) - rc = rpcrdma_marshal_req(rqst); - else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL) - rc = rpcrdma_marshal_chunks(rqst, 0); + rc = rpcrdma_marshal_req(rqst); if (rc < 0) goto failed_marshal; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e28909f..4870d27 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -50,6 +50,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/prefetch.h> +#include <linux/sunrpc/addr.h> #include <asm/bitops.h> #include "xprt_rdma.h" @@ -62,9 +63,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -static void rpcrdma_reset_frmrs(struct rpcrdma_ia *); -static void rpcrdma_reset_fmrs(struct rpcrdma_ia *); - /* * internal functions */ @@ -188,7 +186,7 @@ static const char * const wc_status[] = { "remote access error", "remote operation error", "transport retry counter exceeded", - "RNR retrycounter exceeded", + "RNR retry counter exceeded", "local RDD violation error", "remove invalid RD request", "operation aborted", @@ -206,21 +204,17 @@ static const char * const wc_status[] = { static void rpcrdma_sendcq_process_wc(struct ib_wc *wc) { - if (likely(wc->status == IB_WC_SUCCESS)) - return; - /* WARNING: Only wr_id and status are reliable at this point */ - if (wc->wr_id == 0ULL) { - if (wc->status != IB_WC_WR_FLUSH_ERR) + if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) { + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) pr_err("RPC: %s: SEND: %s\n", __func__, COMPLETION_MSG(wc->status)); } else { struct rpcrdma_mw *r; r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - r->r.frmr.fr_state = FRMR_IS_STALE; - pr_err("RPC: %s: frmr %p (stale): %s\n", - __func__, r, COMPLETION_MSG(wc->status)); + r->mw_sendcompletion(wc); } } @@ -424,7 +418,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ep *ep = &xprt->rx_ep; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; + struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr; #endif struct ib_qp_attr *attr = &ia->ri_qp_attr; struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; @@ -480,9 +474,8 @@ connected: wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ default: - dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", - __func__, &addr->sin_addr.s_addr, - ntohs(addr->sin_port), ep, + dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n", + __func__, sap, rpc_get_port(sap), ep, CONNECTION_MSG(event->event)); break; } @@ -491,19 +484,16 @@ connected: if (connstate == 1) { int ird = attr->max_dest_rd_atomic; int tird = ep->rep_remote_cma.responder_resources; - printk(KERN_INFO "rpcrdma: connection to %pI4:%u " - "on %s, memreg %d slots %d ird %d%s\n", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), + + pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n", + sap, rpc_get_port(sap), ia->ri_id->device->name, - ia->ri_memreg_strategy, + ia->ri_ops->ro_displayname, xprt->rx_buf.rb_max_requests, ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); } else if (connstate < 0) { - printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), - connstate); + pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n", + sap, rpc_get_port(sap), connstate); } #endif @@ -621,17 +611,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (memreg == RPCRDMA_FRMR) { /* Requires both frmr reg and local dma lkey */ - if ((devattr->device_cap_flags & + if (((devattr->device_cap_flags & (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != - (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) || + (devattr->max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; - } else { - /* Mind the ia limit on FRMR page list depth */ - ia->ri_max_frmr_depth = min_t(unsigned int, - RPCRDMA_MAX_DATA_SEGS, - devattr->max_fast_reg_page_list_len); } } if (memreg == RPCRDMA_MTHCAFMR) { @@ -652,13 +638,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) */ switch (memreg) { case RPCRDMA_FRMR: + ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; case RPCRDMA_ALLPHYSICAL: + ia->ri_ops = &rpcrdma_physical_memreg_ops; mem_priv = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; goto register_setup; case RPCRDMA_MTHCAFMR: + ia->ri_ops = &rpcrdma_fmr_memreg_ops; if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; @@ -678,8 +667,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) rc = -ENOMEM; goto out3; } - dprintk("RPC: %s: memory registration strategy is %d\n", - __func__, memreg); + dprintk("RPC: %s: memory registration strategy is '%s'\n", + __func__, ia->ri_ops->ro_displayname); /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; @@ -743,49 +732,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; ep->rep_attr.qp_context = ep; - /* send_cq and recv_cq initialized below */ ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: { - int depth = 7; - - /* Add room for frmr register and invalidate WRs. - * 1. FRMR reg WR for head - * 2. FRMR invalidate WR for head - * 3. N FRMR reg WRs for pagelist - * 4. N FRMR invalidate WRs for pagelist - * 5. FRMR reg WR for tail - * 6. FRMR invalidate WR for tail - * 7. The RDMA_SEND WR - */ - - /* Calculate N if the device max FRMR depth is smaller than - * RPCRDMA_MAX_DATA_SEGS. - */ - if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { - int delta = RPCRDMA_MAX_DATA_SEGS - - ia->ri_max_frmr_depth; - - do { - depth += 2; /* FRMR reg + invalidate */ - delta -= ia->ri_max_frmr_depth; - } while (delta > 0); - - } - ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { - cdata->max_requests = devattr->max_qp_wr / depth; - if (!cdata->max_requests) - return -EINVAL; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * - depth; - } - break; - } - default: - break; - } + rc = ia->ri_ops->ro_open(ia, ep, cdata); + if (rc) + return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); ep->rep_attr.cap.max_recv_sge = 1; @@ -944,21 +895,9 @@ retry: rpcrdma_ep_disconnect(ep, ia); rpcrdma_flush_cqs(ep); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rpcrdma_reset_frmrs(ia); - break; - case RPCRDMA_MTHCAFMR: - rpcrdma_reset_fmrs(ia); - break; - case RPCRDMA_ALLPHYSICAL: - break; - default: - rc = -EIO; - goto out; - } - xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + ia->ri_ops->ro_reset(xprt); + id = rpcrdma_create_id(xprt, ia, (struct sockaddr *)&xprt->rx_data.addr); if (IS_ERR(id)) { @@ -1123,91 +1062,6 @@ out: return ERR_PTR(rc); } -static int -rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) -{ - int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; - struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_DATA_SEGS, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - struct rpcrdma_mw *r; - int i, rc; - - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initializing %d FMRs\n", __func__, i); - - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r == NULL) - return -ENOMEM; - - r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr); - if (IS_ERR(r->r.fmr)) { - rc = PTR_ERR(r->r.fmr); - dprintk("RPC: %s: ib_alloc_fmr failed %i\n", - __func__, rc); - goto out_free; - } - - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - return 0; - -out_free: - kfree(r); - return rc; -} - -static int -rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) -{ - struct rpcrdma_frmr *f; - struct rpcrdma_mw *r; - int i, rc; - - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i); - - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r == NULL) - return -ENOMEM; - f = &r->r.frmr; - - f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_mr)) { - rc = PTR_ERR(f->fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr " - "failed %i\n", __func__, rc); - goto out_free; - } - - f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_pgl)) { - rc = PTR_ERR(f->fr_pgl); - dprintk("RPC: %s: ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(f->fr_mr); - goto out_free; - } - - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - - return 0; - -out_free: - kfree(r); - return rc; -} - int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { @@ -1244,22 +1098,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_recv_bufs = (struct rpcrdma_rep **) p; p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; - INIT_LIST_HEAD(&buf->rb_mws); - INIT_LIST_HEAD(&buf->rb_all); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = rpcrdma_init_frmrs(ia, buf); - if (rc) - goto out; - break; - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_init_fmrs(ia, buf); - if (rc) - goto out; - break; - default: - break; - } + rc = ia->ri_ops->ro_init(r_xprt); + if (rc) + goto out; for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; @@ -1311,47 +1152,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) kfree(req); } -static void -rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - list_del(&r->mw_list); - - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", - __func__, rc); - - kfree(r); - } -} - -static void -rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - list_del(&r->mw_list); - - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - - kfree(r); - } -} - void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { @@ -1372,104 +1172,11 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); } - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rpcrdma_destroy_frmrs(buf); - break; - case RPCRDMA_MTHCAFMR: - rpcrdma_destroy_fmrs(buf); - break; - default: - break; - } + ia->ri_ops->ro_destroy(buf); kfree(buf->rb_pool); } -/* After a disconnect, unmap all FMRs. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_fmr_external(). - */ -static void -rpcrdma_reset_fmrs(struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct list_head *pos; - struct rpcrdma_mw *r; - LIST_HEAD(l); - int rc; - - list_for_each(pos, &buf->rb_all) { - r = list_entry(pos, struct rpcrdma_mw, mw_all); - - INIT_LIST_HEAD(&l); - list_add(&r->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - if (rc) - dprintk("RPC: %s: ib_unmap_fmr failed %i\n", - __func__, rc); - } -} - -/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in - * an unusable state. Find FRMRs in this state and dereg / reg - * each. FRMRs that are VALID and attached to an rpcrdma_req are - * also torn down. - * - * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_frmr_external(). - */ -static void -rpcrdma_reset_frmrs(struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct list_head *pos; - struct rpcrdma_mw *r; - int rc; - - list_for_each(pos, &buf->rb_all) { - r = list_entry(pos, struct rpcrdma_mw, mw_all); - - if (r->r.frmr.fr_state == FRMR_IS_INVALID) - continue; - - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - - r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_mr)) { - rc = PTR_ERR(r->r.frmr.fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr" - " failed %i\n", __func__, rc); - continue; - } - r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( - ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_pgl)) { - rc = PTR_ERR(r->r.frmr.fr_pgl); - dprintk("RPC: %s: " - "ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(r->r.frmr.fr_mr); - continue; - } - r->r.frmr.fr_state = FRMR_IS_INVALID; - } -} - /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving * some req segments uninitialized. */ @@ -1509,7 +1216,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) } } -/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). +/* rpcrdma_unmap_one() was already done during deregistration. * Redo only the ib_post_send(). */ static void @@ -1729,6 +1436,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * Wrappers for internal-use kmalloc memory registration, used by buffer code. */ +void +rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) +{ + dprintk("RPC: map_one: offset %p iova %llx len %zu\n", + seg->mr_offset, + (unsigned long long)seg->mr_dma, seg->mr_dmalen); +} + static int rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, struct ib_mr **mrp, struct ib_sge *iov) @@ -1854,287 +1569,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) } /* - * Wrappers for chunk registration, shared by read/write chunk code. - */ - -static void -rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) -{ - seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - seg->mr_dmalen = seg->mr_len; - if (seg->mr_page) - seg->mr_dma = ib_dma_map_page(ia->ri_id->device, - seg->mr_page, offset_in_page(seg->mr_offset), - seg->mr_dmalen, seg->mr_dir); - else - seg->mr_dma = ib_dma_map_single(ia->ri_id->device, - seg->mr_offset, - seg->mr_dmalen, seg->mr_dir); - if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { - dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", - __func__, - (unsigned long long)seg->mr_dma, - seg->mr_offset, seg->mr_dmalen); - } -} - -static void -rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) -{ - if (seg->mr_page) - ib_dma_unmap_page(ia->ri_id->device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); - else - ib_dma_unmap_single(ia->ri_id->device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); -} - -static int -rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia, - struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_mw *mw = seg1->rl_mw; - struct rpcrdma_frmr *frmr = &mw->r.frmr; - struct ib_mr *mr = frmr->fr_mr; - struct ib_send_wr fastreg_wr, *bad_wr; - u8 key; - int len, pageoff; - int i, rc; - int seg_len; - u64 pa; - int page_no; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (*nsegs > ia->ri_max_frmr_depth) - *nsegs = ia->ri_max_frmr_depth; - for (page_no = i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - pa = seg->mr_dma; - for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { - frmr->fr_pgl->page_list[page_no++] = pa; - pa += PAGE_SIZE; - } - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - dprintk("RPC: %s: Using frmr %p to map %d segments\n", - __func__, mw, i); - - frmr->fr_state = FRMR_IS_VALID; - - memset(&fastreg_wr, 0, sizeof(fastreg_wr)); - fastreg_wr.wr_id = (unsigned long)(void *)mw; - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma; - fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; - fastreg_wr.wr.fast_reg.page_list_len = page_no; - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - if (fastreg_wr.wr.fast_reg.length < len) { - rc = -EIO; - goto out_err; - } - - /* Bump the key */ - key = (u8)(mr->rkey & 0x000000FF); - ib_update_fast_reg_key(mr, ++key); - - fastreg_wr.wr.fast_reg.access_flags = (writing ? - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ); - fastreg_wr.wr.fast_reg.rkey = mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); - if (rc) { - dprintk("RPC: %s: failed ib_post_send for register," - " status %i\n", __func__, rc); - ib_update_fast_reg_key(mr, --key); - goto out_err; - } else { - seg1->mr_rkey = mr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return 0; -out_err: - frmr->fr_state = FRMR_IS_INVALID; - while (i--) - rpcrdma_unmap_one(ia, --seg); - return rc; -} - -static int -rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct ib_send_wr invalidate_wr, *bad_wr; - int rc; - - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; - - memset(&invalidate_wr, 0, sizeof invalidate_wr); - invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - read_lock(&ia->ri_qplock); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); - read_unlock(&ia->ri_qplock); - if (rc) { - /* Force rpcrdma_buffer_get() to retry */ - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; - dprintk("RPC: %s: failed ib_post_send for invalidate," - " status %i\n", __func__, rc); - } - return rc; -} - -static int -rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; - int len, pageoff, i, rc; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (*nsegs > RPCRDMA_MAX_DATA_SEGS) - *nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - physaddrs[i] = seg->mr_dma; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma); - if (rc) { - dprintk("RPC: %s: failed ib_map_phys_fmr " - "%u@0x%llx+%i (%d)... status %i\n", __func__, - len, (unsigned long long)seg1->mr_dma, - pageoff, i, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return rc; -} - -static int -rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - LIST_HEAD(l); - int rc; - - list_add(&seg1->rl_mw->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - read_lock(&ia->ri_qplock); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - read_unlock(&ia->ri_qplock); - if (rc) - dprintk("RPC: %s: failed ib_unmap_fmr," - " status %i\n", __func__, rc); - return rc; -} - -int -rpcrdma_register_external(struct rpcrdma_mr_seg *seg, - int nsegs, int writing, struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc = 0; - - switch (ia->ri_memreg_strategy) { - - case RPCRDMA_ALLPHYSICAL: - rpcrdma_map_one(ia, seg, writing); - seg->mr_rkey = ia->ri_bind_mem->rkey; - seg->mr_base = seg->mr_dma; - seg->mr_nsegs = 1; - nsegs = 1; - break; - - /* Registration using frmr registration */ - case RPCRDMA_FRMR: - rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); - break; - - /* Registration using fmr memory registration */ - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); - break; - - default: - return -EIO; - } - if (rc) - return rc; - - return nsegs; -} - -int -rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int nsegs = seg->mr_nsegs, rc; - - switch (ia->ri_memreg_strategy) { - - case RPCRDMA_ALLPHYSICAL: - read_lock(&ia->ri_qplock); - rpcrdma_unmap_one(ia, seg); - read_unlock(&ia->ri_qplock); - break; - - case RPCRDMA_FRMR: - rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); - break; - - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_deregister_fmr_external(seg, ia); - break; - - default: - break; - } - return nsegs; -} - -/* * Prepost any receive buffer, then post send. * * Receive buffer is donated to hardware, reclaimed upon recv completion. @@ -2156,7 +1590,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, } send_wr.next = NULL; - send_wr.wr_id = 0ULL; /* no send cookie */ + send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; send_wr.sg_list = req->rl_send_iov; send_wr.num_sge = req->rl_niovs; send_wr.opcode = IB_WR_SEND; @@ -2215,43 +1649,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, return rc; } -/* Physical mapping means one Read/Write list entry per-page. - * All list entries must fit within an inline buffer - * - * NB: The server must return a Write list for NFS READ, - * which has the same constraint. Factor in the inline - * rsize as well. +/* How many chunk list items fit within our inline buffers? */ -static size_t -rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) +unsigned int +rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - unsigned int inline_size, pages; + int bytes, segments; - inline_size = min_t(unsigned int, - cdata->inline_wsize, cdata->inline_rsize); - inline_size -= RPCRDMA_HDRLEN_MIN; - pages = inline_size / sizeof(struct rpcrdma_segment); - return pages << PAGE_SHIFT; -} - -static size_t -rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt) -{ - return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; -} - -size_t -rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt) -{ - size_t result; - - switch (r_xprt->rx_ia.ri_memreg_strategy) { - case RPCRDMA_ALLPHYSICAL: - result = rpcrdma_physical_max_payload(r_xprt); - break; - default: - result = rpcrdma_mr_max_payload(r_xprt); + bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize); + bytes -= RPCRDMA_HDRLEN_MIN; + if (bytes < sizeof(struct rpcrdma_segment) * 2) { + pr_warn("RPC: %s: inline threshold too small\n", + __func__); + return 0; } - return result; + + segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1); + dprintk("RPC: %s: max chunk list size = %d segments\n", + __func__, segments); + return segments; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0a16fb6..78e0b8b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -60,6 +60,7 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { + const struct rpcrdma_memreg_ops *ri_ops; rwlock_t ri_qplock; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; @@ -105,6 +106,10 @@ struct rpcrdma_ep { #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) +/* Force completion handler to ignore the signal + */ +#define RPCRDMA_IGNORE_COMPLETION (0ULL) + /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV * * The below structure appears at the front of a large region of kmalloc'd @@ -143,14 +148,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } -enum rpcrdma_chunktype { - rpcrdma_noch = 0, - rpcrdma_readch, - rpcrdma_areadch, - rpcrdma_writech, - rpcrdma_replych -}; - /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -213,6 +210,7 @@ struct rpcrdma_mw { struct ib_fmr *fmr; struct rpcrdma_frmr frmr; } r; + void (*mw_sendcompletion)(struct ib_wc *); struct list_head mw_list; struct list_head mw_all; }; @@ -258,7 +256,6 @@ struct rpcrdma_req { unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ unsigned int rl_connect_cookie; /* retry detection */ - enum rpcrdma_chunktype rl_rtype, rl_wtype; struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct ib_sge rl_send_iov[4]; /* for active requests */ @@ -340,6 +337,29 @@ struct rpcrdma_stats { }; /* + * Per-registration mode operations + */ +struct rpcrdma_xprt; +struct rpcrdma_memreg_ops { + int (*ro_map)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *, int, bool); + int (*ro_unmap)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *); + int (*ro_open)(struct rpcrdma_ia *, + struct rpcrdma_ep *, + struct rpcrdma_create_data_internal *); + size_t (*ro_maxpages)(struct rpcrdma_xprt *); + int (*ro_init)(struct rpcrdma_xprt *); + void (*ro_reset)(struct rpcrdma_xprt *); + void (*ro_destroy)(struct rpcrdma_buffer *); + const char *ro_displayname; +}; + +extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops; + +/* * RPCRDMA transport -- encapsulates the structures above for * integration with RPC. * @@ -398,16 +418,56 @@ void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -int rpcrdma_register_external(struct rpcrdma_mr_seg *, - int, int, struct rpcrdma_xprt *); -int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, - struct rpcrdma_xprt *); - struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, size_t, gfp_t); void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); +unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); + +/* + * Wrappers for chunk registration, shared by read/write chunk code. + */ + +void rpcrdma_mapping_error(struct rpcrdma_mr_seg *); + +static inline enum dma_data_direction +rpcrdma_data_dir(bool writing) +{ + return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +static inline void +rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg, + enum dma_data_direction direction) +{ + seg->mr_dir = direction; + seg->mr_dmalen = seg->mr_len; + + if (seg->mr_page) + seg->mr_dma = ib_dma_map_page(device, + seg->mr_page, offset_in_page(seg->mr_offset), + seg->mr_dmalen, seg->mr_dir); + else + seg->mr_dma = ib_dma_map_single(device, + seg->mr_offset, + seg->mr_dmalen, seg->mr_dir); + + if (ib_dma_mapping_error(device, seg->mr_dma)) + rpcrdma_mapping_error(seg); +} + +static inline void +rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg) +{ + if (seg->mr_page) + ib_dma_unmap_page(device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); + else + ib_dma_unmap_single(device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); +} + /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ @@ -418,9 +478,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ -ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t); int rpcrdma_marshal_req(struct rpc_rqst *); -size_t rpcrdma_max_payload(struct rpcrdma_xprt *); /* Temporary NFS request map cache. Created in svc_rdma.c */ extern struct kmem_cache *svc_rdma_map_cachep; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 46568b8..055453d 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -338,7 +338,7 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, fi, tos, type, nlflags, tb_id); if (!err) - fi->fib_flags |= RTNH_F_EXTERNAL; + fi->fib_flags |= RTNH_F_OFFLOAD; } return err; @@ -364,7 +364,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, const struct swdev_ops *ops; int err = 0; - if (!(fi->fib_flags & RTNH_F_EXTERNAL)) + if (!(fi->fib_flags & RTNH_F_OFFLOAD)) return 0; dev = netdev_switch_get_dev_by_nhs(fi); @@ -376,7 +376,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, err = ops->swdev_fib_ipv4_del(dev, htonl(dst), dst_len, fi, tos, type, tb_id); if (!err) - fi->fib_flags &= ~RTNH_F_EXTERNAL; + fi->fib_flags &= ~RTNH_F_OFFLOAD; } return err; diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 3613e72..70e3dac 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -591,14 +591,14 @@ void tipc_bearer_stop(struct net *net) /* Caller should hold rtnl_lock to protect the bearer */ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, - struct tipc_bearer *bearer) + struct tipc_bearer *bearer, int nlflags) { void *hdr; struct nlattr *attrs; struct nlattr *prop; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, - NLM_F_MULTI, TIPC_NL_BEARER_GET); + nlflags, TIPC_NL_BEARER_GET); if (!hdr) return -EMSGSIZE; @@ -657,7 +657,7 @@ int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb) if (!bearer) continue; - err = __tipc_nl_add_bearer(&msg, bearer); + err = __tipc_nl_add_bearer(&msg, bearer, NLM_F_MULTI); if (err) break; } @@ -705,7 +705,7 @@ int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info) goto err_out; } - err = __tipc_nl_add_bearer(&msg, bearer); + err = __tipc_nl_add_bearer(&msg, bearer, 0); if (err) goto err_out; rtnl_unlock(); @@ -857,14 +857,14 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) } static int __tipc_nl_add_media(struct tipc_nl_msg *msg, - struct tipc_media *media) + struct tipc_media *media, int nlflags) { void *hdr; struct nlattr *attrs; struct nlattr *prop; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, - NLM_F_MULTI, TIPC_NL_MEDIA_GET); + nlflags, TIPC_NL_MEDIA_GET); if (!hdr) return -EMSGSIZE; @@ -916,7 +916,8 @@ int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); for (; media_info_array[i] != NULL; i++) { - err = __tipc_nl_add_media(&msg, media_info_array[i]); + err = __tipc_nl_add_media(&msg, media_info_array[i], + NLM_F_MULTI); if (err) break; } @@ -963,7 +964,7 @@ int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info) goto err_out; } - err = __tipc_nl_add_media(&msg, media); + err = __tipc_nl_add_media(&msg, media, 0); if (err) goto err_out; rtnl_unlock(); diff --git a/net/tipc/link.c b/net/tipc/link.c index a6b30df..43a515d 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1145,11 +1145,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) } /* Synchronize with parallel link if applicable */ if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) { - link_handle_out_of_seq_msg(l_ptr, skb); - if (link_synch(l_ptr)) - link_retrieve_defq(l_ptr, &head); - skb = NULL; - goto unlock; + if (!link_synch(l_ptr)) + goto unlock; } l_ptr->next_in_no++; if (unlikely(!skb_queue_empty(&l_ptr->deferdq))) @@ -2013,7 +2010,7 @@ msg_full: /* Caller should hold appropriate locks to protect the link */ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, - struct tipc_link *link) + struct tipc_link *link, int nlflags) { int err; void *hdr; @@ -2022,7 +2019,7 @@ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_net *tn = net_generic(net, tipc_net_id); hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, - NLM_F_MULTI, TIPC_NL_LINK_GET); + nlflags, TIPC_NL_LINK_GET); if (!hdr) return -EMSGSIZE; @@ -2095,7 +2092,7 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, if (!node->links[i]) continue; - err = __tipc_nl_add_link(net, msg, node->links[i]); + err = __tipc_nl_add_link(net, msg, node->links[i], NLM_F_MULTI); if (err) return err; } @@ -2143,7 +2140,6 @@ int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb) err = __tipc_nl_add_node_links(net, &msg, node, &prev_link); tipc_node_unlock(node); - tipc_node_put(node); if (err) goto out; @@ -2210,7 +2206,7 @@ int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info) goto err_out; } - err = __tipc_nl_add_link(net, &msg, link); + err = __tipc_nl_add_link(net, &msg, link, 0); if (err) goto err_out; diff --git a/net/tipc/server.c b/net/tipc/server.c index ab6183c..77ff03e 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -102,7 +102,7 @@ static void tipc_conn_kref_release(struct kref *kref) } saddr->scope = -TIPC_NODE_SCOPE; kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); - sk_release_kernel(sk); + sock_release(sock); con->sock = NULL; } @@ -321,12 +321,9 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) struct socket *sock = NULL; int ret; - ret = sock_create_kern(AF_TIPC, SOCK_SEQPACKET, 0, &sock); + ret = __sock_create(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock, 1); if (ret < 0) return NULL; - - sk_change_net(sock->sk, s->net); - ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE, (char *)&s->imp, sizeof(s->imp)); if (ret < 0) @@ -376,7 +373,7 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) create_err: kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); return NULL; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ee90d74..9074b5c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1764,13 +1764,14 @@ static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) { u32 dnode, dport = 0; - int err = -TIPC_ERR_NO_PORT; + int err; struct sk_buff *skb; struct tipc_sock *tsk; struct tipc_net *tn; struct sock *sk; while (skb_queue_len(inputq)) { + err = -TIPC_ERR_NO_PORT; skb = NULL; dport = tipc_skb_peek_port(inputq, dport); tsk = tipc_sk_lookup(net, dport); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 433f287..5266ea7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -305,7 +305,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->path.dentry; - if (dentry && dentry->d_inode == i) { + if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); goto found; } @@ -778,7 +778,7 @@ static struct sock *unix_find_other(struct net *net, err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path); if (err) goto fail; - inode = path.dentry->d_inode; + inode = d_backing_inode(path.dentry); err = inode_permission(inode, MAY_WRITE); if (err) goto put_fail; @@ -839,7 +839,7 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) */ err = security_path_mknod(&path, dentry, mode, 0); if (!err) { - err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); + err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0); if (!err) { res->mnt = mntget(path.mnt); res->dentry = dget(dentry); @@ -905,7 +905,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out_up; } addr->hash = UNIX_HASH_SIZE; - hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1); + hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1); spin_lock(&unix_table_lock); u->path = path; list = &unix_socket_table[hash]; diff --git a/net/unix/diag.c b/net/unix/diag.c index ef542fb..c512f64 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -25,7 +25,7 @@ static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb) if (dentry) { struct unix_diag_vfs uv = { - .udiag_vfs_ino = dentry->d_inode->i_ino, + .udiag_vfs_ino = d_backing_inode(dentry)->i_ino, .udiag_vfs_dev = dentry->d_sb->s_dev, }; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 99f7012..a73a226 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -95,39 +95,36 @@ static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); unsigned int unix_tot_inflight; - struct sock *unix_get_socket(struct file *filp) { struct sock *u_sock = NULL; struct inode *inode = file_inode(filp); - /* - * Socket ? - */ + /* Socket ? */ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); struct sock *s = sock->sk; - /* - * PF_UNIX ? - */ + /* PF_UNIX ? */ if (s && sock->ops && sock->ops->family == PF_UNIX) u_sock = s; } return u_sock; } -/* - * Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. +/* Keep the number of times in flight count for the file + * descriptor if it is for an AF_UNIX socket. */ void unix_inflight(struct file *fp) { struct sock *s = unix_get_socket(fp); + if (s) { struct unix_sock *u = unix_sk(s); + spin_lock(&unix_gc_lock); + if (atomic_long_inc_return(&u->inflight) == 1) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); @@ -142,10 +139,13 @@ void unix_inflight(struct file *fp) void unix_notinflight(struct file *fp) { struct sock *s = unix_get_socket(fp); + if (s) { struct unix_sock *u = unix_sk(s); + spin_lock(&unix_gc_lock); BUG_ON(list_empty(&u->link)); + if (atomic_long_dec_and_test(&u->inflight)) list_del_init(&u->link); unix_tot_inflight--; @@ -161,32 +161,27 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - /* - * Do we have file descriptors ? - */ + /* Do we have file descriptors ? */ if (UNIXCB(skb).fp) { bool hit = false; - /* - * Process the descriptors of this socket - */ + /* Process the descriptors of this socket */ int nfd = UNIXCB(skb).fp->count; struct file **fp = UNIXCB(skb).fp->fp; + while (nfd--) { - /* - * Get the socket the fd matches - * if it indeed does so - */ + /* Get the socket the fd matches if it indeed does so */ struct sock *sk = unix_get_socket(*fp++); + if (sk) { struct unix_sock *u = unix_sk(sk); - /* - * Ignore non-candidates, they could + /* Ignore non-candidates, they could * have been added to the queues after * starting the garbage collection */ if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { hit = true; + func(u); } } @@ -203,24 +198,22 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), static void scan_children(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { - if (x->sk_state != TCP_LISTEN) + if (x->sk_state != TCP_LISTEN) { scan_inflight(x, func, hitlist); - else { + } else { struct sk_buff *skb; struct sk_buff *next; struct unix_sock *u; LIST_HEAD(embryos); - /* - * For a listening socket collect the queued embryos + /* For a listening socket collect the queued embryos * and perform a scan on them as well. */ spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { u = unix_sk(skb->sk); - /* - * An embryo cannot be in-flight, so it's safe + /* An embryo cannot be in-flight, so it's safe * to use the list link. */ BUG_ON(!list_empty(&u->link)); @@ -249,8 +242,7 @@ static void inc_inflight(struct unix_sock *usk) static void inc_inflight_move_tail(struct unix_sock *u) { atomic_long_inc(&u->inflight); - /* - * If this still might be part of a cycle, move it to the end + /* If this still might be part of a cycle, move it to the end * of the list, so that it's checked even if it was already * passed over */ @@ -263,8 +255,7 @@ static bool gc_in_progress; void wait_for_unix_gc(void) { - /* - * If number of inflight sockets is insane, + /* If number of inflight sockets is insane, * force a garbage collect right now. */ if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress) @@ -288,8 +279,7 @@ void unix_gc(void) goto out; gc_in_progress = true; - /* - * First, select candidates for garbage collection. Only + /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. * @@ -320,15 +310,13 @@ void unix_gc(void) } } - /* - * Now remove all internal in-flight reference to children of + /* Now remove all internal in-flight reference to children of * the candidates. */ list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, dec_inflight, NULL); - /* - * Restore the references for children of all candidates, + /* Restore the references for children of all candidates, * which have remaining references. Do this recursively, so * only those remain, which form cyclic references. * @@ -350,8 +338,7 @@ void unix_gc(void) } list_del(&cursor); - /* - * not_cycle_list contains those sockets which do not make up a + /* not_cycle_list contains those sockets which do not make up a * cycle. Restore these to the inflight list. */ while (!list_empty(¬_cycle_list)) { @@ -360,8 +347,7 @@ void unix_gc(void) list_move_tail(&u->link, &gc_inflight_list); } - /* - * Now gc_candidates contains only garbage. Restore original + /* Now gc_candidates contains only garbage. Restore original * inflight counters for these as well, and remove the skbuffs * which are creating the cycle(s). */ |