summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/6lowpan_i.h4
-rw-r--r--net/6lowpan/Makefile2
-rw-r--r--net/6lowpan/core.c50
-rw-r--r--net/6lowpan/debugfs.c39
-rw-r--r--net/6lowpan/iphc.c167
-rw-r--r--net/6lowpan/ndisc.c234
-rw-r--r--net/ax25/af_ax25.c3
-rw-r--r--net/ax25/ax25_ds_timer.c5
-rw-r--r--net/ax25/ax25_std_timer.c5
-rw-r--r--net/ax25/ax25_subr.c3
-rw-r--r--net/batman-adv/routing.c1
-rw-r--r--net/batman-adv/soft-interface.c9
-rw-r--r--net/batman-adv/translation-table.c50
-rw-r--r--net/batman-adv/types.h2
-rw-r--r--net/bridge/br_device.c10
-rw-r--r--net/bridge/br_forward.c13
-rw-r--r--net/bridge/br_if.c9
-rw-r--r--net/bridge/br_input.c18
-rw-r--r--net/bridge/br_multicast.c221
-rw-r--r--net/bridge/br_netlink.c148
-rw-r--r--net/bridge/br_private.h64
-rw-r--r--net/bridge/br_sysfs_br.c25
-rw-r--r--net/caif/chnl_net.c1
-rw-r--r--net/can/Makefile3
-rw-r--r--net/can/af_can.c22
-rw-r--r--net/can/bcm.c309
-rw-r--r--net/can/proc.c3
-rw-r--r--net/core/dev.c67
-rw-r--r--net/core/devlink.c87
-rw-r--r--net/core/fib_rules.c49
-rw-r--r--net/core/filter.c320
-rw-r--r--net/core/neighbour.c6
-rw-r--r--net/core/net-sysfs.c15
-rw-r--r--net/core/pktgen.c42
-rw-r--r--net/core/rtnetlink.c88
-rw-r--r--net/core/utils.c8
-rw-r--r--net/ieee802154/6lowpan/core.c12
-rw-r--r--net/ieee802154/6lowpan/tx.c113
-rw-r--r--net/ipv4/esp4.c52
-rw-r--r--net/ipv4/gre_demux.c11
-rw-r--r--net/ipv4/inet_diag.c25
-rw-r--r--net/ipv4/ip_gre.c77
-rw-r--r--net/ipv4/ip_tunnel.c2
-rw-r--r--net/ipv4/ipconfig.c4
-rw-r--r--net/ipv4/ipmr.c4
-rw-r--r--net/ipv4/tcp.c67
-rw-r--r--net/ipv4/tcp_dctcp.c4
-rw-r--r--net/ipv4/tcp_ipv4.c31
-rw-r--r--net/ipv4/tcp_output.c7
-rw-r--r--net/ipv4/udp.c80
-rw-r--r--net/ipv4/udp_tunnel.c61
-rw-r--r--net/ipv4/xfrm4_policy.c8
-rw-r--r--net/ipv6/addrconf.c268
-rw-r--r--net/ipv6/icmp.c76
-rw-r--r--net/ipv6/ila/ila_common.c10
-rw-r--r--net/ipv6/ip6_checksum.c7
-rw-r--r--net/ipv6/ip6_gre.c2
-rw-r--r--net/ipv6/ip6_icmp.c2
-rw-r--r--net/ipv6/ip6_output.c12
-rw-r--r--net/ipv6/ip6mr.c1
-rw-r--r--net/ipv6/ndisc.c123
-rw-r--r--net/ipv6/route.c32
-rw-r--r--net/ipv6/sit.c48
-rw-r--r--net/ipv6/tcp_ipv6.c33
-rw-r--r--net/ipv6/udp.c71
-rw-r--r--net/ipv6/xfrm6_policy.c4
-rw-r--r--net/iucv/af_iucv.c223
-rw-r--r--net/kcm/kcmproc.c1
-rw-r--r--net/kcm/kcmsock.c8
-rw-r--r--net/l3mdev/l3mdev.c26
-rw-r--r--net/mac80211/mesh.c7
-rw-r--r--net/mpls/af_mpls.c7
-rw-r--r--net/netfilter/nf_conntrack_core.c2
-rw-r--r--net/netfilter/nf_tables_api.c24
-rw-r--r--net/netfilter/nf_tables_core.c2
-rw-r--r--net/netfilter/nft_hash.c3
-rw-r--r--net/netfilter/nft_meta.c9
-rw-r--r--net/netfilter/nft_rbtree.c3
-rw-r--r--net/openvswitch/conntrack.c83
-rw-r--r--net/openvswitch/datapath.c13
-rw-r--r--net/packet/af_packet.c6
-rw-r--r--net/rds/cong.c3
-rw-r--r--net/rds/connection.c328
-rw-r--r--net/rds/ib.c9
-rw-r--r--net/rds/ib.h8
-rw-r--r--net/rds/ib_cm.c11
-rw-r--r--net/rds/ib_rdma.c1
-rw-r--r--net/rds/ib_recv.c4
-rw-r--r--net/rds/ib_send.c4
-rw-r--r--net/rds/loop.c20
-rw-r--r--net/rds/rdma_transport.c1
-rw-r--r--net/rds/rds.h159
-rw-r--r--net/rds/rds_single_path.h30
-rw-r--r--net/rds/recv.c31
-rw-r--r--net/rds/send.c289
-rw-r--r--net/rds/sysctl.c3
-rw-r--r--net/rds/tcp.c135
-rw-r--r--net/rds/tcp.h24
-rw-r--r--net/rds/tcp_connect.c60
-rw-r--r--net/rds/tcp_listen.c19
-rw-r--r--net/rds/tcp_recv.c40
-rw-r--r--net/rds/tcp_send.c35
-rw-r--r--net/rds/threads.c103
-rw-r--r--net/rds/transport.c3
-rw-r--r--net/rxrpc/Makefile36
-rw-r--r--net/rxrpc/af_rxrpc.c107
-rw-r--r--net/rxrpc/ar-connection.c912
-rw-r--r--net/rxrpc/ar-error.c230
-rw-r--r--net/rxrpc/ar-internal.h382
-rw-r--r--net/rxrpc/ar-local.c417
-rw-r--r--net/rxrpc/ar-peer.c305
-rw-r--r--net/rxrpc/ar-transport.c286
-rw-r--r--net/rxrpc/call_accept.c (renamed from net/rxrpc/ar-accept.c)44
-rw-r--r--net/rxrpc/call_event.c (renamed from net/rxrpc/ar-ack.c)35
-rw-r--r--net/rxrpc/call_object.c (renamed from net/rxrpc/ar-call.c)209
-rw-r--r--net/rxrpc/conn_client.c94
-rw-r--r--net/rxrpc/conn_event.c (renamed from net/rxrpc/ar-connevent.c)31
-rw-r--r--net/rxrpc/conn_object.c686
-rw-r--r--net/rxrpc/input.c (renamed from net/rxrpc/ar-input.c)65
-rw-r--r--net/rxrpc/key.c (renamed from net/rxrpc/ar-key.c)2
-rw-r--r--net/rxrpc/local_event.c116
-rw-r--r--net/rxrpc/local_object.c387
-rw-r--r--net/rxrpc/output.c (renamed from net/rxrpc/ar-output.c)113
-rw-r--r--net/rxrpc/peer_event.c281
-rw-r--r--net/rxrpc/peer_object.c315
-rw-r--r--net/rxrpc/proc.c (renamed from net/rxrpc/ar-proc.c)39
-rw-r--r--net/rxrpc/recvmsg.c (renamed from net/rxrpc/ar-recvmsg.c)6
-rw-r--r--net/rxrpc/rxkad.c74
-rw-r--r--net/rxrpc/security.c (renamed from net/rxrpc/ar-security.c)8
-rw-r--r--net/rxrpc/skbuff.c (renamed from net/rxrpc/ar-skbuff.c)0
-rw-r--r--net/rxrpc/sysctl.c8
-rw-r--r--net/rxrpc/utils.c41
-rw-r--r--net/sched/act_api.c10
-rw-r--r--net/sched/act_bpf.c7
-rw-r--r--net/sched/act_ife.c56
-rw-r--r--net/sched/act_ipt.c12
-rw-r--r--net/sched/act_mirred.c3
-rw-r--r--net/sched/act_police.c8
-rw-r--r--net/sched/act_simple.c3
-rw-r--r--net/sched/act_skbedit.c29
-rw-r--r--net/sched/act_vlan.c4
-rw-r--r--net/sched/cls_api.c2
-rw-r--r--net/sched/cls_bpf.c7
-rw-r--r--net/sched/cls_flower.c42
-rw-r--r--net/sched/sch_atm.c13
-rw-r--r--net/sched/sch_blackhole.c5
-rw-r--r--net/sched/sch_cbq.c7
-rw-r--r--net/sched/sch_choke.c24
-rw-r--r--net/sched/sch_codel.c10
-rw-r--r--net/sched/sch_drr.c7
-rw-r--r--net/sched/sch_dsmark.c9
-rw-r--r--net/sched/sch_fifo.c19
-rw-r--r--net/sched/sch_fq.c26
-rw-r--r--net/sched/sch_fq_codel.c39
-rw-r--r--net/sched/sch_generic.c80
-rw-r--r--net/sched/sch_gred.c7
-rw-r--r--net/sched/sch_hfsc.c60
-rw-r--r--net/sched/sch_hhf.c14
-rw-r--r--net/sched/sch_htb.c30
-rw-r--r--net/sched/sch_multiq.c7
-rw-r--r--net/sched/sch_netem.c53
-rw-r--r--net/sched/sch_pie.c7
-rw-r--r--net/sched/sch_plug.c5
-rw-r--r--net/sched/sch_prio.c71
-rw-r--r--net/sched/sch_qfq.c7
-rw-r--r--net/sched/sch_red.c7
-rw-r--r--net/sched/sch_sfb.c7
-rw-r--r--net/sched/sch_sfq.c10
-rw-r--r--net/sched/sch_tbf.c16
-rw-r--r--net/sched/sch_teql.c4
-rw-r--r--net/sctp/protocol.c3
-rw-r--r--net/sctp/sctp_diag.c6
-rw-r--r--net/sctp/socket.c2
-rw-r--r--net/sunrpc/clnt.c31
-rw-r--r--net/sunrpc/svc_xprt.c2
-rw-r--r--net/sunrpc/xprtsock.c1
-rw-r--r--net/tipc/Makefile2
-rw-r--r--net/tipc/addr.h1
-rw-r--r--net/tipc/bearer.c10
-rw-r--r--net/tipc/bearer.h2
-rw-r--r--net/tipc/core.c1
-rw-r--r--net/tipc/core.h15
-rw-r--r--net/tipc/discover.c5
-rw-r--r--net/tipc/link.c52
-rw-r--r--net/tipc/monitor.c651
-rw-r--r--net/tipc/monitor.h73
-rw-r--r--net/tipc/msg.c6
-rw-r--r--net/tipc/msg.h11
-rw-r--r--net/tipc/node.c26
-rw-r--r--net/tipc/server.c3
-rw-r--r--net/tipc/socket.c54
-rw-r--r--net/tipc/udp_media.c24
-rw-r--r--net/unix/af_unix.c6
-rw-r--r--net/vmw_vsock/af_vsock.c12
-rw-r--r--net/wireless/util.c2
195 files changed, 7455 insertions, 4588 deletions
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h
index 97ecc27..a67caee 100644
--- a/net/6lowpan/6lowpan_i.h
+++ b/net/6lowpan/6lowpan_i.h
@@ -12,6 +12,10 @@ static inline bool lowpan_is_ll(const struct net_device *dev,
return lowpan_dev(dev)->lltype == lltype;
}
+extern const struct ndisc_ops lowpan_ndisc_ops;
+
+int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev);
+
#ifdef CONFIG_6LOWPAN_DEBUGFS
int lowpan_dev_debugfs_init(struct net_device *dev);
void lowpan_dev_debugfs_exit(struct net_device *dev);
diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile
index e44f3bf..12d131a 100644
--- a/net/6lowpan/Makefile
+++ b/net/6lowpan/Makefile
@@ -1,6 +1,6 @@
obj-$(CONFIG_6LOWPAN) += 6lowpan.o
-6lowpan-y := core.o iphc.o nhc.o
+6lowpan-y := core.o iphc.o nhc.o ndisc.o
6lowpan-$(CONFIG_6LOWPAN_DEBUGFS) += debugfs.o
#rfc6282 nhcs
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index 7a240b3..5945f7e 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <net/6lowpan.h>
+#include <net/addrconf.h>
#include "6lowpan_i.h"
@@ -33,6 +34,8 @@ int lowpan_register_netdevice(struct net_device *dev,
for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
lowpan_dev(dev)->ctx.table[i].id = i;
+ dev->ndisc_ops = &lowpan_ndisc_ops;
+
ret = register_netdevice(dev);
if (ret < 0)
return ret;
@@ -72,16 +75,61 @@ void lowpan_unregister_netdev(struct net_device *dev)
}
EXPORT_SYMBOL(lowpan_unregister_netdev);
+int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev)
+{
+ struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+ /* Set short_addr autoconfiguration if short_addr is present only */
+ if (!lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr))
+ return -1;
+
+ /* For either address format, all zero addresses MUST NOT be used */
+ if (wpan_dev->pan_id == cpu_to_le16(0x0000) &&
+ wpan_dev->short_addr == cpu_to_le16(0x0000))
+ return -1;
+
+ /* Alternatively, if no PAN ID is known, 16 zero bits may be used */
+ if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST))
+ memset(eui, 0, 2);
+ else
+ ieee802154_le16_to_be16(eui, &wpan_dev->pan_id);
+
+ /* The "Universal/Local" (U/L) bit shall be set to zero */
+ eui[0] &= ~2;
+ eui[2] = 0;
+ eui[3] = 0xFF;
+ eui[4] = 0xFE;
+ eui[5] = 0;
+ ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr);
+ return 0;
+}
+
static int lowpan_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct inet6_dev *idev;
+ struct in6_addr addr;
int i;
if (dev->type != ARPHRD_6LOWPAN)
return NOTIFY_DONE;
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ return NOTIFY_DONE;
+
switch (event) {
+ case NETDEV_UP:
+ case NETDEV_CHANGE:
+ /* (802.15.4 6LoWPAN short address slaac handling */
+ if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) &&
+ addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) {
+ __ipv6_addr_set_half(&addr.s6_addr32[0],
+ htonl(0xFE800000), 0);
+ addrconf_add_linklocal(idev, &addr, 0);
+ }
+ break;
case NETDEV_DOWN:
for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE,
@@ -112,8 +160,6 @@ static int __init lowpan_module_init(void)
return ret;
}
- request_module_nowait("ipv6");
-
request_module_nowait("nhc_dest");
request_module_nowait("nhc_fragment");
request_module_nowait("nhc_hop");
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c
index acbaa3d..24915e0 100644
--- a/net/6lowpan/debugfs.c
+++ b/net/6lowpan/debugfs.c
@@ -245,6 +245,41 @@ static const struct file_operations lowpan_context_fops = {
.release = single_release,
};
+static int lowpan_short_addr_get(void *data, u64 *val)
+{
+ struct wpan_dev *wdev = data;
+
+ rtnl_lock();
+ *val = le16_to_cpu(wdev->short_addr);
+ rtnl_unlock();
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(lowpan_short_addr_fops, lowpan_short_addr_get,
+ NULL, "0x%04llx\n");
+
+static int lowpan_dev_debugfs_802154_init(const struct net_device *dev,
+ struct lowpan_dev *ldev)
+{
+ struct dentry *dentry, *root;
+
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return 0;
+
+ root = debugfs_create_dir("ieee802154", ldev->iface_debugfs);
+ if (!root)
+ return -EINVAL;
+
+ dentry = debugfs_create_file("short_addr", 0444, root,
+ lowpan_802154_dev(dev)->wdev->ieee802154_ptr,
+ &lowpan_short_addr_fops);
+ if (!dentry)
+ return -EINVAL;
+
+ return 0;
+}
+
int lowpan_dev_debugfs_init(struct net_device *dev)
{
struct lowpan_dev *ldev = lowpan_dev(dev);
@@ -272,6 +307,10 @@ int lowpan_dev_debugfs_init(struct net_device *dev)
goto remove_root;
}
+ ret = lowpan_dev_debugfs_802154_init(dev, ldev);
+ if (ret < 0)
+ goto remove_root;
+
return 0;
remove_root:
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 8501dd5..79f1fa2 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -761,22 +761,75 @@ static const u8 lowpan_iphc_dam_to_sam_value[] = {
[LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11,
};
-static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr,
+static inline bool
+lowpan_iphc_compress_ctx_802154_lladdr(const struct in6_addr *ipaddr,
+ const struct lowpan_iphc_ctx *ctx,
+ const void *lladdr)
+{
+ const struct ieee802154_addr *addr = lladdr;
+ unsigned char extended_addr[EUI64_ADDR_LEN];
+ bool lladdr_compress = false;
+ struct in6_addr tmp = {};
+
+ switch (addr->mode) {
+ case IEEE802154_ADDR_LONG:
+ ieee802154_le64_to_be64(&extended_addr, &addr->extended_addr);
+ /* check for SAM/DAM = 11 */
+ memcpy(&tmp.s6_addr[8], &extended_addr, EUI64_ADDR_LEN);
+ /* second bit-flip (Universe/Local) is done according RFC2464 */
+ tmp.s6_addr[8] ^= 0x02;
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr))
+ lladdr_compress = true;
+ break;
+ case IEEE802154_ADDR_SHORT:
+ tmp.s6_addr[11] = 0xFF;
+ tmp.s6_addr[12] = 0xFE;
+ ieee802154_le16_to_be16(&tmp.s6_addr16[7],
+ &addr->short_addr);
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr))
+ lladdr_compress = true;
+ break;
+ default:
+ /* should never handled and filtered by 802154 6lowpan */
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return lladdr_compress;
+}
+
+static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct net_device *dev,
+ const struct in6_addr *ipaddr,
const struct lowpan_iphc_ctx *ctx,
const unsigned char *lladdr, bool sam)
{
struct in6_addr tmp = {};
u8 dam;
- /* check for SAM/DAM = 11 */
- memcpy(&tmp.s6_addr[8], lladdr, 8);
- /* second bit-flip (Universe/Local) is done according RFC2464 */
- tmp.s6_addr[8] ^= 0x02;
- /* context information are always used */
- ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
- if (ipv6_addr_equal(&tmp, ipaddr)) {
- dam = LOWPAN_IPHC_DAM_11;
- goto out;
+ switch (lowpan_dev(dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ if (lowpan_iphc_compress_ctx_802154_lladdr(ipaddr, ctx,
+ lladdr)) {
+ dam = LOWPAN_IPHC_DAM_11;
+ goto out;
+ }
+ break;
+ default:
+ /* check for SAM/DAM = 11 */
+ memcpy(&tmp.s6_addr[8], lladdr, EUI64_ADDR_LEN);
+ /* second bit-flip (Universe/Local) is done according RFC2464 */
+ tmp.s6_addr[8] ^= 0x02;
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr)) {
+ dam = LOWPAN_IPHC_DAM_11;
+ goto out;
+ }
+ break;
}
memset(&tmp, 0, sizeof(tmp));
@@ -813,28 +866,85 @@ out:
return dam;
}
-static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct in6_addr *ipaddr,
+static inline bool
+lowpan_iphc_compress_802154_lladdr(const struct in6_addr *ipaddr,
+ const void *lladdr)
+{
+ const struct ieee802154_addr *addr = lladdr;
+ unsigned char extended_addr[EUI64_ADDR_LEN];
+ bool lladdr_compress = false;
+ struct in6_addr tmp = {};
+
+ switch (addr->mode) {
+ case IEEE802154_ADDR_LONG:
+ ieee802154_le64_to_be64(&extended_addr, &addr->extended_addr);
+ if (is_addr_mac_addr_based(ipaddr, extended_addr))
+ lladdr_compress = true;
+ break;
+ case IEEE802154_ADDR_SHORT:
+ /* fe:80::ff:fe00:XXXX
+ * \__/
+ * short_addr
+ *
+ * Universe/Local bit is zero.
+ */
+ tmp.s6_addr[0] = 0xFE;
+ tmp.s6_addr[1] = 0x80;
+ tmp.s6_addr[11] = 0xFF;
+ tmp.s6_addr[12] = 0xFE;
+ ieee802154_le16_to_be16(&tmp.s6_addr16[7],
+ &addr->short_addr);
+ if (ipv6_addr_equal(&tmp, ipaddr))
+ lladdr_compress = true;
+ break;
+ default:
+ /* should never handled and filtered by 802154 6lowpan */
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return lladdr_compress;
+}
+
+static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct net_device *dev,
+ const struct in6_addr *ipaddr,
const unsigned char *lladdr, bool sam)
{
- u8 dam = LOWPAN_IPHC_DAM_00;
+ u8 dam = LOWPAN_IPHC_DAM_01;
- if (is_addr_mac_addr_based(ipaddr, lladdr)) {
- dam = LOWPAN_IPHC_DAM_11; /* 0-bits */
- pr_debug("address compression 0 bits\n");
- } else if (lowpan_is_iid_16_bit_compressable(ipaddr)) {
+ switch (lowpan_dev(dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ if (lowpan_iphc_compress_802154_lladdr(ipaddr, lladdr)) {
+ dam = LOWPAN_IPHC_DAM_11; /* 0-bits */
+ pr_debug("address compression 0 bits\n");
+ goto out;
+ }
+ break;
+ default:
+ if (is_addr_mac_addr_based(ipaddr, lladdr)) {
+ dam = LOWPAN_IPHC_DAM_11; /* 0-bits */
+ pr_debug("address compression 0 bits\n");
+ goto out;
+ }
+ break;
+ }
+
+ if (lowpan_is_iid_16_bit_compressable(ipaddr)) {
/* compress IID to 16 bits xxxx::XXXX */
lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[7], 2);
dam = LOWPAN_IPHC_DAM_10; /* 16-bits */
raw_dump_inline(NULL, "Compressed ipv6 addr is (16 bits)",
*hc_ptr - 2, 2);
- } else {
- /* do not compress IID => xxxx::IID */
- lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8);
- dam = LOWPAN_IPHC_DAM_01; /* 64-bits */
- raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)",
- *hc_ptr - 8, 8);
+ goto out;
}
+ /* do not compress IID => xxxx::IID */
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8);
+ raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)",
+ *hc_ptr - 8, 8);
+
+out:
+
if (sam)
return lowpan_iphc_dam_to_sam_value[dam];
else
@@ -1013,9 +1123,6 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
iphc0 = LOWPAN_DISPATCH_IPHC;
iphc1 = 0;
- raw_dump_inline(__func__, "saddr", saddr, EUI64_ADDR_LEN);
- raw_dump_inline(__func__, "daddr", daddr, EUI64_ADDR_LEN);
-
raw_dump_table(__func__, "sending raw skb network uncompressed packet",
skb->data, skb->len);
@@ -1088,14 +1195,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
iphc1 |= LOWPAN_IPHC_SAC;
} else {
if (sci) {
- iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->saddr,
+ iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, dev,
+ &hdr->saddr,
&sci_entry, saddr,
true);
iphc1 |= LOWPAN_IPHC_SAC;
} else {
if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL &&
lowpan_is_linklocal_zero_padded(hdr->saddr)) {
- iphc1 |= lowpan_compress_addr_64(&hc_ptr,
+ iphc1 |= lowpan_compress_addr_64(&hc_ptr, dev,
&hdr->saddr,
saddr, true);
pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n",
@@ -1123,14 +1231,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
}
} else {
if (dci) {
- iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->daddr,
+ iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, dev,
+ &hdr->daddr,
&dci_entry, daddr,
false);
iphc1 |= LOWPAN_IPHC_DAC;
} else {
if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL &&
lowpan_is_linklocal_zero_padded(hdr->daddr)) {
- iphc1 |= lowpan_compress_addr_64(&hc_ptr,
+ iphc1 |= lowpan_compress_addr_64(&hc_ptr, dev,
&hdr->daddr,
daddr, false);
pr_debug("dest address unicast link-local %pI6c iphc1 0x%02x\n",
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c
new file mode 100644
index 0000000..ae1d419
--- /dev/null
+++ b/net/6lowpan/ndisc.c
@@ -0,0 +1,234 @@
+/* This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors:
+ * (C) 2016 Pengutronix, Alexander Aring <aar@pengutronix.de>
+ */
+
+#include <net/6lowpan.h>
+#include <net/addrconf.h>
+#include <net/ndisc.h>
+
+#include "6lowpan_i.h"
+
+static int lowpan_ndisc_is_useropt(u8 nd_opt_type)
+{
+ return nd_opt_type == ND_OPT_6CO;
+}
+
+#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
+#define NDISC_802154_SHORT_ADDR_LENGTH 1
+static int lowpan_ndisc_parse_802154_options(const struct net_device *dev,
+ struct nd_opt_hdr *nd_opt,
+ struct ndisc_options *ndopts)
+{
+ switch (nd_opt->nd_opt_len) {
+ case NDISC_802154_SHORT_ADDR_LENGTH:
+ if (ndopts->nd_802154_opt_array[nd_opt->nd_opt_type])
+ ND_PRINTK(2, warn,
+ "%s: duplicated short addr ND6 option found: type=%d\n",
+ __func__, nd_opt->nd_opt_type);
+ else
+ ndopts->nd_802154_opt_array[nd_opt->nd_opt_type] = nd_opt;
+ return 1;
+ default:
+ /* all others will be handled by ndisc IPv6 option parsing */
+ return 0;
+ }
+}
+
+static int lowpan_ndisc_parse_options(const struct net_device *dev,
+ struct nd_opt_hdr *nd_opt,
+ struct ndisc_options *ndopts)
+{
+ switch (nd_opt->nd_opt_type) {
+ case ND_OPT_SOURCE_LL_ADDR:
+ case ND_OPT_TARGET_LL_ADDR:
+ return lowpan_ndisc_parse_802154_options(dev, nd_opt, ndopts);
+ default:
+ return 0;
+ }
+}
+
+static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags,
+ u8 icmp6_type,
+ const struct ndisc_options *ndopts)
+{
+ struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n));
+ u8 *lladdr_short = NULL;
+
+ switch (icmp6_type) {
+ case NDISC_ROUTER_SOLICITATION:
+ case NDISC_ROUTER_ADVERTISEMENT:
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ if (ndopts->nd_802154_opts_src_lladdr) {
+ lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_src_lladdr,
+ IEEE802154_SHORT_ADDR_LEN, 0);
+ if (!lladdr_short) {
+ ND_PRINTK(2, warn,
+ "NA: invalid short link-layer address length\n");
+ return;
+ }
+ }
+ break;
+ case NDISC_REDIRECT:
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ if (ndopts->nd_802154_opts_tgt_lladdr) {
+ lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_tgt_lladdr,
+ IEEE802154_SHORT_ADDR_LEN, 0);
+ if (!lladdr_short) {
+ ND_PRINTK(2, warn,
+ "NA: invalid short link-layer address length\n");
+ return;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+
+ write_lock_bh(&n->lock);
+ if (lladdr_short)
+ ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short);
+ else
+ neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+ write_unlock_bh(&n->lock);
+}
+
+static void lowpan_ndisc_update(const struct net_device *dev,
+ struct neighbour *n, u32 flags, u8 icmp6_type,
+ const struct ndisc_options *ndopts)
+{
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return;
+
+ /* react on overrides only. TODO check if this is really right. */
+ if (flags & NEIGH_UPDATE_F_OVERRIDE)
+ lowpan_ndisc_802154_update(n, flags, icmp6_type, ndopts);
+}
+
+static int lowpan_ndisc_opt_addr_space(const struct net_device *dev,
+ u8 icmp6_type, struct neighbour *neigh,
+ u8 *ha_buf, u8 **ha)
+{
+ struct lowpan_802154_neigh *n;
+ struct wpan_dev *wpan_dev;
+ int addr_space = 0;
+
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return 0;
+
+ switch (icmp6_type) {
+ case NDISC_REDIRECT:
+ n = lowpan_802154_neigh(neighbour_priv(neigh));
+
+ read_lock_bh(&neigh->lock);
+ if (lowpan_802154_is_valid_src_short_addr(n->short_addr)) {
+ memcpy(ha_buf, &n->short_addr,
+ IEEE802154_SHORT_ADDR_LEN);
+ read_unlock_bh(&neigh->lock);
+ addr_space += __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0);
+ *ha = ha_buf;
+ }
+ read_unlock_bh(&neigh->lock);
+ break;
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ case NDISC_ROUTER_SOLICITATION:
+ wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+ if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr))
+ addr_space = __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0);
+ break;
+ default:
+ break;
+ }
+
+ return addr_space;
+}
+
+static void lowpan_ndisc_fill_addr_option(const struct net_device *dev,
+ struct sk_buff *skb, u8 icmp6_type,
+ const u8 *ha)
+{
+ struct wpan_dev *wpan_dev;
+ __be16 short_addr;
+ u8 opt_type;
+
+ if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
+ return;
+
+ switch (icmp6_type) {
+ case NDISC_REDIRECT:
+ if (ha) {
+ ieee802154_le16_to_be16(&short_addr, ha);
+ __ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
+ &short_addr,
+ IEEE802154_SHORT_ADDR_LEN, 0);
+ }
+ return;
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ opt_type = ND_OPT_TARGET_LL_ADDR;
+ break;
+ case NDISC_ROUTER_SOLICITATION:
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ opt_type = ND_OPT_SOURCE_LL_ADDR;
+ break;
+ default:
+ return;
+ }
+
+ wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+ if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) {
+ ieee802154_le16_to_be16(&short_addr,
+ &wpan_dev->short_addr);
+ __ndisc_fill_addr_option(skb, opt_type, &short_addr,
+ IEEE802154_SHORT_ADDR_LEN, 0);
+ }
+}
+
+static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net,
+ struct net_device *dev,
+ const struct prefix_info *pinfo,
+ struct inet6_dev *in6_dev,
+ struct in6_addr *addr,
+ int addr_type, u32 addr_flags,
+ bool sllao, bool tokenized,
+ __u32 valid_lft,
+ u32 prefered_lft,
+ bool dev_addr_generated)
+{
+ int err;
+
+ /* generates short based address for RA PIO's */
+ if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && dev_addr_generated &&
+ !addrconf_ifid_802154_6lowpan(addr->s6_addr + 8, dev)) {
+ err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev,
+ addr, addr_type, addr_flags,
+ sllao, tokenized, valid_lft,
+ prefered_lft);
+ if (err)
+ ND_PRINTK(2, warn,
+ "RA: could not add a short address based address for prefix: %pI6c\n",
+ &pinfo->prefix);
+ }
+}
+#endif
+
+const struct ndisc_ops lowpan_ndisc_ops = {
+ .is_useropt = lowpan_ndisc_is_useropt,
+#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
+ .parse_options = lowpan_ndisc_parse_options,
+ .update = lowpan_ndisc_update,
+ .opt_addr_space = lowpan_ndisc_opt_addr_space,
+ .fill_addr_option = lowpan_ndisc_fill_addr_option,
+ .prefix_rcv_add_addr = lowpan_ndisc_prefix_rcv_add_addr,
+#endif
+};
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index fbd0acf..2fdebab 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -976,7 +976,8 @@ static int ax25_release(struct socket *sock)
release_sock(sk);
ax25_disconnect(ax25, 0);
lock_sock(sk);
- ax25_destroy_socket(ax25);
+ if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_destroy_socket(ax25);
break;
case AX25_STATE_3:
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 951cd57..5237dff 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -102,6 +102,7 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25)
switch (ax25->state) {
case AX25_STATE_0:
+ case AX25_STATE_2:
/* Magic here: If we listen() and a new link dies before it
is accepted() it isn't 'dead' so doesn't get removed. */
if (!sk || sock_flag(sk, SOCK_DESTROY) ||
@@ -111,6 +112,7 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25)
sock_hold(sk);
ax25_destroy_socket(ax25);
bh_unlock_sock(sk);
+ /* Ungrab socket and destroy it */
sock_put(sk);
} else
ax25_destroy_socket(ax25);
@@ -213,7 +215,8 @@ void ax25_ds_t1_timeout(ax25_cb *ax25)
case AX25_STATE_2:
if (ax25->n2count == ax25->n2) {
ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
- ax25_disconnect(ax25, ETIMEDOUT);
+ if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_disconnect(ax25, ETIMEDOUT);
return;
} else {
ax25->n2count++;
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
index 004467c9..2c0d6ef 100644
--- a/net/ax25/ax25_std_timer.c
+++ b/net/ax25/ax25_std_timer.c
@@ -38,6 +38,7 @@ void ax25_std_heartbeat_expiry(ax25_cb *ax25)
switch (ax25->state) {
case AX25_STATE_0:
+ case AX25_STATE_2:
/* Magic here: If we listen() and a new link dies before it
is accepted() it isn't 'dead' so doesn't get removed. */
if (!sk || sock_flag(sk, SOCK_DESTROY) ||
@@ -47,6 +48,7 @@ void ax25_std_heartbeat_expiry(ax25_cb *ax25)
sock_hold(sk);
ax25_destroy_socket(ax25);
bh_unlock_sock(sk);
+ /* Ungrab socket and destroy it */
sock_put(sk);
} else
ax25_destroy_socket(ax25);
@@ -144,7 +146,8 @@ void ax25_std_t1timer_expiry(ax25_cb *ax25)
case AX25_STATE_2:
if (ax25->n2count == ax25->n2) {
ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
- ax25_disconnect(ax25, ETIMEDOUT);
+ if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_disconnect(ax25, ETIMEDOUT);
return;
} else {
ax25->n2count++;
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index 3b78e84..655a7d4 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -264,7 +264,8 @@ void ax25_disconnect(ax25_cb *ax25, int reason)
{
ax25_clear_queues(ax25);
- ax25_stop_heartbeat(ax25);
+ if (!sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_stop_heartbeat(ax25);
ax25_stop_t1timer(ax25);
ax25_stop_t2timer(ax25);
ax25_stop_t3timer(ax25);
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 0f8c0dd..af8e119 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -387,6 +387,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
if (skb_cow(skb, ETH_HLEN) < 0)
goto out;
+ ethhdr = eth_hdr(skb);
icmph = (struct batadv_icmp_header *)skb->data;
icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmph;
if (icmp_packet_rr->rr_cur >= BATADV_RR_LEN)
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 18b6d07..7527c06 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1040,7 +1040,9 @@ void batadv_softif_destroy_sysfs(struct net_device *soft_iface)
static void batadv_softif_destroy_netlink(struct net_device *soft_iface,
struct list_head *head)
{
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
struct batadv_hard_iface *hard_iface;
+ struct batadv_softif_vlan *vlan;
list_for_each_entry(hard_iface, &batadv_hardif_list, list) {
if (hard_iface->soft_iface == soft_iface)
@@ -1048,6 +1050,13 @@ static void batadv_softif_destroy_netlink(struct net_device *soft_iface,
BATADV_IF_CLEANUP_KEEP);
}
+ /* destroy the "untagged" VLAN */
+ vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+ if (vlan) {
+ batadv_softif_destroy_vlan(bat_priv, vlan);
+ batadv_softif_vlan_put(vlan);
+ }
+
batadv_sysfs_del_meshif(soft_iface);
unregister_netdevice_queue(soft_iface, head);
}
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 8bb82a3..7e6df7a 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -652,8 +652,10 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
/* increase the refcounter of the related vlan */
vlan = batadv_softif_vlan_get(bat_priv, vid);
- if (WARN(!vlan, "adding TT local entry %pM to non-existent VLAN %d",
- addr, BATADV_PRINT_VID(vid))) {
+ if (!vlan) {
+ net_ratelimited_function(batadv_info, soft_iface,
+ "adding TT local entry %pM to non-existent VLAN %d\n",
+ addr, BATADV_PRINT_VID(vid));
kfree(tt_local);
tt_local = NULL;
goto out;
@@ -693,7 +695,6 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
if (unlikely(hash_added != 0)) {
/* remove the reference for the hash */
batadv_tt_local_entry_put(tt_local);
- batadv_softif_vlan_put(vlan);
goto out;
}
@@ -2269,6 +2270,29 @@ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv,
return crc;
}
+/**
+ * batadv_tt_req_node_release - free tt_req node entry
+ * @ref: kref pointer of the tt req_node entry
+ */
+static void batadv_tt_req_node_release(struct kref *ref)
+{
+ struct batadv_tt_req_node *tt_req_node;
+
+ tt_req_node = container_of(ref, struct batadv_tt_req_node, refcount);
+
+ kfree(tt_req_node);
+}
+
+/**
+ * batadv_tt_req_node_put - decrement the tt_req_node refcounter and
+ * possibly release it
+ * @tt_req_node: tt_req_node to be free'd
+ */
+static void batadv_tt_req_node_put(struct batadv_tt_req_node *tt_req_node)
+{
+ kref_put(&tt_req_node->refcount, batadv_tt_req_node_release);
+}
+
static void batadv_tt_req_list_free(struct batadv_priv *bat_priv)
{
struct batadv_tt_req_node *node;
@@ -2278,7 +2302,7 @@ static void batadv_tt_req_list_free(struct batadv_priv *bat_priv)
hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
hlist_del_init(&node->list);
- kfree(node);
+ batadv_tt_req_node_put(node);
}
spin_unlock_bh(&bat_priv->tt.req_list_lock);
@@ -2315,7 +2339,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv)
if (batadv_has_timed_out(node->issued_at,
BATADV_TT_REQUEST_TIMEOUT)) {
hlist_del_init(&node->list);
- kfree(node);
+ batadv_tt_req_node_put(node);
}
}
spin_unlock_bh(&bat_priv->tt.req_list_lock);
@@ -2347,9 +2371,11 @@ batadv_tt_req_node_new(struct batadv_priv *bat_priv,
if (!tt_req_node)
goto unlock;
+ kref_init(&tt_req_node->refcount);
ether_addr_copy(tt_req_node->addr, orig_node->orig);
tt_req_node->issued_at = jiffies;
+ kref_get(&tt_req_node->refcount);
hlist_add_head(&tt_req_node->list, &bat_priv->tt.req_list);
unlock:
spin_unlock_bh(&bat_priv->tt.req_list_lock);
@@ -2613,13 +2639,19 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv,
out:
if (primary_if)
batadv_hardif_put(primary_if);
+
if (ret && tt_req_node) {
spin_lock_bh(&bat_priv->tt.req_list_lock);
- /* hlist_del_init() verifies tt_req_node still is in the list */
- hlist_del_init(&tt_req_node->list);
+ if (!hlist_unhashed(&tt_req_node->list)) {
+ hlist_del_init(&tt_req_node->list);
+ batadv_tt_req_node_put(tt_req_node);
+ }
spin_unlock_bh(&bat_priv->tt.req_list_lock);
- kfree(tt_req_node);
}
+
+ if (tt_req_node)
+ batadv_tt_req_node_put(tt_req_node);
+
kfree(tvlv_tt_data);
return ret;
}
@@ -3055,7 +3087,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
if (!batadv_compare_eth(node->addr, resp_src))
continue;
hlist_del_init(&node->list);
- kfree(node);
+ batadv_tt_req_node_put(node);
}
spin_unlock_bh(&bat_priv->tt.req_list_lock);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 4d6a7ce..43db7b6 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1266,11 +1266,13 @@ struct batadv_tt_change_node {
* struct batadv_tt_req_node - data to keep track of the tt requests in flight
* @addr: mac address address of the originator this request was sent to
* @issued_at: timestamp used for purging stale tt requests
+ * @refcount: number of contexts the object is used by
* @list: list node for batadv_priv_tt::req_list
*/
struct batadv_tt_req_node {
u8 addr[ETH_ALEN];
unsigned long issued_at;
+ struct kref refcount;
struct hlist_node list;
};
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 2c8095a..0c39e0f 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -104,8 +104,16 @@ static int br_dev_init(struct net_device *dev)
return -ENOMEM;
err = br_vlan_init(br);
- if (err)
+ if (err) {
free_percpu(br->stats);
+ return err;
+ }
+
+ err = br_multicast_init_stats(br);
+ if (err) {
+ free_percpu(br->stats);
+ br_vlan_flush(br);
+ }
br_set_lockdep_class(dev);
return err;
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index f47759f..6c19603 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -198,8 +198,10 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb,
struct sk_buff *skb),
bool unicast)
{
- struct net_bridge_port *p;
+ u8 igmp_type = br_multicast_igmp_type(skb);
+ __be16 proto = skb->protocol;
struct net_bridge_port *prev;
+ struct net_bridge_port *p;
prev = NULL;
@@ -218,6 +220,9 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb,
prev = maybe_deliver(prev, p, skb, __packet_hook);
if (IS_ERR(prev))
goto out;
+ if (prev == p)
+ br_multicast_count(p->br, p, proto, igmp_type,
+ BR_MCAST_DIR_TX);
}
if (!prev)
@@ -257,9 +262,12 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
struct sk_buff *skb))
{
struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+ u8 igmp_type = br_multicast_igmp_type(skb);
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_port *prev = NULL;
struct net_bridge_port_group *p;
+ __be16 proto = skb->protocol;
+
struct hlist_node *rp;
rp = rcu_dereference(hlist_first_rcu(&br->router_list));
@@ -277,6 +285,9 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
prev = maybe_deliver(prev, port, skb, __packet_hook);
if (IS_ERR(prev))
goto out;
+ if (prev == port)
+ br_multicast_count(port->br, port, proto, igmp_type,
+ BR_MCAST_DIR_TX);
if ((unsigned long)lport >= (unsigned long)port)
p = rcu_dereference(p->next);
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8217aec..f2fede0 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -345,8 +345,8 @@ static int find_portno(struct net_bridge *br)
static struct net_bridge_port *new_nbp(struct net_bridge *br,
struct net_device *dev)
{
- int index;
struct net_bridge_port *p;
+ int index, err;
index = find_portno(br);
if (index < 0)
@@ -366,7 +366,12 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
br_init_port(p);
br_set_state(p, BR_STATE_DISABLED);
br_stp_port_timer_init(p);
- br_multicast_add_port(p);
+ err = br_multicast_add_port(p);
+ if (err) {
+ dev_put(dev);
+ kfree(p);
+ p = ERR_PTR(err);
+ }
return p;
}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 1607977..786602b 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -60,6 +60,9 @@ static int br_pass_frame_up(struct sk_buff *skb)
skb = br_handle_vlan(br, vg, skb);
if (!skb)
return NET_RX_DROP;
+ /* update the multicast stats if the packet is IGMP/MLD */
+ br_multicast_count(br, NULL, skb->protocol, br_multicast_igmp_type(skb),
+ BR_MCAST_DIR_TX);
return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
dev_net(indev), NULL, skb, indev, NULL,
@@ -213,8 +216,7 @@ drop:
}
EXPORT_SYMBOL_GPL(br_handle_frame_finish);
-/* note: already called with rcu_read_lock */
-static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static void __br_handle_local_finish(struct sk_buff *skb)
{
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
u16 vid = 0;
@@ -222,6 +224,14 @@ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_bu
/* check if vlan is allowed, to avoid spoofing */
if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid))
br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
+}
+
+/* note: already called with rcu_read_lock */
+static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct net_bridge_port *p = br_port_get_rcu(skb->dev);
+
+ __br_handle_local_finish(skb);
BR_INPUT_SKB_CB(skb)->brdev = p->br->dev;
br_pass_frame_up(skb);
@@ -274,7 +284,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
if (p->br->stp_enabled == BR_NO_STP ||
fwd_mask & (1u << dest[5]))
goto forward;
- break;
+ *pskb = skb;
+ __br_handle_local_finish(skb);
+ return RX_HANDLER_PASS;
case 0x01: /* IEEE MAC (Pause) */
goto drop;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 6852f3c..e405eef 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -361,7 +361,8 @@ out:
}
static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
- __be32 group)
+ __be32 group,
+ u8 *igmp_type)
{
struct sk_buff *skb;
struct igmphdr *ih;
@@ -411,6 +412,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
skb_set_transport_header(skb, skb->len);
ih = igmp_hdr(skb);
+ *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY;
ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
ih->code = (group ? br->multicast_last_member_interval :
br->multicast_query_response_interval) /
@@ -428,7 +430,8 @@ out:
#if IS_ENABLED(CONFIG_IPV6)
static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
- const struct in6_addr *group)
+ const struct in6_addr *grp,
+ u8 *igmp_type)
{
struct sk_buff *skb;
struct ipv6hdr *ip6h;
@@ -464,8 +467,11 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
&ip6h->saddr)) {
kfree_skb(skb);
+ br->has_ipv6_addr = 0;
return NULL;
}
+
+ br->has_ipv6_addr = 1;
ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
hopopt = (u8 *)(ip6h + 1);
@@ -484,16 +490,17 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
skb_set_transport_header(skb, skb->len);
mldq = (struct mld_msg *) icmp6_hdr(skb);
- interval = ipv6_addr_any(group) ?
+ interval = ipv6_addr_any(grp) ?
br->multicast_query_response_interval :
br->multicast_last_member_interval;
+ *igmp_type = ICMPV6_MGM_QUERY;
mldq->mld_type = ICMPV6_MGM_QUERY;
mldq->mld_code = 0;
mldq->mld_cksum = 0;
mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
mldq->mld_reserved = 0;
- mldq->mld_mca = *group;
+ mldq->mld_mca = *grp;
/* checksum */
mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
@@ -510,14 +517,16 @@ out:
#endif
static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br,
- struct br_ip *addr)
+ struct br_ip *addr,
+ u8 *igmp_type)
{
switch (addr->proto) {
case htons(ETH_P_IP):
- return br_ip4_multicast_alloc_query(br, addr->u.ip4);
+ return br_ip4_multicast_alloc_query(br, addr->u.ip4, igmp_type);
#if IS_ENABLED(CONFIG_IPV6)
case htons(ETH_P_IPV6):
- return br_ip6_multicast_alloc_query(br, &addr->u.ip6);
+ return br_ip6_multicast_alloc_query(br, &addr->u.ip6,
+ igmp_type);
#endif
}
return NULL;
@@ -826,18 +835,23 @@ static void __br_multicast_send_query(struct net_bridge *br,
struct br_ip *ip)
{
struct sk_buff *skb;
+ u8 igmp_type;
- skb = br_multicast_alloc_query(br, ip);
+ skb = br_multicast_alloc_query(br, ip, &igmp_type);
if (!skb)
return;
if (port) {
skb->dev = port->dev;
+ br_multicast_count(br, port, skb->protocol, igmp_type,
+ BR_MCAST_DIR_TX);
NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
dev_net(port->dev), NULL, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
} else {
br_multicast_select_own_querier(br, ip, skb);
+ br_multicast_count(br, port, skb->protocol, igmp_type,
+ BR_MCAST_DIR_RX);
netif_rx(skb);
}
}
@@ -915,7 +929,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data)
}
#endif
-void br_multicast_add_port(struct net_bridge_port *port)
+int br_multicast_add_port(struct net_bridge_port *port)
{
port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
@@ -927,6 +941,11 @@ void br_multicast_add_port(struct net_bridge_port *port)
setup_timer(&port->ip6_own_query.timer,
br_ip6_multicast_port_query_expired, (unsigned long)port);
#endif
+ port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
+ if (!port->mcast_stats)
+ return -ENOMEM;
+
+ return 0;
}
void br_multicast_del_port(struct net_bridge_port *port)
@@ -941,6 +960,7 @@ void br_multicast_del_port(struct net_bridge_port *port)
br_multicast_del_pg(br, pg);
spin_unlock_bh(&br->multicast_lock);
del_timer_sync(&port->multicast_router_timer);
+ free_percpu(port->mcast_stats);
}
static void br_multicast_enable(struct bridge_mcast_own_query *query)
@@ -1580,6 +1600,39 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
}
#endif
+static void br_multicast_err_count(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ __be16 proto)
+{
+ struct bridge_mcast_stats __percpu *stats;
+ struct bridge_mcast_stats *pstats;
+
+ if (!br->multicast_stats_enabled)
+ return;
+
+ if (p)
+ stats = p->mcast_stats;
+ else
+ stats = br->mcast_stats;
+ if (WARN_ON(!stats))
+ return;
+
+ pstats = this_cpu_ptr(stats);
+
+ u64_stats_update_begin(&pstats->syncp);
+ switch (proto) {
+ case htons(ETH_P_IP):
+ pstats->mstats.igmp_parse_errors++;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ pstats->mstats.mld_parse_errors++;
+ break;
+#endif
+ }
+ u64_stats_update_end(&pstats->syncp);
+}
+
static int br_multicast_ipv4_rcv(struct net_bridge *br,
struct net_bridge_port *port,
struct sk_buff *skb,
@@ -1596,11 +1649,12 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
return 0;
} else if (err < 0) {
+ br_multicast_err_count(br, port, skb->protocol);
return err;
}
- BR_INPUT_SKB_CB(skb)->igmp = 1;
ih = igmp_hdr(skb);
+ BR_INPUT_SKB_CB(skb)->igmp = ih->type;
switch (ih->type) {
case IGMP_HOST_MEMBERSHIP_REPORT:
@@ -1622,6 +1676,9 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
if (skb_trimmed && skb_trimmed != skb)
kfree_skb(skb_trimmed);
+ br_multicast_count(br, port, skb->protocol, BR_INPUT_SKB_CB(skb)->igmp,
+ BR_MCAST_DIR_RX);
+
return err;
}
@@ -1642,11 +1699,12 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
return 0;
} else if (err < 0) {
+ br_multicast_err_count(br, port, skb->protocol);
return err;
}
- BR_INPUT_SKB_CB(skb)->igmp = 1;
mld = (struct mld_msg *)skb_transport_header(skb);
+ BR_INPUT_SKB_CB(skb)->igmp = mld->mld_type;
switch (mld->mld_type) {
case ICMPV6_MGM_REPORT:
@@ -1667,6 +1725,9 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
if (skb_trimmed && skb_trimmed != skb)
kfree_skb(skb_trimmed);
+ br_multicast_count(br, port, skb->protocol, BR_INPUT_SKB_CB(skb)->igmp,
+ BR_MCAST_DIR_RX);
+
return err;
}
#endif
@@ -1674,6 +1735,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
struct sk_buff *skb, u16 vid)
{
+ int ret = 0;
+
BR_INPUT_SKB_CB(skb)->igmp = 0;
BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
@@ -1682,14 +1745,16 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
switch (skb->protocol) {
case htons(ETH_P_IP):
- return br_multicast_ipv4_rcv(br, port, skb, vid);
+ ret = br_multicast_ipv4_rcv(br, port, skb, vid);
+ break;
#if IS_ENABLED(CONFIG_IPV6)
case htons(ETH_P_IPV6):
- return br_multicast_ipv6_rcv(br, port, skb, vid);
+ ret = br_multicast_ipv6_rcv(br, port, skb, vid);
+ break;
#endif
}
- return 0;
+ return ret;
}
static void br_multicast_query_expired(struct net_bridge *br,
@@ -1745,6 +1810,7 @@ void br_multicast_init(struct net_bridge *br)
br->ip6_other_query.delay_time = 0;
br->ip6_querier.port = NULL;
#endif
+ br->has_ipv6_addr = 1;
spin_lock_init(&br->multicast_lock);
setup_timer(&br->multicast_router_timer,
@@ -1827,6 +1893,8 @@ void br_multicast_dev_del(struct net_bridge *br)
out:
spin_unlock_bh(&br->multicast_lock);
+
+ free_percpu(br->mcast_stats);
}
int br_multicast_set_router(struct net_bridge *br, unsigned long val)
@@ -2181,3 +2249,128 @@ unlock:
return ret;
}
EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent);
+
+static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats,
+ __be16 proto, u8 type, u8 dir)
+{
+ struct bridge_mcast_stats *pstats = this_cpu_ptr(stats);
+
+ u64_stats_update_begin(&pstats->syncp);
+ switch (proto) {
+ case htons(ETH_P_IP):
+ switch (type) {
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ pstats->mstats.igmp_v1reports[dir]++;
+ break;
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ pstats->mstats.igmp_v2reports[dir]++;
+ break;
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ pstats->mstats.igmp_v3reports[dir]++;
+ break;
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ pstats->mstats.igmp_queries[dir]++;
+ break;
+ case IGMP_HOST_LEAVE_MESSAGE:
+ pstats->mstats.igmp_leaves[dir]++;
+ break;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ switch (type) {
+ case ICMPV6_MGM_REPORT:
+ pstats->mstats.mld_v1reports[dir]++;
+ break;
+ case ICMPV6_MLD2_REPORT:
+ pstats->mstats.mld_v2reports[dir]++;
+ break;
+ case ICMPV6_MGM_QUERY:
+ pstats->mstats.mld_queries[dir]++;
+ break;
+ case ICMPV6_MGM_REDUCTION:
+ pstats->mstats.mld_leaves[dir]++;
+ break;
+ }
+ break;
+#endif /* CONFIG_IPV6 */
+ }
+ u64_stats_update_end(&pstats->syncp);
+}
+
+void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
+ __be16 proto, u8 type, u8 dir)
+{
+ struct bridge_mcast_stats __percpu *stats;
+
+ /* if multicast_disabled is true then igmp type can't be set */
+ if (!type || !br->multicast_stats_enabled)
+ return;
+
+ if (p)
+ stats = p->mcast_stats;
+ else
+ stats = br->mcast_stats;
+ if (WARN_ON(!stats))
+ return;
+
+ br_mcast_stats_add(stats, proto, type, dir);
+}
+
+int br_multicast_init_stats(struct net_bridge *br)
+{
+ br->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
+ if (!br->mcast_stats)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void mcast_stats_add_dir(u64 *dst, u64 *src)
+{
+ dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX];
+ dst[BR_MCAST_DIR_TX] += src[BR_MCAST_DIR_TX];
+}
+
+void br_multicast_get_stats(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct br_mcast_stats *dest)
+{
+ struct bridge_mcast_stats __percpu *stats;
+ struct br_mcast_stats tdst;
+ int i;
+
+ memset(dest, 0, sizeof(*dest));
+ if (p)
+ stats = p->mcast_stats;
+ else
+ stats = br->mcast_stats;
+ if (WARN_ON(!stats))
+ return;
+
+ memset(&tdst, 0, sizeof(tdst));
+ for_each_possible_cpu(i) {
+ struct bridge_mcast_stats *cpu_stats = per_cpu_ptr(stats, i);
+ struct br_mcast_stats temp;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ memcpy(&temp, &cpu_stats->mstats, sizeof(temp));
+ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+
+ mcast_stats_add_dir(tdst.igmp_queries, temp.igmp_queries);
+ mcast_stats_add_dir(tdst.igmp_leaves, temp.igmp_leaves);
+ mcast_stats_add_dir(tdst.igmp_v1reports, temp.igmp_v1reports);
+ mcast_stats_add_dir(tdst.igmp_v2reports, temp.igmp_v2reports);
+ mcast_stats_add_dir(tdst.igmp_v3reports, temp.igmp_v3reports);
+ tdst.igmp_parse_errors += temp.igmp_parse_errors;
+
+ mcast_stats_add_dir(tdst.mld_queries, temp.mld_queries);
+ mcast_stats_add_dir(tdst.mld_leaves, temp.mld_leaves);
+ mcast_stats_add_dir(tdst.mld_v1reports, temp.mld_v1reports);
+ mcast_stats_add_dir(tdst.mld_v2reports, temp.mld_v2reports);
+ tdst.mld_parse_errors += temp.mld_parse_errors;
+ }
+ memcpy(dest, &tdst, sizeof(*dest));
+}
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index a5343c7..f2a29e4 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -851,6 +851,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 },
[IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 },
[IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 },
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1055,6 +1056,13 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
br->multicast_startup_query_interval = clock_t_to_jiffies(val);
}
+
+ if (data[IFLA_BR_MCAST_STATS_ENABLED]) {
+ __u8 mcast_stats;
+
+ mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]);
+ br->multicast_stats_enabled = !!mcast_stats;
+ }
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
if (data[IFLA_BR_NF_CALL_IPTABLES]) {
@@ -1110,6 +1118,7 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */
nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */
nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_STATS_ENABLED */
nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */
nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */
nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */
@@ -1187,6 +1196,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR,
br->multicast_query_use_ifaddr) ||
nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED,
+ br->multicast_stats_enabled) ||
nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY,
br->hash_elasticity) ||
nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) ||
@@ -1234,7 +1245,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
return 0;
}
-static size_t br_get_linkxstats_size(const struct net_device *dev)
+static size_t bridge_get_linkxstats_size(const struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_vlan_group *vg;
@@ -1242,53 +1253,88 @@ static size_t br_get_linkxstats_size(const struct net_device *dev)
int numvls = 0;
vg = br_vlan_group(br);
- if (!vg)
- return 0;
-
- /* we need to count all, even placeholder entries */
- list_for_each_entry(v, &vg->vlan_list, vlist)
- numvls++;
+ if (vg) {
+ /* we need to count all, even placeholder entries */
+ list_for_each_entry(v, &vg->vlan_list, vlist)
+ numvls++;
+ }
- /* account for the vlans and the link xstats type nest attribute */
return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) +
+ nla_total_size(sizeof(struct br_mcast_stats)) +
nla_total_size(0);
}
-static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev,
- int *prividx)
+static size_t brport_get_linkxstats_size(const struct net_device *dev)
+{
+ return nla_total_size(sizeof(struct br_mcast_stats)) +
+ nla_total_size(0);
+}
+
+static size_t br_get_linkxstats_size(const struct net_device *dev, int attr)
+{
+ size_t retsize = 0;
+
+ switch (attr) {
+ case IFLA_STATS_LINK_XSTATS:
+ retsize = bridge_get_linkxstats_size(dev);
+ break;
+ case IFLA_STATS_LINK_XSTATS_SLAVE:
+ retsize = brport_get_linkxstats_size(dev);
+ break;
+ }
+
+ return retsize;
+}
+
+static int bridge_fill_linkxstats(struct sk_buff *skb,
+ const struct net_device *dev,
+ int *prividx)
{
struct net_bridge *br = netdev_priv(dev);
+ struct nlattr *nla __maybe_unused;
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *v;
struct nlattr *nest;
int vl_idx = 0;
- vg = br_vlan_group(br);
- if (!vg)
- goto out;
nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE);
if (!nest)
return -EMSGSIZE;
- list_for_each_entry(v, &vg->vlan_list, vlist) {
- struct bridge_vlan_xstats vxi;
- struct br_vlan_stats stats;
- if (vl_idx++ < *prividx)
- continue;
- memset(&vxi, 0, sizeof(vxi));
- vxi.vid = v->vid;
- br_vlan_get_stats(v, &stats);
- vxi.rx_bytes = stats.rx_bytes;
- vxi.rx_packets = stats.rx_packets;
- vxi.tx_bytes = stats.tx_bytes;
- vxi.tx_packets = stats.tx_packets;
-
- if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi))
+ vg = br_vlan_group(br);
+ if (vg) {
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ struct bridge_vlan_xstats vxi;
+ struct br_vlan_stats stats;
+
+ if (++vl_idx < *prividx)
+ continue;
+ memset(&vxi, 0, sizeof(vxi));
+ vxi.vid = v->vid;
+ br_vlan_get_stats(v, &stats);
+ vxi.rx_bytes = stats.rx_bytes;
+ vxi.rx_packets = stats.rx_packets;
+ vxi.tx_bytes = stats.tx_bytes;
+ vxi.tx_packets = stats.tx_packets;
+
+ if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi))
+ goto nla_put_failure;
+ }
+ }
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (++vl_idx >= *prividx) {
+ nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST,
+ sizeof(struct br_mcast_stats),
+ BRIDGE_XSTATS_PAD);
+ if (!nla)
goto nla_put_failure;
+ br_multicast_get_stats(br, NULL, nla_data(nla));
}
+#endif
nla_nest_end(skb, nest);
*prividx = 0;
-out:
+
return 0;
nla_put_failure:
@@ -1298,6 +1344,52 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int brport_fill_linkxstats(struct sk_buff *skb,
+ const struct net_device *dev,
+ int *prividx)
+{
+ struct net_bridge_port *p = br_port_get_rtnl(dev);
+ struct nlattr *nla __maybe_unused;
+ struct nlattr *nest;
+
+ if (!p)
+ return 0;
+
+ nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE);
+ if (!nest)
+ return -EMSGSIZE;
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST,
+ sizeof(struct br_mcast_stats),
+ BRIDGE_XSTATS_PAD);
+ if (!nla) {
+ nla_nest_end(skb, nest);
+ return -EMSGSIZE;
+ }
+ br_multicast_get_stats(p->br, p, nla_data(nla));
+#endif
+ nla_nest_end(skb, nest);
+
+ return 0;
+}
+
+static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev,
+ int *prividx, int attr)
+{
+ int ret = -EINVAL;
+
+ switch (attr) {
+ case IFLA_STATS_LINK_XSTATS:
+ ret = bridge_fill_linkxstats(skb, dev, prividx);
+ break;
+ case IFLA_STATS_LINK_XSTATS_SLAVE:
+ ret = brport_fill_linkxstats(skb, dev, prividx);
+ break;
+ }
+
+ return ret;
+}
+
static struct rtnl_af_ops br_af_ops __read_mostly = {
.family = AF_BRIDGE,
.get_link_af_size = br_get_link_af_size_filtered,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index c7fb5d7..4dc8511 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -75,6 +75,12 @@ struct bridge_mcast_querier {
struct br_ip addr;
struct net_bridge_port __rcu *port;
};
+
+/* IGMP/MLD statistics */
+struct bridge_mcast_stats {
+ struct br_mcast_stats mstats;
+ struct u64_stats_sync syncp;
+};
#endif
struct br_vlan_stats {
@@ -229,6 +235,7 @@ struct net_bridge_port
struct bridge_mcast_own_query ip6_own_query;
#endif /* IS_ENABLED(CONFIG_IPV6) */
unsigned char multicast_router;
+ struct bridge_mcast_stats __percpu *mcast_stats;
struct timer_list multicast_router_timer;
struct hlist_head mglist;
struct hlist_node rlist;
@@ -314,6 +321,8 @@ struct net_bridge
u8 multicast_disabled:1;
u8 multicast_querier:1;
u8 multicast_query_use_ifaddr:1;
+ u8 has_ipv6_addr:1;
+ u8 multicast_stats_enabled:1;
u32 hash_elasticity;
u32 hash_max;
@@ -336,6 +345,7 @@ struct net_bridge
struct bridge_mcast_other_query ip4_other_query;
struct bridge_mcast_own_query ip4_own_query;
struct bridge_mcast_querier ip4_querier;
+ struct bridge_mcast_stats __percpu *mcast_stats;
#if IS_ENABLED(CONFIG_IPV6)
struct bridge_mcast_other_query ip6_other_query;
struct bridge_mcast_own_query ip6_own_query;
@@ -542,7 +552,7 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
struct sk_buff *skb, u16 vid);
struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
struct sk_buff *skb, u16 vid);
-void br_multicast_add_port(struct net_bridge_port *port);
+int br_multicast_add_port(struct net_bridge_port *port);
void br_multicast_del_port(struct net_bridge_port *port);
void br_multicast_enable_port(struct net_bridge_port *port);
void br_multicast_disable_port(struct net_bridge_port *port);
@@ -575,6 +585,12 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
struct br_ip *group, int type, u8 flags);
void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
int type);
+void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
+ __be16 proto, u8 type, u8 dir);
+int br_multicast_init_stats(struct net_bridge *br);
+void br_multicast_get_stats(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct br_mcast_stats *dest);
#define mlock_dereference(X, br) \
rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
@@ -588,10 +604,22 @@ static inline bool br_multicast_is_router(struct net_bridge *br)
static inline bool
__br_multicast_querier_exists(struct net_bridge *br,
- struct bridge_mcast_other_query *querier)
+ struct bridge_mcast_other_query *querier,
+ const bool is_ipv6)
{
+ bool own_querier_enabled;
+
+ if (br->multicast_querier) {
+ if (is_ipv6 && !br->has_ipv6_addr)
+ own_querier_enabled = false;
+ else
+ own_querier_enabled = true;
+ } else {
+ own_querier_enabled = false;
+ }
+
return time_is_before_jiffies(querier->delay_time) &&
- (br->multicast_querier || timer_pending(&querier->timer));
+ (own_querier_enabled || timer_pending(&querier->timer));
}
static inline bool br_multicast_querier_exists(struct net_bridge *br,
@@ -599,15 +627,22 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br,
{
switch (eth->h_proto) {
case (htons(ETH_P_IP)):
- return __br_multicast_querier_exists(br, &br->ip4_other_query);
+ return __br_multicast_querier_exists(br,
+ &br->ip4_other_query, false);
#if IS_ENABLED(CONFIG_IPV6)
case (htons(ETH_P_IPV6)):
- return __br_multicast_querier_exists(br, &br->ip6_other_query);
+ return __br_multicast_querier_exists(br,
+ &br->ip6_other_query, true);
#endif
default:
return false;
}
}
+
+static inline int br_multicast_igmp_type(const struct sk_buff *skb)
+{
+ return BR_INPUT_SKB_CB(skb)->igmp;
+}
#else
static inline int br_multicast_rcv(struct net_bridge *br,
struct net_bridge_port *port,
@@ -623,8 +658,9 @@ static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
return NULL;
}
-static inline void br_multicast_add_port(struct net_bridge_port *port)
+static inline int br_multicast_add_port(struct net_bridge_port *port)
{
+ return 0;
}
static inline void br_multicast_del_port(struct net_bridge_port *port)
@@ -680,6 +716,22 @@ static inline void br_mdb_init(void)
static inline void br_mdb_uninit(void)
{
}
+
+static inline void br_multicast_count(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ __be16 proto, u8 type, u8 dir)
+{
+}
+
+static inline int br_multicast_init_stats(struct net_bridge *br)
+{
+ return 0;
+}
+
+static inline int br_multicast_igmp_type(const struct sk_buff *skb)
+{
+ return 0;
+}
#endif
/* br_vlan.c */
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index beb4707..e120307 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -618,6 +618,30 @@ static ssize_t multicast_startup_query_interval_store(
return store_bridge_parm(d, buf, len, set_startup_query_interval);
}
static DEVICE_ATTR_RW(multicast_startup_query_interval);
+
+static ssize_t multicast_stats_enabled_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+
+ return sprintf(buf, "%u\n", br->multicast_stats_enabled);
+}
+
+static int set_stats_enabled(struct net_bridge *br, unsigned long val)
+{
+ br->multicast_stats_enabled = !!val;
+ return 0;
+}
+
+static ssize_t multicast_stats_enabled_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_stats_enabled);
+}
+static DEVICE_ATTR_RW(multicast_stats_enabled);
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
static ssize_t nf_call_iptables_show(
@@ -784,6 +808,7 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_multicast_query_interval.attr,
&dev_attr_multicast_query_response_interval.attr,
&dev_attr_multicast_startup_query_interval.attr,
+ &dev_attr_multicast_stats_enabled.attr,
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
&dev_attr_nf_call_iptables.attr,
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 67a4a36..3408ed5 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -13,7 +13,6 @@
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>
-#include <linux/moduleparam.h>
#include <linux/ip.h>
#include <linux/sched.h>
#include <linux/sockios.h>
diff --git a/net/can/Makefile b/net/can/Makefile
index cef49eb..1093675 100644
--- a/net/can/Makefile
+++ b/net/can/Makefile
@@ -3,7 +3,8 @@
#
obj-$(CONFIG_CAN) += can.o
-can-y := af_can.o proc.o
+can-y := af_can.o
+can-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_CAN_RAW) += can-raw.o
can-raw-y := raw.o
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 166d436..1108079 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -911,14 +911,14 @@ static __init int can_init(void)
if (!rcv_cache)
return -ENOMEM;
- if (stats_timer) {
+ if (IS_ENABLED(CONFIG_PROC_FS)) {
+ if (stats_timer) {
/* the statistics are updated every second (timer triggered) */
- setup_timer(&can_stattimer, can_stat_update, 0);
- mod_timer(&can_stattimer, round_jiffies(jiffies + HZ));
- } else
- can_stattimer.function = NULL;
-
- can_init_proc();
+ setup_timer(&can_stattimer, can_stat_update, 0);
+ mod_timer(&can_stattimer, round_jiffies(jiffies + HZ));
+ }
+ can_init_proc();
+ }
/* protocol register */
sock_register(&can_family_ops);
@@ -933,10 +933,12 @@ static __exit void can_exit(void)
{
struct net_device *dev;
- if (stats_timer)
- del_timer_sync(&can_stattimer);
+ if (IS_ENABLED(CONFIG_PROC_FS)) {
+ if (stats_timer)
+ del_timer_sync(&can_stattimer);
- can_remove_proc();
+ can_remove_proc();
+ }
/* protocol unregister */
dev_remove_pack(&canfd_packet);
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 6863310..8e999ff 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1,7 +1,7 @@
/*
* bcm.c - Broadcast Manager to filter/send (cyclic) CAN content
*
- * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * Copyright (c) 2002-2016 Volkswagen Group Electronic Research
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -67,27 +67,31 @@
*/
#define MAX_NFRAMES 256
-/* use of last_frames[index].can_dlc */
+/* use of last_frames[index].flags */
#define RX_RECV 0x40 /* received data for this element */
#define RX_THR 0x80 /* element not been sent due to throttle feature */
-#define BCM_CAN_DLC_MASK 0x0F /* clean private flags in can_dlc by masking */
+#define BCM_CAN_FLAGS_MASK 0x3F /* to clean private flags after usage */
/* get best masking value for can_rx_register() for a given single can_id */
#define REGMASK(id) ((id & CAN_EFF_FLAG) ? \
(CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
(CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
-#define CAN_BCM_VERSION CAN_VERSION
+#define CAN_BCM_VERSION "20160617"
MODULE_DESCRIPTION("PF_CAN broadcast manager protocol");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
MODULE_ALIAS("can-proto-2");
-/* easy access to can_frame payload */
-static inline u64 GET_U64(const struct can_frame *cp)
+/*
+ * easy access to the first 64 bit of can(fd)_frame payload. cp->data is
+ * 64 bit aligned so the offset has to be multiples of 8 which is ensured
+ * by the only callers in bcm_rx_cmp_to_index() bcm_rx_handler().
+ */
+static inline u64 get_u64(const struct canfd_frame *cp, int offset)
{
- return *(u64 *)cp->data;
+ return *(u64 *)(cp->data + offset);
}
struct bcm_op {
@@ -101,13 +105,14 @@ struct bcm_op {
struct tasklet_struct tsklet, thrtsklet;
ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
int rx_ifindex;
+ int cfsiz;
u32 count;
u32 nframes;
u32 currframe;
- struct can_frame *frames;
- struct can_frame *last_frames;
- struct can_frame sframe;
- struct can_frame last_sframe;
+ struct canfd_frame *frames;
+ struct canfd_frame *last_frames;
+ struct canfd_frame sframe;
+ struct canfd_frame last_sframe;
struct sock *sk;
struct net_device *rx_reg_dev;
};
@@ -136,7 +141,7 @@ static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv)
return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC);
}
-#define CFSIZ sizeof(struct can_frame)
+#define CFSIZ(flags) ((flags & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU)
#define OPSIZ sizeof(struct bcm_op)
#define MHSIZ sizeof(struct bcm_msg_head)
@@ -183,43 +188,50 @@ static int bcm_proc_show(struct seq_file *m, void *v)
if (!op->frames_abs)
continue;
- seq_printf(m, "rx_op: %03X %-5s ",
- op->can_id, bcm_proc_getifname(ifname, op->ifindex));
- seq_printf(m, "[%u]%c ", op->nframes,
- (op->flags & RX_CHECK_DLC)?'d':' ');
+ seq_printf(m, "rx_op: %03X %-5s ", op->can_id,
+ bcm_proc_getifname(ifname, op->ifindex));
+
+ if (op->flags & CAN_FD_FRAME)
+ seq_printf(m, "(%u)", op->nframes);
+ else
+ seq_printf(m, "[%u]", op->nframes);
+
+ seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' ');
+
if (op->kt_ival1.tv64)
seq_printf(m, "timeo=%lld ",
- (long long)
- ktime_to_us(op->kt_ival1));
+ (long long)ktime_to_us(op->kt_ival1));
if (op->kt_ival2.tv64)
seq_printf(m, "thr=%lld ",
- (long long)
- ktime_to_us(op->kt_ival2));
+ (long long)ktime_to_us(op->kt_ival2));
seq_printf(m, "# recv %ld (%ld) => reduction: ",
- op->frames_filtered, op->frames_abs);
+ op->frames_filtered, op->frames_abs);
reduction = 100 - (op->frames_filtered * 100) / op->frames_abs;
seq_printf(m, "%s%ld%%\n",
- (reduction == 100)?"near ":"", reduction);
+ (reduction == 100) ? "near " : "", reduction);
}
list_for_each_entry(op, &bo->tx_ops, list) {
- seq_printf(m, "tx_op: %03X %s [%u] ",
- op->can_id,
- bcm_proc_getifname(ifname, op->ifindex),
- op->nframes);
+ seq_printf(m, "tx_op: %03X %s ", op->can_id,
+ bcm_proc_getifname(ifname, op->ifindex));
+
+ if (op->flags & CAN_FD_FRAME)
+ seq_printf(m, "(%u) ", op->nframes);
+ else
+ seq_printf(m, "[%u] ", op->nframes);
if (op->kt_ival1.tv64)
seq_printf(m, "t1=%lld ",
- (long long) ktime_to_us(op->kt_ival1));
+ (long long)ktime_to_us(op->kt_ival1));
if (op->kt_ival2.tv64)
seq_printf(m, "t2=%lld ",
- (long long) ktime_to_us(op->kt_ival2));
+ (long long)ktime_to_us(op->kt_ival2));
seq_printf(m, "# sent %ld\n", op->frames_abs);
}
@@ -248,7 +260,7 @@ static void bcm_can_tx(struct bcm_op *op)
{
struct sk_buff *skb;
struct net_device *dev;
- struct can_frame *cf = &op->frames[op->currframe];
+ struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe;
/* no target device? => exit */
if (!op->ifindex)
@@ -260,7 +272,7 @@ static void bcm_can_tx(struct bcm_op *op)
return;
}
- skb = alloc_skb(CFSIZ + sizeof(struct can_skb_priv), gfp_any());
+ skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any());
if (!skb)
goto out;
@@ -268,7 +280,7 @@ static void bcm_can_tx(struct bcm_op *op)
can_skb_prv(skb)->ifindex = dev->ifindex;
can_skb_prv(skb)->skbcnt = 0;
- memcpy(skb_put(skb, CFSIZ), cf, CFSIZ);
+ memcpy(skb_put(skb, op->cfsiz), cf, op->cfsiz);
/* send with loopback */
skb->dev = dev;
@@ -282,7 +294,7 @@ static void bcm_can_tx(struct bcm_op *op)
/* reached last frame? */
if (op->currframe >= op->nframes)
op->currframe = 0;
- out:
+out:
dev_put(dev);
}
@@ -291,13 +303,13 @@ static void bcm_can_tx(struct bcm_op *op)
* (consisting of bcm_msg_head + x CAN frames)
*/
static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
- struct can_frame *frames, int has_timestamp)
+ struct canfd_frame *frames, int has_timestamp)
{
struct sk_buff *skb;
- struct can_frame *firstframe;
+ struct canfd_frame *firstframe;
struct sockaddr_can *addr;
struct sock *sk = op->sk;
- unsigned int datalen = head->nframes * CFSIZ;
+ unsigned int datalen = head->nframes * op->cfsiz;
int err;
skb = alloc_skb(sizeof(*head) + datalen, gfp_any());
@@ -307,19 +319,19 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
memcpy(skb_put(skb, sizeof(*head)), head, sizeof(*head));
if (head->nframes) {
- /* can_frames starting here */
- firstframe = (struct can_frame *)skb_tail_pointer(skb);
+ /* CAN frames starting here */
+ firstframe = (struct canfd_frame *)skb_tail_pointer(skb);
memcpy(skb_put(skb, datalen), frames, datalen);
/*
- * the BCM uses the can_dlc-element of the can_frame
+ * the BCM uses the flags-element of the canfd_frame
* structure for internal purposes. This is only
* relevant for updates that are generated by the
* BCM, where nframes is 1
*/
if (head->nframes == 1)
- firstframe->can_dlc &= BCM_CAN_DLC_MASK;
+ firstframe->flags &= BCM_CAN_FLAGS_MASK;
}
if (has_timestamp) {
@@ -406,7 +418,7 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
/*
* bcm_rx_changed - create a RX_CHANGED notification due to changed content
*/
-static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data)
+static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data)
{
struct bcm_msg_head head;
@@ -418,7 +430,7 @@ static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data)
op->frames_filtered = op->frames_abs = 0;
/* this element is not throttled anymore */
- data->can_dlc &= (BCM_CAN_DLC_MASK|RX_RECV);
+ data->flags &= (BCM_CAN_FLAGS_MASK|RX_RECV);
head.opcode = RX_CHANGED;
head.flags = op->flags;
@@ -437,13 +449,13 @@ static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data)
* 2. send a notification to the user (if possible)
*/
static void bcm_rx_update_and_send(struct bcm_op *op,
- struct can_frame *lastdata,
- const struct can_frame *rxdata)
+ struct canfd_frame *lastdata,
+ const struct canfd_frame *rxdata)
{
- memcpy(lastdata, rxdata, CFSIZ);
+ memcpy(lastdata, rxdata, op->cfsiz);
/* mark as used and throttled by default */
- lastdata->can_dlc |= (RX_RECV|RX_THR);
+ lastdata->flags |= (RX_RECV|RX_THR);
/* throttling mode inactive ? */
if (!op->kt_ival2.tv64) {
@@ -481,33 +493,36 @@ rx_changed_settime:
* received data stored in op->last_frames[]
*/
static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
- const struct can_frame *rxdata)
+ const struct canfd_frame *rxdata)
{
+ struct canfd_frame *cf = op->frames + op->cfsiz * index;
+ struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
+ int i;
+
/*
- * no one uses the MSBs of can_dlc for comparison,
+ * no one uses the MSBs of flags for comparison,
* so we use it here to detect the first time of reception
*/
- if (!(op->last_frames[index].can_dlc & RX_RECV)) {
+ if (!(lcf->flags & RX_RECV)) {
/* received data for the first time => send update to user */
- bcm_rx_update_and_send(op, &op->last_frames[index], rxdata);
+ bcm_rx_update_and_send(op, lcf, rxdata);
return;
}
- /* do a real check in can_frame data section */
-
- if ((GET_U64(&op->frames[index]) & GET_U64(rxdata)) !=
- (GET_U64(&op->frames[index]) & GET_U64(&op->last_frames[index]))) {
- bcm_rx_update_and_send(op, &op->last_frames[index], rxdata);
- return;
+ /* do a real check in CAN frame data section */
+ for (i = 0; i < rxdata->len; i += 8) {
+ if ((get_u64(cf, i) & get_u64(rxdata, i)) !=
+ (get_u64(cf, i) & get_u64(lcf, i))) {
+ bcm_rx_update_and_send(op, lcf, rxdata);
+ return;
+ }
}
if (op->flags & RX_CHECK_DLC) {
- /* do a real check in can_frame dlc */
- if (rxdata->can_dlc != (op->last_frames[index].can_dlc &
- BCM_CAN_DLC_MASK)) {
- bcm_rx_update_and_send(op, &op->last_frames[index],
- rxdata);
+ /* do a real check in CAN frame length */
+ if (rxdata->len != lcf->len) {
+ bcm_rx_update_and_send(op, lcf, rxdata);
return;
}
}
@@ -556,8 +571,8 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
/* if user wants to be informed, when cyclic CAN-Messages come back */
if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
- /* clear received can_frames to indicate 'nothing received' */
- memset(op->last_frames, 0, op->nframes * CFSIZ);
+ /* clear received CAN frames to indicate 'nothing received' */
+ memset(op->last_frames, 0, op->nframes * op->cfsiz);
}
return HRTIMER_NORESTART;
@@ -569,9 +584,11 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
unsigned int index)
{
- if ((op->last_frames) && (op->last_frames[index].can_dlc & RX_THR)) {
+ struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
+
+ if ((op->last_frames) && (lcf->flags & RX_THR)) {
if (update)
- bcm_rx_changed(op, &op->last_frames[index]);
+ bcm_rx_changed(op, lcf);
return 1;
}
return 0;
@@ -636,15 +653,19 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
static void bcm_rx_handler(struct sk_buff *skb, void *data)
{
struct bcm_op *op = (struct bcm_op *)data;
- const struct can_frame *rxframe = (struct can_frame *)skb->data;
+ const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data;
unsigned int i;
- /* disable timeout */
- hrtimer_cancel(&op->timer);
-
if (op->can_id != rxframe->can_id)
return;
+ /* make sure to handle the correct frame type (CAN / CAN FD) */
+ if (skb->len != op->cfsiz)
+ return;
+
+ /* disable timeout */
+ hrtimer_cancel(&op->timer);
+
/* save rx timestamp */
op->rx_stamp = skb->tstamp;
/* save originator for recvfrom() */
@@ -675,13 +696,14 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data)
* multiplex compare
*
* find the first multiplex mask that fits.
- * Remark: The MUX-mask is stored in index 0
+ * Remark: The MUX-mask is stored in index 0 - but only the
+ * first 64 bits of the frame data[] are relevant (CAN FD)
*/
for (i = 1; i < op->nframes; i++) {
- if ((GET_U64(&op->frames[0]) & GET_U64(rxframe)) ==
- (GET_U64(&op->frames[0]) &
- GET_U64(&op->frames[i]))) {
+ if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) ==
+ (get_u64(op->frames, 0) &
+ get_u64(op->frames + op->cfsiz * i, 0))) {
bcm_rx_cmp_to_index(op, i, rxframe);
break;
}
@@ -695,13 +717,14 @@ rx_starttimer:
/*
* helpers for bcm_op handling: find & delete bcm [rx|tx] op elements
*/
-static struct bcm_op *bcm_find_op(struct list_head *ops, canid_t can_id,
- int ifindex)
+static struct bcm_op *bcm_find_op(struct list_head *ops,
+ struct bcm_msg_head *mh, int ifindex)
{
struct bcm_op *op;
list_for_each_entry(op, ops, list) {
- if ((op->can_id == can_id) && (op->ifindex == ifindex))
+ if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
+ (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME))
return op;
}
@@ -744,12 +767,14 @@ static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op)
/*
* bcm_delete_rx_op - find and remove a rx op (returns number of removed ops)
*/
-static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex)
+static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh,
+ int ifindex)
{
struct bcm_op *op, *n;
list_for_each_entry_safe(op, n, ops, list) {
- if ((op->can_id == can_id) && (op->ifindex == ifindex)) {
+ if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
+ (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) {
/*
* Don't care if we're bound or not (due to netdev
@@ -789,12 +814,14 @@ static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex)
/*
* bcm_delete_tx_op - find and remove a tx op (returns number of removed ops)
*/
-static int bcm_delete_tx_op(struct list_head *ops, canid_t can_id, int ifindex)
+static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh,
+ int ifindex)
{
struct bcm_op *op, *n;
list_for_each_entry_safe(op, n, ops, list) {
- if ((op->can_id == can_id) && (op->ifindex == ifindex)) {
+ if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
+ (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) {
list_del(&op->list);
bcm_remove_op(op);
return 1; /* done */
@@ -810,7 +837,7 @@ static int bcm_delete_tx_op(struct list_head *ops, canid_t can_id, int ifindex)
static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head,
int ifindex)
{
- struct bcm_op *op = bcm_find_op(ops, msg_head->can_id, ifindex);
+ struct bcm_op *op = bcm_find_op(ops, msg_head, ifindex);
if (!op)
return -EINVAL;
@@ -835,6 +862,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
{
struct bcm_sock *bo = bcm_sk(sk);
struct bcm_op *op;
+ struct canfd_frame *cf;
unsigned int i;
int err;
@@ -842,39 +870,46 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (!ifindex)
return -ENODEV;
- /* check nframes boundaries - we need at least one can_frame */
+ /* check nframes boundaries - we need at least one CAN frame */
if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES)
return -EINVAL;
/* check the given can_id */
- op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex);
-
+ op = bcm_find_op(&bo->tx_ops, msg_head, ifindex);
if (op) {
/* update existing BCM operation */
/*
- * Do we need more space for the can_frames than currently
+ * Do we need more space for the CAN frames than currently
* allocated? -> This is a _really_ unusual use-case and
* therefore (complexity / locking) it is not supported.
*/
if (msg_head->nframes > op->nframes)
return -E2BIG;
- /* update can_frames content */
+ /* update CAN frames content */
for (i = 0; i < msg_head->nframes; i++) {
- err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ);
- if (op->frames[i].can_dlc > 8)
- err = -EINVAL;
+ cf = op->frames + op->cfsiz * i;
+ err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz);
+
+ if (op->flags & CAN_FD_FRAME) {
+ if (cf->len > 64)
+ err = -EINVAL;
+ } else {
+ if (cf->len > 8)
+ err = -EINVAL;
+ }
if (err < 0)
return err;
if (msg_head->flags & TX_CP_CAN_ID) {
/* copy can_id into frame */
- op->frames[i].can_id = msg_head->can_id;
+ cf->can_id = msg_head->can_id;
}
}
+ op->flags = msg_head->flags;
} else {
/* insert new BCM operation for the given can_id */
@@ -883,11 +918,13 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (!op)
return -ENOMEM;
- op->can_id = msg_head->can_id;
+ op->can_id = msg_head->can_id;
+ op->cfsiz = CFSIZ(msg_head->flags);
+ op->flags = msg_head->flags;
- /* create array for can_frames and copy the data */
+ /* create array for CAN frames and copy the data */
if (msg_head->nframes > 1) {
- op->frames = kmalloc(msg_head->nframes * CFSIZ,
+ op->frames = kmalloc(msg_head->nframes * op->cfsiz,
GFP_KERNEL);
if (!op->frames) {
kfree(op);
@@ -897,10 +934,17 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->frames = &op->sframe;
for (i = 0; i < msg_head->nframes; i++) {
- err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ);
- if (op->frames[i].can_dlc > 8)
- err = -EINVAL;
+ cf = op->frames + op->cfsiz * i;
+ err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz);
+
+ if (op->flags & CAN_FD_FRAME) {
+ if (cf->len > 64)
+ err = -EINVAL;
+ } else {
+ if (cf->len > 8)
+ err = -EINVAL;
+ }
if (err < 0) {
if (op->frames != &op->sframe)
@@ -911,7 +955,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (msg_head->flags & TX_CP_CAN_ID) {
/* copy can_id into frame */
- op->frames[i].can_id = msg_head->can_id;
+ cf->can_id = msg_head->can_id;
}
}
@@ -946,8 +990,6 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
/* check flags */
- op->flags = msg_head->flags;
-
if (op->flags & TX_RESET_MULTI_IDX) {
/* start multiple frame transmission with index 0 */
op->currframe = 0;
@@ -968,7 +1010,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (op->flags & STARTTIMER) {
hrtimer_cancel(&op->timer);
- /* spec: send can_frame when starting timer */
+ /* spec: send CAN frame when starting timer */
op->flags |= TX_ANNOUNCE;
}
@@ -981,7 +1023,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (op->flags & STARTTIMER)
bcm_tx_start_timer(op);
- return msg_head->nframes * CFSIZ + MHSIZ;
+ return msg_head->nframes * op->cfsiz + MHSIZ;
}
/*
@@ -1012,12 +1054,12 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
return -EINVAL;
/* check the given can_id */
- op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex);
+ op = bcm_find_op(&bo->rx_ops, msg_head, ifindex);
if (op) {
/* update existing BCM operation */
/*
- * Do we need more space for the can_frames than currently
+ * Do we need more space for the CAN frames than currently
* allocated? -> This is a _really_ unusual use-case and
* therefore (complexity / locking) it is not supported.
*/
@@ -1025,17 +1067,18 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
return -E2BIG;
if (msg_head->nframes) {
- /* update can_frames content */
+ /* update CAN frames content */
err = memcpy_from_msg((u8 *)op->frames, msg,
- msg_head->nframes * CFSIZ);
+ msg_head->nframes * op->cfsiz);
if (err < 0)
return err;
/* clear last_frames to indicate 'nothing received' */
- memset(op->last_frames, 0, msg_head->nframes * CFSIZ);
+ memset(op->last_frames, 0, msg_head->nframes * op->cfsiz);
}
op->nframes = msg_head->nframes;
+ op->flags = msg_head->flags;
/* Only an update -> do not call can_rx_register() */
do_rx_register = 0;
@@ -1046,20 +1089,22 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (!op)
return -ENOMEM;
- op->can_id = msg_head->can_id;
- op->nframes = msg_head->nframes;
+ op->can_id = msg_head->can_id;
+ op->nframes = msg_head->nframes;
+ op->cfsiz = CFSIZ(msg_head->flags);
+ op->flags = msg_head->flags;
if (msg_head->nframes > 1) {
- /* create array for can_frames and copy the data */
- op->frames = kmalloc(msg_head->nframes * CFSIZ,
+ /* create array for CAN frames and copy the data */
+ op->frames = kmalloc(msg_head->nframes * op->cfsiz,
GFP_KERNEL);
if (!op->frames) {
kfree(op);
return -ENOMEM;
}
- /* create and init array for received can_frames */
- op->last_frames = kzalloc(msg_head->nframes * CFSIZ,
+ /* create and init array for received CAN frames */
+ op->last_frames = kzalloc(msg_head->nframes * op->cfsiz,
GFP_KERNEL);
if (!op->last_frames) {
kfree(op->frames);
@@ -1074,7 +1119,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (msg_head->nframes) {
err = memcpy_from_msg((u8 *)op->frames, msg,
- msg_head->nframes * CFSIZ);
+ msg_head->nframes * op->cfsiz);
if (err < 0) {
if (op->frames != &op->sframe)
kfree(op->frames);
@@ -1116,7 +1161,6 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
} /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */
/* check flags */
- op->flags = msg_head->flags;
if (op->flags & RX_RTR_FRAME) {
@@ -1188,13 +1232,14 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
}
}
- return msg_head->nframes * CFSIZ + MHSIZ;
+ return msg_head->nframes * op->cfsiz + MHSIZ;
}
/*
* bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg)
*/
-static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
+static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk,
+ int cfsiz)
{
struct sk_buff *skb;
struct net_device *dev;
@@ -1204,13 +1249,13 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
if (!ifindex)
return -ENODEV;
- skb = alloc_skb(CFSIZ + sizeof(struct can_skb_priv), GFP_KERNEL);
+ skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL);
if (!skb)
return -ENOMEM;
can_skb_reserve(skb);
- err = memcpy_from_msg(skb_put(skb, CFSIZ), msg, CFSIZ);
+ err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz);
if (err < 0) {
kfree_skb(skb);
return err;
@@ -1232,7 +1277,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
if (err)
return err;
- return CFSIZ + MHSIZ;
+ return cfsiz + MHSIZ;
}
/*
@@ -1244,13 +1289,23 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
struct bcm_sock *bo = bcm_sk(sk);
int ifindex = bo->ifindex; /* default ifindex for this bcm_op */
struct bcm_msg_head msg_head;
+ int cfsiz;
int ret; /* read bytes or error codes as return value */
if (!bo->bound)
return -ENOTCONN;
/* check for valid message length from userspace */
- if (size < MHSIZ || (size - MHSIZ) % CFSIZ)
+ if (size < MHSIZ)
+ return -EINVAL;
+
+ /* read message head information */
+ ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ);
+ if (ret < 0)
+ return ret;
+
+ cfsiz = CFSIZ(msg_head.flags);
+ if ((size - MHSIZ) % cfsiz)
return -EINVAL;
/* check for alternative ifindex for this bcm_op */
@@ -1284,12 +1339,6 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
}
}
- /* read message head information */
-
- ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ);
- if (ret < 0)
- return ret;
-
lock_sock(sk);
switch (msg_head.opcode) {
@@ -1303,14 +1352,14 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
break;
case TX_DELETE:
- if (bcm_delete_tx_op(&bo->tx_ops, msg_head.can_id, ifindex))
+ if (bcm_delete_tx_op(&bo->tx_ops, &msg_head, ifindex))
ret = MHSIZ;
else
ret = -EINVAL;
break;
case RX_DELETE:
- if (bcm_delete_rx_op(&bo->rx_ops, msg_head.can_id, ifindex))
+ if (bcm_delete_rx_op(&bo->rx_ops, &msg_head, ifindex))
ret = MHSIZ;
else
ret = -EINVAL;
@@ -1329,11 +1378,11 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
break;
case TX_SEND:
- /* we need exactly one can_frame behind the msg head */
- if ((msg_head.nframes != 1) || (size != CFSIZ + MHSIZ))
+ /* we need exactly one CAN frame behind the msg head */
+ if ((msg_head.nframes != 1) || (size != cfsiz + MHSIZ))
ret = -EINVAL;
else
- ret = bcm_tx_send(msg, ifindex, sk);
+ ret = bcm_tx_send(msg, ifindex, sk, cfsiz);
break;
default:
diff --git a/net/can/proc.c b/net/can/proc.c
index 1a19b98..85ef7bb 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -517,8 +517,7 @@ void can_init_proc(void)
can_dir = proc_mkdir("can", init_net.proc_net);
if (!can_dir) {
- printk(KERN_INFO "can: failed to create /proc/net/can . "
- "CONFIG_PROC_FS missing?\n");
+ pr_info("can: failed to create /proc/net/can.\n");
return;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index b148357..a4f3b0a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2422,7 +2422,7 @@ EXPORT_SYMBOL(__skb_tx_hash);
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
- static const netdev_features_t null_features = 0;
+ static const netdev_features_t null_features;
struct net_device *dev = skb->dev;
const char *name = "";
@@ -3070,6 +3070,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct netdev_queue *txq)
{
spinlock_t *root_lock = qdisc_lock(q);
+ struct sk_buff *to_free = NULL;
bool contended;
int rc;
@@ -3086,7 +3087,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
spin_lock(root_lock);
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
- kfree_skb(skb);
+ __qdisc_drop(skb, &to_free);
rc = NET_XMIT_DROP;
} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
qdisc_run_begin(q)) {
@@ -3109,7 +3110,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_SUCCESS;
} else {
- rc = q->enqueue(skb, q) & NET_XMIT_MASK;
+ rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -3119,6 +3120,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
}
}
spin_unlock(root_lock);
+ if (unlikely(to_free))
+ kfree_skb_list(to_free);
if (unlikely(contended))
spin_unlock(&q->busylock);
return rc;
@@ -5442,6 +5445,52 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
EXPORT_SYMBOL(netdev_lower_get_next);
/**
+ * netdev_all_lower_get_next - Get the next device from all lower neighbour list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent from the dev's all lower neighbour
+ * list, starting from iter position. The caller must hold RTNL lock or
+ * its own locking that guarantees that the neighbour all lower
+ * list will remain unchanged.
+ */
+struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry(*iter, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->all_adj_list.lower)
+ return NULL;
+
+ *iter = lower->list.next;
+
+ return lower->dev;
+}
+EXPORT_SYMBOL(netdev_all_lower_get_next);
+
+/**
+ * netdev_all_lower_get_next_rcu - Get the next device from all
+ * lower neighbour list, RCU variant
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent from the dev's all lower neighbour
+ * list, starting from iter position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_first_or_null_rcu(&dev->all_adj_list.lower,
+ struct netdev_adjacent, list);
+
+ return lower ? lower->dev : NULL;
+}
+EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
+
+/**
* netdev_lower_get_first_private_rcu - Get the first ->private from the
* lower neighbour list, RCU
* variant
@@ -5911,7 +5960,7 @@ static void netdev_adjacent_add_links(struct net_device *dev)
struct net *net = dev_net(dev);
list_for_each_entry(iter, &dev->adj_list.upper, list) {
- if (!net_eq(net,dev_net(iter->dev)))
+ if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_add(iter->dev, dev,
&iter->dev->adj_list.lower);
@@ -5920,7 +5969,7 @@ static void netdev_adjacent_add_links(struct net_device *dev)
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
- if (!net_eq(net,dev_net(iter->dev)))
+ if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_add(iter->dev, dev,
&iter->dev->adj_list.upper);
@@ -5936,7 +5985,7 @@ static void netdev_adjacent_del_links(struct net_device *dev)
struct net *net = dev_net(dev);
list_for_each_entry(iter, &dev->adj_list.upper, list) {
- if (!net_eq(net,dev_net(iter->dev)))
+ if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, dev->name,
&iter->dev->adj_list.lower);
@@ -5945,7 +5994,7 @@ static void netdev_adjacent_del_links(struct net_device *dev)
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
- if (!net_eq(net,dev_net(iter->dev)))
+ if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, dev->name,
&iter->dev->adj_list.upper);
@@ -5961,7 +6010,7 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
struct net *net = dev_net(dev);
list_for_each_entry(iter, &dev->adj_list.upper, list) {
- if (!net_eq(net,dev_net(iter->dev)))
+ if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, oldname,
&iter->dev->adj_list.lower);
@@ -5970,7 +6019,7 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
- if (!net_eq(net,dev_net(iter->dev)))
+ if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, oldname,
&iter->dev->adj_list.upper);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 933e8d4..b2e592a 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1394,6 +1394,78 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
return -EOPNOTSUPP;
}
+static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags, u16 mode)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const struct devlink_ops *ops = devlink->ops;
+ struct sk_buff *msg;
+ u16 mode;
+ int err;
+
+ if (!ops || !ops->eswitch_mode_get)
+ return -EOPNOTSUPP;
+
+ err = ops->eswitch_mode_get(devlink, &mode);
+ if (err)
+ return err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET,
+ info->snd_portid, info->snd_seq, 0, mode);
+
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const struct devlink_ops *ops = devlink->ops;
+ u16 mode;
+
+ if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE])
+ return -EINVAL;
+
+ mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+
+ if (ops && ops->eswitch_mode_set)
+ return ops->eswitch_mode_set(devlink, mode);
+ return -EOPNOTSUPP;
+}
+
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -1407,6 +1479,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 },
[DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -1525,6 +1598,20 @@ static const struct genl_ops devlink_nl_ops[] = {
DEVLINK_NL_FLAG_NEED_SB |
DEVLINK_NL_FLAG_LOCK_PORTS,
},
+ {
+ .cmd = DEVLINK_CMD_ESWITCH_MODE_GET,
+ .doit = devlink_nl_cmd_eswitch_mode_get_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_ESWITCH_MODE_SET,
+ .doit = devlink_nl_cmd_eswitch_mode_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
};
/**
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 98298b1..be4629c 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -269,6 +269,49 @@ errout:
return err;
}
+static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
+ struct nlattr **tb, struct fib_rule *rule)
+{
+ struct fib_rule *r;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->action != rule->action)
+ continue;
+
+ if (r->table != rule->table)
+ continue;
+
+ if (r->pref != rule->pref)
+ continue;
+
+ if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+ continue;
+
+ if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+ continue;
+
+ if (r->mark != rule->mark)
+ continue;
+
+ if (r->mark_mask != rule->mark_mask)
+ continue;
+
+ if (r->tun_id != rule->tun_id)
+ continue;
+
+ if (r->fr_net != rule->fr_net)
+ continue;
+
+ if (r->l3mdev != rule->l3mdev)
+ continue;
+
+ if (!ops->compare(r, frh, tb))
+ continue;
+ return 1;
+ }
+ return 0;
+}
+
int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
@@ -386,6 +429,12 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (rule->l3mdev && rule->table)
goto errout_free;
+ if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
+ rule_exists(ops, frh, tb, rule)) {
+ err = -EEXIST;
+ goto errout_free;
+ }
+
err = ops->configure(rule, skb, frh, tb);
if (err < 0)
goto errout_free;
diff --git a/net/core/filter.c b/net/core/filter.c
index df6860c..10c4a2f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -150,6 +150,12 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
return raw_smp_processor_id();
}
+static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
+ .func = __get_raw_cpu_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
struct bpf_insn *insn_buf)
{
@@ -1295,21 +1301,10 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
- struct bpf_prog *prog;
-
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return ERR_PTR(-EPERM);
- prog = bpf_prog_get(ufd);
- if (IS_ERR(prog))
- return prog;
-
- if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
- bpf_prog_put(prog);
- return ERR_PTR(-EINVAL);
- }
-
- return prog;
+ return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
}
int sk_attach_bpf(u32 ufd, struct sock *sk)
@@ -1734,6 +1729,23 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* If skb_clear_hash() was called due to mangling, we can
+ * trigger SW recalculation here. Later access to hash
+ * can then use the inline skb->hash via context directly
+ * instead of calling this helper again.
+ */
+ return skb_get_hash((struct sk_buff *) (unsigned long) r1);
+}
+
+static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
+ .func = bpf_get_hash_recalc,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
@@ -1777,6 +1789,224 @@ const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
};
EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
+static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
+{
+ /* Caller already did skb_cow() with len as headroom,
+ * so no need to do it here.
+ */
+ skb_push(skb, len);
+ memmove(skb->data, skb->data + len, off);
+ memset(skb->data + off, 0, len);
+
+ /* No skb_postpush_rcsum(skb, skb->data + off, len)
+ * needed here as it does not change the skb->csum
+ * result for checksum complete when summing over
+ * zeroed blocks.
+ */
+ return 0;
+}
+
+static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
+{
+ /* skb_ensure_writable() is not needed here, as we're
+ * already working on an uncloned skb.
+ */
+ if (unlikely(!pskb_may_pull(skb, off + len)))
+ return -ENOMEM;
+
+ skb_postpull_rcsum(skb, skb->data + off, len);
+ memmove(skb->data + len, skb->data, off);
+ __skb_pull(skb, len);
+
+ return 0;
+}
+
+static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
+{
+ bool trans_same = skb->transport_header == skb->network_header;
+ int ret;
+
+ /* There's no need for __skb_push()/__skb_pull() pair to
+ * get to the start of the mac header as we're guaranteed
+ * to always start from here under eBPF.
+ */
+ ret = bpf_skb_generic_push(skb, off, len);
+ if (likely(!ret)) {
+ skb->mac_header -= len;
+ skb->network_header -= len;
+ if (trans_same)
+ skb->transport_header = skb->network_header;
+ }
+
+ return ret;
+}
+
+static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
+{
+ bool trans_same = skb->transport_header == skb->network_header;
+ int ret;
+
+ /* Same here, __skb_push()/__skb_pull() pair not needed. */
+ ret = bpf_skb_generic_pop(skb, off, len);
+ if (likely(!ret)) {
+ skb->mac_header += len;
+ skb->network_header += len;
+ if (trans_same)
+ skb->transport_header = skb->network_header;
+ }
+
+ return ret;
+}
+
+static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
+{
+ const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ u32 off = skb->network_header - skb->mac_header;
+ int ret;
+
+ ret = skb_cow(skb, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_push(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (skb_is_gso(skb)) {
+ /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to
+ * be changed into SKB_GSO_TCPV6.
+ */
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
+ skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
+ }
+
+ /* Due to IPv6 header, MSS needs to be downgraded. */
+ skb_shinfo(skb)->gso_size -= len_diff;
+ /* Header must be checked, and gso_segs recomputed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+ }
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb_clear_hash(skb);
+
+ return 0;
+}
+
+static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
+{
+ const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ u32 off = skb->network_header - skb->mac_header;
+ int ret;
+
+ ret = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (skb_is_gso(skb)) {
+ /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to
+ * be changed into SKB_GSO_TCPV4.
+ */
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
+ skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+ }
+
+ /* Due to IPv4 header, MSS can be upgraded. */
+ skb_shinfo(skb)->gso_size += len_diff;
+ /* Header must be checked, and gso_segs recomputed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+ }
+
+ skb->protocol = htons(ETH_P_IP);
+ skb_clear_hash(skb);
+
+ return 0;
+}
+
+static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
+{
+ __be16 from_proto = skb->protocol;
+
+ if (from_proto == htons(ETH_P_IP) &&
+ to_proto == htons(ETH_P_IPV6))
+ return bpf_skb_proto_4_to_6(skb);
+
+ if (from_proto == htons(ETH_P_IPV6) &&
+ to_proto == htons(ETH_P_IP))
+ return bpf_skb_proto_6_to_4(skb);
+
+ return -ENOTSUPP;
+}
+
+static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ __be16 proto = (__force __be16) r2;
+ int ret;
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ /* General idea is that this helper does the basic groundwork
+ * needed for changing the protocol, and eBPF program fills the
+ * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
+ * and other helpers, rather than passing a raw buffer here.
+ *
+ * The rationale is to keep this minimal and without a need to
+ * deal with raw packet data. F.e. even if we would pass buffers
+ * here, the program still needs to call the bpf_lX_csum_replace()
+ * helpers anyway. Plus, this way we keep also separation of
+ * concerns, since f.e. bpf_skb_store_bytes() should only take
+ * care of stores.
+ *
+ * Currently, additional options and extension header space are
+ * not supported, but flags register is reserved so we can adapt
+ * that. For offloads, we mark packet as dodgy, so that headers
+ * need to be verified first.
+ */
+ ret = bpf_skb_proto_xlat(skb, proto);
+ bpf_compute_data_end(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_proto_proto = {
+ .func = bpf_skb_change_proto,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ u32 pkt_type = r2;
+
+ /* We only allow a restricted subset to be changed for now. */
+ if (unlikely(skb->pkt_type > PACKET_OTHERHOST ||
+ pkt_type > PACKET_OTHERHOST))
+ return -EINVAL;
+
+ skb->pkt_type = pkt_type;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_change_type_proto = {
+ .func = bpf_skb_change_type,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
bool bpf_helper_changes_skb_data(void *func)
{
if (func == bpf_skb_vlan_push)
@@ -1785,6 +2015,8 @@ bool bpf_helper_changes_skb_data(void *func)
return true;
if (func == bpf_skb_store_bytes)
return true;
+ if (func == bpf_skb_change_proto)
+ return true;
if (func == bpf_l3_csum_replace)
return true;
if (func == bpf_l4_csum_replace)
@@ -2024,6 +2256,40 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
}
}
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *)(long)r1;
+ struct bpf_map *map = (struct bpf_map *)(long)r2;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct cgroup *cgrp;
+ struct sock *sk;
+ u32 i = (u32)r3;
+
+ sk = skb->sk;
+ if (!sk || !sk_fullsock(sk))
+ return -ENOENT;
+
+ if (unlikely(i >= array->map.max_entries))
+ return -E2BIG;
+
+ cgrp = READ_ONCE(array->ptrs[i]);
+ if (unlikely(!cgrp))
+ return -EAGAIN;
+
+ return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp);
+}
+
+static const struct bpf_func_proto bpf_skb_in_cgroup_proto = {
+ .func = bpf_skb_in_cgroup,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+#endif
+
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
@@ -2037,7 +2303,7 @@ sk_filter_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_get_smp_processor_id:
- return &bpf_get_smp_processor_id_proto;
+ return &bpf_get_raw_smp_processor_id_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_ktime_get_ns:
@@ -2072,6 +2338,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_vlan_push_proto;
case BPF_FUNC_skb_vlan_pop:
return &bpf_skb_vlan_pop_proto;
+ case BPF_FUNC_skb_change_proto:
+ return &bpf_skb_change_proto_proto;
+ case BPF_FUNC_skb_change_type:
+ return &bpf_skb_change_type_proto;
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
@@ -2084,8 +2354,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_redirect_proto;
case BPF_FUNC_get_route_realm:
return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
case BPF_FUNC_perf_event_output:
return bpf_get_event_output_proto();
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case BPF_FUNC_skb_in_cgroup:
+ return &bpf_skb_in_cgroup_proto;
+#endif
default:
return sk_filter_func_proto(func_id);
}
@@ -2105,7 +2383,8 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type)
}
static bool sk_filter_is_valid_access(int off, int size,
- enum bpf_access_type type)
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
{
switch (off) {
case offsetof(struct __sk_buff, tc_classid):
@@ -2128,7 +2407,8 @@ static bool sk_filter_is_valid_access(int off, int size,
}
static bool tc_cls_act_is_valid_access(int off, int size,
- enum bpf_access_type type)
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
{
if (type == BPF_WRITE) {
switch (off) {
@@ -2143,6 +2423,16 @@ static bool tc_cls_act_is_valid_access(int off, int size,
return false;
}
}
+
+ switch (off) {
+ case offsetof(struct __sk_buff, data):
+ *reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct __sk_buff, data_end):
+ *reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
return __is_valid_access(off, size, type);
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 29dd8cc..510cd62 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2469,13 +2469,17 @@ int neigh_xmit(int index, struct net_device *dev,
tbl = neigh_tables[index];
if (!tbl)
goto out;
+ rcu_read_lock_bh();
neigh = __neigh_lookup_noref(tbl, addr, dev);
if (!neigh)
neigh = __neigh_create(tbl, addr, dev, false);
err = PTR_ERR(neigh);
- if (IS_ERR(neigh))
+ if (IS_ERR(neigh)) {
+ rcu_read_unlock_bh();
goto out_kfree_skb;
+ }
err = neigh->output(neigh, skb);
+ rcu_read_unlock_bh();
}
else if (index == NEIGH_LINK_TABLE) {
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7a0b616..6e4f347 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);
static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
- dev->tx_queue_len = new_len;
+ int res, orig_len = dev->tx_queue_len;
+
+ if (new_len != orig_len) {
+ dev->tx_queue_len = new_len;
+ res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+ res = notifier_to_errno(res);
+ if (res) {
+ netdev_err(dev,
+ "refused to change device tx_queue_len\n");
+ dev->tx_queue_len = orig_len;
+ return -EFAULT;
+ }
+ }
+
return 0;
}
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f74ab9c..bbd118b 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -213,6 +213,7 @@
/* Xmit modes */
#define M_START_XMIT 0 /* Default normal TX */
#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
+#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */
/* If lock -- protects updating of if_list */
#define if_lock(t) spin_lock(&(t->if_lock));
@@ -626,6 +627,8 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->xmit_mode == M_NETIF_RECEIVE)
seq_puts(seq, " xmit_mode: netif_receive\n");
+ else if (pkt_dev->xmit_mode == M_QUEUE_XMIT)
+ seq_puts(seq, " xmit_mode: xmit_queue\n");
seq_puts(seq, " Flags: ");
@@ -1142,8 +1145,10 @@ static ssize_t pktgen_if_write(struct file *file,
return len;
i += len;
- if ((value > 1) && (pkt_dev->xmit_mode == M_START_XMIT) &&
- (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
+ if ((value > 1) &&
+ ((pkt_dev->xmit_mode == M_QUEUE_XMIT) ||
+ ((pkt_dev->xmit_mode == M_START_XMIT) &&
+ (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
return -ENOTSUPP;
pkt_dev->burst = value < 1 ? 1 : value;
sprintf(pg_result, "OK: burst=%d", pkt_dev->burst);
@@ -1198,6 +1203,9 @@ static ssize_t pktgen_if_write(struct file *file,
* at module loading time
*/
pkt_dev->clone_skb = 0;
+ } else if (strcmp(f, "queue_xmit") == 0) {
+ pkt_dev->xmit_mode = M_QUEUE_XMIT;
+ pkt_dev->last_ok = 1;
} else {
sprintf(pg_result,
"xmit_mode -:%s:- unknown\nAvailable modes: %s",
@@ -3434,6 +3442,36 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
#endif
} while (--burst > 0);
goto out; /* Skips xmit_mode M_START_XMIT */
+ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
+ local_bh_disable();
+ atomic_inc(&pkt_dev->skb->users);
+
+ ret = dev_queue_xmit(pkt_dev->skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ pkt_dev->sofar++;
+ pkt_dev->seq_num++;
+ pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
+ break;
+ case NET_XMIT_DROP:
+ case NET_XMIT_CN:
+ /* These are all valid return codes for a qdisc but
+ * indicate packets are being dropped or will likely
+ * be dropped soon.
+ */
+ case NETDEV_TX_BUSY:
+ /* qdisc may call dev_hard_start_xmit directly in cases
+ * where no queues exist e.g. loopback device, virtual
+ * devices, etc. In this case we need to handle
+ * NETDEV_TX_ codes.
+ */
+ default:
+ pkt_dev->errors++;
+ net_info_ratelimited("%s xmit error: %d\n",
+ pkt_dev->odevname, ret);
+ break;
+ }
+ goto out;
}
txq = skb_get_tx_queue(odev, pkt_dev->skb);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d69c464..a9e3805 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -71,9 +71,31 @@ void rtnl_lock(void)
}
EXPORT_SYMBOL(rtnl_lock);
+static struct sk_buff *defer_kfree_skb_list;
+void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
+{
+ if (head && tail) {
+ tail->next = defer_kfree_skb_list;
+ defer_kfree_skb_list = head;
+ }
+}
+EXPORT_SYMBOL(rtnl_kfree_skbs);
+
void __rtnl_unlock(void)
{
+ struct sk_buff *head = defer_kfree_skb_list;
+
+ defer_kfree_skb_list = NULL;
+
mutex_unlock(&rtnl_mutex);
+
+ while (head) {
+ struct sk_buff *next = head->next;
+
+ kfree_skb(head);
+ cond_resched();
+ head = next;
+ }
}
void rtnl_unlock(void)
@@ -1905,11 +1927,19 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_TXQLEN]) {
unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
-
- if (dev->tx_queue_len ^ value)
+ unsigned long orig_len = dev->tx_queue_len;
+
+ if (dev->tx_queue_len ^ value) {
+ dev->tx_queue_len = value;
+ err = call_netdevice_notifiers(
+ NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+ err = notifier_to_errno(err);
+ if (err) {
+ dev->tx_queue_len = orig_len;
+ goto errout;
+ }
status |= DO_SETLINK_NOTIFY;
-
- dev->tx_queue_len = value;
+ }
}
if (tb[IFLA_OPERSTATE])
@@ -3497,7 +3527,32 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
if (!attr)
goto nla_put_failure;
- err = ops->fill_linkxstats(skb, dev, prividx);
+ err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
+ nla_nest_end(skb, attr);
+ if (err)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+ }
+
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE,
+ *idxattr)) {
+ const struct rtnl_link_ops *ops = NULL;
+ const struct net_device *master;
+
+ master = netdev_master_upper_dev_get(dev);
+ if (master)
+ ops = master->rtnl_link_ops;
+ if (ops && ops->fill_linkxstats) {
+ int err;
+
+ *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
+ attr = nla_nest_start(skb,
+ IFLA_STATS_LINK_XSTATS_SLAVE);
+ if (!attr)
+ goto nla_put_failure;
+
+ err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
nla_nest_end(skb, attr);
if (err)
goto nla_put_failure;
@@ -3533,14 +3588,35 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) {
const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ int attr = IFLA_STATS_LINK_XSTATS;
if (ops && ops->get_linkxstats_size) {
- size += nla_total_size(ops->get_linkxstats_size(dev));
+ size += nla_total_size(ops->get_linkxstats_size(dev,
+ attr));
/* for IFLA_STATS_LINK_XSTATS */
size += nla_total_size(0);
}
}
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) {
+ struct net_device *_dev = (struct net_device *)dev;
+ const struct rtnl_link_ops *ops = NULL;
+ const struct net_device *master;
+
+ /* netdev_master_upper_dev_get can't take const */
+ master = netdev_master_upper_dev_get(_dev);
+ if (master)
+ ops = master->rtnl_link_ops;
+ if (ops && ops->get_linkxstats_size) {
+ int attr = IFLA_STATS_LINK_XSTATS_SLAVE;
+
+ size += nla_total_size(ops->get_linkxstats_size(dev,
+ attr));
+ /* for IFLA_STATS_LINK_XSTATS_SLAVE */
+ size += nla_total_size(0);
+ }
+ }
+
return size;
}
diff --git a/net/core/utils.c b/net/core/utils.c
index 3d17ca8..cf5622b 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -133,7 +133,7 @@ int in4_pton(const char *src, int srclen,
s = src;
d = dbuf;
i = 0;
- while(1) {
+ while (1) {
int c;
c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) {
@@ -283,11 +283,11 @@ cont:
i = 15; d--;
if (dc) {
- while(d >= dc)
+ while (d >= dc)
dst[i--] = *d--;
- while(i >= dc - dbuf)
+ while (i >= dc - dbuf)
dst[i--] = 0;
- while(i >= 0)
+ while (i >= 0)
dst[i--] = *d--;
} else
memcpy(dst, dbuf, sizeof(dbuf));
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 4e2b308..8c004a0 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -81,11 +81,21 @@ static int lowpan_stop(struct net_device *dev)
return 0;
}
+static int lowpan_neigh_construct(struct neighbour *n)
+{
+ struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n));
+
+ /* default no short_addr is available for a neighbour */
+ neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+ return 0;
+}
+
static const struct net_device_ops lowpan_netdev_ops = {
.ndo_init = lowpan_dev_init,
.ndo_start_xmit = lowpan_xmit,
.ndo_open = lowpan_open,
.ndo_stop = lowpan_stop,
+ .ndo_neigh_construct = lowpan_neigh_construct,
};
static void lowpan_setup(struct net_device *ldev)
@@ -150,6 +160,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev,
wdev->needed_headroom;
ldev->needed_tailroom = wdev->needed_tailroom;
+ ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh);
+
ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154);
if (ret < 0) {
dev_put(wdev);
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
index e459afd..dbb476d 100644
--- a/net/ieee802154/6lowpan/tx.c
+++ b/net/ieee802154/6lowpan/tx.c
@@ -9,6 +9,7 @@
*/
#include <net/6lowpan.h>
+#include <net/ndisc.h>
#include <net/ieee802154_netdev.h>
#include <net/mac802154.h>
@@ -17,19 +18,9 @@
#define LOWPAN_FRAG1_HEAD_SIZE 0x4
#define LOWPAN_FRAGN_HEAD_SIZE 0x5
-/* don't save pan id, it's intra pan */
-struct lowpan_addr {
- u8 mode;
- union {
- /* IPv6 needs big endian here */
- __be64 extended_addr;
- __be16 short_addr;
- } u;
-};
-
struct lowpan_addr_info {
- struct lowpan_addr daddr;
- struct lowpan_addr saddr;
+ struct ieee802154_addr daddr;
+ struct ieee802154_addr saddr;
};
static inline struct
@@ -48,12 +39,14 @@ lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb)
* RAW/DGRAM sockets.
*/
int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev,
- unsigned short type, const void *_daddr,
- const void *_saddr, unsigned int len)
+ unsigned short type, const void *daddr,
+ const void *saddr, unsigned int len)
{
- const u8 *saddr = _saddr;
- const u8 *daddr = _daddr;
- struct lowpan_addr_info *info;
+ struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr;
+ struct lowpan_addr_info *info = lowpan_skb_priv(skb);
+ struct lowpan_802154_neigh *llneigh = NULL;
+ const struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct neighbour *n;
/* TODO:
* if this package isn't ipv6 one, where should it be routed?
@@ -61,21 +54,50 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev,
if (type != ETH_P_IPV6)
return 0;
- if (!saddr)
- saddr = ldev->dev_addr;
+ /* intra-pan communication */
+ info->saddr.pan_id = wpan_dev->pan_id;
+ info->daddr.pan_id = info->saddr.pan_id;
- raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8);
- raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8);
+ if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) {
+ info->daddr.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST);
+ info->daddr.mode = IEEE802154_ADDR_SHORT;
+ } else {
+ __le16 short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+
+ n = neigh_lookup(&nd_tbl, &hdr->daddr, ldev);
+ if (n) {
+ llneigh = lowpan_802154_neigh(neighbour_priv(n));
+ read_lock_bh(&n->lock);
+ short_addr = llneigh->short_addr;
+ read_unlock_bh(&n->lock);
+ }
- info = lowpan_skb_priv(skb);
+ if (llneigh &&
+ lowpan_802154_is_valid_src_short_addr(short_addr)) {
+ info->daddr.short_addr = short_addr;
+ info->daddr.mode = IEEE802154_ADDR_SHORT;
+ } else {
+ info->daddr.mode = IEEE802154_ADDR_LONG;
+ ieee802154_be64_to_le64(&info->daddr.extended_addr,
+ daddr);
+ }
- /* TODO: Currently we only support extended_addr */
- info->daddr.mode = IEEE802154_ADDR_LONG;
- memcpy(&info->daddr.u.extended_addr, daddr,
- sizeof(info->daddr.u.extended_addr));
- info->saddr.mode = IEEE802154_ADDR_LONG;
- memcpy(&info->saddr.u.extended_addr, saddr,
- sizeof(info->daddr.u.extended_addr));
+ if (n)
+ neigh_release(n);
+ }
+
+ if (!saddr) {
+ if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) {
+ info->saddr.mode = IEEE802154_ADDR_SHORT;
+ info->saddr.short_addr = wpan_dev->short_addr;
+ } else {
+ info->saddr.mode = IEEE802154_ADDR_LONG;
+ info->saddr.extended_addr = wpan_dev->extended_addr;
+ }
+ } else {
+ info->saddr.mode = IEEE802154_ADDR_LONG;
+ ieee802154_be64_to_le64(&info->saddr.extended_addr, saddr);
+ }
return 0;
}
@@ -209,47 +231,26 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *ldev,
u16 *dgram_size, u16 *dgram_offset)
{
struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr;
- struct ieee802154_addr sa, da;
struct ieee802154_mac_cb *cb = mac_cb_init(skb);
struct lowpan_addr_info info;
- void *daddr, *saddr;
memcpy(&info, lowpan_skb_priv(skb), sizeof(info));
- /* TODO: Currently we only support extended_addr */
- daddr = &info.daddr.u.extended_addr;
- saddr = &info.saddr.u.extended_addr;
-
*dgram_size = skb->len;
- lowpan_header_compress(skb, ldev, daddr, saddr);
+ lowpan_header_compress(skb, ldev, &info.daddr, &info.saddr);
/* dgram_offset = (saved bytes after compression) + lowpan header len */
*dgram_offset = (*dgram_size - skb->len) + skb_network_header_len(skb);
cb->type = IEEE802154_FC_TYPE_DATA;
- /* prepare wpan address data */
- sa.mode = IEEE802154_ADDR_LONG;
- sa.pan_id = wpan_dev->pan_id;
- sa.extended_addr = ieee802154_devaddr_from_raw(saddr);
-
- /* intra-PAN communications */
- da.pan_id = sa.pan_id;
-
- /* if the destination address is the broadcast address, use the
- * corresponding short address
- */
- if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) {
- da.mode = IEEE802154_ADDR_SHORT;
- da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST);
+ if (info.daddr.mode == IEEE802154_ADDR_SHORT &&
+ ieee802154_is_broadcast_short_addr(info.daddr.short_addr))
cb->ackreq = false;
- } else {
- da.mode = IEEE802154_ADDR_LONG;
- da.extended_addr = ieee802154_devaddr_from_raw(daddr);
+ else
cb->ackreq = wpan_dev->ackreq;
- }
- return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev, &da,
- &sa, 0);
+ return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev,
+ &info.daddr, &info.saddr, 0);
}
netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev)
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 4779374..d95631d 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -23,6 +23,11 @@ struct esp_skb_cb {
void *tmp;
};
+struct esp_output_extra {
+ __be32 seqhi;
+ u32 esphoff;
+};
+
#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
@@ -35,11 +40,11 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
*
* TODO: Use spare space in skb for this where possible.
*/
-static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int extralen)
{
unsigned int len;
- len = seqhilen;
+ len = extralen;
len += crypto_aead_ivsize(aead);
@@ -57,15 +62,16 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
return kmalloc(len, GFP_ATOMIC);
}
-static inline __be32 *esp_tmp_seqhi(void *tmp)
+static inline void *esp_tmp_extra(void *tmp)
{
- return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+ return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));
}
-static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
+
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int extralen)
{
return crypto_aead_ivsize(aead) ?
- PTR_ALIGN((u8 *)tmp + seqhilen,
- crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
+ PTR_ALIGN((u8 *)tmp + extralen,
+ crypto_aead_alignmask(aead) + 1) : tmp + extralen;
}
static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
@@ -99,7 +105,7 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
{
struct ip_esp_hdr *esph = (void *)(skb->data + offset);
void *tmp = ESP_SKB_CB(skb)->tmp;
- __be32 *seqhi = esp_tmp_seqhi(tmp);
+ __be32 *seqhi = esp_tmp_extra(tmp);
esph->seq_no = esph->spi;
esph->spi = *seqhi;
@@ -107,7 +113,11 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
static void esp_output_restore_header(struct sk_buff *skb)
{
- esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
+ void *tmp = ESP_SKB_CB(skb)->tmp;
+ struct esp_output_extra *extra = esp_tmp_extra(tmp);
+
+ esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff -
+ sizeof(__be32));
}
static void esp_output_done_esn(struct crypto_async_request *base, int err)
@@ -121,6 +131,7 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
+ struct esp_output_extra *extra;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
struct aead_request *req;
@@ -137,8 +148,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int tfclen;
int nfrags;
int assoclen;
- int seqhilen;
- __be32 *seqhi;
+ int extralen;
__be64 seqno;
/* skb is pure payload to encrypt */
@@ -166,21 +176,21 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
nfrags = err;
assoclen = sizeof(*esph);
- seqhilen = 0;
+ extralen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
- seqhilen += sizeof(__be32);
- assoclen += seqhilen;
+ extralen += sizeof(*extra);
+ assoclen += sizeof(__be32);
}
- tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
+ tmp = esp_alloc_tmp(aead, nfrags, extralen);
if (!tmp) {
err = -ENOMEM;
goto error;
}
- seqhi = esp_tmp_seqhi(tmp);
- iv = esp_tmp_iv(aead, tmp, seqhilen);
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
@@ -247,8 +257,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
* encryption.
*/
if ((x->props.flags & XFRM_STATE_ESN)) {
- esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
- *seqhi = esph->spi;
+ extra->esphoff = (unsigned char *)esph -
+ skb_transport_header(skb);
+ esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+ extra->seqhi = esph->spi;
esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
aead_request_set_callback(req, 0, esp_output_done_esn, skb);
}
@@ -445,7 +457,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
goto out;
ESP_SKB_CB(skb)->tmp = tmp;
- seqhi = esp_tmp_seqhi(tmp);
+ seqhi = esp_tmp_extra(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 4c39f4f..b798862 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -62,26 +62,26 @@ EXPORT_SYMBOL_GPL(gre_del_protocol);
/* Fills in tpi and returns header length to be pulled. */
int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
- bool *csum_err, __be16 proto)
+ bool *csum_err, __be16 proto, int nhs)
{
const struct gre_base_hdr *greh;
__be32 *options;
int hdr_len;
- if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+ if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr))))
return -EINVAL;
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ greh = (struct gre_base_hdr *)(skb->data + nhs);
if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
tpi->flags = gre_flags_to_tnl_flags(greh->flags);
hdr_len = gre_calc_hlen(tpi->flags);
- if (!pskb_may_pull(skb, hdr_len))
+ if (!pskb_may_pull(skb, nhs + hdr_len))
return -EINVAL;
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ greh = (struct gre_base_hdr *)(skb->data + nhs);
tpi->proto = greh->protocol;
options = (__be32 *)(greh + 1);
@@ -117,6 +117,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if ((*(u8 *)options & 0xF0) != 0x40)
hdr_len += 4;
}
+ tpi->hdr_len = hdr_len;
return hdr_len;
}
EXPORT_SYMBOL(gre_parse_header);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 25af124..38c2c47 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -44,6 +44,7 @@ struct inet_diag_entry {
u16 dport;
u16 family;
u16 userlocks;
+ u32 ifindex;
};
static DEFINE_MUTEX(inet_diag_table_mutex);
@@ -571,6 +572,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
yes = 0;
break;
}
+ case INET_DIAG_BC_DEV_COND: {
+ u32 ifindex;
+
+ ifindex = *((const u32 *)(op + 1));
+ if (ifindex != entry->ifindex)
+ yes = 0;
+ break;
+ }
}
if (yes) {
@@ -613,6 +622,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
entry_fill_addrs(&entry, sk);
entry.sport = inet->inet_num;
entry.dport = ntohs(inet->inet_dport);
+ entry.ifindex = sk->sk_bound_dev_if;
entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
return inet_diag_bc_run(bc, &entry);
@@ -636,6 +646,17 @@ static int valid_cc(const void *bc, int len, int cc)
return 0;
}
+/* data is u32 ifindex */
+static bool valid_devcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ /* Check ifindex space. */
+ *min_len += sizeof(u32);
+ if (len < *min_len)
+ return false;
+
+ return true;
+}
/* Validate an inet_diag_hostcond. */
static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
int *min_len)
@@ -700,6 +721,10 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
if (!valid_hostcond(bc, len, &min_len))
return -EINVAL;
break;
+ case INET_DIAG_BC_DEV_COND:
+ if (!valid_devcond(bc, len, &min_len))
+ return -EINVAL;
+ break;
case INET_DIAG_BC_S_GE:
case INET_DIAG_BC_S_LE:
case INET_DIAG_BC_D_GE:
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 4d2025f..5b1481b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -49,12 +49,6 @@
#include <net/gre.h>
#include <net/dst_metadata.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <net/ipv6.h>
-#include <net/ip6_fib.h>
-#include <net/ip6_route.h>
-#endif
-
/*
Problems & solutions
--------------------
@@ -144,6 +138,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
const struct iphdr *iph;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
+ unsigned int data_len = 0;
struct ip_tunnel *t;
switch (type) {
@@ -169,6 +164,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
return;
+ data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
break;
case ICMP_REDIRECT:
@@ -187,6 +183,13 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
if (!t)
return;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tpi->proto == htons(ETH_P_IPV6) &&
+ !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
+ type, data_len))
+ return;
+#endif
+
if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
return;
@@ -217,12 +220,14 @@ static void gre_err(struct sk_buff *skb, u32 info)
* by themselves???
*/
+ const struct iphdr *iph = (struct iphdr *)skb->data;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct tnl_ptk_info tpi;
bool csum_err = false;
- if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP)) < 0) {
+ if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
+ iph->ihl * 4) < 0) {
if (!csum_err) /* ignore csum errors. */
return;
}
@@ -338,7 +343,7 @@ static int gre_rcv(struct sk_buff *skb)
}
#endif
- hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP));
+ hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
if (hdr_len < 0)
goto drop;
@@ -841,17 +846,19 @@ out:
return ipgre_tunnel_validate(tb, data);
}
-static void ipgre_netlink_parms(struct net_device *dev,
+static int ipgre_netlink_parms(struct net_device *dev,
struct nlattr *data[],
struct nlattr *tb[],
struct ip_tunnel_parm *parms)
{
+ struct ip_tunnel *t = netdev_priv(dev);
+
memset(parms, 0, sizeof(*parms));
parms->iph.protocol = IPPROTO_GRE;
if (!data)
- return;
+ return 0;
if (data[IFLA_GRE_LINK])
parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
@@ -880,16 +887,26 @@ static void ipgre_netlink_parms(struct net_device *dev,
if (data[IFLA_GRE_TOS])
parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
- if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
+ if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
+ if (t->ignore_df)
+ return -EINVAL;
parms->iph.frag_off = htons(IP_DF);
+ }
if (data[IFLA_GRE_COLLECT_METADATA]) {
- struct ip_tunnel *t = netdev_priv(dev);
-
t->collect_md = true;
if (dev->type == ARPHRD_IPGRE)
dev->type = ARPHRD_NONE;
}
+
+ if (data[IFLA_GRE_IGNORE_DF]) {
+ if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
+ && (parms->iph.frag_off & htons(IP_DF)))
+ return -EINVAL;
+ t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
+ }
+
+ return 0;
}
/* This function returns true when ENCAP attributes are present in the nl msg */
@@ -960,16 +977,19 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
{
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ int err;
if (ipgre_netlink_encap_parms(data, &ipencap)) {
struct ip_tunnel *t = netdev_priv(dev);
- int err = ip_tunnel_encap_setup(t, &ipencap);
+ err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
- ipgre_netlink_parms(dev, data, tb, &p);
+ err = ipgre_netlink_parms(dev, data, tb, &p);
+ if (err < 0)
+ return err;
return ip_tunnel_newlink(dev, tb, &p);
}
@@ -978,16 +998,19 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
{
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ int err;
if (ipgre_netlink_encap_parms(data, &ipencap)) {
struct ip_tunnel *t = netdev_priv(dev);
- int err = ip_tunnel_encap_setup(t, &ipencap);
+ err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
- ipgre_netlink_parms(dev, data, tb, &p);
+ err = ipgre_netlink_parms(dev, data, tb, &p);
+ if (err < 0)
+ return err;
return ip_tunnel_changelink(dev, tb, &p);
}
@@ -1024,6 +1047,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_GRE_COLLECT_METADATA */
nla_total_size(0) +
+ /* IFLA_GRE_IGNORE_DF */
+ nla_total_size(1) +
0;
}
@@ -1057,6 +1082,9 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
t->encap.flags))
goto nla_put_failure;
+ if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
+ goto nla_put_failure;
+
if (t->collect_md) {
if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
goto nla_put_failure;
@@ -1084,6 +1112,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
[IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
+ [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 },
};
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
@@ -1121,6 +1150,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
{
struct nlattr *tb[IFLA_MAX + 1];
struct net_device *dev;
+ LIST_HEAD(list_kill);
struct ip_tunnel *t;
int err;
@@ -1136,8 +1166,10 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
t->collect_md = true;
err = ipgre_newlink(net, dev, tb, NULL);
- if (err < 0)
- goto out;
+ if (err < 0) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
/* openvswitch users expect packet sizes to be unrestricted,
* so set the largest MTU we can.
@@ -1146,9 +1178,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
if (err)
goto out;
+ err = rtnl_configure_link(dev, NULL);
+ if (err < 0)
+ goto out;
+
return dev;
out:
- free_netdev(dev);
+ ip_tunnel_dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d8f5e0a..95649eb 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -682,7 +682,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
df = tnl_params->frag_off;
- if (skb->protocol == htons(ETH_P_IP))
+ if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
df |= (inner_iph->frag_off&htons(IP_DF));
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 2ed9dd2..1d71c40 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -127,7 +127,9 @@ __be32 ic_myaddr = NONE; /* My IP address */
static __be32 ic_netmask = NONE; /* Netmask for local subnet */
__be32 ic_gateway = NONE; /* Gateway IP address */
-__be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */
+#ifdef IPCONFIG_DYNAMIC
+static __be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */
+#endif
__be32 ic_servaddr = NONE; /* Boot server IP address */
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 21a38e2..5ad48ec 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -891,8 +891,10 @@ static struct mfc_cache *ipmr_cache_alloc(void)
{
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
- if (c)
+ if (c) {
+ c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
c->mfc_un.res.minvif = MAXVIFS;
+ }
return c;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5c7ed14..032a96d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2277,6 +2277,38 @@ static inline bool tcp_can_repair_sock(const struct sock *sk)
((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
}
+static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
+{
+ struct tcp_repair_window opt;
+
+ if (!tp->repair)
+ return -EPERM;
+
+ if (len != sizeof(opt))
+ return -EINVAL;
+
+ if (copy_from_user(&opt, optbuf, sizeof(opt)))
+ return -EFAULT;
+
+ if (opt.max_window < opt.snd_wnd)
+ return -EINVAL;
+
+ if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
+ return -EINVAL;
+
+ if (after(opt.rcv_wup, tp->rcv_nxt))
+ return -EINVAL;
+
+ tp->snd_wl1 = opt.snd_wl1;
+ tp->snd_wnd = opt.snd_wnd;
+ tp->max_window = opt.max_window;
+
+ tp->rcv_wnd = opt.rcv_wnd;
+ tp->rcv_wup = opt.rcv_wup;
+
+ return 0;
+}
+
static int tcp_repair_options_est(struct tcp_sock *tp,
struct tcp_repair_opt __user *optbuf, unsigned int len)
{
@@ -2604,6 +2636,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
tp->tsoffset = val - tcp_time_stamp;
break;
+ case TCP_REPAIR_WINDOW:
+ err = tcp_repair_set_window(tp, optval, optlen);
+ break;
case TCP_NOTSENT_LOWAT:
tp->notsent_lowat = val;
sk->sk_write_space(sk);
@@ -2860,6 +2895,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EINVAL;
break;
+ case TCP_REPAIR_WINDOW: {
+ struct tcp_repair_window opt;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len != sizeof(opt))
+ return -EINVAL;
+
+ if (!tp->repair)
+ return -EPERM;
+
+ opt.snd_wl1 = tp->snd_wl1;
+ opt.snd_wnd = tp->snd_wnd;
+ opt.max_window = tp->max_window;
+ opt.rcv_wnd = tp->rcv_wnd;
+ opt.rcv_wup = tp->rcv_wup;
+
+ if (copy_to_user(optval, &opt, len))
+ return -EFAULT;
+ return 0;
+ }
case TCP_QUEUE_SEQ:
if (tp->repair_queue == TCP_SEND_QUEUE)
val = tp->write_seq;
@@ -2969,8 +3026,18 @@ static void __tcp_alloc_md5sig_pool(void)
return;
for_each_possible_cpu(cpu) {
+ void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
struct ahash_request *req;
+ if (!scratch) {
+ scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
+ sizeof(struct tcphdr),
+ GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!scratch)
+ return;
+ per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
+ }
if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
continue;
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 7e538f7..10d728b 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -293,7 +293,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
*/
if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- memset(info, 0, sizeof(struct tcp_dctcp_info));
+ memset(&info->dctcp, 0, sizeof(info->dctcp));
if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
info->dctcp.dctcp_enabled = 1;
info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
@@ -303,7 +303,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
}
*attr = INET_DIAG_DCTCPINFO;
- return sizeof(*info);
+ return sizeof(info->dctcp);
}
return 0;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3708de2..32b048e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1018,27 +1018,28 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
GFP_KERNEL);
}
-static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
- __be32 daddr, __be32 saddr, int nbytes)
+static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
+ __be32 daddr, __be32 saddr,
+ const struct tcphdr *th, int nbytes)
{
struct tcp4_pseudohdr *bp;
struct scatterlist sg;
+ struct tcphdr *_th;
- bp = &hp->md5_blk.ip4;
-
- /*
- * 1. the TCP pseudo-header (in the order: source IP address,
- * destination IP address, zero-padded protocol number, and
- * segment length)
- */
+ bp = hp->scratch;
bp->saddr = saddr;
bp->daddr = daddr;
bp->pad = 0;
bp->protocol = IPPROTO_TCP;
bp->len = cpu_to_be16(nbytes);
- sg_init_one(&sg, bp, sizeof(*bp));
- ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
+ _th = (struct tcphdr *)(bp + 1);
+ memcpy(_th, th, sizeof(*th));
+ _th->check = 0;
+
+ sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
+ ahash_request_set_crypt(hp->md5_req, &sg, NULL,
+ sizeof(*bp) + sizeof(*th));
return crypto_ahash_update(hp->md5_req);
}
@@ -1055,9 +1056,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
if (crypto_ahash_init(req))
goto clear_hash;
- if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
- goto clear_hash;
- if (tcp_md5_hash_header(hp, th))
+ if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
@@ -1101,9 +1100,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
if (crypto_ahash_init(req))
goto clear_hash;
- if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
- goto clear_hash;
- if (tcp_md5_hash_header(hp, th))
+ if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
goto clear_hash;
if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
goto clear_hash;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b1bcba0..b26aa87 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2753,7 +2753,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
struct sk_buff *hole = NULL;
- u32 last_lost;
+ u32 max_segs, last_lost;
int mib_idx;
int fwd_rexmitting = 0;
@@ -2773,6 +2773,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
last_lost = tp->snd_una;
}
+ max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
tcp_for_write_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked;
int segs;
@@ -2786,6 +2787,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
if (segs <= 0)
return;
+ /* In case tcp_shift_skb_data() have aggregated large skbs,
+ * we need to make sure not sending too bigs TSO packets
+ */
+ segs = min_t(int, segs, max_segs);
if (fwd_rexmitting) {
begin_fwd:
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0ff31d9..ca5e8ea 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -391,9 +391,9 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
}
-static inline int compute_score(struct sock *sk, struct net *net,
- __be32 saddr, unsigned short hnum, __be16 sport,
- __be32 daddr, __be16 dport, int dif)
+static int compute_score(struct sock *sk, struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned short hnum, int dif)
{
int score;
struct inet_sock *inet;
@@ -434,52 +434,6 @@ static inline int compute_score(struct sock *sk, struct net *net,
return score;
}
-/*
- * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
- */
-static inline int compute_score2(struct sock *sk, struct net *net,
- __be32 saddr, __be16 sport,
- __be32 daddr, unsigned int hnum, int dif)
-{
- int score;
- struct inet_sock *inet;
-
- if (!net_eq(sock_net(sk), net) ||
- ipv6_only_sock(sk))
- return -1;
-
- inet = inet_sk(sk);
-
- if (inet->inet_rcv_saddr != daddr ||
- inet->inet_num != hnum)
- return -1;
-
- score = (sk->sk_family == PF_INET) ? 2 : 1;
-
- if (inet->inet_daddr) {
- if (inet->inet_daddr != saddr)
- return -1;
- score += 4;
- }
-
- if (inet->inet_dport) {
- if (inet->inet_dport != sport)
- return -1;
- score += 4;
- }
-
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- return -1;
- score += 4;
- }
-
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
- score++;
-
- return score;
-}
-
static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
const __u16 lport, const __be32 faddr,
const __be16 fport)
@@ -492,11 +446,11 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
udp_ehash_secret + net_hash_mix(net));
}
-/* called with read_rcu_lock() */
+/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum, int dif,
- struct udp_hslot *hslot2, unsigned int slot2,
+ struct udp_hslot *hslot2,
struct sk_buff *skb)
{
struct sock *sk, *result;
@@ -506,7 +460,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
result = NULL;
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
- score = compute_score2(sk, net, saddr, sport,
+ score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
@@ -554,17 +508,22 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, slot2, skb);
+ hslot2, skb);
if (!result) {
+ unsigned int old_slot2 = slot2;
hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
+ /* avoid searching the same slot again. */
+ if (unlikely(slot2 == old_slot2))
+ return result;
+
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- htonl(INADDR_ANY), hnum, dif,
- hslot2, slot2, skb);
+ daddr, hnum, dif,
+ hslot2, skb);
}
return result;
}
@@ -572,8 +531,8 @@ begin:
result = NULL;
badness = 0;
sk_for_each_rcu(sk, &hslot->head) {
- score = compute_score(sk, net, saddr, hnum, sport,
- daddr, dport, dif);
+ score = compute_score(sk, net, saddr, sport,
+ daddr, hnum, dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -1755,8 +1714,11 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
return err;
}
- return skb_checksum_init_zero_check(skb, proto, uh->check,
- inet_compute_pseudo);
+ /* Note, we are only interested in != 0 or == 0, thus the
+ * force to int.
+ */
+ return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+ inet_compute_pseudo);
}
/*
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 47f12c7..58bd39f 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -76,6 +76,67 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
}
EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
+void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
+ unsigned short type)
+{
+ struct sock *sk = sock->sk;
+ struct udp_tunnel_info ti;
+
+ if (!dev->netdev_ops->ndo_udp_tunnel_add)
+ return;
+
+ ti.type = type;
+ ti.sa_family = sk->sk_family;
+ ti.port = inet_sk(sk)->inet_sport;
+
+ dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port);
+
+/* Notify netdevs that UDP port started listening */
+void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct udp_tunnel_info ti;
+ struct net_device *dev;
+
+ ti.type = type;
+ ti.sa_family = sk->sk_family;
+ ti.port = inet_sk(sk)->inet_sport;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ if (!dev->netdev_ops->ndo_udp_tunnel_add)
+ continue;
+ dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port);
+
+/* Notify netdevs that UDP port is no more listening */
+void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct udp_tunnel_info ti;
+ struct net_device *dev;
+
+ ti.type = type;
+ ti.sa_family = sk->sk_family;
+ ti.port = inet_sk(sk)->inet_sport;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ if (!dev->netdev_ops->ndo_udp_tunnel_del)
+ continue;
+ dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port);
+
void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl,
__be16 df, __be16 src_port, __be16 dst_port,
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 7b0edb3..b644a23 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -295,7 +295,7 @@ static struct ctl_table xfrm4_policy_table[] = {
{ }
};
-static int __net_init xfrm4_net_sysctl_init(struct net *net)
+static __net_init int xfrm4_net_sysctl_init(struct net *net)
{
struct ctl_table *table;
struct ctl_table_header *hdr;
@@ -323,7 +323,7 @@ err_alloc:
return -ENOMEM;
}
-static void __net_exit xfrm4_net_sysctl_exit(struct net *net)
+static __net_exit void xfrm4_net_sysctl_exit(struct net *net)
{
struct ctl_table *table;
@@ -336,12 +336,12 @@ static void __net_exit xfrm4_net_sysctl_exit(struct net *net)
kfree(table);
}
#else /* CONFIG_SYSCTL */
-static int inline xfrm4_net_sysctl_init(struct net *net)
+static inline int xfrm4_net_sysctl_init(struct net *net)
{
return 0;
}
-static void inline xfrm4_net_sysctl_exit(struct net *net)
+static inline void xfrm4_net_sysctl_exit(struct net *net)
{
}
#endif
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 47f837a..a1f6b7b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1524,6 +1524,28 @@ out:
return hiscore_idx;
}
+static int ipv6_get_saddr_master(struct net *net,
+ const struct net_device *dst_dev,
+ const struct net_device *master,
+ struct ipv6_saddr_dst *dst,
+ struct ipv6_saddr_score *scores,
+ int hiscore_idx)
+{
+ struct inet6_dev *idev;
+
+ idev = __in6_dev_get(dst_dev);
+ if (idev)
+ hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev,
+ scores, hiscore_idx);
+
+ idev = __in6_dev_get(master);
+ if (idev)
+ hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev,
+ scores, hiscore_idx);
+
+ return hiscore_idx;
+}
+
int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
const struct in6_addr *daddr, unsigned int prefs,
struct in6_addr *saddr)
@@ -1577,13 +1599,39 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
if (idev)
hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
} else {
+ const struct net_device *master;
+ int master_idx = 0;
+
+ /* if dst_dev exists and is enslaved to an L3 device, then
+ * prefer addresses from dst_dev and then the master over
+ * any other enslaved devices in the L3 domain.
+ */
+ master = l3mdev_master_dev_rcu(dst_dev);
+ if (master) {
+ master_idx = master->ifindex;
+
+ hiscore_idx = ipv6_get_saddr_master(net, dst_dev,
+ master, &dst,
+ scores, hiscore_idx);
+
+ if (scores[hiscore_idx].ifa)
+ goto out;
+ }
+
for_each_netdev_rcu(net, dev) {
+ /* only consider addresses on devices in the
+ * same L3 domain
+ */
+ if (l3mdev_master_ifindex_rcu(dev) != master_idx)
+ continue;
idev = __in6_dev_get(dev);
if (!idev)
continue;
hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
}
}
+
+out:
rcu_read_unlock();
hiscore = &scores[hiscore_idx];
@@ -2254,7 +2302,7 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
return ERR_PTR(-EACCES);
/* Add default multicast route */
- if (!(dev->flags & IFF_LOOPBACK))
+ if (!(dev->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev))
addrconf_add_mroute(dev);
return idev;
@@ -2333,12 +2381,109 @@ static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
}
+int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
+ const struct prefix_info *pinfo,
+ struct inet6_dev *in6_dev,
+ const struct in6_addr *addr, int addr_type,
+ u32 addr_flags, bool sllao, bool tokenized,
+ __u32 valid_lft, u32 prefered_lft)
+{
+ struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1);
+ int create = 0, update_lft = 0;
+
+ if (!ifp && valid_lft) {
+ int max_addresses = in6_dev->cnf.max_addresses;
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ if (in6_dev->cnf.optimistic_dad &&
+ !net->ipv6.devconf_all->forwarding && sllao)
+ addr_flags |= IFA_F_OPTIMISTIC;
+#endif
+
+ /* Do not allow to create too much of autoconfigured
+ * addresses; this would be too easy way to crash kernel.
+ */
+ if (!max_addresses ||
+ ipv6_count_addresses(in6_dev) < max_addresses)
+ ifp = ipv6_add_addr(in6_dev, addr, NULL,
+ pinfo->prefix_len,
+ addr_type&IPV6_ADDR_SCOPE_MASK,
+ addr_flags, valid_lft,
+ prefered_lft);
+
+ if (IS_ERR_OR_NULL(ifp))
+ return -1;
+
+ update_lft = 0;
+ create = 1;
+ spin_lock_bh(&ifp->lock);
+ ifp->flags |= IFA_F_MANAGETEMPADDR;
+ ifp->cstamp = jiffies;
+ ifp->tokenized = tokenized;
+ spin_unlock_bh(&ifp->lock);
+ addrconf_dad_start(ifp);
+ }
+
+ if (ifp) {
+ u32 flags;
+ unsigned long now;
+ u32 stored_lft;
+
+ /* update lifetime (RFC2462 5.5.3 e) */
+ spin_lock_bh(&ifp->lock);
+ now = jiffies;
+ if (ifp->valid_lft > (now - ifp->tstamp) / HZ)
+ stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
+ else
+ stored_lft = 0;
+ if (!update_lft && !create && stored_lft) {
+ const u32 minimum_lft = min_t(u32,
+ stored_lft, MIN_VALID_LIFETIME);
+ valid_lft = max(valid_lft, minimum_lft);
+
+ /* RFC4862 Section 5.5.3e:
+ * "Note that the preferred lifetime of the
+ * corresponding address is always reset to
+ * the Preferred Lifetime in the received
+ * Prefix Information option, regardless of
+ * whether the valid lifetime is also reset or
+ * ignored."
+ *
+ * So we should always update prefered_lft here.
+ */
+ update_lft = 1;
+ }
+
+ if (update_lft) {
+ ifp->valid_lft = valid_lft;
+ ifp->prefered_lft = prefered_lft;
+ ifp->tstamp = now;
+ flags = ifp->flags;
+ ifp->flags &= ~IFA_F_DEPRECATED;
+ spin_unlock_bh(&ifp->lock);
+
+ if (!(flags&IFA_F_TENTATIVE))
+ ipv6_ifa_notify(0, ifp);
+ } else
+ spin_unlock_bh(&ifp->lock);
+
+ manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft,
+ create, now);
+
+ in6_ifa_put(ifp);
+ addrconf_verify();
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr);
+
void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
{
struct prefix_info *pinfo;
__u32 valid_lft;
__u32 prefered_lft;
- int addr_type;
+ int addr_type, err;
u32 addr_flags = 0;
struct inet6_dev *in6_dev;
struct net *net = dev_net(dev);
@@ -2432,10 +2577,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
/* Try to figure out our local address for this prefix */
if (pinfo->autoconf && in6_dev->cnf.autoconf) {
- struct inet6_ifaddr *ifp;
struct in6_addr addr;
- int create = 0, update_lft = 0;
- bool tokenized = false;
+ bool tokenized = false, dev_addr_generated = false;
if (pinfo->prefix_len == 64) {
memcpy(&addr, &pinfo->prefix, 8);
@@ -2453,106 +2596,36 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
goto ok;
} else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) &&
ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) {
- in6_dev_put(in6_dev);
- return;
+ goto put;
+ } else {
+ dev_addr_generated = true;
}
goto ok;
}
net_dbg_ratelimited("IPv6 addrconf: prefix with wrong length %d\n",
pinfo->prefix_len);
- in6_dev_put(in6_dev);
- return;
+ goto put;
ok:
+ err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev,
+ &addr, addr_type,
+ addr_flags, sllao,
+ tokenized, valid_lft,
+ prefered_lft);
+ if (err)
+ goto put;
- ifp = ipv6_get_ifaddr(net, &addr, dev, 1);
-
- if (!ifp && valid_lft) {
- int max_addresses = in6_dev->cnf.max_addresses;
-
-#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
- if (in6_dev->cnf.optimistic_dad &&
- !net->ipv6.devconf_all->forwarding && sllao)
- addr_flags |= IFA_F_OPTIMISTIC;
-#endif
-
- /* Do not allow to create too much of autoconfigured
- * addresses; this would be too easy way to crash kernel.
- */
- if (!max_addresses ||
- ipv6_count_addresses(in6_dev) < max_addresses)
- ifp = ipv6_add_addr(in6_dev, &addr, NULL,
- pinfo->prefix_len,
- addr_type&IPV6_ADDR_SCOPE_MASK,
- addr_flags, valid_lft,
- prefered_lft);
-
- if (IS_ERR_OR_NULL(ifp)) {
- in6_dev_put(in6_dev);
- return;
- }
-
- update_lft = 0;
- create = 1;
- spin_lock_bh(&ifp->lock);
- ifp->flags |= IFA_F_MANAGETEMPADDR;
- ifp->cstamp = jiffies;
- ifp->tokenized = tokenized;
- spin_unlock_bh(&ifp->lock);
- addrconf_dad_start(ifp);
- }
-
- if (ifp) {
- u32 flags;
- unsigned long now;
- u32 stored_lft;
-
- /* update lifetime (RFC2462 5.5.3 e) */
- spin_lock_bh(&ifp->lock);
- now = jiffies;
- if (ifp->valid_lft > (now - ifp->tstamp) / HZ)
- stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
- else
- stored_lft = 0;
- if (!update_lft && !create && stored_lft) {
- const u32 minimum_lft = min_t(u32,
- stored_lft, MIN_VALID_LIFETIME);
- valid_lft = max(valid_lft, minimum_lft);
-
- /* RFC4862 Section 5.5.3e:
- * "Note that the preferred lifetime of the
- * corresponding address is always reset to
- * the Preferred Lifetime in the received
- * Prefix Information option, regardless of
- * whether the valid lifetime is also reset or
- * ignored."
- *
- * So we should always update prefered_lft here.
- */
- update_lft = 1;
- }
-
- if (update_lft) {
- ifp->valid_lft = valid_lft;
- ifp->prefered_lft = prefered_lft;
- ifp->tstamp = now;
- flags = ifp->flags;
- ifp->flags &= ~IFA_F_DEPRECATED;
- spin_unlock_bh(&ifp->lock);
-
- if (!(flags&IFA_F_TENTATIVE))
- ipv6_ifa_notify(0, ifp);
- } else
- spin_unlock_bh(&ifp->lock);
-
- manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft,
- create, now);
-
- in6_ifa_put(ifp);
- addrconf_verify();
- }
+ /* Ignore error case here because previous prefix add addr was
+ * successful which will be notified.
+ */
+ ndisc_ops_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, &addr,
+ addr_type, addr_flags, sllao,
+ tokenized, valid_lft,
+ prefered_lft,
+ dev_addr_generated);
}
inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo);
+put:
in6_dev_put(in6_dev);
}
@@ -2947,8 +3020,8 @@ static void init_loopback(struct net_device *dev)
}
}
-static void addrconf_add_linklocal(struct inet6_dev *idev,
- const struct in6_addr *addr, u32 flags)
+void addrconf_add_linklocal(struct inet6_dev *idev,
+ const struct in6_addr *addr, u32 flags)
{
struct inet6_ifaddr *ifp;
u32 addr_flags = flags | IFA_F_PERMANENT;
@@ -2967,6 +3040,7 @@ static void addrconf_add_linklocal(struct inet6_dev *idev,
in6_ifa_put(ifp);
}
}
+EXPORT_SYMBOL_GPL(addrconf_add_linklocal);
static bool ipv6_reserved_interfaceid(struct in6_addr address)
{
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 40454bf..bd59c34 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -98,7 +98,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!(type & ICMPV6_INFOMSG_MASK))
if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
- ping_err(skb, offset, info);
+ ping_err(skb, offset, ntohl(info));
}
static int icmpv6_rcv(struct sk_buff *skb);
@@ -388,7 +388,8 @@ relookup_failed:
/*
* Send an ICMP message in response to a packet in error
*/
-static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+ const struct in6_addr *force_saddr)
{
struct net *net = dev_net(skb->dev);
struct inet6_dev *idev = NULL;
@@ -475,6 +476,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_ICMPV6;
fl6.daddr = hdr->saddr;
+ if (force_saddr)
+ saddr = force_saddr;
if (saddr)
fl6.saddr = *saddr;
fl6.flowi6_mark = mark;
@@ -551,10 +554,75 @@ out:
*/
void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
{
- icmp6_send(skb, ICMPV6_PARAMPROB, code, pos);
+ icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL);
kfree_skb(skb);
}
+/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH
+ * if sufficient data bytes are available
+ * @nhs is the size of the tunnel header(s) :
+ * Either an IPv4 header for SIT encap
+ * an IPv4 header + GRE header for GRE encap
+ */
+int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
+ unsigned int data_len)
+{
+ struct in6_addr temp_saddr;
+ struct rt6_info *rt;
+ struct sk_buff *skb2;
+ u32 info = 0;
+
+ if (!pskb_may_pull(skb, nhs + sizeof(struct ipv6hdr) + 8))
+ return 1;
+
+ /* RFC 4884 (partial) support for ICMP extensions */
+ if (data_len < 128 || (data_len & 7) || skb->len < data_len)
+ data_len = 0;
+
+ skb2 = data_len ? skb_copy(skb, GFP_ATOMIC) : skb_clone(skb, GFP_ATOMIC);
+
+ if (!skb2)
+ return 1;
+
+ skb_dst_drop(skb2);
+ skb_pull(skb2, nhs);
+ skb_reset_network_header(skb2);
+
+ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0);
+
+ if (rt && rt->dst.dev)
+ skb2->dev = rt->dst.dev;
+
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &temp_saddr);
+
+ if (data_len) {
+ /* RFC 4884 (partial) support :
+ * insert 0 padding at the end, before the extensions
+ */
+ __skb_push(skb2, nhs);
+ skb_reset_network_header(skb2);
+ memmove(skb2->data, skb2->data + nhs, data_len - nhs);
+ memset(skb2->data + data_len - nhs, 0, nhs);
+ /* RFC 4884 4.5 : Length is measured in 64-bit words,
+ * and stored in reserved[0]
+ */
+ info = (data_len/8) << 24;
+ }
+ if (type == ICMP_TIME_EXCEEDED)
+ icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
+ info, &temp_saddr);
+ else
+ icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH,
+ info, &temp_saddr);
+ if (rt)
+ ip6_rt_put(rt);
+
+ kfree_skb(skb2);
+
+ return 0;
+}
+EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach);
+
static void icmpv6_echo_reply(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
@@ -587,7 +655,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
fl6.daddr = ipv6_hdr(skb)->saddr;
if (saddr)
fl6.saddr = *saddr;
- fl6.flowi6_oif = l3mdev_fib_oif(skb->dev);
+ fl6.flowi6_oif = skb->dev->ifindex;
fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
fl6.flowi6_mark = mark;
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index b3d00be..ec9efbc 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -34,12 +34,12 @@ static void ila_csum_do_neutral(struct ila_addr *iaddr,
if (p->locator_match.v64) {
diff = p->csum_diff;
} else {
- diff = compute_csum_diff8((__be32 *)iaddr,
- (__be32 *)&p->locator);
+ diff = compute_csum_diff8((__be32 *)&p->locator,
+ (__be32 *)iaddr);
}
fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ?
- ~CSUM_NEUTRAL_FLAG : CSUM_NEUTRAL_FLAG);
+ CSUM_NEUTRAL_FLAG : ~CSUM_NEUTRAL_FLAG);
diff = csum_add(diff, fval);
@@ -140,8 +140,8 @@ void ila_init_saved_csum(struct ila_params *p)
return;
p->csum_diff = compute_csum_diff8(
- (__be32 *)&p->locator_match,
- (__be32 *)&p->locator);
+ (__be32 *)&p->locator,
+ (__be32 *)&p->locator_match);
}
static int __init ila_init(void)
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
index b2025bf..c0cbcb2 100644
--- a/net/ipv6/ip6_checksum.c
+++ b/net/ipv6/ip6_checksum.c
@@ -78,9 +78,12 @@ int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto)
* we accept a checksum of zero here. When we find the socket
* for the UDP packet we'll check if that socket allows zero checksum
* for IPv6 (set by socket option).
+ *
+ * Note, we are only interested in != 0 or == 0, thus the
+ * force to int.
*/
- return skb_checksum_init_zero_check(skb, proto, uh->check,
- ip6_compute_pseudo);
+ return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+ ip6_compute_pseudo);
}
EXPORT_SYMBOL(udp6_csum_init);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index fdc9de2..776d145 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -468,7 +468,7 @@ static int gre_rcv(struct sk_buff *skb)
bool csum_err = false;
int hdr_len;
- hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IPV6));
+ hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IPV6), 0);
if (hdr_len < 0)
goto drop;
diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c
index 14dacc5..713676f 100644
--- a/net/ipv6/ip6_icmp.c
+++ b/net/ipv6/ip6_icmp.c
@@ -39,7 +39,7 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
if (!send)
goto out;
- send(skb, type, code, info);
+ send(skb, type, code, info, NULL);
out:
rcu_read_unlock();
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index fd32175..1dfc402 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -910,6 +910,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
int err;
int flags = 0;
+ if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif &&
+ (!*dst || !(*dst)->error)) {
+ err = l3mdev_get_saddr6(net, sk, fl6);
+ if (err)
+ goto out_err;
+ }
+
/* The correct way to handle this would be to do
* ip6_route_get_saddr, and then ip6_route_output; however,
* the route-specific preferred source forces the
@@ -999,10 +1006,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
return 0;
out_err_release:
- if (err == -ENETUNREACH)
- IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
dst_release(*dst);
*dst = NULL;
+out_err:
+ if (err == -ENETUNREACH)
+ IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
return err;
}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index f2e2013f8..487ef3b 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1074,6 +1074,7 @@ static struct mfc6_cache *ip6mr_cache_alloc(void)
struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
if (!c)
return NULL;
+ c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
c->mfc_un.res.minvif = MAXMIFS;
return c;
}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index c245895..fe65cdc 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -73,15 +73,6 @@
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
-/* Set to 3 to get tracing... */
-#define ND_DEBUG 1
-
-#define ND_PRINTK(val, level, fmt, ...) \
-do { \
- if (val <= ND_DEBUG) \
- net_##level##_ratelimited(fmt, ##__VA_ARGS__); \
-} while (0)
-
static u32 ndisc_hash(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd);
@@ -150,11 +141,10 @@ struct neigh_table nd_tbl = {
};
EXPORT_SYMBOL_GPL(nd_tbl);
-static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data)
+void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data,
+ int data_len, int pad)
{
- int pad = ndisc_addr_option_pad(skb->dev->type);
- int data_len = skb->dev->addr_len;
- int space = ndisc_opt_addr_space(skb->dev);
+ int space = __ndisc_opt_addr_space(data_len, pad);
u8 *opt = skb_put(skb, space);
opt[0] = type;
@@ -171,6 +161,23 @@ static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data)
if (space > 0)
memset(opt, 0, space);
}
+EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option);
+
+static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type,
+ void *data, u8 icmp6_type)
+{
+ __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len,
+ ndisc_addr_option_pad(skb->dev->type));
+ ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type);
+}
+
+static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb,
+ void *ha,
+ const u8 *ops_data)
+{
+ ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT);
+ ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data);
+}
static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
struct nd_opt_hdr *end)
@@ -185,24 +192,28 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
return cur <= end && cur->nd_opt_type == type ? cur : NULL;
}
-static inline int ndisc_is_useropt(struct nd_opt_hdr *opt)
+static inline int ndisc_is_useropt(const struct net_device *dev,
+ struct nd_opt_hdr *opt)
{
return opt->nd_opt_type == ND_OPT_RDNSS ||
- opt->nd_opt_type == ND_OPT_DNSSL;
+ opt->nd_opt_type == ND_OPT_DNSSL ||
+ ndisc_ops_is_useropt(dev, opt->nd_opt_type);
}
-static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
+static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev,
+ struct nd_opt_hdr *cur,
struct nd_opt_hdr *end)
{
if (!cur || !end || cur >= end)
return NULL;
do {
cur = ((void *)cur) + (cur->nd_opt_len << 3);
- } while (cur < end && !ndisc_is_useropt(cur));
- return cur <= end && ndisc_is_useropt(cur) ? cur : NULL;
+ } while (cur < end && !ndisc_is_useropt(dev, cur));
+ return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL;
}
-struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
+struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
+ u8 *opt, int opt_len,
struct ndisc_options *ndopts)
{
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt;
@@ -217,6 +228,8 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
l = nd_opt->nd_opt_len << 3;
if (opt_len < l || l == 0)
return NULL;
+ if (ndisc_ops_parse_options(dev, nd_opt, ndopts))
+ goto next_opt;
switch (nd_opt->nd_opt_type) {
case ND_OPT_SOURCE_LL_ADDR:
case ND_OPT_TARGET_LL_ADDR:
@@ -243,7 +256,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
break;
#endif
default:
- if (ndisc_is_useropt(nd_opt)) {
+ if (ndisc_is_useropt(dev, nd_opt)) {
ndopts->nd_useropts_end = nd_opt;
if (!ndopts->nd_useropts)
ndopts->nd_useropts = nd_opt;
@@ -260,6 +273,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
nd_opt->nd_opt_len);
}
}
+next_opt:
opt_len -= l;
nd_opt = ((void *)nd_opt) + l;
}
@@ -509,7 +523,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
if (!dev->addr_len)
inc_opt = 0;
if (inc_opt)
- optlen += ndisc_opt_addr_space(dev);
+ optlen += ndisc_opt_addr_space(dev,
+ NDISC_NEIGHBOUR_ADVERTISEMENT);
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
@@ -528,8 +543,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
if (inc_opt)
ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
- dev->dev_addr);
-
+ dev->dev_addr,
+ NDISC_NEIGHBOUR_ADVERTISEMENT);
ndisc_send_skb(skb, daddr, src_addr);
}
@@ -574,7 +589,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
if (ipv6_addr_any(saddr))
inc_opt = false;
if (inc_opt)
- optlen += ndisc_opt_addr_space(dev);
+ optlen += ndisc_opt_addr_space(dev,
+ NDISC_NEIGHBOUR_SOLICITATION);
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
@@ -590,7 +606,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
if (inc_opt)
ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
- dev->dev_addr);
+ dev->dev_addr,
+ NDISC_NEIGHBOUR_SOLICITATION);
ndisc_send_skb(skb, daddr, saddr);
}
@@ -626,7 +643,7 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
}
#endif
if (send_sllao)
- optlen += ndisc_opt_addr_space(dev);
+ optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION);
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
@@ -641,7 +658,8 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
if (send_sllao)
ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
- dev->dev_addr);
+ dev->dev_addr,
+ NDISC_ROUTER_SOLICITATION);
ndisc_send_skb(skb, daddr, saddr);
}
@@ -702,6 +720,15 @@ static int pndisc_is_router(const void *pkey,
return ret;
}
+void ndisc_update(const struct net_device *dev, struct neighbour *neigh,
+ const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type,
+ struct ndisc_options *ndopts)
+{
+ neigh_update(neigh, lladdr, new, flags);
+ /* report ndisc ops about neighbour update */
+ ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts);
+}
+
static void ndisc_recv_ns(struct sk_buff *skb)
{
struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
@@ -738,7 +765,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
return;
}
- if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+ if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) {
ND_PRINTK(2, warn, "NS: invalid ND options\n");
return;
}
@@ -856,9 +883,10 @@ have_ifp:
neigh = __neigh_lookup(&nd_tbl, saddr, dev,
!inc || lladdr || !dev->addr_len);
if (neigh)
- neigh_update(neigh, lladdr, NUD_STALE,
+ ndisc_update(dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
- NEIGH_UPDATE_F_OVERRIDE);
+ NEIGH_UPDATE_F_OVERRIDE,
+ NDISC_NEIGHBOUR_SOLICITATION, &ndopts);
if (neigh || !dev->header_ops) {
ndisc_send_na(dev, saddr, &msg->target, !!is_router,
true, (ifp != NULL && inc), inc);
@@ -911,7 +939,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
idev->cnf.drop_unsolicited_na)
return;
- if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+ if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) {
ND_PRINTK(2, warn, "NS: invalid ND option\n");
return;
}
@@ -967,12 +995,13 @@ static void ndisc_recv_na(struct sk_buff *skb)
goto out;
}
- neigh_update(neigh, lladdr,
+ ndisc_update(dev, neigh, lladdr,
msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
(msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
- (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0));
+ (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0),
+ NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts);
if ((old_flags & ~neigh->flags) & NTF_ROUTER) {
/*
@@ -1017,7 +1046,7 @@ static void ndisc_recv_rs(struct sk_buff *skb)
goto out;
/* Parse ND options */
- if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) {
+ if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) {
ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n");
goto out;
}
@@ -1031,10 +1060,11 @@ static void ndisc_recv_rs(struct sk_buff *skb)
neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1);
if (neigh) {
- neigh_update(neigh, lladdr, NUD_STALE,
+ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE|
- NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER,
+ NDISC_ROUTER_SOLICITATION, &ndopts);
neigh_release(neigh);
}
out:
@@ -1135,7 +1165,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
- if (!ndisc_parse_options(opt, optlen, &ndopts)) {
+ if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) {
ND_PRINTK(2, warn, "RA: invalid ND options\n");
return;
}
@@ -1329,11 +1359,12 @@ skip_linkparms:
goto out;
}
}
- neigh_update(neigh, lladdr, NUD_STALE,
+ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
- NEIGH_UPDATE_F_ISROUTER);
+ NEIGH_UPDATE_F_ISROUTER,
+ NDISC_ROUTER_ADVERTISEMENT, &ndopts);
}
if (!ipv6_accept_ra(in6_dev)) {
@@ -1421,7 +1452,8 @@ skip_routeinfo:
struct nd_opt_hdr *p;
for (p = ndopts.nd_useropts;
p;
- p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) {
+ p = ndisc_next_useropt(skb->dev, p,
+ ndopts.nd_useropts_end)) {
ndisc_ra_useropt(skb, p);
}
}
@@ -1459,7 +1491,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
return;
}
- if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts))
+ if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts))
return;
if (!ndopts.nd_opts_rh) {
@@ -1504,7 +1536,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
struct dst_entry *dst;
struct flowi6 fl6;
int rd_len;
- u8 ha_buf[MAX_ADDR_LEN], *ha = NULL;
+ u8 ha_buf[MAX_ADDR_LEN], *ha = NULL,
+ ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
int oif = l3mdev_fib_oif(dev);
bool ret;
@@ -1563,7 +1596,9 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
memcpy(ha_buf, neigh->ha, dev->addr_len);
read_unlock_bh(&neigh->lock);
ha = ha_buf;
- optlen += ndisc_opt_addr_space(dev);
+ optlen += ndisc_redirect_opt_addr_space(dev, neigh,
+ ops_data_buf,
+ &ops_data);
} else
read_unlock_bh(&neigh->lock);
@@ -1594,7 +1629,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
*/
if (ha)
- ndisc_fill_addr_option(buff, ND_OPT_TARGET_LL_ADDR, ha);
+ ndisc_fill_redirect_addr_option(buff, ha, ops_data);
/*
* build redirect option and copy skb over to the new packet.
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c6ae6f9..4981755 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1042,8 +1042,8 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
return pcpu_rt;
}
-static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
- struct flowi6 *fl6, int flags)
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6, int flags)
{
struct fib6_node *fn, *saved_fn;
struct rt6_info *rt;
@@ -1139,6 +1139,7 @@ redo_rt6_select:
}
}
+EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
struct flowi6 *fl6, int flags)
@@ -1782,7 +1783,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
};
struct fib6_table *table;
struct rt6_info *rt;
- int flags = 0;
+ int flags = RT6_LOOKUP_F_IFACE;
table = fib6_get_table(net, cfg->fc_table);
if (!table)
@@ -2200,7 +2201,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
* first-hop router for the specified ICMP Destination Address.
*/
- if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
+ if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
return;
}
@@ -2235,12 +2236,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
* We have finally decided to accept it.
*/
- neigh_update(neigh, lladdr, NUD_STALE,
+ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE|
(on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
- NEIGH_UPDATE_F_ISROUTER))
- );
+ NEIGH_UPDATE_F_ISROUTER)),
+ NDISC_REDIRECT, &ndopts);
nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
if (!nrt)
@@ -2585,23 +2586,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
return rt;
}
-int ip6_route_get_saddr(struct net *net,
- struct rt6_info *rt,
- const struct in6_addr *daddr,
- unsigned int prefs,
- struct in6_addr *saddr)
-{
- struct inet6_dev *idev =
- rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
- int err = 0;
- if (rt && rt->rt6i_prefsrc.plen)
- *saddr = rt->rt6i_prefsrc.addr;
- else
- err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
- daddr, prefs, saddr);
- return err;
-}
-
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
struct net_device *dev;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index d9f2bd6..917a5cd 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -479,47 +479,12 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
dev_put(dev);
}
-/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH
- * if sufficient data bytes are available
- */
-static int ipip6_err_gen_icmpv6_unreach(struct sk_buff *skb)
-{
- int ihl = ((const struct iphdr *)skb->data)->ihl*4;
- struct rt6_info *rt;
- struct sk_buff *skb2;
-
- if (!pskb_may_pull(skb, ihl + sizeof(struct ipv6hdr) + 8))
- return 1;
-
- skb2 = skb_clone(skb, GFP_ATOMIC);
-
- if (!skb2)
- return 1;
-
- skb_dst_drop(skb2);
- skb_pull(skb2, ihl);
- skb_reset_network_header(skb2);
-
- rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0);
-
- if (rt && rt->dst.dev)
- skb2->dev = rt->dst.dev;
-
- icmpv6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
-
- if (rt)
- ip6_rt_put(rt);
-
- kfree_skb(skb2);
-
- return 0;
-}
-
static int ipip6_err(struct sk_buff *skb, u32 info)
{
const struct iphdr *iph = (const struct iphdr *)skb->data;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
+ unsigned int data_len = 0;
struct ip_tunnel *t;
int err;
@@ -544,6 +509,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
return 0;
+ data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
break;
case ICMP_REDIRECT:
break;
@@ -560,22 +526,22 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- t->parms.link, 0, IPPROTO_IPV6, 0);
+ t->parms.link, 0, iph->protocol, 0);
err = 0;
goto out;
}
if (type == ICMP_REDIRECT) {
ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
- IPPROTO_IPV6, 0);
+ iph->protocol, 0);
err = 0;
goto out;
}
- if (t->parms.iph.daddr == 0)
+ err = 0;
+ if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len))
goto out;
- err = 0;
- if (!ipip6_err_gen_icmpv6_unreach(skb))
+ if (t->parms.iph.daddr == 0)
goto out;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f36c2d0..37cf913 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -526,26 +526,33 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval,
AF_INET6, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
-static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr, int nbytes)
+static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr,
+ const struct tcphdr *th, int nbytes)
{
struct tcp6_pseudohdr *bp;
struct scatterlist sg;
+ struct tcphdr *_th;
- bp = &hp->md5_blk.ip6;
+ bp = hp->scratch;
/* 1. TCP pseudo-header (RFC2460) */
bp->saddr = *saddr;
bp->daddr = *daddr;
bp->protocol = cpu_to_be32(IPPROTO_TCP);
bp->len = cpu_to_be32(nbytes);
- sg_init_one(&sg, bp, sizeof(*bp));
- ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
+ _th = (struct tcphdr *)(bp + 1);
+ memcpy(_th, th, sizeof(*th));
+ _th->check = 0;
+
+ sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
+ ahash_request_set_crypt(hp->md5_req, &sg, NULL,
+ sizeof(*bp) + sizeof(*th));
return crypto_ahash_update(hp->md5_req);
}
-static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
+static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
const struct in6_addr *daddr, struct in6_addr *saddr,
const struct tcphdr *th)
{
@@ -559,9 +566,7 @@ static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
if (crypto_ahash_init(req))
goto clear_hash;
- if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
- goto clear_hash;
- if (tcp_md5_hash_header(hp, th))
+ if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
@@ -606,9 +611,7 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
if (crypto_ahash_init(req))
goto clear_hash;
- if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
- goto clear_hash;
- if (tcp_md5_hash_header(hp, th))
+ if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, skb->len))
goto clear_hash;
if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
goto clear_hash;
@@ -738,7 +741,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr,
int oif, struct tcp_md5sig_key *key, int rst,
- u8 tclass, u32 label)
+ u8 tclass, __be32 label)
{
const struct tcphdr *th = tcp_hdr(skb);
struct tcphdr *t1;
@@ -911,7 +914,7 @@ out:
static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key, u8 tclass,
- u32 label)
+ __be32 label)
{
tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
tclass, label);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4bb5c13..0a71a312d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -115,11 +115,10 @@ static void udp_v6_rehash(struct sock *sk)
udp_lib_rehash(sk, new_hash);
}
-static inline int compute_score(struct sock *sk, struct net *net,
- unsigned short hnum,
- const struct in6_addr *saddr, __be16 sport,
- const struct in6_addr *daddr, __be16 dport,
- int dif)
+static int compute_score(struct sock *sk, struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, unsigned short hnum,
+ int dif)
{
int score;
struct inet_sock *inet;
@@ -162,54 +161,11 @@ static inline int compute_score(struct sock *sk, struct net *net,
return score;
}
-static inline int compute_score2(struct sock *sk, struct net *net,
- const struct in6_addr *saddr, __be16 sport,
- const struct in6_addr *daddr,
- unsigned short hnum, int dif)
-{
- int score;
- struct inet_sock *inet;
-
- if (!net_eq(sock_net(sk), net) ||
- udp_sk(sk)->udp_port_hash != hnum ||
- sk->sk_family != PF_INET6)
- return -1;
-
- if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
- return -1;
-
- score = 0;
- inet = inet_sk(sk);
-
- if (inet->inet_dport) {
- if (inet->inet_dport != sport)
- return -1;
- score++;
- }
-
- if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
- if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
- return -1;
- score++;
- }
-
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- return -1;
- score++;
- }
-
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
- score++;
-
- return score;
-}
-
-/* called with read_rcu_lock() */
+/* called with rcu_read_lock() */
static struct sock *udp6_lib_lookup2(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned int hnum, int dif,
- struct udp_hslot *hslot2, unsigned int slot2,
+ struct udp_hslot *hslot2,
struct sk_buff *skb)
{
struct sock *sk, *result;
@@ -219,7 +175,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
result = NULL;
badness = -1;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
- score = compute_score2(sk, net, saddr, sport,
+ score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
@@ -268,17 +224,22 @@ struct sock *__udp6_lib_lookup(struct net *net,
result = udp6_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, slot2, skb);
+ hslot2, skb);
if (!result) {
+ unsigned int old_slot2 = slot2;
hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum);
slot2 = hash2 & udptable->mask;
+ /* avoid searching the same slot again. */
+ if (unlikely(slot2 == old_slot2))
+ return result;
+
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
goto begin;
result = udp6_lib_lookup2(net, saddr, sport,
- &in6addr_any, hnum, dif,
- hslot2, slot2, skb);
+ daddr, hnum, dif,
+ hslot2, skb);
}
return result;
}
@@ -286,7 +247,7 @@ begin:
result = NULL;
badness = -1;
sk_for_each_rcu(sk, &hslot->head) {
- score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
+ score = compute_score(sk, net, saddr, sport, daddr, hnum, dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index c074771..6cc9700 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -366,12 +366,12 @@ static void __net_exit xfrm6_net_sysctl_exit(struct net *net)
kfree(table);
}
#else /* CONFIG_SYSCTL */
-static int inline xfrm6_net_sysctl_init(struct net *net)
+static inline int xfrm6_net_sysctl_init(struct net *net)
{
return 0;
}
-static void inline xfrm6_net_sysctl_exit(struct net *net)
+static inline void xfrm6_net_sysctl_exit(struct net *net)
{
}
#endif
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index fc3598a..37d674e 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1033,6 +1033,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
{
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
+ size_t headroom, linear;
struct sk_buff *skb;
struct iucv_message txmsg = {0};
struct cmsghdr *cmsg;
@@ -1110,20 +1111,31 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
* this is fine for SOCK_SEQPACKET (unless we want to support
* segmented records using the MSG_EOR flag), but
* for SOCK_STREAM we might want to improve it in future */
- if (iucv->transport == AF_IUCV_TRANS_HIPER)
- skb = sock_alloc_send_skb(sk,
- len + sizeof(struct af_iucv_trans_hdr) + ETH_HLEN,
- noblock, &err);
- else
- skb = sock_alloc_send_skb(sk, len, noblock, &err);
+ headroom = (iucv->transport == AF_IUCV_TRANS_HIPER)
+ ? sizeof(struct af_iucv_trans_hdr) + ETH_HLEN : 0;
+ if (headroom + len < PAGE_SIZE) {
+ linear = len;
+ } else {
+ /* In nonlinear "classic" iucv skb,
+ * reserve space for iucv_array
+ */
+ if (iucv->transport != AF_IUCV_TRANS_HIPER)
+ headroom += sizeof(struct iucv_array) *
+ (MAX_SKB_FRAGS + 1);
+ linear = PAGE_SIZE - headroom;
+ }
+ skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear,
+ noblock, &err, 0);
if (!skb)
goto out;
- if (iucv->transport == AF_IUCV_TRANS_HIPER)
- skb_reserve(skb, sizeof(struct af_iucv_trans_hdr) + ETH_HLEN);
- if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
- err = -EFAULT;
+ if (headroom)
+ skb_reserve(skb, headroom);
+ skb_put(skb, linear);
+ skb->len = len;
+ skb->data_len = len - linear;
+ err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
+ if (err)
goto fail;
- }
/* wait if outstanding messages for iucv path has reached */
timeo = sock_sndtimeo(sk, noblock);
@@ -1148,49 +1160,67 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
atomic_dec(&iucv->msg_sent);
goto fail;
}
- goto release;
- }
- skb_queue_tail(&iucv->send_skb_q, skb);
-
- if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags)
- && skb->len <= 7) {
- err = iucv_send_iprm(iucv->path, &txmsg, skb);
+ } else { /* Classic VM IUCV transport */
+ skb_queue_tail(&iucv->send_skb_q, skb);
+
+ if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags) &&
+ skb->len <= 7) {
+ err = iucv_send_iprm(iucv->path, &txmsg, skb);
+
+ /* on success: there is no message_complete callback */
+ /* for an IPRMDATA msg; remove skb from send queue */
+ if (err == 0) {
+ skb_unlink(skb, &iucv->send_skb_q);
+ kfree_skb(skb);
+ }
- /* on success: there is no message_complete callback
- * for an IPRMDATA msg; remove skb from send queue */
- if (err == 0) {
- skb_unlink(skb, &iucv->send_skb_q);
- kfree_skb(skb);
+ /* this error should never happen since the */
+ /* IUCV_IPRMDATA path flag is set... sever path */
+ if (err == 0x15) {
+ pr_iucv->path_sever(iucv->path, NULL);
+ skb_unlink(skb, &iucv->send_skb_q);
+ err = -EPIPE;
+ goto fail;
+ }
+ } else if (skb_is_nonlinear(skb)) {
+ struct iucv_array *iba = (struct iucv_array *)skb->head;
+ int i;
+
+ /* skip iucv_array lying in the headroom */
+ iba[0].address = (u32)(addr_t)skb->data;
+ iba[0].length = (u32)skb_headlen(skb);
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ iba[i + 1].address =
+ (u32)(addr_t)skb_frag_address(frag);
+ iba[i + 1].length = (u32)skb_frag_size(frag);
+ }
+ err = pr_iucv->message_send(iucv->path, &txmsg,
+ IUCV_IPBUFLST, 0,
+ (void *)iba, skb->len);
+ } else { /* non-IPRM Linear skb */
+ err = pr_iucv->message_send(iucv->path, &txmsg,
+ 0, 0, (void *)skb->data, skb->len);
}
-
- /* this error should never happen since the
- * IUCV_IPRMDATA path flag is set... sever path */
- if (err == 0x15) {
- pr_iucv->path_sever(iucv->path, NULL);
+ if (err) {
+ if (err == 3) {
+ user_id[8] = 0;
+ memcpy(user_id, iucv->dst_user_id, 8);
+ appl_id[8] = 0;
+ memcpy(appl_id, iucv->dst_name, 8);
+ pr_err(
+ "Application %s on z/VM guest %s exceeds message limit\n",
+ appl_id, user_id);
+ err = -EAGAIN;
+ } else {
+ err = -EPIPE;
+ }
skb_unlink(skb, &iucv->send_skb_q);
- err = -EPIPE;
goto fail;
}
- } else
- err = pr_iucv->message_send(iucv->path, &txmsg, 0, 0,
- (void *) skb->data, skb->len);
- if (err) {
- if (err == 3) {
- user_id[8] = 0;
- memcpy(user_id, iucv->dst_user_id, 8);
- appl_id[8] = 0;
- memcpy(appl_id, iucv->dst_name, 8);
- pr_err("Application %s on z/VM guest %s"
- " exceeds message limit\n",
- appl_id, user_id);
- err = -EAGAIN;
- } else
- err = -EPIPE;
- skb_unlink(skb, &iucv->send_skb_q);
- goto fail;
}
-release:
release_sock(sk);
return len;
@@ -1201,42 +1231,32 @@ out:
return err;
}
-/* iucv_fragment_skb() - Fragment a single IUCV message into multiple skb's
- *
- * Locking: must be called with message_q.lock held
- */
-static int iucv_fragment_skb(struct sock *sk, struct sk_buff *skb, int len)
+static struct sk_buff *alloc_iucv_recv_skb(unsigned long len)
{
- int dataleft, size, copied = 0;
- struct sk_buff *nskb;
-
- dataleft = len;
- while (dataleft) {
- if (dataleft >= sk->sk_rcvbuf / 4)
- size = sk->sk_rcvbuf / 4;
- else
- size = dataleft;
-
- nskb = alloc_skb(size, GFP_ATOMIC | GFP_DMA);
- if (!nskb)
- return -ENOMEM;
-
- /* copy target class to control buffer of new skb */
- IUCV_SKB_CB(nskb)->class = IUCV_SKB_CB(skb)->class;
-
- /* copy data fragment */
- memcpy(nskb->data, skb->data + copied, size);
- copied += size;
- dataleft -= size;
-
- skb_reset_transport_header(nskb);
- skb_reset_network_header(nskb);
- nskb->len = size;
+ size_t headroom, linear;
+ struct sk_buff *skb;
+ int err;
- skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, nskb);
+ if (len < PAGE_SIZE) {
+ headroom = 0;
+ linear = len;
+ } else {
+ headroom = sizeof(struct iucv_array) * (MAX_SKB_FRAGS + 1);
+ linear = PAGE_SIZE - headroom;
+ }
+ skb = alloc_skb_with_frags(headroom + linear, len - linear,
+ 0, &err, GFP_ATOMIC | GFP_DMA);
+ WARN_ONCE(!skb,
+ "alloc of recv iucv skb len=%lu failed with errcode=%d\n",
+ len, err);
+ if (skb) {
+ if (headroom)
+ skb_reserve(skb, headroom);
+ skb_put(skb, linear);
+ skb->len = len;
+ skb->data_len = len - linear;
}
-
- return 0;
+ return skb;
}
/* iucv_process_message() - Receive a single outstanding IUCV message
@@ -1263,31 +1283,32 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb,
skb->len = 0;
}
} else {
- rc = pr_iucv->message_receive(path, msg,
+ if (skb_is_nonlinear(skb)) {
+ struct iucv_array *iba = (struct iucv_array *)skb->head;
+ int i;
+
+ iba[0].address = (u32)(addr_t)skb->data;
+ iba[0].length = (u32)skb_headlen(skb);
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ iba[i + 1].address =
+ (u32)(addr_t)skb_frag_address(frag);
+ iba[i + 1].length = (u32)skb_frag_size(frag);
+ }
+ rc = pr_iucv->message_receive(path, msg,
+ IUCV_IPBUFLST,
+ (void *)iba, len, NULL);
+ } else {
+ rc = pr_iucv->message_receive(path, msg,
msg->flags & IUCV_IPRMDATA,
skb->data, len, NULL);
+ }
if (rc) {
kfree_skb(skb);
return;
}
- /* we need to fragment iucv messages for SOCK_STREAM only;
- * for SOCK_SEQPACKET, it is only relevant if we support
- * record segmentation using MSG_EOR (see also recvmsg()) */
- if (sk->sk_type == SOCK_STREAM &&
- skb->truesize >= sk->sk_rcvbuf / 4) {
- rc = iucv_fragment_skb(sk, skb, len);
- kfree_skb(skb);
- skb = NULL;
- if (rc) {
- pr_iucv->path_sever(path, NULL);
- return;
- }
- skb = skb_dequeue(&iucv_sk(sk)->backlog_skb_q);
- } else {
- skb_reset_transport_header(skb);
- skb_reset_network_header(skb);
- skb->len = len;
- }
+ WARN_ON_ONCE(skb->len != len);
}
IUCV_SKB_CB(skb)->offset = 0;
@@ -1306,7 +1327,7 @@ static void iucv_process_message_q(struct sock *sk)
struct sock_msg_q *p, *n;
list_for_each_entry_safe(p, n, &iucv->message_q.list, list) {
- skb = alloc_skb(iucv_msg_length(&p->msg), GFP_ATOMIC | GFP_DMA);
+ skb = alloc_iucv_recv_skb(iucv_msg_length(&p->msg));
if (!skb)
break;
iucv_process_message(sk, skb, p->path, &p->msg);
@@ -1801,7 +1822,7 @@ static void iucv_callback_rx(struct iucv_path *path, struct iucv_message *msg)
if (len > sk->sk_rcvbuf)
goto save_message;
- skb = alloc_skb(iucv_msg_length(msg), GFP_ATOMIC | GFP_DMA);
+ skb = alloc_iucv_recv_skb(iucv_msg_length(msg));
if (!skb)
goto save_message;
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 7380087..fda7f47 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -241,6 +241,7 @@ static const struct file_operations kcm_seq_fops = {
.open = kcm_seq_open,
.read = seq_read,
.llseek = seq_lseek,
+ .release = seq_release_net,
};
static struct kcm_seq_muxinfo kcm_seq_muxinfo = {
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 0b68ba7..cb39e05 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1765,18 +1765,12 @@ static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
if (!csock)
return -ENOENT;
- prog = bpf_prog_get(info->bpf_fd);
+ prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER);
if (IS_ERR(prog)) {
err = PTR_ERR(prog);
goto out;
}
- if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
- bpf_prog_put(prog);
- err = -EINVAL;
- goto out;
- }
-
err = kcm_attach(sock, csock, prog);
if (err) {
bpf_prog_put(prog);
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 7da9780..c4a1c3e 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
*/
struct dst_entry *l3mdev_get_rt6_dst(struct net *net,
- const struct flowi6 *fl6)
+ struct flowi6 *fl6)
{
struct dst_entry *dst = NULL;
struct net_device *dev;
@@ -162,6 +162,30 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4)
}
EXPORT_SYMBOL_GPL(l3mdev_get_saddr);
+int l3mdev_get_saddr6(struct net *net, const struct sock *sk,
+ struct flowi6 *fl6)
+{
+ struct net_device *dev;
+ int rc = 0;
+
+ if (fl6->flowi6_oif) {
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
+ if (dev && netif_is_l3_slave(dev))
+ dev = netdev_master_upper_dev_get_rcu(dev);
+
+ if (dev && netif_is_l3_master(dev) &&
+ dev->l3mdev_ops->l3mdev_get_saddr6)
+ rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6);
+
+ rcu_read_unlock();
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(l3mdev_get_saddr6);
+
/**
* l3mdev_fib_rule_match - Determine if flowi references an
* L3 master device
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 21b1fdf..6a1603b 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -148,14 +148,17 @@ u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata)
void mesh_sta_cleanup(struct sta_info *sta)
{
struct ieee80211_sub_if_data *sdata = sta->sdata;
- u32 changed;
+ u32 changed = 0;
/*
* maybe userspace handles peer allocation and peering, but in either
* case the beacon is still generated by the kernel and we might need
* an update.
*/
- changed = mesh_accept_plinks_update(sdata);
+ if (sdata->u.mesh.user_mpm &&
+ sta->mesh->plink_state == NL80211_PLINK_ESTAB)
+ changed |= mesh_plink_dec_estab_count(sdata);
+ changed |= mesh_accept_plinks_update(sdata);
if (!sdata->u.mesh.user_mpm) {
changed |= mesh_plink_deactivate(sta);
del_timer_sync(&sta->mesh->plink_timer);
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 7a4aa34..e9beaa5 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1009,9 +1009,10 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
unsigned int flags;
if (event == NETDEV_REGISTER) {
- /* For now just support ethernet devices */
- if ((dev->type == ARPHRD_ETHER) ||
- (dev->type == ARPHRD_LOOPBACK)) {
+ /* For now just support Ethernet and IPGRE devices */
+ if (dev->type == ARPHRD_ETHER ||
+ dev->type == ARPHRD_LOOPBACK ||
+ dev->type == ARPHRD_IPGRE) {
mdev = mpls_add_dev(dev);
if (IS_ERR(mdev))
return notifier_from_errno(PTR_ERR(mdev));
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index db2312e..f204274 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1544,6 +1544,8 @@ void nf_conntrack_cleanup_end(void)
nf_conntrack_tstamp_fini();
nf_conntrack_acct_fini();
nf_conntrack_expect_fini();
+
+ kmem_cache_destroy(nf_conntrack_cachep);
}
/*
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7b7aa87..2c88187 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2946,24 +2946,20 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
* jumps are already validated for that chain.
*/
list_for_each_entry(i, &set->bindings, list) {
- if (binding->flags & NFT_SET_MAP &&
+ if (i->flags & NFT_SET_MAP &&
i->chain == binding->chain)
goto bind;
}
+ iter.genmask = nft_genmask_next(ctx->net);
iter.skip = 0;
iter.count = 0;
iter.err = 0;
iter.fn = nf_tables_bind_check_setelem;
set->ops->walk(ctx, set, &iter);
- if (iter.err < 0) {
- /* Destroy anonymous sets if binding fails */
- if (set->flags & NFT_SET_ANONYMOUS)
- nf_tables_set_destroy(ctx, set);
-
+ if (iter.err < 0)
return iter.err;
- }
}
bind:
binding->chain = ctx->chain;
@@ -3192,12 +3188,13 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
if (nest == NULL)
goto nla_put_failure;
- args.cb = cb;
- args.skb = skb;
- args.iter.skip = cb->args[0];
- args.iter.count = 0;
- args.iter.err = 0;
- args.iter.fn = nf_tables_dump_setelem;
+ args.cb = cb;
+ args.skb = skb;
+ args.iter.genmask = nft_genmask_cur(ctx.net);
+ args.iter.skip = cb->args[0];
+ args.iter.count = 0;
+ args.iter.err = 0;
+ args.iter.fn = nf_tables_dump_setelem;
set->ops->walk(&ctx, set, &args.iter);
nla_nest_end(skb, nest);
@@ -4284,6 +4281,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
binding->chain != chain)
continue;
+ iter.genmask = nft_genmask_next(ctx->net);
iter.skip = 0;
iter.count = 0;
iter.err = 0;
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index e9f8dff..fb8b589 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -143,7 +143,7 @@ next_rule:
list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
/* This rule is not active, skip. */
- if (unlikely(rule->genmask & (1 << gencursor)))
+ if (unlikely(rule->genmask & gencursor))
continue;
rulenum++;
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 6fa0165..f39c53a 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -189,7 +189,6 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
struct nft_hash_elem *he;
struct rhashtable_iter hti;
struct nft_set_elem elem;
- u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
int err;
err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
@@ -218,7 +217,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
goto cont;
if (nft_set_elem_expired(&he->ext))
goto cont;
- if (!nft_set_elem_active(&he->ext, genmask))
+ if (!nft_set_elem_active(&he->ext, iter->genmask))
goto cont;
elem.priv = he;
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 16c50b0..03e5e33 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -199,13 +199,6 @@ err:
}
EXPORT_SYMBOL_GPL(nft_meta_get_eval);
-/* don't change or set _LOOPBACK, _USER, etc. */
-static bool pkt_type_ok(u32 p)
-{
- return p == PACKET_HOST || p == PACKET_BROADCAST ||
- p == PACKET_MULTICAST || p == PACKET_OTHERHOST;
-}
-
void nft_meta_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -223,7 +216,7 @@ void nft_meta_set_eval(const struct nft_expr *expr,
break;
case NFT_META_PKTTYPE:
if (skb->pkt_type != value &&
- pkt_type_ok(value) && pkt_type_ok(skb->pkt_type))
+ skb_pkt_type_ok(value) && skb_pkt_type_ok(skb->pkt_type))
skb->pkt_type = value;
break;
case NFT_META_NFTRACE:
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
index f762094..7201d57 100644
--- a/net/netfilter/nft_rbtree.c
+++ b/net/netfilter/nft_rbtree.c
@@ -211,7 +211,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
struct nft_rbtree_elem *rbe;
struct nft_set_elem elem;
struct rb_node *node;
- u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
spin_lock_bh(&nft_rbtree_lock);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
@@ -219,7 +218,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
if (iter->count < iter->skip)
goto cont;
- if (!nft_set_elem_active(&rbe->ext, genmask))
+ if (!nft_set_elem_active(&rbe->ext, iter->genmask))
goto cont;
elem.priv = rbe;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 3d5feed..b4069a9 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -818,12 +818,33 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
*/
state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
__ovs_ct_update_key(key, state, &info->zone, exp->master);
- } else
- return __ovs_ct_lookup(net, key, info, skb);
+ } else {
+ struct nf_conn *ct;
+ int err;
+
+ err = __ovs_ct_lookup(net, key, info, skb);
+ if (err)
+ return err;
+
+ ct = (struct nf_conn *)skb->nfct;
+ if (ct)
+ nf_ct_deliver_cached_events(ct);
+ }
return 0;
}
+static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
+{
+ size_t i;
+
+ for (i = 0; i < sizeof(*labels); i++)
+ if (labels->ct_labels[i])
+ return true;
+
+ return false;
+}
+
/* Lookup connection and confirm if unconfirmed. */
static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
@@ -834,24 +855,32 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
err = __ovs_ct_lookup(net, key, info, skb);
if (err)
return err;
- /* This is a no-op if the connection has already been confirmed. */
+
+ /* Apply changes before confirming the connection so that the initial
+ * conntrack NEW netlink event carries the values given in the CT
+ * action.
+ */
+ if (info->mark.mask) {
+ err = ovs_ct_set_mark(skb, key, info->mark.value,
+ info->mark.mask);
+ if (err)
+ return err;
+ }
+ if (labels_nonzero(&info->labels.mask)) {
+ err = ovs_ct_set_labels(skb, key, &info->labels.value,
+ &info->labels.mask);
+ if (err)
+ return err;
+ }
+ /* This will take care of sending queued events even if the connection
+ * is already confirmed.
+ */
if (nf_conntrack_confirm(skb) != NF_ACCEPT)
return -EINVAL;
return 0;
}
-static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
-{
- size_t i;
-
- for (i = 0; i < sizeof(*labels); i++)
- if (labels->ct_labels[i])
- return true;
-
- return false;
-}
-
/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
* value if 'skb' is freed.
*/
@@ -876,19 +905,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
err = ovs_ct_commit(net, key, info, skb);
else
err = ovs_ct_lookup(net, key, info, skb);
- if (err)
- goto err;
- if (info->mark.mask) {
- err = ovs_ct_set_mark(skb, key, info->mark.value,
- info->mark.mask);
- if (err)
- goto err;
- }
- if (labels_nonzero(&info->labels.mask))
- err = ovs_ct_set_labels(skb, key, &info->labels.value,
- &info->labels.mask);
-err:
skb_push(skb, nh_ofs);
if (err)
kfree_skb(skb);
@@ -1145,6 +1162,20 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
}
}
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if (!info->commit && info->mark.mask) {
+ OVS_NLERR(log,
+ "Setting conntrack mark requires 'commit' flag.");
+ return -EINVAL;
+ }
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ if (!info->commit && labels_nonzero(&info->labels.mask)) {
+ OVS_NLERR(log,
+ "Setting conntrack labels requires 'commit' flag.");
+ return -EINVAL;
+ }
+#endif
if (rem > 0) {
OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem);
return -EINVAL;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 6739342..524c0fd 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -387,7 +387,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
{
size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
- + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */
+ + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */
+ + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */
/* OVS_PACKET_ATTR_USERDATA */
if (upcall_info->userdata)
@@ -514,6 +515,16 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
pad_packet(dp, user_skb);
}
+ /* Add OVS_PACKET_ATTR_LEN when packet is truncated */
+ if (cutlen > 0) {
+ if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN,
+ skb->len)) {
+ err = -ENOBUFS;
+ goto out;
+ }
+ pad_packet(dp, user_skb);
+ }
+
/* Only reserve room for attribute header, packet data is added
* in skb_zerocopy() */
if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d1f3b9e..48b5895 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1588,13 +1588,9 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
if (copy_from_user(&fd, data, len))
return -EFAULT;
- new = bpf_prog_get(fd);
+ new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
if (IS_ERR(new))
return PTR_ERR(new);
- if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
- bpf_prog_put(new);
- return -EINVAL;
- }
__fanout_set_data_bpf(po->fanout, new);
return 0;
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 6641bcf..8398fee 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -235,7 +235,8 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
* therefore trigger warnings.
* Defer the xmit to rds_send_worker() instead.
*/
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ queue_delayed_work(rds_wq,
+ &conn->c_path[0].cp_send_w, 0);
}
}
diff --git a/net/rds/connection.c b/net/rds/connection.c
index e3b118c..19a4fee 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -95,14 +95,16 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
* and receiving over this connection again in the future. It is up to
* the transport to have serialized this call with its send and recv.
*/
-static void rds_conn_reset(struct rds_connection *conn)
+static void rds_conn_path_reset(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
+
rdsdebug("connection %pI4 to %pI4 reset\n",
&conn->c_laddr, &conn->c_faddr);
rds_stats_inc(s_conn_reset);
- rds_send_reset(conn);
- conn->c_flags = 0;
+ rds_send_path_reset(cp);
+ cp->cp_flags = 0;
/* Do not clear next_rx_seq here, else we cannot distinguish
* retransmitted packets from new packets, and will hand all
@@ -110,6 +112,32 @@ static void rds_conn_reset(struct rds_connection *conn)
* reliability guarantees of RDS. */
}
+static void __rds_conn_path_init(struct rds_connection *conn,
+ struct rds_conn_path *cp, bool is_outgoing)
+{
+ spin_lock_init(&cp->cp_lock);
+ cp->cp_next_tx_seq = 1;
+ init_waitqueue_head(&cp->cp_waitq);
+ INIT_LIST_HEAD(&cp->cp_send_queue);
+ INIT_LIST_HEAD(&cp->cp_retrans);
+
+ cp->cp_conn = conn;
+ atomic_set(&cp->cp_state, RDS_CONN_DOWN);
+ cp->cp_send_gen = 0;
+ /* cp_outgoing is per-path. So we can only set it here
+ * for the single-path transports.
+ */
+ if (!conn->c_trans->t_mp_capable)
+ cp->cp_outgoing = (is_outgoing ? 1 : 0);
+ cp->cp_reconnect_jiffies = 0;
+ INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker);
+ INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker);
+ INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker);
+ INIT_WORK(&cp->cp_down_w, rds_shutdown_worker);
+ mutex_init(&cp->cp_cm_lock);
+ cp->cp_flags = 0;
+}
+
/*
* There is only every one 'conn' for a given pair of addresses in the
* system at a time. They contain messages to be retransmitted and so
@@ -153,13 +181,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
INIT_HLIST_NODE(&conn->c_hash_node);
conn->c_laddr = laddr;
conn->c_faddr = faddr;
- spin_lock_init(&conn->c_lock);
- conn->c_next_tx_seq = 1;
- rds_conn_net_set(conn, net);
- init_waitqueue_head(&conn->c_waitq);
- INIT_LIST_HEAD(&conn->c_send_queue);
- INIT_LIST_HEAD(&conn->c_retrans);
+ rds_conn_net_set(conn, net);
ret = rds_cong_get_maps(conn);
if (ret) {
@@ -195,17 +218,6 @@ static struct rds_connection *__rds_conn_create(struct net *net,
goto out;
}
- atomic_set(&conn->c_state, RDS_CONN_DOWN);
- conn->c_send_gen = 0;
- conn->c_outgoing = (is_outgoing ? 1 : 0);
- conn->c_reconnect_jiffies = 0;
- INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
- INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
- INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
- INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
- mutex_init(&conn->c_cm_lock);
- conn->c_flags = 0;
-
rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
conn, &laddr, &faddr,
trans->t_name ? trans->t_name : "[unknown]",
@@ -222,7 +234,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
if (parent) {
/* Creating passive conn */
if (parent->c_passive) {
- trans->conn_free(conn->c_transport_data);
+ trans->conn_free(conn->c_path[0].cp_transport_data);
kmem_cache_free(rds_conn_slab, conn);
conn = parent->c_passive;
} else {
@@ -236,10 +248,29 @@ static struct rds_connection *__rds_conn_create(struct net *net,
found = rds_conn_lookup(net, head, laddr, faddr, trans);
if (found) {
- trans->conn_free(conn->c_transport_data);
+ struct rds_conn_path *cp;
+ int i;
+
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ cp = &conn->c_path[i];
+ /* The ->conn_alloc invocation may have
+ * allocated resource for all paths, so all
+ * of them may have to be freed here.
+ */
+ if (cp->cp_transport_data)
+ trans->conn_free(cp->cp_transport_data);
+ }
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
+ int i;
+
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ __rds_conn_path_init(conn, &conn->c_path[i],
+ is_outgoing);
+ conn->c_path[i].cp_index = i;
+ }
+
hlist_add_head_rcu(&conn->c_hash_node, head);
rds_cong_add_conn(conn);
rds_conn_count++;
@@ -267,10 +298,12 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net,
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
-void rds_conn_shutdown(struct rds_connection *conn)
+void rds_conn_shutdown(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
+
/* shut it down unless it's down already */
- if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+ if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
/*
* Quiesce the connection mgmt handlers before we start tearing
* things down. We don't hold the mutex for the entire
@@ -278,35 +311,38 @@ void rds_conn_shutdown(struct rds_connection *conn)
* deadlocking with the CM handler. Instead, the CM event
* handler is supposed to check for state DISCONNECTING
*/
- mutex_lock(&conn->c_cm_lock);
- if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
- && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
- rds_conn_error(conn, "shutdown called in state %d\n",
- atomic_read(&conn->c_state));
- mutex_unlock(&conn->c_cm_lock);
+ mutex_lock(&cp->cp_cm_lock);
+ if (!rds_conn_path_transition(cp, RDS_CONN_UP,
+ RDS_CONN_DISCONNECTING) &&
+ !rds_conn_path_transition(cp, RDS_CONN_ERROR,
+ RDS_CONN_DISCONNECTING)) {
+ rds_conn_path_error(cp,
+ "shutdown called in state %d\n",
+ atomic_read(&cp->cp_state));
+ mutex_unlock(&cp->cp_cm_lock);
return;
}
- mutex_unlock(&conn->c_cm_lock);
+ mutex_unlock(&cp->cp_cm_lock);
- wait_event(conn->c_waitq,
- !test_bit(RDS_IN_XMIT, &conn->c_flags));
- wait_event(conn->c_waitq,
- !test_bit(RDS_RECV_REFILL, &conn->c_flags));
+ wait_event(cp->cp_waitq,
+ !test_bit(RDS_IN_XMIT, &cp->cp_flags));
+ wait_event(cp->cp_waitq,
+ !test_bit(RDS_RECV_REFILL, &cp->cp_flags));
- conn->c_trans->conn_shutdown(conn);
- rds_conn_reset(conn);
+ conn->c_trans->conn_path_shutdown(cp);
+ rds_conn_path_reset(cp);
- if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+ if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING,
+ RDS_CONN_DOWN)) {
/* This can happen - eg when we're in the middle of tearing
* down the connection, and someone unloads the rds module.
* Quite reproduceable with loopback connections.
* Mostly harmless.
*/
- rds_conn_error(conn,
- "%s: failed to transition to state DOWN, "
- "current state is %d\n",
- __func__,
- atomic_read(&conn->c_state));
+ rds_conn_path_error(cp, "%s: failed to transition "
+ "to state DOWN, current state "
+ "is %d\n", __func__,
+ atomic_read(&cp->cp_state));
return;
}
}
@@ -315,18 +351,47 @@ void rds_conn_shutdown(struct rds_connection *conn)
* The passive side of an IB loopback connection is never added
* to the conn hash, so we never trigger a reconnect on this
* conn - the reconnect is always triggered by the active peer. */
- cancel_delayed_work_sync(&conn->c_conn_w);
+ cancel_delayed_work_sync(&cp->cp_conn_w);
rcu_read_lock();
if (!hlist_unhashed(&conn->c_hash_node)) {
rcu_read_unlock();
- if (conn->c_trans->t_type != RDS_TRANS_TCP ||
- conn->c_outgoing == 1)
- rds_queue_reconnect(conn);
+ rds_queue_reconnect(cp);
} else {
rcu_read_unlock();
}
}
+/* destroy a single rds_conn_path. rds_conn_destroy() iterates over
+ * all paths using rds_conn_path_destroy()
+ */
+static void rds_conn_path_destroy(struct rds_conn_path *cp)
+{
+ struct rds_message *rm, *rtmp;
+
+ if (!cp->cp_transport_data)
+ return;
+
+ rds_conn_path_drop(cp);
+ flush_work(&cp->cp_down_w);
+
+ /* make sure lingering queued work won't try to ref the conn */
+ cancel_delayed_work_sync(&cp->cp_send_w);
+ cancel_delayed_work_sync(&cp->cp_recv_w);
+
+ /* tear down queued messages */
+ list_for_each_entry_safe(rm, rtmp,
+ &cp->cp_send_queue,
+ m_conn_item) {
+ list_del_init(&rm->m_conn_item);
+ BUG_ON(!list_empty(&rm->m_sock_item));
+ rds_message_put(rm);
+ }
+ if (cp->cp_xmit_rm)
+ rds_message_put(cp->cp_xmit_rm);
+
+ cp->cp_conn->c_trans->conn_free(cp->cp_transport_data);
+}
+
/*
* Stop and free a connection.
*
@@ -336,8 +401,9 @@ void rds_conn_shutdown(struct rds_connection *conn)
*/
void rds_conn_destroy(struct rds_connection *conn)
{
- struct rds_message *rm, *rtmp;
unsigned long flags;
+ int i;
+ struct rds_conn_path *cp;
rdsdebug("freeing conn %p for %pI4 -> "
"%pI4\n", conn, &conn->c_laddr,
@@ -350,25 +416,11 @@ void rds_conn_destroy(struct rds_connection *conn)
synchronize_rcu();
/* shut the connection down */
- rds_conn_drop(conn);
- flush_work(&conn->c_down_w);
-
- /* make sure lingering queued work won't try to ref the conn */
- cancel_delayed_work_sync(&conn->c_send_w);
- cancel_delayed_work_sync(&conn->c_recv_w);
-
- /* tear down queued messages */
- list_for_each_entry_safe(rm, rtmp,
- &conn->c_send_queue,
- m_conn_item) {
- list_del_init(&rm->m_conn_item);
- BUG_ON(!list_empty(&rm->m_sock_item));
- rds_message_put(rm);
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ cp = &conn->c_path[i];
+ rds_conn_path_destroy(cp);
+ BUG_ON(!list_empty(&cp->cp_retrans));
}
- if (conn->c_xmit_rm)
- rds_message_put(conn->c_xmit_rm);
-
- conn->c_trans->conn_free(conn->c_transport_data);
/*
* The congestion maps aren't freed up here. They're
@@ -377,7 +429,6 @@ void rds_conn_destroy(struct rds_connection *conn)
*/
rds_cong_remove_conn(conn);
- BUG_ON(!list_empty(&conn->c_retrans));
kmem_cache_free(rds_conn_slab, conn);
spin_lock_irqsave(&rds_conn_lock, flags);
@@ -398,6 +449,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
unsigned int total = 0;
unsigned long flags;
size_t i;
+ int j;
len /= sizeof(struct rds_info_message);
@@ -406,23 +458,32 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
i++, head++) {
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
- if (want_send)
- list = &conn->c_send_queue;
- else
- list = &conn->c_retrans;
-
- spin_lock_irqsave(&conn->c_lock, flags);
-
- /* XXX too lazy to maintain counts.. */
- list_for_each_entry(rm, list, m_conn_item) {
- total++;
- if (total <= len)
- rds_inc_info_copy(&rm->m_inc, iter,
- conn->c_laddr,
- conn->c_faddr, 0);
+ struct rds_conn_path *cp;
+
+ for (j = 0; j < RDS_MPATH_WORKERS; j++) {
+ cp = &conn->c_path[j];
+ if (want_send)
+ list = &cp->cp_send_queue;
+ else
+ list = &cp->cp_retrans;
+
+ spin_lock_irqsave(&cp->cp_lock, flags);
+
+ /* XXX too lazy to maintain counts.. */
+ list_for_each_entry(rm, list, m_conn_item) {
+ total++;
+ if (total <= len)
+ rds_inc_info_copy(&rm->m_inc,
+ iter,
+ conn->c_laddr,
+ conn->c_faddr,
+ 0);
+ }
+
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+ if (!conn->c_trans->t_mp_capable)
+ break;
}
-
- spin_unlock_irqrestore(&conn->c_lock, flags);
}
}
rcu_read_unlock();
@@ -484,27 +545,72 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
}
EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
-static int rds_conn_info_visitor(struct rds_connection *conn,
- void *buffer)
+void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int (*visitor)(struct rds_conn_path *, void *),
+ size_t item_len)
+{
+ u64 buffer[(item_len + 7) / 8];
+ struct hlist_head *head;
+ struct rds_connection *conn;
+ size_t i;
+ int j;
+
+ rcu_read_lock();
+
+ lens->nr = 0;
+ lens->each = item_len;
+
+ for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
+ i++, head++) {
+ hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+ struct rds_conn_path *cp;
+
+ for (j = 0; j < RDS_MPATH_WORKERS; j++) {
+ cp = &conn->c_path[j];
+
+ /* XXX no cp_lock usage.. */
+ if (!visitor(cp, buffer))
+ continue;
+ if (!conn->c_trans->t_mp_capable)
+ break;
+ }
+
+ /* We copy as much as we can fit in the buffer,
+ * but we count all items so that the caller
+ * can resize the buffer.
+ */
+ if (len >= item_len) {
+ rds_info_copy(iter, buffer, item_len);
+ len -= item_len;
+ }
+ lens->nr++;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
{
struct rds_info_connection *cinfo = buffer;
- cinfo->next_tx_seq = conn->c_next_tx_seq;
- cinfo->next_rx_seq = conn->c_next_rx_seq;
- cinfo->laddr = conn->c_laddr;
- cinfo->faddr = conn->c_faddr;
- strncpy(cinfo->transport, conn->c_trans->t_name,
+ cinfo->next_tx_seq = cp->cp_next_tx_seq;
+ cinfo->next_rx_seq = cp->cp_next_rx_seq;
+ cinfo->laddr = cp->cp_conn->c_laddr;
+ cinfo->faddr = cp->cp_conn->c_faddr;
+ strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name,
sizeof(cinfo->transport));
cinfo->flags = 0;
- rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
+ rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
SENDING);
/* XXX Future: return the state rather than these funky bits */
rds_conn_info_set(cinfo->flags,
- atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
+ atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
CONNECTING);
rds_conn_info_set(cinfo->flags,
- atomic_read(&conn->c_state) == RDS_CONN_UP,
+ atomic_read(&cp->cp_state) == RDS_CONN_UP,
CONNECTED);
return 1;
}
@@ -513,7 +619,7 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
- rds_for_each_conn_info(sock, len, iter, lens,
+ rds_walk_conn_path_info(sock, len, iter, lens,
rds_conn_info_visitor,
sizeof(struct rds_info_connection));
}
@@ -553,10 +659,16 @@ void rds_conn_exit(void)
/*
* Force a disconnect
*/
+void rds_conn_path_drop(struct rds_conn_path *cp)
+{
+ atomic_set(&cp->cp_state, RDS_CONN_ERROR);
+ queue_work(rds_wq, &cp->cp_down_w);
+}
+EXPORT_SYMBOL_GPL(rds_conn_path_drop);
+
void rds_conn_drop(struct rds_connection *conn)
{
- atomic_set(&conn->c_state, RDS_CONN_ERROR);
- queue_work(rds_wq, &conn->c_down_w);
+ rds_conn_path_drop(&conn->c_path[0]);
}
EXPORT_SYMBOL_GPL(rds_conn_drop);
@@ -564,11 +676,17 @@ EXPORT_SYMBOL_GPL(rds_conn_drop);
* If the connection is down, trigger a connect. We may have scheduled a
* delayed reconnect however - in this case we should not interfere.
*/
+void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
+{
+ if (rds_conn_path_state(cp) == RDS_CONN_DOWN &&
+ !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
+ queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+}
+
void rds_conn_connect_if_down(struct rds_connection *conn)
{
- if (rds_conn_state(conn) == RDS_CONN_DOWN &&
- !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+ WARN_ON(conn->c_trans->t_mp_capable);
+ rds_conn_path_connect_if_down(&conn->c_path[0]);
}
EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
@@ -586,3 +704,15 @@ __rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
rds_conn_drop(conn);
}
+
+void
+__rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vprintk(fmt, ap);
+ va_end(ap);
+
+ rds_conn_path_drop(cp);
+}
diff --git a/net/rds/ib.c b/net/rds/ib.c
index b5342fd..7eaf887 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -40,6 +40,7 @@
#include <linux/slab.h>
#include <linux/module.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
#include "ib_mr.h"
@@ -380,15 +381,15 @@ void rds_ib_exit(void)
struct rds_transport rds_ib_transport = {
.laddr_check = rds_ib_laddr_check,
- .xmit_complete = rds_ib_xmit_complete,
+ .xmit_path_complete = rds_ib_xmit_path_complete,
.xmit = rds_ib_xmit,
.xmit_rdma = rds_ib_xmit_rdma,
.xmit_atomic = rds_ib_xmit_atomic,
- .recv = rds_ib_recv,
+ .recv_path = rds_ib_recv_path,
.conn_alloc = rds_ib_conn_alloc,
.conn_free = rds_ib_conn_free,
- .conn_connect = rds_ib_conn_connect,
- .conn_shutdown = rds_ib_conn_shutdown,
+ .conn_path_connect = rds_ib_conn_path_connect,
+ .conn_path_shutdown = rds_ib_conn_path_shutdown,
.inc_copy_to_user = rds_ib_inc_copy_to_user,
.inc_free = rds_ib_inc_free,
.cm_initiate_connect = rds_ib_cm_initiate_connect,
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 627fb79..046f750 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -328,8 +328,8 @@ extern struct list_head ib_nodev_conns;
/* ib_cm.c */
int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
void rds_ib_conn_free(void *arg);
-int rds_ib_conn_connect(struct rds_connection *conn);
-void rds_ib_conn_shutdown(struct rds_connection *conn);
+int rds_ib_conn_path_connect(struct rds_conn_path *cp);
+void rds_ib_conn_path_shutdown(struct rds_conn_path *cp);
void rds_ib_state_change(struct sock *sk);
int rds_ib_listen_init(void);
void rds_ib_listen_stop(void);
@@ -354,7 +354,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
/* ib_recv.c */
int rds_ib_recv_init(void);
void rds_ib_recv_exit(void);
-int rds_ib_recv(struct rds_connection *conn);
+int rds_ib_recv_path(struct rds_conn_path *conn);
int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp);
@@ -384,7 +384,7 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
extern wait_queue_head_t rds_ib_ring_empty_wait;
/* ib_send.c */
-void rds_ib_xmit_complete(struct rds_connection *conn);
+void rds_ib_xmit_path_complete(struct rds_conn_path *cp);
int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 310cabc..5b2ab95 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -36,6 +36,7 @@
#include <linux/vmalloc.h>
#include <linux/ratelimit.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
@@ -111,7 +112,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
}
}
- if (conn->c_version < RDS_PROTOCOL(3,1)) {
+ if (conn->c_version < RDS_PROTOCOL(3, 1)) {
printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
" no longer supported\n",
&conn->c_faddr,
@@ -273,7 +274,7 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
if (rds_conn_up(conn) &&
(!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
test_bit(0, &conn->c_map_queued)))
- rds_send_xmit(ic->conn);
+ rds_send_xmit(&ic->conn->c_path[0]);
}
static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
@@ -684,8 +685,9 @@ out:
return ret;
}
-int rds_ib_conn_connect(struct rds_connection *conn)
+int rds_ib_conn_path_connect(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
struct rds_ib_connection *ic = conn->c_transport_data;
struct sockaddr_in src, dest;
int ret;
@@ -730,8 +732,9 @@ out:
* so that it can be called at any point during startup. In fact it
* can be called multiple times for a given connection.
*/
-void rds_ib_conn_shutdown(struct rds_connection *conn)
+void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
struct rds_ib_connection *ic = conn->c_transport_data;
int err = 0;
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index a0f21b6..977f698 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -35,6 +35,7 @@
#include <linux/rculist.h>
#include <linux/llist.h>
+#include "rds_single_path.h"
#include "ib_mr.h"
struct workqueue_struct *rds_ib_mr_wq;
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index abc8cc8..606a11f 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -36,6 +36,7 @@
#include <linux/dma-mapping.h>
#include <rdma/rdma_cm.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
@@ -1008,8 +1009,9 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
rds_ib_recv_refill(conn, 0, GFP_NOWAIT);
}
-int rds_ib_recv(struct rds_connection *conn)
+int rds_ib_recv_path(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
struct rds_ib_connection *ic = conn->c_transport_data;
int ret = 0;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index f27d2c8..84d90c9 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -36,6 +36,7 @@
#include <linux/dmapool.h>
#include <linux/ratelimit.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
@@ -979,8 +980,9 @@ out:
return ret;
}
-void rds_ib_xmit_complete(struct rds_connection *conn)
+void rds_ib_xmit_path_complete(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
struct rds_ib_connection *ic = conn->c_transport_data;
/* We may have a pending ACK or window update we were unable
diff --git a/net/rds/loop.c b/net/rds/loop.c
index 6b12b68..f2bf78d 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -34,6 +34,7 @@
#include <linux/slab.h>
#include <linux/in.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "loop.h"
@@ -95,12 +96,13 @@ out:
*/
static void rds_loop_inc_free(struct rds_incoming *inc)
{
- struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
- rds_message_put(rm);
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+
+ rds_message_put(rm);
}
/* we need to at least give the thread something to succeed */
-static int rds_loop_recv(struct rds_connection *conn)
+static int rds_loop_recv_path(struct rds_conn_path *cp)
{
return 0;
}
@@ -148,13 +150,13 @@ static void rds_loop_conn_free(void *arg)
kfree(lc);
}
-static int rds_loop_conn_connect(struct rds_connection *conn)
+static int rds_loop_conn_path_connect(struct rds_conn_path *cp)
{
- rds_connect_complete(conn);
+ rds_connect_complete(cp->cp_conn);
return 0;
}
-static void rds_loop_conn_shutdown(struct rds_connection *conn)
+static void rds_loop_conn_path_shutdown(struct rds_conn_path *cp)
{
}
@@ -183,11 +185,11 @@ void rds_loop_exit(void)
*/
struct rds_transport rds_loop_transport = {
.xmit = rds_loop_xmit,
- .recv = rds_loop_recv,
+ .recv_path = rds_loop_recv_path,
.conn_alloc = rds_loop_conn_alloc,
.conn_free = rds_loop_conn_free,
- .conn_connect = rds_loop_conn_connect,
- .conn_shutdown = rds_loop_conn_shutdown,
+ .conn_path_connect = rds_loop_conn_path_connect,
+ .conn_path_shutdown = rds_loop_conn_path_shutdown,
.inc_copy_to_user = rds_message_inc_copy_to_user,
.inc_free = rds_loop_inc_free,
.t_name = "loopback",
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 7220beb..345f090 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -33,6 +33,7 @@
#include <linux/module.h>
#include <rdma/rdma_cm.h>
+#include "rds_single_path.h"
#include "rdma_transport.h"
#include "ib.h"
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 387df5f..6ef07bd 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -84,56 +84,69 @@ enum {
#define RDS_IN_XMIT 2
#define RDS_RECV_REFILL 3
+/* Max number of multipaths per RDS connection. Must be a power of 2 */
+#define RDS_MPATH_WORKERS 1
+
+/* Per mpath connection state */
+struct rds_conn_path {
+ struct rds_connection *cp_conn;
+ struct rds_message *cp_xmit_rm;
+ unsigned long cp_xmit_sg;
+ unsigned int cp_xmit_hdr_off;
+ unsigned int cp_xmit_data_off;
+ unsigned int cp_xmit_atomic_sent;
+ unsigned int cp_xmit_rdma_sent;
+ unsigned int cp_xmit_data_sent;
+
+ spinlock_t cp_lock; /* protect msg queues */
+ u64 cp_next_tx_seq;
+ struct list_head cp_send_queue;
+ struct list_head cp_retrans;
+
+ u64 cp_next_rx_seq;
+
+ void *cp_transport_data;
+
+ atomic_t cp_state;
+ unsigned long cp_send_gen;
+ unsigned long cp_flags;
+ unsigned long cp_reconnect_jiffies;
+ struct delayed_work cp_send_w;
+ struct delayed_work cp_recv_w;
+ struct delayed_work cp_conn_w;
+ struct work_struct cp_down_w;
+ struct mutex cp_cm_lock; /* protect cp_state & cm */
+ wait_queue_head_t cp_waitq;
+
+ unsigned int cp_unacked_packets;
+ unsigned int cp_unacked_bytes;
+ unsigned int cp_outgoing:1,
+ cp_pad_to_32:31;
+ unsigned int cp_index;
+};
+
+/* One rds_connection per RDS address pair */
struct rds_connection {
struct hlist_node c_hash_node;
__be32 c_laddr;
__be32 c_faddr;
unsigned int c_loopback:1,
- c_outgoing:1,
- c_pad_to_32:30;
+ c_pad_to_32:31;
+ int c_npaths;
struct rds_connection *c_passive;
+ struct rds_transport *c_trans;
struct rds_cong_map *c_lcong;
struct rds_cong_map *c_fcong;
- struct rds_message *c_xmit_rm;
- unsigned long c_xmit_sg;
- unsigned int c_xmit_hdr_off;
- unsigned int c_xmit_data_off;
- unsigned int c_xmit_atomic_sent;
- unsigned int c_xmit_rdma_sent;
- unsigned int c_xmit_data_sent;
-
- spinlock_t c_lock; /* protect msg queues */
- u64 c_next_tx_seq;
- struct list_head c_send_queue;
- struct list_head c_retrans;
-
- u64 c_next_rx_seq;
-
- struct rds_transport *c_trans;
- void *c_transport_data;
-
- atomic_t c_state;
- unsigned long c_send_gen;
- unsigned long c_flags;
- unsigned long c_reconnect_jiffies;
- struct delayed_work c_send_w;
- struct delayed_work c_recv_w;
- struct delayed_work c_conn_w;
- struct work_struct c_down_w;
- struct mutex c_cm_lock; /* protect conn state & cm */
- wait_queue_head_t c_waitq;
+ /* Protocol version */
+ unsigned int c_version;
+ possible_net_t c_net;
struct list_head c_map_item;
unsigned long c_map_queued;
- unsigned int c_unacked_packets;
- unsigned int c_unacked_bytes;
-
- /* Protocol version */
- unsigned int c_version;
- possible_net_t c_net;
+ struct rds_conn_path c_path[RDS_MPATH_WORKERS];
};
static inline
@@ -218,6 +231,7 @@ struct rds_incoming {
atomic_t i_refcount;
struct list_head i_item;
struct rds_connection *i_conn;
+ struct rds_conn_path *i_conn_path;
struct rds_header i_hdr;
unsigned long i_rx_jiffies;
__be32 i_saddr;
@@ -433,21 +447,22 @@ struct rds_transport {
char t_name[TRANSNAMSIZ];
struct list_head t_item;
struct module *t_owner;
- unsigned int t_prefer_loopback:1;
+ unsigned int t_prefer_loopback:1,
+ t_mp_capable:1;
unsigned int t_type;
int (*laddr_check)(struct net *net, __be32 addr);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*conn_free)(void *data);
- int (*conn_connect)(struct rds_connection *conn);
- void (*conn_shutdown)(struct rds_connection *conn);
- void (*xmit_prepare)(struct rds_connection *conn);
- void (*xmit_complete)(struct rds_connection *conn);
+ int (*conn_path_connect)(struct rds_conn_path *cp);
+ void (*conn_path_shutdown)(struct rds_conn_path *conn);
+ void (*xmit_path_prepare)(struct rds_conn_path *cp);
+ void (*xmit_path_complete)(struct rds_conn_path *cp);
int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
- int (*recv)(struct rds_connection *conn);
+ int (*recv_path)(struct rds_conn_path *cp);
int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to);
void (*inc_free)(struct rds_incoming *inc);
@@ -636,10 +651,12 @@ struct rds_connection *rds_conn_create(struct net *net,
struct rds_connection *rds_conn_create_outgoing(struct net *net,
__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
-void rds_conn_shutdown(struct rds_connection *conn);
+void rds_conn_shutdown(struct rds_conn_path *cpath);
void rds_conn_destroy(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn);
+void rds_conn_path_drop(struct rds_conn_path *cpath);
void rds_conn_connect_if_down(struct rds_connection *conn);
+void rds_conn_path_connect_if_down(struct rds_conn_path *cp);
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens,
@@ -650,28 +667,60 @@ void __rds_conn_error(struct rds_connection *conn, const char *, ...);
#define rds_conn_error(conn, fmt...) \
__rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
+void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...);
+#define rds_conn_path_error(cp, fmt...) \
+ __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt)
+
+static inline int
+rds_conn_path_transition(struct rds_conn_path *cp, int old, int new)
+{
+ return atomic_cmpxchg(&cp->cp_state, old, new) == old;
+}
+
static inline int
rds_conn_transition(struct rds_connection *conn, int old, int new)
{
- return atomic_cmpxchg(&conn->c_state, old, new) == old;
+ WARN_ON(conn->c_trans->t_mp_capable);
+ return rds_conn_path_transition(&conn->c_path[0], old, new);
+}
+
+static inline int
+rds_conn_path_state(struct rds_conn_path *cp)
+{
+ return atomic_read(&cp->cp_state);
}
static inline int
rds_conn_state(struct rds_connection *conn)
{
- return atomic_read(&conn->c_state);
+ WARN_ON(conn->c_trans->t_mp_capable);
+ return rds_conn_path_state(&conn->c_path[0]);
+}
+
+static inline int
+rds_conn_path_up(struct rds_conn_path *cp)
+{
+ return atomic_read(&cp->cp_state) == RDS_CONN_UP;
}
static inline int
rds_conn_up(struct rds_connection *conn)
{
- return atomic_read(&conn->c_state) == RDS_CONN_UP;
+ WARN_ON(conn->c_trans->t_mp_capable);
+ return rds_conn_path_up(&conn->c_path[0]);
+}
+
+static inline int
+rds_conn_path_connecting(struct rds_conn_path *cp)
+{
+ return atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING;
}
static inline int
rds_conn_connecting(struct rds_connection *conn)
{
- return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING;
+ WARN_ON(conn->c_trans->t_mp_capable);
+ return rds_conn_path_connecting(&conn->c_path[0]);
}
/* message.c */
@@ -720,6 +769,8 @@ void rds_page_exit(void);
/* recv.c */
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr);
+void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn,
+ __be32 saddr);
void rds_inc_put(struct rds_incoming *inc);
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
struct rds_incoming *inc, gfp_t gfp);
@@ -733,16 +784,16 @@ void rds_inc_info_copy(struct rds_incoming *inc,
/* send.c */
int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
-void rds_send_reset(struct rds_connection *conn);
-int rds_send_xmit(struct rds_connection *conn);
+void rds_send_path_reset(struct rds_conn_path *conn);
+int rds_send_xmit(struct rds_conn_path *cp);
struct sockaddr_in;
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked);
-int rds_send_pong(struct rds_connection *conn, __be16 dport);
-struct rds_message *rds_send_get_message(struct rds_connection *,
- struct rm_rdma_op *);
+void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
+ is_acked_func is_acked);
+int rds_send_pong(struct rds_conn_path *cp, __be16 dport);
/* rdma.c */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
@@ -809,12 +860,12 @@ extern unsigned int rds_sysctl_trace_level;
int rds_threads_init(void);
void rds_threads_exit(void);
extern struct workqueue_struct *rds_wq;
-void rds_queue_reconnect(struct rds_connection *conn);
+void rds_queue_reconnect(struct rds_conn_path *cp);
void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *);
-void rds_connect_path_complete(struct rds_connection *conn, int curr);
+void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
void rds_connect_complete(struct rds_connection *conn);
/* transport.c */
diff --git a/net/rds/rds_single_path.h b/net/rds/rds_single_path.h
new file mode 100644
index 0000000..e1241af
--- /dev/null
+++ b/net/rds/rds_single_path.h
@@ -0,0 +1,30 @@
+#ifndef _RDS_RDS_SINGLE_H
+#define _RDS_RDS_SINGLE_H
+
+#define c_xmit_rm c_path[0].cp_xmit_rm
+#define c_xmit_sg c_path[0].cp_xmit_sg
+#define c_xmit_hdr_off c_path[0].cp_xmit_hdr_off
+#define c_xmit_data_off c_path[0].cp_xmit_data_off
+#define c_xmit_atomic_sent c_path[0].cp_xmit_atomic_sent
+#define c_xmit_rdma_sent c_path[0].cp_xmit_rdma_sent
+#define c_xmit_data_sent c_path[0].cp_xmit_data_sent
+#define c_lock c_path[0].cp_lock
+#define c_next_tx_seq c_path[0].cp_next_tx_seq
+#define c_send_queue c_path[0].cp_send_queue
+#define c_retrans c_path[0].cp_retrans
+#define c_next_rx_seq c_path[0].cp_next_rx_seq
+#define c_transport_data c_path[0].cp_transport_data
+#define c_state c_path[0].cp_state
+#define c_send_gen c_path[0].cp_send_gen
+#define c_flags c_path[0].cp_flags
+#define c_reconnect_jiffies c_path[0].cp_reconnect_jiffies
+#define c_send_w c_path[0].cp_send_w
+#define c_recv_w c_path[0].cp_recv_w
+#define c_conn_w c_path[0].cp_conn_w
+#define c_down_w c_path[0].cp_down_w
+#define c_cm_lock c_path[0].cp_cm_lock
+#define c_waitq c_path[0].cp_waitq
+#define c_unacked_packets c_path[0].cp_unacked_packets
+#define c_unacked_bytes c_path[0].cp_unacked_bytes
+
+#endif /* _RDS_RDS_SINGLE_H */
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 8413f6c..fed53a6 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -53,6 +53,20 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
}
EXPORT_SYMBOL_GPL(rds_inc_init);
+void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
+ __be32 saddr)
+{
+ atomic_set(&inc->i_refcount, 1);
+ INIT_LIST_HEAD(&inc->i_item);
+ inc->i_conn = cp->cp_conn;
+ inc->i_conn_path = cp;
+ inc->i_saddr = saddr;
+ inc->i_rdma_cookie = 0;
+ inc->i_rx_tstamp.tv_sec = 0;
+ inc->i_rx_tstamp.tv_usec = 0;
+}
+EXPORT_SYMBOL_GPL(rds_inc_path_init);
+
static void rds_inc_addref(struct rds_incoming *inc)
{
rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
@@ -164,13 +178,18 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
struct rds_sock *rs = NULL;
struct sock *sk;
unsigned long flags;
+ struct rds_conn_path *cp;
inc->i_conn = conn;
inc->i_rx_jiffies = jiffies;
+ if (conn->c_trans->t_mp_capable)
+ cp = inc->i_conn_path;
+ else
+ cp = &conn->c_path[0];
rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
"flags 0x%x rx_jiffies %lu\n", conn,
- (unsigned long long)conn->c_next_rx_seq,
+ (unsigned long long)cp->cp_next_rx_seq,
inc,
(unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
be32_to_cpu(inc->i_hdr.h_len),
@@ -199,16 +218,20 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
* XXX we could spend more on the wire to get more robust failure
* detection, arguably worth it to avoid data corruption.
*/
- if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq &&
+ if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
(inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
rds_stats_inc(s_recv_drop_old_seq);
goto out;
}
- conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
+ cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+ if (inc->i_hdr.h_sport == 0) {
+ rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr);
+ goto out;
+ }
rds_stats_inc(s_recv_ping);
- rds_send_pong(conn, inc->i_hdr.h_sport);
+ rds_send_pong(cp, inc->i_hdr.h_sport);
goto out;
}
diff --git a/net/rds/send.c b/net/rds/send.c
index b1962f8..5a9caf1 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -62,14 +62,14 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status);
* Reset the send state. Callers must ensure that this doesn't race with
* rds_send_xmit().
*/
-void rds_send_reset(struct rds_connection *conn)
+void rds_send_path_reset(struct rds_conn_path *cp)
{
struct rds_message *rm, *tmp;
unsigned long flags;
- if (conn->c_xmit_rm) {
- rm = conn->c_xmit_rm;
- conn->c_xmit_rm = NULL;
+ if (cp->cp_xmit_rm) {
+ rm = cp->cp_xmit_rm;
+ cp->cp_xmit_rm = NULL;
/* Tell the user the RDMA op is no longer mapped by the
* transport. This isn't entirely true (it's flushed out
* independently) but as the connection is down, there's
@@ -78,37 +78,37 @@ void rds_send_reset(struct rds_connection *conn)
rds_message_put(rm);
}
- conn->c_xmit_sg = 0;
- conn->c_xmit_hdr_off = 0;
- conn->c_xmit_data_off = 0;
- conn->c_xmit_atomic_sent = 0;
- conn->c_xmit_rdma_sent = 0;
- conn->c_xmit_data_sent = 0;
+ cp->cp_xmit_sg = 0;
+ cp->cp_xmit_hdr_off = 0;
+ cp->cp_xmit_data_off = 0;
+ cp->cp_xmit_atomic_sent = 0;
+ cp->cp_xmit_rdma_sent = 0;
+ cp->cp_xmit_data_sent = 0;
- conn->c_map_queued = 0;
+ cp->cp_conn->c_map_queued = 0;
- conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
- conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+ cp->cp_unacked_packets = rds_sysctl_max_unacked_packets;
+ cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes;
/* Mark messages as retransmissions, and move them to the send q */
- spin_lock_irqsave(&conn->c_lock, flags);
- list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ spin_lock_irqsave(&cp->cp_lock, flags);
+ list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
}
- list_splice_init(&conn->c_retrans, &conn->c_send_queue);
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ list_splice_init(&cp->cp_retrans, &cp->cp_send_queue);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
}
-EXPORT_SYMBOL_GPL(rds_send_reset);
+EXPORT_SYMBOL_GPL(rds_send_path_reset);
-static int acquire_in_xmit(struct rds_connection *conn)
+static int acquire_in_xmit(struct rds_conn_path *cp)
{
- return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
+ return test_and_set_bit(RDS_IN_XMIT, &cp->cp_flags) == 0;
}
-static void release_in_xmit(struct rds_connection *conn)
+static void release_in_xmit(struct rds_conn_path *cp)
{
- clear_bit(RDS_IN_XMIT, &conn->c_flags);
+ clear_bit(RDS_IN_XMIT, &cp->cp_flags);
smp_mb__after_atomic();
/*
* We don't use wait_on_bit()/wake_up_bit() because our waking is in a
@@ -116,8 +116,8 @@ static void release_in_xmit(struct rds_connection *conn)
* the system-wide hashed waitqueue buckets in the fast path only to
* almost never find waiters.
*/
- if (waitqueue_active(&conn->c_waitq))
- wake_up_all(&conn->c_waitq);
+ if (waitqueue_active(&cp->cp_waitq))
+ wake_up_all(&cp->cp_waitq);
}
/*
@@ -134,8 +134,9 @@ static void release_in_xmit(struct rds_connection *conn)
* - small message latency is higher behind queued large messages
* - large message latency isn't starved by intervening small sends
*/
-int rds_send_xmit(struct rds_connection *conn)
+int rds_send_xmit(struct rds_conn_path *cp)
{
+ struct rds_connection *conn = cp->cp_conn;
struct rds_message *rm;
unsigned long flags;
unsigned int tmp;
@@ -155,7 +156,7 @@ restart:
* avoids blocking the caller and trading per-connection data between
* caches per message.
*/
- if (!acquire_in_xmit(conn)) {
+ if (!acquire_in_xmit(cp)) {
rds_stats_inc(s_send_lock_contention);
ret = -ENOMEM;
goto out;
@@ -169,21 +170,21 @@ restart:
* The acquire_in_xmit() check above ensures that only one
* caller can increment c_send_gen at any time.
*/
- conn->c_send_gen++;
- send_gen = conn->c_send_gen;
+ cp->cp_send_gen++;
+ send_gen = cp->cp_send_gen;
/*
* rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
* we do the opposite to avoid races.
*/
- if (!rds_conn_up(conn)) {
- release_in_xmit(conn);
+ if (!rds_conn_path_up(cp)) {
+ release_in_xmit(cp);
ret = 0;
goto out;
}
- if (conn->c_trans->xmit_prepare)
- conn->c_trans->xmit_prepare(conn);
+ if (conn->c_trans->xmit_path_prepare)
+ conn->c_trans->xmit_path_prepare(cp);
/*
* spin trying to push headers and data down the connection until
@@ -191,7 +192,7 @@ restart:
*/
while (1) {
- rm = conn->c_xmit_rm;
+ rm = cp->cp_xmit_rm;
/*
* If between sending messages, we can send a pending congestion
@@ -204,14 +205,16 @@ restart:
break;
}
rm->data.op_active = 1;
+ rm->m_inc.i_conn_path = cp;
+ rm->m_inc.i_conn = cp->cp_conn;
- conn->c_xmit_rm = rm;
+ cp->cp_xmit_rm = rm;
}
/*
* If not already working on one, grab the next message.
*
- * c_xmit_rm holds a ref while we're sending this message down
+ * cp_xmit_rm holds a ref while we're sending this message down
* the connction. We can use this ref while holding the
* send_sem.. rds_send_reset() is serialized with it.
*/
@@ -228,10 +231,10 @@ restart:
if (batch_count >= send_batch_count)
goto over_batch;
- spin_lock_irqsave(&conn->c_lock, flags);
+ spin_lock_irqsave(&cp->cp_lock, flags);
- if (!list_empty(&conn->c_send_queue)) {
- rm = list_entry(conn->c_send_queue.next,
+ if (!list_empty(&cp->cp_send_queue)) {
+ rm = list_entry(cp->cp_send_queue.next,
struct rds_message,
m_conn_item);
rds_message_addref(rm);
@@ -240,10 +243,11 @@ restart:
* Move the message from the send queue to the retransmit
* list right away.
*/
- list_move_tail(&rm->m_conn_item, &conn->c_retrans);
+ list_move_tail(&rm->m_conn_item,
+ &cp->cp_retrans);
}
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
if (!rm)
break;
@@ -257,32 +261,34 @@ restart:
*/
if (rm->rdma.op_active &&
test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
- spin_lock_irqsave(&conn->c_lock, flags);
+ spin_lock_irqsave(&cp->cp_lock, flags);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
list_move(&rm->m_conn_item, &to_be_dropped);
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
continue;
}
/* Require an ACK every once in a while */
len = ntohl(rm->m_inc.i_hdr.h_len);
- if (conn->c_unacked_packets == 0 ||
- conn->c_unacked_bytes < len) {
+ if (cp->cp_unacked_packets == 0 ||
+ cp->cp_unacked_bytes < len) {
__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
- conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
- conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+ cp->cp_unacked_packets =
+ rds_sysctl_max_unacked_packets;
+ cp->cp_unacked_bytes =
+ rds_sysctl_max_unacked_bytes;
rds_stats_inc(s_send_ack_required);
} else {
- conn->c_unacked_bytes -= len;
- conn->c_unacked_packets--;
+ cp->cp_unacked_bytes -= len;
+ cp->cp_unacked_packets--;
}
- conn->c_xmit_rm = rm;
+ cp->cp_xmit_rm = rm;
}
/* The transport either sends the whole rdma or none of it */
- if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+ if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) {
rm->m_final_op = &rm->rdma;
/* The transport owns the mapped memory for now.
* You can't unmap it while it's on the send queue
@@ -294,11 +300,11 @@ restart:
wake_up_interruptible(&rm->m_flush_wait);
break;
}
- conn->c_xmit_rdma_sent = 1;
+ cp->cp_xmit_rdma_sent = 1;
}
- if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
+ if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) {
rm->m_final_op = &rm->atomic;
/* The transport owns the mapped memory for now.
* You can't unmap it while it's on the send queue
@@ -310,7 +316,7 @@ restart:
wake_up_interruptible(&rm->m_flush_wait);
break;
}
- conn->c_xmit_atomic_sent = 1;
+ cp->cp_xmit_atomic_sent = 1;
}
@@ -336,41 +342,42 @@ restart:
rm->data.op_active = 0;
}
- if (rm->data.op_active && !conn->c_xmit_data_sent) {
+ if (rm->data.op_active && !cp->cp_xmit_data_sent) {
rm->m_final_op = &rm->data;
+
ret = conn->c_trans->xmit(conn, rm,
- conn->c_xmit_hdr_off,
- conn->c_xmit_sg,
- conn->c_xmit_data_off);
+ cp->cp_xmit_hdr_off,
+ cp->cp_xmit_sg,
+ cp->cp_xmit_data_off);
if (ret <= 0)
break;
- if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
+ if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) {
tmp = min_t(int, ret,
sizeof(struct rds_header) -
- conn->c_xmit_hdr_off);
- conn->c_xmit_hdr_off += tmp;
+ cp->cp_xmit_hdr_off);
+ cp->cp_xmit_hdr_off += tmp;
ret -= tmp;
}
- sg = &rm->data.op_sg[conn->c_xmit_sg];
+ sg = &rm->data.op_sg[cp->cp_xmit_sg];
while (ret) {
tmp = min_t(int, ret, sg->length -
- conn->c_xmit_data_off);
- conn->c_xmit_data_off += tmp;
+ cp->cp_xmit_data_off);
+ cp->cp_xmit_data_off += tmp;
ret -= tmp;
- if (conn->c_xmit_data_off == sg->length) {
- conn->c_xmit_data_off = 0;
+ if (cp->cp_xmit_data_off == sg->length) {
+ cp->cp_xmit_data_off = 0;
sg++;
- conn->c_xmit_sg++;
- BUG_ON(ret != 0 &&
- conn->c_xmit_sg == rm->data.op_nents);
+ cp->cp_xmit_sg++;
+ BUG_ON(ret != 0 && cp->cp_xmit_sg ==
+ rm->data.op_nents);
}
}
- if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
- (conn->c_xmit_sg == rm->data.op_nents))
- conn->c_xmit_data_sent = 1;
+ if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) &&
+ (cp->cp_xmit_sg == rm->data.op_nents))
+ cp->cp_xmit_data_sent = 1;
}
/*
@@ -378,23 +385,23 @@ restart:
* if there is a data op. Thus, if the data is sent (or there was
* none), then we're done with the rm.
*/
- if (!rm->data.op_active || conn->c_xmit_data_sent) {
- conn->c_xmit_rm = NULL;
- conn->c_xmit_sg = 0;
- conn->c_xmit_hdr_off = 0;
- conn->c_xmit_data_off = 0;
- conn->c_xmit_rdma_sent = 0;
- conn->c_xmit_atomic_sent = 0;
- conn->c_xmit_data_sent = 0;
+ if (!rm->data.op_active || cp->cp_xmit_data_sent) {
+ cp->cp_xmit_rm = NULL;
+ cp->cp_xmit_sg = 0;
+ cp->cp_xmit_hdr_off = 0;
+ cp->cp_xmit_data_off = 0;
+ cp->cp_xmit_rdma_sent = 0;
+ cp->cp_xmit_atomic_sent = 0;
+ cp->cp_xmit_data_sent = 0;
rds_message_put(rm);
}
}
over_batch:
- if (conn->c_trans->xmit_complete)
- conn->c_trans->xmit_complete(conn);
- release_in_xmit(conn);
+ if (conn->c_trans->xmit_path_complete)
+ conn->c_trans->xmit_path_complete(cp);
+ release_in_xmit(cp);
/* Nuke any messages we decided not to retransmit. */
if (!list_empty(&to_be_dropped)) {
@@ -422,12 +429,12 @@ over_batch:
if (ret == 0) {
smp_mb();
if ((test_bit(0, &conn->c_map_queued) ||
- !list_empty(&conn->c_send_queue)) &&
- send_gen == conn->c_send_gen) {
+ !list_empty(&cp->cp_send_queue)) &&
+ send_gen == cp->cp_send_gen) {
rds_stats_inc(s_send_lock_queue_raced);
if (batch_count < send_batch_count)
goto restart;
- queue_delayed_work(rds_wq, &conn->c_send_w, 1);
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
}
}
out:
@@ -560,42 +567,6 @@ __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
}
/*
- * This is called from the IB send completion when we detect
- * a RDMA operation that failed with remote access error.
- * So speed is not an issue here.
- */
-struct rds_message *rds_send_get_message(struct rds_connection *conn,
- struct rm_rdma_op *op)
-{
- struct rds_message *rm, *tmp, *found = NULL;
- unsigned long flags;
-
- spin_lock_irqsave(&conn->c_lock, flags);
-
- list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
- if (&rm->rdma == op) {
- atomic_inc(&rm->m_refcount);
- found = rm;
- goto out;
- }
- }
-
- list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
- if (&rm->rdma == op) {
- atomic_inc(&rm->m_refcount);
- found = rm;
- break;
- }
- }
-
-out:
- spin_unlock_irqrestore(&conn->c_lock, flags);
-
- return found;
-}
-EXPORT_SYMBOL_GPL(rds_send_get_message);
-
-/*
* This removes messages from the socket's list if they're on it. The list
* argument must be private to the caller, we must be able to modify it
* without locks. The messages must have a reference held for their
@@ -685,16 +656,16 @@ unlock_and_drop:
* assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
* checks the RDS_MSG_HAS_ACK_SEQ bit.
*/
-void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
- is_acked_func is_acked)
+void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
+ is_acked_func is_acked)
{
struct rds_message *rm, *tmp;
unsigned long flags;
LIST_HEAD(list);
- spin_lock_irqsave(&conn->c_lock, flags);
+ spin_lock_irqsave(&cp->cp_lock, flags);
- list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
if (!rds_send_is_acked(rm, ack, is_acked))
break;
@@ -706,17 +677,26 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
if (!list_empty(&list))
smp_mb__after_atomic();
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
/* now remove the messages from the sock list as needed */
rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
}
+EXPORT_SYMBOL_GPL(rds_send_path_drop_acked);
+
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+ is_acked_func is_acked)
+{
+ WARN_ON(conn->c_trans->t_mp_capable);
+ rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked);
+}
EXPORT_SYMBOL_GPL(rds_send_drop_acked);
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
{
struct rds_message *rm, *tmp;
struct rds_connection *conn;
+ struct rds_conn_path *cp;
unsigned long flags;
LIST_HEAD(list);
@@ -745,22 +725,26 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
list_for_each_entry(rm, &list, m_sock_item) {
conn = rm->m_inc.i_conn;
+ if (conn->c_trans->t_mp_capable)
+ cp = rm->m_inc.i_conn_path;
+ else
+ cp = &conn->c_path[0];
- spin_lock_irqsave(&conn->c_lock, flags);
+ spin_lock_irqsave(&cp->cp_lock, flags);
/*
* Maybe someone else beat us to removing rm from the conn.
* If we race with their flag update we'll get the lock and
* then really see that the flag has been cleared.
*/
if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
spin_lock_irqsave(&rm->m_rs_lock, flags);
rm->m_rs = NULL;
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
continue;
}
list_del_init(&rm->m_conn_item);
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
/*
* Couldn't grab m_rs_lock in top loop (lock ordering),
@@ -809,6 +793,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
* message from the flow with RDS_CANCEL_SENT_TO.
*/
static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
+ struct rds_conn_path *cp,
struct rds_message *rm, __be16 sport,
__be16 dport, int *queued)
{
@@ -852,13 +837,14 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
trying to minimize the time we hold c_lock */
rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
rm->m_inc.i_conn = conn;
+ rm->m_inc.i_conn_path = cp;
rds_message_addref(rm);
- spin_lock(&conn->c_lock);
- rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
- list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+ spin_lock(&cp->cp_lock);
+ rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++);
+ list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
- spin_unlock(&conn->c_lock);
+ spin_unlock(&cp->cp_lock);
rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
rm, len, rs, rs->rs_snd_bytes,
@@ -990,6 +976,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
int queued = 0, allocated_mr = 0;
int nonblock = msg->msg_flags & MSG_DONTWAIT;
long timeo = sock_sndtimeo(sk, nonblock);
+ struct rds_conn_path *cpath;
/* Mirror Linux UDP mirror of BSD error message compatibility */
/* XXX: Perhaps MSG_MORE someday */
@@ -1088,15 +1075,16 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
goto out;
}
- rds_conn_connect_if_down(conn);
+ cpath = &conn->c_path[0];
+
+ rds_conn_path_connect_if_down(cpath);
ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
if (ret) {
rs->rs_seen_congestion = 1;
goto out;
}
-
- while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
+ while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port,
dport, &queued)) {
rds_stats_inc(s_send_queue_full);
@@ -1106,7 +1094,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
}
timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
- rds_send_queue_rm(rs, conn, rm,
+ rds_send_queue_rm(rs, conn, cpath, rm,
rs->rs_bound_port,
dport,
&queued),
@@ -1127,9 +1115,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
*/
rds_stats_inc(s_send_queued);
- ret = rds_send_xmit(conn);
+ ret = rds_send_xmit(cpath);
if (ret == -ENOMEM || ret == -EAGAIN)
- queue_delayed_work(rds_wq, &conn->c_send_w, 1);
+ queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
rds_message_put(rm);
return payload_len;
@@ -1150,7 +1138,7 @@ out:
* Reply to a ping packet.
*/
int
-rds_send_pong(struct rds_connection *conn, __be16 dport)
+rds_send_pong(struct rds_conn_path *cp, __be16 dport)
{
struct rds_message *rm;
unsigned long flags;
@@ -1162,31 +1150,32 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
goto out;
}
- rm->m_daddr = conn->c_faddr;
+ rm->m_daddr = cp->cp_conn->c_faddr;
rm->data.op_active = 1;
- rds_conn_connect_if_down(conn);
+ rds_conn_path_connect_if_down(cp);
- ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
+ ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL);
if (ret)
goto out;
- spin_lock_irqsave(&conn->c_lock, flags);
- list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+ spin_lock_irqsave(&cp->cp_lock, flags);
+ list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
rds_message_addref(rm);
- rm->m_inc.i_conn = conn;
+ rm->m_inc.i_conn = cp->cp_conn;
+ rm->m_inc.i_conn_path = cp;
rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
- conn->c_next_tx_seq);
- conn->c_next_tx_seq++;
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ cp->cp_next_tx_seq);
+ cp->cp_next_tx_seq++;
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
rds_stats_inc(s_send_queued);
rds_stats_inc(s_send_pong);
/* schedule the send work on rds_wq */
- queue_delayed_work(rds_wq, &conn->c_send_w, 1);
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
rds_message_put(rm);
return 0;
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index c173f69..e381bbc 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -102,7 +102,8 @@ int rds_sysctl_init(void)
rds_sysctl_reconnect_min = msecs_to_jiffies(1);
rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
- rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table);
+ rds_sysctl_reg_table =
+ register_net_sysctl(&init_net, "net/rds", rds_sysctl_rds_table);
if (!rds_sysctl_reg_table)
return -ENOMEM;
return 0;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 74ee126..d278432 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -38,6 +38,7 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "tcp.h"
@@ -56,8 +57,8 @@ static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp,
loff_t *fpos);
-int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
-int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;
+static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
+static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;
static struct ctl_table rds_tcp_sysctl_table[] = {
#define RDS_TCP_SNDBUF 0
@@ -135,9 +136,9 @@ void rds_tcp_restore_callbacks(struct socket *sock,
* from being called while it isn't set.
*/
void rds_tcp_reset_callbacks(struct socket *sock,
- struct rds_connection *conn)
+ struct rds_conn_path *cp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
struct socket *osock = tc->t_sock;
if (!osock)
@@ -147,8 +148,8 @@ void rds_tcp_reset_callbacks(struct socket *sock,
* We have an outstanding SYN to this peer, which may
* potentially have transitioned to the RDS_CONN_UP state,
* so we must quiesce any send threads before resetting
- * c_transport_data. We quiesce these threads by setting
- * c_state to something other than RDS_CONN_UP, and then
+ * cp_transport_data. We quiesce these threads by setting
+ * cp_state to something other than RDS_CONN_UP, and then
* waiting for any existing threads in rds_send_xmit to
* complete release_in_xmit(). (Subsequent threads entering
* rds_send_xmit() will bail on !rds_conn_up().
@@ -163,8 +164,8 @@ void rds_tcp_reset_callbacks(struct socket *sock,
* RDS_CONN_RESETTTING, to ensure that rds_tcp_state_change
* cannot mark rds_conn_path_up() in the window before lock_sock()
*/
- atomic_set(&conn->c_state, RDS_CONN_RESETTING);
- wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags));
+ atomic_set(&cp->cp_state, RDS_CONN_RESETTING);
+ wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags));
lock_sock(osock->sk);
/* reset receive side state for rds_tcp_data_recv() for osock */
if (tc->t_tinc) {
@@ -185,11 +186,12 @@ void rds_tcp_reset_callbacks(struct socket *sock,
release_sock(osock->sk);
sock_release(osock);
newsock:
- rds_send_reset(conn);
+ rds_send_path_reset(cp);
lock_sock(sock->sk);
write_lock_bh(&sock->sk->sk_callback_lock);
tc->t_sock = sock;
- sock->sk->sk_user_data = conn;
+ tc->t_cpath = cp;
+ sock->sk->sk_user_data = cp;
sock->sk->sk_data_ready = rds_tcp_data_ready;
sock->sk->sk_write_space = rds_tcp_write_space;
sock->sk->sk_state_change = rds_tcp_state_change;
@@ -202,9 +204,9 @@ newsock:
* above rds_tcp_reset_callbacks for notes about synchronization
* with data path
*/
-void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
write_lock_bh(&sock->sk->sk_callback_lock);
@@ -220,12 +222,12 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
sock->sk->sk_data_ready = sock->sk->sk_user_data;
tc->t_sock = sock;
- tc->conn = conn;
+ tc->t_cpath = cp;
tc->t_orig_data_ready = sock->sk->sk_data_ready;
tc->t_orig_write_space = sock->sk->sk_write_space;
tc->t_orig_state_change = sock->sk->sk_state_change;
- sock->sk->sk_user_data = conn;
+ sock->sk->sk_user_data = cp;
sock->sk->sk_data_ready = rds_tcp_data_ready;
sock->sk->sk_write_space = rds_tcp_write_space;
sock->sk->sk_state_change = rds_tcp_state_change;
@@ -283,24 +285,29 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr)
static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
struct rds_tcp_connection *tc;
+ int i;
- tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
- if (!tc)
- return -ENOMEM;
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
+ if (!tc)
+ return -ENOMEM;
- mutex_init(&tc->t_conn_lock);
- tc->t_sock = NULL;
- tc->t_tinc = NULL;
- tc->t_tinc_hdr_rem = sizeof(struct rds_header);
- tc->t_tinc_data_rem = 0;
+ mutex_init(&tc->t_conn_path_lock);
+ tc->t_sock = NULL;
+ tc->t_tinc = NULL;
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+ tc->t_tinc_data_rem = 0;
- conn->c_transport_data = tc;
+ conn->c_path[i].cp_transport_data = tc;
+ tc->t_cpath = &conn->c_path[i];
- spin_lock_irq(&rds_tcp_conn_lock);
- list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
- spin_unlock_irq(&rds_tcp_conn_lock);
+ spin_lock_irq(&rds_tcp_conn_lock);
+ list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
+ spin_unlock_irq(&rds_tcp_conn_lock);
+ rdsdebug("rds_conn_path [%d] tc %p\n", i,
+ conn->c_path[i].cp_transport_data);
+ }
- rdsdebug("alloced tc %p\n", conn->c_transport_data);
return 0;
}
@@ -317,6 +324,17 @@ static void rds_tcp_conn_free(void *arg)
kmem_cache_free(rds_tcp_conn_slab, tc);
}
+static bool list_has_conn(struct list_head *list, struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc, *_tc;
+
+ list_for_each_entry_safe(tc, _tc, list, t_tcp_node) {
+ if (tc->t_cpath->cp_conn == conn)
+ return true;
+ }
+ return false;
+}
+
static void rds_tcp_destroy_conns(void)
{
struct rds_tcp_connection *tc, *_tc;
@@ -324,29 +342,28 @@ static void rds_tcp_destroy_conns(void)
/* avoid calling conn_destroy with irqs off */
spin_lock_irq(&rds_tcp_conn_lock);
- list_splice(&rds_tcp_conn_list, &tmp_list);
- INIT_LIST_HEAD(&rds_tcp_conn_list);
+ list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
+ if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn))
+ list_move_tail(&tc->t_tcp_node, &tmp_list);
+ }
spin_unlock_irq(&rds_tcp_conn_lock);
- list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
- if (tc->conn->c_passive)
- rds_conn_destroy(tc->conn->c_passive);
- rds_conn_destroy(tc->conn);
- }
+ list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
+ rds_conn_destroy(tc->t_cpath->cp_conn);
}
static void rds_tcp_exit(void);
struct rds_transport rds_tcp_transport = {
.laddr_check = rds_tcp_laddr_check,
- .xmit_prepare = rds_tcp_xmit_prepare,
- .xmit_complete = rds_tcp_xmit_complete,
+ .xmit_path_prepare = rds_tcp_xmit_path_prepare,
+ .xmit_path_complete = rds_tcp_xmit_path_complete,
.xmit = rds_tcp_xmit,
- .recv = rds_tcp_recv,
+ .recv_path = rds_tcp_recv_path,
.conn_alloc = rds_tcp_conn_alloc,
.conn_free = rds_tcp_conn_free,
- .conn_connect = rds_tcp_conn_connect,
- .conn_shutdown = rds_tcp_conn_shutdown,
+ .conn_path_connect = rds_tcp_conn_path_connect,
+ .conn_path_shutdown = rds_tcp_conn_path_shutdown,
.inc_copy_to_user = rds_tcp_inc_copy_to_user,
.inc_free = rds_tcp_inc_free,
.stats_info_copy = rds_tcp_stats_info_copy,
@@ -488,10 +505,30 @@ static struct pernet_operations rds_tcp_net_ops = {
.size = sizeof(struct rds_tcp_net),
};
+/* explicitly send a RST on each socket, thereby releasing any socket refcnts
+ * that may otherwise hold up netns deletion.
+ */
+static void rds_tcp_conn_paths_destroy(struct rds_connection *conn)
+{
+ struct rds_conn_path *cp;
+ struct rds_tcp_connection *tc;
+ int i;
+ struct sock *sk;
+
+ for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+ cp = &conn->c_path[i];
+ tc = cp->cp_transport_data;
+ if (!tc->t_sock)
+ continue;
+ sk = tc->t_sock->sk;
+ sk->sk_prot->disconnect(sk, 0);
+ tcp_done(sk);
+ }
+}
+
static void rds_tcp_kill_sock(struct net *net)
{
struct rds_tcp_connection *tc, *_tc;
- struct sock *sk;
LIST_HEAD(tmp_list);
struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
@@ -500,20 +537,17 @@ static void rds_tcp_kill_sock(struct net *net)
flush_work(&rtn->rds_tcp_accept_w);
spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
- struct net *c_net = read_pnet(&tc->conn->c_net);
+ struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
if (net != c_net || !tc->t_sock)
continue;
- list_move_tail(&tc->t_tcp_node, &tmp_list);
+ if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn))
+ list_move_tail(&tc->t_tcp_node, &tmp_list);
}
spin_unlock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
- sk = tc->t_sock->sk;
- sk->sk_prot->disconnect(sk, 0);
- tcp_done(sk);
- if (tc->conn->c_passive)
- rds_conn_destroy(tc->conn->c_passive);
- rds_conn_destroy(tc->conn);
+ rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn);
+ rds_conn_destroy(tc->t_cpath->cp_conn);
}
}
@@ -551,12 +585,13 @@ static void rds_tcp_sysctl_reset(struct net *net)
spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
- struct net *c_net = read_pnet(&tc->conn->c_net);
+ struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
if (net != c_net || !tc->t_sock)
continue;
- rds_conn_drop(tc->conn); /* reconnect with new parameters */
+ /* reconnect with new parameters */
+ rds_conn_path_drop(tc->t_cpath);
}
spin_unlock_irq(&rds_tcp_conn_lock);
}
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index ec0602b..1c3160f 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -11,11 +11,11 @@ struct rds_tcp_incoming {
struct rds_tcp_connection {
struct list_head t_tcp_node;
- struct rds_connection *conn;
- /* t_conn_lock synchronizes the connection establishment between
- * rds_tcp_accept_one and rds_tcp_conn_connect
+ struct rds_conn_path *t_cpath;
+ /* t_conn_path_lock synchronizes the connection establishment between
+ * rds_tcp_accept_one and rds_tcp_conn_path_connect
*/
- struct mutex t_conn_lock;
+ struct mutex t_conn_path_lock;
struct socket *t_sock;
void *t_orig_write_space;
void *t_orig_data_ready;
@@ -49,8 +49,8 @@ struct rds_tcp_statistics {
/* tcp.c */
void rds_tcp_tune(struct socket *sock);
void rds_tcp_nonagle(struct socket *sock);
-void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
-void rds_tcp_reset_callbacks(struct socket *sock, struct rds_connection *conn);
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp);
+void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp);
void rds_tcp_restore_callbacks(struct socket *sock,
struct rds_tcp_connection *tc);
u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
@@ -60,8 +60,8 @@ extern struct rds_transport rds_tcp_transport;
void rds_tcp_accept_work(struct sock *sk);
/* tcp_connect.c */
-int rds_tcp_conn_connect(struct rds_connection *conn);
-void rds_tcp_conn_shutdown(struct rds_connection *conn);
+int rds_tcp_conn_path_connect(struct rds_conn_path *cp);
+void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn);
void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */
@@ -75,15 +75,15 @@ int rds_tcp_keepalive(struct socket *sock);
int rds_tcp_recv_init(void);
void rds_tcp_recv_exit(void);
void rds_tcp_data_ready(struct sock *sk);
-int rds_tcp_recv(struct rds_connection *conn);
+int rds_tcp_recv_path(struct rds_conn_path *cp);
void rds_tcp_inc_free(struct rds_incoming *inc);
int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
/* tcp_send.c */
-void rds_tcp_xmit_prepare(struct rds_connection *conn);
-void rds_tcp_xmit_complete(struct rds_connection *conn);
+void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp);
+void rds_tcp_xmit_path_complete(struct rds_conn_path *cp);
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off);
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_tcp_write_space(struct sock *sk);
/* tcp_stats.c */
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index fba13d0..c916715 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -34,56 +34,58 @@
#include <linux/in.h>
#include <net/tcp.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "tcp.h"
void rds_tcp_state_change(struct sock *sk)
{
void (*state_change)(struct sock *sk);
- struct rds_connection *conn;
+ struct rds_conn_path *cp;
struct rds_tcp_connection *tc;
read_lock_bh(&sk->sk_callback_lock);
- conn = sk->sk_user_data;
- if (!conn) {
+ cp = sk->sk_user_data;
+ if (!cp) {
state_change = sk->sk_state_change;
goto out;
}
- tc = conn->c_transport_data;
+ tc = cp->cp_transport_data;
state_change = tc->t_orig_state_change;
rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
- switch(sk->sk_state) {
- /* ignore connecting sockets as they make progress */
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- break;
- case TCP_ESTABLISHED:
- rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
- break;
- case TCP_CLOSE_WAIT:
- case TCP_CLOSE:
- rds_conn_drop(conn);
- default:
- break;
+ switch (sk->sk_state) {
+ /* ignore connecting sockets as they make progress */
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ break;
+ case TCP_ESTABLISHED:
+ rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
+ break;
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSE:
+ rds_conn_path_drop(cp);
+ default:
+ break;
}
out:
read_unlock_bh(&sk->sk_callback_lock);
state_change(sk);
}
-int rds_tcp_conn_connect(struct rds_connection *conn)
+int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
{
struct socket *sock = NULL;
struct sockaddr_in src, dest;
int ret;
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_connection *conn = cp->cp_conn;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
- mutex_lock(&tc->t_conn_lock);
+ mutex_lock(&tc->t_conn_path_lock);
- if (rds_conn_up(conn)) {
- mutex_unlock(&tc->t_conn_lock);
+ if (rds_conn_path_up(cp)) {
+ mutex_unlock(&tc->t_conn_path_lock);
return 0;
}
ret = sock_create_kern(rds_conn_net(conn), PF_INET,
@@ -112,10 +114,11 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
* once we call connect() we can start getting callbacks and they
* own the socket
*/
- rds_tcp_set_callbacks(sock, conn);
+ rds_tcp_set_callbacks(sock, cp);
ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
O_NONBLOCK);
+ cp->cp_outgoing = 1;
rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
if (ret == -EINPROGRESS)
ret = 0;
@@ -123,11 +126,11 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
rds_tcp_keepalive(sock);
sock = NULL;
} else {
- rds_tcp_restore_callbacks(sock, conn->c_transport_data);
+ rds_tcp_restore_callbacks(sock, cp->cp_transport_data);
}
out:
- mutex_unlock(&tc->t_conn_lock);
+ mutex_unlock(&tc->t_conn_path_lock);
if (sock)
sock_release(sock);
return ret;
@@ -142,12 +145,13 @@ out:
* callbacks to those set by TCP. Our callbacks won't execute again once we
* hold the sock lock.
*/
-void rds_tcp_conn_shutdown(struct rds_connection *conn)
+void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
struct socket *sock = tc->t_sock;
- rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock);
+ rdsdebug("shutting down conn %p tc %p sock %p\n",
+ cp->cp_conn, tc, sock);
if (sock) {
sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 686b1d0..ca975a2 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -35,6 +35,7 @@
#include <linux/in.h>
#include <net/tcp.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "tcp.h"
@@ -78,6 +79,7 @@ int rds_tcp_accept_one(struct socket *sock)
struct inet_sock *inet;
struct rds_tcp_connection *rs_tcp = NULL;
int conn_state;
+ struct rds_conn_path *cp;
if (!sock) /* module unload or netns delete in progress */
return -ENETUNREACH;
@@ -119,8 +121,9 @@ int rds_tcp_accept_one(struct socket *sock)
* rds_tcp_state_change() will do that cleanup
*/
rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
+ cp = &conn->c_path[0];
rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
- mutex_lock(&rs_tcp->t_conn_lock);
+ mutex_lock(&rs_tcp->t_conn_path_lock);
conn_state = rds_conn_state(conn);
if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP)
goto rst_nsk;
@@ -132,17 +135,17 @@ int rds_tcp_accept_one(struct socket *sock)
* c_transport_data.
*/
if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr) ||
- !conn->c_outgoing) {
+ !conn->c_path[0].cp_outgoing) {
goto rst_nsk;
} else {
- rds_tcp_reset_callbacks(new_sock, conn);
- conn->c_outgoing = 0;
+ rds_tcp_reset_callbacks(new_sock, cp);
+ conn->c_path[0].cp_outgoing = 0;
/* rds_connect_path_complete() marks RDS_CONN_UP */
- rds_connect_path_complete(conn, RDS_CONN_DISCONNECTING);
+ rds_connect_path_complete(cp, RDS_CONN_RESETTING);
}
} else {
- rds_tcp_set_callbacks(new_sock, conn);
- rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
+ rds_tcp_set_callbacks(new_sock, cp);
+ rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
}
new_sock = NULL;
ret = 0;
@@ -153,7 +156,7 @@ rst_nsk:
ret = 0;
out:
if (rs_tcp)
- mutex_unlock(&rs_tcp->t_conn_lock);
+ mutex_unlock(&rs_tcp->t_conn_path_lock);
if (new_sock)
sock_release(new_sock);
return ret;
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index c3196f9..ad4892e 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -147,7 +147,7 @@ static void rds_tcp_cong_recv(struct rds_connection *conn,
}
struct rds_tcp_desc_arg {
- struct rds_connection *conn;
+ struct rds_conn_path *conn_path;
gfp_t gfp;
};
@@ -155,8 +155,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t len)
{
struct rds_tcp_desc_arg *arg = desc->arg.data;
- struct rds_connection *conn = arg->conn;
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_conn_path *cp = arg->conn_path;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
struct rds_tcp_incoming *tinc = tc->t_tinc;
struct sk_buff *clone;
size_t left = len, to_copy;
@@ -171,14 +171,15 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
while (left) {
if (!tinc) {
tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
- arg->gfp);
+ arg->gfp);
if (!tinc) {
desc->error = -ENOMEM;
goto out;
}
tc->t_tinc = tinc;
rdsdebug("alloced tinc %p\n", tinc);
- rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
+ rds_inc_path_init(&tinc->ti_inc, cp,
+ cp->cp_conn->c_faddr);
/*
* XXX * we might be able to use the __ variants when
* we've already serialized at a higher level.
@@ -228,6 +229,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
}
if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
+ struct rds_connection *conn = cp->cp_conn;
+
if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
rds_tcp_cong_recv(conn, tinc);
else
@@ -250,15 +253,15 @@ out:
}
/* the caller has to hold the sock lock */
-static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp)
+static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
struct socket *sock = tc->t_sock;
read_descriptor_t desc;
struct rds_tcp_desc_arg arg;
/* It's like glib in the kernel! */
- arg.conn = conn;
+ arg.conn_path = cp;
arg.gfp = gfp;
desc.arg.data = &arg;
desc.error = 0;
@@ -278,16 +281,17 @@ static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp)
* if we fail to allocate we're in trouble.. blindly wait some time before
* trying again to see if the VM can free up something for us.
*/
-int rds_tcp_recv(struct rds_connection *conn)
+int rds_tcp_recv_path(struct rds_conn_path *cp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
struct socket *sock = tc->t_sock;
int ret = 0;
- rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
+ rdsdebug("recv worker path [%d] tc %p sock %p\n",
+ cp->cp_index, tc, sock);
lock_sock(sock->sk);
- ret = rds_tcp_read_sock(conn, GFP_KERNEL);
+ ret = rds_tcp_read_sock(cp, GFP_KERNEL);
release_sock(sock->sk);
return ret;
@@ -296,24 +300,24 @@ int rds_tcp_recv(struct rds_connection *conn)
void rds_tcp_data_ready(struct sock *sk)
{
void (*ready)(struct sock *sk);
- struct rds_connection *conn;
+ struct rds_conn_path *cp;
struct rds_tcp_connection *tc;
rdsdebug("data ready sk %p\n", sk);
read_lock_bh(&sk->sk_callback_lock);
- conn = sk->sk_user_data;
- if (!conn) { /* check for teardown race */
+ cp = sk->sk_user_data;
+ if (!cp) { /* check for teardown race */
ready = sk->sk_data_ready;
goto out;
}
- tc = conn->c_transport_data;
+ tc = cp->cp_transport_data;
ready = tc->t_orig_data_ready;
rds_tcp_stats_inc(s_tcp_data_ready_calls);
- if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM)
- queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+ if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM)
+ queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
out:
read_unlock_bh(&sk->sk_callback_lock);
ready(sk);
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 22d0f20..57e0f58 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -34,6 +34,7 @@
#include <linux/in.h>
#include <net/tcp.h>
+#include "rds_single_path.h"
#include "rds.h"
#include "tcp.h"
@@ -48,16 +49,16 @@ static void rds_tcp_cork(struct socket *sock, int val)
set_fs(oldfs);
}
-void rds_tcp_xmit_prepare(struct rds_connection *conn)
+void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
rds_tcp_cork(tc->t_sock, 1);
}
-void rds_tcp_xmit_complete(struct rds_connection *conn)
+void rds_tcp_xmit_path_complete(struct rds_conn_path *cp)
{
- struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
rds_tcp_cork(tc->t_sock, 0);
}
@@ -66,19 +67,19 @@ void rds_tcp_xmit_complete(struct rds_connection *conn)
static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
{
struct kvec vec = {
- .iov_base = data,
- .iov_len = len,
+ .iov_base = data,
+ .iov_len = len,
+ };
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
};
- struct msghdr msg = {
- .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
- };
return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
}
/* the core send_sem serializes this with other xmit and shutdown */
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off)
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
{
struct rds_tcp_connection *tc = conn->c_transport_data;
int done = 0;
@@ -177,27 +178,27 @@ static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
void rds_tcp_write_space(struct sock *sk)
{
void (*write_space)(struct sock *sk);
- struct rds_connection *conn;
+ struct rds_conn_path *cp;
struct rds_tcp_connection *tc;
read_lock_bh(&sk->sk_callback_lock);
- conn = sk->sk_user_data;
- if (!conn) {
+ cp = sk->sk_user_data;
+ if (!cp) {
write_space = sk->sk_write_space;
goto out;
}
- tc = conn->c_transport_data;
+ tc = cp->cp_transport_data;
rdsdebug("write_space for tc %p\n", tc);
write_space = tc->t_orig_write_space;
rds_tcp_stats_inc(s_tcp_write_space_calls);
rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
tc->t_last_seen_una = rds_tcp_snd_una(tc);
- rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
+ rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), rds_tcp_is_acked);
- if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
out:
read_unlock_bh(&sk->sk_callback_lock);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 4a32304..bc97d67 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -71,30 +71,30 @@
struct workqueue_struct *rds_wq;
EXPORT_SYMBOL_GPL(rds_wq);
-void rds_connect_path_complete(struct rds_connection *conn, int curr)
+void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
{
- if (!rds_conn_transition(conn, curr, RDS_CONN_UP)) {
+ if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) {
printk(KERN_WARNING "%s: Cannot transition to state UP, "
"current state is %d\n",
__func__,
- atomic_read(&conn->c_state));
- rds_conn_drop(conn);
+ atomic_read(&cp->cp_state));
+ rds_conn_path_drop(cp);
return;
}
rdsdebug("conn %p for %pI4 to %pI4 complete\n",
- conn, &conn->c_laddr, &conn->c_faddr);
+ cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
- conn->c_reconnect_jiffies = 0;
- set_bit(0, &conn->c_map_queued);
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
- queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+ cp->cp_reconnect_jiffies = 0;
+ set_bit(0, &cp->cp_conn->c_map_queued);
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+ queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
}
EXPORT_SYMBOL_GPL(rds_connect_path_complete);
void rds_connect_complete(struct rds_connection *conn)
{
- rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
+ rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING);
}
EXPORT_SYMBOL_GPL(rds_connect_complete);
@@ -116,70 +116,85 @@ EXPORT_SYMBOL_GPL(rds_connect_complete);
* We should *always* start with a random backoff; otherwise a broken connection
* will always take several iterations to be re-established.
*/
-void rds_queue_reconnect(struct rds_connection *conn)
+void rds_queue_reconnect(struct rds_conn_path *cp)
{
unsigned long rand;
+ struct rds_connection *conn = cp->cp_conn;
rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
conn, &conn->c_laddr, &conn->c_faddr,
- conn->c_reconnect_jiffies);
+ cp->cp_reconnect_jiffies);
- set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
- if (conn->c_reconnect_jiffies == 0) {
- conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
- queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+ /* let peer with smaller addr initiate reconnect, to avoid duels */
+ if (conn->c_trans->t_type == RDS_TRANS_TCP &&
+ conn->c_laddr > conn->c_faddr)
+ return;
+
+ set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
+ if (cp->cp_reconnect_jiffies == 0) {
+ cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
+ queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
return;
}
get_random_bytes(&rand, sizeof(rand));
rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
- rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
+ rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
conn, &conn->c_laddr, &conn->c_faddr);
- queue_delayed_work(rds_wq, &conn->c_conn_w,
- rand % conn->c_reconnect_jiffies);
+ queue_delayed_work(rds_wq, &cp->cp_conn_w,
+ rand % cp->cp_reconnect_jiffies);
- conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
+ cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2,
rds_sysctl_reconnect_max_jiffies);
}
void rds_connect_worker(struct work_struct *work)
{
- struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work);
+ struct rds_conn_path *cp = container_of(work,
+ struct rds_conn_path,
+ cp_conn_w.work);
+ struct rds_connection *conn = cp->cp_conn;
int ret;
- clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
- if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
- ret = conn->c_trans->conn_connect(conn);
+ clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
+ ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
+ if (ret) {
+ ret = conn->c_trans->conn_path_connect(cp);
rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
conn, &conn->c_laddr, &conn->c_faddr, ret);
if (ret) {
- if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN))
- rds_queue_reconnect(conn);
+ if (rds_conn_path_transition(cp,
+ RDS_CONN_CONNECTING,
+ RDS_CONN_DOWN))
+ rds_queue_reconnect(cp);
else
- rds_conn_error(conn, "RDS: connect failed\n");
+ rds_conn_path_error(cp,
+ "RDS: connect failed\n");
}
}
}
void rds_send_worker(struct work_struct *work)
{
- struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
+ struct rds_conn_path *cp = container_of(work,
+ struct rds_conn_path,
+ cp_send_w.work);
int ret;
- if (rds_conn_state(conn) == RDS_CONN_UP) {
- clear_bit(RDS_LL_SEND_FULL, &conn->c_flags);
- ret = rds_send_xmit(conn);
+ if (rds_conn_path_state(cp) == RDS_CONN_UP) {
+ clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags);
+ ret = rds_send_xmit(cp);
cond_resched();
- rdsdebug("conn %p ret %d\n", conn, ret);
+ rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
switch (ret) {
case -EAGAIN:
rds_stats_inc(s_send_immediate_retry);
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
break;
case -ENOMEM:
rds_stats_inc(s_send_delayed_retry);
- queue_delayed_work(rds_wq, &conn->c_send_w, 2);
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 2);
default:
break;
}
@@ -188,20 +203,22 @@ void rds_send_worker(struct work_struct *work)
void rds_recv_worker(struct work_struct *work)
{
- struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work);
+ struct rds_conn_path *cp = container_of(work,
+ struct rds_conn_path,
+ cp_recv_w.work);
int ret;
- if (rds_conn_state(conn) == RDS_CONN_UP) {
- ret = conn->c_trans->recv(conn);
- rdsdebug("conn %p ret %d\n", conn, ret);
+ if (rds_conn_path_state(cp) == RDS_CONN_UP) {
+ ret = cp->cp_conn->c_trans->recv_path(cp);
+ rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
switch (ret) {
case -EAGAIN:
rds_stats_inc(s_recv_immediate_retry);
- queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+ queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
break;
case -ENOMEM:
rds_stats_inc(s_recv_delayed_retry);
- queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
+ queue_delayed_work(rds_wq, &cp->cp_recv_w, 2);
default:
break;
}
@@ -210,9 +227,11 @@ void rds_recv_worker(struct work_struct *work)
void rds_shutdown_worker(struct work_struct *work)
{
- struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+ struct rds_conn_path *cp = container_of(work,
+ struct rds_conn_path,
+ cp_down_w);
- rds_conn_shutdown(conn);
+ rds_conn_shutdown(cp);
}
void rds_threads_exit(void)
diff --git a/net/rds/transport.c b/net/rds/transport.c
index f3afd1d..2ffd3e30 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -140,8 +140,7 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
rds_info_iter_unmap(iter);
down_read(&rds_trans_sem);
- for (i = 0; i < RDS_TRANS_COUNT; i++)
- {
+ for (i = 0; i < RDS_TRANS_COUNT; i++) {
trans = transports[i];
if (!trans || !trans->stats_info_copy)
continue;
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index e05a06e..6522e50 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -4,25 +4,27 @@
af-rxrpc-y := \
af_rxrpc.o \
- ar-accept.o \
- ar-ack.o \
- ar-call.o \
- ar-connection.o \
- ar-connevent.o \
- ar-error.o \
- ar-input.o \
- ar-key.o \
- ar-local.o \
- ar-output.o \
- ar-peer.o \
- ar-recvmsg.o \
- ar-security.o \
- ar-skbuff.o \
- ar-transport.o \
+ call_accept.o \
+ call_event.o \
+ call_object.o \
+ conn_client.o \
+ conn_event.o \
+ conn_object.o \
+ input.o \
insecure.o \
- misc.o
+ key.o \
+ local_event.o \
+ local_object.o \
+ misc.o \
+ output.o \
+ peer_event.o \
+ peer_object.o \
+ recvmsg.o \
+ security.o \
+ skbuff.o \
+ utils.o
-af-rxrpc-$(CONFIG_PROC_FS) += ar-proc.o
+af-rxrpc-$(CONFIG_PROC_FS) += proc.o
af-rxrpc-$(CONFIG_RXKAD) += rxkad.o
af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index a1bcb0e..5d3e795 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -97,11 +97,13 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
srx->transport_len > len)
return -EINVAL;
- if (srx->transport.family != rx->proto)
+ if (srx->transport.family != rx->family)
return -EAFNOSUPPORT;
switch (srx->transport.family) {
case AF_INET:
+ if (srx->transport_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
_debug("INET: %x @ %pI4",
ntohs(srx->transport.sin.sin_port),
&srx->transport.sin.sin_addr);
@@ -222,39 +224,6 @@ static int rxrpc_listen(struct socket *sock, int backlog)
return ret;
}
-/*
- * find a transport by address
- */
-struct rxrpc_transport *rxrpc_name_to_transport(struct rxrpc_sock *rx,
- struct sockaddr *addr,
- int addr_len, int flags,
- gfp_t gfp)
-{
- struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr;
- struct rxrpc_transport *trans;
- struct rxrpc_peer *peer;
-
- _enter("%p,%p,%d,%d", rx, addr, addr_len, flags);
-
- ASSERT(rx->local != NULL);
-
- if (rx->srx.transport_type != srx->transport_type)
- return ERR_PTR(-ESOCKTNOSUPPORT);
- if (rx->srx.transport.family != srx->transport.family)
- return ERR_PTR(-EAFNOSUPPORT);
-
- /* find a remote transport endpoint from the local one */
- peer = rxrpc_get_peer(srx, gfp);
- if (IS_ERR(peer))
- return ERR_CAST(peer);
-
- /* find a transport */
- trans = rxrpc_get_transport(rx->local, peer, gfp);
- rxrpc_put_peer(peer);
- _leave(" = %p", trans);
- return trans;
-}
-
/**
* rxrpc_kernel_begin_call - Allow a kernel service to begin a call
* @sock: The socket on which to make the call
@@ -275,39 +244,32 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
unsigned long user_call_ID,
gfp_t gfp)
{
- struct rxrpc_conn_bundle *bundle;
- struct rxrpc_transport *trans;
+ struct rxrpc_conn_parameters cp;
struct rxrpc_call *call;
struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ int ret;
_enter(",,%x,%lx", key_serial(key), user_call_ID);
- lock_sock(&rx->sk);
+ ret = rxrpc_validate_address(rx, srx, sizeof(*srx));
+ if (ret < 0)
+ return ERR_PTR(ret);
- trans = rxrpc_name_to_transport(rx, (struct sockaddr *)srx,
- sizeof(*srx), 0, gfp);
- if (IS_ERR(trans)) {
- call = ERR_CAST(trans);
- trans = NULL;
- goto out_notrans;
- }
+ lock_sock(&rx->sk);
if (!key)
key = rx->key;
if (key && !key->payload.data[0])
key = NULL; /* a no-security key */
- bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, gfp);
- if (IS_ERR(bundle)) {
- call = ERR_CAST(bundle);
- goto out;
- }
+ memset(&cp, 0, sizeof(cp));
+ cp.local = rx->local;
+ cp.key = key;
+ cp.security_level = 0;
+ cp.exclusive = false;
+ cp.service_id = srx->srx_service;
+ call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp);
- call = rxrpc_new_client_call(rx, trans, bundle, user_call_ID, gfp);
- rxrpc_put_bundle(trans, bundle);
-out:
- rxrpc_put_transport(trans);
-out_notrans:
release_sock(&rx->sk);
_leave(" = %p", call);
return call;
@@ -485,7 +447,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname,
ret = -EISCONN;
if (rx->sk.sk_state != RXRPC_UNBOUND)
goto error;
- set_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags);
+ rx->exclusive = true;
goto success;
case RXRPC_SECURITY_KEY:
@@ -598,7 +560,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
sk->sk_destruct = rxrpc_sock_destructor;
rx = rxrpc_sk(sk);
- rx->proto = protocol;
+ rx->family = protocol;
rx->calls = RB_ROOT;
INIT_LIST_HEAD(&rx->listen_link);
@@ -660,16 +622,8 @@ static int rxrpc_release_sock(struct sock *sk)
flush_workqueue(rxrpc_workqueue);
rxrpc_purge_queue(&sk->sk_receive_queue);
- if (rx->conn) {
- rxrpc_put_connection(rx->conn);
- rx->conn = NULL;
- }
-
- if (rx->local) {
- rxrpc_put_local(rx->local);
- rx->local = NULL;
- }
-
+ rxrpc_put_local(rx->local);
+ rx->local = NULL;
key_put(rx->key);
rx->key = NULL;
key_put(rx->securities);
@@ -834,14 +788,29 @@ static void __exit af_rxrpc_exit(void)
proto_unregister(&rxrpc_proto);
rxrpc_destroy_all_calls();
rxrpc_destroy_all_connections();
- rxrpc_destroy_all_transports();
- rxrpc_destroy_all_peers();
- rxrpc_destroy_all_locals();
ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0);
+ /* We need to flush the scheduled work twice because the local endpoint
+ * records involve a work item in their destruction as they can only be
+ * destroyed from process context. However, a connection may have a
+ * work item outstanding - and this will pin the local endpoint record
+ * until the connection goes away.
+ *
+ * Peers don't pin locals and calls pin sockets - which prevents the
+ * module from being unloaded - so we should only need two flushes.
+ */
_debug("flush scheduled work");
flush_workqueue(rxrpc_workqueue);
+ _debug("flush scheduled work 2");
+ flush_workqueue(rxrpc_workqueue);
+ _debug("synchronise RCU");
+ rcu_barrier();
+ _debug("destroy locals");
+ ASSERT(idr_is_empty(&rxrpc_client_conn_ids));
+ idr_destroy(&rxrpc_client_conn_ids);
+ rxrpc_destroy_all_locals();
+
remove_proc_entry("rxrpc_conns", init_net.proc_net);
remove_proc_entry("rxrpc_calls", init_net.proc_net);
destroy_workqueue(rxrpc_workqueue);
diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c
deleted file mode 100644
index 8ecde4b..0000000
--- a/net/rxrpc/ar-connection.c
+++ /dev/null
@@ -1,912 +0,0 @@
-/* RxRPC virtual connection handler
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/net.h>
-#include <linux/skbuff.h>
-#include <linux/crypto.h>
-#include <net/sock.h>
-#include <net/af_rxrpc.h>
-#include "ar-internal.h"
-
-/*
- * Time till a connection expires after last use (in seconds).
- */
-unsigned int rxrpc_connection_expiry = 10 * 60;
-
-static void rxrpc_connection_reaper(struct work_struct *work);
-
-LIST_HEAD(rxrpc_connections);
-DEFINE_RWLOCK(rxrpc_connection_lock);
-static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper);
-
-/*
- * allocate a new client connection bundle
- */
-static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp)
-{
- struct rxrpc_conn_bundle *bundle;
-
- _enter("");
-
- bundle = kzalloc(sizeof(struct rxrpc_conn_bundle), gfp);
- if (bundle) {
- INIT_LIST_HEAD(&bundle->unused_conns);
- INIT_LIST_HEAD(&bundle->avail_conns);
- INIT_LIST_HEAD(&bundle->busy_conns);
- init_waitqueue_head(&bundle->chanwait);
- atomic_set(&bundle->usage, 1);
- }
-
- _leave(" = %p", bundle);
- return bundle;
-}
-
-/*
- * compare bundle parameters with what we're looking for
- * - return -ve, 0 or +ve
- */
-static inline
-int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle,
- struct key *key, u16 service_id)
-{
- return (bundle->service_id - service_id) ?:
- ((unsigned long)bundle->key - (unsigned long)key);
-}
-
-/*
- * get bundle of client connections that a client socket can make use of
- */
-struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx,
- struct rxrpc_transport *trans,
- struct key *key,
- u16 service_id,
- gfp_t gfp)
-{
- struct rxrpc_conn_bundle *bundle, *candidate;
- struct rb_node *p, *parent, **pp;
-
- _enter("%p{%x},%x,%hx,",
- rx, key_serial(key), trans->debug_id, service_id);
-
- /* search the extant bundles first for one that matches the specified
- * user ID */
- spin_lock(&trans->client_lock);
-
- p = trans->bundles.rb_node;
- while (p) {
- bundle = rb_entry(p, struct rxrpc_conn_bundle, node);
-
- if (rxrpc_cmp_bundle(bundle, key, service_id) < 0)
- p = p->rb_left;
- else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0)
- p = p->rb_right;
- else
- goto found_extant_bundle;
- }
-
- spin_unlock(&trans->client_lock);
-
- /* not yet present - create a candidate for a new record and then
- * redo the search */
- candidate = rxrpc_alloc_bundle(gfp);
- if (!candidate) {
- _leave(" = -ENOMEM");
- return ERR_PTR(-ENOMEM);
- }
-
- candidate->key = key_get(key);
- candidate->service_id = service_id;
-
- spin_lock(&trans->client_lock);
-
- pp = &trans->bundles.rb_node;
- parent = NULL;
- while (*pp) {
- parent = *pp;
- bundle = rb_entry(parent, struct rxrpc_conn_bundle, node);
-
- if (rxrpc_cmp_bundle(bundle, key, service_id) < 0)
- pp = &(*pp)->rb_left;
- else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0)
- pp = &(*pp)->rb_right;
- else
- goto found_extant_second;
- }
-
- /* second search also failed; add the new bundle */
- bundle = candidate;
- candidate = NULL;
-
- rb_link_node(&bundle->node, parent, pp);
- rb_insert_color(&bundle->node, &trans->bundles);
- spin_unlock(&trans->client_lock);
- _net("BUNDLE new on trans %d", trans->debug_id);
- _leave(" = %p [new]", bundle);
- return bundle;
-
- /* we found the bundle in the list immediately */
-found_extant_bundle:
- atomic_inc(&bundle->usage);
- spin_unlock(&trans->client_lock);
- _net("BUNDLE old on trans %d", trans->debug_id);
- _leave(" = %p [extant %d]", bundle, atomic_read(&bundle->usage));
- return bundle;
-
- /* we found the bundle on the second time through the list */
-found_extant_second:
- atomic_inc(&bundle->usage);
- spin_unlock(&trans->client_lock);
- kfree(candidate);
- _net("BUNDLE old2 on trans %d", trans->debug_id);
- _leave(" = %p [second %d]", bundle, atomic_read(&bundle->usage));
- return bundle;
-}
-
-/*
- * release a bundle
- */
-void rxrpc_put_bundle(struct rxrpc_transport *trans,
- struct rxrpc_conn_bundle *bundle)
-{
- _enter("%p,%p{%d}",trans, bundle, atomic_read(&bundle->usage));
-
- if (atomic_dec_and_lock(&bundle->usage, &trans->client_lock)) {
- _debug("Destroy bundle");
- rb_erase(&bundle->node, &trans->bundles);
- spin_unlock(&trans->client_lock);
- ASSERT(list_empty(&bundle->unused_conns));
- ASSERT(list_empty(&bundle->avail_conns));
- ASSERT(list_empty(&bundle->busy_conns));
- ASSERTCMP(bundle->num_conns, ==, 0);
- key_put(bundle->key);
- kfree(bundle);
- }
-
- _leave("");
-}
-
-/*
- * allocate a new connection
- */
-static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
-{
- struct rxrpc_connection *conn;
-
- _enter("");
-
- conn = kzalloc(sizeof(struct rxrpc_connection), gfp);
- if (conn) {
- INIT_WORK(&conn->processor, &rxrpc_process_connection);
- INIT_LIST_HEAD(&conn->bundle_link);
- conn->calls = RB_ROOT;
- skb_queue_head_init(&conn->rx_queue);
- conn->security = &rxrpc_no_security;
- rwlock_init(&conn->lock);
- spin_lock_init(&conn->state_lock);
- atomic_set(&conn->usage, 1);
- conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
- conn->avail_calls = RXRPC_MAXCALLS;
- conn->size_align = 4;
- conn->header_size = sizeof(struct rxrpc_wire_header);
- }
-
- _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0);
- return conn;
-}
-
-/*
- * assign a connection ID to a connection and add it to the transport's
- * connection lookup tree
- * - called with transport client lock held
- */
-static void rxrpc_assign_connection_id(struct rxrpc_connection *conn)
-{
- struct rxrpc_connection *xconn;
- struct rb_node *parent, **p;
- __be32 epoch;
- u32 cid;
-
- _enter("");
-
- epoch = conn->epoch;
-
- write_lock_bh(&conn->trans->conn_lock);
-
- conn->trans->conn_idcounter += RXRPC_CID_INC;
- if (conn->trans->conn_idcounter < RXRPC_CID_INC)
- conn->trans->conn_idcounter = RXRPC_CID_INC;
- cid = conn->trans->conn_idcounter;
-
-attempt_insertion:
- parent = NULL;
- p = &conn->trans->client_conns.rb_node;
-
- while (*p) {
- parent = *p;
- xconn = rb_entry(parent, struct rxrpc_connection, node);
-
- if (epoch < xconn->epoch)
- p = &(*p)->rb_left;
- else if (epoch > xconn->epoch)
- p = &(*p)->rb_right;
- else if (cid < xconn->cid)
- p = &(*p)->rb_left;
- else if (cid > xconn->cid)
- p = &(*p)->rb_right;
- else
- goto id_exists;
- }
-
- /* we've found a suitable hole - arrange for this connection to occupy
- * it */
- rb_link_node(&conn->node, parent, p);
- rb_insert_color(&conn->node, &conn->trans->client_conns);
-
- conn->cid = cid;
- write_unlock_bh(&conn->trans->conn_lock);
- _leave(" [CID %x]", cid);
- return;
-
- /* we found a connection with the proposed ID - walk the tree from that
- * point looking for the next unused ID */
-id_exists:
- for (;;) {
- cid += RXRPC_CID_INC;
- if (cid < RXRPC_CID_INC) {
- cid = RXRPC_CID_INC;
- conn->trans->conn_idcounter = cid;
- goto attempt_insertion;
- }
-
- parent = rb_next(parent);
- if (!parent)
- goto attempt_insertion;
-
- xconn = rb_entry(parent, struct rxrpc_connection, node);
- if (epoch < xconn->epoch ||
- cid < xconn->cid)
- goto attempt_insertion;
- }
-}
-
-/*
- * add a call to a connection's call-by-ID tree
- */
-static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn,
- struct rxrpc_call *call)
-{
- struct rxrpc_call *xcall;
- struct rb_node *parent, **p;
- __be32 call_id;
-
- write_lock_bh(&conn->lock);
-
- call_id = call->call_id;
- p = &conn->calls.rb_node;
- parent = NULL;
- while (*p) {
- parent = *p;
- xcall = rb_entry(parent, struct rxrpc_call, conn_node);
-
- if (call_id < xcall->call_id)
- p = &(*p)->rb_left;
- else if (call_id > xcall->call_id)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&call->conn_node, parent, p);
- rb_insert_color(&call->conn_node, &conn->calls);
-
- write_unlock_bh(&conn->lock);
-}
-
-/*
- * connect a call on an exclusive connection
- */
-static int rxrpc_connect_exclusive(struct rxrpc_sock *rx,
- struct rxrpc_transport *trans,
- u16 service_id,
- struct rxrpc_call *call,
- gfp_t gfp)
-{
- struct rxrpc_connection *conn;
- int chan, ret;
-
- _enter("");
-
- conn = rx->conn;
- if (!conn) {
- /* not yet present - create a candidate for a new connection
- * and then redo the check */
- conn = rxrpc_alloc_connection(gfp);
- if (!conn) {
- _leave(" = -ENOMEM");
- return -ENOMEM;
- }
-
- conn->trans = trans;
- conn->bundle = NULL;
- conn->service_id = service_id;
- conn->epoch = rxrpc_epoch;
- conn->in_clientflag = 0;
- conn->out_clientflag = RXRPC_CLIENT_INITIATED;
- conn->cid = 0;
- conn->state = RXRPC_CONN_CLIENT;
- conn->avail_calls = RXRPC_MAXCALLS - 1;
- conn->security_level = rx->min_sec_level;
- conn->key = key_get(rx->key);
-
- ret = rxrpc_init_client_conn_security(conn);
- if (ret < 0) {
- key_put(conn->key);
- kfree(conn);
- _leave(" = %d [key]", ret);
- return ret;
- }
-
- write_lock_bh(&rxrpc_connection_lock);
- list_add_tail(&conn->link, &rxrpc_connections);
- write_unlock_bh(&rxrpc_connection_lock);
-
- spin_lock(&trans->client_lock);
- atomic_inc(&trans->usage);
-
- _net("CONNECT EXCL new %d on TRANS %d",
- conn->debug_id, conn->trans->debug_id);
-
- rxrpc_assign_connection_id(conn);
- rx->conn = conn;
- } else {
- spin_lock(&trans->client_lock);
- }
-
- /* we've got a connection with a free channel and we can now attach the
- * call to it
- * - we're holding the transport's client lock
- * - we're holding a reference on the connection
- */
- for (chan = 0; chan < RXRPC_MAXCALLS; chan++)
- if (!conn->channels[chan])
- goto found_channel;
- goto no_free_channels;
-
-found_channel:
- atomic_inc(&conn->usage);
- conn->channels[chan] = call;
- call->conn = conn;
- call->channel = chan;
- call->cid = conn->cid | chan;
- call->call_id = ++conn->call_counter;
-
- _net("CONNECT client on conn %d chan %d as call %x",
- conn->debug_id, chan, call->call_id);
-
- spin_unlock(&trans->client_lock);
-
- rxrpc_add_call_ID_to_conn(conn, call);
- _leave(" = 0");
- return 0;
-
-no_free_channels:
- spin_unlock(&trans->client_lock);
- _leave(" = -ENOSR");
- return -ENOSR;
-}
-
-/*
- * find a connection for a call
- * - called in process context with IRQs enabled
- */
-int rxrpc_connect_call(struct rxrpc_sock *rx,
- struct rxrpc_transport *trans,
- struct rxrpc_conn_bundle *bundle,
- struct rxrpc_call *call,
- gfp_t gfp)
-{
- struct rxrpc_connection *conn, *candidate;
- int chan, ret;
-
- DECLARE_WAITQUEUE(myself, current);
-
- _enter("%p,%lx,", rx, call->user_call_ID);
-
- if (test_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags))
- return rxrpc_connect_exclusive(rx, trans, bundle->service_id,
- call, gfp);
-
- spin_lock(&trans->client_lock);
- for (;;) {
- /* see if the bundle has a call slot available */
- if (!list_empty(&bundle->avail_conns)) {
- _debug("avail");
- conn = list_entry(bundle->avail_conns.next,
- struct rxrpc_connection,
- bundle_link);
- if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
- list_del_init(&conn->bundle_link);
- bundle->num_conns--;
- continue;
- }
- if (--conn->avail_calls == 0)
- list_move(&conn->bundle_link,
- &bundle->busy_conns);
- ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS);
- ASSERT(conn->channels[0] == NULL ||
- conn->channels[1] == NULL ||
- conn->channels[2] == NULL ||
- conn->channels[3] == NULL);
- atomic_inc(&conn->usage);
- break;
- }
-
- if (!list_empty(&bundle->unused_conns)) {
- _debug("unused");
- conn = list_entry(bundle->unused_conns.next,
- struct rxrpc_connection,
- bundle_link);
- if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
- list_del_init(&conn->bundle_link);
- bundle->num_conns--;
- continue;
- }
- ASSERTCMP(conn->avail_calls, ==, RXRPC_MAXCALLS);
- conn->avail_calls = RXRPC_MAXCALLS - 1;
- ASSERT(conn->channels[0] == NULL &&
- conn->channels[1] == NULL &&
- conn->channels[2] == NULL &&
- conn->channels[3] == NULL);
- atomic_inc(&conn->usage);
- list_move(&conn->bundle_link, &bundle->avail_conns);
- break;
- }
-
- /* need to allocate a new connection */
- _debug("get new conn [%d]", bundle->num_conns);
-
- spin_unlock(&trans->client_lock);
-
- if (signal_pending(current))
- goto interrupted;
-
- if (bundle->num_conns >= 20) {
- _debug("too many conns");
-
- if (!gfpflags_allow_blocking(gfp)) {
- _leave(" = -EAGAIN");
- return -EAGAIN;
- }
-
- add_wait_queue(&bundle->chanwait, &myself);
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (bundle->num_conns < 20 ||
- !list_empty(&bundle->unused_conns) ||
- !list_empty(&bundle->avail_conns))
- break;
- if (signal_pending(current))
- goto interrupted_dequeue;
- schedule();
- }
- remove_wait_queue(&bundle->chanwait, &myself);
- __set_current_state(TASK_RUNNING);
- spin_lock(&trans->client_lock);
- continue;
- }
-
- /* not yet present - create a candidate for a new connection and then
- * redo the check */
- candidate = rxrpc_alloc_connection(gfp);
- if (!candidate) {
- _leave(" = -ENOMEM");
- return -ENOMEM;
- }
-
- candidate->trans = trans;
- candidate->bundle = bundle;
- candidate->service_id = bundle->service_id;
- candidate->epoch = rxrpc_epoch;
- candidate->in_clientflag = 0;
- candidate->out_clientflag = RXRPC_CLIENT_INITIATED;
- candidate->cid = 0;
- candidate->state = RXRPC_CONN_CLIENT;
- candidate->avail_calls = RXRPC_MAXCALLS;
- candidate->security_level = rx->min_sec_level;
- candidate->key = key_get(bundle->key);
-
- ret = rxrpc_init_client_conn_security(candidate);
- if (ret < 0) {
- key_put(candidate->key);
- kfree(candidate);
- _leave(" = %d [key]", ret);
- return ret;
- }
-
- write_lock_bh(&rxrpc_connection_lock);
- list_add_tail(&candidate->link, &rxrpc_connections);
- write_unlock_bh(&rxrpc_connection_lock);
-
- spin_lock(&trans->client_lock);
-
- list_add(&candidate->bundle_link, &bundle->unused_conns);
- bundle->num_conns++;
- atomic_inc(&bundle->usage);
- atomic_inc(&trans->usage);
-
- _net("CONNECT new %d on TRANS %d",
- candidate->debug_id, candidate->trans->debug_id);
-
- rxrpc_assign_connection_id(candidate);
- candidate->security->prime_packet_security(candidate);
-
- /* leave the candidate lurking in zombie mode attached to the
- * bundle until we're ready for it */
- rxrpc_put_connection(candidate);
- candidate = NULL;
- }
-
- /* we've got a connection with a free channel and we can now attach the
- * call to it
- * - we're holding the transport's client lock
- * - we're holding a reference on the connection
- * - we're holding a reference on the bundle
- */
- for (chan = 0; chan < RXRPC_MAXCALLS; chan++)
- if (!conn->channels[chan])
- goto found_channel;
- ASSERT(conn->channels[0] == NULL ||
- conn->channels[1] == NULL ||
- conn->channels[2] == NULL ||
- conn->channels[3] == NULL);
- BUG();
-
-found_channel:
- conn->channels[chan] = call;
- call->conn = conn;
- call->channel = chan;
- call->cid = conn->cid | chan;
- call->call_id = ++conn->call_counter;
-
- _net("CONNECT client on conn %d chan %d as call %x",
- conn->debug_id, chan, call->call_id);
-
- ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS);
- spin_unlock(&trans->client_lock);
-
- rxrpc_add_call_ID_to_conn(conn, call);
-
- _leave(" = 0");
- return 0;
-
-interrupted_dequeue:
- remove_wait_queue(&bundle->chanwait, &myself);
- __set_current_state(TASK_RUNNING);
-interrupted:
- _leave(" = -ERESTARTSYS");
- return -ERESTARTSYS;
-}
-
-/*
- * get a record of an incoming connection
- */
-struct rxrpc_connection *
-rxrpc_incoming_connection(struct rxrpc_transport *trans,
- struct rxrpc_host_header *hdr)
-{
- struct rxrpc_connection *conn, *candidate = NULL;
- struct rb_node *p, **pp;
- const char *new = "old";
- __be32 epoch;
- u32 cid;
-
- _enter("");
-
- ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED);
-
- epoch = hdr->epoch;
- cid = hdr->cid & RXRPC_CIDMASK;
-
- /* search the connection list first */
- read_lock_bh(&trans->conn_lock);
-
- p = trans->server_conns.rb_node;
- while (p) {
- conn = rb_entry(p, struct rxrpc_connection, node);
-
- _debug("maybe %x", conn->cid);
-
- if (epoch < conn->epoch)
- p = p->rb_left;
- else if (epoch > conn->epoch)
- p = p->rb_right;
- else if (cid < conn->cid)
- p = p->rb_left;
- else if (cid > conn->cid)
- p = p->rb_right;
- else
- goto found_extant_connection;
- }
- read_unlock_bh(&trans->conn_lock);
-
- /* not yet present - create a candidate for a new record and then
- * redo the search */
- candidate = rxrpc_alloc_connection(GFP_NOIO);
- if (!candidate) {
- _leave(" = -ENOMEM");
- return ERR_PTR(-ENOMEM);
- }
-
- candidate->trans = trans;
- candidate->epoch = hdr->epoch;
- candidate->cid = hdr->cid & RXRPC_CIDMASK;
- candidate->service_id = hdr->serviceId;
- candidate->security_ix = hdr->securityIndex;
- candidate->in_clientflag = RXRPC_CLIENT_INITIATED;
- candidate->out_clientflag = 0;
- candidate->state = RXRPC_CONN_SERVER;
- if (candidate->service_id)
- candidate->state = RXRPC_CONN_SERVER_UNSECURED;
-
- write_lock_bh(&trans->conn_lock);
-
- pp = &trans->server_conns.rb_node;
- p = NULL;
- while (*pp) {
- p = *pp;
- conn = rb_entry(p, struct rxrpc_connection, node);
-
- if (epoch < conn->epoch)
- pp = &(*pp)->rb_left;
- else if (epoch > conn->epoch)
- pp = &(*pp)->rb_right;
- else if (cid < conn->cid)
- pp = &(*pp)->rb_left;
- else if (cid > conn->cid)
- pp = &(*pp)->rb_right;
- else
- goto found_extant_second;
- }
-
- /* we can now add the new candidate to the list */
- conn = candidate;
- candidate = NULL;
- rb_link_node(&conn->node, p, pp);
- rb_insert_color(&conn->node, &trans->server_conns);
- atomic_inc(&conn->trans->usage);
-
- write_unlock_bh(&trans->conn_lock);
-
- write_lock_bh(&rxrpc_connection_lock);
- list_add_tail(&conn->link, &rxrpc_connections);
- write_unlock_bh(&rxrpc_connection_lock);
-
- new = "new";
-
-success:
- _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->cid);
-
- _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
- return conn;
-
- /* we found the connection in the list immediately */
-found_extant_connection:
- if (hdr->securityIndex != conn->security_ix) {
- read_unlock_bh(&trans->conn_lock);
- goto security_mismatch;
- }
- atomic_inc(&conn->usage);
- read_unlock_bh(&trans->conn_lock);
- goto success;
-
- /* we found the connection on the second time through the list */
-found_extant_second:
- if (hdr->securityIndex != conn->security_ix) {
- write_unlock_bh(&trans->conn_lock);
- goto security_mismatch;
- }
- atomic_inc(&conn->usage);
- write_unlock_bh(&trans->conn_lock);
- kfree(candidate);
- goto success;
-
-security_mismatch:
- kfree(candidate);
- _leave(" = -EKEYREJECTED");
- return ERR_PTR(-EKEYREJECTED);
-}
-
-/*
- * find a connection based on transport and RxRPC connection ID for an incoming
- * packet
- */
-struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans,
- struct rxrpc_host_header *hdr)
-{
- struct rxrpc_connection *conn;
- struct rb_node *p;
- u32 epoch, cid;
-
- _enter(",{%x,%x}", hdr->cid, hdr->flags);
-
- read_lock_bh(&trans->conn_lock);
-
- cid = hdr->cid & RXRPC_CIDMASK;
- epoch = hdr->epoch;
-
- if (hdr->flags & RXRPC_CLIENT_INITIATED)
- p = trans->server_conns.rb_node;
- else
- p = trans->client_conns.rb_node;
-
- while (p) {
- conn = rb_entry(p, struct rxrpc_connection, node);
-
- _debug("maybe %x", conn->cid);
-
- if (epoch < conn->epoch)
- p = p->rb_left;
- else if (epoch > conn->epoch)
- p = p->rb_right;
- else if (cid < conn->cid)
- p = p->rb_left;
- else if (cid > conn->cid)
- p = p->rb_right;
- else
- goto found;
- }
-
- read_unlock_bh(&trans->conn_lock);
- _leave(" = NULL");
- return NULL;
-
-found:
- atomic_inc(&conn->usage);
- read_unlock_bh(&trans->conn_lock);
- _leave(" = %p", conn);
- return conn;
-}
-
-/*
- * release a virtual connection
- */
-void rxrpc_put_connection(struct rxrpc_connection *conn)
-{
- _enter("%p{u=%d,d=%d}",
- conn, atomic_read(&conn->usage), conn->debug_id);
-
- ASSERTCMP(atomic_read(&conn->usage), >, 0);
-
- conn->put_time = ktime_get_seconds();
- if (atomic_dec_and_test(&conn->usage)) {
- _debug("zombie");
- rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
- }
-
- _leave("");
-}
-
-/*
- * destroy a virtual connection
- */
-static void rxrpc_destroy_connection(struct rxrpc_connection *conn)
-{
- _enter("%p{%d}", conn, atomic_read(&conn->usage));
-
- ASSERTCMP(atomic_read(&conn->usage), ==, 0);
-
- _net("DESTROY CONN %d", conn->debug_id);
-
- if (conn->bundle)
- rxrpc_put_bundle(conn->trans, conn->bundle);
-
- ASSERT(RB_EMPTY_ROOT(&conn->calls));
- rxrpc_purge_queue(&conn->rx_queue);
-
- conn->security->clear(conn);
- key_put(conn->key);
- key_put(conn->server_key);
-
- rxrpc_put_transport(conn->trans);
- kfree(conn);
- _leave("");
-}
-
-/*
- * reap dead connections
- */
-static void rxrpc_connection_reaper(struct work_struct *work)
-{
- struct rxrpc_connection *conn, *_p;
- unsigned long now, earliest, reap_time;
-
- LIST_HEAD(graveyard);
-
- _enter("");
-
- now = ktime_get_seconds();
- earliest = ULONG_MAX;
-
- write_lock_bh(&rxrpc_connection_lock);
- list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) {
- _debug("reap CONN %d { u=%d,t=%ld }",
- conn->debug_id, atomic_read(&conn->usage),
- (long) now - (long) conn->put_time);
-
- if (likely(atomic_read(&conn->usage) > 0))
- continue;
-
- spin_lock(&conn->trans->client_lock);
- write_lock(&conn->trans->conn_lock);
- reap_time = conn->put_time + rxrpc_connection_expiry;
-
- if (atomic_read(&conn->usage) > 0) {
- ;
- } else if (reap_time <= now) {
- list_move_tail(&conn->link, &graveyard);
- if (conn->out_clientflag)
- rb_erase(&conn->node,
- &conn->trans->client_conns);
- else
- rb_erase(&conn->node,
- &conn->trans->server_conns);
- if (conn->bundle) {
- list_del_init(&conn->bundle_link);
- conn->bundle->num_conns--;
- }
-
- } else if (reap_time < earliest) {
- earliest = reap_time;
- }
-
- write_unlock(&conn->trans->conn_lock);
- spin_unlock(&conn->trans->client_lock);
- }
- write_unlock_bh(&rxrpc_connection_lock);
-
- if (earliest != ULONG_MAX) {
- _debug("reschedule reaper %ld", (long) earliest - now);
- ASSERTCMP(earliest, >, now);
- rxrpc_queue_delayed_work(&rxrpc_connection_reap,
- (earliest - now) * HZ);
- }
-
- /* then destroy all those pulled out */
- while (!list_empty(&graveyard)) {
- conn = list_entry(graveyard.next, struct rxrpc_connection,
- link);
- list_del_init(&conn->link);
-
- ASSERTCMP(atomic_read(&conn->usage), ==, 0);
- rxrpc_destroy_connection(conn);
- }
-
- _leave("");
-}
-
-/*
- * preemptively destroy all the connection records rather than waiting for them
- * to time out
- */
-void __exit rxrpc_destroy_all_connections(void)
-{
- _enter("");
-
- rxrpc_connection_expiry = 0;
- cancel_delayed_work(&rxrpc_connection_reap);
- rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
-
- _leave("");
-}
diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c
deleted file mode 100644
index 3e82d6f..0000000
--- a/net/rxrpc/ar-error.c
+++ /dev/null
@@ -1,230 +0,0 @@
-/* Error message handling (ICMP)
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/net.h>
-#include <linux/skbuff.h>
-#include <linux/errqueue.h>
-#include <linux/udp.h>
-#include <linux/in.h>
-#include <linux/in6.h>
-#include <linux/icmp.h>
-#include <net/sock.h>
-#include <net/af_rxrpc.h>
-#include <net/ip.h>
-#include "ar-internal.h"
-
-/*
- * handle an error received on the local endpoint
- */
-void rxrpc_UDP_error_report(struct sock *sk)
-{
- struct sock_exterr_skb *serr;
- struct rxrpc_transport *trans;
- struct rxrpc_local *local = sk->sk_user_data;
- struct rxrpc_peer *peer;
- struct sk_buff *skb;
- __be32 addr;
- __be16 port;
-
- _enter("%p{%d}", sk, local->debug_id);
-
- skb = sock_dequeue_err_skb(sk);
- if (!skb) {
- _leave("UDP socket errqueue empty");
- return;
- }
- serr = SKB_EXT_ERR(skb);
- if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
- _leave("UDP empty message");
- kfree_skb(skb);
- return;
- }
-
- rxrpc_new_skb(skb);
-
- addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset);
- port = serr->port;
-
- _net("Rx UDP Error from %pI4:%hu", &addr, ntohs(port));
- _debug("Msg l:%d d:%d", skb->len, skb->data_len);
-
- peer = rxrpc_find_peer(local, addr, port);
- if (IS_ERR(peer)) {
- rxrpc_free_skb(skb);
- _leave(" [no peer]");
- return;
- }
-
- trans = rxrpc_find_transport(local, peer);
- if (!trans) {
- rxrpc_put_peer(peer);
- rxrpc_free_skb(skb);
- _leave(" [no trans]");
- return;
- }
-
- if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP &&
- serr->ee.ee_type == ICMP_DEST_UNREACH &&
- serr->ee.ee_code == ICMP_FRAG_NEEDED
- ) {
- u32 mtu = serr->ee.ee_info;
-
- _net("Rx Received ICMP Fragmentation Needed (%d)", mtu);
-
- /* wind down the local interface MTU */
- if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) {
- peer->if_mtu = mtu;
- _net("I/F MTU %u", mtu);
- }
-
- if (mtu == 0) {
- /* they didn't give us a size, estimate one */
- mtu = peer->if_mtu;
- if (mtu > 1500) {
- mtu >>= 1;
- if (mtu < 1500)
- mtu = 1500;
- } else {
- mtu -= 100;
- if (mtu < peer->hdrsize)
- mtu = peer->hdrsize + 4;
- }
- }
-
- if (mtu < peer->mtu) {
- spin_lock_bh(&peer->lock);
- peer->mtu = mtu;
- peer->maxdata = peer->mtu - peer->hdrsize;
- spin_unlock_bh(&peer->lock);
- _net("Net MTU %u (maxdata %u)",
- peer->mtu, peer->maxdata);
- }
- }
-
- rxrpc_put_peer(peer);
-
- /* pass the transport ref to error_handler to release */
- skb_queue_tail(&trans->error_queue, skb);
- rxrpc_queue_work(&trans->error_handler);
- _leave("");
-}
-
-/*
- * deal with UDP error messages
- */
-void rxrpc_UDP_error_handler(struct work_struct *work)
-{
- struct sock_extended_err *ee;
- struct sock_exterr_skb *serr;
- struct rxrpc_transport *trans =
- container_of(work, struct rxrpc_transport, error_handler);
- struct sk_buff *skb;
- int err;
-
- _enter("");
-
- skb = skb_dequeue(&trans->error_queue);
- if (!skb)
- return;
-
- serr = SKB_EXT_ERR(skb);
- ee = &serr->ee;
-
- _net("Rx Error o=%d t=%d c=%d e=%d",
- ee->ee_origin, ee->ee_type, ee->ee_code, ee->ee_errno);
-
- err = ee->ee_errno;
-
- switch (ee->ee_origin) {
- case SO_EE_ORIGIN_ICMP:
- switch (ee->ee_type) {
- case ICMP_DEST_UNREACH:
- switch (ee->ee_code) {
- case ICMP_NET_UNREACH:
- _net("Rx Received ICMP Network Unreachable");
- break;
- case ICMP_HOST_UNREACH:
- _net("Rx Received ICMP Host Unreachable");
- break;
- case ICMP_PORT_UNREACH:
- _net("Rx Received ICMP Port Unreachable");
- break;
- case ICMP_NET_UNKNOWN:
- _net("Rx Received ICMP Unknown Network");
- break;
- case ICMP_HOST_UNKNOWN:
- _net("Rx Received ICMP Unknown Host");
- break;
- default:
- _net("Rx Received ICMP DestUnreach code=%u",
- ee->ee_code);
- break;
- }
- break;
-
- case ICMP_TIME_EXCEEDED:
- _net("Rx Received ICMP TTL Exceeded");
- break;
-
- default:
- _proto("Rx Received ICMP error { type=%u code=%u }",
- ee->ee_type, ee->ee_code);
- break;
- }
- break;
-
- case SO_EE_ORIGIN_LOCAL:
- _proto("Rx Received local error { error=%d }",
- ee->ee_errno);
- break;
-
- case SO_EE_ORIGIN_NONE:
- case SO_EE_ORIGIN_ICMP6:
- default:
- _proto("Rx Received error report { orig=%u }",
- ee->ee_origin);
- break;
- }
-
- /* terminate all the affected calls if there's an unrecoverable
- * error */
- if (err) {
- struct rxrpc_call *call, *_n;
-
- _debug("ISSUE ERROR %d", err);
-
- spin_lock_bh(&trans->peer->lock);
- trans->peer->net_error = err;
-
- list_for_each_entry_safe(call, _n, &trans->peer->error_targets,
- error_link) {
- write_lock(&call->state_lock);
- if (call->state != RXRPC_CALL_COMPLETE &&
- call->state < RXRPC_CALL_NETWORK_ERROR) {
- call->state = RXRPC_CALL_NETWORK_ERROR;
- set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
- rxrpc_queue_call(call);
- }
- write_unlock(&call->state_lock);
- list_del_init(&call->error_link);
- }
-
- spin_unlock_bh(&trans->peer->lock);
- }
-
- if (!skb_queue_empty(&trans->error_queue))
- rxrpc_queue_work(&trans->error_handler);
-
- rxrpc_free_skb(skb);
- rxrpc_put_transport(trans);
- _leave("");
-}
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index f715cca..702db72 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -9,7 +9,9 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/atomic.h>
#include <net/sock.h>
+#include <net/af_rxrpc.h>
#include <rxrpc/packet.h>
#if 0
@@ -35,6 +37,8 @@ struct rxrpc_crypt {
#define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor)
#define rxrpc_queue_conn(CONN) rxrpc_queue_work(&(CONN)->processor)
+struct rxrpc_connection;
+
/*
* sk_state for RxRPC sockets
*/
@@ -55,7 +59,6 @@ struct rxrpc_sock {
struct sock sk;
rxrpc_interceptor_t interceptor; /* kernel service Rx interceptor function */
struct rxrpc_local *local; /* local endpoint */
- struct rxrpc_connection *conn; /* exclusive virtual connection */
struct list_head listen_link; /* link in the local endpoint's listen list */
struct list_head secureq; /* calls awaiting connection security clearance */
struct list_head acceptq; /* calls awaiting acceptance */
@@ -64,13 +67,13 @@ struct rxrpc_sock {
struct rb_root calls; /* outstanding calls on this socket */
unsigned long flags;
#define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */
-#define RXRPC_SOCK_EXCLUSIVE_CONN 1 /* exclusive connection for a client socket */
rwlock_t call_lock; /* lock for calls */
u32 min_sec_level; /* minimum security level */
#define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT
+ bool exclusive; /* Exclusive connection for a client socket */
+ sa_family_t family; /* Protocol family created with */
struct sockaddr_rxrpc srx; /* local address */
struct sockaddr_rxrpc connect_srx; /* Default client address from connect() */
- sa_family_t proto; /* protocol created with */
};
#define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk)
@@ -168,46 +171,52 @@ struct rxrpc_security {
};
/*
- * RxRPC local transport endpoint definition
- * - matched by local port, address and protocol type
+ * RxRPC local transport endpoint description
+ * - owned by a single AF_RXRPC socket
+ * - pointed to by transport socket struct sk_user_data
*/
struct rxrpc_local {
+ struct rcu_head rcu;
+ atomic_t usage;
+ struct list_head link;
struct socket *socket; /* my UDP socket */
- struct work_struct destroyer; /* endpoint destroyer */
- struct work_struct acceptor; /* incoming call processor */
- struct work_struct rejecter; /* packet reject writer */
- struct work_struct event_processor; /* endpoint event processor */
+ struct work_struct processor;
struct list_head services; /* services listening on this endpoint */
- struct list_head link; /* link in endpoint list */
struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */
struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */
struct sk_buff_head reject_queue; /* packets awaiting rejection */
struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */
+ struct rb_root client_conns; /* Client connections by socket params */
+ spinlock_t client_conns_lock; /* Lock for client_conns */
spinlock_t lock; /* access lock */
rwlock_t services_lock; /* lock for services list */
- atomic_t usage;
int debug_id; /* debug ID for printks */
- volatile char error_rcvd; /* T if received ICMP error outstanding */
+ bool dead;
struct sockaddr_rxrpc srx; /* local address */
};
/*
* RxRPC remote transport endpoint definition
- * - matched by remote port, address and protocol type
- * - holds the connection ID counter for connections between the two endpoints
+ * - matched by local endpoint, remote port, address and protocol type
*/
struct rxrpc_peer {
- struct work_struct destroyer; /* peer destroyer */
- struct list_head link; /* link in master peer list */
- struct list_head error_targets; /* targets for net error distribution */
- spinlock_t lock; /* access lock */
+ struct rcu_head rcu; /* This must be first */
atomic_t usage;
+ unsigned long hash_key;
+ struct hlist_node hash_link;
+ struct rxrpc_local *local;
+ struct hlist_head error_targets; /* targets for net error distribution */
+ struct work_struct error_distributor;
+ struct rb_root service_conns; /* Service connections */
+ rwlock_t conn_lock;
+ spinlock_t lock; /* access lock */
unsigned int if_mtu; /* interface MTU for this peer */
unsigned int mtu; /* network MTU for this peer */
unsigned int maxdata; /* data size (MTU - hdrsize) */
unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
int debug_id; /* debug ID for printks */
- int net_error; /* network error distributed */
+ int error_report; /* Net (+0) or local (+1000000) to distribute */
+#define RXRPC_LOCAL_ERROR_OFFSET 1000000
struct sockaddr_rxrpc srx; /* remote address */
/* calculated RTT cache */
@@ -219,68 +228,63 @@ struct rxrpc_peer {
};
/*
- * RxRPC point-to-point transport / connection manager definition
- * - handles a bundle of connections between two endpoints
- * - matched by { local, peer }
- */
-struct rxrpc_transport {
- struct rxrpc_local *local; /* local transport endpoint */
- struct rxrpc_peer *peer; /* remote transport endpoint */
- struct work_struct error_handler; /* network error distributor */
- struct rb_root bundles; /* client connection bundles on this transport */
- struct rb_root client_conns; /* client connections on this transport */
- struct rb_root server_conns; /* server connections on this transport */
- struct list_head link; /* link in master session list */
- struct sk_buff_head error_queue; /* error packets awaiting processing */
- unsigned long put_time; /* time at which to reap */
- spinlock_t client_lock; /* client connection allocation lock */
- rwlock_t conn_lock; /* lock for active/dead connections */
- atomic_t usage;
- int debug_id; /* debug ID for printks */
- unsigned int conn_idcounter; /* connection ID counter (client) */
+ * Keys for matching a connection.
+ */
+struct rxrpc_conn_proto {
+ unsigned long hash_key;
+ struct rxrpc_local *local; /* Representation of local endpoint */
+ u32 epoch; /* epoch of this connection */
+ u32 cid; /* connection ID */
+ u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */
+ u8 addr_size; /* Size of the address */
+ sa_family_t family; /* Transport protocol */
+ __be16 port; /* Peer UDP/UDP6 port */
+ union { /* Peer address */
+ struct in_addr ipv4_addr;
+ struct in6_addr ipv6_addr;
+ u32 raw_addr[0];
+ };
};
-/*
- * RxRPC client connection bundle
- * - matched by { transport, service_id, key }
- */
-struct rxrpc_conn_bundle {
- struct rb_node node; /* node in transport's lookup tree */
- struct list_head unused_conns; /* unused connections in this bundle */
- struct list_head avail_conns; /* available connections in this bundle */
- struct list_head busy_conns; /* busy connections in this bundle */
- struct key *key; /* security for this bundle */
- wait_queue_head_t chanwait; /* wait for channel to become available */
- atomic_t usage;
- int debug_id; /* debug ID for printks */
- unsigned short num_conns; /* number of connections in this bundle */
- u16 service_id; /* Service ID for this bundle */
- u8 security_ix; /* security type */
+struct rxrpc_conn_parameters {
+ struct rxrpc_local *local; /* Representation of local endpoint */
+ struct rxrpc_peer *peer; /* Remote endpoint */
+ struct key *key; /* Security details */
+ bool exclusive; /* T if conn is exclusive */
+ u16 service_id; /* Service ID for this connection */
+ u32 security_level; /* Security level selected */
};
/*
* RxRPC connection definition
- * - matched by { transport, service_id, conn_id, direction, key }
+ * - matched by { local, peer, epoch, conn_id, direction }
* - each connection can only handle four simultaneous calls
*/
struct rxrpc_connection {
- struct rxrpc_transport *trans; /* transport session */
- struct rxrpc_conn_bundle *bundle; /* connection bundle (client) */
+ struct rxrpc_conn_proto proto;
+ struct rxrpc_conn_parameters params;
+
+ spinlock_t channel_lock;
+ struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* active calls */
+ wait_queue_head_t channel_wq; /* queue to wait for channel to become available */
+
struct work_struct processor; /* connection event processor */
- struct rb_node node; /* node in transport's lookup tree */
+ union {
+ struct rb_node client_node; /* Node in local->client_conns */
+ struct rb_node service_node; /* Node in peer->service_conns */
+ };
struct list_head link; /* link in master connection list */
- struct list_head bundle_link; /* link in bundle */
struct rb_root calls; /* calls on this connection */
struct sk_buff_head rx_queue; /* received conn-level packets */
- struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* channels (active calls) */
const struct rxrpc_security *security; /* applied security module */
- struct key *key; /* security for this connection (client) */
struct key *server_key; /* security for this service */
struct crypto_skcipher *cipher; /* encryption handle */
struct rxrpc_crypt csum_iv; /* packet checksum base */
+ unsigned long flags;
+#define RXRPC_CONN_HAS_IDR 0 /* - Has a client conn ID assigned */
unsigned long events;
#define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */
- unsigned long put_time; /* time at which to reap */
+ unsigned long put_time; /* Time at which last put */
rwlock_t lock; /* access lock */
spinlock_t state_lock; /* state-change lock */
atomic_t usage;
@@ -301,17 +305,12 @@ struct rxrpc_connection {
unsigned int call_counter; /* call ID counter */
atomic_t serial; /* packet serial number counter */
atomic_t hi_serial; /* highest serial number received */
- u8 avail_calls; /* number of calls available */
+ atomic_t avail_chans; /* number of channels available */
u8 size_align; /* data size alignment (for security) */
u8 header_size; /* rxrpc + security header size */
u8 security_size; /* security header size */
- u32 security_level; /* security level negotiated */
u32 security_nonce; /* response re-use preventer */
- u32 epoch; /* epoch of this connection */
- u32 cid; /* connection ID */
- u16 service_id; /* service ID for this connection */
u8 security_ix; /* security type */
- u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */
u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
};
@@ -357,6 +356,8 @@ enum rxrpc_call_event {
* The states that a call can be in.
*/
enum rxrpc_call_state {
+ RXRPC_CALL_UNINITIALISED,
+ RXRPC_CALL_CLIENT_AWAIT_CONN, /* - client waiting for connection to become available */
RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */
RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */
RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */
@@ -390,7 +391,7 @@ struct rxrpc_call {
struct work_struct destroyer; /* call destroyer */
struct work_struct processor; /* packet processor and ACK generator */
struct list_head link; /* link in master call list */
- struct list_head error_link; /* link in error distribution list */
+ struct hlist_node error_link; /* link in error distribution list */
struct list_head accept_link; /* calls awaiting acceptance */
struct rb_node sock_node; /* node in socket call tree */
struct rb_node conn_node; /* node in connection call tree */
@@ -408,7 +409,8 @@ struct rxrpc_call {
atomic_t sequence; /* Tx data packet sequence counter */
u32 local_abort; /* local abort code */
u32 remote_abort; /* remote abort code */
- int error; /* local error incurred */
+ int error_report; /* Network error (ICMP/local transport) */
+ int error; /* Local error incurred */
enum rxrpc_call_state state : 8; /* current state of call */
int debug_id; /* debug ID for printks */
u8 channel; /* connection channel occupied by this call */
@@ -444,7 +446,7 @@ struct rxrpc_call {
unsigned long hash_key; /* Full hash key */
u8 in_clientflag; /* Copy of conn->in_clientflag for hashing */
struct rxrpc_local *local; /* Local endpoint. Used for hashing. */
- sa_family_t proto; /* Frame protocol */
+ sa_family_t family; /* Frame protocol */
u32 call_id; /* call ID on connection */
u32 cid; /* connection ID plus channel index */
u32 epoch; /* epoch of this connection */
@@ -477,26 +479,22 @@ extern u32 rxrpc_epoch;
extern atomic_t rxrpc_debug_id;
extern struct workqueue_struct *rxrpc_workqueue;
-extern struct rxrpc_transport *rxrpc_name_to_transport(struct rxrpc_sock *,
- struct sockaddr *,
- int, int, gfp_t);
-
/*
- * ar-accept.c
+ * call_accept.c
*/
-void rxrpc_accept_incoming_calls(struct work_struct *);
+void rxrpc_accept_incoming_calls(struct rxrpc_local *);
struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long);
int rxrpc_reject_call(struct rxrpc_sock *);
/*
- * ar-ack.c
+ * call_event.c
*/
void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
void rxrpc_process_call(struct work_struct *);
/*
- * ar-call.c
+ * call_object.c
*/
extern unsigned int rxrpc_max_call_lifetime;
extern unsigned int rxrpc_dead_call_expiry;
@@ -508,68 +506,80 @@ struct rxrpc_call *rxrpc_find_call_hash(struct rxrpc_host_header *,
void *, sa_family_t, const void *);
struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long);
struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
- struct rxrpc_transport *,
- struct rxrpc_conn_bundle *,
+ struct rxrpc_conn_parameters *,
+ struct sockaddr_rxrpc *,
unsigned long, gfp_t);
struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *,
struct rxrpc_connection *,
- struct rxrpc_host_header *);
+ struct sk_buff *);
void rxrpc_release_call(struct rxrpc_call *);
void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
void __rxrpc_put_call(struct rxrpc_call *);
void __exit rxrpc_destroy_all_calls(void);
/*
- * ar-connection.c
+ * conn_client.c
*/
-extern unsigned int rxrpc_connection_expiry;
-extern struct list_head rxrpc_connections;
-extern rwlock_t rxrpc_connection_lock;
+extern struct idr rxrpc_client_conn_ids;
-struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *,
- struct rxrpc_transport *,
- struct key *, u16, gfp_t);
-void rxrpc_put_bundle(struct rxrpc_transport *, struct rxrpc_conn_bundle *);
-int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *,
- struct rxrpc_conn_bundle *, struct rxrpc_call *, gfp_t);
-void rxrpc_put_connection(struct rxrpc_connection *);
-void __exit rxrpc_destroy_all_connections(void);
-struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *,
- struct rxrpc_host_header *);
-extern struct rxrpc_connection *
-rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *);
+int rxrpc_get_client_connection_id(struct rxrpc_connection *, gfp_t);
+void rxrpc_put_client_connection_id(struct rxrpc_connection *);
/*
- * ar-connevent.c
+ * conn_event.c
*/
void rxrpc_process_connection(struct work_struct *);
void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *);
-void rxrpc_reject_packets(struct work_struct *);
+void rxrpc_reject_packets(struct rxrpc_local *);
/*
- * ar-error.c
+ * conn_object.c
*/
-void rxrpc_UDP_error_report(struct sock *);
-void rxrpc_UDP_error_handler(struct work_struct *);
+extern unsigned int rxrpc_connection_expiry;
+extern struct list_head rxrpc_connections;
+extern rwlock_t rxrpc_connection_lock;
+
+int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *,
+ struct sockaddr_rxrpc *, gfp_t);
+struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *,
+ struct rxrpc_peer *,
+ struct sk_buff *);
+void rxrpc_disconnect_call(struct rxrpc_call *);
+void rxrpc_put_connection(struct rxrpc_connection *);
+void __exit rxrpc_destroy_all_connections(void);
+struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *,
+ struct rxrpc_peer *,
+ struct sk_buff *);
+
+static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn)
+{
+ return conn->out_clientflag;
+}
+
+static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn)
+{
+ return conn->proto.in_clientflag;
+}
+
+static inline void rxrpc_get_connection(struct rxrpc_connection *conn)
+{
+ atomic_inc(&conn->usage);
+}
/*
- * ar-input.c
+ * input.c
*/
void rxrpc_data_ready(struct sock *);
int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool);
void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *);
/*
- * ar-local.c
+ * insecure.c
*/
-extern rwlock_t rxrpc_local_lock;
-
-struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *);
-void rxrpc_put_local(struct rxrpc_local *);
-void __exit rxrpc_destroy_all_locals(void);
+extern const struct rxrpc_security rxrpc_no_security;
/*
- * ar-key.c
+ * key.c
*/
extern struct key_type key_type_rxrpc;
extern struct key_type key_type_rxrpc_s;
@@ -580,79 +590,103 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t,
u32);
/*
- * ar-output.c
+ * local_event.c
*/
-extern unsigned int rxrpc_resend_timeout;
-
-int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *);
-int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t);
+extern void rxrpc_process_local_events(struct rxrpc_local *);
/*
- * ar-peer.c
+ * local_object.c
*/
-struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *, gfp_t);
-void rxrpc_put_peer(struct rxrpc_peer *);
-struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *, __be32, __be16);
-void __exit rxrpc_destroy_all_peers(void);
+struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *);
+void __rxrpc_put_local(struct rxrpc_local *);
+void __exit rxrpc_destroy_all_locals(void);
-/*
- * ar-proc.c
- */
-extern const char *const rxrpc_call_states[];
-extern const struct file_operations rxrpc_call_seq_fops;
-extern const struct file_operations rxrpc_connection_seq_fops;
+static inline void rxrpc_get_local(struct rxrpc_local *local)
+{
+ atomic_inc(&local->usage);
+}
+
+static inline
+struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local)
+{
+ return atomic_inc_not_zero(&local->usage) ? local : NULL;
+}
+
+static inline void rxrpc_put_local(struct rxrpc_local *local)
+{
+ if (local && atomic_dec_and_test(&local->usage))
+ __rxrpc_put_local(local);
+}
/*
- * ar-recvmsg.c
+ * misc.c
*/
-void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *);
-int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int);
+extern unsigned int rxrpc_max_backlog __read_mostly;
+extern unsigned int rxrpc_requested_ack_delay;
+extern unsigned int rxrpc_soft_ack_delay;
+extern unsigned int rxrpc_idle_ack_delay;
+extern unsigned int rxrpc_rx_window_size;
+extern unsigned int rxrpc_rx_mtu;
+extern unsigned int rxrpc_rx_jumbo_max;
+
+extern const char *const rxrpc_pkts[];
+extern const s8 rxrpc_ack_priority[];
+
+extern const char *rxrpc_acks(u8 reason);
/*
- * ar-security.c
+ * output.c
*/
-int __init rxrpc_init_security(void);
-void rxrpc_exit_security(void);
-int rxrpc_init_client_conn_security(struct rxrpc_connection *);
-int rxrpc_init_server_conn_security(struct rxrpc_connection *);
+extern unsigned int rxrpc_resend_timeout;
+
+int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *);
+int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t);
/*
- * ar-skbuff.c
+ * peer_event.c
*/
-void rxrpc_packet_destructor(struct sk_buff *);
+void rxrpc_error_report(struct sock *);
+void rxrpc_peer_error_distributor(struct work_struct *);
/*
- * ar-transport.c
+ * peer_object.c
*/
-extern unsigned int rxrpc_transport_expiry;
+struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *,
+ const struct sockaddr_rxrpc *);
+struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *,
+ struct sockaddr_rxrpc *, gfp_t);
+struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
-struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *,
- struct rxrpc_peer *, gfp_t);
-void rxrpc_put_transport(struct rxrpc_transport *);
-void __exit rxrpc_destroy_all_transports(void);
-struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *,
- struct rxrpc_peer *);
+static inline void rxrpc_get_peer(struct rxrpc_peer *peer)
+{
+ atomic_inc(&peer->usage);
+}
+
+static inline
+struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer)
+{
+ return atomic_inc_not_zero(&peer->usage) ? peer : NULL;
+}
+
+extern void __rxrpc_put_peer(struct rxrpc_peer *peer);
+static inline void rxrpc_put_peer(struct rxrpc_peer *peer)
+{
+ if (peer && atomic_dec_and_test(&peer->usage))
+ __rxrpc_put_peer(peer);
+}
/*
- * insecure.c
+ * proc.c
*/
-extern const struct rxrpc_security rxrpc_no_security;
+extern const char *const rxrpc_call_states[];
+extern const struct file_operations rxrpc_call_seq_fops;
+extern const struct file_operations rxrpc_connection_seq_fops;
/*
- * misc.c
+ * recvmsg.c
*/
-extern unsigned int rxrpc_max_backlog __read_mostly;
-extern unsigned int rxrpc_requested_ack_delay;
-extern unsigned int rxrpc_soft_ack_delay;
-extern unsigned int rxrpc_idle_ack_delay;
-extern unsigned int rxrpc_rx_window_size;
-extern unsigned int rxrpc_rx_mtu;
-extern unsigned int rxrpc_rx_jumbo_max;
-
-extern const char *const rxrpc_pkts[];
-extern const s8 rxrpc_ack_priority[];
-
-extern const char *rxrpc_acks(u8 reason);
+void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *);
+int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int);
/*
* rxkad.c
@@ -662,6 +696,19 @@ extern const struct rxrpc_security rxkad;
#endif
/*
+ * security.c
+ */
+int __init rxrpc_init_security(void);
+void rxrpc_exit_security(void);
+int rxrpc_init_client_conn_security(struct rxrpc_connection *);
+int rxrpc_init_server_conn_security(struct rxrpc_connection *);
+
+/*
+ * skbuff.c
+ */
+void rxrpc_packet_destructor(struct sk_buff *);
+
+/*
* sysctl.c
*/
#ifdef CONFIG_SYSCTL
@@ -673,6 +720,12 @@ static inline void rxrpc_sysctl_exit(void) {}
#endif
/*
+ * utils.c
+ */
+void rxrpc_get_addr_from_skb(struct rxrpc_local *, const struct sk_buff *,
+ struct sockaddr_rxrpc *);
+
+/*
* debug tracing
*/
extern unsigned int rxrpc_debug;
@@ -841,15 +894,6 @@ static inline void rxrpc_purge_queue(struct sk_buff_head *list)
rxrpc_free_skb(skb);
}
-static inline void __rxrpc_get_local(struct rxrpc_local *local, const char *f)
-{
- CHECK_SLAB_OKAY(&local->usage);
- if (atomic_inc_return(&local->usage) == 1)
- printk("resurrected (%s)\n", f);
-}
-
-#define rxrpc_get_local(LOCAL) __rxrpc_get_local((LOCAL), __func__)
-
#define rxrpc_get_call(CALL) \
do { \
CHECK_SLAB_OKAY(&(CALL)->usage); \
diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c
deleted file mode 100644
index 111f250..0000000
--- a/net/rxrpc/ar-local.c
+++ /dev/null
@@ -1,417 +0,0 @@
-/* AF_RXRPC local endpoint management
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/net.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/udp.h>
-#include <linux/ip.h>
-#include <net/sock.h>
-#include <net/af_rxrpc.h>
-#include <generated/utsrelease.h>
-#include "ar-internal.h"
-
-static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC";
-
-static LIST_HEAD(rxrpc_locals);
-DEFINE_RWLOCK(rxrpc_local_lock);
-static DECLARE_RWSEM(rxrpc_local_sem);
-static DECLARE_WAIT_QUEUE_HEAD(rxrpc_local_wq);
-
-static void rxrpc_destroy_local(struct work_struct *work);
-static void rxrpc_process_local_events(struct work_struct *work);
-
-/*
- * allocate a new local
- */
-static
-struct rxrpc_local *rxrpc_alloc_local(struct sockaddr_rxrpc *srx)
-{
- struct rxrpc_local *local;
-
- local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL);
- if (local) {
- INIT_WORK(&local->destroyer, &rxrpc_destroy_local);
- INIT_WORK(&local->acceptor, &rxrpc_accept_incoming_calls);
- INIT_WORK(&local->rejecter, &rxrpc_reject_packets);
- INIT_WORK(&local->event_processor, &rxrpc_process_local_events);
- INIT_LIST_HEAD(&local->services);
- INIT_LIST_HEAD(&local->link);
- init_rwsem(&local->defrag_sem);
- skb_queue_head_init(&local->accept_queue);
- skb_queue_head_init(&local->reject_queue);
- skb_queue_head_init(&local->event_queue);
- spin_lock_init(&local->lock);
- rwlock_init(&local->services_lock);
- atomic_set(&local->usage, 1);
- local->debug_id = atomic_inc_return(&rxrpc_debug_id);
- memcpy(&local->srx, srx, sizeof(*srx));
- }
-
- _leave(" = %p", local);
- return local;
-}
-
-/*
- * create the local socket
- * - must be called with rxrpc_local_sem writelocked
- */
-static int rxrpc_create_local(struct rxrpc_local *local)
-{
- struct sock *sock;
- int ret, opt;
-
- _enter("%p{%d}", local, local->srx.transport_type);
-
- /* create a socket to represent the local endpoint */
- ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type,
- IPPROTO_UDP, &local->socket);
- if (ret < 0) {
- _leave(" = %d [socket]", ret);
- return ret;
- }
-
- /* if a local address was supplied then bind it */
- if (local->srx.transport_len > sizeof(sa_family_t)) {
- _debug("bind");
- ret = kernel_bind(local->socket,
- (struct sockaddr *) &local->srx.transport,
- local->srx.transport_len);
- if (ret < 0) {
- _debug("bind failed");
- goto error;
- }
- }
-
- /* we want to receive ICMP errors */
- opt = 1;
- ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR,
- (char *) &opt, sizeof(opt));
- if (ret < 0) {
- _debug("setsockopt failed");
- goto error;
- }
-
- /* we want to set the don't fragment bit */
- opt = IP_PMTUDISC_DO;
- ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER,
- (char *) &opt, sizeof(opt));
- if (ret < 0) {
- _debug("setsockopt failed");
- goto error;
- }
-
- write_lock_bh(&rxrpc_local_lock);
- list_add(&local->link, &rxrpc_locals);
- write_unlock_bh(&rxrpc_local_lock);
-
- /* set the socket up */
- sock = local->socket->sk;
- sock->sk_user_data = local;
- sock->sk_data_ready = rxrpc_data_ready;
- sock->sk_error_report = rxrpc_UDP_error_report;
- _leave(" = 0");
- return 0;
-
-error:
- kernel_sock_shutdown(local->socket, SHUT_RDWR);
- local->socket->sk->sk_user_data = NULL;
- sock_release(local->socket);
- local->socket = NULL;
-
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
- * create a new local endpoint using the specified UDP address
- */
-struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *srx)
-{
- struct rxrpc_local *local;
- int ret;
-
- _enter("{%d,%u,%pI4+%hu}",
- srx->transport_type,
- srx->transport.family,
- &srx->transport.sin.sin_addr,
- ntohs(srx->transport.sin.sin_port));
-
- down_write(&rxrpc_local_sem);
-
- /* see if we have a suitable local local endpoint already */
- read_lock_bh(&rxrpc_local_lock);
-
- list_for_each_entry(local, &rxrpc_locals, link) {
- _debug("CMP {%d,%u,%pI4+%hu}",
- local->srx.transport_type,
- local->srx.transport.family,
- &local->srx.transport.sin.sin_addr,
- ntohs(local->srx.transport.sin.sin_port));
-
- if (local->srx.transport_type != srx->transport_type ||
- local->srx.transport.family != srx->transport.family)
- continue;
-
- switch (srx->transport.family) {
- case AF_INET:
- if (local->srx.transport.sin.sin_port !=
- srx->transport.sin.sin_port)
- continue;
- if (memcmp(&local->srx.transport.sin.sin_addr,
- &srx->transport.sin.sin_addr,
- sizeof(struct in_addr)) != 0)
- continue;
- goto found_local;
-
- default:
- BUG();
- }
- }
-
- read_unlock_bh(&rxrpc_local_lock);
-
- /* we didn't find one, so we need to create one */
- local = rxrpc_alloc_local(srx);
- if (!local) {
- up_write(&rxrpc_local_sem);
- return ERR_PTR(-ENOMEM);
- }
-
- ret = rxrpc_create_local(local);
- if (ret < 0) {
- up_write(&rxrpc_local_sem);
- kfree(local);
- _leave(" = %d", ret);
- return ERR_PTR(ret);
- }
-
- up_write(&rxrpc_local_sem);
-
- _net("LOCAL new %d {%d,%u,%pI4+%hu}",
- local->debug_id,
- local->srx.transport_type,
- local->srx.transport.family,
- &local->srx.transport.sin.sin_addr,
- ntohs(local->srx.transport.sin.sin_port));
-
- _leave(" = %p [new]", local);
- return local;
-
-found_local:
- rxrpc_get_local(local);
- read_unlock_bh(&rxrpc_local_lock);
- up_write(&rxrpc_local_sem);
-
- _net("LOCAL old %d {%d,%u,%pI4+%hu}",
- local->debug_id,
- local->srx.transport_type,
- local->srx.transport.family,
- &local->srx.transport.sin.sin_addr,
- ntohs(local->srx.transport.sin.sin_port));
-
- _leave(" = %p [reuse]", local);
- return local;
-}
-
-/*
- * release a local endpoint
- */
-void rxrpc_put_local(struct rxrpc_local *local)
-{
- _enter("%p{u=%d}", local, atomic_read(&local->usage));
-
- ASSERTCMP(atomic_read(&local->usage), >, 0);
-
- /* to prevent a race, the decrement and the dequeue must be effectively
- * atomic */
- write_lock_bh(&rxrpc_local_lock);
- if (unlikely(atomic_dec_and_test(&local->usage))) {
- _debug("destroy local");
- rxrpc_queue_work(&local->destroyer);
- }
- write_unlock_bh(&rxrpc_local_lock);
- _leave("");
-}
-
-/*
- * destroy a local endpoint
- */
-static void rxrpc_destroy_local(struct work_struct *work)
-{
- struct rxrpc_local *local =
- container_of(work, struct rxrpc_local, destroyer);
-
- _enter("%p{%d}", local, atomic_read(&local->usage));
-
- down_write(&rxrpc_local_sem);
-
- write_lock_bh(&rxrpc_local_lock);
- if (atomic_read(&local->usage) > 0) {
- write_unlock_bh(&rxrpc_local_lock);
- up_read(&rxrpc_local_sem);
- _leave(" [resurrected]");
- return;
- }
-
- list_del(&local->link);
- local->socket->sk->sk_user_data = NULL;
- write_unlock_bh(&rxrpc_local_lock);
-
- downgrade_write(&rxrpc_local_sem);
-
- ASSERT(list_empty(&local->services));
- ASSERT(!work_pending(&local->acceptor));
- ASSERT(!work_pending(&local->rejecter));
- ASSERT(!work_pending(&local->event_processor));
-
- /* finish cleaning up the local descriptor */
- rxrpc_purge_queue(&local->accept_queue);
- rxrpc_purge_queue(&local->reject_queue);
- rxrpc_purge_queue(&local->event_queue);
- kernel_sock_shutdown(local->socket, SHUT_RDWR);
- sock_release(local->socket);
-
- up_read(&rxrpc_local_sem);
-
- _net("DESTROY LOCAL %d", local->debug_id);
- kfree(local);
-
- if (list_empty(&rxrpc_locals))
- wake_up_all(&rxrpc_local_wq);
-
- _leave("");
-}
-
-/*
- * preemptively destroy all local local endpoint rather than waiting for
- * them to be destroyed
- */
-void __exit rxrpc_destroy_all_locals(void)
-{
- DECLARE_WAITQUEUE(myself,current);
-
- _enter("");
-
- /* we simply have to wait for them to go away */
- if (!list_empty(&rxrpc_locals)) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- add_wait_queue(&rxrpc_local_wq, &myself);
-
- while (!list_empty(&rxrpc_locals)) {
- schedule();
- set_current_state(TASK_UNINTERRUPTIBLE);
- }
-
- remove_wait_queue(&rxrpc_local_wq, &myself);
- set_current_state(TASK_RUNNING);
- }
-
- _leave("");
-}
-
-/*
- * Reply to a version request
- */
-static void rxrpc_send_version_request(struct rxrpc_local *local,
- struct rxrpc_host_header *hdr,
- struct sk_buff *skb)
-{
- struct rxrpc_wire_header whdr;
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct sockaddr_in sin;
- struct msghdr msg;
- struct kvec iov[2];
- size_t len;
- int ret;
-
- _enter("");
-
- sin.sin_family = AF_INET;
- sin.sin_port = udp_hdr(skb)->source;
- sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
-
- msg.msg_name = &sin;
- msg.msg_namelen = sizeof(sin);
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_flags = 0;
-
- whdr.epoch = htonl(sp->hdr.epoch);
- whdr.cid = htonl(sp->hdr.cid);
- whdr.callNumber = htonl(sp->hdr.callNumber);
- whdr.seq = 0;
- whdr.serial = 0;
- whdr.type = RXRPC_PACKET_TYPE_VERSION;
- whdr.flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED);
- whdr.userStatus = 0;
- whdr.securityIndex = 0;
- whdr._rsvd = 0;
- whdr.serviceId = htons(sp->hdr.serviceId);
-
- iov[0].iov_base = &whdr;
- iov[0].iov_len = sizeof(whdr);
- iov[1].iov_base = (char *)rxrpc_version_string;
- iov[1].iov_len = sizeof(rxrpc_version_string);
-
- len = iov[0].iov_len + iov[1].iov_len;
-
- _proto("Tx VERSION (reply)");
-
- ret = kernel_sendmsg(local->socket, &msg, iov, 2, len);
- if (ret < 0)
- _debug("sendmsg failed: %d", ret);
-
- _leave("");
-}
-
-/*
- * Process event packets targetted at a local endpoint.
- */
-static void rxrpc_process_local_events(struct work_struct *work)
-{
- struct rxrpc_local *local = container_of(work, struct rxrpc_local, event_processor);
- struct sk_buff *skb;
- char v;
-
- _enter("");
-
- atomic_inc(&local->usage);
-
- while ((skb = skb_dequeue(&local->event_queue))) {
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-
- _debug("{%d},{%u}", local->debug_id, sp->hdr.type);
-
- switch (sp->hdr.type) {
- case RXRPC_PACKET_TYPE_VERSION:
- if (skb_copy_bits(skb, 0, &v, 1) < 0)
- return;
- _proto("Rx VERSION { %02x }", v);
- if (v == 0)
- rxrpc_send_version_request(local, &sp->hdr, skb);
- break;
-
- default:
- /* Just ignore anything we don't understand */
- break;
- }
-
- rxrpc_put_local(local);
- rxrpc_free_skb(skb);
- }
-
- rxrpc_put_local(local);
- _leave("");
-}
diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c
deleted file mode 100644
index 0b54cda..0000000
--- a/net/rxrpc/ar-peer.c
+++ /dev/null
@@ -1,305 +0,0 @@
-/* RxRPC remote transport endpoint management
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/net.h>
-#include <linux/skbuff.h>
-#include <linux/udp.h>
-#include <linux/in.h>
-#include <linux/in6.h>
-#include <linux/icmp.h>
-#include <linux/slab.h>
-#include <net/sock.h>
-#include <net/af_rxrpc.h>
-#include <net/ip.h>
-#include <net/route.h>
-#include "ar-internal.h"
-
-static LIST_HEAD(rxrpc_peers);
-static DEFINE_RWLOCK(rxrpc_peer_lock);
-static DECLARE_WAIT_QUEUE_HEAD(rxrpc_peer_wq);
-
-static void rxrpc_destroy_peer(struct work_struct *work);
-
-/*
- * assess the MTU size for the network interface through which this peer is
- * reached
- */
-static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
-{
- struct rtable *rt;
- struct flowi4 fl4;
-
- peer->if_mtu = 1500;
-
- rt = ip_route_output_ports(&init_net, &fl4, NULL,
- peer->srx.transport.sin.sin_addr.s_addr, 0,
- htons(7000), htons(7001),
- IPPROTO_UDP, 0, 0);
- if (IS_ERR(rt)) {
- _leave(" [route err %ld]", PTR_ERR(rt));
- return;
- }
-
- peer->if_mtu = dst_mtu(&rt->dst);
- dst_release(&rt->dst);
-
- _leave(" [if_mtu %u]", peer->if_mtu);
-}
-
-/*
- * allocate a new peer
- */
-static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx,
- gfp_t gfp)
-{
- struct rxrpc_peer *peer;
-
- _enter("");
-
- peer = kzalloc(sizeof(struct rxrpc_peer), gfp);
- if (peer) {
- INIT_WORK(&peer->destroyer, &rxrpc_destroy_peer);
- INIT_LIST_HEAD(&peer->link);
- INIT_LIST_HEAD(&peer->error_targets);
- spin_lock_init(&peer->lock);
- atomic_set(&peer->usage, 1);
- peer->debug_id = atomic_inc_return(&rxrpc_debug_id);
- memcpy(&peer->srx, srx, sizeof(*srx));
-
- rxrpc_assess_MTU_size(peer);
- peer->mtu = peer->if_mtu;
-
- if (srx->transport.family == AF_INET) {
- peer->hdrsize = sizeof(struct iphdr);
- switch (srx->transport_type) {
- case SOCK_DGRAM:
- peer->hdrsize += sizeof(struct udphdr);
- break;
- default:
- BUG();
- break;
- }
- } else {
- BUG();
- }
-
- peer->hdrsize += sizeof(struct rxrpc_wire_header);
- peer->maxdata = peer->mtu - peer->hdrsize;
- }
-
- _leave(" = %p", peer);
- return peer;
-}
-
-/*
- * obtain a remote transport endpoint for the specified address
- */
-struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *srx, gfp_t gfp)
-{
- struct rxrpc_peer *peer, *candidate;
- const char *new = "old";
- int usage;
-
- _enter("{%d,%d,%pI4+%hu}",
- srx->transport_type,
- srx->transport_len,
- &srx->transport.sin.sin_addr,
- ntohs(srx->transport.sin.sin_port));
-
- /* search the peer list first */
- read_lock_bh(&rxrpc_peer_lock);
- list_for_each_entry(peer, &rxrpc_peers, link) {
- _debug("check PEER %d { u=%d t=%d l=%d }",
- peer->debug_id,
- atomic_read(&peer->usage),
- peer->srx.transport_type,
- peer->srx.transport_len);
-
- if (atomic_read(&peer->usage) > 0 &&
- peer->srx.transport_type == srx->transport_type &&
- peer->srx.transport_len == srx->transport_len &&
- memcmp(&peer->srx.transport,
- &srx->transport,
- srx->transport_len) == 0)
- goto found_extant_peer;
- }
- read_unlock_bh(&rxrpc_peer_lock);
-
- /* not yet present - create a candidate for a new record and then
- * redo the search */
- candidate = rxrpc_alloc_peer(srx, gfp);
- if (!candidate) {
- _leave(" = -ENOMEM");
- return ERR_PTR(-ENOMEM);
- }
-
- write_lock_bh(&rxrpc_peer_lock);
-
- list_for_each_entry(peer, &rxrpc_peers, link) {
- if (atomic_read(&peer->usage) > 0 &&
- peer->srx.transport_type == srx->transport_type &&
- peer->srx.transport_len == srx->transport_len &&
- memcmp(&peer->srx.transport,
- &srx->transport,
- srx->transport_len) == 0)
- goto found_extant_second;
- }
-
- /* we can now add the new candidate to the list */
- peer = candidate;
- candidate = NULL;
- usage = atomic_read(&peer->usage);
-
- list_add_tail(&peer->link, &rxrpc_peers);
- write_unlock_bh(&rxrpc_peer_lock);
- new = "new";
-
-success:
- _net("PEER %s %d {%d,%u,%pI4+%hu}",
- new,
- peer->debug_id,
- peer->srx.transport_type,
- peer->srx.transport.family,
- &peer->srx.transport.sin.sin_addr,
- ntohs(peer->srx.transport.sin.sin_port));
-
- _leave(" = %p {u=%d}", peer, usage);
- return peer;
-
- /* we found the peer in the list immediately */
-found_extant_peer:
- usage = atomic_inc_return(&peer->usage);
- read_unlock_bh(&rxrpc_peer_lock);
- goto success;
-
- /* we found the peer on the second time through the list */
-found_extant_second:
- usage = atomic_inc_return(&peer->usage);
- write_unlock_bh(&rxrpc_peer_lock);
- kfree(candidate);
- goto success;
-}
-
-/*
- * find the peer associated with a packet
- */
-struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *local,
- __be32 addr, __be16 port)
-{
- struct rxrpc_peer *peer;
-
- _enter("");
-
- /* search the peer list */
- read_lock_bh(&rxrpc_peer_lock);
-
- if (local->srx.transport.family == AF_INET &&
- local->srx.transport_type == SOCK_DGRAM
- ) {
- list_for_each_entry(peer, &rxrpc_peers, link) {
- if (atomic_read(&peer->usage) > 0 &&
- peer->srx.transport_type == SOCK_DGRAM &&
- peer->srx.transport.family == AF_INET &&
- peer->srx.transport.sin.sin_port == port &&
- peer->srx.transport.sin.sin_addr.s_addr == addr)
- goto found_UDP_peer;
- }
-
- goto new_UDP_peer;
- }
-
- read_unlock_bh(&rxrpc_peer_lock);
- _leave(" = -EAFNOSUPPORT");
- return ERR_PTR(-EAFNOSUPPORT);
-
-found_UDP_peer:
- _net("Rx UDP DGRAM from peer %d", peer->debug_id);
- atomic_inc(&peer->usage);
- read_unlock_bh(&rxrpc_peer_lock);
- _leave(" = %p", peer);
- return peer;
-
-new_UDP_peer:
- _net("Rx UDP DGRAM from NEW peer");
- read_unlock_bh(&rxrpc_peer_lock);
- _leave(" = -EBUSY [new]");
- return ERR_PTR(-EBUSY);
-}
-
-/*
- * release a remote transport endpoint
- */
-void rxrpc_put_peer(struct rxrpc_peer *peer)
-{
- _enter("%p{u=%d}", peer, atomic_read(&peer->usage));
-
- ASSERTCMP(atomic_read(&peer->usage), >, 0);
-
- if (likely(!atomic_dec_and_test(&peer->usage))) {
- _leave(" [in use]");
- return;
- }
-
- rxrpc_queue_work(&peer->destroyer);
- _leave("");
-}
-
-/*
- * destroy a remote transport endpoint
- */
-static void rxrpc_destroy_peer(struct work_struct *work)
-{
- struct rxrpc_peer *peer =
- container_of(work, struct rxrpc_peer, destroyer);
-
- _enter("%p{%d}", peer, atomic_read(&peer->usage));
-
- write_lock_bh(&rxrpc_peer_lock);
- list_del(&peer->link);
- write_unlock_bh(&rxrpc_peer_lock);
-
- _net("DESTROY PEER %d", peer->debug_id);
- kfree(peer);
-
- if (list_empty(&rxrpc_peers))
- wake_up_all(&rxrpc_peer_wq);
- _leave("");
-}
-
-/*
- * preemptively destroy all the peer records from a transport endpoint rather
- * than waiting for them to time out
- */
-void __exit rxrpc_destroy_all_peers(void)
-{
- DECLARE_WAITQUEUE(myself,current);
-
- _enter("");
-
- /* we simply have to wait for them to go away */
- if (!list_empty(&rxrpc_peers)) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- add_wait_queue(&rxrpc_peer_wq, &myself);
-
- while (!list_empty(&rxrpc_peers)) {
- schedule();
- set_current_state(TASK_UNINTERRUPTIBLE);
- }
-
- remove_wait_queue(&rxrpc_peer_wq, &myself);
- set_current_state(TASK_RUNNING);
- }
-
- _leave("");
-}
diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c
deleted file mode 100644
index a1b6518..0000000
--- a/net/rxrpc/ar-transport.c
+++ /dev/null
@@ -1,286 +0,0 @@
-/* RxRPC point-to-point transport session management
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/net.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <net/sock.h>
-#include <net/af_rxrpc.h>
-#include "ar-internal.h"
-
-/*
- * Time after last use at which transport record is cleaned up.
- */
-unsigned int rxrpc_transport_expiry = 3600 * 24;
-
-static void rxrpc_transport_reaper(struct work_struct *work);
-
-static LIST_HEAD(rxrpc_transports);
-static DEFINE_RWLOCK(rxrpc_transport_lock);
-static DECLARE_DELAYED_WORK(rxrpc_transport_reap, rxrpc_transport_reaper);
-
-/*
- * allocate a new transport session manager
- */
-static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local,
- struct rxrpc_peer *peer,
- gfp_t gfp)
-{
- struct rxrpc_transport *trans;
-
- _enter("");
-
- trans = kzalloc(sizeof(struct rxrpc_transport), gfp);
- if (trans) {
- trans->local = local;
- trans->peer = peer;
- INIT_LIST_HEAD(&trans->link);
- trans->bundles = RB_ROOT;
- trans->client_conns = RB_ROOT;
- trans->server_conns = RB_ROOT;
- skb_queue_head_init(&trans->error_queue);
- spin_lock_init(&trans->client_lock);
- rwlock_init(&trans->conn_lock);
- atomic_set(&trans->usage, 1);
- trans->conn_idcounter = peer->srx.srx_service << 16;
- trans->debug_id = atomic_inc_return(&rxrpc_debug_id);
-
- if (peer->srx.transport.family == AF_INET) {
- switch (peer->srx.transport_type) {
- case SOCK_DGRAM:
- INIT_WORK(&trans->error_handler,
- rxrpc_UDP_error_handler);
- break;
- default:
- BUG();
- break;
- }
- } else {
- BUG();
- }
- }
-
- _leave(" = %p", trans);
- return trans;
-}
-
-/*
- * obtain a transport session for the nominated endpoints
- */
-struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *local,
- struct rxrpc_peer *peer,
- gfp_t gfp)
-{
- struct rxrpc_transport *trans, *candidate;
- const char *new = "old";
- int usage;
-
- _enter("{%pI4+%hu},{%pI4+%hu},",
- &local->srx.transport.sin.sin_addr,
- ntohs(local->srx.transport.sin.sin_port),
- &peer->srx.transport.sin.sin_addr,
- ntohs(peer->srx.transport.sin.sin_port));
-
- /* search the transport list first */
- read_lock_bh(&rxrpc_transport_lock);
- list_for_each_entry(trans, &rxrpc_transports, link) {
- if (trans->local == local && trans->peer == peer)
- goto found_extant_transport;
- }
- read_unlock_bh(&rxrpc_transport_lock);
-
- /* not yet present - create a candidate for a new record and then
- * redo the search */
- candidate = rxrpc_alloc_transport(local, peer, gfp);
- if (!candidate) {
- _leave(" = -ENOMEM");
- return ERR_PTR(-ENOMEM);
- }
-
- write_lock_bh(&rxrpc_transport_lock);
-
- list_for_each_entry(trans, &rxrpc_transports, link) {
- if (trans->local == local && trans->peer == peer)
- goto found_extant_second;
- }
-
- /* we can now add the new candidate to the list */
- trans = candidate;
- candidate = NULL;
- usage = atomic_read(&trans->usage);
-
- rxrpc_get_local(trans->local);
- atomic_inc(&trans->peer->usage);
- list_add_tail(&trans->link, &rxrpc_transports);
- write_unlock_bh(&rxrpc_transport_lock);
- new = "new";
-
-success:
- _net("TRANSPORT %s %d local %d -> peer %d",
- new,
- trans->debug_id,
- trans->local->debug_id,
- trans->peer->debug_id);
-
- _leave(" = %p {u=%d}", trans, usage);
- return trans;
-
- /* we found the transport in the list immediately */
-found_extant_transport:
- usage = atomic_inc_return(&trans->usage);
- read_unlock_bh(&rxrpc_transport_lock);
- goto success;
-
- /* we found the transport on the second time through the list */
-found_extant_second:
- usage = atomic_inc_return(&trans->usage);
- write_unlock_bh(&rxrpc_transport_lock);
- kfree(candidate);
- goto success;
-}
-
-/*
- * find the transport connecting two endpoints
- */
-struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *local,
- struct rxrpc_peer *peer)
-{
- struct rxrpc_transport *trans;
-
- _enter("{%pI4+%hu},{%pI4+%hu},",
- &local->srx.transport.sin.sin_addr,
- ntohs(local->srx.transport.sin.sin_port),
- &peer->srx.transport.sin.sin_addr,
- ntohs(peer->srx.transport.sin.sin_port));
-
- /* search the transport list */
- read_lock_bh(&rxrpc_transport_lock);
-
- list_for_each_entry(trans, &rxrpc_transports, link) {
- if (trans->local == local && trans->peer == peer)
- goto found_extant_transport;
- }
-
- read_unlock_bh(&rxrpc_transport_lock);
- _leave(" = NULL");
- return NULL;
-
-found_extant_transport:
- atomic_inc(&trans->usage);
- read_unlock_bh(&rxrpc_transport_lock);
- _leave(" = %p", trans);
- return trans;
-}
-
-/*
- * release a transport session
- */
-void rxrpc_put_transport(struct rxrpc_transport *trans)
-{
- _enter("%p{u=%d}", trans, atomic_read(&trans->usage));
-
- ASSERTCMP(atomic_read(&trans->usage), >, 0);
-
- trans->put_time = ktime_get_seconds();
- if (unlikely(atomic_dec_and_test(&trans->usage))) {
- _debug("zombie");
- /* let the reaper determine the timeout to avoid a race with
- * overextending the timeout if the reaper is running at the
- * same time */
- rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0);
- }
- _leave("");
-}
-
-/*
- * clean up a transport session
- */
-static void rxrpc_cleanup_transport(struct rxrpc_transport *trans)
-{
- _net("DESTROY TRANS %d", trans->debug_id);
-
- rxrpc_purge_queue(&trans->error_queue);
-
- rxrpc_put_local(trans->local);
- rxrpc_put_peer(trans->peer);
- kfree(trans);
-}
-
-/*
- * reap dead transports that have passed their expiry date
- */
-static void rxrpc_transport_reaper(struct work_struct *work)
-{
- struct rxrpc_transport *trans, *_p;
- unsigned long now, earliest, reap_time;
-
- LIST_HEAD(graveyard);
-
- _enter("");
-
- now = ktime_get_seconds();
- earliest = ULONG_MAX;
-
- /* extract all the transports that have been dead too long */
- write_lock_bh(&rxrpc_transport_lock);
- list_for_each_entry_safe(trans, _p, &rxrpc_transports, link) {
- _debug("reap TRANS %d { u=%d t=%ld }",
- trans->debug_id, atomic_read(&trans->usage),
- (long) now - (long) trans->put_time);
-
- if (likely(atomic_read(&trans->usage) > 0))
- continue;
-
- reap_time = trans->put_time + rxrpc_transport_expiry;
- if (reap_time <= now)
- list_move_tail(&trans->link, &graveyard);
- else if (reap_time < earliest)
- earliest = reap_time;
- }
- write_unlock_bh(&rxrpc_transport_lock);
-
- if (earliest != ULONG_MAX) {
- _debug("reschedule reaper %ld", (long) earliest - now);
- ASSERTCMP(earliest, >, now);
- rxrpc_queue_delayed_work(&rxrpc_transport_reap,
- (earliest - now) * HZ);
- }
-
- /* then destroy all those pulled out */
- while (!list_empty(&graveyard)) {
- trans = list_entry(graveyard.next, struct rxrpc_transport,
- link);
- list_del_init(&trans->link);
-
- ASSERTCMP(atomic_read(&trans->usage), ==, 0);
- rxrpc_cleanup_transport(trans);
- }
-
- _leave("");
-}
-
-/*
- * preemptively destroy all the transport session records rather than waiting
- * for them to time out
- */
-void __exit rxrpc_destroy_all_transports(void)
-{
- _enter("");
-
- rxrpc_transport_expiry = 0;
- cancel_delayed_work(&rxrpc_transport_reap);
- rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0);
-
- _leave("");
-}
diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/call_accept.c
index eea5f4a..202e053 100644
--- a/net/rxrpc/ar-accept.c
+++ b/net/rxrpc/call_accept.c
@@ -74,7 +74,6 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
struct sockaddr_rxrpc *srx)
{
struct rxrpc_connection *conn;
- struct rxrpc_transport *trans;
struct rxrpc_skb_priv *sp, *nsp;
struct rxrpc_peer *peer;
struct rxrpc_call *call;
@@ -95,30 +94,22 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
rxrpc_new_skb(notification);
notification->mark = RXRPC_SKB_MARK_NEW_CALL;
- peer = rxrpc_get_peer(srx, GFP_NOIO);
- if (IS_ERR(peer)) {
+ peer = rxrpc_lookup_peer(local, srx, GFP_NOIO);
+ if (!peer) {
_debug("no peer");
ret = -EBUSY;
goto error;
}
- trans = rxrpc_get_transport(local, peer, GFP_NOIO);
+ conn = rxrpc_incoming_connection(local, peer, skb);
rxrpc_put_peer(peer);
- if (IS_ERR(trans)) {
- _debug("no trans");
- ret = -EBUSY;
- goto error;
- }
-
- conn = rxrpc_incoming_connection(trans, &sp->hdr);
- rxrpc_put_transport(trans);
if (IS_ERR(conn)) {
_debug("no conn");
ret = PTR_ERR(conn);
goto error;
}
- call = rxrpc_incoming_call(rx, conn, &sp->hdr);
+ call = rxrpc_incoming_call(rx, conn, skb);
rxrpc_put_connection(conn);
if (IS_ERR(call)) {
_debug("no call");
@@ -141,7 +132,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
_debug("await conn sec");
list_add_tail(&call->accept_link, &rx->secureq);
call->conn->state = RXRPC_CONN_SERVER_CHALLENGING;
- atomic_inc(&call->conn->usage);
+ rxrpc_get_connection(call->conn);
set_bit(RXRPC_CONN_CHALLENGE, &call->conn->events);
rxrpc_queue_conn(call->conn);
} else {
@@ -202,10 +193,8 @@ error_nofree:
* accept incoming calls that need peer, transport and/or connection setting up
* - the packets we get are all incoming client DATA packets that have seq == 1
*/
-void rxrpc_accept_incoming_calls(struct work_struct *work)
+void rxrpc_accept_incoming_calls(struct rxrpc_local *local)
{
- struct rxrpc_local *local =
- container_of(work, struct rxrpc_local, acceptor);
struct rxrpc_skb_priv *sp;
struct sockaddr_rxrpc srx;
struct rxrpc_sock *rx;
@@ -215,21 +204,8 @@ void rxrpc_accept_incoming_calls(struct work_struct *work)
_enter("%d", local->debug_id);
- read_lock_bh(&rxrpc_local_lock);
- if (atomic_read(&local->usage) > 0)
- rxrpc_get_local(local);
- else
- local = NULL;
- read_unlock_bh(&rxrpc_local_lock);
- if (!local) {
- _leave(" [local dead]");
- return;
- }
-
-process_next_packet:
skb = skb_dequeue(&local->accept_queue);
if (!skb) {
- rxrpc_put_local(local);
_leave("\n");
return;
}
@@ -292,7 +268,7 @@ found_service:
case -ECONNRESET: /* old calls are ignored */
case -ECONNABORTED: /* aborted calls are reaborted or ignored */
case 0:
- goto process_next_packet;
+ return;
case -ECONNREFUSED:
goto invalid_service;
case -EBUSY:
@@ -308,18 +284,18 @@ backlog_full:
busy:
rxrpc_busy(local, &srx, &whdr);
rxrpc_free_skb(skb);
- goto process_next_packet;
+ return;
invalid_service:
skb->priority = RX_INVALID_OPERATION;
rxrpc_reject_packet(local, skb);
- goto process_next_packet;
+ return;
/* can't change connection security type mid-flow */
security_mismatch:
skb->priority = RX_PROTOCOL_ERROR;
rxrpc_reject_packet(local, skb);
- goto process_next_packet;
+ return;
}
/*
diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/call_event.c
index 1838178..0ba8429 100644
--- a/net/rxrpc/ar-ack.c
+++ b/net/rxrpc/call_event.c
@@ -187,7 +187,7 @@ static void rxrpc_resend(struct rxrpc_call *call)
_proto("Tx DATA %%%u { #%d }",
sp->hdr.serial, sp->hdr.seq);
- if (rxrpc_send_packet(call->conn->trans, txb) < 0) {
+ if (rxrpc_send_data_packet(call->conn, txb) < 0) {
stop = true;
sp->resend_at = jiffies + 3;
} else {
@@ -545,7 +545,7 @@ static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU));
- peer = call->conn->trans->peer;
+ peer = call->conn->params.peer;
if (mtu < peer->maxdata) {
spin_lock_bh(&peer->lock);
peer->maxdata = mtu;
@@ -836,13 +836,13 @@ void rxrpc_process_call(struct work_struct *work)
/* there's a good chance we're going to have to send a message, so set
* one up in advance */
- msg.msg_name = &call->conn->trans->peer->srx.transport;
- msg.msg_namelen = call->conn->trans->peer->srx.transport_len;
+ msg.msg_name = &call->conn->params.peer->srx.transport;
+ msg.msg_namelen = call->conn->params.peer->srx.transport_len;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
- whdr.epoch = htonl(call->conn->epoch);
+ whdr.epoch = htonl(call->conn->proto.epoch);
whdr.cid = htonl(call->cid);
whdr.callNumber = htonl(call->call_id);
whdr.seq = 0;
@@ -864,17 +864,24 @@ void rxrpc_process_call(struct work_struct *work)
}
if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) {
+ enum rxrpc_skb_mark mark;
int error;
clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
- error = call->conn->trans->peer->net_error;
- _debug("post net error %d", error);
+ error = call->error_report;
+ if (error < RXRPC_LOCAL_ERROR_OFFSET) {
+ mark = RXRPC_SKB_MARK_NET_ERROR;
+ _debug("post net error %d", error);
+ } else {
+ mark = RXRPC_SKB_MARK_LOCAL_ERROR;
+ error -= RXRPC_LOCAL_ERROR_OFFSET;
+ _debug("post net local error %d", error);
+ }
- if (rxrpc_post_message(call, RXRPC_SKB_MARK_NET_ERROR,
- error, true) < 0)
+ if (rxrpc_post_message(call, mark, error, true) < 0)
goto no_mem;
clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
goto kill_ACKs;
@@ -1144,8 +1151,8 @@ send_ACK_with_skew:
ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) -
ntohl(ack.serial));
send_ACK:
- mtu = call->conn->trans->peer->if_mtu;
- mtu -= call->conn->trans->peer->hdrsize;
+ mtu = call->conn->params.peer->if_mtu;
+ mtu -= call->conn->params.peer->hdrsize;
ackinfo.maxMTU = htonl(mtu);
ackinfo.rwind = htonl(rxrpc_rx_window_size);
@@ -1199,7 +1206,7 @@ send_message_2:
len += iov[1].iov_len;
}
- ret = kernel_sendmsg(call->conn->trans->local->socket,
+ ret = kernel_sendmsg(call->conn->params.local->socket,
&msg, iov, ioc, len);
if (ret < 0) {
_debug("sendmsg failed: %d", ret);
@@ -1257,7 +1264,7 @@ maybe_reschedule:
if (call->state >= RXRPC_CALL_COMPLETE &&
!list_empty(&call->accept_link)) {
_debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }",
- call, call->events, call->flags, call->conn->cid);
+ call, call->events, call->flags, call->conn->proto.cid);
read_lock_bh(&call->state_lock);
if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
@@ -1275,7 +1282,7 @@ error:
* this means there's a race between clearing the flag and setting the
* work pending bit and the work item being processed again */
if (call->events && !work_pending(&call->processor)) {
- _debug("jumpstart %x", call->conn->cid);
+ _debug("jumpstart %x", call->conn->proto.cid);
rxrpc_queue_call(call);
}
diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/call_object.c
index 68125dc..ad933da 100644
--- a/net/rxrpc/ar-call.c
+++ b/net/rxrpc/call_object.c
@@ -31,6 +31,8 @@ unsigned int rxrpc_max_call_lifetime = 60 * HZ;
unsigned int rxrpc_dead_call_expiry = 2 * HZ;
const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
+ [RXRPC_CALL_UNINITIALISED] = "Uninit",
+ [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn",
[RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq",
[RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl",
[RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl",
@@ -71,7 +73,7 @@ static unsigned long rxrpc_call_hashfunc(
u32 call_id,
u32 epoch,
u16 service_id,
- sa_family_t proto,
+ sa_family_t family,
void *localptr,
unsigned int addr_size,
const u8 *peer_addr)
@@ -92,7 +94,7 @@ static unsigned long rxrpc_call_hashfunc(
key += (cid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT;
key += cid & RXRPC_CHANNELMASK;
key += in_clientflag;
- key += proto;
+ key += family;
/* Step through the peer address in 16-bit portions for speed */
for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++)
key += *p;
@@ -109,7 +111,7 @@ static void rxrpc_call_hash_add(struct rxrpc_call *call)
unsigned int addr_size = 0;
_enter("");
- switch (call->proto) {
+ switch (call->family) {
case AF_INET:
addr_size = sizeof(call->peer_ip.ipv4_addr);
break;
@@ -121,8 +123,8 @@ static void rxrpc_call_hash_add(struct rxrpc_call *call)
}
key = rxrpc_call_hashfunc(call->in_clientflag, call->cid,
call->call_id, call->epoch,
- call->service_id, call->proto,
- call->conn->trans->local, addr_size,
+ call->service_id, call->family,
+ call->conn->params.local, addr_size,
call->peer_ip.ipv6_addr);
/* Store the full key in the call */
call->hash_key = key;
@@ -151,7 +153,7 @@ static void rxrpc_call_hash_del(struct rxrpc_call *call)
struct rxrpc_call *rxrpc_find_call_hash(
struct rxrpc_host_header *hdr,
void *localptr,
- sa_family_t proto,
+ sa_family_t family,
const void *peer_addr)
{
unsigned long key;
@@ -161,7 +163,7 @@ struct rxrpc_call *rxrpc_find_call_hash(
u8 in_clientflag = hdr->flags & RXRPC_CLIENT_INITIATED;
_enter("");
- switch (proto) {
+ switch (family) {
case AF_INET:
addr_size = sizeof(call->peer_ip.ipv4_addr);
break;
@@ -174,7 +176,7 @@ struct rxrpc_call *rxrpc_find_call_hash(
key = rxrpc_call_hashfunc(in_clientflag, hdr->cid, hdr->callNumber,
hdr->epoch, hdr->serviceId,
- proto, localptr, addr_size,
+ family, localptr, addr_size,
peer_addr);
hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) {
if (call->hash_key == key &&
@@ -182,7 +184,7 @@ struct rxrpc_call *rxrpc_find_call_hash(
call->cid == hdr->cid &&
call->in_clientflag == in_clientflag &&
call->service_id == hdr->serviceId &&
- call->proto == proto &&
+ call->family == family &&
call->local == localptr &&
memcmp(call->peer_ip.ipv6_addr, peer_addr,
addr_size) == 0 &&
@@ -261,6 +263,7 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
(unsigned long) call);
INIT_WORK(&call->destroyer, &rxrpc_destroy_call);
INIT_WORK(&call->processor, &rxrpc_process_call);
+ INIT_LIST_HEAD(&call->link);
INIT_LIST_HEAD(&call->accept_link);
skb_queue_head_init(&call->rx_queue);
skb_queue_head_init(&call->rx_oos_queue);
@@ -269,7 +272,6 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
rwlock_init(&call->state_lock);
atomic_set(&call->usage, 1);
call->debug_id = atomic_inc_return(&rxrpc_debug_id);
- call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
memset(&call->sock_node, 0xed, sizeof(call->sock_node));
@@ -282,66 +284,77 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
}
/*
- * allocate a new client call and attempt to get a connection slot for it
+ * Allocate a new client call.
*/
-static struct rxrpc_call *rxrpc_alloc_client_call(
- struct rxrpc_sock *rx,
- struct rxrpc_transport *trans,
- struct rxrpc_conn_bundle *bundle,
- gfp_t gfp)
+static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
+ struct sockaddr_rxrpc *srx,
+ gfp_t gfp)
{
struct rxrpc_call *call;
- int ret;
_enter("");
- ASSERT(rx != NULL);
- ASSERT(trans != NULL);
- ASSERT(bundle != NULL);
+ ASSERT(rx->local != NULL);
call = rxrpc_alloc_call(gfp);
if (!call)
return ERR_PTR(-ENOMEM);
+ call->state = RXRPC_CALL_CLIENT_AWAIT_CONN;
sock_hold(&rx->sk);
call->socket = rx;
call->rx_data_post = 1;
- ret = rxrpc_connect_call(rx, trans, bundle, call, gfp);
- if (ret < 0) {
- kmem_cache_free(rxrpc_call_jar, call);
- return ERR_PTR(ret);
- }
-
/* Record copies of information for hashtable lookup */
- call->proto = rx->proto;
- call->local = trans->local;
- switch (call->proto) {
+ call->family = rx->family;
+ call->local = rx->local;
+ switch (call->family) {
case AF_INET:
- call->peer_ip.ipv4_addr =
- trans->peer->srx.transport.sin.sin_addr.s_addr;
+ call->peer_ip.ipv4_addr = srx->transport.sin.sin_addr.s_addr;
break;
case AF_INET6:
memcpy(call->peer_ip.ipv6_addr,
- trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8,
+ srx->transport.sin6.sin6_addr.in6_u.u6_addr8,
sizeof(call->peer_ip.ipv6_addr));
break;
}
- call->epoch = call->conn->epoch;
- call->service_id = call->conn->service_id;
- call->in_clientflag = call->conn->in_clientflag;
+
+ call->service_id = srx->srx_service;
+ call->in_clientflag = 0;
+
+ _leave(" = %p", call);
+ return call;
+}
+
+/*
+ * Begin client call.
+ */
+static int rxrpc_begin_client_call(struct rxrpc_call *call,
+ struct rxrpc_conn_parameters *cp,
+ struct sockaddr_rxrpc *srx,
+ gfp_t gfp)
+{
+ int ret;
+
+ /* Set up or get a connection record and set the protocol parameters,
+ * including channel number and call ID.
+ */
+ ret = rxrpc_connect_call(call, cp, srx, gfp);
+ if (ret < 0)
+ return ret;
+
+ call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
+
/* Add the new call to the hashtable */
rxrpc_call_hash_add(call);
- spin_lock(&call->conn->trans->peer->lock);
- list_add(&call->error_link, &call->conn->trans->peer->error_targets);
- spin_unlock(&call->conn->trans->peer->lock);
+ spin_lock(&call->conn->params.peer->lock);
+ hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets);
+ spin_unlock(&call->conn->params.peer->lock);
call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime;
add_timer(&call->lifetimer);
-
- _leave(" = %p", call);
- return call;
+ return 0;
}
/*
@@ -349,24 +362,24 @@ static struct rxrpc_call *rxrpc_alloc_client_call(
* - called in process context with IRQs enabled
*/
struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
- struct rxrpc_transport *trans,
- struct rxrpc_conn_bundle *bundle,
+ struct rxrpc_conn_parameters *cp,
+ struct sockaddr_rxrpc *srx,
unsigned long user_call_ID,
gfp_t gfp)
{
struct rxrpc_call *call, *xcall;
struct rb_node *parent, **pp;
+ int ret;
- _enter("%p,%d,%d,%lx",
- rx, trans->debug_id, bundle ? bundle->debug_id : -1,
- user_call_ID);
+ _enter("%p,%lx", rx, user_call_ID);
- call = rxrpc_alloc_client_call(rx, trans, bundle, gfp);
+ call = rxrpc_alloc_client_call(rx, srx, gfp);
if (IS_ERR(call)) {
_leave(" = %ld", PTR_ERR(call));
return call;
}
+ /* Publish the call, even though it is incompletely set up as yet */
call->user_call_ID = user_call_ID;
__set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
@@ -396,11 +409,29 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
list_add_tail(&call->link, &rxrpc_calls);
write_unlock_bh(&rxrpc_call_lock);
+ ret = rxrpc_begin_client_call(call, cp, srx, gfp);
+ if (ret < 0)
+ goto error;
+
_net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id);
_leave(" = %p [new]", call);
return call;
+error:
+ write_lock(&rx->call_lock);
+ rb_erase(&call->sock_node, &rx->calls);
+ write_unlock(&rx->call_lock);
+ rxrpc_put_call(call);
+
+ write_lock_bh(&rxrpc_call_lock);
+ list_del(&call->link);
+ write_unlock_bh(&rxrpc_call_lock);
+
+ rxrpc_put_call(call);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+
/* We unexpectedly found the user ID in the list after taking
* the call_lock. This shouldn't happen unless the user races
* with itself and tries to add the same user ID twice at the
@@ -419,8 +450,9 @@ found_user_ID_now_present:
*/
struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
struct rxrpc_connection *conn,
- struct rxrpc_host_header *hdr)
+ struct sk_buff *skb)
{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_call *call, *candidate;
struct rb_node **p, *parent;
u32 call_id;
@@ -433,13 +465,13 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
if (!candidate)
return ERR_PTR(-EBUSY);
- candidate->socket = rx;
- candidate->conn = conn;
- candidate->cid = hdr->cid;
- candidate->call_id = hdr->callNumber;
- candidate->channel = hdr->cid & RXRPC_CHANNELMASK;
- candidate->rx_data_post = 0;
- candidate->state = RXRPC_CALL_SERVER_ACCEPTING;
+ candidate->socket = rx;
+ candidate->conn = conn;
+ candidate->cid = sp->hdr.cid;
+ candidate->call_id = sp->hdr.callNumber;
+ candidate->channel = sp->hdr.cid & RXRPC_CHANNELMASK;
+ candidate->rx_data_post = 0;
+ candidate->state = RXRPC_CALL_SERVER_ACCEPTING;
if (conn->security_ix > 0)
candidate->state = RXRPC_CALL_SERVER_SECURING;
@@ -448,7 +480,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
/* set the channel for this call */
call = conn->channels[candidate->channel];
_debug("channel[%u] is %p", candidate->channel, call);
- if (call && call->call_id == hdr->callNumber) {
+ if (call && call->call_id == sp->hdr.callNumber) {
/* already set; must've been a duplicate packet */
_debug("extant call [%d]", call->state);
ASSERTCMP(call->conn, ==, conn);
@@ -486,7 +518,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
/* check the call number isn't duplicate */
_debug("check dup");
- call_id = hdr->callNumber;
+ call_id = sp->hdr.callNumber;
p = &conn->calls.rb_node;
parent = NULL;
while (*p) {
@@ -512,36 +544,36 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
rb_insert_color(&call->conn_node, &conn->calls);
conn->channels[call->channel] = call;
sock_hold(&rx->sk);
- atomic_inc(&conn->usage);
+ rxrpc_get_connection(conn);
write_unlock_bh(&conn->lock);
- spin_lock(&conn->trans->peer->lock);
- list_add(&call->error_link, &conn->trans->peer->error_targets);
- spin_unlock(&conn->trans->peer->lock);
+ spin_lock(&conn->params.peer->lock);
+ hlist_add_head(&call->error_link, &conn->params.peer->error_targets);
+ spin_unlock(&conn->params.peer->lock);
write_lock_bh(&rxrpc_call_lock);
list_add_tail(&call->link, &rxrpc_calls);
write_unlock_bh(&rxrpc_call_lock);
/* Record copies of information for hashtable lookup */
- call->proto = rx->proto;
- call->local = conn->trans->local;
- switch (call->proto) {
+ call->family = rx->family;
+ call->local = conn->params.local;
+ switch (call->family) {
case AF_INET:
call->peer_ip.ipv4_addr =
- conn->trans->peer->srx.transport.sin.sin_addr.s_addr;
+ conn->params.peer->srx.transport.sin.sin_addr.s_addr;
break;
case AF_INET6:
memcpy(call->peer_ip.ipv6_addr,
- conn->trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8,
+ conn->params.peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8,
sizeof(call->peer_ip.ipv6_addr));
break;
default:
break;
}
- call->epoch = conn->epoch;
- call->service_id = conn->service_id;
- call->in_clientflag = conn->in_clientflag;
+ call->epoch = conn->proto.epoch;
+ call->service_id = conn->params.service_id;
+ call->in_clientflag = conn->proto.in_clientflag;
/* Add the new call to the hashtable */
rxrpc_call_hash_add(call);
@@ -609,40 +641,13 @@ void rxrpc_release_call(struct rxrpc_call *call)
write_unlock_bh(&rx->call_lock);
/* free up the channel for reuse */
- spin_lock(&conn->trans->client_lock);
+ spin_lock(&conn->channel_lock);
write_lock_bh(&conn->lock);
write_lock(&call->state_lock);
- if (conn->channels[call->channel] == call)
- conn->channels[call->channel] = NULL;
-
- if (conn->out_clientflag && conn->bundle) {
- conn->avail_calls++;
- switch (conn->avail_calls) {
- case 1:
- list_move_tail(&conn->bundle_link,
- &conn->bundle->avail_conns);
- case 2 ... RXRPC_MAXCALLS - 1:
- ASSERT(conn->channels[0] == NULL ||
- conn->channels[1] == NULL ||
- conn->channels[2] == NULL ||
- conn->channels[3] == NULL);
- break;
- case RXRPC_MAXCALLS:
- list_move_tail(&conn->bundle_link,
- &conn->bundle->unused_conns);
- ASSERT(conn->channels[0] == NULL &&
- conn->channels[1] == NULL &&
- conn->channels[2] == NULL &&
- conn->channels[3] == NULL);
- break;
- default:
- pr_err("conn->avail_calls=%d\n", conn->avail_calls);
- BUG();
- }
- }
+ rxrpc_disconnect_call(call);
- spin_unlock(&conn->trans->client_lock);
+ spin_unlock(&conn->channel_lock);
if (call->state < RXRPC_CALL_COMPLETE &&
call->state != RXRPC_CALL_CLIENT_FINAL_ACK) {
@@ -811,9 +816,9 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call)
}
if (call->conn) {
- spin_lock(&call->conn->trans->peer->lock);
- list_del(&call->error_link);
- spin_unlock(&call->conn->trans->peer->lock);
+ spin_lock(&call->conn->params.peer->lock);
+ hlist_del_init(&call->error_link);
+ spin_unlock(&call->conn->params.peer->lock);
write_lock_bh(&call->conn->lock);
rb_erase(&call->conn_node, &call->conn->calls);
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
new file mode 100644
index 0000000..82488d6
--- /dev/null
+++ b/net/rxrpc/conn_client.c
@@ -0,0 +1,94 @@
+/* Client connection-specific management code.
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/timer.h>
+#include "ar-internal.h"
+
+/*
+ * We use machine-unique IDs for our client connections.
+ */
+DEFINE_IDR(rxrpc_client_conn_ids);
+static DEFINE_SPINLOCK(rxrpc_conn_id_lock);
+
+/*
+ * Get a connection ID and epoch for a client connection from the global pool.
+ * The connection struct pointer is then recorded in the idr radix tree. The
+ * epoch is changed if this wraps.
+ *
+ * TODO: The IDR tree gets very expensive on memory if the connection IDs are
+ * widely scattered throughout the number space, so we shall need to retire
+ * connections that have, say, an ID more than four times the maximum number of
+ * client conns away from the current allocation point to try and keep the IDs
+ * concentrated. We will also need to retire connections from an old epoch.
+ */
+int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, gfp_t gfp)
+{
+ u32 epoch;
+ int id;
+
+ _enter("");
+
+ idr_preload(gfp);
+ spin_lock(&rxrpc_conn_id_lock);
+
+ epoch = rxrpc_epoch;
+
+ /* We could use idr_alloc_cyclic() here, but we really need to know
+ * when the thing wraps so that we can advance the epoch.
+ */
+ if (rxrpc_client_conn_ids.cur == 0)
+ rxrpc_client_conn_ids.cur = 1;
+ id = idr_alloc(&rxrpc_client_conn_ids, conn,
+ rxrpc_client_conn_ids.cur, 0x40000000, GFP_NOWAIT);
+ if (id < 0) {
+ if (id != -ENOSPC)
+ goto error;
+ id = idr_alloc(&rxrpc_client_conn_ids, conn,
+ 1, 0x40000000, GFP_NOWAIT);
+ if (id < 0)
+ goto error;
+ epoch++;
+ rxrpc_epoch = epoch;
+ }
+ rxrpc_client_conn_ids.cur = id + 1;
+
+ spin_unlock(&rxrpc_conn_id_lock);
+ idr_preload_end();
+
+ conn->proto.epoch = epoch;
+ conn->proto.cid = id << RXRPC_CIDSHIFT;
+ set_bit(RXRPC_CONN_HAS_IDR, &conn->flags);
+ _leave(" [CID %x:%x]", epoch, conn->proto.cid);
+ return 0;
+
+error:
+ spin_unlock(&rxrpc_conn_id_lock);
+ idr_preload_end();
+ _leave(" = %d", id);
+ return id;
+}
+
+/*
+ * Release a connection ID for a client connection from the global pool.
+ */
+void rxrpc_put_client_connection_id(struct rxrpc_connection *conn)
+{
+ if (test_bit(RXRPC_CONN_HAS_IDR, &conn->flags)) {
+ spin_lock(&rxrpc_conn_id_lock);
+ idr_remove(&rxrpc_client_conn_ids,
+ conn->proto.cid >> RXRPC_CIDSHIFT);
+ spin_unlock(&rxrpc_conn_id_lock);
+ }
+}
diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/conn_event.c
index 8bdd692..bf69715 100644
--- a/net/rxrpc/ar-connevent.c
+++ b/net/rxrpc/conn_event.c
@@ -88,14 +88,14 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code);
- msg.msg_name = &conn->trans->peer->srx.transport;
- msg.msg_namelen = conn->trans->peer->srx.transport_len;
+ msg.msg_name = &conn->params.peer->srx.transport;
+ msg.msg_namelen = conn->params.peer->srx.transport_len;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
- whdr.epoch = htonl(conn->epoch);
- whdr.cid = htonl(conn->cid);
+ whdr.epoch = htonl(conn->proto.epoch);
+ whdr.cid = htonl(conn->proto.cid);
whdr.callNumber = 0;
whdr.seq = 0;
whdr.type = RXRPC_PACKET_TYPE_ABORT;
@@ -103,7 +103,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
whdr.userStatus = 0;
whdr.securityIndex = conn->security_ix;
whdr._rsvd = 0;
- whdr.serviceId = htons(conn->service_id);
+ whdr.serviceId = htons(conn->params.service_id);
word = htonl(conn->local_abort);
@@ -118,7 +118,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
whdr.serial = htonl(serial);
_proto("Tx CONN ABORT %%%u { %d }", serial, conn->local_abort);
- ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len);
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
if (ret < 0) {
_debug("sendmsg failed: %d", ret);
return -EAGAIN;
@@ -220,7 +220,7 @@ static void rxrpc_secure_connection(struct rxrpc_connection *conn)
ASSERT(conn->security_ix != 0);
- if (!conn->key) {
+ if (!conn->params.key) {
_debug("set up security");
ret = rxrpc_init_server_conn_security(conn);
switch (ret) {
@@ -263,7 +263,7 @@ void rxrpc_process_connection(struct work_struct *work)
_enter("{%d}", conn->debug_id);
- atomic_inc(&conn->usage);
+ rxrpc_get_connection(conn);
if (test_and_clear_bit(RXRPC_CONN_CHALLENGE, &conn->events)) {
rxrpc_secure_connection(conn);
@@ -314,19 +314,14 @@ void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
{
CHECK_SLAB_OKAY(&local->usage);
- if (!atomic_inc_not_zero(&local->usage)) {
- printk("resurrected on reject\n");
- BUG();
- }
-
skb_queue_tail(&local->reject_queue, skb);
- rxrpc_queue_work(&local->rejecter);
+ rxrpc_queue_work(&local->processor);
}
/*
* reject packets through the local endpoint
*/
-void rxrpc_reject_packets(struct work_struct *work)
+void rxrpc_reject_packets(struct rxrpc_local *local)
{
union {
struct sockaddr sa;
@@ -334,16 +329,12 @@ void rxrpc_reject_packets(struct work_struct *work)
} sa;
struct rxrpc_skb_priv *sp;
struct rxrpc_wire_header whdr;
- struct rxrpc_local *local;
struct sk_buff *skb;
struct msghdr msg;
struct kvec iov[2];
size_t size;
__be32 code;
- local = container_of(work, struct rxrpc_local, rejecter);
- rxrpc_get_local(local);
-
_enter("%d", local->debug_id);
iov[0].iov_base = &whdr;
@@ -395,9 +386,7 @@ void rxrpc_reject_packets(struct work_struct *work)
}
rxrpc_free_skb(skb);
- rxrpc_put_local(local);
}
- rxrpc_put_local(local);
_leave("");
}
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
new file mode 100644
index 0000000..4bfad7c
--- /dev/null
+++ b/net/rxrpc/conn_object.c
@@ -0,0 +1,686 @@
+/* RxRPC virtual connection handler
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/crypto.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * Time till a connection expires after last use (in seconds).
+ */
+unsigned int rxrpc_connection_expiry = 10 * 60;
+
+static void rxrpc_connection_reaper(struct work_struct *work);
+
+LIST_HEAD(rxrpc_connections);
+DEFINE_RWLOCK(rxrpc_connection_lock);
+static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper);
+
+/*
+ * allocate a new connection
+ */
+static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
+{
+ struct rxrpc_connection *conn;
+
+ _enter("");
+
+ conn = kzalloc(sizeof(struct rxrpc_connection), gfp);
+ if (conn) {
+ spin_lock_init(&conn->channel_lock);
+ init_waitqueue_head(&conn->channel_wq);
+ INIT_WORK(&conn->processor, &rxrpc_process_connection);
+ INIT_LIST_HEAD(&conn->link);
+ conn->calls = RB_ROOT;
+ skb_queue_head_init(&conn->rx_queue);
+ conn->security = &rxrpc_no_security;
+ rwlock_init(&conn->lock);
+ spin_lock_init(&conn->state_lock);
+ atomic_set(&conn->usage, 1);
+ conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
+ atomic_set(&conn->avail_chans, RXRPC_MAXCALLS);
+ conn->size_align = 4;
+ conn->header_size = sizeof(struct rxrpc_wire_header);
+ }
+
+ _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0);
+ return conn;
+}
+
+/*
+ * add a call to a connection's call-by-ID tree
+ */
+static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn,
+ struct rxrpc_call *call)
+{
+ struct rxrpc_call *xcall;
+ struct rb_node *parent, **p;
+ __be32 call_id;
+
+ write_lock_bh(&conn->lock);
+
+ call_id = call->call_id;
+ p = &conn->calls.rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ xcall = rb_entry(parent, struct rxrpc_call, conn_node);
+
+ if (call_id < xcall->call_id)
+ p = &(*p)->rb_left;
+ else if (call_id > xcall->call_id)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&call->conn_node, parent, p);
+ rb_insert_color(&call->conn_node, &conn->calls);
+
+ write_unlock_bh(&conn->lock);
+}
+
+/*
+ * Allocate a client connection. The caller must take care to clear any
+ * padding bytes in *cp.
+ */
+static struct rxrpc_connection *
+rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
+{
+ struct rxrpc_connection *conn;
+ int ret;
+
+ _enter("");
+
+ conn = rxrpc_alloc_connection(gfp);
+ if (!conn) {
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ conn->params = *cp;
+ conn->proto.local = cp->local;
+ conn->proto.epoch = rxrpc_epoch;
+ conn->proto.cid = 0;
+ conn->proto.in_clientflag = 0;
+ conn->proto.family = cp->peer->srx.transport.family;
+ conn->out_clientflag = RXRPC_CLIENT_INITIATED;
+ conn->state = RXRPC_CONN_CLIENT;
+
+ switch (conn->proto.family) {
+ case AF_INET:
+ conn->proto.addr_size = sizeof(conn->proto.ipv4_addr);
+ conn->proto.ipv4_addr = cp->peer->srx.transport.sin.sin_addr;
+ conn->proto.port = cp->peer->srx.transport.sin.sin_port;
+ break;
+ }
+
+ ret = rxrpc_get_client_connection_id(conn, gfp);
+ if (ret < 0)
+ goto error_0;
+
+ ret = rxrpc_init_client_conn_security(conn);
+ if (ret < 0)
+ goto error_1;
+
+ conn->security->prime_packet_security(conn);
+
+ write_lock(&rxrpc_connection_lock);
+ list_add_tail(&conn->link, &rxrpc_connections);
+ write_unlock(&rxrpc_connection_lock);
+
+ /* We steal the caller's peer ref. */
+ cp->peer = NULL;
+ rxrpc_get_local(conn->params.local);
+ key_get(conn->params.key);
+
+ _leave(" = %p", conn);
+ return conn;
+
+error_1:
+ rxrpc_put_client_connection_id(conn);
+error_0:
+ kfree(conn);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * find a connection for a call
+ * - called in process context with IRQs enabled
+ */
+int rxrpc_connect_call(struct rxrpc_call *call,
+ struct rxrpc_conn_parameters *cp,
+ struct sockaddr_rxrpc *srx,
+ gfp_t gfp)
+{
+ struct rxrpc_connection *conn, *candidate = NULL;
+ struct rxrpc_local *local = cp->local;
+ struct rb_node *p, **pp, *parent;
+ long diff;
+ int chan;
+
+ DECLARE_WAITQUEUE(myself, current);
+
+ _enter("{%d,%lx},", call->debug_id, call->user_call_ID);
+
+ cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp);
+ if (!cp->peer)
+ return -ENOMEM;
+
+ if (!cp->exclusive) {
+ /* Search for a existing client connection unless this is going
+ * to be a connection that's used exclusively for a single call.
+ */
+ _debug("search 1");
+ spin_lock(&local->client_conns_lock);
+ p = local->client_conns.rb_node;
+ while (p) {
+ conn = rb_entry(p, struct rxrpc_connection, client_node);
+
+#define cmp(X) ((long)conn->params.X - (long)cp->X)
+ diff = (cmp(peer) ?:
+ cmp(key) ?:
+ cmp(security_level));
+ if (diff < 0)
+ p = p->rb_left;
+ else if (diff > 0)
+ p = p->rb_right;
+ else
+ goto found_extant_conn;
+ }
+ spin_unlock(&local->client_conns_lock);
+ }
+
+ /* We didn't find a connection or we want an exclusive one. */
+ _debug("get new conn");
+ candidate = rxrpc_alloc_client_connection(cp, gfp);
+ if (!candidate) {
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+ }
+
+ if (cp->exclusive) {
+ /* Assign the call on an exclusive connection to channel 0 and
+ * don't add the connection to the endpoint's shareable conn
+ * lookup tree.
+ */
+ _debug("exclusive chan 0");
+ conn = candidate;
+ atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1);
+ spin_lock(&conn->channel_lock);
+ chan = 0;
+ goto found_channel;
+ }
+
+ /* We need to redo the search before attempting to add a new connection
+ * lest we race with someone else adding a conflicting instance.
+ */
+ _debug("search 2");
+ spin_lock(&local->client_conns_lock);
+
+ pp = &local->client_conns.rb_node;
+ parent = NULL;
+ while (*pp) {
+ parent = *pp;
+ conn = rb_entry(parent, struct rxrpc_connection, client_node);
+
+ diff = (cmp(peer) ?:
+ cmp(key) ?:
+ cmp(security_level));
+ if (diff < 0)
+ pp = &(*pp)->rb_left;
+ else if (diff > 0)
+ pp = &(*pp)->rb_right;
+ else
+ goto found_extant_conn;
+ }
+
+ /* The second search also failed; simply add the new connection with
+ * the new call in channel 0. Note that we need to take the channel
+ * lock before dropping the client conn lock.
+ */
+ _debug("new conn");
+ conn = candidate;
+ candidate = NULL;
+
+ rb_link_node(&conn->client_node, parent, pp);
+ rb_insert_color(&conn->client_node, &local->client_conns);
+
+ atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1);
+ spin_lock(&conn->channel_lock);
+ spin_unlock(&local->client_conns_lock);
+ chan = 0;
+
+found_channel:
+ _debug("found chan");
+ call->conn = conn;
+ call->channel = chan;
+ call->epoch = conn->proto.epoch;
+ call->cid = conn->proto.cid | chan;
+ call->call_id = ++conn->call_counter;
+ rcu_assign_pointer(conn->channels[chan], call);
+
+ _net("CONNECT call %d on conn %d", call->debug_id, conn->debug_id);
+
+ rxrpc_add_call_ID_to_conn(conn, call);
+ spin_unlock(&conn->channel_lock);
+ rxrpc_put_peer(cp->peer);
+ cp->peer = NULL;
+ _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
+ return 0;
+
+ /* We found a suitable connection already in existence. Discard any
+ * candidate we may have allocated, and try to get a channel on this
+ * one.
+ */
+found_extant_conn:
+ _debug("found conn");
+ rxrpc_get_connection(conn);
+ spin_unlock(&local->client_conns_lock);
+
+ rxrpc_put_connection(candidate);
+
+ if (!atomic_add_unless(&conn->avail_chans, -1, 0)) {
+ if (!gfpflags_allow_blocking(gfp)) {
+ rxrpc_put_connection(conn);
+ _leave(" = -EAGAIN");
+ return -EAGAIN;
+ }
+
+ add_wait_queue(&conn->channel_wq, &myself);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (atomic_add_unless(&conn->avail_chans, -1, 0))
+ break;
+ if (signal_pending(current))
+ goto interrupted;
+ schedule();
+ }
+ remove_wait_queue(&conn->channel_wq, &myself);
+ __set_current_state(TASK_RUNNING);
+ }
+
+ /* The connection allegedly now has a free channel and we can now
+ * attach the call to it.
+ */
+ spin_lock(&conn->channel_lock);
+
+ for (chan = 0; chan < RXRPC_MAXCALLS; chan++)
+ if (!conn->channels[chan])
+ goto found_channel;
+ BUG();
+
+interrupted:
+ remove_wait_queue(&conn->channel_wq, &myself);
+ __set_current_state(TASK_RUNNING);
+ rxrpc_put_connection(conn);
+ rxrpc_put_peer(cp->peer);
+ cp->peer = NULL;
+ _leave(" = -ERESTARTSYS");
+ return -ERESTARTSYS;
+}
+
+/*
+ * get a record of an incoming connection
+ */
+struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local,
+ struct rxrpc_peer *peer,
+ struct sk_buff *skb)
+{
+ struct rxrpc_connection *conn, *candidate = NULL;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rb_node *p, **pp;
+ const char *new = "old";
+ __be32 epoch;
+ u32 cid;
+
+ _enter("");
+
+ ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED);
+
+ epoch = sp->hdr.epoch;
+ cid = sp->hdr.cid & RXRPC_CIDMASK;
+
+ /* search the connection list first */
+ read_lock_bh(&peer->conn_lock);
+
+ p = peer->service_conns.rb_node;
+ while (p) {
+ conn = rb_entry(p, struct rxrpc_connection, service_node);
+
+ _debug("maybe %x", conn->proto.cid);
+
+ if (epoch < conn->proto.epoch)
+ p = p->rb_left;
+ else if (epoch > conn->proto.epoch)
+ p = p->rb_right;
+ else if (cid < conn->proto.cid)
+ p = p->rb_left;
+ else if (cid > conn->proto.cid)
+ p = p->rb_right;
+ else
+ goto found_extant_connection;
+ }
+ read_unlock_bh(&peer->conn_lock);
+
+ /* not yet present - create a candidate for a new record and then
+ * redo the search */
+ candidate = rxrpc_alloc_connection(GFP_NOIO);
+ if (!candidate) {
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ candidate->proto.local = local;
+ candidate->proto.epoch = sp->hdr.epoch;
+ candidate->proto.cid = sp->hdr.cid & RXRPC_CIDMASK;
+ candidate->proto.in_clientflag = RXRPC_CLIENT_INITIATED;
+ candidate->params.local = local;
+ candidate->params.peer = peer;
+ candidate->params.service_id = sp->hdr.serviceId;
+ candidate->security_ix = sp->hdr.securityIndex;
+ candidate->out_clientflag = 0;
+ candidate->state = RXRPC_CONN_SERVER;
+ if (candidate->params.service_id)
+ candidate->state = RXRPC_CONN_SERVER_UNSECURED;
+
+ write_lock_bh(&peer->conn_lock);
+
+ pp = &peer->service_conns.rb_node;
+ p = NULL;
+ while (*pp) {
+ p = *pp;
+ conn = rb_entry(p, struct rxrpc_connection, service_node);
+
+ if (epoch < conn->proto.epoch)
+ pp = &(*pp)->rb_left;
+ else if (epoch > conn->proto.epoch)
+ pp = &(*pp)->rb_right;
+ else if (cid < conn->proto.cid)
+ pp = &(*pp)->rb_left;
+ else if (cid > conn->proto.cid)
+ pp = &(*pp)->rb_right;
+ else
+ goto found_extant_second;
+ }
+
+ /* we can now add the new candidate to the list */
+ conn = candidate;
+ candidate = NULL;
+ rb_link_node(&conn->service_node, p, pp);
+ rb_insert_color(&conn->service_node, &peer->service_conns);
+ rxrpc_get_peer(peer);
+ rxrpc_get_local(local);
+
+ write_unlock_bh(&peer->conn_lock);
+
+ write_lock(&rxrpc_connection_lock);
+ list_add_tail(&conn->link, &rxrpc_connections);
+ write_unlock(&rxrpc_connection_lock);
+
+ new = "new";
+
+success:
+ _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid);
+
+ _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
+ return conn;
+
+ /* we found the connection in the list immediately */
+found_extant_connection:
+ if (sp->hdr.securityIndex != conn->security_ix) {
+ read_unlock_bh(&peer->conn_lock);
+ goto security_mismatch;
+ }
+ rxrpc_get_connection(conn);
+ read_unlock_bh(&peer->conn_lock);
+ goto success;
+
+ /* we found the connection on the second time through the list */
+found_extant_second:
+ if (sp->hdr.securityIndex != conn->security_ix) {
+ write_unlock_bh(&peer->conn_lock);
+ goto security_mismatch;
+ }
+ rxrpc_get_connection(conn);
+ write_unlock_bh(&peer->conn_lock);
+ kfree(candidate);
+ goto success;
+
+security_mismatch:
+ kfree(candidate);
+ _leave(" = -EKEYREJECTED");
+ return ERR_PTR(-EKEYREJECTED);
+}
+
+/*
+ * find a connection based on transport and RxRPC connection ID for an incoming
+ * packet
+ */
+struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *local,
+ struct rxrpc_peer *peer,
+ struct sk_buff *skb)
+{
+ struct rxrpc_connection *conn;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rb_node *p;
+ u32 epoch, cid;
+
+ _enter(",{%x,%x}", sp->hdr.cid, sp->hdr.flags);
+
+ read_lock_bh(&peer->conn_lock);
+
+ cid = sp->hdr.cid & RXRPC_CIDMASK;
+ epoch = sp->hdr.epoch;
+
+ if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) {
+ p = peer->service_conns.rb_node;
+ while (p) {
+ conn = rb_entry(p, struct rxrpc_connection, service_node);
+
+ _debug("maybe %x", conn->proto.cid);
+
+ if (epoch < conn->proto.epoch)
+ p = p->rb_left;
+ else if (epoch > conn->proto.epoch)
+ p = p->rb_right;
+ else if (cid < conn->proto.cid)
+ p = p->rb_left;
+ else if (cid > conn->proto.cid)
+ p = p->rb_right;
+ else
+ goto found;
+ }
+ } else {
+ conn = idr_find(&rxrpc_client_conn_ids, cid >> RXRPC_CIDSHIFT);
+ if (conn && conn->proto.epoch == epoch)
+ goto found;
+ }
+
+ read_unlock_bh(&peer->conn_lock);
+ _leave(" = NULL");
+ return NULL;
+
+found:
+ rxrpc_get_connection(conn);
+ read_unlock_bh(&peer->conn_lock);
+ _leave(" = %p", conn);
+ return conn;
+}
+
+/*
+ * Disconnect a call and clear any channel it occupies when that call
+ * terminates.
+ */
+void rxrpc_disconnect_call(struct rxrpc_call *call)
+{
+ struct rxrpc_connection *conn = call->conn;
+ unsigned chan = call->channel;
+
+ _enter("%d,%d", conn->debug_id, call->channel);
+
+ if (conn->channels[chan] == call) {
+ rcu_assign_pointer(conn->channels[chan], NULL);
+ atomic_inc(&conn->avail_chans);
+ wake_up(&conn->channel_wq);
+ }
+}
+
+/*
+ * release a virtual connection
+ */
+void rxrpc_put_connection(struct rxrpc_connection *conn)
+{
+ if (!conn)
+ return;
+
+ _enter("%p{u=%d,d=%d}",
+ conn, atomic_read(&conn->usage), conn->debug_id);
+
+ ASSERTCMP(atomic_read(&conn->usage), >, 0);
+
+ conn->put_time = ktime_get_seconds();
+ if (atomic_dec_and_test(&conn->usage)) {
+ _debug("zombie");
+ rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
+ }
+
+ _leave("");
+}
+
+/*
+ * destroy a virtual connection
+ */
+static void rxrpc_destroy_connection(struct rxrpc_connection *conn)
+{
+ _enter("%p{%d}", conn, atomic_read(&conn->usage));
+
+ ASSERTCMP(atomic_read(&conn->usage), ==, 0);
+
+ _net("DESTROY CONN %d", conn->debug_id);
+
+ ASSERT(RB_EMPTY_ROOT(&conn->calls));
+ rxrpc_purge_queue(&conn->rx_queue);
+
+ conn->security->clear(conn);
+ key_put(conn->params.key);
+ key_put(conn->server_key);
+ rxrpc_put_peer(conn->params.peer);
+ rxrpc_put_local(conn->params.local);
+
+ kfree(conn);
+ _leave("");
+}
+
+/*
+ * reap dead connections
+ */
+static void rxrpc_connection_reaper(struct work_struct *work)
+{
+ struct rxrpc_connection *conn, *_p;
+ struct rxrpc_peer *peer;
+ unsigned long now, earliest, reap_time;
+
+ LIST_HEAD(graveyard);
+
+ _enter("");
+
+ now = ktime_get_seconds();
+ earliest = ULONG_MAX;
+
+ write_lock(&rxrpc_connection_lock);
+ list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) {
+ _debug("reap CONN %d { u=%d,t=%ld }",
+ conn->debug_id, atomic_read(&conn->usage),
+ (long) now - (long) conn->put_time);
+
+ if (likely(atomic_read(&conn->usage) > 0))
+ continue;
+
+ if (rxrpc_conn_is_client(conn)) {
+ struct rxrpc_local *local = conn->params.local;
+ spin_lock(&local->client_conns_lock);
+ reap_time = conn->put_time + rxrpc_connection_expiry;
+
+ if (atomic_read(&conn->usage) > 0) {
+ ;
+ } else if (reap_time <= now) {
+ list_move_tail(&conn->link, &graveyard);
+ rxrpc_put_client_connection_id(conn);
+ rb_erase(&conn->client_node,
+ &local->client_conns);
+ } else if (reap_time < earliest) {
+ earliest = reap_time;
+ }
+
+ spin_unlock(&local->client_conns_lock);
+ } else {
+ peer = conn->params.peer;
+ write_lock_bh(&peer->conn_lock);
+ reap_time = conn->put_time + rxrpc_connection_expiry;
+
+ if (atomic_read(&conn->usage) > 0) {
+ ;
+ } else if (reap_time <= now) {
+ list_move_tail(&conn->link, &graveyard);
+ rb_erase(&conn->service_node,
+ &peer->service_conns);
+ } else if (reap_time < earliest) {
+ earliest = reap_time;
+ }
+
+ write_unlock_bh(&peer->conn_lock);
+ }
+ }
+ write_unlock(&rxrpc_connection_lock);
+
+ if (earliest != ULONG_MAX) {
+ _debug("reschedule reaper %ld", (long) earliest - now);
+ ASSERTCMP(earliest, >, now);
+ rxrpc_queue_delayed_work(&rxrpc_connection_reap,
+ (earliest - now) * HZ);
+ }
+
+ /* then destroy all those pulled out */
+ while (!list_empty(&graveyard)) {
+ conn = list_entry(graveyard.next, struct rxrpc_connection,
+ link);
+ list_del_init(&conn->link);
+
+ ASSERTCMP(atomic_read(&conn->usage), ==, 0);
+ rxrpc_destroy_connection(conn);
+ }
+
+ _leave("");
+}
+
+/*
+ * preemptively destroy all the connection records rather than waiting for them
+ * to time out
+ */
+void __exit rxrpc_destroy_all_connections(void)
+{
+ _enter("");
+
+ rxrpc_connection_expiry = 0;
+ cancel_delayed_work(&rxrpc_connection_reap);
+ rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
+
+ _leave("");
+}
diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/input.c
index e0815a0..f4bd57b 100644
--- a/net/rxrpc/ar-input.c
+++ b/net/rxrpc/input.c
@@ -360,7 +360,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
case RXRPC_PACKET_TYPE_BUSY:
_proto("Rx BUSY %%%u", sp->hdr.serial);
- if (call->conn->out_clientflag)
+ if (rxrpc_conn_is_service(call->conn))
goto protocol_error;
write_lock_bh(&call->state_lock);
@@ -533,7 +533,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call,
case RXRPC_CALL_COMPLETE:
case RXRPC_CALL_CLIENT_FINAL_ACK:
/* complete server call */
- if (call->conn->in_clientflag)
+ if (rxrpc_conn_is_service(call->conn))
goto dead_call;
/* resend last packet of a completed call */
_debug("final ack again");
@@ -560,7 +560,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call,
dead_call:
if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) {
skb->priority = RX_CALL_DEAD;
- rxrpc_reject_packet(call->conn->trans->local, skb);
+ rxrpc_reject_packet(call->conn->params.local, skb);
goto unlock;
}
free_unlock:
@@ -580,7 +580,7 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn,
{
_enter("%p,%p", conn, skb);
- atomic_inc(&conn->usage);
+ rxrpc_get_connection(conn);
skb_queue_tail(&conn->rx_queue, skb);
rxrpc_queue_conn(conn);
}
@@ -594,9 +594,8 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local,
{
_enter("%p,%p", local, skb);
- atomic_inc(&local->usage);
skb_queue_tail(&local->event_queue, skb);
- rxrpc_queue_work(&local->event_processor);
+ rxrpc_queue_work(&local->processor);
}
/*
@@ -629,29 +628,27 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
}
static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local,
- struct sk_buff *skb,
- struct rxrpc_skb_priv *sp)
+ struct sk_buff *skb)
{
struct rxrpc_peer *peer;
- struct rxrpc_transport *trans;
struct rxrpc_connection *conn;
+ struct sockaddr_rxrpc srx;
- peer = rxrpc_find_peer(local, ip_hdr(skb)->saddr,
- udp_hdr(skb)->source);
- if (IS_ERR(peer))
- goto cant_find_conn;
-
- trans = rxrpc_find_transport(local, peer);
- rxrpc_put_peer(peer);
- if (!trans)
- goto cant_find_conn;
+ rxrpc_get_addr_from_skb(local, skb, &srx);
+ rcu_read_lock();
+ peer = rxrpc_lookup_peer_rcu(local, &srx);
+ if (!peer)
+ goto cant_find_peer;
- conn = rxrpc_find_connection(trans, &sp->hdr);
- rxrpc_put_transport(trans);
+ conn = rxrpc_find_connection(local, peer, skb);
+ rcu_read_unlock();
if (!conn)
goto cant_find_conn;
return conn;
+
+cant_find_peer:
+ rcu_read_unlock();
cant_find_conn:
return NULL;
}
@@ -659,11 +656,15 @@ cant_find_conn:
/*
* handle data received on the local endpoint
* - may be called in interrupt context
+ *
+ * The socket is locked by the caller and this prevents the socket from being
+ * shut down and the local endpoint from going away, thus sk_user_data will not
+ * be cleared until this function returns.
*/
void rxrpc_data_ready(struct sock *sk)
{
struct rxrpc_skb_priv *sp;
- struct rxrpc_local *local;
+ struct rxrpc_local *local = sk->sk_user_data;
struct sk_buff *skb;
int ret;
@@ -671,21 +672,8 @@ void rxrpc_data_ready(struct sock *sk)
ASSERT(!irqs_disabled());
- read_lock_bh(&rxrpc_local_lock);
- local = sk->sk_user_data;
- if (local && atomic_read(&local->usage) > 0)
- rxrpc_get_local(local);
- else
- local = NULL;
- read_unlock_bh(&rxrpc_local_lock);
- if (!local) {
- _leave(" [local dead]");
- return;
- }
-
skb = skb_recv_datagram(sk, 0, 1, &ret);
if (!skb) {
- rxrpc_put_local(local);
if (ret == -EAGAIN)
return;
_debug("UDP socket error %d", ret);
@@ -699,7 +687,6 @@ void rxrpc_data_ready(struct sock *sk)
/* we'll probably need to checksum it (didn't call sock_recvmsg) */
if (skb_checksum_complete(skb)) {
rxrpc_free_skb(skb);
- rxrpc_put_local(local);
__UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0);
_leave(" [CSUM failed]");
return;
@@ -745,7 +732,7 @@ void rxrpc_data_ready(struct sock *sk)
* old-fashioned way doesn't really hurt */
struct rxrpc_connection *conn;
- conn = rxrpc_conn_from_local(local, skb, sp);
+ conn = rxrpc_conn_from_local(local, skb);
if (!conn)
goto cant_route_call;
@@ -764,7 +751,6 @@ void rxrpc_data_ready(struct sock *sk)
}
out:
- rxrpc_put_local(local);
return;
cant_route_call:
@@ -774,8 +760,7 @@ cant_route_call:
if (sp->hdr.seq == 1) {
_debug("first packet");
skb_queue_tail(&local->accept_queue, skb);
- rxrpc_queue_work(&local->acceptor);
- rxrpc_put_local(local);
+ rxrpc_queue_work(&local->processor);
_leave(" [incoming]");
return;
}
@@ -788,13 +773,11 @@ cant_route_call:
_debug("reject type %d",sp->hdr.type);
rxrpc_reject_packet(local, skb);
}
- rxrpc_put_local(local);
_leave(" [no call]");
return;
bad_message:
skb->priority = RX_PROTOCOL_ERROR;
rxrpc_reject_packet(local, skb);
- rxrpc_put_local(local);
_leave(" [badmsg]");
}
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/key.c
index 4ad56fa..18c737a 100644
--- a/net/rxrpc/ar-key.c
+++ b/net/rxrpc/key.c
@@ -987,7 +987,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
if (ret < 0)
goto error;
- conn->key = key;
+ conn->params.key = key;
_leave(" = 0 [%d]", key_serial(key));
return 0;
diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c
new file mode 100644
index 0000000..31a3f86
--- /dev/null
+++ b/net/rxrpc/local_event.c
@@ -0,0 +1,116 @@
+/* AF_RXRPC local endpoint management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/udp.h>
+#include <linux/ip.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <generated/utsrelease.h>
+#include "ar-internal.h"
+
+static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC";
+
+/*
+ * Reply to a version request
+ */
+static void rxrpc_send_version_request(struct rxrpc_local *local,
+ struct rxrpc_host_header *hdr,
+ struct sk_buff *skb)
+{
+ struct rxrpc_wire_header whdr;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct sockaddr_in sin;
+ struct msghdr msg;
+ struct kvec iov[2];
+ size_t len;
+ int ret;
+
+ _enter("");
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = udp_hdr(skb)->source;
+ sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
+
+ msg.msg_name = &sin;
+ msg.msg_namelen = sizeof(sin);
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ whdr.epoch = htonl(sp->hdr.epoch);
+ whdr.cid = htonl(sp->hdr.cid);
+ whdr.callNumber = htonl(sp->hdr.callNumber);
+ whdr.seq = 0;
+ whdr.serial = 0;
+ whdr.type = RXRPC_PACKET_TYPE_VERSION;
+ whdr.flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED);
+ whdr.userStatus = 0;
+ whdr.securityIndex = 0;
+ whdr._rsvd = 0;
+ whdr.serviceId = htons(sp->hdr.serviceId);
+
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
+ iov[1].iov_base = (char *)rxrpc_version_string;
+ iov[1].iov_len = sizeof(rxrpc_version_string);
+
+ len = iov[0].iov_len + iov[1].iov_len;
+
+ _proto("Tx VERSION (reply)");
+
+ ret = kernel_sendmsg(local->socket, &msg, iov, 2, len);
+ if (ret < 0)
+ _debug("sendmsg failed: %d", ret);
+
+ _leave("");
+}
+
+/*
+ * Process event packets targetted at a local endpoint.
+ */
+void rxrpc_process_local_events(struct rxrpc_local *local)
+{
+ struct sk_buff *skb;
+ char v;
+
+ _enter("");
+
+ skb = skb_dequeue(&local->event_queue);
+ if (skb) {
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ _debug("{%d},{%u}", local->debug_id, sp->hdr.type);
+
+ switch (sp->hdr.type) {
+ case RXRPC_PACKET_TYPE_VERSION:
+ if (skb_copy_bits(skb, 0, &v, 1) < 0)
+ return;
+ _proto("Rx VERSION { %02x }", v);
+ if (v == 0)
+ rxrpc_send_version_request(local, &sp->hdr, skb);
+ break;
+
+ default:
+ /* Just ignore anything we don't understand */
+ break;
+ }
+
+ rxrpc_free_skb(skb);
+ }
+
+ _leave("");
+}
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
new file mode 100644
index 0000000..3ab7764
--- /dev/null
+++ b/net/rxrpc/local_object.c
@@ -0,0 +1,387 @@
+/* Local endpoint object management
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/udp.h>
+#include <linux/ip.h>
+#include <linux/hashtable.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+static void rxrpc_local_processor(struct work_struct *);
+static void rxrpc_local_rcu(struct rcu_head *);
+
+static DEFINE_MUTEX(rxrpc_local_mutex);
+static LIST_HEAD(rxrpc_local_endpoints);
+
+/*
+ * Compare a local to an address. Return -ve, 0 or +ve to indicate less than,
+ * same or greater than.
+ *
+ * We explicitly don't compare the RxRPC service ID as we want to reject
+ * conflicting uses by differing services. Further, we don't want to share
+ * addresses with different options (IPv6), so we don't compare those bits
+ * either.
+ */
+static long rxrpc_local_cmp_key(const struct rxrpc_local *local,
+ const struct sockaddr_rxrpc *srx)
+{
+ long diff;
+
+ diff = ((local->srx.transport_type - srx->transport_type) ?:
+ (local->srx.transport_len - srx->transport_len) ?:
+ (local->srx.transport.family - srx->transport.family));
+ if (diff != 0)
+ return diff;
+
+ switch (srx->transport.family) {
+ case AF_INET:
+ /* If the choice of UDP port is left up to the transport, then
+ * the endpoint record doesn't match.
+ */
+ return ((u16 __force)local->srx.transport.sin.sin_port -
+ (u16 __force)srx->transport.sin.sin_port) ?:
+ memcmp(&local->srx.transport.sin.sin_addr,
+ &srx->transport.sin.sin_addr,
+ sizeof(struct in_addr));
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Allocate a new local endpoint.
+ */
+static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx)
+{
+ struct rxrpc_local *local;
+
+ local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL);
+ if (local) {
+ atomic_set(&local->usage, 1);
+ INIT_LIST_HEAD(&local->link);
+ INIT_WORK(&local->processor, rxrpc_local_processor);
+ INIT_LIST_HEAD(&local->services);
+ init_rwsem(&local->defrag_sem);
+ skb_queue_head_init(&local->accept_queue);
+ skb_queue_head_init(&local->reject_queue);
+ skb_queue_head_init(&local->event_queue);
+ local->client_conns = RB_ROOT;
+ spin_lock_init(&local->client_conns_lock);
+ spin_lock_init(&local->lock);
+ rwlock_init(&local->services_lock);
+ local->debug_id = atomic_inc_return(&rxrpc_debug_id);
+ memcpy(&local->srx, srx, sizeof(*srx));
+ }
+
+ _leave(" = %p", local);
+ return local;
+}
+
+/*
+ * create the local socket
+ * - must be called with rxrpc_local_mutex locked
+ */
+static int rxrpc_open_socket(struct rxrpc_local *local)
+{
+ struct sock *sock;
+ int ret, opt;
+
+ _enter("%p{%d}", local, local->srx.transport_type);
+
+ /* create a socket to represent the local endpoint */
+ ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type,
+ IPPROTO_UDP, &local->socket);
+ if (ret < 0) {
+ _leave(" = %d [socket]", ret);
+ return ret;
+ }
+
+ /* if a local address was supplied then bind it */
+ if (local->srx.transport_len > sizeof(sa_family_t)) {
+ _debug("bind");
+ ret = kernel_bind(local->socket,
+ (struct sockaddr *)&local->srx.transport,
+ local->srx.transport_len);
+ if (ret < 0) {
+ _debug("bind failed %d", ret);
+ goto error;
+ }
+ }
+
+ /* we want to receive ICMP errors */
+ opt = 1;
+ ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR,
+ (char *) &opt, sizeof(opt));
+ if (ret < 0) {
+ _debug("setsockopt failed");
+ goto error;
+ }
+
+ /* we want to set the don't fragment bit */
+ opt = IP_PMTUDISC_DO;
+ ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER,
+ (char *) &opt, sizeof(opt));
+ if (ret < 0) {
+ _debug("setsockopt failed");
+ goto error;
+ }
+
+ /* set the socket up */
+ sock = local->socket->sk;
+ sock->sk_user_data = local;
+ sock->sk_data_ready = rxrpc_data_ready;
+ sock->sk_error_report = rxrpc_error_report;
+ _leave(" = 0");
+ return 0;
+
+error:
+ kernel_sock_shutdown(local->socket, SHUT_RDWR);
+ local->socket->sk->sk_user_data = NULL;
+ sock_release(local->socket);
+ local->socket = NULL;
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Look up or create a new local endpoint using the specified local address.
+ */
+struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx)
+{
+ struct rxrpc_local *local;
+ struct list_head *cursor;
+ const char *age;
+ long diff;
+ int ret;
+
+ if (srx->transport.family == AF_INET) {
+ _enter("{%d,%u,%pI4+%hu}",
+ srx->transport_type,
+ srx->transport.family,
+ &srx->transport.sin.sin_addr,
+ ntohs(srx->transport.sin.sin_port));
+ } else {
+ _enter("{%d,%u}",
+ srx->transport_type,
+ srx->transport.family);
+ return ERR_PTR(-EAFNOSUPPORT);
+ }
+
+ mutex_lock(&rxrpc_local_mutex);
+
+ for (cursor = rxrpc_local_endpoints.next;
+ cursor != &rxrpc_local_endpoints;
+ cursor = cursor->next) {
+ local = list_entry(cursor, struct rxrpc_local, link);
+
+ diff = rxrpc_local_cmp_key(local, srx);
+ if (diff < 0)
+ continue;
+ if (diff > 0)
+ break;
+
+ /* Services aren't allowed to share transport sockets, so
+ * reject that here. It is possible that the object is dying -
+ * but it may also still have the local transport address that
+ * we want bound.
+ */
+ if (srx->srx_service) {
+ local = NULL;
+ goto addr_in_use;
+ }
+
+ /* Found a match. We replace a dying object. Attempting to
+ * bind the transport socket may still fail if we're attempting
+ * to use a local address that the dying object is still using.
+ */
+ if (!rxrpc_get_local_maybe(local)) {
+ cursor = cursor->next;
+ list_del_init(&local->link);
+ break;
+ }
+
+ age = "old";
+ goto found;
+ }
+
+ local = rxrpc_alloc_local(srx);
+ if (!local)
+ goto nomem;
+
+ ret = rxrpc_open_socket(local);
+ if (ret < 0)
+ goto sock_error;
+
+ list_add_tail(&local->link, cursor);
+ age = "new";
+
+found:
+ mutex_unlock(&rxrpc_local_mutex);
+
+ _net("LOCAL %s %d {%d,%u,%pI4+%hu}",
+ age,
+ local->debug_id,
+ local->srx.transport_type,
+ local->srx.transport.family,
+ &local->srx.transport.sin.sin_addr,
+ ntohs(local->srx.transport.sin.sin_port));
+
+ _leave(" = %p", local);
+ return local;
+
+nomem:
+ ret = -ENOMEM;
+sock_error:
+ mutex_unlock(&rxrpc_local_mutex);
+ kfree(local);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+
+addr_in_use:
+ mutex_unlock(&rxrpc_local_mutex);
+ _leave(" = -EADDRINUSE");
+ return ERR_PTR(-EADDRINUSE);
+}
+
+/*
+ * A local endpoint reached its end of life.
+ */
+void __rxrpc_put_local(struct rxrpc_local *local)
+{
+ _enter("%d", local->debug_id);
+ rxrpc_queue_work(&local->processor);
+}
+
+/*
+ * Destroy a local endpoint's socket and then hand the record to RCU to dispose
+ * of.
+ *
+ * Closing the socket cannot be done from bottom half context or RCU callback
+ * context because it might sleep.
+ */
+static void rxrpc_local_destroyer(struct rxrpc_local *local)
+{
+ struct socket *socket = local->socket;
+
+ _enter("%d", local->debug_id);
+
+ /* We can get a race between an incoming call packet queueing the
+ * processor again and the work processor starting the destruction
+ * process which will shut down the UDP socket.
+ */
+ if (local->dead) {
+ _leave(" [already dead]");
+ return;
+ }
+ local->dead = true;
+
+ mutex_lock(&rxrpc_local_mutex);
+ list_del_init(&local->link);
+ mutex_unlock(&rxrpc_local_mutex);
+
+ ASSERT(RB_EMPTY_ROOT(&local->client_conns));
+ ASSERT(list_empty(&local->services));
+
+ if (socket) {
+ local->socket = NULL;
+ kernel_sock_shutdown(socket, SHUT_RDWR);
+ socket->sk->sk_user_data = NULL;
+ sock_release(socket);
+ }
+
+ /* At this point, there should be no more packets coming in to the
+ * local endpoint.
+ */
+ rxrpc_purge_queue(&local->accept_queue);
+ rxrpc_purge_queue(&local->reject_queue);
+ rxrpc_purge_queue(&local->event_queue);
+
+ _debug("rcu local %d", local->debug_id);
+ call_rcu(&local->rcu, rxrpc_local_rcu);
+}
+
+/*
+ * Process events on an endpoint
+ */
+static void rxrpc_local_processor(struct work_struct *work)
+{
+ struct rxrpc_local *local =
+ container_of(work, struct rxrpc_local, processor);
+ bool again;
+
+ _enter("%d", local->debug_id);
+
+ do {
+ again = false;
+ if (atomic_read(&local->usage) == 0)
+ return rxrpc_local_destroyer(local);
+
+ if (!skb_queue_empty(&local->accept_queue)) {
+ rxrpc_accept_incoming_calls(local);
+ again = true;
+ }
+
+ if (!skb_queue_empty(&local->reject_queue)) {
+ rxrpc_reject_packets(local);
+ again = true;
+ }
+
+ if (!skb_queue_empty(&local->event_queue)) {
+ rxrpc_process_local_events(local);
+ again = true;
+ }
+ } while (again);
+}
+
+/*
+ * Destroy a local endpoint after the RCU grace period expires.
+ */
+static void rxrpc_local_rcu(struct rcu_head *rcu)
+{
+ struct rxrpc_local *local = container_of(rcu, struct rxrpc_local, rcu);
+
+ _enter("%d", local->debug_id);
+
+ ASSERT(!work_pending(&local->processor));
+
+ _net("DESTROY LOCAL %d", local->debug_id);
+ kfree(local);
+ _leave("");
+}
+
+/*
+ * Verify the local endpoint list is empty by this point.
+ */
+void __exit rxrpc_destroy_all_locals(void)
+{
+ struct rxrpc_local *local;
+
+ _enter("");
+
+ if (list_empty(&rxrpc_local_endpoints))
+ return;
+
+ mutex_lock(&rxrpc_local_mutex);
+ list_for_each_entry(local, &rxrpc_local_endpoints, link) {
+ pr_err("AF_RXRPC: Leaked local %p {%d}\n",
+ local, atomic_read(&local->usage));
+ }
+ mutex_unlock(&rxrpc_local_mutex);
+ BUG();
+}
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/output.c
index 2e3c406..f4bda06 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/output.c
@@ -35,7 +35,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
unsigned long *user_call_ID,
enum rxrpc_command *command,
- u32 *abort_code)
+ u32 *abort_code,
+ bool *_exclusive)
{
struct cmsghdr *cmsg;
bool got_user_ID = false;
@@ -93,6 +94,11 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
return -EINVAL;
break;
+ case RXRPC_EXCLUSIVE_CALL:
+ *_exclusive = true;
+ if (len != 0)
+ return -EINVAL;
+ break;
default:
return -EINVAL;
}
@@ -131,13 +137,11 @@ static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code)
*/
static struct rxrpc_call *
rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
- unsigned long user_call_ID)
+ unsigned long user_call_ID, bool exclusive)
{
- struct rxrpc_conn_bundle *bundle;
- struct rxrpc_transport *trans;
+ struct rxrpc_conn_parameters cp;
struct rxrpc_call *call;
struct key *key;
- long ret;
DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name);
@@ -146,39 +150,20 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
if (!msg->msg_name)
return ERR_PTR(-EDESTADDRREQ);
- trans = rxrpc_name_to_transport(rx, msg->msg_name, msg->msg_namelen, 0,
- GFP_KERNEL);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
-
key = rx->key;
if (key && !rx->key->payload.data[0])
key = NULL;
- bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, GFP_KERNEL);
- if (IS_ERR(bundle)) {
- ret = PTR_ERR(bundle);
- goto out_trans;
- }
- call = rxrpc_new_client_call(rx, trans, bundle, user_call_ID,
- GFP_KERNEL);
- rxrpc_put_bundle(trans, bundle);
- rxrpc_put_transport(trans);
- if (IS_ERR(call)) {
- ret = PTR_ERR(call);
- goto out_trans;
- }
+ memset(&cp, 0, sizeof(cp));
+ cp.local = rx->local;
+ cp.key = rx->key;
+ cp.security_level = rx->min_sec_level;
+ cp.exclusive = rx->exclusive | exclusive;
+ cp.service_id = srx->srx_service;
+ call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL);
_leave(" = %p\n", call);
return call;
-
-out_trans:
- rxrpc_put_transport(trans);
-out:
- _leave(" = %ld", ret);
- return ERR_PTR(ret);
}
/*
@@ -191,12 +176,14 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
enum rxrpc_command cmd;
struct rxrpc_call *call;
unsigned long user_call_ID = 0;
+ bool exclusive = false;
u32 abort_code = 0;
int ret;
_enter("");
- ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code);
+ ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code,
+ &exclusive);
if (ret < 0)
return ret;
@@ -214,7 +201,8 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
if (!call) {
if (cmd != RXRPC_CMD_SEND_DATA)
return -EBADSLT;
- call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID);
+ call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID,
+ exclusive);
if (IS_ERR(call))
return PTR_ERR(call);
}
@@ -319,7 +307,7 @@ EXPORT_SYMBOL(rxrpc_kernel_abort_call);
/*
* send a packet through the transport endpoint
*/
-int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb)
+int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb)
{
struct kvec iov[1];
struct msghdr msg;
@@ -330,30 +318,30 @@ int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb)
iov[0].iov_base = skb->head;
iov[0].iov_len = skb->len;
- msg.msg_name = &trans->peer->srx.transport.sin;
- msg.msg_namelen = sizeof(trans->peer->srx.transport.sin);
+ msg.msg_name = &conn->params.peer->srx.transport;
+ msg.msg_namelen = conn->params.peer->srx.transport_len;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
/* send the packet with the don't fragment bit set if we currently
* think it's small enough */
- if (skb->len - sizeof(struct rxrpc_wire_header) < trans->peer->maxdata) {
- down_read(&trans->local->defrag_sem);
+ if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) {
+ down_read(&conn->params.local->defrag_sem);
/* send the packet by UDP
* - returns -EMSGSIZE if UDP would have to fragment the packet
* to go out of the interface
* - in which case, we'll have processed the ICMP error
* message and update the peer record
*/
- ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1,
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1,
iov[0].iov_len);
- up_read(&trans->local->defrag_sem);
+ up_read(&conn->params.local->defrag_sem);
if (ret == -EMSGSIZE)
goto send_fragmentable;
- _leave(" = %d [%u]", ret, trans->peer->maxdata);
+ _leave(" = %d [%u]", ret, conn->params.peer->maxdata);
return ret;
}
@@ -361,21 +349,28 @@ send_fragmentable:
/* attempt to send this message with fragmentation enabled */
_debug("send fragment");
- down_write(&trans->local->defrag_sem);
- opt = IP_PMTUDISC_DONT;
- ret = kernel_setsockopt(trans->local->socket, SOL_IP, IP_MTU_DISCOVER,
- (char *) &opt, sizeof(opt));
- if (ret == 0) {
- ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1,
- iov[0].iov_len);
-
- opt = IP_PMTUDISC_DO;
- kernel_setsockopt(trans->local->socket, SOL_IP,
- IP_MTU_DISCOVER, (char *) &opt, sizeof(opt));
+ down_write(&conn->params.local->defrag_sem);
+
+ switch (conn->params.local->srx.transport.family) {
+ case AF_INET:
+ opt = IP_PMTUDISC_DONT;
+ ret = kernel_setsockopt(conn->params.local->socket,
+ SOL_IP, IP_MTU_DISCOVER,
+ (char *)&opt, sizeof(opt));
+ if (ret == 0) {
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1,
+ iov[0].iov_len);
+
+ opt = IP_PMTUDISC_DO;
+ kernel_setsockopt(conn->params.local->socket, SOL_IP,
+ IP_MTU_DISCOVER,
+ (char *)&opt, sizeof(opt));
+ }
+ break;
}
- up_write(&trans->local->defrag_sem);
- _leave(" = %d [frag %u]", ret, trans->peer->maxdata);
+ up_write(&conn->params.local->defrag_sem);
+ _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata);
return ret;
}
@@ -487,7 +482,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
if (try_to_del_timer_sync(&call->ack_timer) >= 0) {
/* the packet may be freed by rxrpc_process_call() before this
* returns */
- ret = rxrpc_send_packet(call->conn->trans, skb);
+ ret = rxrpc_send_data_packet(call->conn, skb);
_net("sent skb %p", skb);
} else {
_debug("failed to delete ACK timer");
@@ -573,7 +568,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
goto maybe_error;
}
- max = call->conn->trans->peer->maxdata;
+ max = call->conn->params.peer->maxdata;
max -= call->conn->security_size;
max &= ~(call->conn->size_align - 1UL);
@@ -664,7 +659,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
seq = atomic_inc_return(&call->sequence);
- sp->hdr.epoch = conn->epoch;
+ sp->hdr.epoch = conn->proto.epoch;
sp->hdr.cid = call->cid;
sp->hdr.callNumber = call->call_id;
sp->hdr.seq = seq;
@@ -707,7 +702,9 @@ out:
call_aborted:
rxrpc_free_skb(skb);
if (call->state == RXRPC_CALL_NETWORK_ERROR)
- ret = call->conn->trans->peer->net_error;
+ ret = call->error_report < RXRPC_LOCAL_ERROR_OFFSET ?
+ call->error_report :
+ call->error_report - RXRPC_LOCAL_ERROR_OFFSET;
else
ret = -ECONNABORTED;
_leave(" = %d", ret);
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
new file mode 100644
index 0000000..8940674
--- /dev/null
+++ b/net/rxrpc/peer_event.c
@@ -0,0 +1,281 @@
+/* Peer event handling, typically ICMP messages.
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include "ar-internal.h"
+
+static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *);
+
+/*
+ * Find the peer associated with an ICMP packet.
+ */
+static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
+ const struct sk_buff *skb)
+{
+ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+ struct sockaddr_rxrpc srx;
+
+ _enter("");
+
+ memset(&srx, 0, sizeof(srx));
+ srx.transport_type = local->srx.transport_type;
+ srx.transport.family = local->srx.transport.family;
+
+ /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice
+ * versa?
+ */
+ switch (srx.transport.family) {
+ case AF_INET:
+ srx.transport.sin.sin_port = serr->port;
+ srx.transport_len = sizeof(struct sockaddr_in);
+ switch (serr->ee.ee_origin) {
+ case SO_EE_ORIGIN_ICMP:
+ _net("Rx ICMP");
+ memcpy(&srx.transport.sin.sin_addr,
+ skb_network_header(skb) + serr->addr_offset,
+ sizeof(struct in_addr));
+ break;
+ case SO_EE_ORIGIN_ICMP6:
+ _net("Rx ICMP6 on v4 sock");
+ memcpy(&srx.transport.sin.sin_addr,
+ skb_network_header(skb) + serr->addr_offset + 12,
+ sizeof(struct in_addr));
+ break;
+ default:
+ memcpy(&srx.transport.sin.sin_addr, &ip_hdr(skb)->saddr,
+ sizeof(struct in_addr));
+ break;
+ }
+ break;
+
+ default:
+ BUG();
+ }
+
+ return rxrpc_lookup_peer_rcu(local, &srx);
+}
+
+/*
+ * Handle an MTU/fragmentation problem.
+ */
+static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, struct sock_exterr_skb *serr)
+{
+ u32 mtu = serr->ee.ee_info;
+
+ _net("Rx ICMP Fragmentation Needed (%d)", mtu);
+
+ /* wind down the local interface MTU */
+ if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) {
+ peer->if_mtu = mtu;
+ _net("I/F MTU %u", mtu);
+ }
+
+ if (mtu == 0) {
+ /* they didn't give us a size, estimate one */
+ mtu = peer->if_mtu;
+ if (mtu > 1500) {
+ mtu >>= 1;
+ if (mtu < 1500)
+ mtu = 1500;
+ } else {
+ mtu -= 100;
+ if (mtu < peer->hdrsize)
+ mtu = peer->hdrsize + 4;
+ }
+ }
+
+ if (mtu < peer->mtu) {
+ spin_lock_bh(&peer->lock);
+ peer->mtu = mtu;
+ peer->maxdata = peer->mtu - peer->hdrsize;
+ spin_unlock_bh(&peer->lock);
+ _net("Net MTU %u (maxdata %u)",
+ peer->mtu, peer->maxdata);
+ }
+}
+
+/*
+ * Handle an error received on the local endpoint.
+ */
+void rxrpc_error_report(struct sock *sk)
+{
+ struct sock_exterr_skb *serr;
+ struct rxrpc_local *local = sk->sk_user_data;
+ struct rxrpc_peer *peer;
+ struct sk_buff *skb;
+
+ _enter("%p{%d}", sk, local->debug_id);
+
+ skb = sock_dequeue_err_skb(sk);
+ if (!skb) {
+ _leave("UDP socket errqueue empty");
+ return;
+ }
+ serr = SKB_EXT_ERR(skb);
+ if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
+ _leave("UDP empty message");
+ kfree_skb(skb);
+ return;
+ }
+
+ rxrpc_new_skb(skb);
+
+ rcu_read_lock();
+ peer = rxrpc_lookup_peer_icmp_rcu(local, skb);
+ if (peer && !rxrpc_get_peer_maybe(peer))
+ peer = NULL;
+ if (!peer) {
+ rcu_read_unlock();
+ rxrpc_free_skb(skb);
+ _leave(" [no peer]");
+ return;
+ }
+
+ if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP &&
+ serr->ee.ee_type == ICMP_DEST_UNREACH &&
+ serr->ee.ee_code == ICMP_FRAG_NEEDED)) {
+ rxrpc_adjust_mtu(peer, serr);
+ rcu_read_unlock();
+ rxrpc_free_skb(skb);
+ rxrpc_put_peer(peer);
+ _leave(" [MTU update]");
+ return;
+ }
+
+ rxrpc_store_error(peer, serr);
+ rcu_read_unlock();
+ rxrpc_free_skb(skb);
+
+ /* The ref we obtained is passed off to the work item */
+ rxrpc_queue_work(&peer->error_distributor);
+ _leave("");
+}
+
+/*
+ * Map an error report to error codes on the peer record.
+ */
+static void rxrpc_store_error(struct rxrpc_peer *peer,
+ struct sock_exterr_skb *serr)
+{
+ struct sock_extended_err *ee;
+ int err;
+
+ _enter("");
+
+ ee = &serr->ee;
+
+ _net("Rx Error o=%d t=%d c=%d e=%d",
+ ee->ee_origin, ee->ee_type, ee->ee_code, ee->ee_errno);
+
+ err = ee->ee_errno;
+
+ switch (ee->ee_origin) {
+ case SO_EE_ORIGIN_ICMP:
+ switch (ee->ee_type) {
+ case ICMP_DEST_UNREACH:
+ switch (ee->ee_code) {
+ case ICMP_NET_UNREACH:
+ _net("Rx Received ICMP Network Unreachable");
+ break;
+ case ICMP_HOST_UNREACH:
+ _net("Rx Received ICMP Host Unreachable");
+ break;
+ case ICMP_PORT_UNREACH:
+ _net("Rx Received ICMP Port Unreachable");
+ break;
+ case ICMP_NET_UNKNOWN:
+ _net("Rx Received ICMP Unknown Network");
+ break;
+ case ICMP_HOST_UNKNOWN:
+ _net("Rx Received ICMP Unknown Host");
+ break;
+ default:
+ _net("Rx Received ICMP DestUnreach code=%u",
+ ee->ee_code);
+ break;
+ }
+ break;
+
+ case ICMP_TIME_EXCEEDED:
+ _net("Rx Received ICMP TTL Exceeded");
+ break;
+
+ default:
+ _proto("Rx Received ICMP error { type=%u code=%u }",
+ ee->ee_type, ee->ee_code);
+ break;
+ }
+ break;
+
+ case SO_EE_ORIGIN_NONE:
+ case SO_EE_ORIGIN_LOCAL:
+ _proto("Rx Received local error { error=%d }", err);
+ err += RXRPC_LOCAL_ERROR_OFFSET;
+ break;
+
+ case SO_EE_ORIGIN_ICMP6:
+ default:
+ _proto("Rx Received error report { orig=%u }", ee->ee_origin);
+ break;
+ }
+
+ peer->error_report = err;
+}
+
+/*
+ * Distribute an error that occurred on a peer
+ */
+void rxrpc_peer_error_distributor(struct work_struct *work)
+{
+ struct rxrpc_peer *peer =
+ container_of(work, struct rxrpc_peer, error_distributor);
+ struct rxrpc_call *call;
+ int error_report;
+
+ _enter("");
+
+ error_report = READ_ONCE(peer->error_report);
+
+ _debug("ISSUE ERROR %d", error_report);
+
+ spin_lock_bh(&peer->lock);
+
+ while (!hlist_empty(&peer->error_targets)) {
+ call = hlist_entry(peer->error_targets.first,
+ struct rxrpc_call, error_link);
+ hlist_del_init(&call->error_link);
+
+ write_lock(&call->state_lock);
+ if (call->state != RXRPC_CALL_COMPLETE &&
+ call->state < RXRPC_CALL_NETWORK_ERROR) {
+ call->error_report = error_report;
+ call->state = RXRPC_CALL_NETWORK_ERROR;
+ set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
+ rxrpc_queue_call(call);
+ }
+ write_unlock(&call->state_lock);
+ }
+
+ spin_unlock_bh(&peer->lock);
+
+ rxrpc_put_peer(peer);
+ _leave("");
+}
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
new file mode 100644
index 0000000..01d4930
--- /dev/null
+++ b/net/rxrpc/peer_object.c
@@ -0,0 +1,315 @@
+/* RxRPC remote transport endpoint record management
+ *
+ * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/hashtable.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include "ar-internal.h"
+
+static DEFINE_HASHTABLE(rxrpc_peer_hash, 10);
+static DEFINE_SPINLOCK(rxrpc_peer_hash_lock);
+
+/*
+ * Hash a peer key.
+ */
+static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local,
+ const struct sockaddr_rxrpc *srx)
+{
+ const u16 *p;
+ unsigned int i, size;
+ unsigned long hash_key;
+
+ _enter("");
+
+ hash_key = (unsigned long)local / __alignof__(*local);
+ hash_key += srx->transport_type;
+ hash_key += srx->transport_len;
+ hash_key += srx->transport.family;
+
+ switch (srx->transport.family) {
+ case AF_INET:
+ hash_key += (u16 __force)srx->transport.sin.sin_port;
+ size = sizeof(srx->transport.sin.sin_addr);
+ p = (u16 *)&srx->transport.sin.sin_addr;
+ break;
+ default:
+ WARN(1, "AF_RXRPC: Unsupported transport address family\n");
+ return 0;
+ }
+
+ /* Step through the peer address in 16-bit portions for speed */
+ for (i = 0; i < size; i += sizeof(*p), p++)
+ hash_key += *p;
+
+ _leave(" 0x%lx", hash_key);
+ return hash_key;
+}
+
+/*
+ * Compare a peer to a key. Return -ve, 0 or +ve to indicate less than, same
+ * or greater than.
+ *
+ * Unfortunately, the primitives in linux/hashtable.h don't allow for sorted
+ * buckets and mid-bucket insertion, so we don't make full use of this
+ * information at this point.
+ */
+static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer,
+ struct rxrpc_local *local,
+ const struct sockaddr_rxrpc *srx,
+ unsigned long hash_key)
+{
+ long diff;
+
+ diff = ((peer->hash_key - hash_key) ?:
+ ((unsigned long)peer->local - (unsigned long)local) ?:
+ (peer->srx.transport_type - srx->transport_type) ?:
+ (peer->srx.transport_len - srx->transport_len) ?:
+ (peer->srx.transport.family - srx->transport.family));
+ if (diff != 0)
+ return diff;
+
+ switch (srx->transport.family) {
+ case AF_INET:
+ return ((u16 __force)peer->srx.transport.sin.sin_port -
+ (u16 __force)srx->transport.sin.sin_port) ?:
+ memcmp(&peer->srx.transport.sin.sin_addr,
+ &srx->transport.sin.sin_addr,
+ sizeof(struct in_addr));
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Look up a remote transport endpoint for the specified address using RCU.
+ */
+static struct rxrpc_peer *__rxrpc_lookup_peer_rcu(
+ struct rxrpc_local *local,
+ const struct sockaddr_rxrpc *srx,
+ unsigned long hash_key)
+{
+ struct rxrpc_peer *peer;
+
+ hash_for_each_possible_rcu(rxrpc_peer_hash, peer, hash_link, hash_key) {
+ if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0) {
+ if (atomic_read(&peer->usage) == 0)
+ return NULL;
+ return peer;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Look up a remote transport endpoint for the specified address using RCU.
+ */
+struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local,
+ const struct sockaddr_rxrpc *srx)
+{
+ struct rxrpc_peer *peer;
+ unsigned long hash_key = rxrpc_peer_hash_key(local, srx);
+
+ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
+ if (peer) {
+ switch (srx->transport.family) {
+ case AF_INET:
+ _net("PEER %d {%d,%u,%pI4+%hu}",
+ peer->debug_id,
+ peer->srx.transport_type,
+ peer->srx.transport.family,
+ &peer->srx.transport.sin.sin_addr,
+ ntohs(peer->srx.transport.sin.sin_port));
+ break;
+ }
+
+ _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage));
+ }
+ return peer;
+}
+
+/*
+ * assess the MTU size for the network interface through which this peer is
+ * reached
+ */
+static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
+{
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ peer->if_mtu = 1500;
+
+ rt = ip_route_output_ports(&init_net, &fl4, NULL,
+ peer->srx.transport.sin.sin_addr.s_addr, 0,
+ htons(7000), htons(7001),
+ IPPROTO_UDP, 0, 0);
+ if (IS_ERR(rt)) {
+ _leave(" [route err %ld]", PTR_ERR(rt));
+ return;
+ }
+
+ peer->if_mtu = dst_mtu(&rt->dst);
+ dst_release(&rt->dst);
+
+ _leave(" [if_mtu %u]", peer->if_mtu);
+}
+
+/*
+ * Allocate a peer.
+ */
+struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
+{
+ struct rxrpc_peer *peer;
+
+ _enter("");
+
+ peer = kzalloc(sizeof(struct rxrpc_peer), gfp);
+ if (peer) {
+ atomic_set(&peer->usage, 1);
+ peer->local = local;
+ INIT_HLIST_HEAD(&peer->error_targets);
+ INIT_WORK(&peer->error_distributor,
+ &rxrpc_peer_error_distributor);
+ peer->service_conns = RB_ROOT;
+ rwlock_init(&peer->conn_lock);
+ spin_lock_init(&peer->lock);
+ peer->debug_id = atomic_inc_return(&rxrpc_debug_id);
+ }
+
+ _leave(" = %p", peer);
+ return peer;
+}
+
+/*
+ * Set up a new peer.
+ */
+static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
+ struct sockaddr_rxrpc *srx,
+ unsigned long hash_key,
+ gfp_t gfp)
+{
+ struct rxrpc_peer *peer;
+
+ _enter("");
+
+ peer = rxrpc_alloc_peer(local, gfp);
+ if (peer) {
+ peer->hash_key = hash_key;
+ memcpy(&peer->srx, srx, sizeof(*srx));
+
+ rxrpc_assess_MTU_size(peer);
+ peer->mtu = peer->if_mtu;
+
+ if (srx->transport.family == AF_INET) {
+ peer->hdrsize = sizeof(struct iphdr);
+ switch (srx->transport_type) {
+ case SOCK_DGRAM:
+ peer->hdrsize += sizeof(struct udphdr);
+ break;
+ default:
+ BUG();
+ break;
+ }
+ } else {
+ BUG();
+ }
+
+ peer->hdrsize += sizeof(struct rxrpc_wire_header);
+ peer->maxdata = peer->mtu - peer->hdrsize;
+ }
+
+ _leave(" = %p", peer);
+ return peer;
+}
+
+/*
+ * obtain a remote transport endpoint for the specified address
+ */
+struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
+ struct sockaddr_rxrpc *srx, gfp_t gfp)
+{
+ struct rxrpc_peer *peer, *candidate;
+ unsigned long hash_key = rxrpc_peer_hash_key(local, srx);
+
+ _enter("{%d,%d,%pI4+%hu}",
+ srx->transport_type,
+ srx->transport_len,
+ &srx->transport.sin.sin_addr,
+ ntohs(srx->transport.sin.sin_port));
+
+ /* search the peer list first */
+ rcu_read_lock();
+ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
+ if (peer && !rxrpc_get_peer_maybe(peer))
+ peer = NULL;
+ rcu_read_unlock();
+
+ if (!peer) {
+ /* The peer is not yet present in hash - create a candidate
+ * for a new record and then redo the search.
+ */
+ candidate = rxrpc_create_peer(local, srx, hash_key, gfp);
+ if (!candidate) {
+ _leave(" = NULL [nomem]");
+ return NULL;
+ }
+
+ spin_lock(&rxrpc_peer_hash_lock);
+
+ /* Need to check that we aren't racing with someone else */
+ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
+ if (peer && !rxrpc_get_peer_maybe(peer))
+ peer = NULL;
+ if (!peer)
+ hash_add_rcu(rxrpc_peer_hash,
+ &candidate->hash_link, hash_key);
+
+ spin_unlock(&rxrpc_peer_hash_lock);
+
+ if (peer)
+ kfree(candidate);
+ else
+ peer = candidate;
+ }
+
+ _net("PEER %d {%d,%pI4+%hu}",
+ peer->debug_id,
+ peer->srx.transport_type,
+ &peer->srx.transport.sin.sin_addr,
+ ntohs(peer->srx.transport.sin.sin_port));
+
+ _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage));
+ return peer;
+}
+
+/*
+ * Discard a ref on a remote peer record.
+ */
+void __rxrpc_put_peer(struct rxrpc_peer *peer)
+{
+ ASSERT(hlist_empty(&peer->error_targets));
+
+ spin_lock(&rxrpc_peer_hash_lock);
+ hash_del_rcu(&peer->hash_link);
+ spin_unlock(&rxrpc_peer_hash_lock);
+
+ kfree_rcu(peer, rcu);
+}
diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/proc.c
index 225163b..500cdcd 100644
--- a/net/rxrpc/ar-proc.c
+++ b/net/rxrpc/proc.c
@@ -46,7 +46,7 @@ static void rxrpc_call_seq_stop(struct seq_file *seq, void *v)
static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
{
- struct rxrpc_transport *trans;
+ struct rxrpc_connection *conn;
struct rxrpc_call *call;
char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1];
@@ -59,25 +59,28 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
}
call = list_entry(v, struct rxrpc_call, link);
- trans = call->conn->trans;
sprintf(lbuff, "%pI4:%u",
- &trans->local->srx.transport.sin.sin_addr,
- ntohs(trans->local->srx.transport.sin.sin_port));
+ &call->local->srx.transport.sin.sin_addr,
+ ntohs(call->local->srx.transport.sin.sin_port));
- sprintf(rbuff, "%pI4:%u",
- &trans->peer->srx.transport.sin.sin_addr,
- ntohs(trans->peer->srx.transport.sin.sin_port));
+ conn = call->conn;
+ if (conn)
+ sprintf(rbuff, "%pI4:%u",
+ &conn->params.peer->srx.transport.sin.sin_addr,
+ ntohs(conn->params.peer->srx.transport.sin.sin_port));
+ else
+ strcpy(rbuff, "no_connection");
seq_printf(seq,
"UDP %-22.22s %-22.22s %4x %08x %08x %s %3u"
" %-8.8s %08x %lx\n",
lbuff,
rbuff,
- call->conn->service_id,
+ call->service_id,
call->cid,
call->call_id,
- call->conn->in_clientflag ? "Svc" : "Clt",
+ call->in_clientflag ? "Svc" : "Clt",
atomic_read(&call->usage),
rxrpc_call_states[call->state],
call->remote_abort ?: call->local_abort,
@@ -129,7 +132,6 @@ static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v)
static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
{
struct rxrpc_connection *conn;
- struct rxrpc_transport *trans;
char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1];
if (v == &rxrpc_connections) {
@@ -142,28 +144,27 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
}
conn = list_entry(v, struct rxrpc_connection, link);
- trans = conn->trans;
sprintf(lbuff, "%pI4:%u",
- &trans->local->srx.transport.sin.sin_addr,
- ntohs(trans->local->srx.transport.sin.sin_port));
+ &conn->params.local->srx.transport.sin.sin_addr,
+ ntohs(conn->params.local->srx.transport.sin.sin_port));
sprintf(rbuff, "%pI4:%u",
- &trans->peer->srx.transport.sin.sin_addr,
- ntohs(trans->peer->srx.transport.sin.sin_port));
+ &conn->params.peer->srx.transport.sin.sin_addr,
+ ntohs(conn->params.peer->srx.transport.sin.sin_port));
seq_printf(seq,
"UDP %-22.22s %-22.22s %4x %08x %08x %s %3u"
" %s %08x %08x %08x\n",
lbuff,
rbuff,
- conn->service_id,
- conn->cid,
+ conn->params.service_id,
+ conn->proto.cid,
conn->call_counter,
- conn->in_clientflag ? "Svc" : "Clt",
+ rxrpc_conn_is_service(conn) ? "Svc" : "Clt",
atomic_read(&conn->usage),
rxrpc_conn_states[conn->state],
- key_serial(conn->key),
+ key_serial(conn->params.key),
atomic_read(&conn->serial),
atomic_read(&conn->hi_serial));
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/recvmsg.c
index 59706b9..a3fa2ed 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -147,9 +147,9 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (!continue_call) {
if (msg->msg_name) {
size_t len =
- sizeof(call->conn->trans->peer->srx);
+ sizeof(call->conn->params.peer->srx);
memcpy(msg->msg_name,
- &call->conn->trans->peer->srx, len);
+ &call->conn->params.peer->srx, len);
msg->msg_namelen = len;
}
sock_recv_timestamp(msg, &rx->sk, skb);
@@ -205,7 +205,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
/* we transferred the whole data packet */
if (sp->hdr.flags & RXRPC_LAST_PACKET) {
_debug("last");
- if (call->conn->out_clientflag) {
+ if (rxrpc_conn_is_client(call->conn)) {
/* last byte of reply received */
ret = copied;
goto terminal_message;
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 36a6340..23c05ec 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -58,9 +58,9 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
struct rxrpc_key_token *token;
int ret;
- _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key));
+ _enter("{%d},{%x}", conn->debug_id, key_serial(conn->params.key));
- token = conn->key->payload.data[0];
+ token = conn->params.key->payload.data[0];
conn->security_ix = token->security_index;
ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
@@ -74,7 +74,7 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
sizeof(token->kad->session_key)) < 0)
BUG();
- switch (conn->security_level) {
+ switch (conn->params.security_level) {
case RXRPC_SECURITY_PLAIN:
break;
case RXRPC_SECURITY_AUTH:
@@ -115,14 +115,14 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
_enter("");
- if (!conn->key)
+ if (!conn->params.key)
return;
- token = conn->key->payload.data[0];
+ token = conn->params.key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
- tmpbuf.x[0] = htonl(conn->epoch);
- tmpbuf.x[1] = htonl(conn->cid);
+ tmpbuf.x[0] = htonl(conn->proto.epoch);
+ tmpbuf.x[1] = htonl(conn->proto.cid);
tmpbuf.x[2] = 0;
tmpbuf.x[3] = htonl(conn->security_ix);
@@ -220,7 +220,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
rxkhdr.checksum = 0;
/* encrypt from the session key */
- token = call->conn->key->payload.data[0];
+ token = call->conn->params.key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
sg_init_one(&sg[0], sechdr, sizeof(rxkhdr));
@@ -277,13 +277,13 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
sp = rxrpc_skb(skb);
_enter("{%d{%x}},{#%u},%zu,",
- call->debug_id, key_serial(call->conn->key), sp->hdr.seq,
- data_size);
+ call->debug_id, key_serial(call->conn->params.key),
+ sp->hdr.seq, data_size);
if (!call->conn->cipher)
return 0;
- ret = key_validate(call->conn->key);
+ ret = key_validate(call->conn->params.key);
if (ret < 0)
return ret;
@@ -312,7 +312,7 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
y = 1; /* zero checksums are not permitted */
sp->hdr.cksum = y;
- switch (call->conn->security_level) {
+ switch (call->conn->params.security_level) {
case RXRPC_SECURITY_PLAIN:
ret = 0;
break;
@@ -446,7 +446,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
skb_to_sgvec(skb, sg, 0, skb->len);
/* decrypt from the session key */
- token = call->conn->key->payload.data[0];
+ token = call->conn->params.key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
skcipher_request_set_tfm(req, call->conn->cipher);
@@ -516,7 +516,7 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
sp = rxrpc_skb(skb);
_enter("{%d{%x}},{#%u}",
- call->debug_id, key_serial(call->conn->key), sp->hdr.seq);
+ call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq);
if (!call->conn->cipher)
return 0;
@@ -557,7 +557,7 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
return -EPROTO;
}
- switch (call->conn->security_level) {
+ switch (call->conn->params.security_level) {
case RXRPC_SECURITY_PLAIN:
ret = 0;
break;
@@ -589,9 +589,9 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
u32 serial;
int ret;
- _enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
+ _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key));
- ret = key_validate(conn->key);
+ ret = key_validate(conn->params.key);
if (ret < 0)
return ret;
@@ -602,14 +602,14 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
challenge.min_level = htonl(0);
challenge.__padding = 0;
- msg.msg_name = &conn->trans->peer->srx.transport.sin;
- msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin);
+ msg.msg_name = &conn->params.peer->srx.transport.sin;
+ msg.msg_namelen = sizeof(conn->params.peer->srx.transport.sin);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
- whdr.epoch = htonl(conn->epoch);
- whdr.cid = htonl(conn->cid);
+ whdr.epoch = htonl(conn->proto.epoch);
+ whdr.cid = htonl(conn->proto.cid);
whdr.callNumber = 0;
whdr.seq = 0;
whdr.type = RXRPC_PACKET_TYPE_CHALLENGE;
@@ -617,7 +617,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
whdr.userStatus = 0;
whdr.securityIndex = conn->security_ix;
whdr._rsvd = 0;
- whdr.serviceId = htons(conn->service_id);
+ whdr.serviceId = htons(conn->params.service_id);
iov[0].iov_base = &whdr;
iov[0].iov_len = sizeof(whdr);
@@ -630,7 +630,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
whdr.serial = htonl(serial);
_proto("Tx CHALLENGE %%%u", serial);
- ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len);
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
if (ret < 0) {
_debug("sendmsg failed: %d", ret);
return -EAGAIN;
@@ -657,8 +657,8 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
_enter("");
- msg.msg_name = &conn->trans->peer->srx.transport.sin;
- msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin);
+ msg.msg_name = &conn->params.peer->srx.transport.sin;
+ msg.msg_namelen = sizeof(conn->params.peer->srx.transport.sin);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -684,7 +684,7 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
whdr.serial = htonl(serial);
_proto("Tx RESPONSE %%%u", serial);
- ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, len);
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 3, len);
if (ret < 0) {
_debug("sendmsg failed: %d", ret);
return -EAGAIN;
@@ -771,14 +771,14 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
u32 version, nonce, min_level, abort_code;
int ret;
- _enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
+ _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key));
- if (!conn->key) {
+ if (!conn->params.key) {
_leave(" = -EPROTO [no key]");
return -EPROTO;
}
- ret = key_validate(conn->key);
+ ret = key_validate(conn->params.key);
if (ret < 0) {
*_abort_code = RXKADEXPIRED;
return ret;
@@ -801,20 +801,20 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
goto protocol_error;
abort_code = RXKADLEVELFAIL;
- if (conn->security_level < min_level)
+ if (conn->params.security_level < min_level)
goto protocol_error;
- token = conn->key->payload.data[0];
+ token = conn->params.key->payload.data[0];
/* build the response packet */
memset(&resp, 0, sizeof(resp));
resp.version = htonl(RXKAD_VERSION);
- resp.encrypted.epoch = htonl(conn->epoch);
- resp.encrypted.cid = htonl(conn->cid);
+ resp.encrypted.epoch = htonl(conn->proto.epoch);
+ resp.encrypted.cid = htonl(conn->proto.cid);
resp.encrypted.securityIndex = htonl(conn->security_ix);
resp.encrypted.inc_nonce = htonl(nonce + 1);
- resp.encrypted.level = htonl(conn->security_level);
+ resp.encrypted.level = htonl(conn->params.security_level);
resp.kvno = htonl(token->kad->kvno);
resp.ticket_len = htonl(token->kad->ticket_len);
@@ -1096,9 +1096,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
rxkad_decrypt_response(conn, &response, &session_key);
abort_code = RXKADSEALEDINCON;
- if (ntohl(response.encrypted.epoch) != conn->epoch)
+ if (ntohl(response.encrypted.epoch) != conn->proto.epoch)
goto protocol_error_free;
- if (ntohl(response.encrypted.cid) != conn->cid)
+ if (ntohl(response.encrypted.cid) != conn->proto.cid)
goto protocol_error_free;
if (ntohl(response.encrypted.securityIndex) != conn->security_ix)
goto protocol_error_free;
@@ -1122,7 +1122,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
level = ntohl(response.encrypted.level);
if (level > RXRPC_SECURITY_ENCRYPT)
goto protocol_error_free;
- conn->security_level = level;
+ conn->params.security_level = level;
/* create a key to hold the security data and expiration time - after
* this the connection security can be handled in exactly the same way
diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/security.c
index d223253..814d285 100644
--- a/net/rxrpc/ar-security.c
+++ b/net/rxrpc/security.c
@@ -76,7 +76,7 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
{
const struct rxrpc_security *sec;
struct rxrpc_key_token *token;
- struct key *key = conn->key;
+ struct key *key = conn->params.key;
int ret;
_enter("{%d},{%x}", conn->debug_id, key_serial(key));
@@ -113,7 +113,7 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
{
const struct rxrpc_security *sec;
- struct rxrpc_local *local = conn->trans->local;
+ struct rxrpc_local *local = conn->params.local;
struct rxrpc_sock *rx;
struct key *key;
key_ref_t kref;
@@ -121,7 +121,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
_enter("");
- sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix);
+ sprintf(kdesc, "%u:%u", conn->params.service_id, conn->security_ix);
sec = rxrpc_security_lookup(conn->security_ix);
if (!sec) {
@@ -132,7 +132,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
/* find the service */
read_lock_bh(&local->services_lock);
list_for_each_entry(rx, &local->services, listen_link) {
- if (rx->srx.srx_service == conn->service_id)
+ if (rx->srx.srx_service == conn->params.service_id)
goto found_service;
}
diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/skbuff.c
index eee0cfd9..eee0cfd9 100644
--- a/net/rxrpc/ar-skbuff.c
+++ b/net/rxrpc/skbuff.c
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index a99690a..03ad087 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -90,14 +90,6 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)&one,
},
- {
- .procname = "transport_expiry",
- .data = &rxrpc_transport_expiry,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = (void *)&one,
- },
/* Non-time values */
{
diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c
new file mode 100644
index 0000000..f28122a
--- /dev/null
+++ b/net/rxrpc/utils.c
@@ -0,0 +1,41 @@
+/* Utility routines
+ *
+ * Copyright (C) 2015 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include "ar-internal.h"
+
+/*
+ * Set up an RxRPC address from a socket buffer.
+ */
+void rxrpc_get_addr_from_skb(struct rxrpc_local *local,
+ const struct sk_buff *skb,
+ struct sockaddr_rxrpc *srx)
+{
+ memset(srx, 0, sizeof(*srx));
+ srx->transport_type = local->srx.transport_type;
+ srx->transport.family = local->srx.transport.family;
+
+ /* Can we see an ipv4 UDP packet on an ipv6 UDP socket? and vice
+ * versa?
+ */
+ switch (srx->transport.family) {
+ case AF_INET:
+ srx->transport.sin.sin_port = udp_hdr(skb)->source;
+ srx->transport_len = sizeof(struct sockaddr_in);
+ memcpy(&srx->transport.sin.sin_addr, &ip_hdr(skb)->saddr,
+ sizeof(struct in_addr));
+ break;
+
+ default:
+ BUG();
+ }
+}
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index b6db56e..47ec230 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -224,8 +224,8 @@ int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index)
}
EXPORT_SYMBOL(tcf_hash_search);
-int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
- int bind)
+bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
+ int bind)
{
struct tcf_hashinfo *hinfo = tn->hinfo;
struct tcf_common *p = NULL;
@@ -235,9 +235,9 @@ int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
p->tcfc_refcnt++;
a->priv = p;
a->hinfo = hinfo;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
EXPORT_SYMBOL(tcf_hash_check);
@@ -1122,7 +1122,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
nla_nest_end(skb, nest);
ret = skb->len;
} else
- nla_nest_cancel(skb, nest);
+ nlmsg_trim(skb, b);
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
if (NETLINK_CB(cb->skb).portid && ret)
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index f7b6cf4..ef74bff 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -223,15 +223,10 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]);
- fp = bpf_prog_get(bpf_fd);
+ fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_ACT);
if (IS_ERR(fp))
return PTR_ERR(fp);
- if (fp->type != BPF_PROG_TYPE_SCHED_ACT) {
- bpf_prog_put(fp);
- return -EINVAL;
- }
-
if (tb[TCA_ACT_BPF_NAME]) {
name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]),
nla_len(tb[TCA_ACT_BPF_NAME]),
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 02f5a8b..845ab51 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -106,9 +106,9 @@ int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi)
}
EXPORT_SYMBOL_GPL(ife_get_meta_u16);
-int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp)
{
- mi->metaval = kmemdup(metaval, sizeof(u32), GFP_KERNEL);
+ mi->metaval = kmemdup(metaval, sizeof(u32), gfp);
if (!mi->metaval)
return -ENOMEM;
@@ -116,9 +116,9 @@ int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
}
EXPORT_SYMBOL_GPL(ife_alloc_meta_u32);
-int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval)
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp)
{
- mi->metaval = kmemdup(metaval, sizeof(u16), GFP_KERNEL);
+ mi->metaval = kmemdup(metaval, sizeof(u16), gfp);
if (!mi->metaval)
return -ENOMEM;
@@ -240,10 +240,10 @@ static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len)
}
/* called when adding new meta information
- * under ife->tcf_lock
+ * under ife->tcf_lock for existing action
*/
static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
- void *val, int len)
+ void *val, int len, bool exists)
{
struct tcf_meta_ops *ops = find_ife_oplist(metaid);
int ret = 0;
@@ -251,11 +251,13 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
if (!ops) {
ret = -ENOENT;
#ifdef CONFIG_MODULES
- spin_unlock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
rtnl_unlock();
request_module("ifemeta%u", metaid);
rtnl_lock();
- spin_lock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_lock_bh(&ife->tcf_lock);
ops = find_ife_oplist(metaid);
#endif
}
@@ -272,10 +274,10 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
}
/* called when adding new meta information
- * under ife->tcf_lock
+ * under ife->tcf_lock for existing action
*/
static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
- int len)
+ int len, bool atomic)
{
struct tcf_meta_info *mi = NULL;
struct tcf_meta_ops *ops = find_ife_oplist(metaid);
@@ -284,7 +286,7 @@ static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
if (!ops)
return -ENOENT;
- mi = kzalloc(sizeof(*mi), GFP_KERNEL);
+ mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL);
if (!mi) {
/*put back what find_ife_oplist took */
module_put(ops->owner);
@@ -294,7 +296,7 @@ static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
mi->metaid = metaid;
mi->ops = ops;
if (len > 0) {
- ret = ops->alloc(mi, metaval);
+ ret = ops->alloc(mi, metaval, atomic ? GFP_ATOMIC : GFP_KERNEL);
if (ret != 0) {
kfree(mi);
module_put(ops->owner);
@@ -313,11 +315,13 @@ static int use_all_metadata(struct tcf_ife_info *ife)
int rc = 0;
int installed = 0;
+ read_lock(&ife_mod_lock);
list_for_each_entry(o, &ifeoplist, list) {
- rc = add_metainfo(ife, o->metaid, NULL, 0);
+ rc = add_metainfo(ife, o->metaid, NULL, 0, true);
if (rc == 0)
installed += 1;
}
+ read_unlock(&ife_mod_lock);
if (installed)
return 0;
@@ -385,8 +389,9 @@ static void tcf_ife_cleanup(struct tc_action *a, int bind)
spin_unlock_bh(&ife->tcf_lock);
}
-/* under ife->tcf_lock */
-static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb)
+/* under ife->tcf_lock for existing action */
+static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
+ bool exists)
{
int len = 0;
int rc = 0;
@@ -398,11 +403,11 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb)
val = nla_data(tb[i]);
len = nla_len(tb[i]);
- rc = load_metaops_and_vet(ife, i, val, len);
+ rc = load_metaops_and_vet(ife, i, val, len, exists);
if (rc != 0)
return rc;
- rc = add_metainfo(ife, i, val, len);
+ rc = add_metainfo(ife, i, val, len, exists);
if (rc)
return rc;
}
@@ -423,7 +428,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
u16 ife_type = 0;
u8 *daddr = NULL;
u8 *saddr = NULL;
- int ret = 0, exists = 0;
+ bool exists = false;
+ int ret = 0;
int err;
err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy);
@@ -474,7 +480,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
saddr = nla_data(tb[TCA_IFE_SMAC]);
}
- spin_lock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_lock_bh(&ife->tcf_lock);
ife->tcf_action = parm->action;
if (parm->flags & IFE_ENCODE) {
@@ -504,11 +511,12 @@ metadata_parse_err:
if (ret == ACT_P_CREATED)
_tcf_ife_cleanup(a, bind);
- spin_unlock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
return err;
}
- err = populate_metalist(ife, tb2);
+ err = populate_metalist(ife, tb2, exists);
if (err)
goto metadata_parse_err;
@@ -523,12 +531,14 @@ metadata_parse_err:
if (ret == ACT_P_CREATED)
_tcf_ife_cleanup(a, bind);
- spin_unlock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
return err;
}
}
- spin_unlock_bh(&ife->tcf_lock);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(tn, a);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index e7c0f4d..b8c5060 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -97,7 +97,8 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
struct tcf_ipt *ipt;
struct xt_entry_target *td, *t;
char *tname;
- int ret = 0, err, exists = 0;
+ bool exists = false;
+ int ret = 0, err;
u32 hook = 0;
u32 index = 0;
@@ -122,10 +123,13 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
}
td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
- if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
+ if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) {
+ if (exists)
+ tcf_hash_release(a, bind);
return -EINVAL;
+ }
- if (!tcf_hash_check(tn, index, a, bind)) {
+ if (!exists) {
ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind,
false);
if (ret)
@@ -243,7 +247,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
default:
net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n",
ret);
- result = TC_POLICE_OK;
+ result = TC_ACT_OK;
break;
}
spin_unlock(&ipt->tcf_lock);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 787751a..5b135d3 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -62,7 +62,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct tc_mirred *parm;
struct tcf_mirred *m;
struct net_device *dev;
- int ret, ok_push = 0, exists = 0;
+ int ret, ok_push = 0;
+ bool exists = false;
if (nla == NULL)
return -EINVAL;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index ff34dd3..1e8ede3 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -115,9 +115,9 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
[TCA_POLICE_RESULT] = { .type = NLA_U32 },
};
-static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action *a,
- int ovr, int bind)
+static int tcf_act_police_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
{
int ret = 0, err;
struct nlattr *tb[TCA_POLICE_MAX + 1];
@@ -366,7 +366,7 @@ static struct tc_action_ops act_police_ops = {
.owner = THIS_MODULE,
.act = tcf_act_police,
.dump = tcf_act_police_dump,
- .init = tcf_act_police_locate,
+ .init = tcf_act_police_init,
.walk = tcf_act_police_walker,
.lookup = tcf_police_search,
};
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index be5fbb5..318328d3 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -86,8 +86,9 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct nlattr *tb[TCA_DEF_MAX + 1];
struct tc_defact *parm;
struct tcf_defact *d;
+ bool exists = false;
+ int ret = 0, err;
char *defdata;
- int ret = 0, err, exists = 0;
if (nla == NULL)
return -EINVAL;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 7e2bc3c..8e573c0 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -47,6 +47,8 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
skb_set_queue_mapping(skb, d->queue_mapping);
if (d->flags & SKBEDIT_F_MARK)
skb->mark = d->mark;
+ if (d->flags & SKBEDIT_F_PTYPE)
+ skb->pkt_type = d->ptype;
spin_unlock(&d->tcf_lock);
return d->tcf_action;
@@ -57,6 +59,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) },
[TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) },
[TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
+ [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) },
};
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
@@ -68,8 +71,9 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct tc_skbedit *parm;
struct tcf_skbedit *d;
u32 flags = 0, *priority = NULL, *mark = NULL;
- u16 *queue_mapping = NULL;
- int ret = 0, err, exists = 0;
+ u16 *queue_mapping = NULL, *ptype = NULL;
+ bool exists = false;
+ int ret = 0, err;
if (nla == NULL)
return -EINVAL;
@@ -91,6 +95,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
}
+ if (tb[TCA_SKBEDIT_PTYPE] != NULL) {
+ ptype = nla_data(tb[TCA_SKBEDIT_PTYPE]);
+ if (!skb_pkt_type_ok(*ptype))
+ return -EINVAL;
+ flags |= SKBEDIT_F_PTYPE;
+ }
+
if (tb[TCA_SKBEDIT_MARK] != NULL) {
flags |= SKBEDIT_F_MARK;
mark = nla_data(tb[TCA_SKBEDIT_MARK]);
@@ -131,6 +142,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
d->queue_mapping = *queue_mapping;
if (flags & SKBEDIT_F_MARK)
d->mark = *mark;
+ if (flags & SKBEDIT_F_PTYPE)
+ d->ptype = *ptype;
d->tcf_action = parm->action;
@@ -157,16 +170,16 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
if ((d->flags & SKBEDIT_F_PRIORITY) &&
- nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
- &d->priority))
+ nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, d->priority))
goto nla_put_failure;
if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) &&
- nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING,
- sizeof(d->queue_mapping), &d->queue_mapping))
+ nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, d->queue_mapping))
goto nla_put_failure;
if ((d->flags & SKBEDIT_F_MARK) &&
- nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
- &d->mark))
+ nla_put_u32(skb, TCA_SKBEDIT_MARK, d->mark))
+ goto nla_put_failure;
+ if ((d->flags & SKBEDIT_F_PTYPE) &&
+ nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype))
goto nla_put_failure;
tcf_tm_dump(&t, &d->tcf_tm);
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index b075d50..db9b7ed 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -77,8 +77,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
int action;
__be16 push_vid = 0;
__be16 push_proto = 0;
- int ret = 0, exists = 0;
- int err;
+ bool exists = false;
+ int ret = 0, err;
if (!nla)
return -EINVAL;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index cca1ef5..843a716 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -169,7 +169,7 @@ replay:
if (prio == 0) {
switch (n->nlmsg_type) {
case RTM_DELTFILTER:
- if (protocol || t->tcm_handle)
+ if (protocol || t->tcm_handle || tca[TCA_KIND])
return -ENOENT;
break;
case RTM_NEWTFILTER:
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 7b342c7..c3002c2 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -272,15 +272,10 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);
- fp = bpf_prog_get(bpf_fd);
+ fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS);
if (IS_ERR(fp))
return PTR_ERR(fp);
- if (fp->type != BPF_PROG_TYPE_SCHED_CLS) {
- bpf_prog_put(fp);
- return -EINVAL;
- }
-
if (tb[TCA_BPF_NAME]) {
name = kmemdup(nla_data(tb[TCA_BPF_NAME]),
nla_len(tb[TCA_BPF_NAME]),
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 1ea6f76..5060801 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -140,7 +140,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
f = rhashtable_lookup_fast(&head->ht,
fl_key_get_start(&skb_mkey, &head->mask),
head->ht_params);
- if (f && !(f->flags & TCA_CLS_FLAGS_SKIP_SW)) {
+ if (f && !tc_skip_sw(f->flags)) {
*res = f->res;
return tcf_exts_exec(skb, &f->exts, res);
}
@@ -187,19 +187,20 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
}
-static void fl_hw_replace_filter(struct tcf_proto *tp,
- struct flow_dissector *dissector,
- struct fl_flow_key *mask,
- struct fl_flow_key *key,
- struct tcf_exts *actions,
- unsigned long cookie, u32 flags)
+static int fl_hw_replace_filter(struct tcf_proto *tp,
+ struct flow_dissector *dissector,
+ struct fl_flow_key *mask,
+ struct fl_flow_key *key,
+ struct tcf_exts *actions,
+ unsigned long cookie, u32 flags)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
+ int err;
if (!tc_should_offload(dev, tp, flags))
- return;
+ return tc_skip_sw(flags) ? -EINVAL : 0;
offload.command = TC_CLSFLOWER_REPLACE;
offload.cookie = cookie;
@@ -211,7 +212,12 @@ static void fl_hw_replace_filter(struct tcf_proto *tp,
tc.type = TC_SETUP_CLSFLOWER;
tc.cls_flower = &offload;
- dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+
+ if (tc_skip_sw(flags))
+ return err;
+
+ return 0;
}
static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
@@ -572,20 +578,22 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (err)
goto errout;
- if (!(fnew->flags & TCA_CLS_FLAGS_SKIP_SW)) {
+ if (!tc_skip_sw(fnew->flags)) {
err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
head->ht_params);
if (err)
goto errout;
}
- fl_hw_replace_filter(tp,
- &head->dissector,
- &mask.key,
- &fnew->key,
- &fnew->exts,
- (unsigned long)fnew,
- fnew->flags);
+ err = fl_hw_replace_filter(tp,
+ &head->dissector,
+ &mask.key,
+ &fnew->key,
+ &fnew->exts,
+ (unsigned long)fnew,
+ fnew->flags);
+ if (err)
+ goto errout;
if (fold) {
rhashtable_remove_fast(&head->ht, &fold->ht_node,
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 0785b23..481e4f1 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -357,7 +357,8 @@ static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch,
/* --------------------------- Qdisc operations ---------------------------- */
-static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow;
@@ -366,7 +367,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
- result = TC_POLICE_OK; /* be nice to gcc */
+ result = TC_ACT_OK; /* be nice to gcc */
flow = NULL;
if (TC_H_MAJ(skb->priority) != sch->handle ||
!(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) {
@@ -398,12 +399,12 @@ done:
switch (result) {
case TC_ACT_QUEUED:
case TC_ACT_STOLEN:
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
case TC_ACT_SHOT:
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
goto drop;
- case TC_POLICE_RECLASSIFY:
+ case TC_ACT_RECLASSIFY:
if (flow->excess)
flow = flow->excess;
else
@@ -413,7 +414,7 @@ done:
#endif
}
- ret = qdisc_enqueue(skb, flow->q);
+ ret = qdisc_enqueue(skb, flow->q, to_free);
if (ret != NET_XMIT_SUCCESS) {
drop: __maybe_unused
if (net_xmit_drop_count(ret)) {
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
index 3fee70d..c98a61e 100644
--- a/net/sched/sch_blackhole.c
+++ b/net/sched/sch_blackhole.c
@@ -17,9 +17,10 @@
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
-static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NET_XMIT_SUCCESS;
}
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index a29fd81..beb554a 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -358,7 +358,8 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
}
static int
-cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct cbq_sched_data *q = qdisc_priv(sch);
int uninitialized_var(ret);
@@ -370,11 +371,11 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (cl == NULL) {
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return ret;
}
- ret = qdisc_enqueue(skb, cl->q);
+ ret = qdisc_enqueue(skb, cl->q, to_free);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
cbq_mark_toplevel(q, cl);
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 04e0b05..3b6d5bd 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -115,7 +115,8 @@ static void choke_zap_tail_holes(struct choke_sched_data *q)
}
/* Drop packet from queue array by creating a "hole" */
-static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
+ struct sk_buff **to_free)
{
struct choke_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb = q->tab[idx];
@@ -129,7 +130,7 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
qdisc_qstats_backlog_dec(sch, skb);
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
--sch->q.qlen;
}
@@ -261,7 +262,8 @@ static bool choke_match_random(const struct choke_sched_data *q,
return choke_match_flow(oskb, nskb);
}
-static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
struct choke_sched_data *q = qdisc_priv(sch);
@@ -288,7 +290,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
/* Draw a packet at random from queue and compare flow */
if (choke_match_random(q, skb, &idx)) {
q->stats.matched++;
- choke_drop_by_idx(sch, idx);
+ choke_drop_by_idx(sch, idx, to_free);
goto congestion_drop;
}
@@ -331,16 +333,16 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
q->stats.pdrop++;
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
congestion_drop:
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NET_XMIT_CN;
other_drop:
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return ret;
}
@@ -375,11 +377,11 @@ static void choke_reset(struct Qdisc *sch)
q->head = (q->head + 1) & q->tab_mask;
if (!skb)
continue;
- qdisc_qstats_backlog_dec(sch, skb);
- --sch->q.qlen;
- qdisc_drop(skb, sch);
+ rtnl_qdisc_drop(skb, sch);
}
+ sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
q->head = q->tail = 0;
red_restart(&q->vars);
@@ -455,7 +457,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
--sch->q.qlen;
- qdisc_drop(skb, sch);
+ rtnl_qdisc_drop(skb, sch);
}
qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
q->head = 0;
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index dddf3bb..4002df3 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -82,7 +82,8 @@ static void drop_func(struct sk_buff *skb, void *ctx)
{
struct Qdisc *sch = ctx;
- qdisc_drop(skb, sch);
+ kfree_skb(skb);
+ qdisc_qstats_drop(sch);
}
static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
@@ -107,7 +108,8 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
return skb;
}
-static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct codel_sched_data *q;
@@ -117,7 +119,7 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
q = qdisc_priv(sch);
q->drop_overlimit++;
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
@@ -174,7 +176,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
- qdisc_drop(skb, sch);
+ rtnl_qdisc_drop(skb, sch);
}
qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 22609e4..8af5c59 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -350,7 +350,8 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
return NULL;
}
-static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
@@ -360,11 +361,11 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (cl == NULL) {
if (err & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return err;
}
- err = qdisc_enqueue(skb, cl->qdisc);
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
cl->qstats.drops++;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index b9ba5f6..1308bbf 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -191,7 +191,8 @@ static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch,
/* --------------------------- Qdisc operations ---------------------------- */
-static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
int err;
@@ -234,7 +235,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
#ifdef CONFIG_NET_CLS_ACT
case TC_ACT_QUEUED:
case TC_ACT_STOLEN:
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
case TC_ACT_SHOT:
@@ -251,7 +252,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
}
- err = qdisc_enqueue(skb, p->q);
+ err = qdisc_enqueue(skb, p->q, to_free);
if (err != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(err))
qdisc_qstats_drop(sch);
@@ -264,7 +265,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return NET_XMIT_SUCCESS;
drop:
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index dea70e3..baeed6a 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -19,32 +19,39 @@
/* 1 band FIFO pseudo-"scheduler" */
-static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit))
return qdisc_enqueue_tail(skb, sch);
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
-static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
if (likely(skb_queue_len(&sch->q) < sch->limit))
return qdisc_enqueue_tail(skb, sch);
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
-static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
+ unsigned int prev_backlog;
+
if (likely(skb_queue_len(&sch->q) < sch->limit))
return qdisc_enqueue_tail(skb, sch);
+ prev_backlog = sch->qstats.backlog;
/* queue full, remove one skb to fulfill the limit */
- __qdisc_queue_drop_head(sch, &sch->q);
+ __qdisc_queue_drop_head(sch, &sch->q, to_free);
qdisc_qstats_drop(sch);
qdisc_enqueue_tail(skb, sch);
+ qdisc_tree_reduce_backlog(sch, 0, prev_backlog - sch->qstats.backlog);
return NET_XMIT_CN;
}
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index f49c81e..e5458b9 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -368,18 +368,19 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
}
}
-static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct fq_flow *f;
if (unlikely(sch->q.qlen >= sch->limit))
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
f = fq_classify(skb, q);
if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
q->stat_flows_plimit++;
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
f->qlen++;
@@ -514,17 +515,25 @@ out:
return skb;
}
+static void fq_flow_purge(struct fq_flow *flow)
+{
+ rtnl_kfree_skbs(flow->head, flow->tail);
+ flow->head = NULL;
+ flow->qlen = 0;
+}
+
static void fq_reset(struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct rb_root *root;
- struct sk_buff *skb;
struct rb_node *p;
struct fq_flow *f;
unsigned int idx;
- while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
- kfree_skb(skb);
+ sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
+
+ fq_flow_purge(&q->internal);
if (!q->fq_root)
return;
@@ -535,8 +544,7 @@ static void fq_reset(struct Qdisc *sch)
f = container_of(p, struct fq_flow, fq_node);
rb_erase(p, root);
- while ((skb = fq_dequeue_head(sch, f)) != NULL)
- kfree_skb(skb);
+ fq_flow_purge(f);
kmem_cache_free(fq_flow_cachep, f);
}
@@ -737,7 +745,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
if (!skb)
break;
drop_len += qdisc_pkt_len(skb);
- kfree_skb(skb);
+ rtnl_kfree_skbs(skb, skb);
drop_count++;
}
qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index a302e8e..a5ea0e9 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -139,7 +139,8 @@ static inline void flow_queue_add(struct fq_codel_flow *flow,
skb->next = NULL;
}
-static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
+static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets,
+ struct sk_buff **to_free)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
@@ -171,8 +172,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
do {
skb = dequeue_head(flow);
len += qdisc_pkt_len(skb);
- mem += skb->truesize;
- kfree_skb(skb);
+ mem += get_codel_cb(skb)->mem_usage;
+ __qdisc_drop(skb, to_free);
} while (++i < max_packets && len < threshold);
flow->dropped += i;
@@ -184,7 +185,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
return idx;
}
-static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
unsigned int idx, prev_backlog, prev_qlen;
@@ -197,7 +199,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (idx == 0) {
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return ret;
}
idx--;
@@ -214,7 +216,8 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
flow->deficit = q->quantum;
flow->dropped = 0;
}
- q->memory_usage += skb->truesize;
+ get_codel_cb(skb)->mem_usage = skb->truesize;
+ q->memory_usage += get_codel_cb(skb)->mem_usage;
memory_limited = q->memory_usage > q->memory_limit;
if (++sch->q.qlen <= sch->limit && !memory_limited)
return NET_XMIT_SUCCESS;
@@ -229,7 +232,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
* So instead of dropping a single packet, drop half of its backlog
* with a 64 packets limit to not add a too big cpu spike here.
*/
- ret = fq_codel_drop(sch, q->drop_batch_size);
+ ret = fq_codel_drop(sch, q->drop_batch_size, to_free);
prev_qlen -= sch->q.qlen;
prev_backlog -= sch->qstats.backlog;
@@ -265,7 +268,7 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
if (flow->head) {
skb = dequeue_head(flow);
q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
- q->memory_usage -= skb->truesize;
+ q->memory_usage -= get_codel_cb(skb)->mem_usage;
sch->q.qlen--;
sch->qstats.backlog -= qdisc_pkt_len(skb);
}
@@ -276,7 +279,8 @@ static void drop_func(struct sk_buff *skb, void *ctx)
{
struct Qdisc *sch = ctx;
- qdisc_drop(skb, sch);
+ kfree_skb(skb);
+ qdisc_qstats_drop(sch);
}
static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
@@ -336,6 +340,12 @@ begin:
return skb;
}
+static void fq_codel_flow_purge(struct fq_codel_flow *flow)
+{
+ rtnl_kfree_skbs(flow->head, flow->tail);
+ flow->head = NULL;
+}
+
static void fq_codel_reset(struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
@@ -346,18 +356,13 @@ static void fq_codel_reset(struct Qdisc *sch)
for (i = 0; i < q->flows_cnt; i++) {
struct fq_codel_flow *flow = q->flows + i;
- while (flow->head) {
- struct sk_buff *skb = dequeue_head(flow);
-
- qdisc_qstats_backlog_dec(sch, skb);
- kfree_skb(skb);
- }
-
+ fq_codel_flow_purge(flow);
INIT_LIST_HEAD(&flow->flowchain);
codel_vars_init(&flow->cvars);
}
memset(q->backlogs, 0, q->flows_cnt * sizeof(u32));
sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
q->memory_usage = 0;
}
@@ -433,7 +438,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
struct sk_buff *skb = fq_codel_dequeue(sch);
q->cstats.drop_len += qdisc_pkt_len(skb);
- kfree_skb(skb);
+ rtnl_kfree_skbs(skb, skb);
q->cstats.drop_count++;
}
qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 0c9cb51..e95b67c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -77,6 +77,34 @@ static void try_bulk_dequeue_skb(struct Qdisc *q,
skb->next = NULL;
}
+/* This variant of try_bulk_dequeue_skb() makes sure
+ * all skbs in the chain are for the same txq
+ */
+static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
+ struct sk_buff *skb,
+ int *packets)
+{
+ int mapping = skb_get_queue_mapping(skb);
+ struct sk_buff *nskb;
+ int cnt = 0;
+
+ do {
+ nskb = q->dequeue(q);
+ if (!nskb)
+ break;
+ if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
+ q->skb_bad_txq = nskb;
+ qdisc_qstats_backlog_inc(q, nskb);
+ q->q.qlen++;
+ break;
+ }
+ skb->next = nskb;
+ skb = nskb;
+ } while (++cnt < 8);
+ (*packets) += cnt;
+ skb->next = NULL;
+}
+
/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
* A requeued skb (via q->gso_skb) can also be a SKB list.
*/
@@ -87,8 +115,9 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
const struct netdev_queue *txq = q->dev_queue;
*packets = 1;
- *validate = true;
if (unlikely(skb)) {
+ /* skb in gso_skb were already validated */
+ *validate = false;
/* check the reason of requeuing without tx lock first */
txq = skb_get_tx_queue(txq->dev, skb);
if (!netif_xmit_frozen_or_stopped(txq)) {
@@ -97,15 +126,30 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
q->q.qlen--;
} else
skb = NULL;
- /* skb in gso_skb were already validated */
- *validate = false;
- } else {
- if (!(q->flags & TCQ_F_ONETXQUEUE) ||
- !netif_xmit_frozen_or_stopped(txq)) {
- skb = q->dequeue(q);
- if (skb && qdisc_may_bulk(q))
- try_bulk_dequeue_skb(q, skb, txq, packets);
+ return skb;
+ }
+ *validate = true;
+ skb = q->skb_bad_txq;
+ if (unlikely(skb)) {
+ /* check the reason of requeuing without tx lock first */
+ txq = skb_get_tx_queue(txq->dev, skb);
+ if (!netif_xmit_frozen_or_stopped(txq)) {
+ q->skb_bad_txq = NULL;
+ qdisc_qstats_backlog_dec(q, skb);
+ q->q.qlen--;
+ goto bulk;
}
+ return NULL;
+ }
+ if (!(q->flags & TCQ_F_ONETXQUEUE) ||
+ !netif_xmit_frozen_or_stopped(txq))
+ skb = q->dequeue(q);
+ if (skb) {
+bulk:
+ if (qdisc_may_bulk(q))
+ try_bulk_dequeue_skb(q, skb, txq, packets);
+ else
+ try_bulk_dequeue_skb_slow(q, skb, packets);
}
return skb;
}
@@ -348,9 +392,10 @@ EXPORT_SYMBOL(netif_carrier_off);
cheaper.
*/
-static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
+static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
+ struct sk_buff **to_free)
{
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return NET_XMIT_CN;
}
@@ -439,7 +484,8 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
return priv->q + band;
}
-static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
+static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
+ struct sk_buff **to_free)
{
if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
int band = prio2band[skb->priority & TC_PRIO_MAX];
@@ -451,7 +497,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
return __qdisc_enqueue_tail(skb, qdisc, list);
}
- return qdisc_drop(skb, qdisc);
+ return qdisc_drop(skb, qdisc, to_free);
}
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
@@ -493,7 +539,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc)
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
- __qdisc_reset_queue(qdisc, band2list(priv, prio));
+ __qdisc_reset_queue(band2list(priv, prio));
priv->bitmap = 0;
qdisc->qstats.backlog = 0;
@@ -622,11 +668,14 @@ void qdisc_reset(struct Qdisc *qdisc)
if (ops->reset)
ops->reset(qdisc);
+ kfree_skb(qdisc->skb_bad_txq);
+ qdisc->skb_bad_txq = NULL;
+
if (qdisc->gso_skb) {
kfree_skb_list(qdisc->gso_skb);
qdisc->gso_skb = NULL;
- qdisc->q.qlen = 0;
}
+ qdisc->q.qlen = 0;
}
EXPORT_SYMBOL(qdisc_reset);
@@ -665,6 +714,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
dev_put(qdisc_dev(qdisc));
kfree_skb_list(qdisc->gso_skb);
+ kfree_skb(qdisc->skb_bad_txq);
/*
* gen_estimator est_timer() might access qdisc->q.lock,
* wait a RCU grace period before freeing qdisc.
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index b5fb63c7..c78a093 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -149,7 +149,8 @@ static inline int gred_use_harddrop(struct gred_sched *t)
return t->red_flags & TC_RED_HARDDROP;
}
-static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct gred_sched_data *q = NULL;
struct gred_sched *t = qdisc_priv(sch);
@@ -237,10 +238,10 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
q->stats.pdrop++;
drop:
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
congestion_drop:
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NET_XMIT_CN;
}
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index bd08c36..dff92ea 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -130,7 +130,6 @@ struct hfsc_class {
struct rb_node vt_node; /* parent's vt_tree member */
struct rb_root cf_tree; /* active children sorted by cl_f */
struct rb_node cf_node; /* parent's cf_heap member */
- struct list_head dlist; /* drop list member */
u64 cl_total; /* total work in bytes */
u64 cl_cumul; /* cumulative work in bytes done by
@@ -177,8 +176,6 @@ struct hfsc_sched {
struct hfsc_class root; /* root class */
struct Qdisc_class_hash clhash; /* class hash */
struct rb_root eligible; /* eligible tree */
- struct list_head droplist; /* active leaf class list (for
- dropping) */
struct qdisc_watchdog watchdog; /* watchdog timer */
};
@@ -781,6 +778,20 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
else
go_passive = 0;
+ /* update vt */
+ cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
+ - cl->cl_vtoff + cl->cl_vtadj;
+
+ /*
+ * if vt of the class is smaller than cvtmin,
+ * the class was skipped in the past due to non-fit.
+ * if so, we need to adjust vtadj.
+ */
+ if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
+ cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
+ cl->cl_vt = cl->cl_parent->cl_cvtmin;
+ }
+
if (go_passive) {
/* no more active child, going passive */
@@ -797,25 +808,10 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
continue;
}
- /*
- * update vt and f
- */
- cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
- - cl->cl_vtoff + cl->cl_vtadj;
-
- /*
- * if vt of the class is smaller than cvtmin,
- * the class was skipped in the past due to non-fit.
- * if so, we need to adjust vtadj.
- */
- if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
- cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
- cl->cl_vt = cl->cl_parent->cl_cvtmin;
- }
-
/* update the vt tree */
vttree_update(cl);
+ /* update f */
if (cl->cl_flags & HFSC_USC) {
cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit,
cl->cl_total);
@@ -858,7 +854,6 @@ set_active(struct hfsc_class *cl, unsigned int len)
if (cl->cl_flags & HFSC_FSC)
init_vf(cl, len);
- list_add_tail(&cl->dlist, &cl->sched->droplist);
}
static void
@@ -867,8 +862,6 @@ set_passive(struct hfsc_class *cl)
if (cl->cl_flags & HFSC_RSC)
eltree_remove(cl);
- list_del(&cl->dlist);
-
/*
* vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
* needs to be called explicitly to remove a class from vttree.
@@ -882,7 +875,7 @@ qdisc_peek_len(struct Qdisc *sch)
unsigned int len;
skb = sch->ops->peek(sch);
- if (skb == NULL) {
+ if (unlikely(skb == NULL)) {
qdisc_warn_nonwc("qdisc_peek_len", sch);
return 0;
}
@@ -947,7 +940,7 @@ static void
hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc)
{
sc2isc(fsc, &cl->cl_fsc);
- rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
+ rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vtoff + cl->cl_vt, cl->cl_total);
cl->cl_flags |= HFSC_FSC;
}
@@ -1443,7 +1436,6 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
if (err < 0)
return err;
q->eligible = RB_ROOT;
- INIT_LIST_HEAD(&q->droplist);
q->root.cl_common.classid = sch->handle;
q->root.refcnt = 1;
@@ -1527,7 +1519,6 @@ hfsc_reset_qdisc(struct Qdisc *sch)
hfsc_reset_class(cl);
}
q->eligible = RB_ROOT;
- INIT_LIST_HEAD(&q->droplist);
qdisc_watchdog_cancel(&q->watchdog);
sch->qstats.backlog = 0;
sch->q.qlen = 0;
@@ -1572,7 +1563,7 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
}
static int
-hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
struct hfsc_class *cl;
int uninitialized_var(err);
@@ -1581,11 +1572,11 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (cl == NULL) {
if (err & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return err;
}
- err = qdisc_enqueue(skb, cl->qdisc);
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
cl->qstats.drops++;
@@ -1594,8 +1585,17 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return err;
}
- if (cl->qdisc->q.qlen == 1)
+ if (cl->qdisc->q.qlen == 1) {
set_active(cl, qdisc_pkt_len(skb));
+ /*
+ * If this is the first packet, isolate the head so an eventual
+ * head drop before the first dequeue operation has no chance
+ * to invalidate the deadline.
+ */
+ if (cl->cl_flags & HFSC_RSC)
+ cl->qdisc->ops->peek(cl->qdisc);
+
+ }
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index c517918..e3d0458 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -345,7 +345,7 @@ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
skb->next = NULL;
}
-static unsigned int hhf_drop(struct Qdisc *sch)
+static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
struct hhf_sched_data *q = qdisc_priv(sch);
struct wdrr_bucket *bucket;
@@ -359,16 +359,16 @@ static unsigned int hhf_drop(struct Qdisc *sch)
struct sk_buff *skb = dequeue_head(bucket);
sch->q.qlen--;
- qdisc_qstats_drop(sch);
qdisc_qstats_backlog_dec(sch, skb);
- kfree_skb(skb);
+ qdisc_drop(skb, sch, to_free);
}
/* Return id of the bucket from which the packet was dropped. */
return bucket - q->buckets;
}
-static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct hhf_sched_data *q = qdisc_priv(sch);
enum wdrr_bucket_idx idx;
@@ -406,7 +406,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
/* Return Congestion Notification only if we dropped a packet from this
* bucket.
*/
- if (hhf_drop(sch) == idx)
+ if (hhf_drop(sch, to_free) == idx)
return NET_XMIT_CN;
/* As we dropped a packet, better let upper stack know this. */
@@ -464,7 +464,7 @@ static void hhf_reset(struct Qdisc *sch)
struct sk_buff *skb;
while ((skb = hhf_dequeue(sch)) != NULL)
- kfree_skb(skb);
+ rtnl_kfree_skbs(skb, skb);
}
static void *hhf_zalloc(size_t sz)
@@ -574,7 +574,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = hhf_dequeue(sch);
- kfree_skb(skb);
+ rtnl_kfree_skbs(skb, skb);
}
qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen,
prev_backlog - sch->qstats.backlog);
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 07dcd29..91982d9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -117,7 +117,6 @@ struct htb_class {
* Written often fields
*/
struct gnet_stats_basic_packed bstats;
- struct gnet_stats_queue qstats;
struct tc_htb_xstats xstats; /* our special stats */
/* token bucket parameters */
@@ -140,6 +139,8 @@ struct htb_class {
enum htb_cmode cmode; /* current mode of the class */
struct rb_node pq_node; /* node for event queue */
struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
+
+ unsigned int drops ____cacheline_aligned_in_smp;
};
struct htb_level {
@@ -569,7 +570,8 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
list_del_init(&cl->un.leaf.drop_list);
}
-static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
int uninitialized_var(ret);
struct htb_sched *q = qdisc_priv(sch);
@@ -581,19 +583,20 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
__skb_queue_tail(&q->direct_queue, skb);
q->direct_pkts++;
} else {
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
#ifdef CONFIG_NET_CLS_ACT
} else if (!cl) {
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return ret;
#endif
- } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
+ } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
+ to_free)) != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret)) {
qdisc_qstats_drop(sch);
- cl->qstats.drops++;
+ cl->drops++;
}
return ret;
} else {
@@ -957,7 +960,7 @@ static void htb_reset(struct Qdisc *sch)
}
}
qdisc_watchdog_cancel(&q->watchdog);
- __skb_queue_purge(&q->direct_queue);
+ __qdisc_reset_queue(&q->direct_queue);
sch->q.qlen = 0;
sch->qstats.backlog = 0;
memset(q->hlevel, 0, sizeof(q->hlevel));
@@ -981,7 +984,9 @@ static void htb_work_func(struct work_struct *work)
struct htb_sched *q = container_of(work, struct htb_sched, work);
struct Qdisc *sch = q->watchdog.qdisc;
+ rcu_read_lock();
__netif_schedule(qdisc_root(sch));
+ rcu_read_unlock();
}
static int htb_init(struct Qdisc *sch, struct nlattr *opt)
@@ -1108,17 +1113,22 @@ static int
htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
{
struct htb_class *cl = (struct htb_class *)arg;
+ struct gnet_stats_queue qs = {
+ .drops = cl->drops,
+ };
__u32 qlen = 0;
- if (!cl->level && cl->un.leaf.q)
+ if (!cl->level && cl->un.leaf.q) {
qlen = cl->un.leaf.q->q.qlen;
+ qs.backlog = cl->un.leaf.q->qstats.backlog;
+ }
cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens);
cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
+ gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
return -1;
return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
@@ -1231,7 +1241,7 @@ static void htb_destroy(struct Qdisc *sch)
htb_destroy_class(sch, cl);
}
qdisc_class_hash_destroy(&q->clhash);
- __skb_queue_purge(&q->direct_queue);
+ __qdisc_reset_queue(&q->direct_queue);
}
static int htb_delete(struct Qdisc *sch, unsigned long arg)
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 5ea9330..9ffbb02 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -65,7 +65,8 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
}
static int
-multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct Qdisc *qdisc;
int ret;
@@ -76,12 +77,12 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return ret;
}
#endif
- ret = qdisc_enqueue(skb, qdisc);
+ ret = qdisc_enqueue(skb, qdisc, to_free);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
return NET_XMIT_SUCCESS;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 876df13..aaaf021 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -368,9 +368,7 @@ static void tfifo_reset(struct Qdisc *sch)
struct sk_buff *skb = netem_rb_to_skb(p);
rb_erase(p, &q->t_root);
- skb->next = NULL;
- skb->prev = NULL;
- kfree_skb(skb);
+ rtnl_kfree_skbs(skb, skb);
}
}
@@ -399,7 +397,8 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
* when we statistically choose to corrupt one, we instead segment it, returning
* the first packet to be corrupted, and re-enqueue the remaining frames
*/
-static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
+static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct sk_buff *segs;
netdev_features_t features = netif_skb_features(skb);
@@ -407,7 +406,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NULL;
}
consume_skb(skb);
@@ -420,7 +419,8 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
* NET_XMIT_DROP: queue length didn't change.
* NET_XMIT_SUCCESS: one skb was queued.
*/
-static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct netem_sched_data *q = qdisc_priv(sch);
/* We don't fill cb now as skb_unshare() may invalidate it */
@@ -445,7 +445,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
if (count == 0) {
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ __qdisc_drop(skb, to_free);
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}
@@ -465,7 +465,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
q->duplicate = 0;
- rootq->enqueue(skb2, rootq);
+ rootq->enqueue(skb2, rootq, to_free);
q->duplicate = dupsave;
}
@@ -477,7 +477,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
*/
if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
if (skb_is_gso(skb)) {
- segs = netem_segment(skb, sch);
+ segs = netem_segment(skb, sch, to_free);
if (!segs)
return NET_XMIT_DROP;
} else {
@@ -487,10 +487,14 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
skb = segs;
segs = segs->next;
- if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
- (skb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(skb))) {
- rc = qdisc_drop(skb, sch);
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (unlikely(!skb)) {
+ qdisc_qstats_drop(sch);
+ goto finish_segs;
+ }
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_help(skb)) {
+ qdisc_drop(skb, sch, to_free);
goto finish_segs;
}
@@ -499,7 +503,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
qdisc_qstats_backlog_inc(sch, skb);
@@ -559,7 +563,7 @@ finish_segs:
segs->next = NULL;
qdisc_skb_cb(segs)->pkt_len = segs->len;
last_len = segs->len;
- rc = qdisc_enqueue(segs, sch);
+ rc = qdisc_enqueue(segs, sch, to_free);
if (rc != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(rc))
qdisc_qstats_drop(sch);
@@ -617,14 +621,17 @@ deliver:
#endif
if (q->qdisc) {
- int err = qdisc_enqueue(skb, q->qdisc);
-
- if (unlikely(err != NET_XMIT_SUCCESS)) {
- if (net_xmit_drop_count(err)) {
- qdisc_qstats_drop(sch);
- qdisc_tree_reduce_backlog(sch, 1,
- qdisc_pkt_len(skb));
- }
+ unsigned int pkt_len = qdisc_pkt_len(skb);
+ struct sk_buff *to_free = NULL;
+ int err;
+
+ err = qdisc_enqueue(skb, q->qdisc, &to_free);
+ kfree_skb_list(to_free);
+ if (err != NET_XMIT_SUCCESS &&
+ net_xmit_drop_count(err)) {
+ qdisc_qstats_drop(sch);
+ qdisc_tree_reduce_backlog(sch, 1,
+ pkt_len);
}
goto tfifo_dequeue;
}
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 71ae3b9..a570b0b 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -134,7 +134,8 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size)
return false;
}
-static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct pie_sched_data *q = qdisc_priv(sch);
bool enqueue = false;
@@ -166,7 +167,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
out:
q->stats.dropped++;
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
@@ -234,7 +235,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
- qdisc_drop(skb, sch);
+ rtnl_qdisc_drop(skb, sch);
}
qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
index a12cd37..1c6cbab 100644
--- a/net/sched/sch_plug.c
+++ b/net/sched/sch_plug.c
@@ -88,7 +88,8 @@ struct plug_sched_data {
u32 pkts_to_release;
};
-static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct plug_sched_data *q = qdisc_priv(sch);
@@ -98,7 +99,7 @@ static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return qdisc_enqueue_tail(skb, sch);
}
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
static struct sk_buff *plug_dequeue(struct Qdisc *sch)
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index de49268..8f57589 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -67,7 +67,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
}
static int
-prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
struct Qdisc *qdisc;
int ret;
@@ -83,7 +83,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
#endif
- ret = qdisc_enqueue(skb, qdisc);
+ ret = qdisc_enqueue(skb, qdisc, to_free);
if (ret == NET_XMIT_SUCCESS) {
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
@@ -153,8 +153,9 @@ prio_destroy(struct Qdisc *sch)
static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
{
struct prio_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *queues[TCQ_PRIO_BANDS];
+ int oldbands = q->bands, i;
struct tc_prio_qopt *qopt;
- int i;
if (nla_len(opt) < sizeof(*qopt))
return -EINVAL;
@@ -168,62 +169,42 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
return -EINVAL;
}
+ /* Before commit, make sure we can allocate all new qdiscs */
+ for (i = oldbands; i < qopt->bands; i++) {
+ queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, i + 1));
+ if (!queues[i]) {
+ while (i > oldbands)
+ qdisc_destroy(queues[--i]);
+ return -ENOMEM;
+ }
+ }
+
sch_tree_lock(sch);
q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
- for (i = q->bands; i < TCQ_PRIO_BANDS; i++) {
+ for (i = q->bands; i < oldbands; i++) {
struct Qdisc *child = q->queues[i];
- q->queues[i] = &noop_qdisc;
- if (child != &noop_qdisc) {
- qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog);
- qdisc_destroy(child);
- }
- }
- sch_tree_unlock(sch);
- for (i = 0; i < q->bands; i++) {
- if (q->queues[i] == &noop_qdisc) {
- struct Qdisc *child, *old;
-
- child = qdisc_create_dflt(sch->dev_queue,
- &pfifo_qdisc_ops,
- TC_H_MAKE(sch->handle, i + 1));
- if (child) {
- sch_tree_lock(sch);
- old = q->queues[i];
- q->queues[i] = child;
-
- if (old != &noop_qdisc) {
- qdisc_tree_reduce_backlog(old,
- old->q.qlen,
- old->qstats.backlog);
- qdisc_destroy(old);
- }
- sch_tree_unlock(sch);
- }
- }
+ qdisc_tree_reduce_backlog(child, child->q.qlen,
+ child->qstats.backlog);
+ qdisc_destroy(child);
}
+
+ for (i = oldbands; i < q->bands; i++)
+ q->queues[i] = queues[i];
+
+ sch_tree_unlock(sch);
return 0;
}
static int prio_init(struct Qdisc *sch, struct nlattr *opt)
{
- struct prio_sched_data *q = qdisc_priv(sch);
- int i;
-
- for (i = 0; i < TCQ_PRIO_BANDS; i++)
- q->queues[i] = &noop_qdisc;
-
- if (opt == NULL) {
+ if (!opt)
return -EINVAL;
- } else {
- int err;
- if ((err = prio_tune(sch, opt)) != 0)
- return err;
- }
- return 0;
+ return prio_tune(sch, opt);
}
static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 0427fa8..f27ffee 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1217,7 +1217,8 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
return agg;
}
-static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct qfq_sched *q = qdisc_priv(sch);
struct qfq_class *cl;
@@ -1240,11 +1241,11 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
qdisc_pkt_len(skb));
if (err) {
cl->qstats.drops++;
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
}
- err = qdisc_enqueue(skb, cl->qdisc);
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
pr_debug("qfq_enqueue: enqueue failed %d\n", err);
if (net_xmit_drop_count(err)) {
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index a0d5753..249b2a1 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -56,7 +56,8 @@ static inline int red_use_harddrop(struct red_sched_data *q)
return q->flags & TC_RED_HARDDROP;
}
-static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
@@ -95,7 +96,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
break;
}
- ret = qdisc_enqueue(skb, child);
+ ret = qdisc_enqueue(skb, child, to_free);
if (likely(ret == NET_XMIT_SUCCESS)) {
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
@@ -106,7 +107,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return ret;
congestion_drop:
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NET_XMIT_CN;
}
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index c696116..add3cc7 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -275,7 +275,8 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
return false;
}
-static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct sfb_sched_data *q = qdisc_priv(sch);
@@ -397,7 +398,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
enqueue:
- ret = qdisc_enqueue(skb, child);
+ ret = qdisc_enqueue(skb, child, to_free);
if (likely(ret == NET_XMIT_SUCCESS)) {
sch->q.qlen++;
increment_qlen(skb, q);
@@ -408,7 +409,7 @@ enqueue:
return ret;
drop:
- qdisc_drop(skb, sch);
+ qdisc_drop(skb, sch, to_free);
return NET_XMIT_CN;
other_drop:
if (ret & __NET_XMIT_BYPASS)
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index a2e0b85..7f195ed 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -343,7 +343,7 @@ static int sfq_headdrop(const struct sfq_sched_data *q)
}
static int
-sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
struct sfq_sched_data *q = qdisc_priv(sch);
unsigned int hash, dropped;
@@ -367,7 +367,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (x == SFQ_EMPTY_SLOT) {
x = q->dep[0].next; /* get a free slot */
if (x >= SFQ_MAX_FLOWS)
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
q->ht[hash] = x;
slot = &q->slots[x];
slot->hash = hash;
@@ -424,14 +424,14 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (slot->qlen >= q->maxdepth) {
congestion_drop:
if (!sfq_headdrop(q))
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
/* We know we have at least one packet in queue */
head = slot_dequeue_head(slot);
delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
sch->qstats.backlog -= delta;
slot->backlog -= delta;
- qdisc_drop(head, sch);
+ qdisc_drop(head, sch, to_free);
slot_queue_add(slot, skb);
return NET_XMIT_CN;
@@ -520,7 +520,7 @@ sfq_reset(struct Qdisc *sch)
struct sk_buff *skb;
while ((skb = sfq_dequeue(sch)) != NULL)
- kfree_skb(skb);
+ rtnl_kfree_skbs(skb, skb);
}
/*
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index c12df84..303355c 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -155,7 +155,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
/* GSO packet is too big, segment it so that tbf can transmit
* each segment in time
*/
-static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
+static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct tbf_sched_data *q = qdisc_priv(sch);
struct sk_buff *segs, *nskb;
@@ -166,7 +167,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs))
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
nb = 0;
while (segs) {
@@ -174,7 +175,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
segs->next = NULL;
qdisc_skb_cb(segs)->pkt_len = segs->len;
len += segs->len;
- ret = qdisc_enqueue(segs, q->qdisc);
+ ret = qdisc_enqueue(segs, q->qdisc, to_free);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
@@ -190,17 +191,18 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
-static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct tbf_sched_data *q = qdisc_priv(sch);
int ret;
if (qdisc_pkt_len(skb) > q->max_size) {
if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
- return tbf_segment(skb, sch);
- return qdisc_drop(skb, sch);
+ return tbf_segment(skb, sch, to_free);
+ return qdisc_drop(skb, sch, to_free);
}
- ret = qdisc_enqueue(skb, q->qdisc);
+ ret = qdisc_enqueue(skb, q->qdisc, to_free);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index e026871..2cd9b44 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -77,7 +77,7 @@ struct teql_sched_data {
/* "teql*" qdisc routines */
static int
-teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
struct net_device *dev = qdisc_dev(sch);
struct teql_sched_data *q = qdisc_priv(sch);
@@ -87,7 +87,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return NET_XMIT_SUCCESS;
}
- return qdisc_drop(skb, sch);
+ return qdisc_drop(skb, sch, to_free);
}
static struct sk_buff *
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 40022ee..3b56ae5 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1479,7 +1479,8 @@ static __init int sctp_init(void)
INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain);
}
- if (sctp_transport_hashtable_init())
+ status = sctp_transport_hashtable_init();
+ if (status)
goto err_thash_alloc;
pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize,
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index 1ce724b..f69edcf 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -3,12 +3,6 @@
#include <linux/sock_diag.h>
#include <net/sctp/sctp.h>
-extern void inet_diag_msg_common_fill(struct inet_diag_msg *r,
- struct sock *sk);
-extern int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
- struct inet_diag_msg *r, int ext,
- struct user_namespace *user_ns);
-
static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
void *info);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 6cae4c6..cdabbd8 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7568,7 +7568,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
* is called, set RCV_SHUTDOWN flag.
*/
if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) {
- newsk->sk_state = SCTP_SS_CLOSING;
+ newsk->sk_state = SCTP_SS_CLOSED;
newsk->sk_shutdown |= RCV_SHUTDOWN;
} else {
newsk->sk_state = SCTP_SS_ESTABLISHED;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 06b4df9..2808d55 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -446,16 +446,27 @@ out_no_rpciod:
return ERR_PTR(err);
}
-struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
+static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
struct rpc_xprt *xprt)
{
struct rpc_clnt *clnt = NULL;
struct rpc_xprt_switch *xps;
- xps = xprt_switch_alloc(xprt, GFP_KERNEL);
- if (xps == NULL)
- return ERR_PTR(-ENOMEM);
-
+ if (args->bc_xprt && args->bc_xprt->xpt_bc_xps) {
+ WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP);
+ xps = args->bc_xprt->xpt_bc_xps;
+ xprt_switch_get(xps);
+ } else {
+ xps = xprt_switch_alloc(xprt, GFP_KERNEL);
+ if (xps == NULL) {
+ xprt_put(xprt);
+ return ERR_PTR(-ENOMEM);
+ }
+ if (xprt->bc_xprt) {
+ xprt_switch_get(xps);
+ xprt->bc_xprt->xpt_bc_xps = xps;
+ }
+ }
clnt = rpc_new_client(args, xps, xprt, NULL);
if (IS_ERR(clnt))
return clnt;
@@ -483,7 +494,6 @@ struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
return clnt;
}
-EXPORT_SYMBOL_GPL(rpc_create_xprt);
/**
* rpc_create - create an RPC client and transport with one call
@@ -509,6 +519,15 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
};
char servername[48];
+ if (args->bc_xprt) {
+ WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP);
+ xprt = args->bc_xprt->xpt_bc_xprt;
+ if (xprt) {
+ xprt_get(xprt);
+ return rpc_create_xprt(args, xprt);
+ }
+ }
+
if (args->flags & RPC_CLNT_CREATE_INFINITE_SLOTS)
xprtargs.flags |= XPRT_CREATE_INFINITE_SLOTS;
if (args->flags & RPC_CLNT_CREATE_NO_IDLE_TIMEOUT)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index f5572e3..4f01f63 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -136,6 +136,8 @@ static void svc_xprt_free(struct kref *kref)
/* See comment on corresponding get in xs_setup_bc_tcp(): */
if (xprt->xpt_bc_xprt)
xprt_put(xprt->xpt_bc_xprt);
+ if (xprt->xpt_bc_xps)
+ xprt_switch_put(xprt->xpt_bc_xps);
xprt->xpt_ops->xpo_free(xprt);
module_put(owner);
}
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 2d3e0c4..7e2b2fa 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -3057,6 +3057,7 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
return xprt;
args->bc_xprt->xpt_bc_xprt = NULL;
+ args->bc_xprt->xpt_bc_xps = NULL;
xprt_put(xprt);
ret = ERR_PTR(-EINVAL);
out_err:
diff --git a/net/tipc/Makefile b/net/tipc/Makefile
index 57e460b..31b9f9c 100644
--- a/net/tipc/Makefile
+++ b/net/tipc/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_TIPC) := tipc.o
tipc-y += addr.o bcast.o bearer.o \
core.o link.o discover.o msg.o \
- name_distr.o subscr.o name_table.o net.o \
+ name_distr.o subscr.o monitor.o name_table.o net.o \
netlink.o netlink_compat.o node.o socket.o eth_media.o \
server.o socket.o
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index 93f7c98..64f4004 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -73,4 +73,5 @@ int tipc_addr_node_valid(u32 addr);
int tipc_in_scope(u32 domain, u32 addr);
int tipc_addr_scope(u32 domain);
char *tipc_addr_string_fill(char *string, u32 addr);
+
#endif
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 6f11c62..8584cc4 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -1,7 +1,7 @@
/*
* net/tipc/bearer.c: TIPC bearer code
*
- * Copyright (c) 1996-2006, 2013-2014, Ericsson AB
+ * Copyright (c) 1996-2006, 2013-2016, Ericsson AB
* Copyright (c) 2004-2006, 2010-2013, Wind River Systems
* All rights reserved.
*
@@ -39,6 +39,7 @@
#include "bearer.h"
#include "link.h"
#include "discover.h"
+#include "monitor.h"
#include "bcast.h"
#include "netlink.h"
@@ -313,6 +314,10 @@ restart:
rcu_assign_pointer(tn->bearer_list[bearer_id], b);
if (skb)
tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr);
+
+ if (tipc_mon_create(net, bearer_id))
+ return -ENOMEM;
+
pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
name,
tipc_addr_string_fill(addr_string, disc_domain), priority);
@@ -348,6 +353,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b)
tipc_disc_delete(b->link_req);
RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL);
kfree_rcu(b, rcu);
+ tipc_mon_delete(net, bearer_id);
}
int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
@@ -405,7 +411,7 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
return 0;
/* Send RESET message even if bearer is detached from device */
- tipc_ptr = rtnl_dereference(dev->tipc_ptr);
+ tipc_ptr = rcu_dereference_rtnl(dev->tipc_ptr);
if (unlikely(!tipc_ptr && !msg_is_reset(buf_msg(skb))))
goto drop;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index f686e41..0d337c7 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -1,7 +1,7 @@
/*
* net/tipc/bearer.h: Include file for TIPC bearer code
*
- * Copyright (c) 1996-2006, 2013-2014, Ericsson AB
+ * Copyright (c) 1996-2006, 2013-2016, Ericsson AB
* Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
diff --git a/net/tipc/core.c b/net/tipc/core.c
index fe1b062..236b043 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -57,6 +57,7 @@ static int __net_init tipc_init_net(struct net *net)
tn->net_id = 4711;
tn->own_addr = 0;
+ tn->mon_threshold = TIPC_DEF_MON_THRESHOLD;
get_random_bytes(&tn->random, sizeof(int));
INIT_LIST_HEAD(&tn->node_list);
spin_lock_init(&tn->node_list_lock);
diff --git a/net/tipc/core.h b/net/tipc/core.h
index eff58dc..a1845fb 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -66,11 +66,13 @@ struct tipc_bc_base;
struct tipc_link;
struct tipc_name_table;
struct tipc_server;
+struct tipc_monitor;
#define TIPC_MOD_VER "2.0.0"
-#define NODE_HTABLE_SIZE 512
-#define MAX_BEARERS 3
+#define NODE_HTABLE_SIZE 512
+#define MAX_BEARERS 3
+#define TIPC_DEF_MON_THRESHOLD 32
extern int tipc_net_id __read_mostly;
extern int sysctl_tipc_rmem[3] __read_mostly;
@@ -88,6 +90,10 @@ struct tipc_net {
u32 num_nodes;
u32 num_links;
+ /* Neighbor monitoring list */
+ struct tipc_monitor *monitors[MAX_BEARERS];
+ int mon_threshold;
+
/* Bearer list */
struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1];
@@ -126,6 +132,11 @@ static inline struct list_head *tipc_nodes(struct net *net)
return &tipc_net(net)->node_list;
}
+static inline unsigned int tipc_hashfn(u32 addr)
+{
+ return addr & (NODE_HTABLE_SIZE - 1);
+}
+
static inline u16 mod(u16 x)
{
return x & 0xffffu;
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index ad9d477..6b109a8 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -135,9 +135,12 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
u16 caps = msg_node_capabilities(hdr);
bool respond = false;
bool dupl_addr = false;
+ int err;
- bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr));
+ err = bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr));
kfree_skb(skb);
+ if (err)
+ return;
/* Ensure message from node is valid and communication is permitted */
if (net_id != tn->net_id)
diff --git a/net/tipc/link.c b/net/tipc/link.c
index a904ccd..c1df33f 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -42,6 +42,7 @@
#include "name_distr.h"
#include "discover.h"
#include "netlink.h"
+#include "monitor.h"
#include <linux/pkt_sched.h>
@@ -95,6 +96,7 @@ struct tipc_stats {
* @pmsg: convenience pointer to "proto_msg" field
* @priority: current link priority
* @net_plane: current link network plane ('A' through 'H')
+ * @mon_state: cookie with information needed by link monitor
* @backlog_limit: backlog queue congestion thresholds (indexed by importance)
* @exp_msg_count: # of tunnelled messages expected during link changeover
* @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset
@@ -138,6 +140,7 @@ struct tipc_link {
char if_name[TIPC_MAX_IF_NAME];
u32 priority;
char net_plane;
+ struct tipc_mon_state mon_state;
u16 rst_cnt;
/* Failover/synch */
@@ -702,24 +705,32 @@ static void link_profile_stats(struct tipc_link *l)
*/
int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
{
- int mtyp, rc = 0;
+ int mtyp = 0;
+ int rc = 0;
bool state = false;
bool probe = false;
bool setup = false;
u16 bc_snt = l->bc_sndlink->snd_nxt - 1;
u16 bc_acked = l->bc_rcvlink->acked;
-
- link_profile_stats(l);
+ struct tipc_mon_state *mstate = &l->mon_state;
switch (l->state) {
case LINK_ESTABLISHED:
case LINK_SYNCHING:
- if (l->silent_intv_cnt > l->abort_limit)
- return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
mtyp = STATE_MSG;
+ link_profile_stats(l);
+ tipc_mon_get_state(l->net, l->addr, mstate, l->bearer_id);
+ if (mstate->reset || (l->silent_intv_cnt > l->abort_limit))
+ return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
state = bc_acked != bc_snt;
- probe = l->silent_intv_cnt;
- l->silent_intv_cnt++;
+ state |= l->bc_rcvlink->rcv_unacked;
+ state |= l->rcv_unacked;
+ state |= !skb_queue_empty(&l->transmq);
+ state |= !skb_queue_empty(&l->deferdq);
+ probe = mstate->probing;
+ probe |= l->silent_intv_cnt;
+ if (probe || mstate->monitoring)
+ l->silent_intv_cnt++;
break;
case LINK_RESET:
setup = l->rst_cnt++ <= 4;
@@ -830,6 +841,7 @@ void tipc_link_reset(struct tipc_link *l)
l->stats.recv_info = 0;
l->stale_count = 0;
l->bc_peer_is_up = false;
+ memset(&l->mon_state, 0, sizeof(l->mon_state));
tipc_link_reset_stats(l);
}
@@ -1238,6 +1250,9 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
struct tipc_msg *hdr;
struct sk_buff_head *dfq = &l->deferdq;
bool node_up = link_is_up(l->bc_rcvlink);
+ struct tipc_mon_state *mstate = &l->mon_state;
+ int dlen = 0;
+ void *data;
/* Don't send protocol message during reset or link failover */
if (tipc_link_is_blocked(l))
@@ -1250,12 +1265,13 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt;
skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE,
- TIPC_MAX_IF_NAME, l->addr,
+ tipc_max_domain_size, l->addr,
tipc_own_addr(l->net), 0, 0, 0);
if (!skb)
return;
hdr = buf_msg(skb);
+ data = msg_data(hdr);
msg_set_session(hdr, l->session);
msg_set_bearer_id(hdr, l->bearer_id);
msg_set_net_plane(hdr, l->net_plane);
@@ -1271,14 +1287,18 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
if (mtyp == STATE_MSG) {
msg_set_seq_gap(hdr, rcvgap);
- msg_set_size(hdr, INT_H_SIZE);
msg_set_probe(hdr, probe);
+ tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id);
+ msg_set_size(hdr, INT_H_SIZE + dlen);
+ skb_trim(skb, INT_H_SIZE + dlen);
l->stats.sent_states++;
l->rcv_unacked = 0;
} else {
/* RESET_MSG or ACTIVATE_MSG */
msg_set_max_pkt(hdr, l->advertised_mtu);
- strcpy(msg_data(hdr), l->if_name);
+ strcpy(data, l->if_name);
+ msg_set_size(hdr, INT_H_SIZE + TIPC_MAX_IF_NAME);
+ skb_trim(skb, INT_H_SIZE + TIPC_MAX_IF_NAME);
}
if (probe)
l->stats.sent_probes++;
@@ -1371,7 +1391,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
u16 peers_tol = msg_link_tolerance(hdr);
u16 peers_prio = msg_linkprio(hdr);
u16 rcv_nxt = l->rcv_nxt;
+ u16 dlen = msg_data_sz(hdr);
int mtyp = msg_type(hdr);
+ void *data;
char *if_name;
int rc = 0;
@@ -1381,6 +1403,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
if (tipc_own_addr(l->net) > msg_prevnode(hdr))
l->net_plane = msg_net_plane(hdr);
+ skb_linearize(skb);
+ hdr = buf_msg(skb);
+ data = msg_data(hdr);
+
switch (mtyp) {
case RESET_MSG:
@@ -1391,8 +1417,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
/* fall thru' */
case ACTIVATE_MSG:
- skb_linearize(skb);
- hdr = buf_msg(skb);
/* Complete own link name with peer's interface name */
if_name = strrchr(l->name, ':') + 1;
@@ -1400,7 +1424,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
break;
if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME)
break;
- strncpy(if_name, msg_data(hdr), TIPC_MAX_IF_NAME);
+ strncpy(if_name, data, TIPC_MAX_IF_NAME);
/* Update own tolerance if peer indicates a non-zero value */
if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
@@ -1448,6 +1472,8 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
rc = TIPC_LINK_UP_EVT;
break;
}
+ tipc_mon_rcv(l->net, data, dlen, l->addr,
+ &l->mon_state, l->bearer_id);
/* Send NACK if peer has sent pkts we haven't received yet */
if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l))
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
new file mode 100644
index 0000000..0d489e8
--- /dev/null
+++ b/net/tipc/monitor.c
@@ -0,0 +1,651 @@
+/*
+ * net/tipc/monitor.c
+ *
+ * Copyright (c) 2016, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "addr.h"
+#include "monitor.h"
+
+#define MAX_MON_DOMAIN 64
+#define MON_TIMEOUT 120000
+#define MAX_PEER_DOWN_EVENTS 4
+
+/* struct tipc_mon_domain: domain record to be transferred between peers
+ * @len: actual size of domain record
+ * @gen: current generation of sender's domain
+ * @ack_gen: most recent generation of self's domain acked by peer
+ * @member_cnt: number of domain member nodes described in this record
+ * @up_map: bit map indicating which of the members the sender considers up
+ * @members: identity of the domain members
+ */
+struct tipc_mon_domain {
+ u16 len;
+ u16 gen;
+ u16 ack_gen;
+ u16 member_cnt;
+ u64 up_map;
+ u32 members[MAX_MON_DOMAIN];
+};
+
+/* struct tipc_peer: state of a peer node and its domain
+ * @addr: tipc node identity of peer
+ * @head_map: shows which other nodes currently consider peer 'up'
+ * @domain: most recent domain record from peer
+ * @hash: position in hashed lookup list
+ * @list: position in linked list, in circular ascending order by 'addr'
+ * @applied: number of reported domain members applied on this monitor list
+ * @is_up: peer is up as seen from this node
+ * @is_head: peer is assigned domain head as seen from this node
+ * @is_local: peer is in local domain and should be continuously monitored
+ * @down_cnt: - numbers of other peers which have reported this on lost
+ */
+struct tipc_peer {
+ u32 addr;
+ struct tipc_mon_domain *domain;
+ struct hlist_node hash;
+ struct list_head list;
+ u8 applied;
+ u8 down_cnt;
+ bool is_up;
+ bool is_head;
+ bool is_local;
+};
+
+struct tipc_monitor {
+ struct hlist_head peers[NODE_HTABLE_SIZE];
+ int peer_cnt;
+ struct tipc_peer *self;
+ rwlock_t lock;
+ struct tipc_mon_domain cache;
+ u16 list_gen;
+ u16 dom_gen;
+ struct net *net;
+ struct timer_list timer;
+ unsigned long timer_intv;
+};
+
+static struct tipc_monitor *tipc_monitor(struct net *net, int bearer_id)
+{
+ return tipc_net(net)->monitors[bearer_id];
+}
+
+const int tipc_max_domain_size = sizeof(struct tipc_mon_domain);
+
+/* dom_rec_len(): actual length of domain record for transport
+ */
+static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt)
+{
+ return ((void *)&dom->members - (void *)dom) + (mcnt * sizeof(u32));
+}
+
+/* dom_size() : calculate size of own domain based on number of peers
+ */
+static int dom_size(int peers)
+{
+ int i = 0;
+
+ while ((i * i) < peers)
+ i++;
+ return i < MAX_MON_DOMAIN ? i : MAX_MON_DOMAIN;
+}
+
+static void map_set(u64 *up_map, int i, unsigned int v)
+{
+ *up_map &= ~(1ULL << i);
+ *up_map |= ((u64)v << i);
+}
+
+static int map_get(u64 up_map, int i)
+{
+ return (up_map & (1 << i)) >> i;
+}
+
+static struct tipc_peer *peer_prev(struct tipc_peer *peer)
+{
+ return list_last_entry(&peer->list, struct tipc_peer, list);
+}
+
+static struct tipc_peer *peer_nxt(struct tipc_peer *peer)
+{
+ return list_first_entry(&peer->list, struct tipc_peer, list);
+}
+
+static struct tipc_peer *peer_head(struct tipc_peer *peer)
+{
+ while (!peer->is_head)
+ peer = peer_prev(peer);
+ return peer;
+}
+
+static struct tipc_peer *get_peer(struct tipc_monitor *mon, u32 addr)
+{
+ struct tipc_peer *peer;
+ unsigned int thash = tipc_hashfn(addr);
+
+ hlist_for_each_entry(peer, &mon->peers[thash], hash) {
+ if (peer->addr == addr)
+ return peer;
+ }
+ return NULL;
+}
+
+static struct tipc_peer *get_self(struct net *net, int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+
+ return mon->self;
+}
+
+static inline bool tipc_mon_is_active(struct net *net, struct tipc_monitor *mon)
+{
+ struct tipc_net *tn = tipc_net(net);
+
+ return mon->peer_cnt > tn->mon_threshold;
+}
+
+/* mon_identify_lost_members() : - identify amd mark potentially lost members
+ */
+static void mon_identify_lost_members(struct tipc_peer *peer,
+ struct tipc_mon_domain *dom_bef,
+ int applied_bef)
+{
+ struct tipc_peer *member = peer;
+ struct tipc_mon_domain *dom_aft = peer->domain;
+ int applied_aft = peer->applied;
+ int i;
+
+ for (i = 0; i < applied_bef; i++) {
+ member = peer_nxt(member);
+
+ /* Do nothing if self or peer already see member as down */
+ if (!member->is_up || !map_get(dom_bef->up_map, i))
+ continue;
+
+ /* Loss of local node must be detected by active probing */
+ if (member->is_local)
+ continue;
+
+ /* Start probing if member was removed from applied domain */
+ if (!applied_aft || (applied_aft < i)) {
+ member->down_cnt = 1;
+ continue;
+ }
+
+ /* Member loss is confirmed if it is still in applied domain */
+ if (!map_get(dom_aft->up_map, i))
+ member->down_cnt++;
+ }
+}
+
+/* mon_apply_domain() : match a peer's domain record against monitor list
+ */
+static void mon_apply_domain(struct tipc_monitor *mon,
+ struct tipc_peer *peer)
+{
+ struct tipc_mon_domain *dom = peer->domain;
+ struct tipc_peer *member;
+ u32 addr;
+ int i;
+
+ if (!dom || !peer->is_up)
+ return;
+
+ /* Scan across domain members and match against monitor list */
+ peer->applied = 0;
+ member = peer_nxt(peer);
+ for (i = 0; i < dom->member_cnt; i++) {
+ addr = dom->members[i];
+ if (addr != member->addr)
+ return;
+ peer->applied++;
+ member = peer_nxt(member);
+ }
+}
+
+/* mon_update_local_domain() : update after peer addition/removal/up/down
+ */
+static void mon_update_local_domain(struct tipc_monitor *mon)
+{
+ struct tipc_peer *self = mon->self;
+ struct tipc_mon_domain *cache = &mon->cache;
+ struct tipc_mon_domain *dom = self->domain;
+ struct tipc_peer *peer = self;
+ u64 prev_up_map = dom->up_map;
+ u16 member_cnt, i;
+ bool diff;
+
+ /* Update local domain size based on current size of cluster */
+ member_cnt = dom_size(mon->peer_cnt) - 1;
+ self->applied = member_cnt;
+
+ /* Update native and cached outgoing local domain records */
+ dom->len = dom_rec_len(dom, member_cnt);
+ diff = dom->member_cnt != member_cnt;
+ dom->member_cnt = member_cnt;
+ for (i = 0; i < member_cnt; i++) {
+ peer = peer_nxt(peer);
+ diff |= dom->members[i] != peer->addr;
+ dom->members[i] = peer->addr;
+ map_set(&dom->up_map, i, peer->is_up);
+ cache->members[i] = htonl(peer->addr);
+ }
+ diff |= dom->up_map != prev_up_map;
+ if (!diff)
+ return;
+ dom->gen = ++mon->dom_gen;
+ cache->len = htons(dom->len);
+ cache->gen = htons(dom->gen);
+ cache->member_cnt = htons(member_cnt);
+ cache->up_map = cpu_to_be64(dom->up_map);
+ mon_apply_domain(mon, self);
+}
+
+/* mon_update_neighbors() : update preceding neighbors of added/removed peer
+ */
+static void mon_update_neighbors(struct tipc_monitor *mon,
+ struct tipc_peer *peer)
+{
+ int dz, i;
+
+ dz = dom_size(mon->peer_cnt);
+ for (i = 0; i < dz; i++) {
+ mon_apply_domain(mon, peer);
+ peer = peer_prev(peer);
+ }
+}
+
+/* mon_assign_roles() : reassign peer roles after a network change
+ * The monitor list is consistent at this stage; i.e., each peer is monitoring
+ * a set of domain members as matched between domain record and the monitor list
+ */
+static void mon_assign_roles(struct tipc_monitor *mon, struct tipc_peer *head)
+{
+ struct tipc_peer *peer = peer_nxt(head);
+ struct tipc_peer *self = mon->self;
+ int i = 0;
+
+ for (; peer != self; peer = peer_nxt(peer)) {
+ peer->is_local = false;
+
+ /* Update domain member */
+ if (i++ < head->applied) {
+ peer->is_head = false;
+ if (head == self)
+ peer->is_local = true;
+ continue;
+ }
+ /* Assign next domain head */
+ if (!peer->is_up)
+ continue;
+ if (peer->is_head)
+ break;
+ head = peer;
+ head->is_head = true;
+ i = 0;
+ }
+ mon->list_gen++;
+}
+
+void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_peer *self = get_self(net, bearer_id);
+ struct tipc_peer *peer, *prev, *head;
+
+ write_lock_bh(&mon->lock);
+ peer = get_peer(mon, addr);
+ if (!peer)
+ goto exit;
+ prev = peer_prev(peer);
+ list_del(&peer->list);
+ hlist_del(&peer->hash);
+ kfree(peer->domain);
+ kfree(peer);
+ mon->peer_cnt--;
+ head = peer_head(prev);
+ if (head == self)
+ mon_update_local_domain(mon);
+ mon_update_neighbors(mon, prev);
+
+ /* Revert to full-mesh monitoring if we reach threshold */
+ if (!tipc_mon_is_active(net, mon)) {
+ list_for_each_entry(peer, &self->list, list) {
+ kfree(peer->domain);
+ peer->domain = NULL;
+ peer->applied = 0;
+ }
+ }
+ mon_assign_roles(mon, head);
+exit:
+ write_unlock_bh(&mon->lock);
+}
+
+static bool tipc_mon_add_peer(struct tipc_monitor *mon, u32 addr,
+ struct tipc_peer **peer)
+{
+ struct tipc_peer *self = mon->self;
+ struct tipc_peer *cur, *prev, *p;
+
+ p = kzalloc(sizeof(*p), GFP_ATOMIC);
+ *peer = p;
+ if (!p)
+ return false;
+ p->addr = addr;
+
+ /* Add new peer to lookup list */
+ INIT_LIST_HEAD(&p->list);
+ hlist_add_head(&p->hash, &mon->peers[tipc_hashfn(addr)]);
+
+ /* Sort new peer into iterator list, in ascending circular order */
+ prev = self;
+ list_for_each_entry(cur, &self->list, list) {
+ if ((addr > prev->addr) && (addr < cur->addr))
+ break;
+ if (((addr < cur->addr) || (addr > prev->addr)) &&
+ (prev->addr > cur->addr))
+ break;
+ prev = cur;
+ }
+ list_add_tail(&p->list, &cur->list);
+ mon->peer_cnt++;
+ mon_update_neighbors(mon, p);
+ return true;
+}
+
+void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_peer *self = get_self(net, bearer_id);
+ struct tipc_peer *peer, *head;
+
+ write_lock_bh(&mon->lock);
+ peer = get_peer(mon, addr);
+ if (!peer && !tipc_mon_add_peer(mon, addr, &peer))
+ goto exit;
+ peer->is_up = true;
+ head = peer_head(peer);
+ if (head == self)
+ mon_update_local_domain(mon);
+ mon_assign_roles(mon, head);
+exit:
+ write_unlock_bh(&mon->lock);
+}
+
+void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_peer *self = get_self(net, bearer_id);
+ struct tipc_peer *peer, *head;
+ struct tipc_mon_domain *dom;
+ int applied;
+
+ write_lock_bh(&mon->lock);
+ peer = get_peer(mon, addr);
+ if (!peer) {
+ pr_warn("Mon: unknown link %x/%u DOWN\n", addr, bearer_id);
+ goto exit;
+ }
+ applied = peer->applied;
+ peer->applied = 0;
+ dom = peer->domain;
+ peer->domain = NULL;
+ if (peer->is_head)
+ mon_identify_lost_members(peer, dom, applied);
+ kfree(dom);
+ peer->is_up = false;
+ peer->is_head = false;
+ peer->is_local = false;
+ peer->down_cnt = 0;
+ head = peer_head(peer);
+ if (head == self)
+ mon_update_local_domain(mon);
+ mon_assign_roles(mon, head);
+exit:
+ write_unlock_bh(&mon->lock);
+}
+
+/* tipc_mon_rcv - process monitor domain event message
+ */
+void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr,
+ struct tipc_mon_state *state, int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_mon_domain *arrv_dom = data;
+ struct tipc_mon_domain dom_bef;
+ struct tipc_mon_domain *dom;
+ struct tipc_peer *peer;
+ u16 new_member_cnt = ntohs(arrv_dom->member_cnt);
+ int new_dlen = dom_rec_len(arrv_dom, new_member_cnt);
+ u16 new_gen = ntohs(arrv_dom->gen);
+ u16 acked_gen = ntohs(arrv_dom->ack_gen);
+ bool probing = state->probing;
+ int i, applied_bef;
+
+ state->probing = false;
+ if (!dlen)
+ return;
+
+ /* Sanity check received domain record */
+ if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen) {
+ pr_warn_ratelimited("Received illegal domain record\n");
+ return;
+ }
+
+ /* Synch generation numbers with peer if link just came up */
+ if (!state->synched) {
+ state->peer_gen = new_gen - 1;
+ state->acked_gen = acked_gen;
+ state->synched = true;
+ }
+
+ if (more(acked_gen, state->acked_gen))
+ state->acked_gen = acked_gen;
+
+ /* Drop duplicate unless we are waiting for a probe response */
+ if (!more(new_gen, state->peer_gen) && !probing)
+ return;
+
+ write_lock_bh(&mon->lock);
+ peer = get_peer(mon, addr);
+ if (!peer || !peer->is_up)
+ goto exit;
+
+ /* Peer is confirmed, stop any ongoing probing */
+ peer->down_cnt = 0;
+
+ /* Task is done for duplicate record */
+ if (!more(new_gen, state->peer_gen))
+ goto exit;
+
+ state->peer_gen = new_gen;
+
+ /* Cache current domain record for later use */
+ dom_bef.member_cnt = 0;
+ dom = peer->domain;
+ if (dom)
+ memcpy(&dom_bef, dom, dom->len);
+
+ /* Transform and store received domain record */
+ if (!dom || (dom->len < new_dlen)) {
+ kfree(dom);
+ dom = kmalloc(new_dlen, GFP_ATOMIC);
+ peer->domain = dom;
+ if (!dom)
+ goto exit;
+ }
+ dom->len = new_dlen;
+ dom->gen = new_gen;
+ dom->member_cnt = new_member_cnt;
+ dom->up_map = be64_to_cpu(arrv_dom->up_map);
+ for (i = 0; i < new_member_cnt; i++)
+ dom->members[i] = ntohl(arrv_dom->members[i]);
+
+ /* Update peers affected by this domain record */
+ applied_bef = peer->applied;
+ mon_apply_domain(mon, peer);
+ mon_identify_lost_members(peer, &dom_bef, applied_bef);
+ mon_assign_roles(mon, peer_head(peer));
+exit:
+ write_unlock_bh(&mon->lock);
+}
+
+void tipc_mon_prep(struct net *net, void *data, int *dlen,
+ struct tipc_mon_state *state, int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_mon_domain *dom = data;
+ u16 gen = mon->dom_gen;
+ u16 len;
+
+ if (!tipc_mon_is_active(net, mon))
+ return;
+
+ /* Send only a dummy record with ack if peer has acked our last sent */
+ if (likely(state->acked_gen == gen)) {
+ len = dom_rec_len(dom, 0);
+ *dlen = len;
+ dom->len = htons(len);
+ dom->gen = htons(gen);
+ dom->ack_gen = htons(state->peer_gen);
+ dom->member_cnt = 0;
+ return;
+ }
+ /* Send the full record */
+ read_lock_bh(&mon->lock);
+ len = ntohs(mon->cache.len);
+ *dlen = len;
+ memcpy(data, &mon->cache, len);
+ read_unlock_bh(&mon->lock);
+ dom->ack_gen = htons(state->peer_gen);
+}
+
+void tipc_mon_get_state(struct net *net, u32 addr,
+ struct tipc_mon_state *state,
+ int bearer_id)
+{
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_peer *peer;
+
+ /* Used cached state if table has not changed */
+ if (!state->probing &&
+ (state->list_gen == mon->list_gen) &&
+ (state->acked_gen == mon->dom_gen))
+ return;
+
+ read_lock_bh(&mon->lock);
+ peer = get_peer(mon, addr);
+ if (peer) {
+ state->probing = state->acked_gen != mon->dom_gen;
+ state->probing |= peer->down_cnt;
+ state->reset |= peer->down_cnt >= MAX_PEER_DOWN_EVENTS;
+ state->monitoring = peer->is_local;
+ state->monitoring |= peer->is_head;
+ state->list_gen = mon->list_gen;
+ }
+ read_unlock_bh(&mon->lock);
+}
+
+static void mon_timeout(unsigned long m)
+{
+ struct tipc_monitor *mon = (void *)m;
+ struct tipc_peer *self;
+ int best_member_cnt = dom_size(mon->peer_cnt) - 1;
+
+ write_lock_bh(&mon->lock);
+ self = mon->self;
+ if (self && (best_member_cnt != self->applied)) {
+ mon_update_local_domain(mon);
+ mon_assign_roles(mon, self);
+ }
+ write_unlock_bh(&mon->lock);
+ mod_timer(&mon->timer, jiffies + mon->timer_intv);
+}
+
+int tipc_mon_create(struct net *net, int bearer_id)
+{
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_monitor *mon;
+ struct tipc_peer *self;
+ struct tipc_mon_domain *dom;
+
+ if (tn->monitors[bearer_id])
+ return 0;
+
+ mon = kzalloc(sizeof(*mon), GFP_ATOMIC);
+ self = kzalloc(sizeof(*self), GFP_ATOMIC);
+ dom = kzalloc(sizeof(*dom), GFP_ATOMIC);
+ if (!mon || !self || !dom) {
+ kfree(mon);
+ kfree(self);
+ kfree(dom);
+ return -ENOMEM;
+ }
+ tn->monitors[bearer_id] = mon;
+ rwlock_init(&mon->lock);
+ mon->net = net;
+ mon->peer_cnt = 1;
+ mon->self = self;
+ self->domain = dom;
+ self->addr = tipc_own_addr(net);
+ self->is_up = true;
+ self->is_head = true;
+ INIT_LIST_HEAD(&self->list);
+ setup_timer(&mon->timer, mon_timeout, (unsigned long)mon);
+ mon->timer_intv = msecs_to_jiffies(MON_TIMEOUT + (tn->random & 0xffff));
+ mod_timer(&mon->timer, jiffies + mon->timer_intv);
+ return 0;
+}
+
+void tipc_mon_delete(struct net *net, int bearer_id)
+{
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
+ struct tipc_peer *self = get_self(net, bearer_id);
+ struct tipc_peer *peer, *tmp;
+
+ write_lock_bh(&mon->lock);
+ tn->monitors[bearer_id] = NULL;
+ list_for_each_entry_safe(peer, tmp, &self->list, list) {
+ list_del(&peer->list);
+ hlist_del(&peer->hash);
+ kfree(peer->domain);
+ kfree(peer);
+ }
+ mon->self = NULL;
+ write_unlock_bh(&mon->lock);
+ del_timer_sync(&mon->timer);
+ kfree(self->domain);
+ kfree(self);
+ kfree(mon);
+}
diff --git a/net/tipc/monitor.h b/net/tipc/monitor.h
new file mode 100644
index 0000000..598459c
--- /dev/null
+++ b/net/tipc/monitor.h
@@ -0,0 +1,73 @@
+/*
+ * net/tipc/monitor.h
+ *
+ * Copyright (c) 2015, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_MONITOR_H
+#define _TIPC_MONITOR_H
+
+/* struct tipc_mon_state: link instance's cache of monitor list and domain state
+ * @list_gen: current generation of this node's monitor list
+ * @gen: current generation of this node's local domain
+ * @peer_gen: most recent domain generation received from peer
+ * @acked_gen: most recent generation of self's domain acked by peer
+ * @monitoring: this peer endpoint should continuously monitored
+ * @probing: peer endpoint should be temporarily probed for potential loss
+ * @synched: domain record's generation has been synched with peer after reset
+ */
+struct tipc_mon_state {
+ u16 list_gen;
+ u16 peer_gen;
+ u16 acked_gen;
+ bool monitoring :1;
+ bool probing :1;
+ bool reset :1;
+ bool synched :1;
+};
+
+int tipc_mon_create(struct net *net, int bearer_id);
+void tipc_mon_delete(struct net *net, int bearer_id);
+
+void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id);
+void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id);
+void tipc_mon_prep(struct net *net, void *data, int *dlen,
+ struct tipc_mon_state *state, int bearer_id);
+void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr,
+ struct tipc_mon_state *state, int bearer_id);
+void tipc_mon_get_state(struct net *net, u32 addr,
+ struct tipc_mon_state *state,
+ int bearer_id);
+void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id);
+
+extern const int tipc_max_domain_size;
+#endif
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 8740930..17201aa 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -41,6 +41,8 @@
#include "name_table.h"
#define MAX_FORWARD_SIZE 1024
+#define BUF_HEADROOM (LL_MAX_HEADER + 48)
+#define BUF_TAILROOM 16
static unsigned int align(unsigned int i)
{
@@ -505,6 +507,10 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
msg_set_hdr_sz(hdr, BASIC_H_SIZE);
}
+ if (skb_cloned(_skb) &&
+ pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_KERNEL))
+ goto exit;
+
/* Now reverse the concerned fields */
msg_set_errcode(hdr, err);
msg_set_origport(hdr, msg_destport(&ohdr));
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 024da8a..7cf52fb 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -94,17 +94,6 @@ struct plist;
#define TIPC_MEDIA_INFO_OFFSET 5
-/**
- * TIPC message buffer code
- *
- * TIPC message buffer headroom reserves space for the worst-case
- * link-level device header (in case the message is sent off-node).
- *
- * Note: Headroom should be a multiple of 4 to ensure the TIPC header fields
- * are word aligned for quicker access
- */
-#define BUF_HEADROOM (LL_MAX_HEADER + 48)
-
struct tipc_skb_cb {
void *handle;
struct sk_buff *tail;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index d6a490f..a3fc0a3 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -40,6 +40,7 @@
#include "name_distr.h"
#include "socket.h"
#include "bcast.h"
+#include "monitor.h"
#include "discover.h"
#include "netlink.h"
@@ -205,17 +206,6 @@ u16 tipc_node_get_capabilities(struct net *net, u32 addr)
return caps;
}
-/*
- * A trivial power-of-two bitmask technique is used for speed, since this
- * operation is done for every incoming TIPC packet. The number of hash table
- * entries has been chosen so that no hash chain exceeds 8 nodes and will
- * usually be much smaller (typically only a single node).
- */
-static unsigned int tipc_hashfn(u32 addr)
-{
- return addr & (NODE_HTABLE_SIZE - 1);
-}
-
static void tipc_node_kref_release(struct kref *kref)
{
struct tipc_node *n = container_of(kref, struct tipc_node, kref);
@@ -279,6 +269,7 @@ static void tipc_node_write_unlock(struct tipc_node *n)
u32 addr = 0;
u32 flags = n->action_flags;
u32 link_id = 0;
+ u32 bearer_id;
struct list_head *publ_list;
if (likely(!flags)) {
@@ -288,6 +279,7 @@ static void tipc_node_write_unlock(struct tipc_node *n)
addr = n->addr;
link_id = n->link_id;
+ bearer_id = link_id & 0xffff;
publ_list = &n->publ_list;
n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
@@ -301,13 +293,16 @@ static void tipc_node_write_unlock(struct tipc_node *n)
if (flags & TIPC_NOTIFY_NODE_UP)
tipc_named_node_up(net, addr);
- if (flags & TIPC_NOTIFY_LINK_UP)
+ if (flags & TIPC_NOTIFY_LINK_UP) {
+ tipc_mon_peer_up(net, addr, bearer_id);
tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,
TIPC_NODE_SCOPE, link_id, addr);
-
- if (flags & TIPC_NOTIFY_LINK_DOWN)
+ }
+ if (flags & TIPC_NOTIFY_LINK_DOWN) {
+ tipc_mon_peer_down(net, addr, bearer_id);
tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
link_id, addr);
+ }
}
struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
@@ -691,6 +686,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
struct tipc_link *l = le->link;
struct tipc_media_addr *maddr;
struct sk_buff_head xmitq;
+ int old_bearer_id = bearer_id;
if (!l)
return;
@@ -710,6 +706,8 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
tipc_link_fsm_evt(l, LINK_RESET_EVT);
}
tipc_node_write_unlock(n);
+ if (delete)
+ tipc_mon_remove_peer(n->net, n->addr, old_bearer_id);
tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr);
tipc_sk_rcv(n->net, &le->inputq);
}
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 272d20a..215849c 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -418,13 +418,12 @@ static struct outqueue_entry *tipc_alloc_entry(void *data, int len)
if (!entry)
return NULL;
- buf = kmalloc(len, GFP_ATOMIC);
+ buf = kmemdup(data, len, GFP_ATOMIC);
if (!buf) {
kfree(entry);
return NULL;
}
- memcpy(buf, data, len);
entry->iov.iov_base = buf;
entry->iov.iov_len = len;
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 88bfcd7..c49b8df 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -796,9 +796,11 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
* @tsk: receiving socket
* @skb: pointer to message buffer.
*/
-static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb)
+static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
{
struct sock *sk = &tsk->sk;
+ u32 onode = tsk_own_node(tsk);
struct tipc_msg *hdr = buf_msg(skb);
int mtyp = msg_type(hdr);
bool conn_cong;
@@ -811,7 +813,8 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb)
if (mtyp == CONN_PROBE) {
msg_set_type(hdr, CONN_PROBE_REPLY);
- tipc_sk_respond(sk, skb, TIPC_OK);
+ if (tipc_msg_reverse(onode, &skb, TIPC_OK))
+ __skb_queue_tail(xmitq, skb);
return;
} else if (mtyp == CONN_ACK) {
conn_cong = tsk_conn_cong(tsk);
@@ -1686,7 +1689,8 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb)
*
* Returns true if message was added to socket receive queue, otherwise false
*/
-static bool filter_rcv(struct sock *sk, struct sk_buff *skb)
+static bool filter_rcv(struct sock *sk, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
{
struct socket *sock = sk->sk_socket;
struct tipc_sock *tsk = tipc_sk(sk);
@@ -1696,7 +1700,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb)
int usr = msg_user(hdr);
if (unlikely(msg_user(hdr) == CONN_MANAGER)) {
- tipc_sk_proto_rcv(tsk, skb);
+ tipc_sk_proto_rcv(tsk, skb, xmitq);
return false;
}
@@ -1739,7 +1743,8 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb)
return true;
reject:
- tipc_sk_respond(sk, skb, err);
+ if (tipc_msg_reverse(tsk_own_node(tsk), &skb, err))
+ __skb_queue_tail(xmitq, skb);
return false;
}
@@ -1755,9 +1760,24 @@ reject:
static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
unsigned int truesize = skb->truesize;
+ struct sk_buff_head xmitq;
+ u32 dnode, selector;
- if (likely(filter_rcv(sk, skb)))
+ __skb_queue_head_init(&xmitq);
+
+ if (likely(filter_rcv(sk, skb, &xmitq))) {
atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt);
+ return 0;
+ }
+
+ if (skb_queue_empty(&xmitq))
+ return 0;
+
+ /* Send response/rejected message */
+ skb = __skb_dequeue(&xmitq);
+ dnode = msg_destnode(buf_msg(skb));
+ selector = msg_origport(buf_msg(skb));
+ tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector);
return 0;
}
@@ -1771,12 +1791,13 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
* Caller must hold socket lock
*/
static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
- u32 dport)
+ u32 dport, struct sk_buff_head *xmitq)
{
+ unsigned long time_limit = jiffies + 2;
+ struct sk_buff *skb;
unsigned int lim;
atomic_t *dcnt;
- struct sk_buff *skb;
- unsigned long time_limit = jiffies + 2;
+ u32 onode;
while (skb_queue_len(inputq)) {
if (unlikely(time_after_eq(jiffies, time_limit)))
@@ -1788,7 +1809,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
/* Add message directly to receive queue if possible */
if (!sock_owned_by_user(sk)) {
- filter_rcv(sk, skb);
+ filter_rcv(sk, skb, xmitq);
continue;
}
@@ -1801,7 +1822,9 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
continue;
/* Overload => reject message back to sender */
- tipc_sk_respond(sk, skb, TIPC_ERR_OVERLOAD);
+ onode = tipc_own_addr(sock_net(sk));
+ if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD))
+ __skb_queue_tail(xmitq, skb);
break;
}
}
@@ -1814,12 +1837,14 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
*/
void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
{
+ struct sk_buff_head xmitq;
u32 dnode, dport = 0;
int err;
struct tipc_sock *tsk;
struct sock *sk;
struct sk_buff *skb;
+ __skb_queue_head_init(&xmitq);
while (skb_queue_len(inputq)) {
dport = tipc_skb_peek_port(inputq, dport);
tsk = tipc_sk_lookup(net, dport);
@@ -1827,9 +1852,14 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
if (likely(tsk)) {
sk = &tsk->sk;
if (likely(spin_trylock_bh(&sk->sk_lock.slock))) {
- tipc_sk_enqueue(inputq, sk, dport);
+ tipc_sk_enqueue(inputq, sk, dport, &xmitq);
spin_unlock_bh(&sk->sk_lock.slock);
}
+ /* Send pending response/rejected messages, if any */
+ while ((skb = __skb_dequeue(&xmitq))) {
+ dnode = msg_destnode(buf_msg(skb));
+ tipc_node_xmit_skb(net, skb, dnode, dport);
+ }
sock_put(sk);
continue;
}
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index c9cf2be..b016c01 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -63,7 +63,7 @@
*/
struct udp_media_addr {
__be16 proto;
- __be16 udp_port;
+ __be16 port;
union {
struct in_addr ipv4;
struct in6_addr ipv6;
@@ -108,9 +108,9 @@ static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size)
struct udp_media_addr *ua = (struct udp_media_addr *)&a->value;
if (ntohs(ua->proto) == ETH_P_IP)
- snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->udp_port));
+ snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->port));
else if (ntohs(ua->proto) == ETH_P_IPV6)
- snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->udp_port));
+ snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->port));
else
pr_err("Invalid UDP media address\n");
return 0;
@@ -178,8 +178,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
skb->dev = rt->dst.dev;
ttl = ip4_dst_hoplimit(&rt->dst);
udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr,
- dst->ipv4.s_addr, 0, ttl, 0, src->udp_port,
- dst->udp_port, false, true);
+ dst->ipv4.s_addr, 0, ttl, 0, src->port,
+ dst->port, false, true);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct dst_entry *ndst;
@@ -196,8 +196,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
ttl = ip6_dst_hoplimit(ndst);
err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb,
ndst->dev, &src->ipv6,
- &dst->ipv6, 0, ttl, 0, src->udp_port,
- dst->udp_port, false);
+ &dst->ipv6, 0, ttl, 0, src->port,
+ dst->port, false);
#endif
}
return err;
@@ -292,12 +292,12 @@ err:
ip4 = (struct sockaddr_in *)&sa_local;
local->proto = htons(ETH_P_IP);
- local->udp_port = ip4->sin_port;
+ local->port = ip4->sin_port;
local->ipv4.s_addr = ip4->sin_addr.s_addr;
ip4 = (struct sockaddr_in *)&sa_remote;
remote->proto = htons(ETH_P_IP);
- remote->udp_port = ip4->sin_port;
+ remote->port = ip4->sin_port;
remote->ipv4.s_addr = ip4->sin_addr.s_addr;
return 0;
@@ -312,13 +312,13 @@ err:
return -EINVAL;
local->proto = htons(ETH_P_IPV6);
- local->udp_port = ip6->sin6_port;
+ local->port = ip6->sin6_port;
memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
ub->ifindex = ip6->sin6_scope_id;
ip6 = (struct sockaddr_in6 *)&sa_remote;
remote->proto = htons(ETH_P_IPV6);
- remote->udp_port = ip6->sin6_port;
+ remote->port = ip6->sin6_port;
memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
return 0;
#endif
@@ -386,7 +386,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
err = -EAFNOSUPPORT;
goto err;
}
- udp_conf.local_udp_port = local.udp_port;
+ udp_conf.local_udp_port = local.port;
err = udp_sock_create(net, &udp_conf, &ub->ubsock);
if (err)
goto err;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 80aa6a3..735362c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -315,7 +315,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
&unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
struct dentry *dentry = unix_sk(s)->path.dentry;
- if (dentry && d_backing_inode(dentry) == i) {
+ if (dentry && d_real_inode(dentry) == i) {
sock_hold(s);
goto found;
}
@@ -911,7 +911,7 @@ static struct sock *unix_find_other(struct net *net,
err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
if (err)
goto fail;
- inode = d_backing_inode(path.dentry);
+ inode = d_real_inode(path.dentry);
err = inode_permission(inode, MAY_WRITE);
if (err)
goto put_fail;
@@ -1048,7 +1048,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out_up;
}
addr->hash = UNIX_HASH_SIZE;
- hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
+ hash = d_real_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
spin_lock(&unix_table_lock);
u->path = u_path;
list = &unix_socket_table[hash];
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index b5f1221..b96ac91 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -61,6 +61,14 @@
* function will also cleanup rejected sockets, those that reach the connected
* state but leave it before they have been accepted.
*
+ * - Lock ordering for pending or accept queue sockets is:
+ *
+ * lock_sock(listener);
+ * lock_sock_nested(pending, SINGLE_DEPTH_NESTING);
+ *
+ * Using explicit nested locking keeps lockdep happy since normally only one
+ * lock of a given class may be taken at a time.
+ *
* - Sockets created by user action will be cleaned up when the user process
* calls close(2), causing our release implementation to be called. Our release
* implementation will perform some cleanup then drop the last reference so our
@@ -443,7 +451,7 @@ void vsock_pending_work(struct work_struct *work)
cleanup = true;
lock_sock(listener);
- lock_sock(sk);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
if (vsock_is_pending(sk)) {
vsock_remove_pending(listener, sk);
@@ -1292,7 +1300,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
if (connected) {
listener->sk_ack_backlog--;
- lock_sock(connected);
+ lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
vconnected = vsock_sk(connected);
/* If the listener socket has received an error, then we should
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 4e809e9..2443ee3 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -509,7 +509,7 @@ static int __ieee80211_data_to_8023(struct sk_buff *skb, struct ethhdr *ehdr,
* replace EtherType */
hdrlen += ETH_ALEN + 2;
else
- tmp.h_proto = htons(skb->len);
+ tmp.h_proto = htons(skb->len - hdrlen);
pskb_pull(skb, hdrlen);