summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/iphc.c2
-rw-r--r--net/Kconfig7
-rw-r--r--net/atm/br2684.c9
-rw-r--r--net/bluetooth/6lowpan.c23
-rw-r--r--net/bluetooth/Kconfig5
-rw-r--r--net/bluetooth/Makefile3
-rw-r--r--net/bluetooth/a2mp.c17
-rw-r--r--net/bluetooth/a2mp.h19
-rw-r--r--net/bluetooth/amp.c134
-rw-r--r--net/bluetooth/amp.h14
-rw-r--r--net/bluetooth/cmtp/capi.c8
-rw-r--r--net/bluetooth/hci_core.c11
-rw-r--r--net/bluetooth/hci_event.c157
-rw-r--r--net/bluetooth/l2cap_sock.c41
-rw-r--r--net/bluetooth/mgmt.c23
-rw-r--r--net/bridge/br_device.c1
-rw-r--r--net/bridge/br_if.c1
-rw-r--r--net/bridge/br_mdb.c144
-rw-r--r--net/bridge/br_multicast.c44
-rw-r--r--net/bridge/br_netfilter_hooks.c20
-rw-r--r--net/bridge/br_netfilter_ipv6.c2
-rw-r--r--net/bridge/br_netlink.c16
-rw-r--r--net/bridge/br_private.h16
-rw-r--r--net/bridge/br_vlan.c18
-rw-r--r--net/core/Makefile1
-rw-r--r--net/core/dev.c40
-rw-r--r--net/core/dst.c107
-rw-r--r--net/core/fib_rules.c24
-rw-r--r--net/core/filter.c149
-rw-r--r--net/core/flow_dissector.c58
-rw-r--r--net/core/lwtunnel.c243
-rw-r--r--net/core/neighbour.c14
-rw-r--r--net/core/net-sysfs.c25
-rw-r--r--net/core/pktgen.c5
-rw-r--r--net/core/rtnetlink.c42
-rw-r--r--net/core/timestamping.c6
-rw-r--r--net/dsa/dsa.c12
-rw-r--r--net/dsa/dsa_priv.h8
-rw-r--r--net/dsa/slave.c244
-rw-r--r--net/dsa/tag_brcm.c15
-rw-r--r--net/dsa/tag_dsa.c12
-rw-r--r--net/dsa/tag_edsa.c12
-rw-r--r--net/dsa/tag_trailer.c12
-rw-r--r--net/ethernet/eth.c2
-rw-r--r--net/ieee802154/rdev-ops.h20
-rw-r--r--net/ieee802154/sysfs.c38
-rw-r--r--net/ieee802154/trace.h22
-rw-r--r--net/ipv4/af_inet.c3
-rw-r--r--net/ipv4/arp.c65
-rw-r--r--net/ipv4/datagram.c2
-rw-r--r--net/ipv4/fib_frontend.c10
-rw-r--r--net/ipv4/fib_semantics.c99
-rw-r--r--net/ipv4/gre_demux.c235
-rw-r--r--net/ipv4/icmp.c1
-rw-r--r--net/ipv4/inet_hashtables.c38
-rw-r--r--net/ipv4/inet_timewait_sock.c55
-rw-r--r--net/ipv4/ip_fragment.c26
-rw-r--r--net/ipv4/ip_gre.c446
-rw-r--r--net/ipv4/ip_input.c3
-rw-r--r--net/ipv4/ip_tunnel.c37
-rw-r--r--net/ipv4/ip_tunnel_core.c121
-rw-r--r--net/ipv4/ipconfig.c2
-rw-r--r--net/ipv4/ipip.c2
-rw-r--r--net/ipv4/netfilter/arp_tables.c32
-rw-r--r--net/ipv4/netfilter/ip_tables.c68
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c7
-rw-r--r--net/ipv4/ping.c3
-rw-r--r--net/ipv4/proc.c2
-rw-r--r--net/ipv4/route.c28
-rw-r--r--net/ipv4/tcp_bic.c2
-rw-r--r--net/ipv4/tcp_cdg.c2
-rw-r--r--net/ipv4/tcp_cong.c6
-rw-r--r--net/ipv4/tcp_cubic.c4
-rw-r--r--net/ipv4/tcp_highspeed.c2
-rw-r--r--net/ipv4/tcp_htcp.c2
-rw-r--r--net/ipv4/tcp_hybla.c2
-rw-r--r--net/ipv4/tcp_illinois.c2
-rw-r--r--net/ipv4/tcp_input.c64
-rw-r--r--net/ipv4/tcp_ipv4.c7
-rw-r--r--net/ipv4/tcp_metrics.c2
-rw-r--r--net/ipv4/tcp_minisocks.c6
-rw-r--r--net/ipv4/tcp_output.c8
-rw-r--r--net/ipv4/tcp_scalable.c2
-rw-r--r--net/ipv4/tcp_timer.c1
-rw-r--r--net/ipv4/tcp_vegas.c6
-rw-r--r--net/ipv4/tcp_veno.c2
-rw-r--r--net/ipv6/Kconfig11
-rw-r--r--net/ipv6/addrconf.c233
-rw-r--r--net/ipv6/addrconf_core.c11
-rw-r--r--net/ipv6/af_inet6.c12
-rw-r--r--net/ipv6/datagram.c10
-rw-r--r--net/ipv6/exthdrs.c2
-rw-r--r--net/ipv6/icmp.c6
-rw-r--r--net/ipv6/inet6_hashtables.c9
-rw-r--r--net/ipv6/ip6_fib.c2
-rw-r--r--net/ipv6/ip6_gre.c5
-rw-r--r--net/ipv6/ip6_input.c5
-rw-r--r--net/ipv6/ip6_output.c18
-rw-r--r--net/ipv6/ip6_tunnel.c2
-rw-r--r--net/ipv6/ndisc.c16
-rw-r--r--net/ipv6/netfilter/ip6_tables.c52
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c5
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c7
-rw-r--r--net/ipv6/raw.c3
-rw-r--r--net/ipv6/route.c77
-rw-r--r--net/ipv6/sit.c2
-rw-r--r--net/ipv6/sysctl_net_ipv6.c15
-rw-r--r--net/ipv6/tcp_ipv6.c7
-rw-r--r--net/mac802154/cfg.c54
-rw-r--r--net/mac802154/ieee802154_i.h11
-rw-r--r--net/mac802154/iface.c14
-rw-r--r--net/mac802154/main.c10
-rw-r--r--net/mac802154/rx.c14
-rw-r--r--net/mac802154/tx.c27
-rw-r--r--net/mac802154/util.c8
-rw-r--r--net/mpls/Kconfig8
-rw-r--r--net/mpls/Makefile1
-rw-r--r--net/mpls/af_mpls.c197
-rw-r--r--net/mpls/internal.h9
-rw-r--r--net/mpls/mpls_iptunnel.c233
-rw-r--r--net/netfilter/core.c225
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c2
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c101
-rw-r--r--net/netfilter/nf_internals.h2
-rw-r--r--net/netfilter/nf_queue.c12
-rw-r--r--net/netfilter/nf_tables_api.c8
-rw-r--r--net/netfilter/nf_tables_core.c5
-rw-r--r--net/netfilter/nft_meta.c4
-rw-r--r--net/netfilter/x_tables.c29
-rw-r--r--net/netfilter/xt_TEE.c15
-rw-r--r--net/netfilter/xt_TPROXY.c6
-rw-r--r--net/openvswitch/Kconfig2
-rw-r--r--net/openvswitch/Makefile2
-rw-r--r--net/openvswitch/actions.c17
-rw-r--r--net/openvswitch/datapath.c19
-rw-r--r--net/openvswitch/datapath.h5
-rw-r--r--net/openvswitch/dp_notify.c5
-rw-r--r--net/openvswitch/flow.c4
-rw-r--r--net/openvswitch/flow.h79
-rw-r--r--net/openvswitch/flow_netlink.c84
-rw-r--r--net/openvswitch/flow_netlink.h3
-rw-r--r--net/openvswitch/flow_table.c4
-rw-r--r--net/openvswitch/vport-geneve.c17
-rw-r--r--net/openvswitch/vport-gre.c239
-rw-r--r--net/openvswitch/vport-internal_dev.c38
-rw-r--r--net/openvswitch/vport-netdev.c117
-rw-r--r--net/openvswitch/vport-netdev.h16
-rw-r--r--net/openvswitch/vport-vxlan.c224
-rw-r--r--net/openvswitch/vport-vxlan.h11
-rw-r--r--net/openvswitch/vport.c34
-rw-r--r--net/openvswitch/vport.h21
-rw-r--r--net/packet/af_packet.c6
-rw-r--r--net/rds/bind.c3
-rw-r--r--net/rds/connection.c16
-rw-r--r--net/rds/ib.c2
-rw-r--r--net/rds/ib_cm.c5
-rw-r--r--net/rds/iw.c2
-rw-r--r--net/rds/iw_cm.c5
-rw-r--r--net/rds/rds.h23
-rw-r--r--net/rds/send.c3
-rw-r--r--net/rds/tcp.c165
-rw-r--r--net/rds/tcp.h7
-rw-r--r--net/rds/tcp_connect.c9
-rw-r--r--net/rds/tcp_listen.c40
-rw-r--r--net/rds/transport.c4
-rw-r--r--net/sched/act_api.c44
-rw-r--r--net/sched/act_bpf.c53
-rw-r--r--net/sched/act_connmark.c3
-rw-r--r--net/sched/act_csum.c3
-rw-r--r--net/sched/act_gact.c44
-rw-r--r--net/sched/act_ipt.c2
-rw-r--r--net/sched/act_mirred.c58
-rw-r--r--net/sched/act_nat.c3
-rw-r--r--net/sched/act_pedit.c3
-rw-r--r--net/sched/act_simple.c3
-rw-r--r--net/sched/act_skbedit.c3
-rw-r--r--net/sched/act_vlan.c3
-rw-r--r--net/sched/cls_cgroup.c23
-rw-r--r--net/sched/sch_qfq.c1
-rw-r--r--net/sctp/protocol.c42
-rw-r--r--net/sctp/sm_statefuns.c2
-rw-r--r--net/switchdev/switchdev.c113
-rw-r--r--net/tipc/bcast.c31
-rw-r--r--net/tipc/bcast.h1
-rw-r--r--net/tipc/bearer.c30
-rw-r--r--net/tipc/bearer.h3
-rw-r--r--net/tipc/core.h10
-rw-r--r--net/tipc/discover.c130
-rw-r--r--net/tipc/link.c2030
-rw-r--r--net/tipc/link.h109
-rw-r--r--net/tipc/msg.c86
-rw-r--r--net/tipc/msg.h112
-rw-r--r--net/tipc/name_distr.c6
-rw-r--r--net/tipc/node.c964
-rw-r--r--net/tipc/node.h84
-rw-r--r--net/tipc/socket.c385
-rw-r--r--net/tipc/socket.h2
-rw-r--r--net/tipc/udp_media.c3
-rw-r--r--net/wimax/op-rfkill.c3
-rw-r--r--net/xfrm/xfrm_user.c2
200 files changed, 6447 insertions, 3934 deletions
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 94a375c..9055d7b 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -613,6 +613,8 @@ EXPORT_SYMBOL_GPL(lowpan_header_compress);
static int __init lowpan_module_init(void)
{
+ request_module_nowait("ipv6");
+
request_module_nowait("nhc_dest");
request_module_nowait("nhc_fragment");
request_module_nowait("nhc_hop");
diff --git a/net/Kconfig b/net/Kconfig
index 57a7c5a..7021c1b 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -374,6 +374,13 @@ source "net/caif/Kconfig"
source "net/ceph/Kconfig"
source "net/nfc/Kconfig"
+config LWTUNNEL
+ bool "Network light weight tunnels"
+ ---help---
+ This feature provides an infrastructure to support light weight
+ tunnels like mpls. There is no netdevice associated with a light
+ weight tunnel endpoint. Tunnel encapsulation parameters are stored
+ with light weight tunnel state associated with fib routes.
endif # if NET
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index cc78538..aa0047c 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -802,13 +802,10 @@ static int br2684_seq_show(struct seq_file *seq, void *v)
(brdev->payload == p_bridged) ? "bridged" : "routed",
brvcc->copies_failed, brvcc->copies_needed);
#ifdef CONFIG_ATM_BR2684_IPFILTER
-#define b1(var, byte) ((u8 *) &brvcc->filter.var)[byte]
-#define bs(var) b1(var, 0), b1(var, 1), b1(var, 2), b1(var, 3)
if (brvcc->filter.netmask != 0)
- seq_printf(seq, " filter=%d.%d.%d.%d/"
- "%d.%d.%d.%d\n", bs(prefix), bs(netmask));
-#undef bs
-#undef b1
+ seq_printf(seq, " filter=%pI4/%pI4\n",
+ &brvcc->filter.prefix,
+ &brvcc->filter.netmask);
#endif /* CONFIG_ATM_BR2684_IPFILTER */
}
return 0;
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 2fb7b30..0ffe2e2 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -859,9 +859,22 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev)
SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev);
SET_NETDEV_DEVTYPE(netdev, &bt_type);
+ *dev = netdev_priv(netdev);
+ (*dev)->netdev = netdev;
+ (*dev)->hdev = chan->conn->hcon->hdev;
+ INIT_LIST_HEAD(&(*dev)->peers);
+
+ spin_lock(&devices_lock);
+ INIT_LIST_HEAD(&(*dev)->list);
+ list_add_rcu(&(*dev)->list, &bt_6lowpan_devices);
+ spin_unlock(&devices_lock);
+
err = register_netdev(netdev);
if (err < 0) {
BT_INFO("register_netdev failed %d", err);
+ spin_lock(&devices_lock);
+ list_del_rcu(&(*dev)->list);
+ spin_unlock(&devices_lock);
free_netdev(netdev);
goto out;
}
@@ -871,16 +884,6 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev)
&chan->src, chan->src_type);
set_bit(__LINK_STATE_PRESENT, &netdev->state);
- *dev = netdev_priv(netdev);
- (*dev)->netdev = netdev;
- (*dev)->hdev = chan->conn->hcon->hdev;
- INIT_LIST_HEAD(&(*dev)->peers);
-
- spin_lock(&devices_lock);
- INIT_LIST_HEAD(&(*dev)->list);
- list_add_rcu(&(*dev)->list, &bt_6lowpan_devices);
- spin_unlock(&devices_lock);
-
return 0;
out:
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
index b8c794b..95d1a66 100644
--- a/net/bluetooth/Kconfig
+++ b/net/bluetooth/Kconfig
@@ -53,6 +53,11 @@ source "net/bluetooth/cmtp/Kconfig"
source "net/bluetooth/hidp/Kconfig"
+config BT_HS
+ bool "Bluetooth High Speed (HS) features"
+ depends on BT_BREDR
+ default y
+
config BT_LE
bool "Bluetooth Low Energy (LE) features"
depends on BT
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index 29c12ae..2b15ae8 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -13,9 +13,10 @@ bluetooth_6lowpan-y := 6lowpan.o
bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \
hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \
- a2mp.o amp.o ecc.o hci_request.o mgmt_util.o
+ ecc.o hci_request.o mgmt_util.o
bluetooth-$(CONFIG_BT_BREDR) += sco.o
+bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o
bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
index 5a04eb1..5f123c3 100644
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -16,6 +16,7 @@
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
+#include "hci_request.h"
#include "a2mp.h"
#include "amp.h"
@@ -286,11 +287,21 @@ static int a2mp_change_notify(struct amp_mgr *mgr, struct sk_buff *skb,
return 0;
}
+static void read_local_amp_info_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ a2mp_send_getinfo_rsp(hdev);
+}
+
static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb,
struct a2mp_cmd *hdr)
{
struct a2mp_info_req *req = (void *) skb->data;
struct hci_dev *hdev;
+ struct hci_request hreq;
+ int err = 0;
if (le16_to_cpu(hdr->len) < sizeof(*req))
return -EINVAL;
@@ -311,7 +322,11 @@ static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb,
}
set_bit(READ_LOC_AMP_INFO, &mgr->state);
- hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL);
+ hci_req_init(&hreq, hdev);
+ hci_req_add(&hreq, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL);
+ err = hci_req_run(&hreq, read_local_amp_info_complete);
+ if (err < 0)
+ a2mp_send_getinfo_rsp(hdev);
done:
if (hdev)
diff --git a/net/bluetooth/a2mp.h b/net/bluetooth/a2mp.h
index 296f665..a4ff3ea 100644
--- a/net/bluetooth/a2mp.h
+++ b/net/bluetooth/a2mp.h
@@ -130,10 +130,29 @@ struct a2mp_physlink_rsp {
#define A2MP_STATUS_SECURITY_VIOLATION 0x06
struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr);
+
+#if IS_ENABLED(CONFIG_BT_HS)
int amp_mgr_put(struct amp_mgr *mgr);
struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn,
struct sk_buff *skb);
void a2mp_discover_amp(struct l2cap_chan *chan);
+#else
+static inline int amp_mgr_put(struct amp_mgr *mgr)
+{
+ return 0;
+}
+
+static inline struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn,
+ struct sk_buff *skb)
+{
+ return NULL;
+}
+
+static inline void a2mp_discover_amp(struct l2cap_chan *chan)
+{
+}
+#endif
+
void a2mp_send_getinfo_rsp(struct hci_dev *hdev);
void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status);
void a2mp_send_create_phy_link_req(struct hci_dev *hdev, u8 status);
diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c
index ee016f0..238ddd3 100644
--- a/net/bluetooth/amp.c
+++ b/net/bluetooth/amp.c
@@ -16,6 +16,7 @@
#include <net/bluetooth/hci_core.h>
#include <crypto/hash.h>
+#include "hci_request.h"
#include "a2mp.h"
#include "amp.h"
@@ -220,10 +221,49 @@ int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type)
return hmac_sha256(gamp_key, HCI_AMP_LINK_KEY_SIZE, "802b", 4, data);
}
+static void read_local_amp_assoc_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode, struct sk_buff *skb)
+{
+ struct hci_rp_read_local_amp_assoc *rp = (void *)skb->data;
+ struct amp_assoc *assoc = &hdev->loc_assoc;
+ size_t rem_len, frag_len;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ goto send_rsp;
+
+ frag_len = skb->len - sizeof(*rp);
+ rem_len = __le16_to_cpu(rp->rem_len);
+
+ if (rem_len > frag_len) {
+ BT_DBG("frag_len %zu rem_len %zu", frag_len, rem_len);
+
+ memcpy(assoc->data + assoc->offset, rp->frag, frag_len);
+ assoc->offset += frag_len;
+
+ /* Read other fragments */
+ amp_read_loc_assoc_frag(hdev, rp->phy_handle);
+
+ return;
+ }
+
+ memcpy(assoc->data + assoc->offset, rp->frag, rem_len);
+ assoc->len = assoc->offset + rem_len;
+ assoc->offset = 0;
+
+send_rsp:
+ /* Send A2MP Rsp when all fragments are received */
+ a2mp_send_getampassoc_rsp(hdev, rp->status);
+ a2mp_send_create_phy_link_req(hdev, rp->status);
+}
+
void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle)
{
struct hci_cp_read_local_amp_assoc cp;
struct amp_assoc *loc_assoc = &hdev->loc_assoc;
+ struct hci_request req;
+ int err = 0;
BT_DBG("%s handle %d", hdev->name, phy_handle);
@@ -231,12 +271,18 @@ void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle)
cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
cp.len_so_far = cpu_to_le16(loc_assoc->offset);
- hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+ err = hci_req_run_skb(&req, read_local_amp_assoc_complete);
+ if (err < 0)
+ a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID);
}
void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr)
{
struct hci_cp_read_local_amp_assoc cp;
+ struct hci_request req;
+ int err = 0;
memset(&hdev->loc_assoc, 0, sizeof(struct amp_assoc));
memset(&cp, 0, sizeof(cp));
@@ -244,7 +290,11 @@ void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr)
cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
set_bit(READ_LOC_AMP_ASSOC, &mgr->state);
- hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+ hci_req_run_skb(&req, read_local_amp_assoc_complete);
+ if (err < 0)
+ a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID);
}
void amp_read_loc_assoc_final_data(struct hci_dev *hdev,
@@ -252,6 +302,8 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev,
{
struct hci_cp_read_local_amp_assoc cp;
struct amp_mgr *mgr = hcon->amp_mgr;
+ struct hci_request req;
+ int err = 0;
cp.phy_handle = hcon->handle;
cp.len_so_far = cpu_to_le16(0);
@@ -260,7 +312,25 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev,
set_bit(READ_LOC_AMP_ASSOC_FINAL, &mgr->state);
/* Read Local AMP Assoc final link information data */
- hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+ hci_req_run_skb(&req, read_local_amp_assoc_complete);
+ if (err < 0)
+ a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID);
+}
+
+static void write_remote_amp_assoc_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode, struct sk_buff *skb)
+{
+ struct hci_rp_write_remote_amp_assoc *rp = (void *)skb->data;
+
+ BT_DBG("%s status 0x%2.2x phy_handle 0x%2.2x",
+ hdev->name, rp->status, rp->phy_handle);
+
+ if (rp->status)
+ return;
+
+ amp_write_rem_assoc_continue(hdev, rp->phy_handle);
}
/* Write AMP Assoc data fragments, returns true with last fragment written*/
@@ -270,6 +340,7 @@ static bool amp_write_rem_assoc_frag(struct hci_dev *hdev,
struct hci_cp_write_remote_amp_assoc *cp;
struct amp_mgr *mgr = hcon->amp_mgr;
struct amp_ctrl *ctrl;
+ struct hci_request req;
u16 frag_len, len;
ctrl = amp_ctrl_lookup(mgr, hcon->remote_id);
@@ -307,7 +378,9 @@ static bool amp_write_rem_assoc_frag(struct hci_dev *hdev,
amp_ctrl_put(ctrl);
- hci_send_cmd(hdev, HCI_OP_WRITE_REMOTE_AMP_ASSOC, len, cp);
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_WRITE_REMOTE_AMP_ASSOC, sizeof(cp), &cp);
+ hci_req_run_skb(&req, write_remote_amp_assoc_complete);
kfree(cp);
@@ -344,10 +417,37 @@ void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle)
amp_write_rem_assoc_frag(hdev, hcon);
}
+static void create_phylink_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct hci_cp_create_phy_link *cp;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_PHY_LINK);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (status) {
+ struct hci_conn *hcon;
+
+ hcon = hci_conn_hash_lookup_handle(hdev, cp->phy_handle);
+ if (hcon)
+ hci_conn_del(hcon);
+ } else {
+ amp_write_remote_assoc(hdev, cp->phy_handle);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
struct hci_conn *hcon)
{
struct hci_cp_create_phy_link cp;
+ struct hci_request req;
cp.phy_handle = hcon->handle;
@@ -360,13 +460,33 @@ void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
return;
}
- hci_send_cmd(hdev, HCI_OP_CREATE_PHY_LINK, sizeof(cp), &cp);
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_CREATE_PHY_LINK, sizeof(cp), &cp);
+ hci_req_run(&req, create_phylink_complete);
+}
+
+static void accept_phylink_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct hci_cp_accept_phy_link *cp;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_ACCEPT_PHY_LINK);
+ if (!cp)
+ return;
+
+ amp_write_remote_assoc(hdev, cp->phy_handle);
}
void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
struct hci_conn *hcon)
{
struct hci_cp_accept_phy_link cp;
+ struct hci_request req;
cp.phy_handle = hcon->handle;
@@ -379,7 +499,9 @@ void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
return;
}
- hci_send_cmd(hdev, HCI_OP_ACCEPT_PHY_LINK, sizeof(cp), &cp);
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_ACCEPT_PHY_LINK, sizeof(cp), &cp);
+ hci_req_run(&req, accept_phylink_complete);
}
void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon)
diff --git a/net/bluetooth/amp.h b/net/bluetooth/amp.h
index 7ea3db7..8848f81 100644
--- a/net/bluetooth/amp.h
+++ b/net/bluetooth/amp.h
@@ -44,6 +44,20 @@ void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
struct hci_conn *hcon);
void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
struct hci_conn *hcon);
+
+#if IS_ENABLED(CONFIG_BT_HS)
+void amp_create_logical_link(struct l2cap_chan *chan);
+void amp_disconnect_logical_link(struct hci_chan *hchan);
+#else
+static inline void amp_create_logical_link(struct l2cap_chan *chan)
+{
+}
+
+static inline void amp_disconnect_logical_link(struct hci_chan *hchan)
+{
+}
+#endif
+
void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle);
void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle);
void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon);
diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c
index b0c6c6a..9a503387 100644
--- a/net/bluetooth/cmtp/capi.c
+++ b/net/bluetooth/cmtp/capi.c
@@ -100,9 +100,9 @@ static void cmtp_application_del(struct cmtp_session *session, struct cmtp_appli
static struct cmtp_application *cmtp_application_get(struct cmtp_session *session, int pattern, __u16 value)
{
struct cmtp_application *app;
- struct list_head *p, *n;
+ struct list_head *p;
- list_for_each_safe(p, n, &session->applications) {
+ list_for_each(p, &session->applications) {
app = list_entry(p, struct cmtp_application, list);
switch (pattern) {
case CMTP_MSGNUM:
@@ -511,13 +511,13 @@ static int cmtp_proc_show(struct seq_file *m, void *v)
struct capi_ctr *ctrl = m->private;
struct cmtp_session *session = ctrl->driverdata;
struct cmtp_application *app;
- struct list_head *p, *n;
+ struct list_head *p;
seq_printf(m, "%s\n\n", cmtp_procinfo(ctrl));
seq_printf(m, "addr %s\n", session->name);
seq_printf(m, "ctrl %d\n", session->num);
- list_for_each_safe(p, n, &session->applications) {
+ list_for_each(p, &session->applications) {
app = list_entry(p, struct cmtp_application, list);
seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping);
}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 2f8fb33..bc43b64 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2822,10 +2822,6 @@ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev,
{
struct hci_conn_params *params;
- /* The conn params list only contains identity addresses */
- if (!hci_is_identity_address(addr, addr_type))
- return NULL;
-
list_for_each_entry(params, &hdev->le_conn_params, list) {
if (bacmp(&params->addr, addr) == 0 &&
params->addr_type == addr_type) {
@@ -2842,10 +2838,6 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list,
{
struct hci_conn_params *param;
- /* The list only contains identity addresses */
- if (!hci_is_identity_address(addr, addr_type))
- return NULL;
-
list_for_each_entry(param, list, action) {
if (bacmp(&param->addr, addr) == 0 &&
param->addr_type == addr_type)
@@ -2861,9 +2853,6 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev,
{
struct hci_conn_params *params;
- if (!hci_is_identity_address(addr, addr_type))
- return NULL;
-
params = hci_conn_params_lookup(hdev, addr, addr_type);
if (params)
return params;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 32363c2..218d7df 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -823,7 +823,7 @@ static void hci_cc_read_local_amp_info(struct hci_dev *hdev,
BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
if (rp->status)
- goto a2mp_rsp;
+ return;
hdev->amp_status = rp->amp_status;
hdev->amp_total_bw = __le32_to_cpu(rp->total_bw);
@@ -835,46 +835,6 @@ static void hci_cc_read_local_amp_info(struct hci_dev *hdev,
hdev->amp_assoc_size = __le16_to_cpu(rp->max_assoc_size);
hdev->amp_be_flush_to = __le32_to_cpu(rp->be_flush_to);
hdev->amp_max_flush_to = __le32_to_cpu(rp->max_flush_to);
-
-a2mp_rsp:
- a2mp_send_getinfo_rsp(hdev);
-}
-
-static void hci_cc_read_local_amp_assoc(struct hci_dev *hdev,
- struct sk_buff *skb)
-{
- struct hci_rp_read_local_amp_assoc *rp = (void *) skb->data;
- struct amp_assoc *assoc = &hdev->loc_assoc;
- size_t rem_len, frag_len;
-
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
-
- if (rp->status)
- goto a2mp_rsp;
-
- frag_len = skb->len - sizeof(*rp);
- rem_len = __le16_to_cpu(rp->rem_len);
-
- if (rem_len > frag_len) {
- BT_DBG("frag_len %zu rem_len %zu", frag_len, rem_len);
-
- memcpy(assoc->data + assoc->offset, rp->frag, frag_len);
- assoc->offset += frag_len;
-
- /* Read other fragments */
- amp_read_loc_assoc_frag(hdev, rp->phy_handle);
-
- return;
- }
-
- memcpy(assoc->data + assoc->offset, rp->frag, rem_len);
- assoc->len = assoc->offset + rem_len;
- assoc->offset = 0;
-
-a2mp_rsp:
- /* Send A2MP Rsp when all fragments are received */
- a2mp_send_getampassoc_rsp(hdev, rp->status);
- a2mp_send_create_phy_link_req(hdev, rp->status);
}
static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev,
@@ -1409,20 +1369,6 @@ static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
-static void hci_cc_write_remote_amp_assoc(struct hci_dev *hdev,
- struct sk_buff *skb)
-{
- struct hci_rp_write_remote_amp_assoc *rp = (void *) skb->data;
-
- BT_DBG("%s status 0x%2.2x phy_handle 0x%2.2x",
- hdev->name, rp->status, rp->phy_handle);
-
- if (rp->status)
- return;
-
- amp_write_rem_assoc_continue(hdev, rp->phy_handle);
-}
-
static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_rp_read_rssi *rp = (void *) skb->data;
@@ -1944,47 +1890,6 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
hci_dev_unlock(hdev);
}
-static void hci_cs_create_phylink(struct hci_dev *hdev, u8 status)
-{
- struct hci_cp_create_phy_link *cp;
-
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
- cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_PHY_LINK);
- if (!cp)
- return;
-
- hci_dev_lock(hdev);
-
- if (status) {
- struct hci_conn *hcon;
-
- hcon = hci_conn_hash_lookup_handle(hdev, cp->phy_handle);
- if (hcon)
- hci_conn_del(hcon);
- } else {
- amp_write_remote_assoc(hdev, cp->phy_handle);
- }
-
- hci_dev_unlock(hdev);
-}
-
-static void hci_cs_accept_phylink(struct hci_dev *hdev, u8 status)
-{
- struct hci_cp_accept_phy_link *cp;
-
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
- if (status)
- return;
-
- cp = hci_sent_cmd_data(hdev, HCI_OP_ACCEPT_PHY_LINK);
- if (!cp)
- return;
-
- amp_write_remote_assoc(hdev, cp->phy_handle);
-}
-
static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
{
struct hci_cp_le_create_conn *cp;
@@ -2998,10 +2903,6 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cc_read_clock(hdev, skb);
break;
- case HCI_OP_READ_LOCAL_AMP_ASSOC:
- hci_cc_read_local_amp_assoc(hdev, skb);
- break;
-
case HCI_OP_READ_INQ_RSP_TX_POWER:
hci_cc_read_inq_rsp_tx_power(hdev, skb);
break;
@@ -3106,10 +3007,6 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cc_set_adv_param(hdev, skb);
break;
- case HCI_OP_WRITE_REMOTE_AMP_ASSOC:
- hci_cc_write_remote_amp_assoc(hdev, skb);
- break;
-
case HCI_OP_READ_RSSI:
hci_cc_read_rssi(hdev, skb);
break;
@@ -3193,14 +3090,6 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cs_setup_sync_conn(hdev, ev->status);
break;
- case HCI_OP_CREATE_PHY_LINK:
- hci_cs_create_phylink(hdev, ev->status);
- break;
-
- case HCI_OP_ACCEPT_PHY_LINK:
- hci_cs_accept_phylink(hdev, ev->status);
- break;
-
case HCI_OP_SNIFF_MODE:
hci_cs_sniff_mode(hdev, ev->status);
break;
@@ -4399,6 +4288,23 @@ unlock:
hci_dev_unlock(hdev);
}
+#if IS_ENABLED(CONFIG_BT_HS)
+static void hci_chan_selected_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_channel_selected *ev = (void *)skb->data;
+ struct hci_conn *hcon;
+
+ BT_DBG("%s handle 0x%2.2x", hdev->name, ev->phy_handle);
+
+ skb_pull(skb, sizeof(*ev));
+
+ hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
+ if (!hcon)
+ return;
+
+ amp_read_loc_assoc_final_data(hdev, hcon);
+}
+
static void hci_phy_link_complete_evt(struct hci_dev *hdev,
struct sk_buff *skb)
{
@@ -4522,6 +4428,7 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev,
hci_dev_unlock(hdev);
}
+#endif
static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
@@ -5206,22 +5113,6 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb)
}
}
-static void hci_chan_selected_evt(struct hci_dev *hdev, struct sk_buff *skb)
-{
- struct hci_ev_channel_selected *ev = (void *) skb->data;
- struct hci_conn *hcon;
-
- BT_DBG("%s handle 0x%2.2x", hdev->name, ev->phy_handle);
-
- skb_pull(skb, sizeof(*ev));
-
- hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
- if (!hcon)
- return;
-
- amp_read_loc_assoc_final_data(hdev, hcon);
-}
-
static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode,
u8 event, struct sk_buff *skb)
{
@@ -5442,14 +5333,15 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
hci_le_meta_evt(hdev, skb);
break;
- case HCI_EV_CHANNEL_SELECTED:
- hci_chan_selected_evt(hdev, skb);
- break;
-
case HCI_EV_REMOTE_OOB_DATA_REQUEST:
hci_remote_oob_data_request_evt(hdev, skb);
break;
+#if IS_ENABLED(CONFIG_BT_HS)
+ case HCI_EV_CHANNEL_SELECTED:
+ hci_chan_selected_evt(hdev, skb);
+ break;
+
case HCI_EV_PHY_LINK_COMPLETE:
hci_phy_link_complete_evt(hdev, skb);
break;
@@ -5465,6 +5357,7 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
case HCI_EV_DISCONN_PHY_LINK_COMPLETE:
hci_disconn_phylink_complete_evt(hdev, skb);
break;
+#endif
case HCI_EV_NUM_COMP_BLOCKS:
hci_num_comp_blocks_evt(hdev, skb);
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 2442877..586b3d5 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1054,18 +1054,23 @@ static void l2cap_sock_kill(struct sock *sk)
sock_put(sk);
}
-static int __l2cap_wait_ack(struct sock *sk)
+static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan)
{
- struct l2cap_chan *chan = l2cap_pi(sk)->chan;
DECLARE_WAITQUEUE(wait, current);
int err = 0;
- int timeo = HZ/5;
+ int timeo = L2CAP_WAIT_ACK_POLL_PERIOD;
+ /* Timeout to prevent infinite loop */
+ unsigned long timeout = jiffies + L2CAP_WAIT_ACK_TIMEOUT;
add_wait_queue(sk_sleep(sk), &wait);
set_current_state(TASK_INTERRUPTIBLE);
- while (chan->unacked_frames > 0 && chan->conn) {
+ do {
+ BT_DBG("Waiting for %d ACKs, timeout %04d ms",
+ chan->unacked_frames, time_after(jiffies, timeout) ? 0 :
+ jiffies_to_msecs(timeout - jiffies));
+
if (!timeo)
- timeo = HZ/5;
+ timeo = L2CAP_WAIT_ACK_POLL_PERIOD;
if (signal_pending(current)) {
err = sock_intr_errno(timeo);
@@ -1080,7 +1085,15 @@ static int __l2cap_wait_ack(struct sock *sk)
err = sock_error(sk);
if (err)
break;
- }
+
+ if (time_after(jiffies, timeout)) {
+ err = -ENOLINK;
+ break;
+ }
+
+ } while (chan->unacked_frames > 0 &&
+ chan->state == BT_CONNECTED);
+
set_current_state(TASK_RUNNING);
remove_wait_queue(sk_sleep(sk), &wait);
return err;
@@ -1098,7 +1111,12 @@ static int l2cap_sock_shutdown(struct socket *sock, int how)
if (!sk)
return 0;
+ /* prevent sk structure from being freed whilst unlocked */
+ sock_hold(sk);
+
chan = l2cap_pi(sk)->chan;
+ /* prevent chan structure from being freed whilst unlocked */
+ l2cap_chan_hold(chan);
conn = chan->conn;
BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
@@ -1110,8 +1128,10 @@ static int l2cap_sock_shutdown(struct socket *sock, int how)
lock_sock(sk);
if (!sk->sk_shutdown) {
- if (chan->mode == L2CAP_MODE_ERTM)
- err = __l2cap_wait_ack(sk);
+ if (chan->mode == L2CAP_MODE_ERTM &&
+ chan->unacked_frames > 0 &&
+ chan->state == BT_CONNECTED)
+ err = __l2cap_wait_ack(sk, chan);
sk->sk_shutdown = SHUTDOWN_MASK;
@@ -1134,6 +1154,11 @@ static int l2cap_sock_shutdown(struct socket *sock, int how)
if (conn)
mutex_unlock(&conn->chan_lock);
+ l2cap_chan_put(chan);
+ sock_put(sk);
+
+ BT_DBG("err: %d", err);
+
return err;
}
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 92720f3..e435438 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -6226,6 +6226,17 @@ static int add_device(struct sock *sk, struct hci_dev *hdev,
else
auto_conn = HCI_AUTO_CONN_REPORT;
+ /* Kernel internally uses conn_params with resolvable private
+ * address, but Add Device allows only identity addresses.
+ * Make sure it is enforced before calling
+ * hci_conn_params_lookup.
+ */
+ if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) {
+ err = cmd->cmd_complete(cmd, MGMT_STATUS_INVALID_PARAMS);
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
+
/* If the connection parameters don't exist for this device,
* they will be created and configured with defaults.
*/
@@ -6340,6 +6351,18 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
else
addr_type = ADDR_LE_DEV_RANDOM;
+ /* Kernel internally uses conn_params with resolvable private
+ * address, but Remove Device allows only identity addresses.
+ * Make sure it is enforced before calling
+ * hci_conn_params_lookup.
+ */
+ if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) {
+ err = cmd->cmd_complete(cmd,
+ MGMT_STATUS_INVALID_PARAMS);
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
+
params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
addr_type);
if (!params) {
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 4ff77a1..0aa8f5c 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -339,6 +339,7 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_bridge_getlink = br_getlink,
.ndo_bridge_setlink = br_setlink,
.ndo_bridge_dellink = br_dellink,
+ .ndo_features_check = passthru_features_check,
};
static void br_dev_free(struct net_device *dev)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index a538cb1..45e4757 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -281,6 +281,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
br_fdb_delete_by_port(br, NULL, 0, 1);
br_vlan_flush(br);
+ br_multicast_dev_del(br);
del_timer_sync(&br->gc_timer);
br_sysfs_delbr(br->dev);
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index c943219..d747275 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -85,6 +85,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
memset(&e, 0, sizeof(e));
e.ifindex = port->dev->ifindex;
e.state = p->state;
+ e.vid = p->addr.vid;
if (p->addr.proto == htons(ETH_P_IP))
e.addr.u.ip4 = p->addr.u.ip4;
#if IS_ENABLED(CONFIG_IPV6)
@@ -230,7 +231,7 @@ errout:
}
void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type)
+ struct br_ip *group, int type, u8 state)
{
struct br_mdb_entry entry;
@@ -241,9 +242,78 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
#if IS_ENABLED(CONFIG_IPV6)
entry.addr.u.ip6 = group->u.ip6;
#endif
+ entry.state = state;
+ entry.vid = group->vid;
__br_mdb_notify(dev, &entry, type);
}
+static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
+ struct net_device *dev,
+ int ifindex, u32 pid,
+ u32 seq, int type, unsigned int flags)
+{
+ struct br_port_msg *bpm;
+ struct nlmsghdr *nlh;
+ struct nlattr *nest;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ bpm = nlmsg_data(nlh);
+ memset(bpm, 0, sizeof(*bpm));
+ bpm->family = AF_BRIDGE;
+ bpm->ifindex = dev->ifindex;
+ nest = nla_nest_start(skb, MDBA_ROUTER);
+ if (!nest)
+ goto cancel;
+
+ if (nla_put_u32(skb, MDBA_ROUTER_PORT, ifindex))
+ goto end;
+
+ nla_nest_end(skb, nest);
+ nlmsg_end(skb, nlh);
+ return 0;
+
+end:
+ nla_nest_end(skb, nest);
+cancel:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t rtnl_rtr_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct br_port_msg))
+ + nla_total_size(sizeof(__u32));
+}
+
+void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
+ int type)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+ int ifindex;
+
+ ifindex = port ? port->dev->ifindex : 0;
+ skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = nlmsg_populate_rtr_fill(skb, dev, ifindex, 0, 0, type, NTF_SELF);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC);
+ return;
+
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_MDB, err);
+}
+
static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
{
if (entry->ifindex == 0)
@@ -263,6 +333,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
return false;
if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
return false;
+ if (entry->vid >= VLAN_VID_MASK)
+ return false;
return true;
}
@@ -374,6 +446,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
return -EINVAL;
memset(&ip, 0, sizeof(ip));
+ ip.vid = entry->vid;
ip.proto = entry->addr.proto;
if (ip.proto == htons(ETH_P_IP))
ip.u.ip4 = entry->addr.u.ip4;
@@ -391,8 +464,11 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
+ unsigned short vid = VLAN_N_VID;
+ struct net_device *dev, *pdev;
struct br_mdb_entry *entry;
- struct net_device *dev;
+ struct net_bridge_port *p;
+ struct net_port_vlans *pv;
struct net_bridge *br;
int err;
@@ -402,9 +478,32 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
br = netdev_priv(dev);
- err = __br_mdb_add(net, br, entry);
- if (!err)
- __br_mdb_notify(dev, entry, RTM_NEWMDB);
+ /* If vlan filtering is enabled and VLAN is not specified
+ * install mdb entry on all vlans configured on the port.
+ */
+ pdev = __dev_get_by_index(net, entry->ifindex);
+ if (!pdev)
+ return -ENODEV;
+
+ p = br_port_get_rtnl(pdev);
+ if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+ return -EINVAL;
+
+ pv = nbp_get_vlan_info(p);
+ if (br_vlan_enabled(br) && pv && entry->vid == 0) {
+ for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
+ entry->vid = vid;
+ err = __br_mdb_add(net, br, entry);
+ if (err)
+ break;
+ __br_mdb_notify(dev, entry, RTM_NEWMDB);
+ }
+ } else {
+ err = __br_mdb_add(net, br, entry);
+ if (!err)
+ __br_mdb_notify(dev, entry, RTM_NEWMDB);
+ }
+
return err;
}
@@ -421,6 +520,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
return -EINVAL;
memset(&ip, 0, sizeof(ip));
+ ip.vid = entry->vid;
ip.proto = entry->addr.proto;
if (ip.proto == htons(ETH_P_IP))
ip.u.ip4 = entry->addr.u.ip4;
@@ -465,8 +565,12 @@ unlock:
static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- struct net_device *dev;
+ struct net *net = sock_net(skb->sk);
+ unsigned short vid = VLAN_N_VID;
+ struct net_device *dev, *pdev;
struct br_mdb_entry *entry;
+ struct net_bridge_port *p;
+ struct net_port_vlans *pv;
struct net_bridge *br;
int err;
@@ -476,9 +580,31 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
br = netdev_priv(dev);
- err = __br_mdb_del(br, entry);
- if (!err)
- __br_mdb_notify(dev, entry, RTM_DELMDB);
+ /* If vlan filtering is enabled and VLAN is not specified
+ * delete mdb entry on all vlans configured on the port.
+ */
+ pdev = __dev_get_by_index(net, entry->ifindex);
+ if (!pdev)
+ return -ENODEV;
+
+ p = br_port_get_rtnl(pdev);
+ if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+ return -EINVAL;
+
+ pv = nbp_get_vlan_info(p);
+ if (br_vlan_enabled(br) && pv && entry->vid == 0) {
+ for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
+ entry->vid = vid;
+ err = __br_mdb_del(br, entry);
+ if (!err)
+ __br_mdb_notify(dev, entry, RTM_DELMDB);
+ }
+ } else {
+ err = __br_mdb_del(br, entry);
+ if (!err)
+ __br_mdb_notify(dev, entry, RTM_DELMDB);
+ }
+
return err;
}
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 0b39dcc..0752796 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -283,6 +283,8 @@ static void br_multicast_del_pg(struct net_bridge *br,
rcu_assign_pointer(*pp, p->next);
hlist_del_init(&p->mglist);
del_timer(&p->timer);
+ br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB,
+ p->state);
call_rcu_bh(&p->rcu, br_multicast_free_pg);
if (!mp->ports && !mp->mglist &&
@@ -704,7 +706,7 @@ static int br_multicast_add_group(struct net_bridge *br,
if (unlikely(!p))
goto err;
rcu_assign_pointer(*pp, p);
- br_mdb_notify(br->dev, port, group, RTM_NEWMDB);
+ br_mdb_notify(br->dev, port, group, RTM_NEWMDB, MDB_TEMPORARY);
found:
mod_timer(&p->timer, now + br->multicast_membership_interval);
@@ -764,6 +766,7 @@ static void br_multicast_router_expired(unsigned long data)
goto out;
hlist_del_init_rcu(&port->rlist);
+ br_rtr_notify(br->dev, port, RTM_DELMDB);
out:
spin_unlock(&br->multicast_lock);
@@ -924,6 +927,15 @@ void br_multicast_add_port(struct net_bridge_port *port)
void br_multicast_del_port(struct net_bridge_port *port)
{
+ struct net_bridge *br = port->br;
+ struct net_bridge_port_group *pg;
+ struct hlist_node *n;
+
+ /* Take care of the remaining groups, only perm ones should be left */
+ spin_lock_bh(&br->multicast_lock);
+ hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
+ br_multicast_del_pg(br, pg);
+ spin_unlock_bh(&br->multicast_lock);
del_timer_sync(&port->multicast_router_timer);
}
@@ -963,10 +975,13 @@ void br_multicast_disable_port(struct net_bridge_port *port)
spin_lock(&br->multicast_lock);
hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
- br_multicast_del_pg(br, pg);
+ if (pg->state == MDB_TEMPORARY)
+ br_multicast_del_pg(br, pg);
- if (!hlist_unhashed(&port->rlist))
+ if (!hlist_unhashed(&port->rlist)) {
hlist_del_init_rcu(&port->rlist);
+ br_rtr_notify(br->dev, port, RTM_DELMDB);
+ }
del_timer(&port->multicast_router_timer);
del_timer(&port->ip4_own_query.timer);
#if IS_ENABLED(CONFIG_IPV6)
@@ -1204,6 +1219,7 @@ static void br_multicast_add_router(struct net_bridge *br,
hlist_add_behind_rcu(&port->rlist, slot);
else
hlist_add_head_rcu(&port->rlist, &br->router_list);
+ br_rtr_notify(br->dev, port, RTM_NEWMDB);
}
static void br_multicast_mark_router(struct net_bridge *br,
@@ -1437,7 +1453,8 @@ br_multicast_leave_group(struct net_bridge *br,
hlist_del_init(&p->mglist);
del_timer(&p->timer);
call_rcu_bh(&p->rcu, br_multicast_free_pg);
- br_mdb_notify(br->dev, port, group, RTM_DELMDB);
+ br_mdb_notify(br->dev, port, group, RTM_DELMDB,
+ p->state);
if (!mp->ports && !mp->mglist &&
netif_running(br->dev))
@@ -1754,12 +1771,6 @@ void br_multicast_open(struct net_bridge *br)
void br_multicast_stop(struct net_bridge *br)
{
- struct net_bridge_mdb_htable *mdb;
- struct net_bridge_mdb_entry *mp;
- struct hlist_node *n;
- u32 ver;
- int i;
-
del_timer_sync(&br->multicast_router_timer);
del_timer_sync(&br->ip4_other_query.timer);
del_timer_sync(&br->ip4_own_query.timer);
@@ -1767,6 +1778,15 @@ void br_multicast_stop(struct net_bridge *br)
del_timer_sync(&br->ip6_other_query.timer);
del_timer_sync(&br->ip6_own_query.timer);
#endif
+}
+
+void br_multicast_dev_del(struct net_bridge *br)
+{
+ struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_mdb_entry *mp;
+ struct hlist_node *n;
+ u32 ver;
+ int i;
spin_lock_bh(&br->multicast_lock);
mdb = mlock_dereference(br->mdb, br);
@@ -1834,8 +1854,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
p->multicast_router = val;
err = 0;
- if (val < 2 && !hlist_unhashed(&p->rlist))
+ if (val < 2 && !hlist_unhashed(&p->rlist)) {
hlist_del_init_rcu(&p->rlist);
+ br_rtr_notify(br->dev, p, RTM_DELMDB);
+ }
if (val == 1)
break;
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index c8b9bcf..0a6f095 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -49,9 +49,9 @@ static struct ctl_table_header *brnf_sysctl_header;
static int brnf_call_iptables __read_mostly = 1;
static int brnf_call_ip6tables __read_mostly = 1;
static int brnf_call_arptables __read_mostly = 1;
-static int brnf_filter_vlan_tagged __read_mostly = 0;
-static int brnf_filter_pppoe_tagged __read_mostly = 0;
-static int brnf_pass_vlan_indev __read_mostly = 0;
+static int brnf_filter_vlan_tagged __read_mostly;
+static int brnf_filter_pppoe_tagged __read_mostly;
+static int brnf_pass_vlan_indev __read_mostly;
#else
#define brnf_call_iptables 1
#define brnf_call_ip6tables 1
@@ -284,7 +284,7 @@ int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb)
nf_bridge->neigh_header,
ETH_HLEN-ETH_ALEN);
/* tell br_dev_xmit to continue with forwarding */
- nf_bridge->mask |= BRNF_BRIDGED_DNAT;
+ nf_bridge->bridged_dnat = 1;
/* FIXME Need to refragment */
ret = neigh->output(neigh, skb);
}
@@ -356,7 +356,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb)
skb->pkt_type = PACKET_OTHERHOST;
nf_bridge->pkt_otherhost = false;
}
- nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING;
+ nf_bridge->in_prerouting = 0;
if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) {
if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -444,7 +444,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb)
nf_bridge->pkt_otherhost = true;
}
- nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING;
+ nf_bridge->in_prerouting = 1;
nf_bridge->physindev = skb->dev;
skb->dev = brnf_get_logical_dev(skb, skb->dev);
@@ -850,10 +850,8 @@ static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (skb->nf_bridge &&
- !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) {
+ if (skb->nf_bridge && !skb->nf_bridge->in_prerouting)
return NF_STOP;
- }
return NF_ACCEPT;
}
@@ -872,7 +870,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
skb_pull(skb, ETH_HLEN);
- nf_bridge->mask &= ~BRNF_BRIDGED_DNAT;
+ nf_bridge->bridged_dnat = 0;
BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN));
@@ -887,7 +885,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
static int br_nf_dev_xmit(struct sk_buff *skb)
{
- if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) {
+ if (skb->nf_bridge && skb->nf_bridge->bridged_dnat) {
br_nf_pre_routing_finish_bridge_slow(skb);
return 1;
}
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 13b7d1e..77383bf 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -174,7 +174,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb)
skb->pkt_type = PACKET_OTHERHOST;
nf_bridge->pkt_otherhost = false;
}
- nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING;
+ nf_bridge->in_prerouting = 0;
if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) {
skb_dst_drop(skb);
v6ops->route_input(skb);
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 4d74a06..0f2408f 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -166,8 +166,6 @@ static int br_fill_ifvlaninfo_range(struct sk_buff *skb, u16 vid_start,
sizeof(vinfo), &vinfo))
goto nla_put_failure;
- vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN;
-
vinfo.vid = vid_end;
vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_END;
if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
@@ -730,6 +728,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_AGEING_TIME] = { .type = NLA_U32 },
[IFLA_BR_STP_STATE] = { .type = NLA_U32 },
[IFLA_BR_PRIORITY] = { .type = NLA_U16 },
+ [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 },
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -777,6 +776,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
br_stp_set_bridge_priority(br, priority);
}
+ if (data[IFLA_BR_VLAN_FILTERING]) {
+ u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]);
+
+ err = __br_vlan_filter_toggle(br, vlan_filter);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -788,6 +795,7 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */
nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */
nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */
0;
}
@@ -800,13 +808,15 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
u32 ageing_time = jiffies_to_clock_t(br->ageing_time);
u32 stp_enabled = br->stp_enabled;
u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1];
+ u8 vlan_enabled = br_vlan_enabled(br);
if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) ||
nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) ||
nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) ||
nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) ||
nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) ||
- nla_put_u16(skb, IFLA_BR_PRIORITY, priority))
+ nla_put_u16(skb, IFLA_BR_PRIORITY, priority) ||
+ nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled))
return -EMSGSIZE;
return 0;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 8b21146..3d95647 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -466,6 +466,7 @@ void br_multicast_disable_port(struct net_bridge_port *port);
void br_multicast_init(struct net_bridge *br);
void br_multicast_open(struct net_bridge *br);
void br_multicast_stop(struct net_bridge *br);
+void br_multicast_dev_del(struct net_bridge *br);
void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
struct sk_buff *skb);
void br_multicast_forward(struct net_bridge_mdb_entry *mdst,
@@ -488,7 +489,9 @@ br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
void br_mdb_init(void);
void br_mdb_uninit(void);
void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type);
+ struct br_ip *group, int type, u8 state);
+void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
+ int type);
#define mlock_dereference(X, br) \
rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
@@ -565,6 +568,10 @@ static inline void br_multicast_stop(struct net_bridge *br)
{
}
+static inline void br_multicast_dev_del(struct net_bridge *br)
+{
+}
+
static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
struct sk_buff *skb)
{
@@ -607,6 +614,7 @@ int br_vlan_delete(struct net_bridge *br, u16 vid);
void br_vlan_flush(struct net_bridge *br);
bool br_vlan_find(struct net_bridge *br, u16 vid);
void br_recalculate_fwd_mask(struct net_bridge *br);
+int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val);
int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val);
int br_vlan_set_proto(struct net_bridge *br, unsigned long val);
int br_vlan_init(struct net_bridge *br);
@@ -764,6 +772,12 @@ static inline int br_vlan_enabled(struct net_bridge *br)
{
return 0;
}
+
+static inline int __br_vlan_filter_toggle(struct net_bridge *br,
+ unsigned long val)
+{
+ return -EOPNOTSUPP;
+}
#endif
struct nf_br_ops {
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 0d41f81..3cef689 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -468,21 +468,27 @@ void br_recalculate_fwd_mask(struct net_bridge *br)
~(1u << br->group_addr[5]);
}
-int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
+int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
{
- if (!rtnl_trylock())
- return restart_syscall();
-
if (br->vlan_enabled == val)
- goto unlock;
+ return 0;
br->vlan_enabled = val;
br_manage_promisc(br);
recalculate_group_addr(br);
br_recalculate_fwd_mask(br);
-unlock:
+ return 0;
+}
+
+int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
+{
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ __br_vlan_filter_toggle(br, val);
rtnl_unlock();
+
return 0;
}
diff --git a/net/core/Makefile b/net/core/Makefile
index fec0856..086b01f 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
+obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
diff --git a/net/core/dev.c b/net/core/dev.c
index a8e4dd4..4870c35 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3061,6 +3061,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
else
skb_dst_force(skb);
+#ifdef CONFIG_NET_SWITCHDEV
+ /* Don't forward if offload device already forwarded */
+ if (skb->offload_fwd_mark &&
+ skb->offload_fwd_mark == dev->offload_fwd_mark) {
+ consume_skb(skb);
+ rc = NET_XMIT_SUCCESS;
+ goto out;
+ }
+#endif
+
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
@@ -3645,7 +3655,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
qdisc_skb_cb(skb)->pkt_len = skb->len;
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
- qdisc_bstats_update_cpu(cl->q, skb);
+ qdisc_bstats_cpu_update(cl->q, skb);
switch (tc_classify(skb, cl, &cl_res)) {
case TC_ACT_OK:
@@ -3653,7 +3663,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
skb->tc_index = TC_H_MIN(cl_res.classid);
break;
case TC_ACT_SHOT:
- qdisc_qstats_drop_cpu(cl->q);
+ qdisc_qstats_cpu_drop(cl->q);
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
kfree_skb(skb);
@@ -4985,7 +4995,7 @@ EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
* Gets the next netdev_adjacent->private from the dev's lower neighbour
* list, starting from iter position. The caller must hold either hold the
* RTNL lock or its own locking that guarantees that the neighbour lower
- * list will remain unchainged.
+ * list will remain unchanged.
*/
void *netdev_lower_get_next_private(struct net_device *dev,
struct list_head **iter)
@@ -5040,7 +5050,7 @@ EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
* Gets the next netdev_adjacent from the dev's lower neighbour
* list, starting from iter position. The caller must hold RTNL lock or
* its own locking that guarantees that the neighbour lower
- * list will remain unchainged.
+ * list will remain unchanged.
*/
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
@@ -6075,6 +6085,26 @@ int dev_get_phys_port_name(struct net_device *dev,
EXPORT_SYMBOL(dev_get_phys_port_name);
/**
+ * dev_change_proto_down - update protocol port state information
+ * @dev: device
+ * @proto_down: new value
+ *
+ * This info can be used by switch drivers to set the phys state of the
+ * port.
+ */
+int dev_change_proto_down(struct net_device *dev, bool proto_down)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_change_proto_down)
+ return -EOPNOTSUPP;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_change_proto_down(dev, proto_down);
+}
+EXPORT_SYMBOL(dev_change_proto_down);
+
+/**
* dev_new_index - allocate an ifindex
* @net: the applicable net namespace
*
@@ -7639,7 +7669,7 @@ static int __init net_dev_init(void)
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
hotcpu_notifier(dev_cpu_callback, 0);
- dst_init();
+ dst_subsys_init();
rc = 0;
out:
return rc;
diff --git a/net/core/dst.c b/net/core/dst.c
index 002144be..f8694d1 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -22,6 +22,7 @@
#include <linux/prefetch.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
/*
* Theory of operations:
@@ -158,19 +159,10 @@ const u32 dst_default_metrics[RTAX_MAX + 1] = {
[RTAX_MAX] = 0xdeadbeef,
};
-
-void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
- int initial_ref, int initial_obsolete, unsigned short flags)
+void dst_init(struct dst_entry *dst, struct dst_ops *ops,
+ struct net_device *dev, int initial_ref, int initial_obsolete,
+ unsigned short flags)
{
- struct dst_entry *dst;
-
- if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
- if (ops->gc(ops))
- return NULL;
- }
- dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
- if (!dst)
- return NULL;
dst->child = NULL;
dst->dev = dev;
if (dev)
@@ -200,6 +192,25 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
dst->next = NULL;
if (!(flags & DST_NOCOUNT))
dst_entries_add(ops, 1);
+}
+EXPORT_SYMBOL(dst_init);
+
+void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
+ int initial_ref, int initial_obsolete, unsigned short flags)
+{
+ struct dst_entry *dst;
+
+ if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
+ if (ops->gc(ops))
+ return NULL;
+ }
+
+ dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
+ if (!dst)
+ return NULL;
+
+ dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);
+
return dst;
}
EXPORT_SYMBOL(dst_alloc);
@@ -248,7 +259,11 @@ again:
dst->ops->destroy(dst);
if (dst->dev)
dev_put(dst->dev);
- kmem_cache_free(dst->ops->kmem_cachep, dst);
+
+ if (dst->flags & DST_METADATA)
+ kfree(dst);
+ else
+ kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
if (dst) {
@@ -329,6 +344,70 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);
+static struct dst_ops md_dst_ops = {
+ .family = AF_UNSPEC,
+};
+
+static int dst_md_discard_sk(struct sock *sk, struct sk_buff *skb)
+{
+ WARN_ONCE(1, "Attempting to call output on metadata dst\n");
+ kfree_skb(skb);
+ return 0;
+}
+
+static int dst_md_discard(struct sk_buff *skb)
+{
+ WARN_ONCE(1, "Attempting to call input on metadata dst\n");
+ kfree_skb(skb);
+ return 0;
+}
+
+static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
+{
+ struct dst_entry *dst;
+
+ dst = &md_dst->dst;
+ dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
+ DST_METADATA | DST_NOCACHE | DST_NOCOUNT);
+
+ dst->input = dst_md_discard;
+ dst->output = dst_md_discard_sk;
+
+ memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
+ md_dst->opts_len = optslen;
+}
+
+struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
+{
+ struct metadata_dst *md_dst;
+
+ md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
+ if (!md_dst)
+ return NULL;
+
+ __metadata_dst_init(md_dst, optslen);
+
+ return md_dst;
+}
+EXPORT_SYMBOL_GPL(metadata_dst_alloc);
+
+struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
+{
+ int cpu;
+ struct metadata_dst __percpu *md_dst;
+
+ md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
+ __alignof__(struct metadata_dst), flags);
+ if (!md_dst)
+ return NULL;
+
+ for_each_possible_cpu(cpu)
+ __metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen);
+
+ return md_dst;
+}
+EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
+
/* Dirty hack. We did it in 2.2 (in __dst_free),
* we have _very_ good reasons not to repeat
* this mistake in 2.3, but we have no choice
@@ -393,7 +472,7 @@ static struct notifier_block dst_dev_notifier = {
.priority = -10, /* must be called after other network notifiers */
};
-void __init dst_init(void)
+void __init dst_subsys_init(void)
{
register_netdevice_notifier(&dst_dev_notifier);
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 9a12668..ae8306e 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -16,6 +16,7 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/fib_rules.h>
+#include <net/ip_tunnels.h>
int fib_default_rule_add(struct fib_rules_ops *ops,
u32 pref, u32 table, u32 flags)
@@ -186,6 +187,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
goto out;
+ if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id))
+ goto out;
+
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -330,6 +334,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
if (tb[FRA_FWMASK])
rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+ if (tb[FRA_TUN_ID])
+ rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+
rule->action = frh->action;
rule->flags = frh->flags;
rule->table = frh_get_table(frh, tb);
@@ -407,6 +414,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
if (unresolved)
ops->unresolved_rules++;
+ if (rule->tun_id)
+ ip_tunnel_need_metadata();
+
notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
flush_route_cache(ops);
rules_ops_put(ops);
@@ -473,6 +483,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
(rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
continue;
+ if (tb[FRA_TUN_ID] &&
+ (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -487,6 +501,9 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
goto errout;
}
+ if (rule->tun_id)
+ ip_tunnel_unneed_metadata();
+
list_del_rcu(&rule->list);
if (rule->action == FR_ACT_GOTO) {
@@ -535,7 +552,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
- + nla_total_size(4); /* FRA_FWMASK */
+ + nla_total_size(4) /* FRA_FWMASK */
+ + nla_total_size(8); /* FRA_TUN_ID */
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -591,7 +609,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
((rule->mark_mask || rule->mark) &&
nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) ||
(rule->target &&
- nla_put_u32(skb, FRA_GOTO, rule->target)))
+ nla_put_u32(skb, FRA_GOTO, rule->target)) ||
+ (rule->tun_id &&
+ nla_put_be64(skb, FRA_TUN_ID, rule->tun_id)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/filter.c b/net/core/filter.c
index be3098f..a50dbfa 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -47,6 +47,8 @@
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <net/sch_generic.h>
+#include <net/cls_cgroup.h>
+#include <net/dst_metadata.h>
/**
* sk_filter - run a packet through a socket filter
@@ -1424,6 +1426,136 @@ const struct bpf_func_proto bpf_clone_redirect_proto = {
.arg3_type = ARG_ANYTHING,
};
+static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return task_get_classid((struct sk_buff *) (unsigned long) r1);
+}
+
+static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
+ .func = bpf_get_cgroup_classid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ __be16 vlan_proto = (__force __be16) r2;
+
+ if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
+ vlan_proto != htons(ETH_P_8021AD)))
+ vlan_proto = htons(ETH_P_8021Q);
+
+ return skb_vlan_push(skb, vlan_proto, vlan_tci);
+}
+
+const struct bpf_func_proto bpf_skb_vlan_push_proto = {
+ .func = bpf_skb_vlan_push,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
+
+static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+
+ return skb_vlan_pop(skb);
+}
+
+const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
+ .func = bpf_skb_vlan_pop,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
+
+bool bpf_helper_changes_skb_data(void *func)
+{
+ if (func == bpf_skb_vlan_push)
+ return true;
+ if (func == bpf_skb_vlan_pop)
+ return true;
+ return false;
+}
+
+static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
+ struct ip_tunnel_info *info = skb_tunnel_info(skb, AF_INET);
+
+ if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info))
+ return -EINVAL;
+
+ to->tunnel_id = be64_to_cpu(info->key.tun_id);
+ to->remote_ipv4 = be32_to_cpu(info->key.ipv4_src);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
+ .func = bpf_skb_get_tunnel_key,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static struct metadata_dst __percpu *md_dst;
+
+static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
+ struct metadata_dst *md = this_cpu_ptr(md_dst);
+ struct ip_tunnel_info *info;
+
+ if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags))
+ return -EINVAL;
+
+ skb_dst_drop(skb);
+ dst_hold((struct dst_entry *) md);
+ skb_dst_set(skb, (struct dst_entry *) md);
+
+ info = &md->u.tun_info;
+ info->mode = IP_TUNNEL_INFO_TX;
+ info->key.tun_id = cpu_to_be64(from->tunnel_id);
+ info->key.ipv4_dst = cpu_to_be32(from->remote_ipv4);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
+ .func = bpf_skb_set_tunnel_key,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void)
+{
+ if (!md_dst) {
+ /* race is not possible, since it's called from
+ * verifier that is holding verifier mutex
+ */
+ md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL);
+ if (!md_dst)
+ return NULL;
+ }
+ return &bpf_skb_set_tunnel_key_proto;
+}
+
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
@@ -1461,6 +1593,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_l4_csum_replace_proto;
case BPF_FUNC_clone_redirect:
return &bpf_clone_redirect_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_skb_vlan_push:
+ return &bpf_skb_vlan_push_proto;
+ case BPF_FUNC_skb_vlan_pop:
+ return &bpf_skb_vlan_pop_proto;
+ case BPF_FUNC_skb_get_tunnel_key:
+ return &bpf_skb_get_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_key:
+ return bpf_get_skb_set_tunnel_key_proto();
default:
return sk_filter_func_proto(func_id);
}
@@ -1569,6 +1711,13 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
offsetof(struct net_device, ifindex));
break;
+ case offsetof(struct __sk_buff, hash):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, hash));
+ break;
+
case offsetof(struct __sk_buff, mark):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 2a834c6..11e6540 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -590,6 +590,15 @@ void make_flow_keys_digest(struct flow_keys_digest *digest,
}
EXPORT_SYMBOL(make_flow_keys_digest);
+static inline void __skb_set_sw_hash(struct sk_buff *skb, u32 hash,
+ struct flow_keys *keys)
+{
+ if (keys->ports.ports)
+ skb->l4_hash = 1;
+ skb->sw_hash = 1;
+ skb->hash = hash;
+}
+
/**
* __skb_get_hash: calculate a flow hash
* @skb: sk_buff to calculate flow hash from
@@ -609,10 +618,8 @@ void __skb_get_hash(struct sk_buff *skb)
hash = ___skb_get_hash(skb, &keys, hashrnd);
if (!hash)
return;
- if (keys.ports.ports)
- skb->l4_hash = 1;
- skb->sw_hash = 1;
- skb->hash = hash;
+
+ __skb_set_sw_hash(skb, hash, &keys);
}
EXPORT_SYMBOL(__skb_get_hash);
@@ -624,6 +631,49 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
}
EXPORT_SYMBOL(skb_get_hash_perturb);
+__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
+{
+ struct flow_keys keys;
+
+ memset(&keys, 0, sizeof(keys));
+
+ memcpy(&keys.addrs.v6addrs.src, &fl6->saddr,
+ sizeof(keys.addrs.v6addrs.src));
+ memcpy(&keys.addrs.v6addrs.dst, &fl6->daddr,
+ sizeof(keys.addrs.v6addrs.dst));
+ keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ keys.ports.src = fl6->fl6_sport;
+ keys.ports.dst = fl6->fl6_dport;
+ keys.keyid.keyid = fl6->fl6_gre_key;
+ keys.tags.flow_label = (__force u32)fl6->flowlabel;
+ keys.basic.ip_proto = fl6->flowi6_proto;
+
+ __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys);
+
+ return skb->hash;
+}
+EXPORT_SYMBOL(__skb_get_hash_flowi6);
+
+__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
+{
+ struct flow_keys keys;
+
+ memset(&keys, 0, sizeof(keys));
+
+ keys.addrs.v4addrs.src = fl4->saddr;
+ keys.addrs.v4addrs.dst = fl4->daddr;
+ keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ keys.ports.src = fl4->fl4_sport;
+ keys.ports.dst = fl4->fl4_dport;
+ keys.keyid.keyid = fl4->fl4_gre_key;
+ keys.basic.ip_proto = fl4->flowi4_proto;
+
+ __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys);
+
+ return skb->hash;
+}
+EXPORT_SYMBOL(__skb_get_hash_flowi4);
+
u32 __skb_get_poff(const struct sk_buff *skb, void *data,
const struct flow_keys *keys, int hlen)
{
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
new file mode 100644
index 0000000..5d6d8e3
--- /dev/null
+++ b/net/core/lwtunnel.c
@@ -0,0 +1,243 @@
+/*
+ * lwtunnel Infrastructure for light weight tunnels like mpls
+ *
+ * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/lwtunnel.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/err.h>
+
+#include <net/lwtunnel.h>
+#include <net/rtnetlink.h>
+#include <net/ip6_fib.h>
+
+struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
+{
+ struct lwtunnel_state *lws;
+
+ lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC);
+
+ return lws;
+}
+EXPORT_SYMBOL(lwtunnel_state_alloc);
+
+static const struct lwtunnel_encap_ops __rcu *
+ lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly;
+
+int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops,
+ unsigned int num)
+{
+ if (num > LWTUNNEL_ENCAP_MAX)
+ return -ERANGE;
+
+ return !cmpxchg((const struct lwtunnel_encap_ops **)
+ &lwtun_encaps[num],
+ NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(lwtunnel_encap_add_ops);
+
+int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
+ unsigned int encap_type)
+{
+ int ret;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX)
+ return -ERANGE;
+
+ ret = (cmpxchg((const struct lwtunnel_encap_ops **)
+ &lwtun_encaps[encap_type],
+ ops, NULL) == ops) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_encap_del_ops);
+
+int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+ struct nlattr *encap, struct lwtunnel_state **lws)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX)
+ return ret;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ if (likely(ops && ops->build_state))
+ ret = ops->build_state(dev, encap, lws);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_build_state);
+
+int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
+{
+ const struct lwtunnel_encap_ops *ops;
+ struct nlattr *nest;
+ int ret = -EINVAL;
+
+ if (!lwtstate)
+ return 0;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ nest = nla_nest_start(skb, RTA_ENCAP);
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->fill_encap))
+ ret = ops->fill_encap(skb, lwtstate);
+ rcu_read_unlock();
+
+ if (ret)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type);
+ if (ret)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+
+ return (ret == -EOPNOTSUPP ? 0 : ret);
+}
+EXPORT_SYMBOL(lwtunnel_fill_encap);
+
+int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = 0;
+
+ if (!lwtstate)
+ return 0;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->get_encap_size))
+ ret = nla_total_size(ops->get_encap_size(lwtstate));
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_get_encap_size);
+
+int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = 0;
+
+ if (!a && !b)
+ return 0;
+
+ if (!a || !b)
+ return 1;
+
+ if (a->type != b->type)
+ return 1;
+
+ if (a->type == LWTUNNEL_ENCAP_NONE ||
+ a->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[a->type]);
+ if (likely(ops && ops->cmp_encap))
+ ret = ops->cmp_encap(a, b);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_cmp_encap);
+
+int __lwtunnel_output(struct sock *sk, struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (!lwtstate)
+ goto drop;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->output))
+ ret = ops->output(sk, skb);
+ rcu_read_unlock();
+
+ if (ret == -EOPNOTSUPP)
+ goto drop;
+
+ return ret;
+
+drop:
+ kfree_skb(skb);
+
+ return ret;
+}
+
+int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
+{
+ struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+ struct lwtunnel_state *lwtstate = NULL;
+
+ if (rt) {
+ lwtstate = rt->rt6i_lwtstate;
+ skb->dev = rt->dst.dev;
+ }
+
+ skb->protocol = htons(ETH_P_IPV6);
+
+ return __lwtunnel_output(sk, skb, lwtstate);
+}
+EXPORT_SYMBOL(lwtunnel_output6);
+
+int lwtunnel_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct rtable *rt = (struct rtable *)skb_dst(skb);
+ struct lwtunnel_state *lwtstate = NULL;
+
+ if (rt) {
+ lwtstate = rt->rt_lwtstate;
+ skb->dev = rt->dst.dev;
+ }
+
+ skb->protocol = htons(ETH_P_IP);
+
+ return __lwtunnel_output(sk, skb, lwtstate);
+}
+EXPORT_SYMBOL(lwtunnel_output);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 84195da..2b515ba 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -274,8 +274,12 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
(entries >= tbl->gc_thresh2 &&
time_after(now, tbl->last_flush + 5 * HZ))) {
if (!neigh_forced_gc(tbl) &&
- entries >= tbl->gc_thresh3)
+ entries >= tbl->gc_thresh3) {
+ net_info_ratelimited("%s: neighbor table overflow!\n",
+ tbl->id);
+ NEIGH_CACHE_STAT_INC(tbl, table_fulls);
goto out_entries;
+ }
}
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
@@ -1849,6 +1853,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast;
ndst.ndts_periodic_gc_runs += st->periodic_gc_runs;
ndst.ndts_forced_gc_runs += st->forced_gc_runs;
+ ndst.ndts_table_fulls += st->table_fulls;
}
if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst))
@@ -2717,12 +2722,12 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v)
struct neigh_statistics *st = v;
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n");
+ seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n");
return 0;
}
seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
- "%08lx %08lx %08lx %08lx %08lx\n",
+ "%08lx %08lx %08lx %08lx %08lx %08lx\n",
atomic_read(&tbl->entries),
st->allocs,
@@ -2739,7 +2744,8 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v)
st->periodic_gc_runs,
st->forced_gc_runs,
- st->unres_discards
+ st->unres_discards,
+ st->table_fulls
);
return 0;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 18b34d7..39ec694 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -404,6 +404,19 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr,
NETDEVICE_SHOW(group, fmt_dec);
static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store);
+static int change_proto_down(struct net_device *dev, unsigned long proto_down)
+{
+ return dev_change_proto_down(dev, (bool) proto_down);
+}
+
+static ssize_t proto_down_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_proto_down);
+}
+NETDEVICE_SHOW_RW(proto_down, fmt_dec);
+
static ssize_t phys_port_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -501,6 +514,7 @@ static struct attribute *net_class_attrs[] = {
&dev_attr_phys_port_id.attr,
&dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr,
+ &dev_attr_proto_down.attr,
NULL,
};
ATTRIBUTE_GROUPS(net_class);
@@ -712,14 +726,17 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
old_map = rcu_dereference_protected(queue->rps_map,
lockdep_is_held(&rps_map_lock));
rcu_assign_pointer(queue->rps_map, map);
- spin_unlock(&rps_map_lock);
if (map)
static_key_slow_inc(&rps_needed);
- if (old_map) {
- kfree_rcu(old_map, rcu);
+ if (old_map)
static_key_slow_dec(&rps_needed);
- }
+
+ spin_unlock(&rps_map_lock);
+
+ if (old_map)
+ kfree_rcu(old_map, rcu);
+
free_cpumask_var(mask);
return len;
}
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 1cbd209..de8d5cc 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -273,7 +273,6 @@ struct pktgen_dev {
/* runtime counters relating to clone_skb */
- __u64 allocated_skbs;
__u32 clone_count;
int last_ok; /* Was last skb sent?
* Or a failed transmit of some sort?
@@ -2279,7 +2278,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
{
- pkt_dev->pkt_overhead = 0;
+ pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev);
pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32);
pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev);
pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev);
@@ -2788,6 +2787,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
} else {
skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
}
+ skb_reserve(skb, LL_RESERVED_SPACE(dev));
return skb;
}
@@ -3397,7 +3397,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
return;
}
pkt_dev->last_pkt_size = pkt_dev->skb->len;
- pkt_dev->allocated_skbs++;
pkt_dev->clone_count = 0; /* reset counter */
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dc004b1..788ceed 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -896,7 +896,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ rtnl_link_get_size(dev) /* IFLA_LINKINFO */
+ rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
- + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+ + nla_total_size(1); /* IFLA_PROTO_DOWN */
+
}
static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -1082,7 +1084,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
(dev->ifalias &&
nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
nla_put_u32(skb, IFLA_CARRIER_CHANGES,
- atomic_read(&dev->carrier_changes)))
+ atomic_read(&dev->carrier_changes)) ||
+ nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
goto nla_put_failure;
if (1) {
@@ -1319,6 +1322,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
[IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_LINK_NETNSID] = { .type = NLA_S32 },
+ [IFLA_PROTO_DOWN] = { .type = NLA_U8 },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1861,6 +1865,14 @@ static int do_setlink(const struct sk_buff *skb,
}
err = 0;
+ if (tb[IFLA_PROTO_DOWN]) {
+ err = dev_change_proto_down(dev,
+ nla_get_u8(tb[IFLA_PROTO_DOWN]));
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
errout:
if (status & DO_SETLINK_MODIFIED) {
if (status & DO_SETLINK_NOTIFY)
@@ -1951,16 +1963,30 @@ static int rtnl_group_dellink(const struct net *net, int group)
return 0;
}
+int rtnl_delete_link(struct net_device *dev)
+{
+ const struct rtnl_link_ops *ops;
+ LIST_HEAD(list_kill);
+
+ ops = dev->rtnl_link_ops;
+ if (!ops || !ops->dellink)
+ return -EOPNOTSUPP;
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rtnl_delete_link);
+
static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
- const struct rtnl_link_ops *ops;
struct net_device *dev;
struct ifinfomsg *ifm;
char ifname[IFNAMSIZ];
struct nlattr *tb[IFLA_MAX+1];
int err;
- LIST_HEAD(list_kill);
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
if (err < 0)
@@ -1982,13 +2008,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
if (!dev)
return -ENODEV;
- ops = dev->rtnl_link_ops;
- if (!ops || !ops->dellink)
- return -EOPNOTSUPP;
-
- ops->dellink(dev, &list_kill);
- unregister_netdevice_many(&list_kill);
- return 0;
+ return rtnl_delete_link(dev);
}
int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 43d3dd6..42689d5 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -60,11 +60,15 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
struct phy_device *phydev;
unsigned int type;
+ if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv)
+ return false;
+
if (skb_headroom(skb) < ETH_HLEN)
return false;
+
__skb_push(skb, ETH_HLEN);
- type = classify(skb);
+ type = ptp_classify_raw(skb);
__skb_pull(skb, ETH_HLEN);
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index b445d49..78d4ac9 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -574,7 +574,7 @@ static int dsa_of_probe(struct device *dev)
{
struct device_node *np = dev->of_node;
struct device_node *child, *mdio, *ethernet, *port, *link;
- struct mii_bus *mdio_bus;
+ struct mii_bus *mdio_bus, *mdio_bus_switch;
struct net_device *ethernet_dev;
struct dsa_platform_data *pd;
struct dsa_chip_data *cd;
@@ -636,6 +636,16 @@ static int dsa_of_probe(struct device *dev)
if (!of_property_read_u32(child, "eeprom-length", &eeprom_len))
cd->eeprom_len = eeprom_len;
+ mdio = of_parse_phandle(child, "mii-bus", 0);
+ if (mdio) {
+ mdio_bus_switch = of_mdio_find_bus(mdio);
+ if (!mdio_bus_switch) {
+ ret = -EPROBE_DEFER;
+ goto out_free_chip;
+ }
+ cd->host_dev = &mdio_bus_switch->dev;
+ }
+
for_each_available_child_of_node(child, port) {
port_reg = of_get_property(port, "reg", NULL);
if (!port_reg)
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index d5f1f9b..311796c8 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -13,9 +13,10 @@
#include <linux/phy.h>
#include <linux/netdevice.h>
+#include <linux/netpoll.h>
struct dsa_device_ops {
- netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev);
+ struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
int (*rcv)(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev);
};
@@ -26,7 +27,7 @@ struct dsa_slave_priv {
* switch port.
*/
struct net_device *dev;
- netdev_tx_t (*xmit)(struct sk_buff *skb,
+ struct sk_buff * (*xmit)(struct sk_buff *skb,
struct net_device *dev);
/*
@@ -47,6 +48,9 @@ struct dsa_slave_priv {
int old_duplex;
struct net_device *bridge_dev;
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ struct netpoll *netpoll;
+#endif
};
/* dsa.c */
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 35c47dd..aa0266f 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -18,6 +18,7 @@
#include <net/rtnetlink.h>
#include <net/switchdev.h>
#include <linux/if_bridge.h>
+#include <linux/netpoll.h>
#include "dsa_priv.h"
/* slave mii_bus handling ***************************************************/
@@ -199,103 +200,66 @@ out:
return 0;
}
-static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
- struct net_device *dev,
- const unsigned char *addr, u16 vid, u16 nlm_flags)
+static int dsa_slave_port_fdb_add(struct net_device *dev,
+ struct switchdev_obj *obj)
{
+ struct switchdev_obj_fdb *fdb = &obj->u.fdb;
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
int ret = -EOPNOTSUPP;
- if (ds->drv->fdb_add)
- ret = ds->drv->fdb_add(ds, p->port, addr, vid);
+ if (obj->trans == SWITCHDEV_TRANS_PREPARE)
+ ret = ds->drv->port_fdb_add ? 0 : -EOPNOTSUPP;
+ else if (obj->trans == SWITCHDEV_TRANS_COMMIT)
+ ret = ds->drv->port_fdb_add(ds, p->port, fdb->addr, fdb->vid);
return ret;
}
-static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
- struct net_device *dev,
- const unsigned char *addr, u16 vid)
+static int dsa_slave_port_fdb_del(struct net_device *dev,
+ struct switchdev_obj *obj)
{
+ struct switchdev_obj_fdb *fdb = &obj->u.fdb;
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
int ret = -EOPNOTSUPP;
- if (ds->drv->fdb_del)
- ret = ds->drv->fdb_del(ds, p->port, addr, vid);
+ if (ds->drv->port_fdb_del)
+ ret = ds->drv->port_fdb_del(ds, p->port, fdb->addr, fdb->vid);
return ret;
}
-static int dsa_slave_fill_info(struct net_device *dev, struct sk_buff *skb,
- const unsigned char *addr, u16 vid,
- bool is_static,
- u32 portid, u32 seq, int type,
- unsigned int flags)
-{
- struct nlmsghdr *nlh;
- struct ndmsg *ndm;
-
- nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
- if (!nlh)
- return -EMSGSIZE;
-
- ndm = nlmsg_data(nlh);
- ndm->ndm_family = AF_BRIDGE;
- ndm->ndm_pad1 = 0;
- ndm->ndm_pad2 = 0;
- ndm->ndm_flags = NTF_EXT_LEARNED;
- ndm->ndm_type = 0;
- ndm->ndm_ifindex = dev->ifindex;
- ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE;
-
- if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
- goto nla_put_failure;
-
- if (vid && nla_put_u16(skb, NDA_VLAN, vid))
- goto nla_put_failure;
-
- nlmsg_end(skb, nlh);
- return 0;
-
-nla_put_failure:
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
-}
-
-/* Dump information about entries, in response to GETNEIGH */
-static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct net_device *dev,
- struct net_device *filter_dev, int idx)
+static int dsa_slave_port_fdb_dump(struct net_device *dev,
+ struct switchdev_obj *obj)
{
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
unsigned char addr[ETH_ALEN] = { 0 };
+ u16 vid = 0;
int ret;
- if (!ds->drv->fdb_getnext)
+ if (!ds->drv->port_fdb_getnext)
return -EOPNOTSUPP;
- for (; ; idx++) {
+ for (;;) {
bool is_static;
- ret = ds->drv->fdb_getnext(ds, p->port, addr, &is_static);
+ ret = ds->drv->port_fdb_getnext(ds, p->port, addr, &vid,
+ &is_static);
if (ret < 0)
break;
- if (idx < cb->args[0])
- continue;
+ obj->u.fdb.addr = addr;
+ obj->u.fdb.vid = vid;
+ obj->u.fdb.ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE;
- ret = dsa_slave_fill_info(dev, skb, addr, 0,
- is_static,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH, NLM_F_MULTI);
+ ret = obj->cb(dev, obj);
if (ret < 0)
break;
}
- return idx;
+ return ret == -ENOENT ? 0 : ret;
}
static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
@@ -363,6 +327,62 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
return ret;
}
+static int dsa_slave_port_obj_add(struct net_device *dev,
+ struct switchdev_obj *obj)
+{
+ int err;
+
+ /* For the prepare phase, ensure the full set of changes is feasable in
+ * one go in order to signal a failure properly. If an operation is not
+ * supported, return -EOPNOTSUPP.
+ */
+
+ switch (obj->id) {
+ case SWITCHDEV_OBJ_PORT_FDB:
+ err = dsa_slave_port_fdb_add(dev, obj);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ return err;
+}
+
+static int dsa_slave_port_obj_del(struct net_device *dev,
+ struct switchdev_obj *obj)
+{
+ int err;
+
+ switch (obj->id) {
+ case SWITCHDEV_OBJ_PORT_FDB:
+ err = dsa_slave_port_fdb_del(dev, obj);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ return err;
+}
+
+static int dsa_slave_port_obj_dump(struct net_device *dev,
+ struct switchdev_obj *obj)
+{
+ int err;
+
+ switch (obj->id) {
+ case SWITCHDEV_OBJ_PORT_FDB:
+ err = dsa_slave_port_fdb_dump(dev, obj);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ return err;
+}
+
static int dsa_slave_bridge_port_join(struct net_device *dev,
struct net_device *br)
{
@@ -418,24 +438,53 @@ static int dsa_slave_port_attr_get(struct net_device *dev,
return 0;
}
-static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
+static inline netdev_tx_t dsa_netpoll_send_skb(struct dsa_slave_priv *p,
+ struct sk_buff *skb)
{
- struct dsa_slave_priv *p = netdev_priv(dev);
-
- return p->xmit(skb, dev);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ if (p->netpoll)
+ netpoll_send_skb(p->netpoll, skb);
+#else
+ BUG();
+#endif
+ return NETDEV_TX_OK;
}
-static netdev_tx_t dsa_slave_notag_xmit(struct sk_buff *skb,
- struct net_device *dev)
+static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
+ struct sk_buff *nskb;
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ /* Transmit function may have to reallocate the original SKB */
+ nskb = p->xmit(skb, dev);
+ if (!nskb)
+ return NETDEV_TX_OK;
+
+ /* SKB for netpoll still need to be mangled with the protocol-specific
+ * tag to be successfully transmitted
+ */
+ if (unlikely(netpoll_tx_running(dev)))
+ return dsa_netpoll_send_skb(p, nskb);
- skb->dev = p->parent->dst->master_netdev;
- dev_queue_xmit(skb);
+ /* Queue the SKB for transmission on the parent interface, but
+ * do not modify its EtherType
+ */
+ nskb->dev = p->parent->dst->master_netdev;
+ dev_queue_xmit(nskb);
return NETDEV_TX_OK;
}
+static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ /* Just return the original SKB */
+ return skb;
+}
+
/* ethtool operations *******************************************************/
static int
@@ -665,6 +714,49 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
return ret;
}
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static int dsa_slave_netpoll_setup(struct net_device *dev,
+ struct netpoll_info *ni)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ struct net_device *master = ds->dst->master_netdev;
+ struct netpoll *netpoll;
+ int err = 0;
+
+ netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL);
+ if (!netpoll)
+ return -ENOMEM;
+
+ err = __netpoll_setup(netpoll, master);
+ if (err) {
+ kfree(netpoll);
+ goto out;
+ }
+
+ p->netpoll = netpoll;
+out:
+ return err;
+}
+
+static void dsa_slave_netpoll_cleanup(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct netpoll *netpoll = p->netpoll;
+
+ if (!netpoll)
+ return;
+
+ p->netpoll = NULL;
+
+ __netpoll_free_async(netpoll);
+}
+
+static void dsa_slave_poll_controller(struct net_device *dev)
+{
+}
+#endif
+
static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_settings = dsa_slave_get_settings,
.set_settings = dsa_slave_set_settings,
@@ -692,16 +784,24 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_change_rx_flags = dsa_slave_change_rx_flags,
.ndo_set_rx_mode = dsa_slave_set_rx_mode,
.ndo_set_mac_address = dsa_slave_set_mac_address,
- .ndo_fdb_add = dsa_slave_fdb_add,
- .ndo_fdb_del = dsa_slave_fdb_del,
- .ndo_fdb_dump = dsa_slave_fdb_dump,
+ .ndo_fdb_add = switchdev_port_fdb_add,
+ .ndo_fdb_del = switchdev_port_fdb_del,
+ .ndo_fdb_dump = switchdev_port_fdb_dump,
.ndo_do_ioctl = dsa_slave_ioctl,
.ndo_get_iflink = dsa_slave_get_iflink,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_netpoll_setup = dsa_slave_netpoll_setup,
+ .ndo_netpoll_cleanup = dsa_slave_netpoll_cleanup,
+ .ndo_poll_controller = dsa_slave_poll_controller,
+#endif
};
static const struct switchdev_ops dsa_slave_switchdev_ops = {
.switchdev_port_attr_get = dsa_slave_port_attr_get,
.switchdev_port_attr_set = dsa_slave_port_attr_set,
+ .switchdev_port_obj_add = dsa_slave_port_obj_add,
+ .switchdev_port_obj_del = dsa_slave_port_obj_del,
+ .switchdev_port_obj_dump = dsa_slave_port_obj_dump,
};
static void dsa_slave_adjust_link(struct net_device *dev)
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 83d3572..e2aadb7 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -58,14 +58,11 @@
#define BRCM_EG_TC_MASK 0x7
#define BRCM_EG_PID_MASK 0x1f
-static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
u8 *brcm_tag;
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
-
if (skb_cow_head(skb, BRCM_TAG_LEN) < 0)
goto out_free;
@@ -87,17 +84,11 @@ static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev)
brcm_tag[2] = BRCM_IG_DSTMAP2_MASK;
brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK;
- /* Queue the SKB for transmission on the parent interface, but
- * do not modify its EtherType
- */
- skb->dev = p->parent->dst->master_netdev;
- dev_queue_xmit(skb);
-
- return NETDEV_TX_OK;
+ return skb;
out_free:
kfree_skb(skb);
- return NETDEV_TX_OK;
+ return NULL;
}
static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index 2dab270..aa780e4 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -15,14 +15,11 @@
#define DSA_HLEN 4
-static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
u8 *dsa_header;
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
-
/*
* Convert the outermost 802.1q tag to a DSA tag for tagged
* packets, or insert a DSA tag between the addresses and
@@ -63,14 +60,11 @@ static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev)
dsa_header[3] = 0x00;
}
- skb->dev = p->parent->dst->master_netdev;
- dev_queue_xmit(skb);
-
- return NETDEV_TX_OK;
+ return skb;
out_free:
kfree_skb(skb);
- return NETDEV_TX_OK;
+ return NULL;
}
static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 9aeda59..2288c80 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -16,14 +16,11 @@
#define DSA_HLEN 4
#define EDSA_HLEN 8
-static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
u8 *edsa_header;
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
-
/*
* Convert the outermost 802.1q tag to a DSA tag and prepend
* a DSA ethertype field is the packet is tagged, or insert
@@ -76,14 +73,11 @@ static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev)
edsa_header[7] = 0x00;
}
- skb->dev = p->parent->dst->master_netdev;
- dev_queue_xmit(skb);
-
- return NETDEV_TX_OK;
+ return skb;
out_free:
kfree_skb(skb);
- return NETDEV_TX_OK;
+ return NULL;
}
static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index e268f9d..d25efc9 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -13,16 +13,13 @@
#include <linux/slab.h>
#include "dsa_priv.h"
-static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
struct sk_buff *nskb;
int padlen;
u8 *trailer;
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
-
/*
* We have to make sure that the trailer ends up as the very
* last 4 bytes of the packet. This means that we have to pad
@@ -36,7 +33,7 @@ static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev)
nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC);
if (nskb == NULL) {
kfree_skb(skb);
- return NETDEV_TX_OK;
+ return NULL;
}
skb_reserve(nskb, NET_IP_ALIGN);
@@ -57,10 +54,7 @@ static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev)
trailer[2] = 0x10;
trailer[3] = 0x00;
- nskb->dev = p->parent->dst->master_netdev;
- dev_queue_xmit(nskb);
-
- return NETDEV_TX_OK;
+ return nskb;
}
static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 77e0f0e..217127c 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -114,7 +114,7 @@ int eth_header(struct sk_buff *skb, struct net_device *dev,
EXPORT_SYMBOL(eth_header);
/**
- * eth_get_headlen - determine the the length of header for an ethernet frame
+ * eth_get_headlen - determine the length of header for an ethernet frame
* @data: pointer to start of frame
* @len: total length of frame
*
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
index b2155a1..8d5960a 100644
--- a/net/ieee802154/rdev-ops.h
+++ b/net/ieee802154/rdev-ops.h
@@ -24,6 +24,26 @@ rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
}
static inline int
+rdev_suspend(struct cfg802154_registered_device *rdev)
+{
+ int ret;
+ trace_802154_rdev_suspend(&rdev->wpan_phy);
+ ret = rdev->ops->suspend(&rdev->wpan_phy);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_resume(struct cfg802154_registered_device *rdev)
+{
+ int ret;
+ trace_802154_rdev_resume(&rdev->wpan_phy);
+ ret = rdev->ops->resume(&rdev->wpan_phy);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name,
unsigned char name_assign_type,
enum nl802154_iftype type, __le64 extended_addr)
diff --git a/net/ieee802154/sysfs.c b/net/ieee802154/sysfs.c
index 133b428..bd88525 100644
--- a/net/ieee802154/sysfs.c
+++ b/net/ieee802154/sysfs.c
@@ -14,11 +14,13 @@
*/
#include <linux/device.h>
+#include <linux/rtnetlink.h>
#include <net/cfg802154.h>
#include "core.h"
#include "sysfs.h"
+#include "rdev-ops.h"
static inline struct cfg802154_registered_device *
dev_to_rdev(struct device *dev)
@@ -62,10 +64,46 @@ static struct attribute *pmib_attrs[] = {
};
ATTRIBUTE_GROUPS(pmib);
+#ifdef CONFIG_PM_SLEEP
+static int wpan_phy_suspend(struct device *dev)
+{
+ struct cfg802154_registered_device *rdev = dev_to_rdev(dev);
+ int ret = 0;
+
+ if (rdev->ops->suspend) {
+ rtnl_lock();
+ ret = rdev_suspend(rdev);
+ rtnl_unlock();
+ }
+
+ return ret;
+}
+
+static int wpan_phy_resume(struct device *dev)
+{
+ struct cfg802154_registered_device *rdev = dev_to_rdev(dev);
+ int ret = 0;
+
+ if (rdev->ops->resume) {
+ rtnl_lock();
+ ret = rdev_resume(rdev);
+ rtnl_unlock();
+ }
+
+ return ret;
+}
+
+static SIMPLE_DEV_PM_OPS(wpan_phy_pm_ops, wpan_phy_suspend, wpan_phy_resume);
+#define WPAN_PHY_PM_OPS (&wpan_phy_pm_ops)
+#else
+#define WPAN_PHY_PM_OPS NULL
+#endif
+
struct class wpan_phy_class = {
.name = "ieee802154",
.dev_release = wpan_phy_release,
.dev_groups = pmib_groups,
+ .pm = WPAN_PHY_PM_OPS,
};
int wpan_phy_sysfs_init(void)
diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h
index 9b5f0eb..4399b7f 100644
--- a/net/ieee802154/trace.h
+++ b/net/ieee802154/trace.h
@@ -40,6 +40,28 @@
* rdev->ops traces *
*************************************************************/
+DECLARE_EVENT_CLASS(wpan_phy_only_evt,
+ TP_PROTO(struct wpan_phy *wpan_phy),
+ TP_ARGS(wpan_phy),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT, WPAN_PHY_PR_ARG)
+);
+
+DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_suspend,
+ TP_PROTO(struct wpan_phy *wpan_phy),
+ TP_ARGS(wpan_phy)
+);
+
+DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_resume,
+ TP_PROTO(struct wpan_phy *wpan_phy),
+ TP_ARGS(wpan_phy)
+);
+
TRACE_EVENT(802154_rdev_add_virtual_intf,
TP_PROTO(struct wpan_phy *wpan_phy, char *name,
enum nl802154_iftype type, __le64 extended_addr),
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9532ee8..cc4e498 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -112,6 +112,7 @@
#include <net/raw.h>
#include <net/icmp.h>
#include <net/inet_common.h>
+#include <net/ip_tunnels.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/secure_seq.h>
@@ -1780,6 +1781,8 @@ static int __init inet_init(void)
dev_add_pack(&ip_packet_type);
+ ip_tunnel_core_init();
+
rc = 0;
out:
return rc;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 6c8b1fb..34a3085 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -291,6 +291,40 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
kfree_skb(skb);
}
+/* Create and send an arp packet. */
+static void arp_send_dst(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw,
+ const unsigned char *src_hw,
+ const unsigned char *target_hw, struct sk_buff *oskb)
+{
+ struct sk_buff *skb;
+
+ /* arp on this interface. */
+ if (dev->flags & IFF_NOARP)
+ return;
+
+ skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ if (!skb)
+ return;
+
+ if (oskb)
+ skb_dst_copy(skb, oskb);
+
+ arp_xmit(skb);
+}
+
+void arp_send(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw, const unsigned char *src_hw,
+ const unsigned char *target_hw)
+{
+ arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw,
+ target_hw, NULL);
+}
+EXPORT_SYMBOL(arp_send);
+
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
__be32 saddr = 0;
@@ -346,8 +380,9 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
}
}
- arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
- dst_hw, dev->dev_addr, NULL);
+ arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+ dst_hw, dev->dev_addr, NULL,
+ dev->priv_flags & IFF_XMIT_DST_RELEASE ? NULL : skb);
}
static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
@@ -597,32 +632,6 @@ void arp_xmit(struct sk_buff *skb)
EXPORT_SYMBOL(arp_xmit);
/*
- * Create and send an arp packet.
- */
-void arp_send(int type, int ptype, __be32 dest_ip,
- struct net_device *dev, __be32 src_ip,
- const unsigned char *dest_hw, const unsigned char *src_hw,
- const unsigned char *target_hw)
-{
- struct sk_buff *skb;
-
- /*
- * No arp on this interface.
- */
-
- if (dev->flags&IFF_NOARP)
- return;
-
- skb = arp_create(type, ptype, dest_ip, dev, src_ip,
- dest_hw, src_hw, target_hw);
- if (!skb)
- return;
-
- arp_xmit(skb);
-}
-EXPORT_SYMBOL(arp_send);
-
-/*
* Process an arp request.
*/
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 574fad9..f915abf 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -74,7 +74,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED;
- inet_set_txhash(sk);
+ sk_set_txhash(sk);
inet->inet_id = jiffies;
sk_dst_set(sk, &rt->dst);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6bbc549..6b98de0 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -280,6 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_scope = scope;
fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+ fl4.flowi4_tun_key.tun_id = 0;
if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
} else {
@@ -313,6 +314,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.saddr = dst;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_tun_key.tun_id = 0;
no_addr = idev->ifa_list == NULL;
@@ -591,6 +593,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
[RTA_FLOW] = { .type = NLA_U32 },
+ [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [RTA_ENCAP] = { .type = NLA_NESTED },
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -656,6 +660,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
case RTA_TABLE:
cfg->fc_table = nla_get_u32(attr);
break;
+ case RTA_ENCAP:
+ cfg->fc_encap = attr;
+ break;
+ case RTA_ENCAP_TYPE:
+ cfg->fc_encap_type = nla_get_u16(attr);
+ break;
}
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3a06586..558e196 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -42,6 +42,7 @@
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>
+#include <net/lwtunnel.h>
#include "fib_lookup.h"
@@ -208,6 +209,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
change_nexthops(fi) {
if (nexthop_nh->nh_dev)
dev_put(nexthop_nh->nh_dev);
+ lwtstate_put(nexthop_nh->nh_lwtstate);
free_nh_exceptions(nexthop_nh);
rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
rt_fibinfo_free(&nexthop_nh->nh_rth_input);
@@ -266,6 +268,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
#endif
+ lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
return -1;
onh++;
@@ -366,6 +369,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
if (fi->fib_nhs) {
+ size_t nh_encapsize = 0;
/* Also handles the special case fib_nhs == 1 */
/* each nexthop is packed in an attribute */
@@ -374,8 +378,21 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
/* may contain flow and gateway attribute */
nhsize += 2 * nla_total_size(4);
+ /* grab encap info */
+ for_nexthops(fi) {
+ if (nh->nh_lwtstate) {
+ /* RTA_ENCAP_TYPE */
+ nh_encapsize += lwtunnel_get_encap_size(
+ nh->nh_lwtstate);
+ /* RTA_ENCAP */
+ nh_encapsize += nla_total_size(2);
+ }
+ } endfor_nexthops(fi);
+
/* all nexthops are packed in a nested attribute */
- payload += nla_total_size(fi->fib_nhs * nhsize);
+ payload += nla_total_size((fi->fib_nhs * nhsize) +
+ nh_encapsize);
+
}
return payload;
@@ -421,13 +438,15 @@ static int fib_detect_death(struct fib_info *fi, int order,
if (n) {
state = n->nud_state;
neigh_release(n);
+ } else {
+ return 0;
}
if (state == NUD_REACHABLE)
return 0;
if ((state & NUD_VALID) && order != dflt)
return 0;
if ((state & NUD_VALID) ||
- (*last_idx < 0 && order > dflt)) {
+ (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
*last_resort = fi;
*last_idx = order;
}
@@ -452,6 +471,9 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
int remaining, struct fib_config *cfg)
{
+ struct net *net = cfg->fc_nlinfo.nl_net;
+ int ret;
+
change_nexthops(fi) {
int attrlen;
@@ -475,18 +497,66 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
if (nexthop_nh->nh_tclassid)
fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
+ nla = nla_find(attrs, attrlen, RTA_ENCAP);
+ if (nla) {
+ struct lwtunnel_state *lwtstate;
+ struct net_device *dev = NULL;
+ struct nlattr *nla_entype;
+
+ nla_entype = nla_find(attrs, attrlen,
+ RTA_ENCAP_TYPE);
+ if (!nla_entype)
+ goto err_inval;
+ if (cfg->fc_oif)
+ dev = __dev_get_by_index(net, cfg->fc_oif);
+ ret = lwtunnel_build_state(dev, nla_get_u16(
+ nla_entype),
+ nla, &lwtstate);
+ if (ret)
+ goto errout;
+ nexthop_nh->nh_lwtstate =
+ lwtstate_get(lwtstate);
+ }
}
rtnh = rtnh_next(rtnh, &remaining);
} endfor_nexthops(fi);
return 0;
+
+err_inval:
+ ret = -EINVAL;
+
+errout:
+ return ret;
}
#endif
+int fib_encap_match(struct net *net, u16 encap_type,
+ struct nlattr *encap,
+ int oif, const struct fib_nh *nh)
+{
+ struct lwtunnel_state *lwtstate;
+ struct net_device *dev = NULL;
+ int ret;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE)
+ return 0;
+
+ if (oif)
+ dev = __dev_get_by_index(net, oif);
+ ret = lwtunnel_build_state(dev, encap_type,
+ encap, &lwtstate);
+ if (!ret)
+ return lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
+
+ return 0;
+}
+
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
+ struct net *net = cfg->fc_nlinfo.nl_net;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
struct rtnexthop *rtnh;
int remaining;
@@ -496,6 +566,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
return 1;
if (cfg->fc_oif || cfg->fc_gw) {
+ if (cfg->fc_encap) {
+ if (fib_encap_match(net, cfg->fc_encap_type,
+ cfg->fc_encap, cfg->fc_oif,
+ fi->fib_nh))
+ return 1;
+ }
if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
(!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
return 0;
@@ -882,6 +958,21 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
} else {
struct fib_nh *nh = fi->fib_nh;
+ if (cfg->fc_encap) {
+ struct lwtunnel_state *lwtstate;
+ struct net_device *dev = NULL;
+
+ if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
+ goto err_inval;
+ if (cfg->fc_oif)
+ dev = __dev_get_by_index(net, cfg->fc_oif);
+ err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+ cfg->fc_encap, &lwtstate);
+ if (err)
+ goto failure;
+
+ nh->nh_lwtstate = lwtstate_get(lwtstate);
+ }
nh->nh_oif = cfg->fc_oif;
nh->nh_gw = cfg->fc_gw;
nh->nh_flags = cfg->fc_flags;
@@ -1055,6 +1146,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
goto nla_put_failure;
#endif
+ if (fi->fib_nh->nh_lwtstate)
+ lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate);
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (fi->fib_nhs > 1) {
@@ -1090,6 +1183,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
goto nla_put_failure;
#endif
+ if (nh->nh_lwtstate)
+ lwtunnel_fill_encap(skb, nh->nh_lwtstate);
/* length of rtnetlink header + attributes */
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
} endfor_nexthops(fi);
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 4a7b5b2..d9c552a 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -31,7 +31,6 @@
#include <net/xfrm.h>
static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
-static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
int gre_add_protocol(const struct gre_protocol *proto, u8 version)
{
@@ -61,197 +60,6 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version)
}
EXPORT_SYMBOL_GPL(gre_del_protocol);
-void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
- int hdr_len)
-{
- struct gre_base_hdr *greh;
-
- skb_push(skb, hdr_len);
-
- skb_reset_transport_header(skb);
- greh = (struct gre_base_hdr *)skb->data;
- greh->flags = tnl_flags_to_gre_flags(tpi->flags);
- greh->protocol = tpi->proto;
-
- if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
- __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
-
- if (tpi->flags&TUNNEL_SEQ) {
- *ptr = tpi->seq;
- ptr--;
- }
- if (tpi->flags&TUNNEL_KEY) {
- *ptr = tpi->key;
- ptr--;
- }
- if (tpi->flags&TUNNEL_CSUM &&
- !(skb_shinfo(skb)->gso_type &
- (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
- *ptr = 0;
- *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
- skb->len, 0));
- }
- }
-}
-EXPORT_SYMBOL_GPL(gre_build_header);
-
-static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
- bool *csum_err)
-{
- const struct gre_base_hdr *greh;
- __be32 *options;
- int hdr_len;
-
- if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
- return -EINVAL;
-
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
- if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
- return -EINVAL;
-
- tpi->flags = gre_flags_to_tnl_flags(greh->flags);
- hdr_len = ip_gre_calc_hlen(tpi->flags);
-
- if (!pskb_may_pull(skb, hdr_len))
- return -EINVAL;
-
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
- tpi->proto = greh->protocol;
-
- options = (__be32 *)(greh + 1);
- if (greh->flags & GRE_CSUM) {
- if (skb_checksum_simple_validate(skb)) {
- *csum_err = true;
- return -EINVAL;
- }
-
- skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
- null_compute_pseudo);
-
- options++;
- }
-
- if (greh->flags & GRE_KEY) {
- tpi->key = *options;
- options++;
- } else
- tpi->key = 0;
-
- if (unlikely(greh->flags & GRE_SEQ)) {
- tpi->seq = *options;
- options++;
- } else
- tpi->seq = 0;
-
- /* WCCP version 1 and 2 protocol decoding.
- * - Change protocol to IP
- * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
- */
- if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
- tpi->proto = htons(ETH_P_IP);
- if ((*(u8 *)options & 0xF0) != 0x40) {
- hdr_len += 4;
- if (!pskb_may_pull(skb, hdr_len))
- return -EINVAL;
- }
- }
-
- return iptunnel_pull_header(skb, hdr_len, tpi->proto);
-}
-
-static int gre_cisco_rcv(struct sk_buff *skb)
-{
- struct tnl_ptk_info tpi;
- int i;
- bool csum_err = false;
-
-#ifdef CONFIG_NET_IPGRE_BROADCAST
- if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
- /* Looped back packet, drop it! */
- if (rt_is_output_route(skb_rtable(skb)))
- goto drop;
- }
-#endif
-
- if (parse_gre_header(skb, &tpi, &csum_err) < 0)
- goto drop;
-
- rcu_read_lock();
- for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
- struct gre_cisco_protocol *proto;
- int ret;
-
- proto = rcu_dereference(gre_cisco_proto_list[i]);
- if (!proto)
- continue;
- ret = proto->handler(skb, &tpi);
- if (ret == PACKET_RCVD) {
- rcu_read_unlock();
- return 0;
- }
- }
- rcu_read_unlock();
-
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-drop:
- kfree_skb(skb);
- return 0;
-}
-
-static void gre_cisco_err(struct sk_buff *skb, u32 info)
-{
- /* All the routers (except for Linux) return only
- * 8 bytes of packet payload. It means, that precise relaying of
- * ICMP in the real Internet is absolutely infeasible.
- *
- * Moreover, Cisco "wise men" put GRE key to the third word
- * in GRE header. It makes impossible maintaining even soft
- * state for keyed
- * GRE tunnels with enabled checksum. Tell them "thank you".
- *
- * Well, I wonder, rfc1812 was written by Cisco employee,
- * what the hell these idiots break standards established
- * by themselves???
- */
-
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
- struct tnl_ptk_info tpi;
- bool csum_err = false;
- int i;
-
- if (parse_gre_header(skb, &tpi, &csum_err)) {
- if (!csum_err) /* ignore csum errors. */
- return;
- }
-
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- skb->dev->ifindex, 0, IPPROTO_GRE, 0);
- return;
- }
- if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
- IPPROTO_GRE, 0);
- return;
- }
-
- rcu_read_lock();
- for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
- struct gre_cisco_protocol *proto;
-
- proto = rcu_dereference(gre_cisco_proto_list[i]);
- if (!proto)
- continue;
-
- if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
- goto out;
-
- }
-out:
- rcu_read_unlock();
-}
-
static int gre_rcv(struct sk_buff *skb)
{
const struct gre_protocol *proto;
@@ -302,60 +110,19 @@ static const struct net_protocol net_gre_protocol = {
.netns_ok = 1,
};
-static const struct gre_protocol ipgre_protocol = {
- .handler = gre_cisco_rcv,
- .err_handler = gre_cisco_err,
-};
-
-int gre_cisco_register(struct gre_cisco_protocol *newp)
-{
- struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
- &gre_cisco_proto_list[newp->priority];
-
- return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
-}
-EXPORT_SYMBOL_GPL(gre_cisco_register);
-
-int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
-{
- struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
- &gre_cisco_proto_list[del_proto->priority];
- int ret;
-
- ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
-
- if (ret)
- return ret;
-
- synchronize_net();
- return 0;
-}
-EXPORT_SYMBOL_GPL(gre_cisco_unregister);
-
static int __init gre_init(void)
{
pr_info("GRE over IPv4 demultiplexor driver\n");
if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
pr_err("can't add protocol\n");
- goto err;
- }
-
- if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
- pr_info("%s: can't add ipgre handler\n", __func__);
- goto err_gre;
+ return -EAGAIN;
}
-
return 0;
-err_gre:
- inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
-err:
- return -EAGAIN;
}
static void __exit gre_exit(void)
{
- gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f5203fb..c0556f1 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -496,6 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
}
/* Ugh! */
orefdst = skb_in->_skb_refdst; /* save old refdst */
+ skb_dst_set(skb_in, NULL);
err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
RT_TOS(tos), rt2->dst.dev);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 0cb9165..8912019 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -343,7 +343,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk2;
const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw = NULL;
- int twrefcnt = 0;
spin_lock(lock);
@@ -371,21 +370,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
WARN_ON(!sk_unhashed(sk));
__sk_nulls_add_node_rcu(sk, &head->chain);
if (tw) {
- twrefcnt = inet_twsk_unhash(tw);
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
}
spin_unlock(lock);
- if (twrefcnt)
- inet_twsk_put(tw);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
if (twp) {
*twp = tw;
} else if (tw) {
/* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw);
-
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
}
return 0;
@@ -403,13 +398,12 @@ static u32 inet_sk_port_offset(const struct sock *sk)
inet->inet_dport);
}
-int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
+void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
struct inet_ehash_bucket *head;
spinlock_t *lock;
- int twrefcnt = 0;
WARN_ON(!sk_unhashed(sk));
@@ -420,23 +414,22 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
spin_lock(lock);
__sk_nulls_add_node_rcu(sk, list);
- if (tw) {
- WARN_ON(sk->sk_hash != tw->tw_hash);
- twrefcnt = inet_twsk_unhash(tw);
+ if (osk) {
+ WARN_ON(sk->sk_hash != osk->sk_hash);
+ sk_nulls_del_node_init_rcu(osk);
}
spin_unlock(lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
- return twrefcnt;
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
-int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
+void __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
if (sk->sk_state != TCP_LISTEN)
- return __inet_hash_nolisten(sk, tw);
+ return __inet_hash_nolisten(sk, osk);
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
@@ -445,7 +438,6 @@ int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
__sk_nulls_add_node_rcu(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
spin_unlock(&ilb->lock);
- return 0;
}
EXPORT_SYMBOL(__inet_hash);
@@ -492,7 +484,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct inet_bind_bucket *tb;
int ret;
struct net *net = sock_net(sk);
- int twrefcnt = 1;
if (!snum) {
int i, remaining, low, high, port;
@@ -560,19 +551,14 @@ ok:
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
- twrefcnt += __inet_hash_nolisten(sk, tw);
+ __inet_hash_nolisten(sk, (struct sock *)tw);
}
if (tw)
- twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
+ inet_twsk_bind_unhash(tw, hinfo);
spin_unlock(&head->lock);
- if (tw) {
- inet_twsk_deschedule(tw);
- while (twrefcnt) {
- twrefcnt--;
- inet_twsk_put(tw);
- }
- }
+ if (tw)
+ inet_twsk_deschedule_put(tw);
ret = 0;
goto out;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2ffbd16..ae22cc2 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -18,28 +18,6 @@
/**
- * inet_twsk_unhash - unhash a timewait socket from established hash
- * @tw: timewait socket
- *
- * unhash a timewait socket from established hash, if hashed.
- * ehash lock must be held by caller.
- * Returns 1 if caller should call inet_twsk_put() after lock release.
- */
-int inet_twsk_unhash(struct inet_timewait_sock *tw)
-{
- if (hlist_nulls_unhashed(&tw->tw_node))
- return 0;
-
- hlist_nulls_del_rcu(&tw->tw_node);
- sk_nulls_node_init(&tw->tw_node);
- /*
- * We cannot call inet_twsk_put() ourself under lock,
- * caller must call it for us.
- */
- return 1;
-}
-
-/**
* inet_twsk_bind_unhash - unhash a timewait socket from bind hash
* @tw: timewait socket
* @hashinfo: hashinfo pointer
@@ -48,35 +26,29 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw)
* bind hash lock must be held by caller.
* Returns 1 if caller should call inet_twsk_put() after lock release.
*/
-int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo)
{
struct inet_bind_bucket *tb = tw->tw_tb;
if (!tb)
- return 0;
+ return;
__hlist_del(&tw->tw_bind_node);
tw->tw_tb = NULL;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
- /*
- * We cannot call inet_twsk_put() ourself under lock,
- * caller must call it for us.
- */
- return 1;
+ __sock_put((struct sock *)tw);
}
/* Must be called with locally disabled BHs. */
static void inet_twsk_kill(struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
- struct inet_bind_hashbucket *bhead;
- int refcnt;
- /* Unlink from established hashes. */
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+ struct inet_bind_hashbucket *bhead;
spin_lock(lock);
- refcnt = inet_twsk_unhash(tw);
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
spin_unlock(lock);
/* Disassociate with bind bucket. */
@@ -84,11 +56,9 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
hashinfo->bhash_size)];
spin_lock(&bhead->lock);
- refcnt += inet_twsk_bind_unhash(tw, hashinfo);
+ inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
- BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
- atomic_sub(refcnt, &tw->tw_refcnt);
atomic_dec(&tw->tw_dr->tw_count);
inet_twsk_put(tw);
}
@@ -235,13 +205,17 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc);
* tcp_input.c to verify this.
*/
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void inet_twsk_deschedule(struct inet_timewait_sock *tw)
+/* This is for handling early-kills of TIME_WAIT sockets.
+ * Warning : consume reference.
+ * Caller should not access tw anymore.
+ */
+void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
{
if (del_timer_sync(&tw->tw_timer))
inet_twsk_kill(tw);
+ inet_twsk_put(tw);
}
-EXPORT_SYMBOL(inet_twsk_deschedule);
+EXPORT_SYMBOL(inet_twsk_deschedule_put);
void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
{
@@ -311,9 +285,8 @@ restart:
rcu_read_unlock();
local_bh_disable();
- inet_twsk_deschedule(tw);
+ inet_twsk_deschedule_put(tw);
local_bh_enable();
- inet_twsk_put(tw);
goto restart_rcu;
}
/* If the nulls value we got at the end of this lookup is
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 921138f..d96722a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -522,7 +522,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
int len;
int ihlen;
int err;
- int sum_truesize;
u8 ecn;
ipq_kill(qp);
@@ -590,32 +589,19 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
add_frag_mem_limit(qp->q.net, clone->truesize);
}
+ skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- sum_truesize = head->truesize;
- for (fp = head->next; fp;) {
- bool headstolen;
- int delta;
- struct sk_buff *next = fp->next;
-
- sum_truesize += fp->truesize;
+ for (fp=head->next; fp; fp = fp->next) {
+ head->data_len += fp->len;
+ head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
-
- if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
- kfree_skb_partial(fp, headstolen);
- } else {
- if (!skb_shinfo(head)->frag_list)
- skb_shinfo(head)->frag_list = fp;
- head->data_len += fp->len;
- head->len += fp->len;
- head->truesize += fp->truesize;
- }
- fp = next;
+ head->truesize += fp->truesize;
}
- sub_frag_mem_limit(qp->q.net, sum_truesize);
+ sub_frag_mem_limit(qp->q.net, head->truesize);
head->next = NULL;
head->dev = dev;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5fd7064..fb44d69 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -25,6 +25,7 @@
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
+#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
@@ -47,6 +48,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
+#include <net/dst_metadata.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -121,8 +123,127 @@ static int ipgre_tunnel_init(struct net_device *dev);
static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly;
-static int ipgre_err(struct sk_buff *skb, u32 info,
- const struct tnl_ptk_info *tpi)
+static int ip_gre_calc_hlen(__be16 o_flags)
+{
+ int addend = 4;
+
+ if (o_flags & TUNNEL_CSUM)
+ addend += 4;
+ if (o_flags & TUNNEL_KEY)
+ addend += 4;
+ if (o_flags & TUNNEL_SEQ)
+ addend += 4;
+ return addend;
+}
+
+static __be16 gre_flags_to_tnl_flags(__be16 flags)
+{
+ __be16 tflags = 0;
+
+ if (flags & GRE_CSUM)
+ tflags |= TUNNEL_CSUM;
+ if (flags & GRE_ROUTING)
+ tflags |= TUNNEL_ROUTING;
+ if (flags & GRE_KEY)
+ tflags |= TUNNEL_KEY;
+ if (flags & GRE_SEQ)
+ tflags |= TUNNEL_SEQ;
+ if (flags & GRE_STRICT)
+ tflags |= TUNNEL_STRICT;
+ if (flags & GRE_REC)
+ tflags |= TUNNEL_REC;
+ if (flags & GRE_VERSION)
+ tflags |= TUNNEL_VERSION;
+
+ return tflags;
+}
+
+static __be16 tnl_flags_to_gre_flags(__be16 tflags)
+{
+ __be16 flags = 0;
+
+ if (tflags & TUNNEL_CSUM)
+ flags |= GRE_CSUM;
+ if (tflags & TUNNEL_ROUTING)
+ flags |= GRE_ROUTING;
+ if (tflags & TUNNEL_KEY)
+ flags |= GRE_KEY;
+ if (tflags & TUNNEL_SEQ)
+ flags |= GRE_SEQ;
+ if (tflags & TUNNEL_STRICT)
+ flags |= GRE_STRICT;
+ if (tflags & TUNNEL_REC)
+ flags |= GRE_REC;
+ if (tflags & TUNNEL_VERSION)
+ flags |= GRE_VERSION;
+
+ return flags;
+}
+
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err)
+{
+ const struct gre_base_hdr *greh;
+ __be32 *options;
+ int hdr_len;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
+
+ tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ hdr_len = ip_gre_calc_hlen(tpi->flags);
+
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ tpi->proto = greh->protocol;
+
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (skb_checksum_simple_validate(skb)) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+
+ skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
+ options++;
+ }
+
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else {
+ tpi->key = 0;
+ }
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else {
+ tpi->seq = 0;
+ }
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IP
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ tpi->proto = htons(ETH_P_IP);
+ if ((*(u8 *)options & 0xF0) != 0x40) {
+ hdr_len += 4;
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+ }
+ }
+ return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+}
+
+static void ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
{
/* All the routers (except for Linux) return only
@@ -148,14 +269,14 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
switch (type) {
default:
case ICMP_PARAMETERPROB:
- return PACKET_RCVD;
+ return;
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
case ICMP_PORT_UNREACH:
/* Impossible event. */
- return PACKET_RCVD;
+ return;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -164,9 +285,10 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
break;
}
break;
+
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
- return PACKET_RCVD;
+ return;
break;
case ICMP_REDIRECT:
@@ -183,26 +305,85 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
iph->daddr, iph->saddr, tpi->key);
if (!t)
- return PACKET_REJECT;
+ return;
if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
- return PACKET_RCVD;
+ return;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- return PACKET_RCVD;
+ return;
if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
- return PACKET_RCVD;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ *
+ * Moreover, Cisco "wise men" put GRE key to the third word
+ * in GRE header. It makes impossible maintaining even soft
+ * state for keyed
+ * GRE tunnels with enabled checksum. Tell them "thank you".
+ *
+ * Well, I wonder, rfc1812 was written by Cisco employee,
+ * what the hell these idiots break standards established
+ * by themselves???
+ */
+
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+
+ if (parse_gre_header(skb, &tpi, &csum_err)) {
+ if (!csum_err) /* ignore csum errors. */
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+ return;
+ }
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
+ IPPROTO_GRE, 0);
+ return;
+ }
+
+ ipgre_err(skb, info, &tpi);
+}
+
+static __be64 key_to_tunnel_id(__be32 key)
+{
+#ifdef __BIG_ENDIAN
+ return (__force __be64)((__force u32)key);
+#else
+ return (__force __be64)((__force u64)key << 32);
+#endif
+}
+
+/* Returns the least-significant 32 bits of a __be64. */
+static __be32 tunnel_id_to_key(__be64 x)
+{
+#ifdef __BIG_ENDIAN
+ return (__force __be32)x;
+#else
+ return (__force __be32)((__force u64)x >> 32);
+#endif
}
static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
{
struct net *net = dev_net(skb->dev);
+ struct metadata_dst *tun_dst = NULL;
struct ip_tunnel_net *itn;
const struct iphdr *iph;
struct ip_tunnel *tunnel;
@@ -218,40 +399,194 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
if (tunnel) {
skb_pop_mac_header(skb);
- ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
+ if (tunnel->collect_md) {
+ struct ip_tunnel_info *info;
+
+ tun_dst = metadata_dst_alloc(0, GFP_ATOMIC);
+ if (!tun_dst)
+ return PACKET_REJECT;
+
+ info = &tun_dst->u.tun_info;
+ info->key.ipv4_src = iph->saddr;
+ info->key.ipv4_dst = iph->daddr;
+ info->key.ipv4_tos = iph->tos;
+ info->key.ipv4_ttl = iph->ttl;
+
+ info->mode = IP_TUNNEL_INFO_RX;
+ info->key.tun_flags = tpi->flags &
+ (TUNNEL_CSUM | TUNNEL_KEY);
+ info->key.tun_id = key_to_tunnel_id(tpi->key);
+
+ info->key.tp_src = 0;
+ info->key.tp_dst = 0;
+ }
+
+ ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
return PACKET_RCVD;
}
return PACKET_REJECT;
}
+static int gre_rcv(struct sk_buff *skb)
+{
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
+ /* Looped back packet, drop it! */
+ if (rt_is_output_route(skb_rtable(skb)))
+ goto drop;
+ }
+#endif
+
+ if (parse_gre_header(skb, &tpi, &csum_err) < 0)
+ goto drop;
+
+ if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
+ return 0;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
+ __be16 proto, __be32 key, __be32 seq)
+{
+ struct gre_base_hdr *greh;
+
+ skb_push(skb, hdr_len);
+
+ skb_reset_transport_header(skb);
+ greh = (struct gre_base_hdr *)skb->data;
+ greh->flags = tnl_flags_to_gre_flags(flags);
+ greh->protocol = proto;
+
+ if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) {
+ __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
+
+ if (flags & TUNNEL_SEQ) {
+ *ptr = seq;
+ ptr--;
+ }
+ if (flags & TUNNEL_KEY) {
+ *ptr = key;
+ ptr--;
+ }
+ if (flags & TUNNEL_CSUM &&
+ !(skb_shinfo(skb)->gso_type &
+ (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
+ *ptr = 0;
+ *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+ skb->len, 0));
+ }
+ }
+}
+
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params,
__be16 proto)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct tnl_ptk_info tpi;
- tpi.flags = tunnel->parms.o_flags;
- tpi.proto = proto;
- tpi.key = tunnel->parms.o_key;
if (tunnel->parms.o_flags & TUNNEL_SEQ)
tunnel->o_seqno++;
- tpi.seq = htonl(tunnel->o_seqno);
/* Push GRE header. */
- gre_build_header(skb, &tpi, tunnel->tun_hlen);
-
- skb_set_inner_protocol(skb, tpi.proto);
+ build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
+ proto, tunnel->parms.o_key, htonl(tunnel->o_seqno));
+ skb_set_inner_protocol(skb, proto);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
+static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
+ bool csum)
+{
+ return iptunnel_handle_offloads(skb, csum,
+ csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
+}
+
+static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel_info *tun_info;
+ struct net *net = dev_net(dev);
+ const struct ip_tunnel_key *key;
+ struct flowi4 fl;
+ struct rtable *rt;
+ int min_headroom;
+ int tunnel_hlen;
+ __be16 df, flags;
+ int err;
+
+ tun_info = skb_tunnel_info(skb, AF_INET);
+ if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX))
+ goto err_free_skb;
+
+ key = &tun_info->key;
+ memset(&fl, 0, sizeof(fl));
+ fl.daddr = key->ipv4_dst;
+ fl.saddr = key->ipv4_src;
+ fl.flowi4_tos = RT_TOS(key->ipv4_tos);
+ fl.flowi4_mark = skb->mark;
+ fl.flowi4_proto = IPPROTO_GRE;
+
+ rt = ip_route_output_key(net, &fl);
+ if (IS_ERR(rt))
+ goto err_free_skb;
+
+ tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);
+
+ min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ + tunnel_hlen + sizeof(struct iphdr);
+ if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+ int head_delta = SKB_DATA_ALIGN(min_headroom -
+ skb_headroom(skb) +
+ 16);
+ err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+ 0, GFP_ATOMIC);
+ if (unlikely(err))
+ goto err_free_rt;
+ }
+
+ /* Push Tunnel header. */
+ skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM));
+ if (IS_ERR(skb)) {
+ skb = NULL;
+ goto err_free_rt;
+ }
+
+ flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
+ build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB),
+ tunnel_id_to_key(tun_info->key.tun_id), 0);
+
+ df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
+ key->ipv4_dst, IPPROTO_GRE,
+ key->ipv4_tos, key->ipv4_ttl, df, false);
+ iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+ return;
+
+err_free_rt:
+ ip_rt_put(rt);
+err_free_skb:
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+}
+
static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tnl_params;
+ if (tunnel->collect_md) {
+ gre_fb_xmit(skb, dev);
+ return NETDEV_TX_OK;
+ }
+
if (dev->header_ops) {
/* Need space for new headers */
if (skb_cow_head(skb, dev->needed_headroom -
@@ -277,7 +612,6 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
goto out;
__gre_xmit(skb, dev, tnl_params, skb->protocol);
-
return NETDEV_TX_OK;
free_skb:
@@ -292,6 +626,11 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ if (tunnel->collect_md) {
+ gre_fb_xmit(skb, dev);
+ return NETDEV_TX_OK;
+ }
+
skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
if (IS_ERR(skb))
goto out;
@@ -300,7 +639,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
goto free_skb;
__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
-
return NETDEV_TX_OK;
free_skb:
@@ -530,10 +868,9 @@ static int ipgre_tunnel_init(struct net_device *dev)
return ip_tunnel_init(dev);
}
-static struct gre_cisco_protocol ipgre_protocol = {
- .handler = ipgre_rcv,
- .err_handler = ipgre_err,
- .priority = 0,
+static const struct gre_protocol ipgre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
};
static int __net_init ipgre_init_net(struct net *net)
@@ -596,8 +933,10 @@ out:
return ipgre_tunnel_validate(tb, data);
}
-static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
- struct ip_tunnel_parm *parms)
+static void ipgre_netlink_parms(struct net_device *dev,
+ struct nlattr *data[],
+ struct nlattr *tb[],
+ struct ip_tunnel_parm *parms)
{
memset(parms, 0, sizeof(*parms));
@@ -635,6 +974,12 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
parms->iph.frag_off = htons(IP_DF);
+
+ if (data[IFLA_GRE_COLLECT_METADATA]) {
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ t->collect_md = true;
+ }
}
/* This function returns true when ENCAP attributes are present in the nl msg */
@@ -712,7 +1057,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
return err;
}
- ipgre_netlink_parms(data, tb, &p);
+ ipgre_netlink_parms(dev, data, tb, &p);
return ip_tunnel_newlink(dev, tb, &p);
}
@@ -730,7 +1075,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
return err;
}
- ipgre_netlink_parms(data, tb, &p);
+ ipgre_netlink_parms(dev, data, tb, &p);
return ip_tunnel_changelink(dev, tb, &p);
}
@@ -765,6 +1110,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_GRE_ENCAP_DPORT */
nla_total_size(2) +
+ /* IFLA_GRE_COLLECT_METADATA */
+ nla_total_size(0) +
0;
}
@@ -796,6 +1143,11 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
t->encap.flags))
goto nla_put_failure;
+ if (t->collect_md) {
+ if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
+ goto nla_put_failure;
+ }
+
return 0;
nla_put_failure:
@@ -817,6 +1169,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
};
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
@@ -849,9 +1202,38 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
.get_link_net = ip_tunnel_get_link_net,
};
+struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
+ u8 name_assign_type)
+{
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct net_device *dev;
+ struct ip_tunnel *t;
+ int err;
+
+ memset(&tb, 0, sizeof(tb));
+
+ dev = rtnl_create_link(net, name, name_assign_type,
+ &ipgre_tap_ops, tb);
+ if (IS_ERR(dev))
+ return dev;
+
+ /* Configure flow based GRE device. */
+ t = netdev_priv(dev);
+ t->collect_md = true;
+
+ err = ipgre_newlink(net, dev, tb, NULL);
+ if (err < 0)
+ goto out;
+ return dev;
+out:
+ free_netdev(dev);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
+
static int __net_init ipgre_tap_init_net(struct net *net)
{
- return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
+ return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}
static void __net_exit ipgre_tap_exit_net(struct net *net)
@@ -881,7 +1263,7 @@ static int __init ipgre_init(void)
if (err < 0)
goto pnet_tap_faied;
- err = gre_cisco_register(&ipgre_protocol);
+ err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
if (err < 0) {
pr_info("%s: can't add protocol\n", __func__);
goto add_proto_failed;
@@ -900,7 +1282,7 @@ static int __init ipgre_init(void)
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
- gre_cisco_unregister(&ipgre_protocol);
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_faied:
@@ -912,7 +1294,7 @@ static void __exit ipgre_fini(void)
{
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
- gre_cisco_unregister(&ipgre_protocol);
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
unregister_pernet_device(&ipgre_tap_net_ops);
unregister_pernet_device(&ipgre_net_ops);
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 2db4c87..f4fc8a7 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -146,6 +146,7 @@
#include <net/xfrm.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
+#include <net/dst_metadata.h>
/*
* Process Router Attention IP option (RFC 2113)
@@ -331,7 +332,7 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
- if (!skb_dst(skb)) {
+ if (!skb_valid_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, skb->dev);
if (unlikely(err)) {
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 626d9e5..cbb51f3 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -230,10 +230,13 @@ skip_key_lookup:
if (cand)
return cand;
+ t = rcu_dereference(itn->collect_md_tun);
+ if (t)
+ return t;
+
if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
return netdev_priv(itn->fb_tunnel_dev);
-
return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
@@ -261,11 +264,15 @@ static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
struct hlist_head *head = ip_bucket(itn, &t->parms);
+ if (t->collect_md)
+ rcu_assign_pointer(itn->collect_md_tun, t);
hlist_add_head_rcu(&t->hash_node, head);
}
-static void ip_tunnel_del(struct ip_tunnel *t)
+static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
+ if (t->collect_md)
+ rcu_assign_pointer(itn->collect_md_tun, NULL);
hlist_del_init_rcu(&t->hash_node);
}
@@ -419,7 +426,8 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
}
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
- const struct tnl_ptk_info *tpi, bool log_ecn_error)
+ const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
+ bool log_ecn_error)
{
struct pcpu_sw_netstats *tstats;
const struct iphdr *iph = ip_hdr(skb);
@@ -478,6 +486,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
skb->dev = tunnel->dev;
}
+ if (tun_dst)
+ skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
gro_cells_receive(&tunnel->gro_cells, skb);
return 0;
@@ -806,7 +817,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
struct ip_tunnel_parm *p,
bool set_mtu)
{
- ip_tunnel_del(t);
+ ip_tunnel_del(itn, t);
t->parms.iph.saddr = p->iph.saddr;
t->parms.iph.daddr = p->iph.daddr;
t->parms.i_key = p->i_key;
@@ -967,7 +978,7 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
if (itn->fb_tunnel_dev != dev) {
- ip_tunnel_del(netdev_priv(dev));
+ ip_tunnel_del(itn, netdev_priv(dev));
unregister_netdevice_queue(dev, head);
}
}
@@ -1072,8 +1083,13 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
nt = netdev_priv(dev);
itn = net_generic(net, nt->ip_tnl_net_id);
- if (ip_tunnel_find(itn, p, dev->type))
- return -EEXIST;
+ if (nt->collect_md) {
+ if (rtnl_dereference(itn->collect_md_tun))
+ return -EEXIST;
+ } else {
+ if (ip_tunnel_find(itn, p, dev->type))
+ return -EEXIST;
+ }
nt->net = net;
nt->parms = *p;
@@ -1089,7 +1105,6 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
dev->mtu = mtu;
ip_tunnel_add(itn, nt);
-
out:
return err;
}
@@ -1163,6 +1178,10 @@ int ip_tunnel_init(struct net_device *dev)
iph->version = 4;
iph->ihl = 5;
+ if (tunnel->collect_md) {
+ dev->features |= NETIF_F_NETNS_LOCAL;
+ netif_keep_dst(dev);
+ }
return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
@@ -1176,7 +1195,7 @@ void ip_tunnel_uninit(struct net_device *dev)
itn = net_generic(net, tunnel->ip_tnl_net_id);
/* fb_tunnel_dev will be unregisted in net-exit call. */
if (itn->fb_tunnel_dev != dev)
- ip_tunnel_del(netdev_priv(dev));
+ ip_tunnel_del(itn, netdev_priv(dev));
ip_tunnel_dst_reset_all(tunnel);
}
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 6a51a71..5512f4e 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -32,6 +32,7 @@
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
+#include <linux/static_key.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -190,3 +191,123 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
return tot;
}
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
+
+static const struct nla_policy ip_tun_policy[IP_TUN_MAX + 1] = {
+ [IP_TUN_ID] = { .type = NLA_U64 },
+ [IP_TUN_DST] = { .type = NLA_U32 },
+ [IP_TUN_SRC] = { .type = NLA_U32 },
+ [IP_TUN_TTL] = { .type = NLA_U8 },
+ [IP_TUN_TOS] = { .type = NLA_U8 },
+ [IP_TUN_SPORT] = { .type = NLA_U16 },
+ [IP_TUN_DPORT] = { .type = NLA_U16 },
+ [IP_TUN_FLAGS] = { .type = NLA_U16 },
+};
+
+static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
+ struct lwtunnel_state **ts)
+{
+ struct ip_tunnel_info *tun_info;
+ struct lwtunnel_state *new_state;
+ struct nlattr *tb[IP_TUN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, IP_TUN_MAX, attr, ip_tun_policy);
+ if (err < 0)
+ return err;
+
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+ if (!new_state)
+ return -ENOMEM;
+
+ new_state->type = LWTUNNEL_ENCAP_IP;
+
+ tun_info = lwt_tun_info(new_state);
+
+ if (tb[IP_TUN_ID])
+ tun_info->key.tun_id = nla_get_u64(tb[IP_TUN_ID]);
+
+ if (tb[IP_TUN_DST])
+ tun_info->key.ipv4_dst = nla_get_be32(tb[IP_TUN_DST]);
+
+ if (tb[IP_TUN_SRC])
+ tun_info->key.ipv4_src = nla_get_be32(tb[IP_TUN_SRC]);
+
+ if (tb[IP_TUN_TTL])
+ tun_info->key.ipv4_ttl = nla_get_u8(tb[IP_TUN_TTL]);
+
+ if (tb[IP_TUN_TOS])
+ tun_info->key.ipv4_tos = nla_get_u8(tb[IP_TUN_TOS]);
+
+ if (tb[IP_TUN_SPORT])
+ tun_info->key.tp_src = nla_get_be16(tb[IP_TUN_SPORT]);
+
+ if (tb[IP_TUN_DPORT])
+ tun_info->key.tp_dst = nla_get_be16(tb[IP_TUN_DPORT]);
+
+ if (tb[IP_TUN_FLAGS])
+ tun_info->key.tun_flags = nla_get_u16(tb[IP_TUN_FLAGS]);
+
+ tun_info->mode = IP_TUNNEL_INFO_TX;
+ tun_info->options = NULL;
+ tun_info->options_len = 0;
+
+ *ts = new_state;
+
+ return 0;
+}
+
+static int ip_tun_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
+
+ if (nla_put_u64(skb, IP_TUN_ID, tun_info->key.tun_id) ||
+ nla_put_be32(skb, IP_TUN_DST, tun_info->key.ipv4_dst) ||
+ nla_put_be32(skb, IP_TUN_SRC, tun_info->key.ipv4_src) ||
+ nla_put_u8(skb, IP_TUN_TOS, tun_info->key.ipv4_tos) ||
+ nla_put_u8(skb, IP_TUN_TTL, tun_info->key.ipv4_ttl) ||
+ nla_put_u16(skb, IP_TUN_SPORT, tun_info->key.tp_src) ||
+ nla_put_u16(skb, IP_TUN_DPORT, tun_info->key.tp_dst) ||
+ nla_put_u16(skb, IP_TUN_FLAGS, tun_info->key.tun_flags))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ return nla_total_size(8) /* IP_TUN_ID */
+ + nla_total_size(4) /* IP_TUN_DST */
+ + nla_total_size(4) /* IP_TUN_SRC */
+ + nla_total_size(1) /* IP_TUN_TOS */
+ + nla_total_size(1) /* IP_TUN_TTL */
+ + nla_total_size(2) /* IP_TUN_SPORT */
+ + nla_total_size(2) /* IP_TUN_DPORT */
+ + nla_total_size(2); /* IP_TUN_FLAGS */
+}
+
+static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
+ .build_state = ip_tun_build_state,
+ .fill_encap = ip_tun_fill_encap_info,
+ .get_encap_size = ip_tun_encap_nlsize,
+};
+
+void __init ip_tunnel_core_init(void)
+{
+ lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
+}
+
+struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
+
+void ip_tunnel_need_metadata(void)
+{
+ static_key_slow_inc(&ip_tunnel_metadata_cnt);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
+
+void ip_tunnel_unneed_metadata(void)
+{
+ static_key_slow_dec(&ip_tunnel_metadata_cnt);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 8e7328c..ed4ef09 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -94,7 +94,7 @@
/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
#define CONF_SEND_RETRIES 6 /* Send six requests per open */
-#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */
+#define CONF_INTER_TIMEOUT (HZ) /* Inter-device timeout: 1 second */
#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */
#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 254238d..f34c31d 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -198,7 +198,7 @@ static int ipip_rcv(struct sk_buff *skb)
goto drop;
if (iptunnel_pull_header(skb, 0, tpi.proto))
goto drop;
- return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
+ return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
}
return -1;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 92305a1..c416cb3 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -240,7 +240,7 @@ get_entry(const void *base, unsigned int offset)
return (struct arpt_entry *)(base + offset);
}
-static inline __pure
+static inline
struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
{
return (void *)entry + entry->next_offset;
@@ -280,6 +280,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
table_base = private->entries;
jumpstack = (struct arpt_entry **)private->jumpstack[cpu];
+ /* No TEE support for arptables, so no need to switch to alternate
+ * stack. All targets that reenter must return absolute verdicts.
+ */
e = get_entry(table_base, private->hook_entry[hook]);
acpar.in = state->in;
@@ -325,11 +328,6 @@ unsigned int arpt_do_table(struct sk_buff *skb,
}
if (table_base + v
!= arpt_next_entry(e)) {
-
- if (stackidx >= private->stacksize) {
- verdict = NF_DROP;
- break;
- }
jumpstack[stackidx++] = e;
}
@@ -337,9 +335,6 @@ unsigned int arpt_do_table(struct sk_buff *skb,
continue;
}
- /* Targets which reenter must return
- * abs. verdicts
- */
acpar.target = t->u.kernel.target;
acpar.targinfo = t->data;
verdict = t->u.kernel.target->target(skb, &acpar);
@@ -372,10 +367,13 @@ static inline bool unconditional(const struct arpt_arp *arp)
/* Figures out from what hook each rule can be called: returns 0 if
* there are loops. Puts hook bitmask in comefrom.
+ *
+ * Keeps track of largest call depth seen and stores it in newinfo->stacksize.
*/
-static int mark_source_chains(const struct xt_table_info *newinfo,
+static int mark_source_chains(struct xt_table_info *newinfo,
unsigned int valid_hooks, void *entry0)
{
+ unsigned int calldepth, max_calldepth = 0;
unsigned int hook;
/* No recursion; use packet counter to save back ptrs (reset
@@ -391,6 +389,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
/* Set initial back pointer. */
e->counters.pcnt = pos;
+ calldepth = 0;
for (;;) {
const struct xt_standard_target *t
@@ -445,6 +444,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
(entry0 + pos + size);
e->counters.pcnt = pos;
pos += size;
+ if (calldepth > 0)
+ --calldepth;
} else {
int newpos = t->verdict;
@@ -459,6 +460,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
return 0;
}
+ if (entry0 + newpos != arpt_next_entry(e) &&
+ ++calldepth > max_calldepth)
+ max_calldepth = calldepth;
+
/* This a jump; chase it. */
duprintf("Jump rule %u -> %u\n",
pos, newpos);
@@ -475,6 +480,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
next:
duprintf("Finished chain %u\n", hook);
}
+ newinfo->stacksize = max_calldepth;
return 1;
}
@@ -664,9 +670,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
if (ret != 0)
break;
++i;
- if (strcmp(arpt_get_target(iter)->u.user.name,
- XT_ERROR_TARGET) == 0)
- ++newinfo->stacksize;
}
duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
if (ret != 0)
@@ -1439,9 +1442,6 @@ static int translate_compat_table(const char *name,
break;
}
++i;
- if (strcmp(arpt_get_target(iter1)->u.user.name,
- XT_ERROR_TARGET) == 0)
- ++newinfo->stacksize;
}
if (ret) {
/*
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 6c72fbb..787f99e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -276,7 +276,7 @@ static void trace_packet(const struct sk_buff *skb,
}
#endif
-static inline __pure
+static inline
struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
{
return (void *)entry + entry->next_offset;
@@ -296,12 +296,13 @@ ipt_do_table(struct sk_buff *skb,
const char *indev, *outdev;
const void *table_base;
struct ipt_entry *e, **jumpstack;
- unsigned int *stackptr, origptr, cpu;
+ unsigned int stackidx, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
unsigned int addend;
/* Initialization */
+ stackidx = 0;
ip = ip_hdr(skb);
indev = state->in ? state->in->name : nulldevname;
outdev = state->out ? state->out->name : nulldevname;
@@ -331,13 +332,21 @@ ipt_do_table(struct sk_buff *skb,
smp_read_barrier_depends();
table_base = private->entries;
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
- stackptr = per_cpu_ptr(private->stackptr, cpu);
- origptr = *stackptr;
+
+ /* Switch to alternate jumpstack if we're being invoked via TEE.
+ * TEE issues XT_CONTINUE verdict on original skb so we must not
+ * clobber the jumpstack.
+ *
+ * For recursion via REJECT or SYNPROXY the stack will be clobbered
+ * but it is no problem since absolute verdict is issued by these.
+ */
+ if (static_key_false(&xt_tee_enabled))
+ jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
e = get_entry(table_base, private->hook_entry[hook]);
- pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n",
- table->name, hook, origptr,
+ pr_debug("Entering %s(hook %u), UF %p\n",
+ table->name, hook,
get_entry(table_base, private->underflow[hook]));
do {
@@ -383,28 +392,24 @@ ipt_do_table(struct sk_buff *skb,
verdict = (unsigned int)(-v) - 1;
break;
}
- if (*stackptr <= origptr) {
+ if (stackidx == 0) {
e = get_entry(table_base,
private->underflow[hook]);
pr_debug("Underflow (this is normal) "
"to %p\n", e);
} else {
- e = jumpstack[--*stackptr];
+ e = jumpstack[--stackidx];
pr_debug("Pulled %p out from pos %u\n",
- e, *stackptr);
+ e, stackidx);
e = ipt_next_entry(e);
}
continue;
}
if (table_base + v != ipt_next_entry(e) &&
!(e->ip.flags & IPT_F_GOTO)) {
- if (*stackptr >= private->stacksize) {
- verdict = NF_DROP;
- break;
- }
- jumpstack[(*stackptr)++] = e;
+ jumpstack[stackidx++] = e;
pr_debug("Pushed %p into pos %u\n",
- e, *stackptr - 1);
+ e, stackidx - 1);
}
e = get_entry(table_base, v);
@@ -423,9 +428,8 @@ ipt_do_table(struct sk_buff *skb,
/* Verdict */
break;
} while (!acpar.hotdrop);
- pr_debug("Exiting %s; resetting sp from %u to %u\n",
- __func__, *stackptr, origptr);
- *stackptr = origptr;
+ pr_debug("Exiting %s; sp at %u\n", __func__, stackidx);
+
xt_write_recseq_end(addend);
local_bh_enable();
@@ -439,11 +443,15 @@ ipt_do_table(struct sk_buff *skb,
}
/* Figures out from what hook each rule can be called: returns 0 if
- there are loops. Puts hook bitmask in comefrom. */
+ * there are loops. Puts hook bitmask in comefrom.
+ *
+ * Keeps track of largest call depth seen and stores it in newinfo->stacksize.
+ */
static int
-mark_source_chains(const struct xt_table_info *newinfo,
+mark_source_chains(struct xt_table_info *newinfo,
unsigned int valid_hooks, void *entry0)
{
+ unsigned int calldepth, max_calldepth = 0;
unsigned int hook;
/* No recursion; use packet counter to save back ptrs (reset
@@ -457,6 +465,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
/* Set initial back pointer. */
e->counters.pcnt = pos;
+ calldepth = 0;
for (;;) {
const struct xt_standard_target *t
@@ -518,6 +527,9 @@ mark_source_chains(const struct xt_table_info *newinfo,
(entry0 + pos + size);
e->counters.pcnt = pos;
pos += size;
+ WARN_ON_ONCE(calldepth == 0);
+ if (calldepth > 0)
+ --calldepth;
} else {
int newpos = t->verdict;
@@ -531,9 +543,14 @@ mark_source_chains(const struct xt_table_info *newinfo,
newpos);
return 0;
}
+ if (entry0 + newpos != ipt_next_entry(e) &&
+ !(e->ip.flags & IPT_F_GOTO) &&
+ ++calldepth > max_calldepth)
+ max_calldepth = calldepth;
+
/* This a jump; chase it. */
- duprintf("Jump rule %u -> %u\n",
- pos, newpos);
+ duprintf("Jump rule %u -> %u, calldepth %d\n",
+ pos, newpos, calldepth);
} else {
/* ... this is a fallthru */
newpos = pos + e->next_offset;
@@ -547,6 +564,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
next:
duprintf("Finished chain %u\n", hook);
}
+ newinfo->stacksize = max_calldepth;
return 1;
}
@@ -826,9 +844,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
if (ret != 0)
return ret;
++i;
- if (strcmp(ipt_get_target(iter)->u.user.name,
- XT_ERROR_TARGET) == 0)
- ++newinfo->stacksize;
}
if (i != repl->num_entries) {
@@ -1744,9 +1759,6 @@ translate_compat_table(struct net *net,
if (ret != 0)
break;
++i;
- if (strcmp(ipt_get_target(iter1)->u.user.name,
- XT_ERROR_TARGET) == 0)
- ++newinfo->stacksize;
}
if (ret) {
/*
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index c88b7d4..b69e82b 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -49,12 +49,9 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
if (skb->nfct)
zone = nf_ct_zone((struct nf_conn *)skb->nfct);
#endif
-
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (skb->nf_bridge &&
- skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+ if (nf_bridge_in_prerouting(skb))
return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
-#endif
+
if (hooknum == NF_INET_PRE_ROUTING)
return IP_DEFRAG_CONNTRACK_IN + zone;
else
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 05ff44b..e89094a 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -363,7 +363,8 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
scoped);
rcu_read_unlock();
- if (!(isk->freebind || isk->transparent || has_addr ||
+ if (!(net->ipv6.sysctl.ip_nonlocal_bind ||
+ isk->freebind || isk->transparent || has_addr ||
addr_type == IPV6_ADDR_ANY))
return -EADDRNOTAVAIL;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index da5d483..3abd9d7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -300,6 +300,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE),
SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
+ SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
+ SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e681b85..18fd7c9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -91,6 +91,7 @@
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
@@ -102,6 +103,7 @@
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
+#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
@@ -109,6 +111,7 @@
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
+#include <net/ip_tunnels.h>
#define RT_FL_TOS(oldflp4) \
((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
@@ -1355,6 +1358,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
list_del(&rt->rt_uncached);
spin_unlock_bh(&ul->lock);
}
+ lwtstate_put(rt->rt_lwtstate);
}
void rt_flush_dev(struct net_device *dev)
@@ -1403,6 +1407,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
+ rt->rt_lwtstate = lwtstate_get(nh->nh_lwtstate);
if (unlikely(fnhe))
cached = rt_bind_exception(rt, fnhe, daddr);
else if (!(rt->dst.flags & DST_NOCACHE))
@@ -1488,6 +1493,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->rt_gateway = 0;
rth->rt_uses_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
+ rth->rt_lwtstate = NULL;
if (our) {
rth->dst.input= ip_local_deliver;
rth->rt_flags |= RTCF_LOCAL;
@@ -1546,7 +1552,6 @@ static int __mkroute_input(struct sk_buff *skb,
struct rtable *rth;
int err;
struct in_device *out_dev;
- unsigned int flags = 0;
bool do_cache;
u32 itag = 0;
@@ -1610,7 +1615,7 @@ static int __mkroute_input(struct sk_buff *skb,
}
rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
- rth->rt_flags = flags;
+ rth->rt_flags = 0;
rth->rt_type = res->type;
rth->rt_is_input = 1;
rth->rt_iif = 0;
@@ -1618,12 +1623,15 @@ static int __mkroute_input(struct sk_buff *skb,
rth->rt_gateway = 0;
rth->rt_uses_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
+ rth->rt_lwtstate = NULL;
RT_CACHE_STAT_INC(in_slow_tot);
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+ if (lwtunnel_output_redirect(rth->rt_lwtstate))
+ rth->dst.output = lwtunnel_output;
skb_dst_set(skb, &rth->dst);
out:
err = 0;
@@ -1662,6 +1670,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
{
struct fib_result res;
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct ip_tunnel_info *tun_info;
struct flowi4 fl4;
unsigned int flags = 0;
u32 itag = 0;
@@ -1679,6 +1688,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
by fib_lookup.
*/
+ tun_info = skb_tunnel_info(skb, AF_INET);
+ if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
+ fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
+ else
+ fl4.flowi4_tun_key.tun_id = 0;
+ skb_dst_drop(skb);
+
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
goto martian_source;
@@ -1792,6 +1808,8 @@ local_input:
rth->rt_gateway = 0;
rth->rt_uses_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
+ rth->rt_lwtstate = NULL;
+
RT_CACHE_STAT_INC(in_slow_tot);
if (res.type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
@@ -1981,7 +1999,7 @@ add:
rth->rt_gateway = 0;
rth->rt_uses_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
-
+ rth->rt_lwtstate = NULL;
RT_CACHE_STAT_INC(out_slow_tot);
if (flags & RTCF_LOCAL)
@@ -2004,6 +2022,8 @@ add:
}
rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
+ if (lwtunnel_output_redirect(rth->rt_lwtstate))
+ rth->dst.output = lwtunnel_output;
return rth;
}
@@ -2261,7 +2281,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_uses_gateway = ort->rt_uses_gateway;
INIT_LIST_HEAD(&rt->rt_uncached);
-
+ rt->rt_lwtstate = NULL;
dst_free(new);
}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index c037644..fd1405d 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -146,7 +146,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
bictcp_update(ca, tp->snd_cwnd);
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 8c6fd3d..167b6a3 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -264,7 +264,7 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
u32 prior_snd_cwnd;
u32 incr;
- if (tp->snd_cwnd < tp->snd_ssthresh && hystart_detect)
+ if (tcp_in_slow_start(tp) && hystart_detect)
tcp_cdg_hystart_update(sk);
if (after(ack, ca->rtt_seq) && ca->rtt.v64) {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 84be008..a2ed23c 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -365,10 +365,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
*/
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
- u32 cwnd = tp->snd_cwnd + acked;
+ u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
- if (cwnd > tp->snd_ssthresh)
- cwnd = tp->snd_ssthresh + 1;
acked -= cwnd - tp->snd_cwnd;
tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
@@ -413,7 +411,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
/* In "safe" area, increase. */
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
acked = tcp_slow_start(tp, acked);
if (!acked)
return;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 06d3d66..28011fb 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -320,7 +320,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
acked = tcp_slow_start(tp, acked);
@@ -439,7 +439,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
ca->delay_min = delay;
/* hystart triggers when cwnd is larger than some threshold */
- if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
+ if (hystart && tcp_in_slow_start(tp) &&
tp->snd_cwnd >= hystart_low_window)
hystart_update(sk, delay);
}
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 882c08a..db78424 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -116,7 +116,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
/* Update AIMD parameters.
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 58469ff..82f0d9e 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
/* In dangerous area, increase slowly.
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index f963b27..083831e 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -112,7 +112,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
rho_fractions = ca->rho_3ls - (ca->rho << 3);
- if (tp->snd_cwnd < tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
/*
* slow start
* INC = 2^RHO - 1
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index f71002e..2ab9bbb 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -268,7 +268,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
/* In slow start */
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 728f5b3..4e4d6bc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -109,6 +109,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
+#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -196,11 +197,13 @@ static void tcp_enter_quickack_mode(struct sock *sk)
* and the session is not interactive.
*/
-static inline bool tcp_in_quickack_mode(const struct sock *sk)
+static bool tcp_in_quickack_mode(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
- return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
+ return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
+ (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
}
static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
@@ -1037,7 +1040,7 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
* highest SACK block). Also calculate the lowest snd_nxt among the remaining
* retransmitted skbs to avoid some costly processing per ACKs.
*/
-static void tcp_mark_lost_retrans(struct sock *sk)
+static void tcp_mark_lost_retrans(struct sock *sk, int *flag)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -1078,7 +1081,7 @@ static void tcp_mark_lost_retrans(struct sock *sk)
if (after(received_upto, ack_seq)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
-
+ *flag |= FLAG_LOST_RETRANS;
tcp_skb_mark_lost_uncond_verify(tp, skb);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
} else {
@@ -1818,7 +1821,7 @@ advance_sp:
((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
- tcp_mark_lost_retrans(sk);
+ tcp_mark_lost_retrans(sk, &state->flag);
tcp_verify_left_out(tp);
out:
@@ -2474,15 +2477,14 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
return false;
}
-/* The cwnd reduction in CWR and Recovery use the PRR algorithm
- * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
+/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
* It computes the number of packets to send (sndcnt) based on packets newly
* delivered:
* 1) If the packets in flight is larger than ssthresh, PRR spreads the
* cwnd reductions across a full RTT.
- * 2) If packets in flight is lower than ssthresh (such as due to excess
- * losses and/or application stalls), do not perform any further cwnd
- * reductions, but instead slow start up to ssthresh.
+ * 2) Otherwise PRR uses packet conservation to send as much as delivered.
+ * But when the retransmits are acked without further losses, PRR
+ * slow starts cwnd up to ssthresh to speed up the recovery.
*/
static void tcp_init_cwnd_reduction(struct sock *sk)
{
@@ -2499,7 +2501,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
}
static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
- int fast_rexmit)
+ int fast_rexmit, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
@@ -2508,16 +2510,18 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
(tp->packets_out - tp->sacked_out);
tp->prr_delivered += newly_acked_sacked;
- if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+ if (delta < 0) {
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
- } else {
+ } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
+ !(flag & FLAG_LOST_RETRANS)) {
sndcnt = min_t(int, delta,
max_t(int, tp->prr_delivered - tp->prr_out,
newly_acked_sacked) + 1);
+ } else {
+ sndcnt = min(delta, newly_acked_sacked);
}
-
sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
@@ -2578,7 +2582,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
} else {
- tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
}
}
@@ -2588,6 +2592,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
icsk->icsk_mtup.probe_size = 0;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
}
static void tcp_mtup_probe_success(struct sock *sk)
@@ -2607,6 +2612,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
/* Do a simple retransmit without using the backoff mechanisms in
@@ -2675,7 +2681,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
tp->prior_ssthresh = 0;
tcp_init_undo(tp);
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ if (!tcp_in_cwnd_reduction(sk)) {
if (!ece_ack)
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tcp_init_cwnd_reduction(sk);
@@ -2735,7 +2741,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
/* Undo during fast recovery after partial ACK. */
static bool tcp_try_undo_partial(struct sock *sk, const int acked,
- const int prior_unsacked)
+ const int prior_unsacked, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2751,7 +2757,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
* mark more packets lost or retransmit more.
*/
if (tp->retrans_out) {
- tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
return true;
}
@@ -2838,7 +2844,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else {
- if (tcp_try_undo_partial(sk, acked, prior_unsacked))
+ if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag))
return;
/* Partial ACK arrived. Force fast retransmit. */
do_lost = tcp_is_reno(tp) ||
@@ -2851,9 +2857,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack);
- if (icsk->icsk_ca_state != TCP_CA_Open)
+ if (icsk->icsk_ca_state != TCP_CA_Open &&
+ !(flag & FLAG_LOST_RETRANS))
return;
- /* Fall through to processing in Open state. */
+ /* Change state if cwnd is undone or retransmits are lost */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
@@ -2888,7 +2895,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
- tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
+ tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
tcp_xmit_retransmit_queue(sk);
}
@@ -3562,10 +3569,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
&sack_state);
acked -= tp->packets_out;
- /* Advance cwnd if state allows */
- if (tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, acked);
-
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, acked, prior_unsacked,
@@ -3574,6 +3577,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
+ /* Advance cwnd if state allows */
+ if (tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, acked);
+
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
struct dst_entry *dst = __sk_dst_get(sk);
if (dst)
@@ -3947,7 +3954,6 @@ void tcp_reset(struct sock *sk)
static void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- const struct dst_entry *dst;
inet_csk_schedule_ack(sk);
@@ -3959,9 +3965,7 @@ static void tcp_fin(struct sock *sk)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- dst = __sk_dst_get(sk);
- if (!dst || !dst_metric(dst, RTAX_QUICKACK))
- inet_csk(sk)->icsk_ack.pingpong = 1;
+ inet_csk(sk)->icsk_ack.pingpong = 1;
break;
case TCP_CLOSE_WAIT:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0ea2e1c..93898e0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -222,7 +222,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (err)
goto failure;
- inet_set_txhash(sk);
+ sk_set_txhash(sk);
rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
inet->inet_sport, inet->inet_dport, sk);
@@ -1277,7 +1277,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newinet->mc_ttl = ip_hdr(skb)->ttl;
newinet->rcv_tos = ip_hdr(skb)->tos;
inet_csk(newsk)->icsk_ext_hdr_len = 0;
- inet_set_txhash(newsk);
+ sk_set_txhash(newsk);
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = newtp->write_seq ^ jiffies;
@@ -1683,8 +1683,7 @@ do_time_wait:
iph->daddr, th->dest,
inet_iif(skb));
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk));
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
goto process;
}
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index a51d63a..b3d64f6 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -461,7 +461,7 @@ void tcp_update_metrics(struct sock *sk)
tcp_metric_set(tm, TCP_METRIC_CWND,
tp->snd_cwnd);
}
- } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+ } else if (!tcp_in_slow_start(tp) &&
icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4bc00cb..6d8795b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -147,8 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (!th->fin ||
TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
- inet_twsk_deschedule(tw);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
return TCP_TW_RST;
}
@@ -198,8 +197,7 @@ kill_with_rst:
*/
if (sysctl_tcp_rfc1337 == 0) {
kill:
- inet_twsk_deschedule(tw);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
return TCP_TW_SUCCESS;
}
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b1c218d..7d1efa7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -163,7 +163,6 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
{
struct inet_connection_sock *icsk = inet_csk(sk);
const u32 now = tcp_time_stamp;
- const struct dst_entry *dst = __sk_dst_get(sk);
if (sysctl_tcp_slow_start_after_idle &&
(!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
@@ -174,9 +173,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
/* If it is a reply for ato after last received
* packet, enter pingpong mode.
*/
- if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato &&
- (!dst || !dst_metric(dst, RTAX_QUICKACK)))
- icsk->icsk_ack.pingpong = 1;
+ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+ icsk->icsk_ack.pingpong = 1;
}
/* Account for an ACK we sent. */
@@ -1776,7 +1774,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto send_now;
- if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
+ if (icsk->icsk_ca_state >= TCP_CA_Recovery)
goto send_now;
/* Avoid bursty behavior by allowing defer
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 333bcb2..bf5ea9e 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -22,7 +22,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else
tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 5b752f5..7149ebc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -649,4 +649,3 @@ void tcp_init_xmit_timers(struct sock *sk)
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
}
-EXPORT_SYMBOL(tcp_init_xmit_timers);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index a6cea1d..13951c4 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -225,7 +225,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
*/
diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
- if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (diff > gamma && tcp_in_slow_start(tp)) {
/* Going too fast. Time to slow down
* and switch to congestion avoidance.
*/
@@ -240,7 +240,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
- } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ } else if (tcp_in_slow_start(tp)) {
/* Slow start. */
tcp_slow_start(tp, acked);
} else {
@@ -281,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
vegas->minRTT = 0x7fffffff;
}
/* Use normal slow start */
- else if (tp->snd_cwnd <= tp->snd_ssthresh)
+ else if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
}
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 112151e..0d094b9 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -150,7 +150,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
/* Slow start. */
tcp_slow_start(tp, acked);
} else {
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 438a73a..643f613 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -5,16 +5,15 @@
# IPv6 as module will cause a CRASH if you try to unload it
menuconfig IPV6
tristate "The IPv6 protocol"
- default m
+ default y
---help---
- This is complemental support for the IP version 6.
- You will still be able to do traditional IPv4 networking as well.
+ Support for IP version 6 (IPv6).
For general information about IPv6, see
<https://en.wikipedia.org/wiki/IPv6>.
- For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
- For specific information about IPv6 under Linux, read the HOWTO at
- <http://www.bieringer.de/linux/IPv6/>.
+ For specific information about IPv6 under Linux, see
+ Documentation/networking/ipv6.txt and read the HOWTO at
+ <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/>
To compile this protocol support as a module, choose M here: the
module will be called ipv6.
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 21c2c81..53e3a9d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -195,6 +195,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.max_addresses = IPV6_MAX_ADDRESSES,
.accept_ra_defrtr = 1,
.accept_ra_from_local = 0,
+ .accept_ra_min_hop_limit= 1,
.accept_ra_pinfo = 1,
#ifdef CONFIG_IPV6_ROUTER_PREF
.accept_ra_rtr_pref = 1,
@@ -211,7 +212,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.accept_ra_mtu = 1,
.stable_secret = {
.initialized = false,
- }
+ },
+ .use_oif_addrs_only = 0,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -236,6 +238,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.max_addresses = IPV6_MAX_ADDRESSES,
.accept_ra_defrtr = 1,
.accept_ra_from_local = 0,
+ .accept_ra_min_hop_limit= 1,
.accept_ra_pinfo = 1,
#ifdef CONFIG_IPV6_ROUTER_PREF
.accept_ra_rtr_pref = 1,
@@ -253,6 +256,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.stable_secret = {
.initialized = false,
},
+ .use_oif_addrs_only = 0,
};
/* Check if a valid qdisc is available */
@@ -1358,15 +1362,96 @@ out:
return ret;
}
+static int __ipv6_dev_get_saddr(struct net *net,
+ struct ipv6_saddr_dst *dst,
+ struct inet6_dev *idev,
+ struct ipv6_saddr_score *scores,
+ int hiscore_idx)
+{
+ struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx];
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
+ int i;
+
+ /*
+ * - Tentative Address (RFC2462 section 5.4)
+ * - A tentative address is not considered
+ * "assigned to an interface" in the traditional
+ * sense, unless it is also flagged as optimistic.
+ * - Candidate Source Address (section 4)
+ * - In any case, anycast addresses, multicast
+ * addresses, and the unspecified address MUST
+ * NOT be included in a candidate set.
+ */
+ if ((score->ifa->flags & IFA_F_TENTATIVE) &&
+ (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
+ continue;
+
+ score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+
+ if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
+ score->addr_type & IPV6_ADDR_MULTICAST)) {
+ net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
+ idev->dev->name);
+ continue;
+ }
+
+ score->rule = -1;
+ bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
+
+ for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
+ int minihiscore, miniscore;
+
+ minihiscore = ipv6_get_saddr_eval(net, hiscore, dst, i);
+ miniscore = ipv6_get_saddr_eval(net, score, dst, i);
+
+ if (minihiscore > miniscore) {
+ if (i == IPV6_SADDR_RULE_SCOPE &&
+ score->scopedist > 0) {
+ /*
+ * special case:
+ * each remaining entry
+ * has too small (not enough)
+ * scope, because ifa entries
+ * are sorted by their scope
+ * values.
+ */
+ goto out;
+ }
+ break;
+ } else if (minihiscore < miniscore) {
+ if (hiscore->ifa)
+ in6_ifa_put(hiscore->ifa);
+
+ in6_ifa_hold(score->ifa);
+
+ swap(hiscore, score);
+ hiscore_idx = 1 - hiscore_idx;
+
+ /* restore our iterator */
+ score->ifa = hiscore->ifa;
+
+ break;
+ }
+ }
+ }
+out:
+ read_unlock_bh(&idev->lock);
+ return hiscore_idx;
+}
+
int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
const struct in6_addr *daddr, unsigned int prefs,
struct in6_addr *saddr)
{
- struct ipv6_saddr_score scores[2],
- *score = &scores[0], *hiscore = &scores[1];
+ struct ipv6_saddr_score scores[2], *hiscore;
struct ipv6_saddr_dst dst;
+ struct inet6_dev *idev;
struct net_device *dev;
int dst_type;
+ bool use_oif_addr = false;
+ int hiscore_idx = 0;
dst_type = __ipv6_addr_type(daddr);
dst.addr = daddr;
@@ -1375,105 +1460,50 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex);
dst.prefs = prefs;
- hiscore->rule = -1;
- hiscore->ifa = NULL;
+ scores[hiscore_idx].rule = -1;
+ scores[hiscore_idx].ifa = NULL;
rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
- struct inet6_dev *idev;
-
- /* Candidate Source Address (section 4)
- * - multicast and link-local destination address,
- * the set of candidate source address MUST only
- * include addresses assigned to interfaces
- * belonging to the same link as the outgoing
- * interface.
- * (- For site-local destination addresses, the
- * set of candidate source addresses MUST only
- * include addresses assigned to interfaces
- * belonging to the same site as the outgoing
- * interface.)
- */
- if (((dst_type & IPV6_ADDR_MULTICAST) ||
- dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) &&
- dst.ifindex && dev->ifindex != dst.ifindex)
- continue;
-
- idev = __in6_dev_get(dev);
- if (!idev)
- continue;
-
- read_lock_bh(&idev->lock);
- list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
- int i;
-
- /*
- * - Tentative Address (RFC2462 section 5.4)
- * - A tentative address is not considered
- * "assigned to an interface" in the traditional
- * sense, unless it is also flagged as optimistic.
- * - Candidate Source Address (section 4)
- * - In any case, anycast addresses, multicast
- * addresses, and the unspecified address MUST
- * NOT be included in a candidate set.
- */
- if ((score->ifa->flags & IFA_F_TENTATIVE) &&
- (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
- continue;
-
- score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+ /* Candidate Source Address (section 4)
+ * - multicast and link-local destination address,
+ * the set of candidate source address MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same link as the outgoing
+ * interface.
+ * (- For site-local destination addresses, the
+ * set of candidate source addresses MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same site as the outgoing
+ * interface.)
+ * - "It is RECOMMENDED that the candidate source addresses
+ * be the set of unicast addresses assigned to the
+ * interface that will be used to send to the destination
+ * (the 'outgoing' interface)." (RFC 6724)
+ */
+ if (dst_dev) {
+ idev = __in6_dev_get(dst_dev);
+ if ((dst_type & IPV6_ADDR_MULTICAST) ||
+ dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL ||
+ (idev && idev->cnf.use_oif_addrs_only)) {
+ use_oif_addr = true;
+ }
+ }
- if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
- score->addr_type & IPV6_ADDR_MULTICAST)) {
- net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
- dev->name);
+ if (use_oif_addr) {
+ if (idev)
+ hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
+ } else {
+ for_each_netdev_rcu(net, dev) {
+ idev = __in6_dev_get(dev);
+ if (!idev)
continue;
- }
-
- score->rule = -1;
- bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
-
- for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
- int minihiscore, miniscore;
-
- minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i);
- miniscore = ipv6_get_saddr_eval(net, score, &dst, i);
-
- if (minihiscore > miniscore) {
- if (i == IPV6_SADDR_RULE_SCOPE &&
- score->scopedist > 0) {
- /*
- * special case:
- * each remaining entry
- * has too small (not enough)
- * scope, because ifa entries
- * are sorted by their scope
- * values.
- */
- goto try_nextdev;
- }
- break;
- } else if (minihiscore < miniscore) {
- if (hiscore->ifa)
- in6_ifa_put(hiscore->ifa);
-
- in6_ifa_hold(score->ifa);
-
- swap(hiscore, score);
-
- /* restore our iterator */
- score->ifa = hiscore->ifa;
-
- break;
- }
- }
+ hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
}
-try_nextdev:
- read_unlock_bh(&idev->lock);
}
rcu_read_unlock();
+ hiscore = &scores[hiscore_idx];
if (!hiscore->ifa)
return -EADDRNOTAVAIL;
@@ -4560,6 +4590,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
+ array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit;
array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
#ifdef CONFIG_IPV6_ROUTER_PREF
array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
@@ -4586,6 +4617,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local;
array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu;
/* we omit DEVCONF_STABLE_SECRET for now */
+ array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
}
static inline size_t inet6_ifla6_size(void)
@@ -5456,6 +5488,13 @@ static struct addrconf_sysctl_table
.proc_handler = proc_dointvec,
},
{
+ .procname = "accept_ra_min_hop_limit",
+ .data = &ipv6_devconf.accept_ra_min_hop_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "accept_ra_pinfo",
.data = &ipv6_devconf.accept_ra_pinfo,
.maxlen = sizeof(int),
@@ -5585,6 +5624,14 @@ static struct addrconf_sysctl_table
.proc_handler = addrconf_sysctl_stable_secret,
},
{
+ .procname = "use_oif_addrs_only",
+ .data = &ipv6_devconf.use_oif_addrs_only,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+
+ },
+ {
/* sentinel */
}
},
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index ca09bf4..bfa941f 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -107,7 +107,16 @@ int inet6addr_notifier_call_chain(unsigned long val, void *v)
}
EXPORT_SYMBOL(inet6addr_notifier_call_chain);
-const struct ipv6_stub *ipv6_stub __read_mostly;
+static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
+ struct dst_entry **u2,
+ struct flowi6 *u3)
+{
+ return -EAFNOSUPPORT;
+}
+
+const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
+ .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+};
EXPORT_SYMBOL_GPL(ipv6_stub);
/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 7de52b6..44bb66b 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -197,6 +197,7 @@ lookup_protocol:
np->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
np->mc_loop = 1;
np->pmtudisc = IPV6_PMTUDISC_WANT;
+ np->autoflowlabel = ip6_default_np_autolabel(sock_net(sk));
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
/* Init the ipv4 part of the socket since we can have sockets
@@ -342,7 +343,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
*/
v4addr = LOOPBACK4_IPV6;
if (!(addr_type & IPV6_ADDR_MULTICAST)) {
- if (!(inet->freebind || inet->transparent) &&
+ if (!net->ipv6.sysctl.ip_nonlocal_bind &&
+ !(inet->freebind || inet->transparent) &&
!ipv6_chk_addr(net, &addr->sin6_addr,
dev, 0)) {
err = -EADDRNOTAVAIL;
@@ -679,8 +681,8 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
const struct ipv6_pinfo *np = inet6_sk(sk);
if (np->rxopt.all) {
- if ((opt->hop && (np->rxopt.bits.hopopts ||
- np->rxopt.bits.ohopopts)) ||
+ if (((opt->flags & IP6SKB_HOPBYHOP) &&
+ (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) ||
(ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) &&
np->rxopt.bits.rxflow) ||
(opt->srcrt && (np->rxopt.bits.srcrt ||
@@ -766,10 +768,10 @@ static int __net_init inet6_net_init(struct net *net)
net->ipv6.sysctl.bindv6only = 0;
net->ipv6.sysctl.icmpv6_time = 1*HZ;
net->ipv6.sysctl.flowlabel_consistency = 1;
- net->ipv6.sysctl.auto_flowlabels = 0;
+ net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS;
net->ipv6.sysctl.idgen_retries = 3;
net->ipv6.sysctl.idgen_delay = 1 * HZ;
- net->ipv6.sysctl.flowlabel_state_ranges = 1;
+ net->ipv6.sysctl.flowlabel_state_ranges = 0;
atomic_set(&net->ipv6.fib6_sernum, 1);
err = ipv6_init_mibs(net);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index b10a889..9aadd57 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -199,7 +199,7 @@ ipv4_connected:
NULL);
sk->sk_state = TCP_ESTABLISHED;
- ip6_set_txhash(sk);
+ sk_set_txhash(sk);
out:
fl6_sock_release(flowlabel);
return err;
@@ -568,8 +568,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
}
/* HbH is allowed only once */
- if (np->rxopt.bits.hopopts && opt->hop) {
- u8 *ptr = nh + opt->hop;
+ if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
+ u8 *ptr = nh + sizeof(struct ipv6hdr);
put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
}
@@ -630,8 +630,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
int hlim = ipv6_hdr(skb)->hop_limit;
put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
}
- if (np->rxopt.bits.ohopopts && opt->hop) {
- u8 *ptr = nh + opt->hop;
+ if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
+ u8 *ptr = nh + sizeof(struct ipv6hdr);
put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr);
}
if (np->rxopt.bits.odstopts && opt->dst0) {
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index a7bbbe4..ce203b0 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -632,7 +632,7 @@ int ipv6_parse_hopopts(struct sk_buff *skb)
return -1;
}
- opt->hop = sizeof(struct ipv6hdr);
+ opt->flags |= IP6SKB_HOPBYHOP;
if (ip6_parse_tlv(tlvprochopopt_lst, skb)) {
skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3;
opt = IP6CB(skb);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 713d743..6c2b213 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -329,7 +329,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
struct flowi6 fl2;
int err;
- err = ip6_dst_lookup(sk, &dst, fl6);
+ err = ip6_dst_lookup(net, sk, &dst, fl6);
if (err)
return ERR_PTR(err);
@@ -361,7 +361,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
if (err)
goto relookup_failed;
- err = ip6_dst_lookup(sk, &dst2, &fl2);
+ err = ip6_dst_lookup(net, sk, &dst2, &fl2);
if (err)
goto relookup_failed;
@@ -591,7 +591,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
- err = ip6_dst_lookup(sk, &dst, &fl6);
+ err = ip6_dst_lookup(net, sk, &dst, &fl6);
if (err)
goto out;
dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b4fd96d..6ac8dad 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -207,7 +207,6 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk2;
const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw = NULL;
- int twrefcnt = 0;
spin_lock(lock);
@@ -234,21 +233,17 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
WARN_ON(!sk_unhashed(sk));
__sk_nulls_add_node_rcu(sk, &head->chain);
if (tw) {
- twrefcnt = inet_twsk_unhash(tw);
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
}
spin_unlock(lock);
- if (twrefcnt)
- inet_twsk_put(tw);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
if (twp) {
*twp = tw;
} else if (tw) {
/* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw);
-
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
}
return 0;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 55d1986..5693b5e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -32,6 +32,7 @@
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
+#include <net/lwtunnel.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
@@ -177,6 +178,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
static void rt6_release(struct rt6_info *rt)
{
if (atomic_dec_and_test(&rt->rt6i_ref)) {
+ lwtstate_put(rt->rt6i_lwtstate);
rt6_free_pcpu(rt);
dst_free(&rt->dst);
}
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index a38d3ac..34f1218 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -728,7 +728,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
*/
ipv6h = ipv6_hdr(skb);
ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
- ip6_make_flowlabel(net, skb, fl6->flowlabel, false));
+ ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
ipv6h->hop_limit = tunnel->parms.hop_limit;
ipv6h->nexthdr = proto;
ipv6h->saddr = fl6->saddr;
@@ -1182,7 +1182,8 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev,
ip6_flow_hdr(ipv6h, 0,
ip6_make_flowlabel(dev_net(dev), skb,
- t->fl.u.ip6.flowlabel, false));
+ t->fl.u.ip6.flowlabel, true,
+ &t->fl.u.ip6));
ipv6h->hop_limit = t->parms.hop_limit;
ipv6h->nexthdr = NEXTHDR_GRE;
ipv6h->saddr = t->parms.laddr;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 57990c9..adba03a 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -45,6 +45,7 @@
#include <net/addrconf.h>
#include <net/xfrm.h>
#include <net/inet_ecn.h>
+#include <net/dst_metadata.h>
int ip6_rcv_finish(struct sock *sk, struct sk_buff *skb)
{
@@ -55,7 +56,7 @@ int ip6_rcv_finish(struct sock *sk, struct sk_buff *skb)
if (ipprot && ipprot->early_demux)
ipprot->early_demux(skb);
}
- if (!skb_dst(skb))
+ if (!skb_valid_dst(skb))
ip6_route_input(skb);
return dst_input(skb);
@@ -98,7 +99,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
* arrived via the sending interface (ethX), because of the
* nature of scoping architecture. --yoshfuji
*/
- IP6CB(skb)->iif = skb_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex;
+ IP6CB(skb)->iif = skb_valid_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex;
if (unlikely(!pskb_may_pull(skb, sizeof(*hdr))))
goto err;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d5f7716..26ea479 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -207,7 +207,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
hlimit = ip6_dst_hoplimit(dst);
ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
- np->autoflowlabel));
+ np->autoflowlabel, fl6));
hdr->payload_len = htons(seg_len);
hdr->nexthdr = proto;
@@ -881,10 +881,9 @@ out:
return dst;
}
-static int ip6_dst_lookup_tail(struct sock *sk,
+static int ip6_dst_lookup_tail(struct net *net, struct sock *sk,
struct dst_entry **dst, struct flowi6 *fl6)
{
- struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
struct neighbour *n;
struct rt6_info *rt;
@@ -994,10 +993,11 @@ out_err_release:
*
* It returns zero on success, or a standard errno code on error.
*/
-int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
+int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
+ struct flowi6 *fl6)
{
*dst = NULL;
- return ip6_dst_lookup_tail(sk, dst, fl6);
+ return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
@@ -1018,11 +1018,13 @@ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
struct dst_entry *dst = NULL;
int err;
- err = ip6_dst_lookup_tail(sk, &dst, fl6);
+ err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
if (err)
return ERR_PTR(err);
if (final_dst)
fl6->daddr = *final_dst;
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = dst->dev->ifindex;
return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
@@ -1050,7 +1052,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
dst = ip6_sk_dst_check(sk, dst, fl6);
- err = ip6_dst_lookup_tail(sk, &dst, fl6);
+ err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
if (err)
return ERR_PTR(err);
if (final_dst)
@@ -1647,7 +1649,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
ip6_flow_hdr(hdr, v6_cork->tclass,
ip6_make_flowlabel(net, skb, fl6->flowlabel,
- np->autoflowlabel));
+ np->autoflowlabel, fl6));
hdr->hop_limit = v6_cork->hop_limit;
hdr->nexthdr = proto;
hdr->saddr = fl6->saddr;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 2e67b66..b0ab420 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1095,7 +1095,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
skb_reset_network_header(skb);
ipv6h = ipv6_hdr(skb);
ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
- ip6_make_flowlabel(net, skb, fl6->flowlabel, false));
+ ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
ipv6h->hop_limit = t->parms.hop_limit;
ipv6h->nexthdr = proto;
ipv6h->saddr = fl6->saddr;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index c53331c..b305461 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1225,18 +1225,16 @@ static void ndisc_router_discovery(struct sk_buff *skb)
if (rt)
rt6_set_expires(rt, jiffies + (HZ * lifetime));
- if (ra_msg->icmph.icmp6_hop_limit) {
- /* Only set hop_limit on the interface if it is higher than
- * the current hop_limit.
- */
- if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) {
+ if (in6_dev->cnf.accept_ra_min_hop_limit < 256 &&
+ ra_msg->icmph.icmp6_hop_limit) {
+ if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
+ if (rt)
+ dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
+ ra_msg->icmph.icmp6_hop_limit);
} else {
- ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n");
+ ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
}
- if (rt)
- dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
- ra_msg->icmph.icmp6_hop_limit);
}
skip_defrtr:
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 3c35ced..4e21f80 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -305,7 +305,7 @@ static void trace_packet(const struct sk_buff *skb,
}
#endif
-static inline __pure struct ip6t_entry *
+static inline struct ip6t_entry *
ip6t_next_entry(const struct ip6t_entry *entry)
{
return (void *)entry + entry->next_offset;
@@ -324,12 +324,13 @@ ip6t_do_table(struct sk_buff *skb,
const char *indev, *outdev;
const void *table_base;
struct ip6t_entry *e, **jumpstack;
- unsigned int *stackptr, origptr, cpu;
+ unsigned int stackidx, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
unsigned int addend;
/* Initialization */
+ stackidx = 0;
indev = state->in ? state->in->name : nulldevname;
outdev = state->out ? state->out->name : nulldevname;
/* We handle fragments by dealing with the first fragment as
@@ -357,8 +358,16 @@ ip6t_do_table(struct sk_buff *skb,
cpu = smp_processor_id();
table_base = private->entries;
jumpstack = (struct ip6t_entry **)private->jumpstack[cpu];
- stackptr = per_cpu_ptr(private->stackptr, cpu);
- origptr = *stackptr;
+
+ /* Switch to alternate jumpstack if we're being invoked via TEE.
+ * TEE issues XT_CONTINUE verdict on original skb so we must not
+ * clobber the jumpstack.
+ *
+ * For recursion via REJECT or SYNPROXY the stack will be clobbered
+ * but it is no problem since absolute verdict is issued by these.
+ */
+ if (static_key_false(&xt_tee_enabled))
+ jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
e = get_entry(table_base, private->hook_entry[hook]);
@@ -406,20 +415,16 @@ ip6t_do_table(struct sk_buff *skb,
verdict = (unsigned int)(-v) - 1;
break;
}
- if (*stackptr <= origptr)
+ if (stackidx == 0)
e = get_entry(table_base,
private->underflow[hook]);
else
- e = ip6t_next_entry(jumpstack[--*stackptr]);
+ e = ip6t_next_entry(jumpstack[--stackidx]);
continue;
}
if (table_base + v != ip6t_next_entry(e) &&
!(e->ipv6.flags & IP6T_F_GOTO)) {
- if (*stackptr >= private->stacksize) {
- verdict = NF_DROP;
- break;
- }
- jumpstack[(*stackptr)++] = e;
+ jumpstack[stackidx++] = e;
}
e = get_entry(table_base, v);
@@ -437,8 +442,6 @@ ip6t_do_table(struct sk_buff *skb,
break;
} while (!acpar.hotdrop);
- *stackptr = origptr;
-
xt_write_recseq_end(addend);
local_bh_enable();
@@ -452,11 +455,15 @@ ip6t_do_table(struct sk_buff *skb,
}
/* Figures out from what hook each rule can be called: returns 0 if
- there are loops. Puts hook bitmask in comefrom. */
+ * there are loops. Puts hook bitmask in comefrom.
+ *
+ * Keeps track of largest call depth seen and stores it in newinfo->stacksize.
+ */
static int
-mark_source_chains(const struct xt_table_info *newinfo,
+mark_source_chains(struct xt_table_info *newinfo,
unsigned int valid_hooks, void *entry0)
{
+ unsigned int calldepth, max_calldepth = 0;
unsigned int hook;
/* No recursion; use packet counter to save back ptrs (reset
@@ -470,6 +477,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
/* Set initial back pointer. */
e->counters.pcnt = pos;
+ calldepth = 0;
for (;;) {
const struct xt_standard_target *t
@@ -531,6 +539,8 @@ mark_source_chains(const struct xt_table_info *newinfo,
(entry0 + pos + size);
e->counters.pcnt = pos;
pos += size;
+ if (calldepth > 0)
+ --calldepth;
} else {
int newpos = t->verdict;
@@ -544,6 +554,11 @@ mark_source_chains(const struct xt_table_info *newinfo,
newpos);
return 0;
}
+ if (entry0 + newpos != ip6t_next_entry(e) &&
+ !(e->ipv6.flags & IP6T_F_GOTO) &&
+ ++calldepth > max_calldepth)
+ max_calldepth = calldepth;
+
/* This a jump; chase it. */
duprintf("Jump rule %u -> %u\n",
pos, newpos);
@@ -560,6 +575,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
next:
duprintf("Finished chain %u\n", hook);
}
+ newinfo->stacksize = max_calldepth;
return 1;
}
@@ -839,9 +855,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
if (ret != 0)
return ret;
++i;
- if (strcmp(ip6t_get_target(iter)->u.user.name,
- XT_ERROR_TARGET) == 0)
- ++newinfo->stacksize;
}
if (i != repl->num_entries) {
@@ -1754,9 +1767,6 @@ translate_compat_table(struct net *net,
if (ret != 0)
break;
++i;
- if (strcmp(ip6t_get_target(iter1)->u.user.name,
- XT_ERROR_TARGET) == 0)
- ++newinfo->stacksize;
}
if (ret) {
/*
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 12331ef..567367a 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -35,14 +35,12 @@ MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>");
MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6");
MODULE_LICENSE("GPL");
-
static unsigned int
reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ip6t_reject_info *reject = par->targinfo;
struct net *net = dev_net((par->in != NULL) ? par->in : par->out);
- pr_debug("%s: medium point\n", __func__);
switch (reject->with) {
case IP6T_ICMP6_NO_ROUTE:
nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum);
@@ -65,9 +63,6 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
case IP6T_TCP_RESET:
nf_send_reset6(net, skb, par->hooknum);
break;
- default:
- net_info_ratelimited("case %u not handled yet\n", reject->with);
- break;
}
return NF_DROP;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index a45db0b..267fb8d 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -39,12 +39,9 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
if (skb->nfct)
zone = nf_ct_zone((struct nf_conn *)skb->nfct);
#endif
-
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (skb->nf_bridge &&
- skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+ if (nf_bridge_in_prerouting(skb))
return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
-#endif
+
if (hooknum == NF_INET_PRE_ROUTING)
return IP6_DEFRAG_CONNTRACK_IN + zone;
else
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ca4700c..fdbada156 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -295,7 +295,8 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* unspecified and mapped address have a v4 equivalent.
*/
v4addr = LOOPBACK4_IPV6;
- if (!(addr_type & IPV6_ADDR_MULTICAST)) {
+ if (!(addr_type & IPV6_ADDR_MULTICAST) &&
+ !sock_net(sk)->ipv6.sysctl.ip_nonlocal_bind) {
err = -EADDRNOTAVAIL;
if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr,
dev, 0)) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9de4d2b..c0fa61e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -58,6 +58,7 @@
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
+#include <net/lwtunnel.h>
#include <asm/uaccess.h>
@@ -544,6 +545,7 @@ static void rt6_probe_deferred(struct work_struct *w)
static void rt6_probe(struct rt6_info *rt)
{
+ struct __rt6_probe_work *work;
struct neighbour *neigh;
/*
* Okay, this does not seem to be appropriate
@@ -558,34 +560,33 @@ static void rt6_probe(struct rt6_info *rt)
rcu_read_lock_bh();
neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
if (neigh) {
- write_lock(&neigh->lock);
if (neigh->nud_state & NUD_VALID)
goto out;
- }
-
- if (!neigh ||
- time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
- struct __rt6_probe_work *work;
+ work = NULL;
+ write_lock(&neigh->lock);
+ if (!(neigh->nud_state & NUD_VALID) &&
+ time_after(jiffies,
+ neigh->updated +
+ rt->rt6i_idev->cnf.rtr_probe_interval)) {
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work)
+ __neigh_set_probe_once(neigh);
+ }
+ write_unlock(&neigh->lock);
+ } else {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ }
- if (neigh && work)
- __neigh_set_probe_once(neigh);
-
- if (neigh)
- write_unlock(&neigh->lock);
+ if (work) {
+ INIT_WORK(&work->work, rt6_probe_deferred);
+ work->target = rt->rt6i_gateway;
+ dev_hold(rt->dst.dev);
+ work->dev = rt->dst.dev;
+ schedule_work(&work->work);
+ }
- if (work) {
- INIT_WORK(&work->work, rt6_probe_deferred);
- work->target = rt->rt6i_gateway;
- dev_hold(rt->dst.dev);
- work->dev = rt->dst.dev;
- schedule_work(&work->work);
- }
- } else {
out:
- write_unlock(&neigh->lock);
- }
rcu_read_unlock_bh();
}
#else
@@ -1770,6 +1771,18 @@ int ip6_route_add(struct fib6_config *cfg)
rt->dst.output = ip6_output;
+ if (cfg->fc_encap) {
+ struct lwtunnel_state *lwtstate;
+
+ err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+ cfg->fc_encap, &lwtstate);
+ if (err)
+ goto out;
+ rt->rt6i_lwtstate = lwtstate_get(lwtstate);
+ if (lwtunnel_output_redirect(rt->rt6i_lwtstate))
+ rt->dst.output = lwtunnel_output6;
+ }
+
ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
rt->rt6i_dst.plen = cfg->fc_dst_len;
if (rt->rt6i_dst.plen == 128)
@@ -2149,6 +2162,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
#endif
rt->rt6i_prefsrc = ort->rt6i_prefsrc;
rt->rt6i_table = ort->rt6i_table;
+ rt->rt6i_lwtstate = lwtstate_get(ort->rt6i_lwtstate);
}
#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -2597,6 +2611,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
[RTA_PREF] = { .type = NLA_U8 },
+ [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [RTA_ENCAP] = { .type = NLA_NESTED },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2691,6 +2707,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_flags |= RTF_PREF(pref);
}
+ if (tb[RTA_ENCAP])
+ cfg->fc_encap = tb[RTA_ENCAP];
+
+ if (tb[RTA_ENCAP_TYPE])
+ cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
+
err = 0;
errout:
return err;
@@ -2723,6 +2745,10 @@ beginning:
r_cfg.fc_gateway = nla_get_in6_addr(nla);
r_cfg.fc_flags |= RTF_GATEWAY;
}
+ r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+ if (nla)
+ r_cfg.fc_encap_type = nla_get_u16(nla);
}
err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
if (err) {
@@ -2785,7 +2811,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
return ip6_route_add(&cfg);
}
-static inline size_t rt6_nlmsg_size(void)
+static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
{
return NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(16) /* RTA_SRC */
@@ -2799,7 +2825,8 @@ static inline size_t rt6_nlmsg_size(void)
+ RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
+ nla_total_size(sizeof(struct rta_cacheinfo))
+ nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
- + nla_total_size(1); /* RTA_PREF */
+ + nla_total_size(1) /* RTA_PREF */
+ + lwtunnel_get_encap_size(rt->rt6i_lwtstate);
}
static int rt6_fill_node(struct net *net,
@@ -2947,6 +2974,8 @@ static int rt6_fill_node(struct net *net,
if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
goto nla_put_failure;
+ lwtunnel_fill_encap(skb, rt->rt6i_lwtstate);
+
nlmsg_end(skb, nlh);
return 0;
@@ -3073,7 +3102,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
err = -ENOBUFS;
seq = info->nlh ? info->nlh->nlmsg_seq : 0;
- skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
+ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
if (!skb)
goto errout;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index ac35a28..94428fd 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -742,7 +742,7 @@ static int ipip_rcv(struct sk_buff *skb)
goto drop;
if (iptunnel_pull_header(skb, 0, tpi.proto))
goto drop;
- return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
+ return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
}
return 1;
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 4e705ad..45243bb 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -17,6 +17,9 @@
#include <net/inet_frag.h>
static int one = 1;
+static int auto_flowlabels_min;
+static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
+
static struct ctl_table ipv6_table_template[] = {
{
@@ -45,7 +48,9 @@ static struct ctl_table ipv6_table_template[] = {
.data = &init_net.ipv6.sysctl.auto_flowlabels,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &auto_flowlabels_min,
+ .extra2 = &auto_flowlabels_max
},
{
.procname = "fwmark_reflect",
@@ -75,6 +80,13 @@ static struct ctl_table ipv6_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "ip_nonlocal_bind",
+ .data = &init_net.ipv6.sysctl.ip_nonlocal_bind,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{ }
};
@@ -117,6 +129,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries;
ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay;
ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges;
+ ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind;
ipv6_route_table = ipv6_route_sysctl_init(net);
if (!ipv6_route_table)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7a6cea5..97d9314 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -276,7 +276,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (err)
goto late_failure;
- ip6_set_txhash(sk);
+ sk_set_txhash(sk);
if (!tp->write_seq && likely(!tp->repair))
tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
@@ -1090,7 +1090,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
newsk->sk_bound_dev_if = ireq->ir_iif;
- ip6_set_txhash(newsk);
+ sk_set_txhash(newsk);
/* Now IPv6 options...
@@ -1481,8 +1481,7 @@ do_time_wait:
ntohs(th->dest), tcp_v6_iif(skb));
if (sk2) {
struct inet_timewait_sock *tw = inet_twsk(sk);
- inet_twsk_deschedule(tw);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
sk = sk2;
tcp_v6_restore_cb(skb);
goto process;
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index 317c466..f7ba51e 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -44,6 +44,49 @@ static void ieee802154_del_iface_deprecated(struct wpan_phy *wpan_phy,
ieee802154_if_remove(sdata);
}
+#ifdef CONFIG_PM
+static int ieee802154_suspend(struct wpan_phy *wpan_phy)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+
+ if (!local->open_count)
+ goto suspend;
+
+ ieee802154_stop_queue(&local->hw);
+ synchronize_net();
+
+ /* stop hardware - this must stop RX */
+ ieee802154_stop_device(local);
+
+suspend:
+ local->suspended = true;
+ return 0;
+}
+
+static int ieee802154_resume(struct wpan_phy *wpan_phy)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+ int ret;
+
+ /* nothing to do if HW shouldn't run */
+ if (!local->open_count)
+ goto wake_up;
+
+ /* restart hardware */
+ ret = drv_start(local);
+ if (ret)
+ return ret;
+
+wake_up:
+ ieee802154_wake_queue(&local->hw);
+ local->suspended = false;
+ return 0;
+}
+#else
+#define ieee802154_suspend NULL
+#define ieee802154_resume NULL
+#endif
+
static int
ieee802154_add_iface(struct wpan_phy *phy, const char *name,
unsigned char name_assign_type,
@@ -145,13 +188,18 @@ static int
ieee802154_set_pan_id(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
__le16 pan_id)
{
+ int ret;
+
ASSERT_RTNL();
if (wpan_dev->pan_id == pan_id)
return 0;
- wpan_dev->pan_id = pan_id;
- return 0;
+ ret = mac802154_wpan_update_llsec(wpan_dev->netdev);
+ if (!ret)
+ wpan_dev->pan_id = pan_id;
+
+ return ret;
}
static int
@@ -227,6 +275,8 @@ ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
const struct cfg802154_ops mac802154_config_ops = {
.add_virtual_intf_deprecated = ieee802154_add_iface_deprecated,
.del_virtual_intf_deprecated = ieee802154_del_iface_deprecated,
+ .suspend = ieee802154_suspend,
+ .resume = ieee802154_resume,
.add_virtual_intf = ieee802154_add_iface,
.del_virtual_intf = ieee802154_del_iface,
.set_channel = ieee802154_set_channel,
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
index 34755d5..56ccffa 100644
--- a/net/mac802154/ieee802154_i.h
+++ b/net/mac802154/ieee802154_i.h
@@ -56,9 +56,13 @@ struct ieee802154_local {
struct hrtimer ifs_timer;
bool started;
+ bool suspended;
struct tasklet_struct tasklet;
struct sk_buff_head skb_queue;
+
+ struct sk_buff *tx_skb;
+ struct work_struct tx_work;
};
enum {
@@ -94,8 +98,6 @@ struct ieee802154_sub_if_data {
struct mac802154_llsec sec;
};
-#define MAC802154_CHAN_NONE 0xff /* No channel is assigned */
-
/* utility functions/constants */
extern const void *const mac802154_wpan_phy_privid; /* for wpan_phy privid */
@@ -125,6 +127,8 @@ ieee802154_sdata_running(struct ieee802154_sub_if_data *sdata)
extern struct ieee802154_mlme_ops mac802154_mlme_wpan;
+void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb);
+void ieee802154_xmit_worker(struct work_struct *work);
netdev_tx_t
ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev);
netdev_tx_t
@@ -167,6 +171,8 @@ void mac802154_get_table(struct net_device *dev,
struct ieee802154_llsec_table **t);
void mac802154_unlock_table(struct net_device *dev);
+int mac802154_wpan_update_llsec(struct net_device *dev);
+
/* interface handling */
int ieee802154_iface_init(void);
void ieee802154_iface_exit(void);
@@ -176,5 +182,6 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
unsigned char name_assign_type, enum nl802154_iftype type,
__le64 extended_addr);
void ieee802154_remove_interfaces(struct ieee802154_local *local);
+void ieee802154_stop_device(struct ieee802154_local *local);
#endif /* __IEEE802154_I_H */
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 8b69824..416de90 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -30,7 +30,7 @@
#include "ieee802154_i.h"
#include "driver-ops.h"
-static int mac802154_wpan_update_llsec(struct net_device *dev)
+int mac802154_wpan_update_llsec(struct net_device *dev)
{
struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
@@ -314,11 +314,8 @@ static int mac802154_slave_close(struct net_device *dev)
clear_bit(SDATA_STATE_RUNNING, &sdata->state);
- if (!local->open_count) {
- flush_workqueue(local->workqueue);
- hrtimer_cancel(&local->ifs_timer);
- drv_stop(local);
- }
+ if (!local->open_count)
+ ieee802154_stop_device(local);
return 0;
}
@@ -471,6 +468,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
enum nl802154_iftype type)
{
struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+ int ret;
u8 tmp;
/* set some type-dependent values */
@@ -505,6 +503,10 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
mutex_init(&sdata->sec_mtx);
mac802154_llsec_init(&sdata->sec);
+ ret = mac802154_wpan_update_llsec(sdata->dev);
+ if (ret < 0)
+ return ret;
+
break;
case NL802154_IFTYPE_MONITOR:
sdata->dev->destructor = free_netdev;
diff --git a/net/mac802154/main.c b/net/mac802154/main.c
index 356b346..9e55431 100644
--- a/net/mac802154/main.c
+++ b/net/mac802154/main.c
@@ -40,7 +40,7 @@ static void ieee802154_tasklet_handler(unsigned long data)
* netstack.
*/
skb->pkt_type = 0;
- ieee802154_rx(&local->hw, skb);
+ ieee802154_rx(local, skb);
break;
default:
WARN(1, "mac802154: Packet is of unknown type %d\n",
@@ -58,11 +58,9 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops)
struct ieee802154_local *local;
size_t priv_size;
- if (!ops || !(ops->xmit_async || ops->xmit_sync) || !ops->ed ||
- !ops->start || !ops->stop || !ops->set_channel) {
- pr_err("undefined IEEE802.15.4 device operations\n");
+ if (WARN_ON(!ops || !(ops->xmit_async || ops->xmit_sync) || !ops->ed ||
+ !ops->start || !ops->stop || !ops->set_channel))
return NULL;
- }
/* Ensure 32-byte alignment of our private data and hw private data.
* We use the wpan_phy priv data for both our ieee802154_local and for
@@ -107,6 +105,8 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops)
skb_queue_head_init(&local->skb_queue);
+ INIT_WORK(&local->tx_work, ieee802154_xmit_worker);
+
/* init supported flags with 802.15.4 default ranges */
phy->supported.max_minbe = 8;
phy->supported.min_maxbe = 3;
diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
index d93ad2d..d1c33c1 100644
--- a/net/mac802154/rx.c
+++ b/net/mac802154/rx.c
@@ -246,13 +246,15 @@ ieee802154_monitors_rx(struct ieee802154_local *local, struct sk_buff *skb)
}
}
-void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb)
+void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb)
{
- struct ieee802154_local *local = hw_to_local(hw);
u16 crc;
WARN_ON_ONCE(softirq_count() == 0);
+ if (local->suspended)
+ goto drop;
+
/* TODO: When a transceiver omits the checksum here, we
* add an own calculated one. This is currently an ugly
* solution because the monitor needs a crc here.
@@ -273,8 +275,7 @@ void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb)
crc = crc_ccitt(0, skb->data, skb->len);
if (crc) {
rcu_read_unlock();
- kfree_skb(skb);
- return;
+ goto drop;
}
}
/* remove crc */
@@ -283,8 +284,11 @@ void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb)
__ieee802154_rx_handle_packet(local, skb);
rcu_read_unlock();
+
+ return;
+drop:
+ kfree_skb(skb);
}
-EXPORT_SYMBOL(ieee802154_rx);
void
ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi)
diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
index c62e956..7ed4391 100644
--- a/net/mac802154/tx.c
+++ b/net/mac802154/tx.c
@@ -30,23 +30,11 @@
#include "ieee802154_i.h"
#include "driver-ops.h"
-/* IEEE 802.15.4 transceivers can sleep during the xmit session, so process
- * packets through the workqueue.
- */
-struct ieee802154_xmit_cb {
- struct sk_buff *skb;
- struct work_struct work;
- struct ieee802154_local *local;
-};
-
-static struct ieee802154_xmit_cb ieee802154_xmit_cb;
-
-static void ieee802154_xmit_worker(struct work_struct *work)
+void ieee802154_xmit_worker(struct work_struct *work)
{
- struct ieee802154_xmit_cb *cb =
- container_of(work, struct ieee802154_xmit_cb, work);
- struct ieee802154_local *local = cb->local;
- struct sk_buff *skb = cb->skb;
+ struct ieee802154_local *local =
+ container_of(work, struct ieee802154_local, tx_work);
+ struct sk_buff *skb = local->tx_skb;
struct net_device *dev = skb->dev;
int res;
@@ -106,11 +94,8 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb)
dev->stats.tx_packets++;
dev->stats.tx_bytes += skb->len;
} else {
- INIT_WORK(&ieee802154_xmit_cb.work, ieee802154_xmit_worker);
- ieee802154_xmit_cb.skb = skb;
- ieee802154_xmit_cb.local = local;
-
- queue_work(local->workqueue, &ieee802154_xmit_cb.work);
+ local->tx_skb = skb;
+ queue_work(local->workqueue, &local->tx_work);
}
return NETDEV_TX_OK;
diff --git a/net/mac802154/util.c b/net/mac802154/util.c
index 583435f..f9fd095 100644
--- a/net/mac802154/util.c
+++ b/net/mac802154/util.c
@@ -14,6 +14,7 @@
*/
#include "ieee802154_i.h"
+#include "driver-ops.h"
/* privid for wpan_phys to determine whether they belong to us or not */
const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid;
@@ -92,3 +93,10 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
dev_consume_skb_any(skb);
}
EXPORT_SYMBOL(ieee802154_xmit_complete);
+
+void ieee802154_stop_device(struct ieee802154_local *local)
+{
+ flush_workqueue(local->workqueue);
+ hrtimer_cancel(&local->ifs_timer);
+ drv_stop(local);
+}
diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig
index 17bde79..5c467ef 100644
--- a/net/mpls/Kconfig
+++ b/net/mpls/Kconfig
@@ -24,7 +24,13 @@ config NET_MPLS_GSO
config MPLS_ROUTING
tristate "MPLS: routing support"
- help
+ ---help---
Add support for forwarding of mpls packets.
+config MPLS_IPTUNNEL
+ tristate "MPLS: IP over MPLS tunnel support"
+ depends on LWTUNNEL && MPLS_ROUTING
+ ---help---
+ mpls ip tunnel support.
+
endif # MPLS
diff --git a/net/mpls/Makefile b/net/mpls/Makefile
index 65bbe68..9ca9236 100644
--- a/net/mpls/Makefile
+++ b/net/mpls/Makefile
@@ -3,5 +3,6 @@
#
obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o
obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o
+obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o
mpls_router-y := af_mpls.o
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 1f93a59..8c5707d 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -15,6 +15,10 @@
#include <net/ip_fib.h>
#include <net/netevent.h>
#include <net/netns/generic.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#endif
#include "internal.h"
#define LABEL_NOT_SPECIFIED (1<<20)
@@ -23,11 +27,23 @@
/* This maximum ha length copied from the definition of struct neighbour */
#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
+enum mpls_payload_type {
+ MPT_UNSPEC, /* IPv4 or IPv6 */
+ MPT_IPV4 = 4,
+ MPT_IPV6 = 6,
+
+ /* Other types not implemented:
+ * - Pseudo-wire with or without control word (RFC4385)
+ * - GAL (RFC5586)
+ */
+};
+
struct mpls_route { /* next hop label forwarding entry */
struct net_device __rcu *rt_dev;
struct rcu_head rt_rcu;
u32 rt_label[MAX_NEW_LABELS];
u8 rt_protocol; /* routing protocol that set this entry */
+ u8 rt_payload_type;
u8 rt_labels;
u8 rt_via_alen;
u8 rt_via_table;
@@ -58,10 +74,11 @@ static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
return rcu_dereference_rtnl(dev->mpls_ptr);
}
-static bool mpls_output_possible(const struct net_device *dev)
+bool mpls_output_possible(const struct net_device *dev)
{
return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
}
+EXPORT_SYMBOL_GPL(mpls_output_possible);
static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
{
@@ -69,13 +86,14 @@ static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
return rt->rt_labels * sizeof(struct mpls_shim_hdr);
}
-static unsigned int mpls_dev_mtu(const struct net_device *dev)
+unsigned int mpls_dev_mtu(const struct net_device *dev)
{
/* The amount of data the layer 2 frame can hold */
return dev->mtu;
}
+EXPORT_SYMBOL_GPL(mpls_dev_mtu);
-static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
+bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
if (skb->len <= mtu)
return false;
@@ -85,20 +103,13 @@ static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
return true;
}
+EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
struct mpls_entry_decoded dec)
{
- /* RFC4385 and RFC5586 encode other packets in mpls such that
- * they don't conflict with the ip version number, making
- * decoding by examining the ip version correct in everything
- * except for the strangest cases.
- *
- * The strange cases if we choose to support them will require
- * manual configuration.
- */
- struct iphdr *hdr4;
- bool success = true;
+ enum mpls_payload_type payload_type;
+ bool success = false;
/* The IPv4 code below accesses through the IPv4 header
* checksum, which is 12 bytes into the packet.
@@ -113,23 +124,32 @@ static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
if (!pskb_may_pull(skb, 12))
return false;
- /* Use ip_hdr to find the ip protocol version */
- hdr4 = ip_hdr(skb);
- if (hdr4->version == 4) {
+ payload_type = rt->rt_payload_type;
+ if (payload_type == MPT_UNSPEC)
+ payload_type = ip_hdr(skb)->version;
+
+ switch (payload_type) {
+ case MPT_IPV4: {
+ struct iphdr *hdr4 = ip_hdr(skb);
skb->protocol = htons(ETH_P_IP);
csum_replace2(&hdr4->check,
htons(hdr4->ttl << 8),
htons(dec.ttl << 8));
hdr4->ttl = dec.ttl;
+ success = true;
+ break;
}
- else if (hdr4->version == 6) {
+ case MPT_IPV6: {
struct ipv6hdr *hdr6 = ipv6_hdr(skb);
skb->protocol = htons(ETH_P_IPV6);
hdr6->hop_limit = dec.ttl;
+ success = true;
+ break;
+ }
+ case MPT_UNSPEC:
+ break;
}
- else
- /* version 0 and version 1 are used by pseudo wires */
- success = false;
+
return success;
}
@@ -248,16 +268,17 @@ static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = {
};
struct mpls_route_config {
- u32 rc_protocol;
- u32 rc_ifindex;
- u16 rc_via_table;
- u16 rc_via_alen;
- u8 rc_via[MAX_VIA_ALEN];
- u32 rc_label;
- u32 rc_output_labels;
- u32 rc_output_label[MAX_NEW_LABELS];
- u32 rc_nlflags;
- struct nl_info rc_nlinfo;
+ u32 rc_protocol;
+ u32 rc_ifindex;
+ u16 rc_via_table;
+ u16 rc_via_alen;
+ u8 rc_via[MAX_VIA_ALEN];
+ u32 rc_label;
+ u32 rc_output_labels;
+ u32 rc_output_label[MAX_NEW_LABELS];
+ u32 rc_nlflags;
+ enum mpls_payload_type rc_payload_type;
+ struct nl_info rc_nlinfo;
};
static struct mpls_route *mpls_rt_alloc(size_t alen)
@@ -286,7 +307,7 @@ static void mpls_notify_route(struct net *net, unsigned index,
struct mpls_route *rt = new ? new : old;
unsigned nlm_flags = (old && new) ? NLM_F_REPLACE : 0;
/* Ignore reserved labels for now */
- if (rt && (index >= 16))
+ if (rt && (index >= MPLS_LABEL_FIRST_UNRESERVED))
rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags);
}
@@ -320,13 +341,96 @@ static unsigned find_free_label(struct net *net)
platform_label = rtnl_dereference(net->mpls.platform_label);
platform_labels = net->mpls.platform_labels;
- for (index = 16; index < platform_labels; index++) {
+ for (index = MPLS_LABEL_FIRST_UNRESERVED; index < platform_labels;
+ index++) {
if (!rtnl_dereference(platform_label[index]))
return index;
}
return LABEL_NOT_SPECIFIED;
}
+#if IS_ENABLED(CONFIG_INET)
+static struct net_device *inet_fib_lookup_dev(struct net *net, void *addr)
+{
+ struct net_device *dev;
+ struct rtable *rt;
+ struct in_addr daddr;
+
+ memcpy(&daddr, addr, sizeof(struct in_addr));
+ rt = ip_route_output(net, daddr.s_addr, 0, 0, 0);
+ if (IS_ERR(rt))
+ return ERR_CAST(rt);
+
+ dev = rt->dst.dev;
+ dev_hold(dev);
+
+ ip_rt_put(rt);
+
+ return dev;
+}
+#else
+static struct net_device *inet_fib_lookup_dev(struct net *net, void *addr)
+{
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr)
+{
+ struct net_device *dev;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+ int err;
+
+ if (!ipv6_stub)
+ return ERR_PTR(-EAFNOSUPPORT);
+
+ memset(&fl6, 0, sizeof(fl6));
+ memcpy(&fl6.daddr, addr, sizeof(struct in6_addr));
+ err = ipv6_stub->ipv6_dst_lookup(net, NULL, &dst, &fl6);
+ if (err)
+ return ERR_PTR(err);
+
+ dev = dst->dev;
+ dev_hold(dev);
+ dst_release(dst);
+
+ return dev;
+}
+#else
+static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr)
+{
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+#endif
+
+static struct net_device *find_outdev(struct net *net,
+ struct mpls_route_config *cfg)
+{
+ struct net_device *dev = NULL;
+
+ if (!cfg->rc_ifindex) {
+ switch (cfg->rc_via_table) {
+ case NEIGH_ARP_TABLE:
+ dev = inet_fib_lookup_dev(net, cfg->rc_via);
+ break;
+ case NEIGH_ND_TABLE:
+ dev = inet6_fib_lookup_dev(net, cfg->rc_via);
+ break;
+ case NEIGH_LINK_TABLE:
+ break;
+ }
+ } else {
+ dev = dev_get_by_index(net, cfg->rc_ifindex);
+ }
+
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ return dev;
+}
+
static int mpls_route_add(struct mpls_route_config *cfg)
{
struct mpls_route __rcu **platform_label;
@@ -345,8 +449,8 @@ static int mpls_route_add(struct mpls_route_config *cfg)
index = find_free_label(net);
}
- /* The first 16 labels are reserved, and may not be set */
- if (index < 16)
+ /* Reserved labels may not be set */
+ if (index < MPLS_LABEL_FIRST_UNRESERVED)
goto errout;
/* The full 20 bit range may not be supported. */
@@ -357,10 +461,12 @@ static int mpls_route_add(struct mpls_route_config *cfg)
if (cfg->rc_output_labels > MAX_NEW_LABELS)
goto errout;
- err = -ENODEV;
- dev = dev_get_by_index(net, cfg->rc_ifindex);
- if (!dev)
+ dev = find_outdev(net, cfg);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ dev = NULL;
goto errout;
+ }
/* Ensure this is a supported device */
err = -EINVAL;
@@ -401,6 +507,7 @@ static int mpls_route_add(struct mpls_route_config *cfg)
rt->rt_label[i] = cfg->rc_output_label[i];
rt->rt_protocol = cfg->rc_protocol;
RCU_INIT_POINTER(rt->rt_dev, dev);
+ rt->rt_payload_type = cfg->rc_payload_type;
rt->rt_via_table = cfg->rc_via_table;
memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen);
@@ -423,8 +530,8 @@ static int mpls_route_del(struct mpls_route_config *cfg)
index = cfg->rc_label;
- /* The first 16 labels are reserved, and may not be removed */
- if (index < 16)
+ /* Reserved labels may not be removed */
+ if (index < MPLS_LABEL_FIRST_UNRESERVED)
goto errout;
/* The full 20 bit range may not be supported */
@@ -626,6 +733,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype,
return 0;
}
+EXPORT_SYMBOL_GPL(nla_put_labels);
int nla_get_labels(const struct nlattr *nla,
u32 max_labels, u32 *labels, u32 label[])
@@ -671,6 +779,7 @@ int nla_get_labels(const struct nlattr *nla,
*labels = nla_labels;
return 0;
}
+EXPORT_SYMBOL_GPL(nla_get_labels);
static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh,
struct mpls_route_config *cfg)
@@ -740,8 +849,8 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh,
&cfg->rc_label))
goto errout;
- /* The first 16 labels are reserved, and may not be set */
- if (cfg->rc_label < 16)
+ /* Reserved labels may not be set */
+ if (cfg->rc_label < MPLS_LABEL_FIRST_UNRESERVED)
goto errout;
break;
@@ -866,8 +975,8 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
ASSERT_RTNL();
index = cb->args[0];
- if (index < 16)
- index = 16;
+ if (index < MPLS_LABEL_FIRST_UNRESERVED)
+ index = MPLS_LABEL_FIRST_UNRESERVED;
platform_label = rtnl_dereference(net->mpls.platform_label);
platform_labels = net->mpls.platform_labels;
@@ -953,6 +1062,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
goto nort0;
RCU_INIT_POINTER(rt0->rt_dev, lo);
rt0->rt_protocol = RTPROT_KERNEL;
+ rt0->rt_payload_type = MPT_IPV4;
rt0->rt_via_table = NEIGH_LINK_TABLE;
memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len);
}
@@ -963,6 +1073,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
goto nort2;
RCU_INIT_POINTER(rt2->rt_dev, lo);
rt2->rt_protocol = RTPROT_KERNEL;
+ rt2->rt_payload_type = MPT_IPV6;
rt2->rt_via_table = NEIGH_LINK_TABLE;
memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len);
}
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 8cabeb5..2681a4b 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -50,7 +50,12 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
return result;
}
-int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]);
-int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]);
+int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels,
+ const u32 label[]);
+int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels,
+ u32 label[]);
+bool mpls_output_possible(const struct net_device *dev);
+unsigned int mpls_dev_mtu(const struct net_device *dev);
+bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);
#endif /* MPLS_INTERNAL_H */
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
new file mode 100644
index 0000000..276f8c9
--- /dev/null
+++ b/net/mpls/mpls_iptunnel.c
@@ -0,0 +1,233 @@
+/*
+ * mpls tunnels An implementation mpls tunnels using the light weight tunnel
+ * infrastructure
+ *
+ * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/mpls.h>
+#include <linux/vmalloc.h>
+#include <net/ip.h>
+#include <net/dst.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/mpls_iptunnel.h>
+#include <linux/mpls_iptunnel.h>
+#include "internal.h"
+
+static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = {
+ [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 },
+};
+
+static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en)
+{
+ /* The size of the layer 2.5 labels to be added for this route */
+ return en->labels * sizeof(struct mpls_shim_hdr);
+}
+
+int mpls_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct mpls_iptunnel_encap *tun_encap_info;
+ struct mpls_shim_hdr *hdr;
+ struct net_device *out_dev;
+ unsigned int hh_len;
+ unsigned int new_header_size;
+ unsigned int mtu;
+ struct dst_entry *dst = skb_dst(skb);
+ struct rtable *rt = NULL;
+ struct rt6_info *rt6 = NULL;
+ struct lwtunnel_state *lwtstate = NULL;
+ int err = 0;
+ bool bos;
+ int i;
+ unsigned int ttl;
+
+ /* Obtain the ttl */
+ if (skb->protocol == htons(ETH_P_IP)) {
+ ttl = ip_hdr(skb)->ttl;
+ rt = (struct rtable *)dst;
+ lwtstate = rt->rt_lwtstate;
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ ttl = ipv6_hdr(skb)->hop_limit;
+ rt6 = (struct rt6_info *)dst;
+ lwtstate = rt6->rt6i_lwtstate;
+ } else {
+ goto drop;
+ }
+
+ skb_orphan(skb);
+
+ /* Find the output device */
+ out_dev = dst->dev;
+ if (!mpls_output_possible(out_dev) ||
+ !lwtstate || skb_warn_if_lro(skb))
+ goto drop;
+
+ skb_forward_csum(skb);
+
+ tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+ /* Verify the destination can hold the packet */
+ new_header_size = mpls_encap_size(tun_encap_info);
+ mtu = mpls_dev_mtu(out_dev);
+ if (mpls_pkt_too_big(skb, mtu - new_header_size))
+ goto drop;
+
+ hh_len = LL_RESERVED_SPACE(out_dev);
+ if (!out_dev->header_ops)
+ hh_len = 0;
+
+ /* Ensure there is enough space for the headers in the skb */
+ if (skb_cow(skb, hh_len + new_header_size))
+ goto drop;
+
+ skb_push(skb, new_header_size);
+ skb_reset_network_header(skb);
+
+ skb->dev = out_dev;
+ skb->protocol = htons(ETH_P_MPLS_UC);
+
+ /* Push the new labels */
+ hdr = mpls_hdr(skb);
+ bos = true;
+ for (i = tun_encap_info->labels - 1; i >= 0; i--) {
+ hdr[i] = mpls_entry_encode(tun_encap_info->label[i],
+ ttl, 0, bos);
+ bos = false;
+ }
+
+ if (rt)
+ err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
+ skb);
+ else if (rt6)
+ err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway,
+ skb);
+ if (err)
+ net_dbg_ratelimited("%s: packet transmission failed: %d\n",
+ __func__, err);
+
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
+ struct lwtunnel_state **ts)
+{
+ struct mpls_iptunnel_encap *tun_encap_info;
+ struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
+ struct lwtunnel_state *newts;
+ int tun_encap_info_len;
+ int ret;
+
+ ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla,
+ mpls_iptunnel_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[MPLS_IPTUNNEL_DST])
+ return -EINVAL;
+
+ tun_encap_info_len = sizeof(*tun_encap_info);
+
+ newts = lwtunnel_state_alloc(tun_encap_info_len);
+ if (!newts)
+ return -ENOMEM;
+
+ newts->len = tun_encap_info_len;
+ tun_encap_info = mpls_lwtunnel_encap(newts);
+ ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
+ &tun_encap_info->labels, tun_encap_info->label);
+ if (ret)
+ goto errout;
+ newts->type = LWTUNNEL_ENCAP_MPLS;
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+ *ts = newts;
+
+ return 0;
+
+errout:
+ kfree(newts);
+ *ts = NULL;
+
+ return ret;
+}
+
+static int mpls_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct mpls_iptunnel_encap *tun_encap_info;
+
+ tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+ if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels,
+ tun_encap_info->label))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ struct mpls_iptunnel_encap *tun_encap_info;
+
+ tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+ return nla_total_size(tun_encap_info->labels * 4);
+}
+
+static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a);
+ struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b);
+ int l;
+
+ if (a_hdr->labels != b_hdr->labels)
+ return 1;
+
+ for (l = 0; l < MAX_NEW_LABELS; l++)
+ if (a_hdr->label[l] != b_hdr->label[l])
+ return 1;
+ return 0;
+}
+
+static const struct lwtunnel_encap_ops mpls_iptun_ops = {
+ .build_state = mpls_build_state,
+ .output = mpls_output,
+ .fill_encap = mpls_fill_encap_info,
+ .get_encap_size = mpls_encap_nlsize,
+ .cmp_encap = mpls_encap_cmp,
+};
+
+static int __init mpls_iptunnel_init(void)
+{
+ return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
+}
+module_init(mpls_iptunnel_init);
+
+static void __exit mpls_iptunnel_exit(void)
+{
+ lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
+}
+module_exit(mpls_iptunnel_exit);
+
+MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels");
+MODULE_LICENSE("GPL v2");
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index a0e5497..2a5a070 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -34,6 +34,9 @@ EXPORT_SYMBOL(nf_afinfo);
const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_ipv6_ops);
+DEFINE_PER_CPU(bool, nf_skb_duplicated);
+EXPORT_SYMBOL_GPL(nf_skb_duplicated);
+
int nf_register_afinfo(const struct nf_afinfo *afinfo)
{
mutex_lock(&afinfo_mutex);
@@ -52,9 +55,6 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo)
}
EXPORT_SYMBOL_GPL(nf_unregister_afinfo);
-struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
-EXPORT_SYMBOL(nf_hooks);
-
#ifdef HAVE_JUMP_LABEL
struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
EXPORT_SYMBOL(nf_hooks_needed);
@@ -62,63 +62,166 @@ EXPORT_SYMBOL(nf_hooks_needed);
static DEFINE_MUTEX(nf_hook_mutex);
-int nf_register_hook(struct nf_hook_ops *reg)
+static struct list_head *nf_find_hook_list(struct net *net,
+ const struct nf_hook_ops *reg)
{
- struct list_head *nf_hook_list;
- struct nf_hook_ops *elem;
+ struct list_head *hook_list = NULL;
- mutex_lock(&nf_hook_mutex);
- switch (reg->pf) {
- case NFPROTO_NETDEV:
+ if (reg->pf != NFPROTO_NETDEV)
+ hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
+ else if (reg->hooknum == NF_NETDEV_INGRESS) {
#ifdef CONFIG_NETFILTER_INGRESS
- if (reg->hooknum == NF_NETDEV_INGRESS) {
- BUG_ON(reg->dev == NULL);
- nf_hook_list = &reg->dev->nf_hooks_ingress;
- net_inc_ingress_queue();
- break;
- }
+ if (reg->dev && dev_net(reg->dev) == net)
+ hook_list = &reg->dev->nf_hooks_ingress;
#endif
- /* Fall through. */
- default:
- nf_hook_list = &nf_hooks[reg->pf][reg->hooknum];
- break;
+ }
+ return hook_list;
+}
+
+struct nf_hook_entry {
+ const struct nf_hook_ops *orig_ops;
+ struct nf_hook_ops ops;
+};
+
+int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
+{
+ struct list_head *hook_list;
+ struct nf_hook_entry *entry;
+ struct nf_hook_ops *elem;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->orig_ops = reg;
+ entry->ops = *reg;
+
+ hook_list = nf_find_hook_list(net, reg);
+ if (!hook_list) {
+ kfree(entry);
+ return -ENOENT;
}
- list_for_each_entry(elem, nf_hook_list, list) {
+ mutex_lock(&nf_hook_mutex);
+ list_for_each_entry(elem, hook_list, list) {
if (reg->priority < elem->priority)
break;
}
- list_add_rcu(&reg->list, elem->list.prev);
+ list_add_rcu(&entry->ops.list, elem->list.prev);
mutex_unlock(&nf_hook_mutex);
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ net_inc_ingress_queue();
+#endif
#ifdef HAVE_JUMP_LABEL
static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
return 0;
}
-EXPORT_SYMBOL(nf_register_hook);
+EXPORT_SYMBOL(nf_register_net_hook);
-void nf_unregister_hook(struct nf_hook_ops *reg)
+void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
+ struct list_head *hook_list;
+ struct nf_hook_entry *entry;
+ struct nf_hook_ops *elem;
+
+ hook_list = nf_find_hook_list(net, reg);
+ if (!hook_list)
+ return;
+
mutex_lock(&nf_hook_mutex);
- list_del_rcu(&reg->list);
- mutex_unlock(&nf_hook_mutex);
- switch (reg->pf) {
- case NFPROTO_NETDEV:
-#ifdef CONFIG_NETFILTER_INGRESS
- if (reg->hooknum == NF_NETDEV_INGRESS) {
- net_dec_ingress_queue();
+ list_for_each_entry(elem, hook_list, list) {
+ entry = container_of(elem, struct nf_hook_entry, ops);
+ if (entry->orig_ops == reg) {
+ list_del_rcu(&entry->ops.list);
break;
}
- break;
-#endif
- default:
- break;
}
+ mutex_unlock(&nf_hook_mutex);
+ if (&elem->list == hook_list) {
+ WARN(1, "nf_unregister_net_hook: hook not found!\n");
+ return;
+ }
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ net_dec_ingress_queue();
+#endif
#ifdef HAVE_JUMP_LABEL
static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
synchronize_net();
- nf_queue_nf_hook_drop(reg);
+ nf_queue_nf_hook_drop(net, &entry->ops);
+ kfree(entry);
+}
+EXPORT_SYMBOL(nf_unregister_net_hook);
+
+int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+ unsigned int n)
+{
+ unsigned int i;
+ int err = 0;
+
+ for (i = 0; i < n; i++) {
+ err = nf_register_net_hook(net, &reg[i]);
+ if (err)
+ goto err;
+ }
+ return err;
+
+err:
+ if (i > 0)
+ nf_unregister_net_hooks(net, reg, i);
+ return err;
+}
+EXPORT_SYMBOL(nf_register_net_hooks);
+
+void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+ unsigned int n)
+{
+ while (n-- > 0)
+ nf_unregister_net_hook(net, &reg[n]);
+}
+EXPORT_SYMBOL(nf_unregister_net_hooks);
+
+static LIST_HEAD(nf_hook_list);
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+ struct net *net, *last;
+ int ret;
+
+ rtnl_lock();
+ for_each_net(net) {
+ ret = nf_register_net_hook(net, reg);
+ if (ret && ret != -ENOENT)
+ goto rollback;
+ }
+ list_add_tail(&reg->list, &nf_hook_list);
+ rtnl_unlock();
+
+ return 0;
+rollback:
+ last = net;
+ for_each_net(net) {
+ if (net == last)
+ break;
+ nf_unregister_net_hook(net, reg);
+ }
+ rtnl_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(nf_register_hook);
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_del(&reg->list);
+ for_each_net(net)
+ nf_unregister_net_hook(net, reg);
+ rtnl_unlock();
}
EXPORT_SYMBOL(nf_unregister_hook);
@@ -295,8 +398,46 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
EXPORT_SYMBOL(nf_nat_decode_session_hook);
#endif
+static int nf_register_hook_list(struct net *net)
+{
+ struct nf_hook_ops *elem;
+ int ret;
+
+ rtnl_lock();
+ list_for_each_entry(elem, &nf_hook_list, list) {
+ ret = nf_register_net_hook(net, elem);
+ if (ret && ret != -ENOENT)
+ goto out_undo;
+ }
+ rtnl_unlock();
+ return 0;
+
+out_undo:
+ list_for_each_entry_continue_reverse(elem, &nf_hook_list, list)
+ nf_unregister_net_hook(net, elem);
+ rtnl_unlock();
+ return ret;
+}
+
+static void nf_unregister_hook_list(struct net *net)
+{
+ struct nf_hook_ops *elem;
+
+ rtnl_lock();
+ list_for_each_entry(elem, &nf_hook_list, list)
+ nf_unregister_net_hook(net, elem);
+ rtnl_unlock();
+}
+
static int __net_init netfilter_net_init(struct net *net)
{
+ int i, h, ret;
+
+ for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) {
+ for (h = 0; h < NF_MAX_HOOKS; h++)
+ INIT_LIST_HEAD(&net->nf.hooks[i][h]);
+ }
+
#ifdef CONFIG_PROC_FS
net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
net->proc_net);
@@ -307,11 +448,16 @@ static int __net_init netfilter_net_init(struct net *net)
return -ENOMEM;
}
#endif
- return 0;
+ ret = nf_register_hook_list(net);
+ if (ret)
+ remove_proc_entry("netfilter", net->proc_net);
+
+ return ret;
}
static void __net_exit netfilter_net_exit(struct net *net)
{
+ nf_unregister_hook_list(net);
remove_proc_entry("netfilter", net->proc_net);
}
@@ -322,12 +468,7 @@ static struct pernet_operations netfilter_net_ops = {
int __init netfilter_init(void)
{
- int i, h, ret;
-
- for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) {
- for (h = 0; h < NF_MAX_HOOKS; h++)
- INIT_LIST_HEAD(&nf_hooks[i][h]);
- }
+ int ret;
ret = register_pernet_subsys(&netfilter_net_ops);
if (ret < 0)
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index 7e81416..a2ff7d7 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -137,7 +137,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
{
- if (scheduler && scheduler->module)
+ if (scheduler)
module_put(scheduler->module);
}
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index b45da90..6719773 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -42,6 +42,8 @@ static const char *const sctp_conntrack_names[] = {
"SHUTDOWN_SENT",
"SHUTDOWN_RECD",
"SHUTDOWN_ACK_SENT",
+ "HEARTBEAT_SENT",
+ "HEARTBEAT_ACKED",
};
#define SECS * HZ
@@ -57,6 +59,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = {
[SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000,
[SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000,
[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS,
+ [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS,
+ [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS,
};
#define sNO SCTP_CONNTRACK_NONE
@@ -67,6 +71,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = {
#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT
#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
+#define sHS SCTP_CONNTRACK_HEARTBEAT_SENT
+#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED
#define sIV SCTP_CONNTRACK_MAX
/*
@@ -88,6 +94,10 @@ SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
to that of the SHUTDOWN chunk.
CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
the SHUTDOWN chunk. Connection is closed.
+HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow.
+HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to
+ that of the HEARTBEAT chunk. Secondary connection is
+ established.
*/
/* TODO
@@ -97,36 +107,40 @@ CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
- Check the error type in the reply dir before transitioning from
cookie echoed to closed.
- Sec 5.2.4 of RFC 2960
- - Multi Homing support.
+ - Full Multi Homing support.
*/
/* SCTP conntrack state transitions */
-static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
+static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
-/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
-/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
-/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
-/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
-/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/
-/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
-/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */
-/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
+/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA},
+/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},
+/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA},
+/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/
+/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */
+/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA},
+/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
+/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}
},
{
/* REPLY */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
-/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
-/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
-/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
-/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
-/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
-/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */
-/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
-/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
+/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */
+/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},
+/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL},
+/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA},
+/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA},
+/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */
+/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA},
+/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
+/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA}
}
};
@@ -278,9 +292,16 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n");
i = 8;
break;
+ case SCTP_CID_HEARTBEAT:
+ pr_debug("SCTP_CID_HEARTBEAT");
+ i = 9;
+ break;
+ case SCTP_CID_HEARTBEAT_ACK:
+ pr_debug("SCTP_CID_HEARTBEAT_ACK");
+ i = 10;
+ break;
default:
- /* Other chunks like DATA, SACK, HEARTBEAT and
- its ACK do not cause a change in state */
+ /* Other chunks like DATA or SACK do not change the state */
pr_debug("Unknown chunk type, Will stay in %s\n",
sctp_conntrack_names[cur_state]);
return cur_state;
@@ -329,6 +350,8 @@ static int sctp_packet(struct nf_conn *ct,
!test_bit(SCTP_CID_COOKIE_ECHO, map) &&
!test_bit(SCTP_CID_ABORT, map) &&
!test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
+ !test_bit(SCTP_CID_HEARTBEAT, map) &&
+ !test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
sh->vtag != ct->proto.sctp.vtag[dir]) {
pr_debug("Verification tag check failed\n");
goto out;
@@ -357,6 +380,16 @@ static int sctp_packet(struct nf_conn *ct,
/* Sec 8.5.1 (D) */
if (sh->vtag != ct->proto.sctp.vtag[dir])
goto out_unlock;
+ } else if (sch->type == SCTP_CID_HEARTBEAT ||
+ sch->type == SCTP_CID_HEARTBEAT_ACK) {
+ if (ct->proto.sctp.vtag[dir] == 0) {
+ pr_debug("Setting vtag %x for dir %d\n",
+ sh->vtag, dir);
+ ct->proto.sctp.vtag[dir] = sh->vtag;
+ } else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
+ pr_debug("Verification tag check failed\n");
+ goto out_unlock;
+ }
}
old_state = ct->proto.sctp.state;
@@ -466,6 +499,10 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
/* Sec 8.5.1 (A) */
return false;
}
+ } else if (sch->type == SCTP_CID_HEARTBEAT) {
+ pr_debug("Setting vtag %x for secondary conntrack\n",
+ sh->vtag);
+ ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
}
/* If it is a shutdown ack OOTB packet, we expect a return
shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
@@ -610,6 +647,8 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
[CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 },
[CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 },
[CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 },
};
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
@@ -658,6 +697,18 @@ static struct ctl_table sctp_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "nf_conntrack_sctp_timeout_heartbeat_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_sctp_timeout_heartbeat_acked",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
{ }
};
@@ -730,6 +781,8 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
+ pn->ctl_table[7].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_SENT];
+ pn->ctl_table[8].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_ACKED];
#endif
return 0;
}
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 3992106..0655225 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -19,7 +19,7 @@ unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb,
/* nf_queue.c */
int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem,
struct nf_hook_state *state, unsigned int queuenum);
-void nf_queue_nf_hook_drop(struct nf_hook_ops *ops);
+void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops);
int __init netfilter_queue_init(void);
/* nf_log.c */
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 8a8b2ab..96777f9 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -105,21 +105,15 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
}
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
-void nf_queue_nf_hook_drop(struct nf_hook_ops *ops)
+void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
{
const struct nf_queue_handler *qh;
- struct net *net;
- rtnl_lock();
rcu_read_lock();
qh = rcu_dereference(queue_handler);
- if (qh) {
- for_each_net(net) {
- qh->nf_hook_drop(net, ops);
- }
- }
+ if (qh)
+ qh->nf_hook_drop(net, ops);
rcu_read_unlock();
- rtnl_unlock();
}
/*
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index cfe6368..4a41eb9 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -130,20 +130,24 @@ static void nft_trans_destroy(struct nft_trans *trans)
int nft_register_basechain(struct nft_base_chain *basechain,
unsigned int hook_nops)
{
+ struct net *net = read_pnet(&basechain->pnet);
+
if (basechain->flags & NFT_BASECHAIN_DISABLED)
return 0;
- return nf_register_hooks(basechain->ops, hook_nops);
+ return nf_register_net_hooks(net, basechain->ops, hook_nops);
}
EXPORT_SYMBOL_GPL(nft_register_basechain);
void nft_unregister_basechain(struct nft_base_chain *basechain,
unsigned int hook_nops)
{
+ struct net *net = read_pnet(&basechain->pnet);
+
if (basechain->flags & NFT_BASECHAIN_DISABLED)
return;
- nf_unregister_hooks(basechain->ops, hook_nops);
+ nf_unregister_net_hooks(net, basechain->ops, hook_nops);
}
EXPORT_SYMBOL_GPL(nft_unregister_basechain);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index f77bad4..05d0b03 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -114,7 +114,6 @@ unsigned int
nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
{
const struct nft_chain *chain = ops->priv, *basechain = chain;
- const struct net *chain_net = read_pnet(&nft_base_chain(basechain)->pnet);
const struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
const struct nft_rule *rule;
const struct nft_expr *expr, *last;
@@ -125,10 +124,6 @@ nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
int rulenum;
unsigned int gencursor = nft_genmask_cur(net);
- /* Ignore chains that are not for the current network namespace */
- if (!net_eq(net, chain_net))
- return NF_ACCEPT;
-
do_chain:
rulenum = 0;
rule = list_entry(&chain->rules, struct nft_rule, list);
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 52561e1..cb2f13e 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -166,11 +166,13 @@ void nft_meta_get_eval(const struct nft_expr *expr,
goto err;
*dest = out->group;
break;
+#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
if (skb->sk == NULL || !sk_fullsock(skb->sk))
goto err;
*dest = skb->sk->sk_classid;
break;
+#endif
default:
WARN_ON(1);
goto err;
@@ -246,7 +248,9 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
case NFT_META_CPU:
case NFT_META_IIFGROUP:
case NFT_META_OIFGROUP:
+#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
+#endif
len = sizeof(u32);
break;
case NFT_META_IIFNAME:
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index d324fe7..9b42b5e 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -67,9 +67,6 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
[NFPROTO_IPV6] = "ip6",
};
-/* Allow this many total (re)entries. */
-static const unsigned int xt_jumpstack_multiplier = 2;
-
/* Registration hooks for targets. */
int xt_register_target(struct xt_target *target)
{
@@ -688,8 +685,6 @@ void xt_free_table_info(struct xt_table_info *info)
kvfree(info->jumpstack);
}
- free_percpu(info->stackptr);
-
kvfree(info);
}
EXPORT_SYMBOL(xt_free_table_info);
@@ -732,15 +727,14 @@ EXPORT_SYMBOL_GPL(xt_compat_unlock);
DEFINE_PER_CPU(seqcount_t, xt_recseq);
EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
+struct static_key xt_tee_enabled __read_mostly;
+EXPORT_SYMBOL_GPL(xt_tee_enabled);
+
static int xt_jumpstack_alloc(struct xt_table_info *i)
{
unsigned int size;
int cpu;
- i->stackptr = alloc_percpu(unsigned int);
- if (i->stackptr == NULL)
- return -ENOMEM;
-
size = sizeof(void **) * nr_cpu_ids;
if (size > PAGE_SIZE)
i->jumpstack = vzalloc(size);
@@ -749,8 +743,21 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
if (i->jumpstack == NULL)
return -ENOMEM;
- i->stacksize *= xt_jumpstack_multiplier;
- size = sizeof(void *) * i->stacksize;
+ /* ruleset without jumps -- no stack needed */
+ if (i->stacksize == 0)
+ return 0;
+
+ /* Jumpstack needs to be able to record two full callchains, one
+ * from the first rule set traversal, plus one table reentrancy
+ * via -j TEE without clobbering the callchain that brought us to
+ * TEE target.
+ *
+ * This is done by allocating two jumpstacks per cpu, on reentry
+ * the upper half of the stack is used.
+ *
+ * see the jumpstack setup in ipt_do_table() for more details.
+ */
+ size = sizeof(void *) * i->stacksize * 2u;
for_each_possible_cpu(cpu) {
if (size > PAGE_SIZE)
i->jumpstack[cpu] = vmalloc_node(size,
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index a747eb4..c5d6556 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -37,7 +37,6 @@ struct xt_tee_priv {
};
static const union nf_inet_addr tee_zero_address;
-static DEFINE_PER_CPU(bool, tee_active);
static struct net *pick_net(struct sk_buff *skb)
{
@@ -88,7 +87,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
const struct xt_tee_tginfo *info = par->targinfo;
struct iphdr *iph;
- if (__this_cpu_read(tee_active))
+ if (__this_cpu_read(nf_skb_duplicated))
return XT_CONTINUE;
/*
* Copy the skb, and route the copy. Will later return %XT_CONTINUE for
@@ -125,9 +124,9 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
ip_send_check(iph);
if (tee_tg_route4(skb, info)) {
- __this_cpu_write(tee_active, true);
+ __this_cpu_write(nf_skb_duplicated, true);
ip_local_out(skb);
- __this_cpu_write(tee_active, false);
+ __this_cpu_write(nf_skb_duplicated, false);
} else {
kfree_skb(skb);
}
@@ -170,7 +169,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tee_tginfo *info = par->targinfo;
- if (__this_cpu_read(tee_active))
+ if (__this_cpu_read(nf_skb_duplicated))
return XT_CONTINUE;
skb = pskb_copy(skb, GFP_ATOMIC);
if (skb == NULL)
@@ -188,9 +187,9 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
--iph->hop_limit;
}
if (tee_tg_route6(skb, info)) {
- __this_cpu_write(tee_active, true);
+ __this_cpu_write(nf_skb_duplicated, true);
ip6_local_out(skb);
- __this_cpu_write(tee_active, false);
+ __this_cpu_write(nf_skb_duplicated, false);
} else {
kfree_skb(skb);
}
@@ -252,6 +251,7 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
} else
info->priv = NULL;
+ static_key_slow_inc(&xt_tee_enabled);
return 0;
}
@@ -263,6 +263,7 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par)
unregister_netdevice_notifier(&info->priv->notifier);
kfree(info->priv);
}
+ static_key_slow_dec(&xt_tee_enabled);
}
static struct xt_target tee_tg_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index cca96ce..d0c96c5 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -272,8 +272,7 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
hp->source, lport ? lport : hp->dest,
skb->dev, NFT_LOOKUP_LISTENER);
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk));
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
}
}
@@ -437,8 +436,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
tgi->lport ? tgi->lport : hp->dest,
skb->dev, NFT_LOOKUP_LISTENER);
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk));
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
}
}
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 1584040..422dc05 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -34,7 +34,7 @@ config OPENVSWITCH
config OPENVSWITCH_GRE
tristate "Open vSwitch GRE tunneling support"
depends on OPENVSWITCH
- depends on NET_IPGRE_DEMUX
+ depends on NET_IPGRE
default OPENVSWITCH
---help---
If you say Y here, then the Open vSwitch will be able create GRE
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 91b9478..6e1701d 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -15,6 +15,6 @@ openvswitch-y := \
vport-internal_dev.o \
vport-netdev.o
+obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o
obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
-obj-$(CONFIG_OPENVSWITCH_VXLAN) += vport-vxlan.o
obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index ee34f47..14da52d 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -619,7 +619,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key, const struct nlattr *attr,
const struct nlattr *actions, int actions_len)
{
- struct ovs_tunnel_info info;
+ struct ip_tunnel_info info;
struct dp_upcall_info upcall;
const struct nlattr *a;
int rem;
@@ -677,9 +677,12 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
a = nla_next(a, &rem)) {
+ u32 probability;
+
switch (nla_type(a)) {
case OVS_SAMPLE_ATTR_PROBABILITY:
- if (prandom_u32() >= nla_get_u32(a))
+ probability = nla_get_u32(a);
+ if (!probability || prandom_u32() > probability)
return 0;
break;
@@ -741,7 +744,15 @@ static int execute_set_action(struct sk_buff *skb,
{
/* Only tunnel set execution is supported without a mask. */
if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) {
- OVS_CB(skb)->egress_tun_info = nla_data(a);
+ struct ovs_tunnel_info *tun = nla_data(a);
+
+ skb_dst_drop(skb);
+ dst_hold((struct dst_entry *)tun->tun_dst);
+ skb_dst_set(skb, (struct dst_entry *)tun->tun_dst);
+
+ /* FIXME: Remove when all vports have been converted */
+ OVS_CB(skb)->egress_tun_info = &tun->tun_dst->u.tun_info;
+
return 0;
}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index ff8c4a4..ffe984f 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -176,7 +176,7 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
const char *ovs_dp_name(const struct datapath *dp)
{
struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
- return vport->ops->get_name(vport);
+ return ovs_vport_name(vport);
}
static int get_dpifindex(const struct datapath *dp)
@@ -188,7 +188,7 @@ static int get_dpifindex(const struct datapath *dp)
local = ovs_vport_rcu(dp, OVSP_LOCAL);
if (local)
- ifindex = netdev_vport_priv(local)->dev->ifindex;
+ ifindex = local->dev->ifindex;
else
ifindex = 0;
@@ -1018,7 +1018,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
}
ovs_unlock();
- ovs_nla_free_flow_actions(old_acts);
+ ovs_nla_free_flow_actions_rcu(old_acts);
ovs_flow_free(new_flow, false);
}
@@ -1030,7 +1030,7 @@ err_unlock_ovs:
ovs_unlock();
kfree_skb(reply);
err_kfree_acts:
- kfree(acts);
+ ovs_nla_free_flow_actions(acts);
err_kfree_flow:
ovs_flow_free(new_flow, false);
error:
@@ -1157,7 +1157,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
if (reply)
ovs_notify(&dp_flow_genl_family, reply, info);
if (old_acts)
- ovs_nla_free_flow_actions(old_acts);
+ ovs_nla_free_flow_actions_rcu(old_acts);
return 0;
@@ -1165,7 +1165,7 @@ err_unlock_ovs:
ovs_unlock();
kfree_skb(reply);
err_kfree_acts:
- kfree(acts);
+ ovs_nla_free_flow_actions(acts);
error:
return error;
}
@@ -1800,7 +1800,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
nla_put_string(skb, OVS_VPORT_ATTR_NAME,
- vport->ops->get_name(vport)))
+ ovs_vport_name(vport)))
goto nla_put_failure;
ovs_vport_get_stats(vport, &vport_stats);
@@ -2219,13 +2219,10 @@ static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
struct vport *vport;
hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) {
- struct netdev_vport *netdev_vport;
-
if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL)
continue;
- netdev_vport = netdev_vport_priv(vport);
- if (dev_net(netdev_vport->dev) == dnet)
+ if (dev_net(vport->dev) == dnet)
list_add(&vport->detach_list, head);
}
}
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index cd691e9..6b28c5c 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -25,6 +25,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/u64_stats_sync.h>
+#include <net/ip_tunnels.h>
#include "flow.h"
#include "flow_table.h"
@@ -98,7 +99,7 @@ struct datapath {
* when a packet is received by OVS.
*/
struct ovs_skb_cb {
- struct ovs_tunnel_info *egress_tun_info;
+ struct ip_tunnel_info *egress_tun_info;
struct vport *input_vport;
};
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
@@ -114,7 +115,7 @@ struct ovs_skb_cb {
* @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY.
*/
struct dp_upcall_info {
- const struct ovs_tunnel_info *egress_tun_info;
+ const struct ip_tunnel_info *egress_tun_info;
const struct nlattr *userdata;
const struct nlattr *actions;
int actions_len;
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
index 2c631fe..a7a80a6 100644
--- a/net/openvswitch/dp_notify.c
+++ b/net/openvswitch/dp_notify.c
@@ -58,13 +58,10 @@ void ovs_dp_notify_wq(struct work_struct *work)
struct hlist_node *n;
hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) {
- struct netdev_vport *netdev_vport;
-
if (vport->ops->type != OVS_VPORT_TYPE_NETDEV)
continue;
- netdev_vport = netdev_vport_priv(vport);
- if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH))
+ if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH))
dp_detach_port_notify(vport);
}
}
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index bc7b0ab..8db22ef 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -682,12 +682,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
return key_extract(skb, key);
}
-int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
+int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb, struct sw_flow_key *key)
{
/* Extract metadata from packet. */
if (tun_info) {
- memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key));
+ memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key));
if (tun_info->options) {
BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) *
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index a076e44..b62cdb3 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -32,31 +32,11 @@
#include <linux/time.h>
#include <linux/flex_array.h>
#include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
+#include <net/dst_metadata.h>
struct sk_buff;
-/* Used to memset ovs_key_ipv4_tunnel padding. */
-#define OVS_TUNNEL_KEY_SIZE \
- (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \
- FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst))
-
-struct ovs_key_ipv4_tunnel {
- __be64 tun_id;
- __be32 ipv4_src;
- __be32 ipv4_dst;
- __be16 tun_flags;
- u8 ipv4_tos;
- u8 ipv4_ttl;
- __be16 tp_src;
- __be16 tp_dst;
-} __packed __aligned(4); /* Minimize padding. */
-
-struct ovs_tunnel_info {
- struct ovs_key_ipv4_tunnel tunnel;
- const void *options;
- u8 options_len;
-};
-
/* Store options at the end of the array if they are less than the
* maximum size. This allows us to get the benefits of variable length
* matching for small options.
@@ -66,54 +46,9 @@ struct ovs_tunnel_info {
#define TUN_METADATA_OPTS(flow_key, opt_len) \
((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len)))
-static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
- __be32 saddr, __be32 daddr,
- u8 tos, u8 ttl,
- __be16 tp_src,
- __be16 tp_dst,
- __be64 tun_id,
- __be16 tun_flags,
- const void *opts,
- u8 opts_len)
-{
- tun_info->tunnel.tun_id = tun_id;
- tun_info->tunnel.ipv4_src = saddr;
- tun_info->tunnel.ipv4_dst = daddr;
- tun_info->tunnel.ipv4_tos = tos;
- tun_info->tunnel.ipv4_ttl = ttl;
- tun_info->tunnel.tun_flags = tun_flags;
-
- /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
- * the upper tunnel are used.
- * E.g: GRE over IPSEC, the tp_src and tp_port are zero.
- */
- tun_info->tunnel.tp_src = tp_src;
- tun_info->tunnel.tp_dst = tp_dst;
-
- /* Clear struct padding. */
- if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE)
- memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE,
- 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE);
-
- tun_info->options = opts;
- tun_info->options_len = opts_len;
-}
-
-static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
- const struct iphdr *iph,
- __be16 tp_src,
- __be16 tp_dst,
- __be64 tun_id,
- __be16 tun_flags,
- const void *opts,
- u8 opts_len)
-{
- __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr,
- iph->tos, iph->ttl,
- tp_src, tp_dst,
- tun_id, tun_flags,
- opts, opts_len);
-}
+struct ovs_tunnel_info {
+ struct metadata_dst *tun_dst;
+};
#define OVS_SW_FLOW_KEY_METADATA_SIZE \
(offsetof(struct sw_flow_key, recirc_id) + \
@@ -122,7 +57,7 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
struct sw_flow_key {
u8 tun_opts[255];
u8 tun_opts_len;
- struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */
+ struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */
struct {
u32 priority; /* Packet QoS priority. */
u32 skb_mark; /* SKB mark. */
@@ -273,7 +208,7 @@ void ovs_flow_stats_clear(struct sw_flow *);
u64 ovs_flow_used_time(unsigned long flow_jiffies);
int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key);
-int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
+int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb,
struct sw_flow_key *key);
/* Extract key from packet coming from userspace. */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 624e41c..a6eb77a 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -47,9 +47,9 @@
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/mpls.h>
+#include <net/vxlan.h>
#include "flow_netlink.h"
-#include "vport-vxlan.h"
struct ovs_len_tbl {
int len;
@@ -475,7 +475,7 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *a,
{
struct nlattr *tb[OVS_VXLAN_EXT_MAX+1];
unsigned long opt_key_offset;
- struct ovs_vxlan_opts opts;
+ struct vxlan_metadata opts;
int err;
BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
@@ -626,7 +626,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
static int vxlan_opt_to_nlattr(struct sk_buff *skb,
const void *tun_opts, int swkey_tun_opts_len)
{
- const struct ovs_vxlan_opts *opts = tun_opts;
+ const struct vxlan_metadata *opts = tun_opts;
struct nlattr *nla;
nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS);
@@ -641,7 +641,7 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb,
}
static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
- const struct ovs_key_ipv4_tunnel *output,
+ const struct ip_tunnel_key *output,
const void *tun_opts, int swkey_tun_opts_len)
{
if (output->tun_flags & TUNNEL_KEY &&
@@ -689,7 +689,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
}
static int ipv4_tun_to_nlattr(struct sk_buff *skb,
- const struct ovs_key_ipv4_tunnel *output,
+ const struct ip_tunnel_key *output,
const void *tun_opts, int swkey_tun_opts_len)
{
struct nlattr *nla;
@@ -708,9 +708,9 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
}
int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb,
- const struct ovs_tunnel_info *egress_tun_info)
+ const struct ip_tunnel_info *egress_tun_info)
{
- return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel,
+ return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key,
egress_tun_info->options,
egress_tun_info->options_len);
}
@@ -1548,11 +1548,48 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log)
return sfa;
}
+static void ovs_nla_free_set_action(const struct nlattr *a)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ struct ovs_tunnel_info *ovs_tun;
+
+ switch (nla_type(ovs_key)) {
+ case OVS_KEY_ATTR_TUNNEL_INFO:
+ ovs_tun = nla_data(ovs_key);
+ dst_release((struct dst_entry *)ovs_tun->tun_dst);
+ break;
+ }
+}
+
+void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+{
+ const struct nlattr *a;
+ int rem;
+
+ if (!sf_acts)
+ return;
+
+ nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) {
+ switch (nla_type(a)) {
+ case OVS_ACTION_ATTR_SET:
+ ovs_nla_free_set_action(a);
+ break;
+ }
+ }
+
+ kfree(sf_acts);
+}
+
+static void __ovs_nla_free_flow_actions(struct rcu_head *head)
+{
+ ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu));
+}
+
/* Schedules 'sf_acts' to be freed after the next RCU grace period.
* The caller must hold rcu_read_lock for this to be sensible. */
-void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts)
{
- kfree_rcu(sf_acts, rcu);
+ call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions);
}
static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
@@ -1746,7 +1783,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
{
struct sw_flow_match match;
struct sw_flow_key key;
- struct ovs_tunnel_info *tun_info;
+ struct metadata_dst *tun_dst;
+ struct ip_tunnel_info *tun_info;
+ struct ovs_tunnel_info *ovs_tun;
struct nlattr *a;
int err = 0, start, opts_type;
@@ -1771,13 +1810,23 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
if (start < 0)
return start;
+ tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL);
+ if (!tun_dst)
+ return -ENOMEM;
+
a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
- sizeof(*tun_info) + key.tun_opts_len, log);
- if (IS_ERR(a))
+ sizeof(*ovs_tun), log);
+ if (IS_ERR(a)) {
+ dst_release((struct dst_entry *)tun_dst);
return PTR_ERR(a);
+ }
+
+ ovs_tun = nla_data(a);
+ ovs_tun->tun_dst = tun_dst;
- tun_info = nla_data(a);
- tun_info->tunnel = key.tun_key;
+ tun_info = &tun_dst->u.tun_info;
+ tun_info->mode = IP_TUNNEL_INFO_TX;
+ tun_info->key = key.tun_key;
tun_info->options_len = key.tun_opts_len;
if (tun_info->options_len) {
@@ -2177,7 +2226,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type,
key->eth.tci, log);
if (err)
- kfree(*sfa);
+ ovs_nla_free_flow_actions(*sfa);
return err;
}
@@ -2227,13 +2276,14 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
switch (key_type) {
case OVS_KEY_ATTR_TUNNEL_INFO: {
- struct ovs_tunnel_info *tun_info = nla_data(ovs_key);
+ struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key);
+ struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info;
start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
if (!start)
return -EMSGSIZE;
- err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel,
+ err = ipv4_tun_to_nlattr(skb, &tun_info->key,
tun_info->options_len ?
tun_info->options : NULL,
tun_info->options_len);
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 5c3d75b..acd0744 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -55,7 +55,7 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb);
int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key,
const struct nlattr *mask, bool log);
int ovs_nla_put_egress_tunnel_key(struct sk_buff *,
- const struct ovs_tunnel_info *);
+ const struct ip_tunnel_info *);
bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log);
int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid,
@@ -69,5 +69,6 @@ int ovs_nla_put_actions(const struct nlattr *attr,
int len, struct sk_buff *skb);
void ovs_nla_free_flow_actions(struct sw_flow_actions *);
+void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *);
#endif /* flow_netlink.h */
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 6552394..3a9d1dde76 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -18,6 +18,7 @@
#include "flow.h"
#include "datapath.h"
+#include "flow_netlink.h"
#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
@@ -143,7 +144,8 @@ static void flow_free(struct sw_flow *flow)
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
- kfree((struct sw_flow_actions __force *)flow->sf_acts);
+ if (flow->sf_acts)
+ ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
for_each_node(node)
if (flow->stats[node])
kmem_cache_free(flow_stats_cache,
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 208c576..1da3a14 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -77,7 +77,7 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
struct vport *vport = gs->rcv_data;
struct genevehdr *geneveh = geneve_hdr(skb);
int opts_len;
- struct ovs_tunnel_info tun_info;
+ struct ip_tunnel_info tun_info;
__be64 key;
__be16 flags;
@@ -90,10 +90,9 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
key = vni_to_tunnel_id(geneveh->vni);
- ovs_flow_tun_info_init(&tun_info, ip_hdr(skb),
- udp_hdr(skb)->source, udp_hdr(skb)->dest,
- key, flags,
- geneveh->options, opts_len);
+ ip_tunnel_info_init(&tun_info, ip_hdr(skb),
+ udp_hdr(skb)->source, udp_hdr(skb)->dest,
+ key, flags, geneveh->options, opts_len);
ovs_vport_receive(vport, skb, &tun_info);
}
@@ -165,8 +164,8 @@ error:
static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
{
- const struct ovs_key_ipv4_tunnel *tun_key;
- struct ovs_tunnel_info *tun_info;
+ const struct ip_tunnel_key *tun_key;
+ struct ip_tunnel_info *tun_info;
struct net *net = ovs_dp_get_net(vport->dp);
struct geneve_port *geneve_port = geneve_vport(vport);
__be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport;
@@ -183,7 +182,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
goto error;
}
- tun_key = &tun_info->tunnel;
+ tun_key = &tun_info->key;
rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
@@ -225,7 +224,7 @@ static const char *geneve_get_name(const struct vport *vport)
}
static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
+ struct ip_tunnel_info *egress_tun_info)
{
struct geneve_port *geneve_port = geneve_vport(vport);
struct net *net = ovs_dp_get_net(vport->dp);
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index f17ac96..871801d 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -45,239 +45,47 @@
#include "datapath.h"
#include "vport.h"
+#include "vport-netdev.h"
static struct vport_ops ovs_gre_vport_ops;
-/* Returns the least-significant 32 bits of a __be64. */
-static __be32 be64_get_low32(__be64 x)
+static struct vport *gre_tnl_create(const struct vport_parms *parms)
{
-#ifdef __BIG_ENDIAN
- return (__force __be32)x;
-#else
- return (__force __be32)((__force u64)x >> 32);
-#endif
-}
-
-static __be16 filter_tnl_flags(__be16 flags)
-{
- return flags & (TUNNEL_CSUM | TUNNEL_KEY);
-}
-
-static struct sk_buff *__build_header(struct sk_buff *skb,
- int tunnel_hlen)
-{
- struct tnl_ptk_info tpi;
- const struct ovs_key_ipv4_tunnel *tun_key;
-
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
-
- skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
- if (IS_ERR(skb))
- return skb;
-
- tpi.flags = filter_tnl_flags(tun_key->tun_flags);
- tpi.proto = htons(ETH_P_TEB);
- tpi.key = be64_get_low32(tun_key->tun_id);
- tpi.seq = 0;
- gre_build_header(skb, &tpi, tunnel_hlen);
-
- return skb;
-}
-
-static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
-{
-#ifdef __BIG_ENDIAN
- return (__force __be64)((__force u64)seq << 32 | (__force u32)key);
-#else
- return (__force __be64)((__force u64)key << 32 | (__force u32)seq);
-#endif
-}
-
-/* Called with rcu_read_lock and BH disabled. */
-static int gre_rcv(struct sk_buff *skb,
- const struct tnl_ptk_info *tpi)
-{
- struct ovs_tunnel_info tun_info;
- struct ovs_net *ovs_net;
- struct vport *vport;
- __be64 key;
-
- ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
- vport = rcu_dereference(ovs_net->vport_net.gre_vport);
- if (unlikely(!vport))
- return PACKET_REJECT;
-
- key = key_to_tunnel_id(tpi->key, tpi->seq);
- ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key,
- filter_tnl_flags(tpi->flags), NULL, 0);
-
- ovs_vport_receive(vport, skb, &tun_info);
- return PACKET_RCVD;
-}
-
-/* Called with rcu_read_lock and BH disabled. */
-static int gre_err(struct sk_buff *skb, u32 info,
- const struct tnl_ptk_info *tpi)
-{
- struct ovs_net *ovs_net;
+ struct net *net = ovs_dp_get_net(parms->dp);
+ struct net_device *dev;
struct vport *vport;
- ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
- vport = rcu_dereference(ovs_net->vport_net.gre_vport);
-
- if (unlikely(!vport))
- return PACKET_REJECT;
- else
- return PACKET_RCVD;
-}
-
-static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
-{
- struct net *net = ovs_dp_get_net(vport->dp);
- const struct ovs_key_ipv4_tunnel *tun_key;
- struct flowi4 fl;
- struct rtable *rt;
- int min_headroom;
- int tunnel_hlen;
- __be16 df;
- int err;
-
- if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
- err = -EINVAL;
- goto err_free_skb;
- }
-
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
- rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto err_free_skb;
- }
-
- tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags);
-
- min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
- + tunnel_hlen + sizeof(struct iphdr)
- + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
- if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
- int head_delta = SKB_DATA_ALIGN(min_headroom -
- skb_headroom(skb) +
- 16);
- err = pskb_expand_head(skb, max_t(int, head_delta, 0),
- 0, GFP_ATOMIC);
- if (unlikely(err))
- goto err_free_rt;
- }
-
- skb = vlan_hwaccel_push_inside(skb);
- if (unlikely(!skb)) {
- err = -ENOMEM;
- goto err_free_rt;
- }
-
- /* Push Tunnel header. */
- skb = __build_header(skb, tunnel_hlen);
- if (IS_ERR(skb)) {
- err = PTR_ERR(skb);
- skb = NULL;
- goto err_free_rt;
+ vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms);
+ if (IS_ERR(vport))
+ return vport;
+
+ rtnl_lock();
+ dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER);
+ if (IS_ERR(dev)) {
+ rtnl_unlock();
+ ovs_vport_free(vport);
+ return ERR_CAST(dev);
}
- df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
- htons(IP_DF) : 0;
-
- skb->ignore_df = 1;
-
- return iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
- tun_key->ipv4_dst, IPPROTO_GRE,
- tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false);
-err_free_rt:
- ip_rt_put(rt);
-err_free_skb:
- kfree_skb(skb);
- return err;
-}
-
-static struct gre_cisco_protocol gre_protocol = {
- .handler = gre_rcv,
- .err_handler = gre_err,
- .priority = 1,
-};
-
-static int gre_ports;
-static int gre_init(void)
-{
- int err;
-
- gre_ports++;
- if (gre_ports > 1)
- return 0;
-
- err = gre_cisco_register(&gre_protocol);
- if (err)
- pr_warn("cannot register gre protocol handler\n");
-
- return err;
-}
-
-static void gre_exit(void)
-{
- gre_ports--;
- if (gre_ports > 0)
- return;
-
- gre_cisco_unregister(&gre_protocol);
-}
+ dev_change_flags(dev, dev->flags | IFF_UP);
+ rtnl_unlock();
-static const char *gre_get_name(const struct vport *vport)
-{
- return vport_priv(vport);
+ return vport;
}
static struct vport *gre_create(const struct vport_parms *parms)
{
- struct net *net = ovs_dp_get_net(parms->dp);
- struct ovs_net *ovs_net;
struct vport *vport;
- int err;
-
- err = gre_init();
- if (err)
- return ERR_PTR(err);
-
- ovs_net = net_generic(net, ovs_net_id);
- if (ovsl_dereference(ovs_net->vport_net.gre_vport)) {
- vport = ERR_PTR(-EEXIST);
- goto error;
- }
- vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms);
+ vport = gre_tnl_create(parms);
if (IS_ERR(vport))
- goto error;
-
- strncpy(vport_priv(vport), parms->name, IFNAMSIZ);
- rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport);
- return vport;
-
-error:
- gre_exit();
- return vport;
-}
-
-static void gre_tnl_destroy(struct vport *vport)
-{
- struct net *net = ovs_dp_get_net(vport->dp);
- struct ovs_net *ovs_net;
-
- ovs_net = net_generic(net, ovs_net_id);
+ return vport;
- RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL);
- ovs_vport_deferred_free(vport);
- gre_exit();
+ return ovs_netdev_link(vport, parms->name);
}
static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
+ struct ip_tunnel_info *egress_tun_info)
{
return ovs_tunnel_get_egress_info(egress_tun_info,
ovs_dp_get_net(vport->dp),
@@ -288,10 +96,9 @@ static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
static struct vport_ops ovs_gre_vport_ops = {
.type = OVS_VPORT_TYPE_GRE,
.create = gre_create,
- .destroy = gre_tnl_destroy,
- .get_name = gre_get_name,
- .send = gre_tnl_send,
+ .send = ovs_netdev_send,
.get_egress_tun_info = gre_get_egress_tun_info,
+ .destroy = ovs_netdev_tunnel_destroy,
.owner = THIS_MODULE,
};
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 6a55f71..c058bbf 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -156,49 +156,44 @@ static void do_setup(struct net_device *netdev)
static struct vport *internal_dev_create(const struct vport_parms *parms)
{
struct vport *vport;
- struct netdev_vport *netdev_vport;
struct internal_dev *internal_dev;
int err;
- vport = ovs_vport_alloc(sizeof(struct netdev_vport),
- &ovs_internal_vport_ops, parms);
+ vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
goto error;
}
- netdev_vport = netdev_vport_priv(vport);
-
- netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev),
- parms->name, NET_NAME_UNKNOWN,
- do_setup);
- if (!netdev_vport->dev) {
+ vport->dev = alloc_netdev(sizeof(struct internal_dev),
+ parms->name, NET_NAME_UNKNOWN, do_setup);
+ if (!vport->dev) {
err = -ENOMEM;
goto error_free_vport;
}
- dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp));
- internal_dev = internal_dev_priv(netdev_vport->dev);
+ dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
+ internal_dev = internal_dev_priv(vport->dev);
internal_dev->vport = vport;
/* Restrict bridge port to current netns. */
if (vport->port_no == OVSP_LOCAL)
- netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL;
+ vport->dev->features |= NETIF_F_NETNS_LOCAL;
rtnl_lock();
- err = register_netdevice(netdev_vport->dev);
+ err = register_netdevice(vport->dev);
if (err)
goto error_free_netdev;
- dev_set_promiscuity(netdev_vport->dev, 1);
+ dev_set_promiscuity(vport->dev, 1);
rtnl_unlock();
- netif_start_queue(netdev_vport->dev);
+ netif_start_queue(vport->dev);
return vport;
error_free_netdev:
rtnl_unlock();
- free_netdev(netdev_vport->dev);
+ free_netdev(vport->dev);
error_free_vport:
ovs_vport_free(vport);
error:
@@ -207,21 +202,19 @@ error:
static void internal_dev_destroy(struct vport *vport)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
-
- netif_stop_queue(netdev_vport->dev);
+ netif_stop_queue(vport->dev);
rtnl_lock();
- dev_set_promiscuity(netdev_vport->dev, -1);
+ dev_set_promiscuity(vport->dev, -1);
/* unregister_netdevice() waits for an RCU grace period. */
- unregister_netdevice(netdev_vport->dev);
+ unregister_netdevice(vport->dev);
rtnl_unlock();
}
static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
{
- struct net_device *netdev = netdev_vport_priv(vport)->dev;
+ struct net_device *netdev = vport->dev;
int len;
if (unlikely(!(netdev->flags & IFF_UP))) {
@@ -249,7 +242,6 @@ static struct vport_ops ovs_internal_vport_ops = {
.type = OVS_VPORT_TYPE_INTERNAL,
.create = internal_dev_create,
.destroy = internal_dev_destroy,
- .get_name = ovs_netdev_get_name,
.send = internal_dev_recv,
};
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 33e6d6e..4b70aaa 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -26,10 +26,13 @@
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/openvswitch.h>
+#include <linux/export.h>
-#include <net/llc.h>
+#include <net/ip_tunnels.h>
+#include <net/rtnetlink.h>
#include "datapath.h"
+#include "vport.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
@@ -54,7 +57,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
skb_push(skb, ETH_HLEN);
ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
- ovs_vport_receive(vport, skb, NULL);
+ ovs_vport_receive(vport, skb, skb_tunnel_info(skb, AF_INET));
return;
error:
@@ -83,105 +86,112 @@ static struct net_device *get_dpdev(const struct datapath *dp)
local = ovs_vport_ovsl(dp, OVSP_LOCAL);
BUG_ON(!local);
- return netdev_vport_priv(local)->dev;
+ return local->dev;
}
-static struct vport *netdev_create(const struct vport_parms *parms)
+struct vport *ovs_netdev_link(struct vport *vport, const char *name)
{
- struct vport *vport;
- struct netdev_vport *netdev_vport;
int err;
- vport = ovs_vport_alloc(sizeof(struct netdev_vport),
- &ovs_netdev_vport_ops, parms);
- if (IS_ERR(vport)) {
- err = PTR_ERR(vport);
- goto error;
- }
-
- netdev_vport = netdev_vport_priv(vport);
-
- netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
- if (!netdev_vport->dev) {
+ vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name);
+ if (!vport->dev) {
err = -ENODEV;
goto error_free_vport;
}
- if (netdev_vport->dev->flags & IFF_LOOPBACK ||
- netdev_vport->dev->type != ARPHRD_ETHER ||
- ovs_is_internal_dev(netdev_vport->dev)) {
+ if (vport->dev->flags & IFF_LOOPBACK ||
+ vport->dev->type != ARPHRD_ETHER ||
+ ovs_is_internal_dev(vport->dev)) {
err = -EINVAL;
goto error_put;
}
rtnl_lock();
- err = netdev_master_upper_dev_link(netdev_vport->dev,
+ err = netdev_master_upper_dev_link(vport->dev,
get_dpdev(vport->dp));
if (err)
goto error_unlock;
- err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
+ err = netdev_rx_handler_register(vport->dev, netdev_frame_hook,
vport);
if (err)
goto error_master_upper_dev_unlink;
- dev_disable_lro(netdev_vport->dev);
- dev_set_promiscuity(netdev_vport->dev, 1);
- netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
+ dev_disable_lro(vport->dev);
+ dev_set_promiscuity(vport->dev, 1);
+ vport->dev->priv_flags |= IFF_OVS_DATAPATH;
rtnl_unlock();
return vport;
error_master_upper_dev_unlink:
- netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp));
+ netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp));
error_unlock:
rtnl_unlock();
error_put:
- dev_put(netdev_vport->dev);
+ dev_put(vport->dev);
error_free_vport:
ovs_vport_free(vport);
-error:
return ERR_PTR(err);
}
+EXPORT_SYMBOL_GPL(ovs_netdev_link);
-static void free_port_rcu(struct rcu_head *rcu)
+static struct vport *netdev_create(const struct vport_parms *parms)
{
- struct netdev_vport *netdev_vport = container_of(rcu,
- struct netdev_vport, rcu);
+ struct vport *vport;
- dev_put(netdev_vport->dev);
- ovs_vport_free(vport_from_priv(netdev_vport));
+ vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms);
+ if (IS_ERR(vport))
+ return vport;
+
+ return ovs_netdev_link(vport, parms->name);
}
-void ovs_netdev_detach_dev(struct vport *vport)
+static void vport_netdev_free(struct rcu_head *rcu)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+ struct vport *vport = container_of(rcu, struct vport, rcu);
+ if (vport->dev)
+ dev_put(vport->dev);
+ ovs_vport_free(vport);
+}
+
+void ovs_netdev_detach_dev(struct vport *vport)
+{
ASSERT_RTNL();
- netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
- netdev_rx_handler_unregister(netdev_vport->dev);
- netdev_upper_dev_unlink(netdev_vport->dev,
- netdev_master_upper_dev_get(netdev_vport->dev));
- dev_set_promiscuity(netdev_vport->dev, -1);
+ vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
+ netdev_rx_handler_unregister(vport->dev);
+ netdev_upper_dev_unlink(vport->dev,
+ netdev_master_upper_dev_get(vport->dev));
+ dev_set_promiscuity(vport->dev, -1);
}
+EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev);
static void netdev_destroy(struct vport *vport)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
-
rtnl_lock();
- if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
ovs_netdev_detach_dev(vport);
rtnl_unlock();
- call_rcu(&netdev_vport->rcu, free_port_rcu);
+ call_rcu(&vport->rcu, vport_netdev_free);
}
-const char *ovs_netdev_get_name(const struct vport *vport)
+void ovs_netdev_tunnel_destroy(struct vport *vport)
{
- const struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
- return netdev_vport->dev->name;
+ rtnl_lock();
+ if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ ovs_netdev_detach_dev(vport);
+
+ /* Early release so we can unregister the device */
+ dev_put(vport->dev);
+ rtnl_delete_link(vport->dev);
+ vport->dev = NULL;
+ rtnl_unlock();
+
+ call_rcu(&vport->rcu, vport_netdev_free);
}
+EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy);
static unsigned int packet_length(const struct sk_buff *skb)
{
@@ -193,20 +203,19 @@ static unsigned int packet_length(const struct sk_buff *skb)
return length;
}
-static int netdev_send(struct vport *vport, struct sk_buff *skb)
+int ovs_netdev_send(struct vport *vport, struct sk_buff *skb)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
- int mtu = netdev_vport->dev->mtu;
+ int mtu = vport->dev->mtu;
int len;
if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
- netdev_vport->dev->name,
+ vport->dev->name,
packet_length(skb), mtu);
goto drop;
}
- skb->dev = netdev_vport->dev;
+ skb->dev = vport->dev;
len = skb->len;
dev_queue_xmit(skb);
@@ -216,6 +225,7 @@ drop:
kfree_skb(skb);
return 0;
}
+EXPORT_SYMBOL_GPL(ovs_netdev_send);
/* Returns null if this device is not attached to a datapath. */
struct vport *ovs_netdev_get_vport(struct net_device *dev)
@@ -231,8 +241,7 @@ static struct vport_ops ovs_netdev_vport_ops = {
.type = OVS_VPORT_TYPE_NETDEV,
.create = netdev_create,
.destroy = netdev_destroy,
- .get_name = ovs_netdev_get_name,
- .send = netdev_send,
+ .send = ovs_netdev_send,
};
int __init ovs_netdev_init(void)
diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h
index 6f7038e..497cc81 100644
--- a/net/openvswitch/vport-netdev.h
+++ b/net/openvswitch/vport-netdev.h
@@ -26,22 +26,12 @@
struct vport *ovs_netdev_get_vport(struct net_device *dev);
-struct netdev_vport {
- struct rcu_head rcu;
-
- struct net_device *dev;
-};
-
-static inline struct netdev_vport *
-netdev_vport_priv(const struct vport *vport)
-{
- return vport_priv(vport);
-}
-
-const char *ovs_netdev_get_name(const struct vport *);
+struct vport *ovs_netdev_link(struct vport *vport, const char *name);
+int ovs_netdev_send(struct vport *vport, struct sk_buff *skb);
void ovs_netdev_detach_dev(struct vport *);
int __init ovs_netdev_init(void);
void ovs_netdev_exit(void);
+void ovs_netdev_tunnel_destroy(struct vport *vport);
#endif /* vport_netdev.h */
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 6d39766..1e8b00a2 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -17,94 +17,37 @@
* 02110-1301, USA
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/net.h>
-#include <linux/rculist.h>
-#include <linux/udp.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/openvswitch.h>
#include <linux/module.h>
-
-#include <net/icmp.h>
-#include <net/ip.h>
#include <net/udp.h>
#include <net/ip_tunnels.h>
#include <net/rtnetlink.h>
-#include <net/route.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
#include <net/vxlan.h>
#include "datapath.h"
#include "vport.h"
-#include "vport-vxlan.h"
-
-/**
- * struct vxlan_port - Keeps track of open UDP ports
- * @vs: vxlan_sock created for the port.
- * @name: vport name.
- */
-struct vxlan_port {
- struct vxlan_sock *vs;
- char name[IFNAMSIZ];
- u32 exts; /* VXLAN_F_* in <net/vxlan.h> */
-};
-
-static struct vport_ops ovs_vxlan_vport_ops;
+#include "vport-netdev.h"
-static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
-{
- return vport_priv(vport);
-}
-
-/* Called with rcu_read_lock and BH disabled. */
-static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
- struct vxlan_metadata *md)
-{
- struct ovs_tunnel_info tun_info;
- struct vxlan_port *vxlan_port;
- struct vport *vport = vs->data;
- struct iphdr *iph;
- struct ovs_vxlan_opts opts = {
- .gbp = md->gbp,
- };
- __be64 key;
- __be16 flags;
-
- flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0);
- vxlan_port = vxlan_vport(vport);
- if (vxlan_port->exts & VXLAN_F_GBP && md->gbp)
- flags |= TUNNEL_VXLAN_OPT;
-
- /* Save outer tunnel values */
- iph = ip_hdr(skb);
- key = cpu_to_be64(ntohl(md->vni) >> 8);
- ovs_flow_tun_info_init(&tun_info, iph,
- udp_hdr(skb)->source, udp_hdr(skb)->dest,
- key, flags, &opts, sizeof(opts));
-
- ovs_vport_receive(vport, skb, &tun_info);
-}
+static struct vport_ops ovs_vxlan_netdev_vport_ops;
static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
{
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
+ struct vxlan_dev *vxlan = netdev_priv(vport->dev);
+ __be16 dst_port = vxlan->cfg.dst_port;
if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
return -EMSGSIZE;
- if (vxlan_port->exts) {
+ if (vxlan->flags & VXLAN_F_GBP) {
struct nlattr *exts;
exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION);
if (!exts)
return -EMSGSIZE;
- if (vxlan_port->exts & VXLAN_F_GBP &&
+ if (vxlan->flags & VXLAN_F_GBP &&
nla_put_flag(skb, OVS_VXLAN_EXT_GBP))
return -EMSGSIZE;
@@ -114,23 +57,14 @@ static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
return 0;
}
-static void vxlan_tnl_destroy(struct vport *vport)
-{
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
-
- vxlan_sock_release(vxlan_port->vs);
-
- ovs_vport_deferred_free(vport);
-}
-
-static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = {
+static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = {
[OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, },
};
-static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr)
+static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr,
+ struct vxlan_config *conf)
{
- struct nlattr *exts[OVS_VXLAN_EXT_MAX+1];
- struct vxlan_port *vxlan_port;
+ struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1];
int err;
if (nla_len(attr) < sizeof(struct nlattr))
@@ -140,10 +74,8 @@ static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr)
if (err < 0)
return err;
- vxlan_port = vxlan_vport(vport);
-
if (exts[OVS_VXLAN_EXT_GBP])
- vxlan_port->exts |= VXLAN_F_GBP;
+ conf->flags |= VXLAN_F_GBP;
return 0;
}
@@ -152,128 +84,74 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
{
struct net *net = ovs_dp_get_net(parms->dp);
struct nlattr *options = parms->options;
- struct vxlan_port *vxlan_port;
- struct vxlan_sock *vs;
+ struct net_device *dev;
struct vport *vport;
struct nlattr *a;
- u16 dst_port;
int err;
+ struct vxlan_config conf = {
+ .no_share = true,
+ .flags = VXLAN_F_COLLECT_METADATA,
+ };
if (!options) {
err = -EINVAL;
goto error;
}
+
a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
if (a && nla_len(a) == sizeof(u16)) {
- dst_port = nla_get_u16(a);
+ conf.dst_port = htons(nla_get_u16(a));
} else {
/* Require destination port from userspace. */
err = -EINVAL;
goto error;
}
- vport = ovs_vport_alloc(sizeof(struct vxlan_port),
- &ovs_vxlan_vport_ops, parms);
+ vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms);
if (IS_ERR(vport))
return vport;
- vxlan_port = vxlan_vport(vport);
- strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
-
a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION);
if (a) {
- err = vxlan_configure_exts(vport, a);
+ err = vxlan_configure_exts(vport, a, &conf);
if (err) {
ovs_vport_free(vport);
goto error;
}
}
- vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true,
- vxlan_port->exts);
- if (IS_ERR(vs)) {
+ rtnl_lock();
+ dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf);
+ if (IS_ERR(dev)) {
+ rtnl_unlock();
ovs_vport_free(vport);
- return (void *)vs;
+ return ERR_CAST(dev);
}
- vxlan_port->vs = vs;
+ dev_change_flags(dev, dev->flags | IFF_UP);
+ rtnl_unlock();
return vport;
-
error:
return ERR_PTR(err);
}
-static int vxlan_ext_gbp(struct sk_buff *skb)
+static struct vport *vxlan_create(const struct vport_parms *parms)
{
- const struct ovs_tunnel_info *tun_info;
- const struct ovs_vxlan_opts *opts;
-
- tun_info = OVS_CB(skb)->egress_tun_info;
- opts = tun_info->options;
-
- if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT &&
- tun_info->options_len >= sizeof(*opts))
- return opts->gbp;
- else
- return 0;
-}
-
-static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
-{
- struct net *net = ovs_dp_get_net(vport->dp);
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- struct sock *sk = vxlan_port->vs->sock->sk;
- __be16 dst_port = inet_sk(sk)->inet_sport;
- const struct ovs_key_ipv4_tunnel *tun_key;
- struct vxlan_metadata md = {0};
- struct rtable *rt;
- struct flowi4 fl;
- __be16 src_port;
- __be16 df;
- int err;
- u32 vxflags;
-
- if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
- err = -EINVAL;
- goto error;
- }
-
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
- rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto error;
- }
-
- df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
- htons(IP_DF) : 0;
+ struct vport *vport;
- skb->ignore_df = 1;
+ vport = vxlan_tnl_create(parms);
+ if (IS_ERR(vport))
+ return vport;
- src_port = udp_flow_src_port(net, skb, 0, 0, true);
- md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
- md.gbp = vxlan_ext_gbp(skb);
- vxflags = vxlan_port->exts |
- (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0);
-
- err = vxlan_xmit_skb(rt, sk, skb, fl.saddr, tun_key->ipv4_dst,
- tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
- src_port, dst_port,
- &md, false, vxflags);
- if (err < 0)
- ip_rt_put(rt);
- return err;
-error:
- kfree_skb(skb);
- return err;
+ return ovs_netdev_link(vport, parms->name);
}
static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
+ struct ip_tunnel_info *egress_tun_info)
{
+ struct vxlan_dev *vxlan = netdev_priv(vport->dev);
struct net *net = ovs_dp_get_net(vport->dp);
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
+ __be16 dst_port = vxlan_dev_dst_port(vxlan);
__be16 src_port;
int port_min;
int port_max;
@@ -287,31 +165,23 @@ static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
src_port, dst_port);
}
-static const char *vxlan_get_name(const struct vport *vport)
-{
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- return vxlan_port->name;
-}
-
-static struct vport_ops ovs_vxlan_vport_ops = {
- .type = OVS_VPORT_TYPE_VXLAN,
- .create = vxlan_tnl_create,
- .destroy = vxlan_tnl_destroy,
- .get_name = vxlan_get_name,
- .get_options = vxlan_get_options,
- .send = vxlan_tnl_send,
+static struct vport_ops ovs_vxlan_netdev_vport_ops = {
+ .type = OVS_VPORT_TYPE_VXLAN,
+ .create = vxlan_create,
+ .destroy = ovs_netdev_tunnel_destroy,
+ .get_options = vxlan_get_options,
+ .send = ovs_netdev_send,
.get_egress_tun_info = vxlan_get_egress_tun_info,
- .owner = THIS_MODULE,
};
static int __init ovs_vxlan_tnl_init(void)
{
- return ovs_vport_ops_register(&ovs_vxlan_vport_ops);
+ return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops);
}
static void __exit ovs_vxlan_tnl_exit(void)
{
- ovs_vport_ops_unregister(&ovs_vxlan_vport_ops);
+ ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops);
}
module_init(ovs_vxlan_tnl_init);
diff --git a/net/openvswitch/vport-vxlan.h b/net/openvswitch/vport-vxlan.h
deleted file mode 100644
index 4b08233e..0000000
--- a/net/openvswitch/vport-vxlan.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef VPORT_VXLAN_H
-#define VPORT_VXLAN_H 1
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-struct ovs_vxlan_opts {
- __u32 gbp;
-};
-
-#endif
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 067a3ff..d14f594 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -113,7 +113,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
struct vport *vport;
hlist_for_each_entry_rcu(vport, bucket, hash_node)
- if (!strcmp(name, vport->ops->get_name(vport)) &&
+ if (!strcmp(name, ovs_vport_name(vport)) &&
net_eq(ovs_dp_get_net(vport->dp), net))
return vport;
@@ -226,7 +226,7 @@ struct vport *ovs_vport_add(const struct vport_parms *parms)
}
bucket = hash_bucket(ovs_dp_get_net(vport->dp),
- vport->ops->get_name(vport));
+ ovs_vport_name(vport));
hlist_add_head_rcu(&vport->hash_node, bucket);
return vport;
}
@@ -469,7 +469,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
* skb->data should point to the Ethernet header.
*/
void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
- const struct ovs_tunnel_info *tun_info)
+ const struct ip_tunnel_info *tun_info)
{
struct pcpu_sw_netstats *stats;
struct sw_flow_key key;
@@ -572,22 +572,22 @@ void ovs_vport_deferred_free(struct vport *vport)
}
EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
-int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
+int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info,
struct net *net,
- const struct ovs_tunnel_info *tun_info,
+ const struct ip_tunnel_info *tun_info,
u8 ipproto,
u32 skb_mark,
__be16 tp_src,
__be16 tp_dst)
{
- const struct ovs_key_ipv4_tunnel *tun_key;
+ const struct ip_tunnel_key *tun_key;
struct rtable *rt;
struct flowi4 fl;
if (unlikely(!tun_info))
return -EINVAL;
- tun_key = &tun_info->tunnel;
+ tun_key = &tun_info->key;
/* Route lookup to get srouce IP address.
* The process may need to be changed if the corresponding process
@@ -602,22 +602,22 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
/* Generate egress_tun_info based on tun_info,
* saddr, tp_src and tp_dst
*/
- __ovs_flow_tun_info_init(egress_tun_info,
- fl.saddr, tun_key->ipv4_dst,
- tun_key->ipv4_tos,
- tun_key->ipv4_ttl,
- tp_src, tp_dst,
- tun_key->tun_id,
- tun_key->tun_flags,
- tun_info->options,
- tun_info->options_len);
+ __ip_tunnel_info_init(egress_tun_info,
+ fl.saddr, tun_key->ipv4_dst,
+ tun_key->ipv4_tos,
+ tun_key->ipv4_ttl,
+ tp_src, tp_dst,
+ tun_key->tun_id,
+ tun_key->tun_flags,
+ tun_info->options,
+ tun_info->options_len);
return 0;
}
EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info);
int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *info)
+ struct ip_tunnel_info *info)
{
/* get_egress_tun_info() is only implemented on tunnel ports. */
if (unlikely(!vport->ops->get_egress_tun_info))
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index bc85331..1a689c2 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -27,6 +27,7 @@
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/u64_stats_sync.h>
+#include <net/route.h>
#include "datapath.h"
@@ -58,15 +59,15 @@ u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *);
int ovs_vport_send(struct vport *, struct sk_buff *);
-int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
+int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info,
struct net *net,
- const struct ovs_tunnel_info *tun_info,
+ const struct ip_tunnel_info *tun_info,
u8 ipproto,
u32 skb_mark,
__be16 tp_src,
__be16 tp_dst);
int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *info);
+ struct ip_tunnel_info *info);
/* The following definitions are for implementers of vport devices: */
@@ -106,7 +107,7 @@ struct vport_portids {
* @detach_list: list used for detaching vport in net-exit call.
*/
struct vport {
- struct rcu_head rcu;
+ struct net_device *dev;
struct datapath *dp;
struct vport_portids __rcu *upcall_portids;
u16 port_no;
@@ -119,6 +120,7 @@ struct vport {
struct vport_err_stats err_stats;
struct list_head detach_list;
+ struct rcu_head rcu;
};
/**
@@ -176,7 +178,7 @@ struct vport_ops {
int (*send)(struct vport *, struct sk_buff *);
int (*get_egress_tun_info)(struct vport *, struct sk_buff *,
- struct ovs_tunnel_info *);
+ struct ip_tunnel_info *);
struct module *owner;
struct list_head list;
@@ -226,7 +228,7 @@ static inline struct vport *vport_from_priv(void *priv)
}
void ovs_vport_receive(struct vport *, struct sk_buff *,
- const struct ovs_tunnel_info *);
+ const struct ip_tunnel_info *);
static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
const void *start, unsigned int len)
@@ -235,11 +237,16 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
skb->csum = csum_add(skb->csum, csum_partial(start, len, 0));
}
+static inline const char *ovs_vport_name(struct vport *vport)
+{
+ return vport->dev ? vport->dev->name : vport->ops->get_name(vport);
+}
+
int ovs_vport_ops_register(struct vport_ops *ops);
void ovs_vport_ops_unregister(struct vport_ops *ops);
static inline struct rtable *ovs_tunnel_route_lookup(struct net *net,
- const struct ovs_key_ipv4_tunnel *key,
+ const struct ip_tunnel_key *key,
u32 mark,
struct flowi4 *fl,
u8 protocol)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ed458b3..b5afe53 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -518,13 +518,11 @@ static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
}
static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
- int tx_ring,
struct sk_buff_head *rb_queue)
{
struct tpacket_kbdq_core *pkc;
- pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
- GET_PBDQC_FROM_RB(&po->rx_ring);
+ pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
spin_lock_bh(&rb_queue->lock);
pkc->delete_blk_timer = 1;
@@ -4043,7 +4041,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
if (closing && (po->tp_version > TPACKET_V2)) {
/* Because we don't support block-based V3 on tx-ring */
if (!tx_ring)
- prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
+ prb_shutdown_retire_blk_timer(po, rb_queue);
}
release_sock(sk);
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 4ebd29c..dd666fb 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -185,7 +185,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
ret = 0;
goto out;
}
- trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
+ trans = rds_trans_get_preferred(sock_net(sock->sk),
+ sin->sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index da6da57..d4fecb2 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -117,7 +117,8 @@ static void rds_conn_reset(struct rds_connection *conn)
* For now they are not garbage collected once they're created. They
* are torn down as the module is removed, if ever.
*/
-static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+static struct rds_connection *__rds_conn_create(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp,
int is_outgoing)
{
@@ -157,6 +158,7 @@ new_conn:
conn->c_faddr = faddr;
spin_lock_init(&conn->c_lock);
conn->c_next_tx_seq = 1;
+ rds_conn_net_set(conn, net);
init_waitqueue_head(&conn->c_waitq);
INIT_LIST_HEAD(&conn->c_send_queue);
@@ -174,7 +176,7 @@ new_conn:
* can bind to the destination address then we'd rather the messages
* flow through loopback rather than either transport.
*/
- loop_trans = rds_trans_get_preferred(faddr);
+ loop_trans = rds_trans_get_preferred(net, faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn->c_loopback = 1;
@@ -260,17 +262,19 @@ out:
return conn;
}
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
- return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
}
EXPORT_SYMBOL_GPL(rds_conn_create);
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
- return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ba2dffe..1381422 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -317,7 +317,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
-static int rds_ib_laddr_check(__be32 addr)
+static int rds_ib_laddr_check(struct net *net, __be32 addr)
{
int ret;
struct rdma_cm_id *cm_id;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0da2a45..f40d8f5 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -448,8 +448,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
(unsigned long long)be64_to_cpu(lguid),
(unsigned long long)be64_to_cpu(fguid));
- conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
- GFP_KERNEL);
+ /* RDS/IB is not currently netns aware, thus init_net */
+ conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
+ &rds_ib_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 5899356..5d5a9d2 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -218,7 +218,7 @@ static void rds_iw_ic_info(struct socket *sock, unsigned int len,
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
-static int rds_iw_laddr_check(__be32 addr)
+static int rds_iw_laddr_check(struct net *net, __be32 addr)
{
int ret;
struct rdma_cm_id *cm_id;
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index 8f486fa..a6553a6 100644
--- a/net/rds/iw_cm.c
+++ b/net/rds/iw_cm.c
@@ -398,8 +398,9 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
&dp->dp_saddr, &dp->dp_daddr,
RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
- conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
- GFP_KERNEL);
+ /* RDS/IW is not currently netns aware, thus init_net */
+ conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
+ &rds_iw_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 2260c1e4..9005fb0 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -128,8 +128,21 @@ struct rds_connection {
/* Protocol version */
unsigned int c_version;
+ possible_net_t c_net;
};
+static inline
+struct net *rds_conn_net(struct rds_connection *conn)
+{
+ return read_pnet(&conn->c_net);
+}
+
+static inline
+void rds_conn_net_set(struct rds_connection *conn, struct net *net)
+{
+ write_pnet(&conn->c_net, net);
+}
+
#define RDS_FLAG_CONG_BITMAP 0x01
#define RDS_FLAG_ACK_REQUIRED 0x02
#define RDS_FLAG_RETRANSMITTED 0x04
@@ -417,7 +430,7 @@ struct rds_transport {
unsigned int t_prefer_loopback:1;
unsigned int t_type;
- int (*laddr_check)(__be32 addr);
+ int (*laddr_check)(struct net *net, __be32 addr);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*conn_free)(void *data);
int (*conn_connect)(struct rds_connection *conn);
@@ -608,9 +621,11 @@ struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
/* conn.c */
int rds_conn_init(void);
void rds_conn_exit(void);
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
void rds_conn_shutdown(struct rds_connection *conn);
void rds_conn_destroy(struct rds_connection *conn);
@@ -795,7 +810,7 @@ void rds_connect_complete(struct rds_connection *conn);
/* transport.c */
int rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
-struct rds_transport *rds_trans_get_preferred(__be32 addr);
+struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr);
void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
diff --git a/net/rds/send.c b/net/rds/send.c
index e9430f5..2581b8e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1023,7 +1023,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
conn = rs->rs_conn;
else {
- conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
+ conn = rds_conn_create_outgoing(sock_net(sock->sk),
+ rs->rs_bound_addr, daddr,
rs->rs_transport,
sock->sk->sk_allocation);
if (IS_ERR(conn)) {
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index edac9ef..c42b60b 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -35,6 +35,9 @@
#include <linux/in.h>
#include <linux/module.h>
#include <net/tcp.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/tcp.h>
#include "rds.h"
#include "tcp.h"
@@ -189,9 +192,9 @@ out:
spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
}
-static int rds_tcp_laddr_check(__be32 addr)
+static int rds_tcp_laddr_check(struct net *net, __be32 addr)
{
- if (inet_addr_type(&init_net, addr) == RTN_LOCAL)
+ if (inet_addr_type(net, addr) == RTN_LOCAL)
return 0;
return -EADDRNOTAVAIL;
}
@@ -250,16 +253,7 @@ static void rds_tcp_destroy_conns(void)
}
}
-static void rds_tcp_exit(void)
-{
- rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
- rds_tcp_listen_stop();
- rds_tcp_destroy_conns();
- rds_trans_unregister(&rds_tcp_transport);
- rds_tcp_recv_exit();
- kmem_cache_destroy(rds_tcp_conn_slab);
-}
-module_exit(rds_tcp_exit);
+static void rds_tcp_exit(void);
struct rds_transport rds_tcp_transport = {
.laddr_check = rds_tcp_laddr_check,
@@ -281,6 +275,136 @@ struct rds_transport rds_tcp_transport = {
.t_prefer_loopback = 1,
};
+static int rds_tcp_netid;
+
+/* per-network namespace private data for this module */
+struct rds_tcp_net {
+ struct socket *rds_tcp_listen_sock;
+ struct work_struct rds_tcp_accept_w;
+};
+
+static void rds_tcp_accept_worker(struct work_struct *work)
+{
+ struct rds_tcp_net *rtn = container_of(work,
+ struct rds_tcp_net,
+ rds_tcp_accept_w);
+
+ while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0)
+ cond_resched();
+}
+
+void rds_tcp_accept_work(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+ queue_work(rds_wq, &rtn->rds_tcp_accept_w);
+}
+
+static __net_init int rds_tcp_init_net(struct net *net)
+{
+ struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+ rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
+ if (!rtn->rds_tcp_listen_sock) {
+ pr_warn("could not set up listen sock\n");
+ return -EAFNOSUPPORT;
+ }
+ INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
+ return 0;
+}
+
+static void __net_exit rds_tcp_exit_net(struct net *net)
+{
+ struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+ /* If rds_tcp_exit_net() is called as a result of netns deletion,
+ * the rds_tcp_kill_sock() device notifier would already have cleaned
+ * up the listen socket, thus there is no work to do in this function.
+ *
+ * If rds_tcp_exit_net() is called as a result of module unload,
+ * i.e., due to rds_tcp_exit() -> unregister_pernet_subsys(), then
+ * we do need to clean up the listen socket here.
+ */
+ if (rtn->rds_tcp_listen_sock) {
+ rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
+ rtn->rds_tcp_listen_sock = NULL;
+ flush_work(&rtn->rds_tcp_accept_w);
+ }
+}
+
+static struct pernet_operations rds_tcp_net_ops = {
+ .init = rds_tcp_init_net,
+ .exit = rds_tcp_exit_net,
+ .id = &rds_tcp_netid,
+ .size = sizeof(struct rds_tcp_net),
+};
+
+static void rds_tcp_kill_sock(struct net *net)
+{
+ struct rds_tcp_connection *tc, *_tc;
+ struct sock *sk;
+ LIST_HEAD(tmp_list);
+ struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+ rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
+ rtn->rds_tcp_listen_sock = NULL;
+ flush_work(&rtn->rds_tcp_accept_w);
+ spin_lock_irq(&rds_tcp_conn_lock);
+ list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
+ struct net *c_net = read_pnet(&tc->conn->c_net);
+
+ if (net != c_net || !tc->t_sock)
+ continue;
+ list_move_tail(&tc->t_tcp_node, &tmp_list);
+ }
+ spin_unlock_irq(&rds_tcp_conn_lock);
+ list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
+ sk = tc->t_sock->sk;
+ sk->sk_prot->disconnect(sk, 0);
+ tcp_done(sk);
+ if (tc->conn->c_passive)
+ rds_conn_destroy(tc->conn->c_passive);
+ rds_conn_destroy(tc->conn);
+ }
+}
+
+static int rds_tcp_dev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ /* rds-tcp registers as a pernet subys, so the ->exit will only
+ * get invoked after network acitivity has quiesced. We need to
+ * clean up all sockets to quiesce network activity, and use
+ * the unregistration of the per-net loopback device as a trigger
+ * to start that cleanup.
+ */
+ if (event == NETDEV_UNREGISTER_FINAL &&
+ dev->ifindex == LOOPBACK_IFINDEX)
+ rds_tcp_kill_sock(dev_net(dev));
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rds_tcp_dev_notifier = {
+ .notifier_call = rds_tcp_dev_event,
+ .priority = -10, /* must be called after other network notifiers */
+};
+
+static void rds_tcp_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+ unregister_pernet_subsys(&rds_tcp_net_ops);
+ if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
+ pr_warn("could not unregister rds_tcp_dev_notifier\n");
+ rds_tcp_destroy_conns();
+ rds_trans_unregister(&rds_tcp_transport);
+ rds_tcp_recv_exit();
+ kmem_cache_destroy(rds_tcp_conn_slab);
+}
+module_exit(rds_tcp_exit);
+
static int rds_tcp_init(void)
{
int ret;
@@ -293,6 +417,16 @@ static int rds_tcp_init(void)
goto out;
}
+ ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
+ if (ret) {
+ pr_warn("could not register rds_tcp_dev_notifier\n");
+ goto out;
+ }
+
+ ret = register_pernet_subsys(&rds_tcp_net_ops);
+ if (ret)
+ goto out_slab;
+
ret = rds_tcp_recv_init();
if (ret)
goto out_slab;
@@ -301,19 +435,14 @@ static int rds_tcp_init(void)
if (ret)
goto out_recv;
- ret = rds_tcp_listen_init();
- if (ret)
- goto out_register;
-
rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
goto out;
-out_register:
- rds_trans_unregister(&rds_tcp_transport);
out_recv:
rds_tcp_recv_exit();
out_slab:
+ unregister_pernet_subsys(&rds_tcp_net_ops);
kmem_cache_destroy(rds_tcp_conn_slab);
out:
return ret;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 0dbdd37..64f873c 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -52,6 +52,7 @@ u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
extern struct rds_transport rds_tcp_transport;
+void rds_tcp_accept_work(struct sock *sk);
/* tcp_connect.c */
int rds_tcp_conn_connect(struct rds_connection *conn);
@@ -59,9 +60,11 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn);
void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */
-int rds_tcp_listen_init(void);
-void rds_tcp_listen_stop(void);
+struct socket *rds_tcp_listen_init(struct net *);
+void rds_tcp_listen_stop(struct socket *);
void rds_tcp_listen_data_ready(struct sock *sk);
+int rds_tcp_accept_one(struct socket *sock);
+int rds_tcp_keepalive(struct socket *sock);
/* tcp_recv.c */
int rds_tcp_recv_init(void);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 973109c7..5cb1687 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -79,7 +79,8 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
struct sockaddr_in src, dest;
int ret;
- ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ ret = sock_create_kern(rds_conn_net(conn), PF_INET,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0)
goto out;
@@ -111,10 +112,12 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
if (ret == -EINPROGRESS)
ret = 0;
- if (ret == 0)
+ if (ret == 0) {
+ rds_tcp_keepalive(sock);
sock = NULL;
- else
+ } else {
rds_tcp_restore_callbacks(sock, conn->c_transport_data);
+ }
out:
if (sock)
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 0da49e3..444d78d 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -38,14 +38,7 @@
#include "rds.h"
#include "tcp.h"
-/*
- * cheesy, but simple..
- */
-static void rds_tcp_accept_worker(struct work_struct *work);
-static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
-static struct socket *rds_tcp_listen_sock;
-
-static int rds_tcp_keepalive(struct socket *sock)
+int rds_tcp_keepalive(struct socket *sock)
{
/* values below based on xs_udp_default_timeout */
int keepidle = 5; /* send a probe 'keepidle' secs after last data */
@@ -77,7 +70,7 @@ bail:
return ret;
}
-static int rds_tcp_accept_one(struct socket *sock)
+int rds_tcp_accept_one(struct socket *sock)
{
struct socket *new_sock = NULL;
struct rds_connection *conn;
@@ -85,8 +78,9 @@ static int rds_tcp_accept_one(struct socket *sock)
struct inet_sock *inet;
struct rds_tcp_connection *rs_tcp;
- ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
- sock->sk->sk_protocol, &new_sock);
+ ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family,
+ sock->sk->sk_type, sock->sk->sk_protocol,
+ &new_sock);
if (ret)
goto out;
@@ -108,7 +102,8 @@ static int rds_tcp_accept_one(struct socket *sock)
&inet->inet_saddr, ntohs(inet->inet_sport),
&inet->inet_daddr, ntohs(inet->inet_dport));
- conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
+ conn = rds_conn_create(sock_net(sock->sk),
+ inet->inet_saddr, inet->inet_daddr,
&rds_tcp_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
ret = PTR_ERR(conn);
@@ -148,12 +143,6 @@ out:
return ret;
}
-static void rds_tcp_accept_worker(struct work_struct *work)
-{
- while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0)
- cond_resched();
-}
-
void rds_tcp_listen_data_ready(struct sock *sk)
{
void (*ready)(struct sock *sk);
@@ -174,20 +163,20 @@ void rds_tcp_listen_data_ready(struct sock *sk)
* socket
*/
if (sk->sk_state == TCP_LISTEN)
- queue_work(rds_wq, &rds_tcp_listen_work);
+ rds_tcp_accept_work(sk);
out:
read_unlock(&sk->sk_callback_lock);
ready(sk);
}
-int rds_tcp_listen_init(void)
+struct socket *rds_tcp_listen_init(struct net *net)
{
struct sockaddr_in sin;
struct socket *sock = NULL;
int ret;
- ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0)
goto out;
@@ -211,17 +200,15 @@ int rds_tcp_listen_init(void)
if (ret < 0)
goto out;
- rds_tcp_listen_sock = sock;
- sock = NULL;
+ return sock;
out:
if (sock)
sock_release(sock);
- return ret;
+ return NULL;
}
-void rds_tcp_listen_stop(void)
+void rds_tcp_listen_stop(struct socket *sock)
{
- struct socket *sock = rds_tcp_listen_sock;
struct sock *sk;
if (!sock)
@@ -242,5 +229,4 @@ void rds_tcp_listen_stop(void)
/* wait for accepts to stop and close the socket */
flush_workqueue(rds_wq);
sock_release(sock);
- rds_tcp_listen_sock = NULL;
}
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 83498e1..f3afd1d 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -77,7 +77,7 @@ void rds_trans_put(struct rds_transport *trans)
module_put(trans->t_owner);
}
-struct rds_transport *rds_trans_get_preferred(__be32 addr)
+struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr)
{
struct rds_transport *ret = NULL;
struct rds_transport *trans;
@@ -90,7 +90,7 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr)
for (i = 0; i < RDS_TRANS_COUNT; i++) {
trans = transports[i];
- if (trans && (trans->laddr_check(addr) == 0) &&
+ if (trans && (trans->laddr_check(net, addr) == 0) &&
(!trans->t_owner || try_module_get(trans->t_owner))) {
ret = trans;
break;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 43ec926..b087087 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -27,6 +27,15 @@
#include <net/act_api.h>
#include <net/netlink.h>
+static void free_tcf(struct rcu_head *head)
+{
+ struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu);
+
+ free_percpu(p->cpu_bstats);
+ free_percpu(p->cpu_qstats);
+ kfree(p);
+}
+
void tcf_hash_destroy(struct tc_action *a)
{
struct tcf_common *p = a->priv;
@@ -41,7 +50,7 @@ void tcf_hash_destroy(struct tc_action *a)
* gen_estimator est_timer() might access p->tcfc_lock
* or bstats, wait a RCU grace period before freeing p
*/
- kfree_rcu(p, tcfc_rcu);
+ call_rcu(&p->tcfc_rcu, free_tcf);
}
EXPORT_SYMBOL(tcf_hash_destroy);
@@ -231,15 +240,16 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
if (est)
gen_kill_estimator(&pc->tcfc_bstats,
&pc->tcfc_rate_est);
- kfree_rcu(pc, tcfc_rcu);
+ call_rcu(&pc->tcfc_rcu, free_tcf);
}
EXPORT_SYMBOL(tcf_hash_cleanup);
int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
- int size, int bind)
+ int size, int bind, bool cpustats)
{
struct tcf_hashinfo *hinfo = a->ops->hinfo;
struct tcf_common *p = kzalloc(size, GFP_KERNEL);
+ int err = -ENOMEM;
if (unlikely(!p))
return -ENOMEM;
@@ -247,18 +257,32 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
if (bind)
p->tcfc_bindcnt = 1;
+ if (cpustats) {
+ p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ if (!p->cpu_bstats) {
+err1:
+ kfree(p);
+ return err;
+ }
+ p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+ if (!p->cpu_qstats) {
+err2:
+ free_percpu(p->cpu_bstats);
+ goto err1;
+ }
+ }
spin_lock_init(&p->tcfc_lock);
INIT_HLIST_NODE(&p->tcfc_head);
p->tcfc_index = index ? index : tcf_hash_new_index(hinfo);
p->tcfc_tm.install = jiffies;
p->tcfc_tm.lastuse = jiffies;
if (est) {
- int err = gen_new_estimator(&p->tcfc_bstats, NULL,
- &p->tcfc_rate_est,
- &p->tcfc_lock, est);
+ err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats,
+ &p->tcfc_rate_est,
+ &p->tcfc_lock, est);
if (err) {
- kfree(p);
- return err;
+ free_percpu(p->cpu_qstats);
+ goto err2;
}
}
@@ -616,10 +640,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
if (err < 0)
goto errout;
- if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 ||
+ if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
&p->tcfc_rate_est) < 0 ||
- gnet_stats_copy_queue(&d, NULL,
+ gnet_stats_copy_queue(&d, p->cpu_qstats,
&p->tcfc_qstats,
p->tcfc_qstats.qlen) < 0)
goto errout;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index d0edeb7..1b97dab 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -278,7 +278,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
struct tc_act_bpf *parm;
struct tcf_bpf *prog;
bool is_bpf, is_ebpf;
- int ret;
+ int ret, res = 0;
if (!nla)
return -EINVAL;
@@ -287,41 +287,43 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
if (ret < 0)
return ret;
- is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
- is_ebpf = tb[TCA_ACT_BPF_FD];
-
- if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) ||
- !tb[TCA_ACT_BPF_PARMS])
+ if (!tb[TCA_ACT_BPF_PARMS])
return -EINVAL;
parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
- memset(&cfg, 0, sizeof(cfg));
-
- ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) :
- tcf_bpf_init_from_efd(tb, &cfg);
- if (ret < 0)
- return ret;
-
if (!tcf_hash_check(parm->index, act, bind)) {
ret = tcf_hash_create(parm->index, est, act,
- sizeof(*prog), bind);
+ sizeof(*prog), bind, false);
if (ret < 0)
- goto destroy_fp;
+ return ret;
- ret = ACT_P_CREATED;
+ res = ACT_P_CREATED;
} else {
/* Don't override defaults. */
if (bind)
- goto destroy_fp;
+ return 0;
tcf_hash_release(act, bind);
- if (!replace) {
- ret = -EEXIST;
- goto destroy_fp;
- }
+ if (!replace)
+ return -EEXIST;
}
+ is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
+ is_ebpf = tb[TCA_ACT_BPF_FD];
+
+ if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ memset(&cfg, 0, sizeof(cfg));
+
+ ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) :
+ tcf_bpf_init_from_efd(tb, &cfg);
+ if (ret < 0)
+ goto out;
+
prog = to_bpf(act);
spin_lock_bh(&prog->tcf_lock);
@@ -341,15 +343,16 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
spin_unlock_bh(&prog->tcf_lock);
- if (ret == ACT_P_CREATED)
+ if (res == ACT_P_CREATED)
tcf_hash_insert(act);
else
tcf_bpf_cfg_cleanup(&old);
- return ret;
+ return res;
+out:
+ if (res == ACT_P_CREATED)
+ tcf_hash_cleanup(act, est);
-destroy_fp:
- tcf_bpf_cfg_cleanup(&cfg);
return ret;
}
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 295d14b..f2b5402 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -108,7 +108,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_CONNMARK_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*ci),
+ bind, false);
if (ret)
return ret;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 4cd5cf1..b07c535 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -62,7 +62,8 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
parm = nla_data(tb[TCA_CSUM_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
+ bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 7fffc22..5c1b051 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -28,14 +28,18 @@
#ifdef CONFIG_GACT_PROB
static int gact_net_rand(struct tcf_gact *gact)
{
- if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval)
+ smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */
+ if (prandom_u32() % gact->tcfg_pval)
return gact->tcf_action;
return gact->tcfg_paction;
}
static int gact_determ(struct tcf_gact *gact)
{
- if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval)
+ u32 pack = atomic_inc_return(&gact->packets);
+
+ smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */
+ if (pack % gact->tcfg_pval)
return gact->tcf_action;
return gact->tcfg_paction;
}
@@ -85,7 +89,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
#endif
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*gact),
+ bind, true);
if (ret)
return ret;
ret = ACT_P_CREATED;
@@ -99,16 +104,19 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
gact = to_gact(a);
- spin_lock_bh(&gact->tcf_lock);
+ ASSERT_RTNL();
gact->tcf_action = parm->action;
#ifdef CONFIG_GACT_PROB
if (p_parm) {
gact->tcfg_paction = p_parm->paction;
- gact->tcfg_pval = p_parm->pval;
+ gact->tcfg_pval = max_t(u16, 1, p_parm->pval);
+ /* Make sure tcfg_pval is written before tcfg_ptype
+ * coupled with smp_rmb() in gact_net_rand() & gact_determ()
+ */
+ smp_wmb();
gact->tcfg_ptype = p_parm->ptype;
}
#endif
- spin_unlock_bh(&gact->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(a);
return ret;
@@ -118,23 +126,21 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_gact *gact = a->priv;
- int action = TC_ACT_SHOT;
+ int action = READ_ONCE(gact->tcf_action);
- spin_lock(&gact->tcf_lock);
#ifdef CONFIG_GACT_PROB
- if (gact->tcfg_ptype)
- action = gact_rand[gact->tcfg_ptype](gact);
- else
- action = gact->tcf_action;
-#else
- action = gact->tcf_action;
+ {
+ u32 ptype = READ_ONCE(gact->tcfg_ptype);
+
+ if (ptype)
+ action = gact_rand[ptype](gact);
+ }
#endif
- gact->tcf_bstats.bytes += qdisc_pkt_len(skb);
- gact->tcf_bstats.packets++;
+ bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb);
if (action == TC_ACT_SHOT)
- gact->tcf_qstats.drops++;
- gact->tcf_tm.lastuse = jiffies;
- spin_unlock(&gact->tcf_lock);
+ qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats));
+
+ tcf_lastuse_update(&gact->tcf_tm);
return action;
}
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index cbc8dd7..99c9cc1 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -114,7 +114,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
index = nla_get_u32(tb[TCA_IPT_INDEX]);
if (!tcf_hash_check(index, a, bind) ) {
- ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind);
+ ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 2685450..2d1be4a 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -35,9 +35,11 @@ static LIST_HEAD(mirred_list);
static void tcf_mirred_release(struct tc_action *a, int bind)
{
struct tcf_mirred *m = to_mirred(a);
+ struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1);
+
list_del(&m->tcfm_list);
- if (m->tcfm_dev)
- dev_put(m->tcfm_dev);
+ if (dev)
+ dev_put(dev);
}
static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
@@ -93,7 +95,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
if (!tcf_hash_check(parm->index, a, bind)) {
if (dev == NULL)
return -EINVAL;
- ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*m),
+ bind, true);
if (ret)
return ret;
ret = ACT_P_CREATED;
@@ -107,18 +110,18 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
}
m = to_mirred(a);
- spin_lock_bh(&m->tcf_lock);
+ ASSERT_RTNL();
m->tcf_action = parm->action;
m->tcfm_eaction = parm->eaction;
if (dev != NULL) {
m->tcfm_ifindex = parm->ifindex;
if (ret != ACT_P_CREATED)
- dev_put(m->tcfm_dev);
+ dev_put(rcu_dereference_protected(m->tcfm_dev, 1));
dev_hold(dev);
- m->tcfm_dev = dev;
+ rcu_assign_pointer(m->tcfm_dev, dev);
m->tcfm_ok_push = ok_push;
}
- spin_unlock_bh(&m->tcf_lock);
+
if (ret == ACT_P_CREATED) {
list_add(&m->tcfm_list, &mirred_list);
tcf_hash_insert(a);
@@ -133,20 +136,22 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
struct tcf_mirred *m = a->priv;
struct net_device *dev;
struct sk_buff *skb2;
+ int retval, err;
u32 at;
- int retval, err = 1;
- spin_lock(&m->tcf_lock);
- m->tcf_tm.lastuse = jiffies;
- bstats_update(&m->tcf_bstats, skb);
+ tcf_lastuse_update(&m->tcf_tm);
+
+ bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
- dev = m->tcfm_dev;
- if (!dev) {
- printk_once(KERN_NOTICE "tc mirred: target device is gone\n");
+ rcu_read_lock();
+ retval = READ_ONCE(m->tcf_action);
+ dev = rcu_dereference(m->tcfm_dev);
+ if (unlikely(!dev)) {
+ pr_notice_once("tc mirred: target device is gone\n");
goto out;
}
- if (!(dev->flags & IFF_UP)) {
+ if (unlikely(!(dev->flags & IFF_UP))) {
net_notice_ratelimited("tc mirred to Houston: device %s is down\n",
dev->name);
goto out;
@@ -154,7 +159,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
at = G_TC_AT(skb->tc_verd);
skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2 == NULL)
+ if (!skb2)
goto out;
if (!(at & AT_EGRESS)) {
@@ -170,16 +175,13 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
skb2->dev = dev;
err = dev_queue_xmit(skb2);
-out:
if (err) {
- m->tcf_qstats.overlimits++;
+out:
+ qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
retval = TC_ACT_SHOT;
- else
- retval = m->tcf_action;
- } else
- retval = m->tcf_action;
- spin_unlock(&m->tcf_lock);
+ }
+ rcu_read_unlock();
return retval;
}
@@ -218,14 +220,16 @@ static int mirred_device_event(struct notifier_block *unused,
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct tcf_mirred *m;
+ ASSERT_RTNL();
if (event == NETDEV_UNREGISTER)
list_for_each_entry(m, &mirred_list, tcfm_list) {
- spin_lock_bh(&m->tcf_lock);
- if (m->tcfm_dev == dev) {
+ if (rcu_access_pointer(m->tcfm_dev) == dev) {
dev_put(dev);
- m->tcfm_dev = NULL;
+ /* Note : no rcu grace period necessary, as
+ * net_device are already rcu protected.
+ */
+ RCU_INIT_POINTER(m->tcfm_dev, NULL);
}
- spin_unlock_bh(&m->tcf_lock);
}
return NOTIFY_DONE;
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 270a030..5be0b3c 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -55,7 +55,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
parm = nla_data(tb[TCA_NAT_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
+ bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index ff8b466..e38a770 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -57,7 +57,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
if (!tcf_hash_check(parm->index, a, bind)) {
if (!parm->nkeys)
return -EINVAL;
- ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
+ bind, false);
if (ret)
return ret;
p = to_pedit(a);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 6a8d948..d6b708d 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -103,7 +103,8 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
defdata = nla_data(tb[TCA_DEF_DATA]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
+ bind, false);
if (ret)
return ret;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index fcfeeaf..6751b5f 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -99,7 +99,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
+ bind, false);
if (ret)
return ret;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index d735ecf..796785e 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -116,7 +116,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
action = parm->v_action;
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*v), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*v),
+ bind, false);
if (ret)
return ret;
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index ea611b21..4c85bd3 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -30,35 +30,16 @@ static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
- u32 classid;
-
- classid = task_cls_state(current)->classid;
-
- /*
- * Due to the nature of the classifier it is required to ignore all
- * packets originating from softirq context as accessing `current'
- * would lead to false results.
- *
- * This test assumes that all callers of dev_queue_xmit() explicitely
- * disable bh. Knowing this, it is possible to detect softirq based
- * calls by looking at the number of nested bh disable calls because
- * softirqs always disables bh.
- */
- if (in_serving_softirq()) {
- /* If there is an sk_classid we'll use that. */
- if (!skb->sk)
- return -1;
- classid = skb->sk->sk_classid;
- }
+ u32 classid = task_get_classid(skb);
if (!classid)
return -1;
-
if (!tcf_em_tree_match(skb, &head->ematches, NULL))
return -1;
res->classid = classid;
res->class = 0;
+
return tcf_exts_exec(skb, &head->exts, res);
}
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index b8d73bc..ffaeea6 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -186,7 +186,6 @@ struct qfq_sched {
u64 oldV, V; /* Precise virtual times. */
struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */
- u32 num_active_agg; /* Num. of active aggregates */
u32 wsum; /* weight sum */
u32 iwsum; /* inverse weight sum */
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 59e8035..4345790 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -487,23 +487,35 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
*/
rcu_read_lock();
list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+ struct net_device *odev;
+
if (!laddr->valid)
continue;
- if ((laddr->state == SCTP_ADDR_SRC) &&
- (AF_INET == laddr->a.sa.sa_family)) {
- fl4->fl4_sport = laddr->a.v4.sin_port;
- flowi4_update_output(fl4,
- asoc->base.sk->sk_bound_dev_if,
- RT_CONN_FLAGS(asoc->base.sk),
- daddr->v4.sin_addr.s_addr,
- laddr->a.v4.sin_addr.s_addr);
-
- rt = ip_route_output_key(sock_net(sk), fl4);
- if (!IS_ERR(rt)) {
- dst = &rt->dst;
- goto out_unlock;
- }
- }
+ if (laddr->state != SCTP_ADDR_SRC ||
+ AF_INET != laddr->a.sa.sa_family)
+ continue;
+
+ fl4->fl4_sport = laddr->a.v4.sin_port;
+ flowi4_update_output(fl4,
+ asoc->base.sk->sk_bound_dev_if,
+ RT_CONN_FLAGS(asoc->base.sk),
+ daddr->v4.sin_addr.s_addr,
+ laddr->a.v4.sin_addr.s_addr);
+
+ rt = ip_route_output_key(sock_net(sk), fl4);
+ if (IS_ERR(rt))
+ continue;
+
+ /* Ensure the src address belongs to the output
+ * interface.
+ */
+ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
+ false);
+ if (!odev || odev->ifindex != fl4->flowi4_oif)
+ continue;
+
+ dst = &rt->dst;
+ break;
}
out_unlock:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 3ee27b7..d7eaa73 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -853,7 +853,7 @@ nomem:
/*
* Respond to a normal COOKIE ACK chunk.
- * We are the side that is being asked for an association.
+ * We are the side that is asking for an association.
*
* RFC 2960 5.1 Normal Establishment of an Association
*
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 9f2add3..16c1c43 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -810,7 +810,7 @@ static int switchdev_port_fdb_dump_cb(struct net_device *dev,
ndm->ndm_flags = NTF_SELF;
ndm->ndm_type = 0;
ndm->ndm_ifindex = dev->ifindex;
- ndm->ndm_state = NUD_REACHABLE;
+ ndm->ndm_state = obj->u.fdb.ndm_state;
if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, obj->u.fdb.addr))
goto nla_put_failure;
@@ -910,13 +910,9 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
if (switchdev_port_attr_get(dev, &attr))
return NULL;
- if (nhsel > 0) {
- if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len)
+ if (nhsel > 0 &&
+ !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid))
return NULL;
- if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id,
- attr.u.ppid.id_len))
- return NULL;
- }
prev_attr = attr;
}
@@ -1043,3 +1039,106 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi)
fi->fib_net->ipv4.fib_offload_disabled = true;
}
EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort);
+
+static bool switchdev_port_same_parent_id(struct net_device *a,
+ struct net_device *b)
+{
+ struct switchdev_attr a_attr = {
+ .id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
+ struct switchdev_attr b_attr = {
+ .id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
+
+ if (switchdev_port_attr_get(a, &a_attr) ||
+ switchdev_port_attr_get(b, &b_attr))
+ return false;
+
+ return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid);
+}
+
+static u32 switchdev_port_fwd_mark_get(struct net_device *dev,
+ struct net_device *group_dev)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
+ if (lower_dev == dev)
+ continue;
+ if (switchdev_port_same_parent_id(dev, lower_dev))
+ return lower_dev->offload_fwd_mark;
+ return switchdev_port_fwd_mark_get(dev, lower_dev);
+ }
+
+ return dev->ifindex;
+}
+
+static void switchdev_port_fwd_mark_reset(struct net_device *group_dev,
+ u32 old_mark, u32 *reset_mark)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
+ if (lower_dev->offload_fwd_mark == old_mark) {
+ if (!*reset_mark)
+ *reset_mark = lower_dev->ifindex;
+ lower_dev->offload_fwd_mark = *reset_mark;
+ }
+ switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark);
+ }
+}
+
+/**
+ * switchdev_port_fwd_mark_set - Set port offload forwarding mark
+ *
+ * @dev: port device
+ * @group_dev: containing device
+ * @joining: true if dev is joining group; false if leaving group
+ *
+ * An ungrouped port's offload mark is just its ifindex. A grouped
+ * port's (member of a bridge, for example) offload mark is the ifindex
+ * of one of the ports in the group with the same parent (switch) ID.
+ * Ports on the same device in the same group will have the same mark.
+ *
+ * Example:
+ *
+ * br0 ifindex=9
+ * sw1p1 ifindex=2 mark=2
+ * sw1p2 ifindex=3 mark=2
+ * sw2p1 ifindex=4 mark=5
+ * sw2p2 ifindex=5 mark=5
+ *
+ * If sw2p2 leaves the bridge, we'll have:
+ *
+ * br0 ifindex=9
+ * sw1p1 ifindex=2 mark=2
+ * sw1p2 ifindex=3 mark=2
+ * sw2p1 ifindex=4 mark=4
+ * sw2p2 ifindex=5 mark=5
+ */
+void switchdev_port_fwd_mark_set(struct net_device *dev,
+ struct net_device *group_dev,
+ bool joining)
+{
+ u32 mark = dev->ifindex;
+ u32 reset_mark = 0;
+
+ if (group_dev && joining) {
+ mark = switchdev_port_fwd_mark_get(dev, group_dev);
+ } else if (group_dev && !joining) {
+ if (dev->offload_fwd_mark == mark)
+ /* Ohoh, this port was the mark reference port,
+ * but it's leaving the group, so reset the
+ * mark for the remaining ports in the group.
+ */
+ switchdev_port_fwd_mark_reset(group_dev, mark,
+ &reset_mark);
+ }
+
+ dev->offload_fwd_mark = mark;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set);
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index a816382..8b010c9 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -316,6 +316,29 @@ void tipc_bclink_update_link_state(struct tipc_node *n_ptr,
}
}
+void tipc_bclink_sync_state(struct tipc_node *n, struct tipc_msg *hdr)
+{
+ u16 last = msg_last_bcast(hdr);
+ int mtyp = msg_type(hdr);
+
+ if (unlikely(msg_user(hdr) != LINK_PROTOCOL))
+ return;
+ if (mtyp == STATE_MSG) {
+ tipc_bclink_update_link_state(n, last);
+ return;
+ }
+ /* Compatibility: older nodes don't know BCAST_PROTOCOL synchronization,
+ * and transfer synch info in LINK_PROTOCOL messages.
+ */
+ if (tipc_node_is_up(n))
+ return;
+ if ((mtyp != RESET_MSG) && (mtyp != ACTIVATE_MSG))
+ return;
+ n->bclink.last_sent = last;
+ n->bclink.last_in = last;
+ n->bclink.oos_state = 0;
+}
+
/**
* bclink_peek_nack - monitor retransmission requests sent by other nodes
*
@@ -358,10 +381,9 @@ int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list)
/* Prepare clone of message for local node */
skb = tipc_msg_reassemble(list);
- if (unlikely(!skb)) {
- __skb_queue_purge(list);
+ if (unlikely(!skb))
return -EHOSTUNREACH;
- }
+
/* Broadcast to all nodes */
if (likely(bclink)) {
tipc_bclink_lock(net);
@@ -413,7 +435,7 @@ static void bclink_accept_pkt(struct tipc_node *node, u32 seqno)
* all nodes in the cluster don't ACK at the same time
*/
if (((seqno - tn->own_addr) % TIPC_MIN_LINK_WIN) == 0) {
- tipc_link_proto_xmit(node->active_links[node->addr & 1],
+ tipc_link_proto_xmit(node_active_link(node, node->addr),
STATE_MSG, 0, 0, 0, 0);
tn->bcl->stats.sent_acks++;
}
@@ -925,7 +947,6 @@ int tipc_bclink_init(struct net *net)
tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
bcl->bearer_id = MAX_BEARERS;
rcu_assign_pointer(tn->bearer_list[MAX_BEARERS], &bcbearer->bearer);
- bcl->state = WORKING_WORKING;
bcl->pmsg = (struct tipc_msg *)&bcl->proto_msg;
msg_set_prevnode(bcl->pmsg, tn->own_addr);
strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME);
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 3c290a48..d74c69b 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -133,5 +133,6 @@ void tipc_bclink_wakeup_users(struct net *net);
int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg);
int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]);
void tipc_bclink_input(struct net *net);
+void tipc_bclink_sync_state(struct tipc_node *n, struct tipc_msg *msg);
#endif
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 00bc0e6..ce9f7bf 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -343,7 +343,7 @@ restart:
static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr)
{
pr_info("Resetting bearer <%s>\n", b_ptr->name);
- tipc_link_delete_list(net, b_ptr->identity);
+ tipc_node_delete_links(net, b_ptr->identity);
tipc_disc_reset(net, b_ptr);
return 0;
}
@@ -361,7 +361,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr)
pr_info("Disabling bearer <%s>\n", b_ptr->name);
b_ptr->media->disable_media(b_ptr);
- tipc_link_delete_list(net, b_ptr->identity);
+ tipc_node_delete_links(net, b_ptr->identity);
if (b_ptr->link_req)
tipc_disc_delete(b_ptr->link_req);
@@ -470,6 +470,32 @@ void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf,
rcu_read_unlock();
}
+/* tipc_bearer_xmit() -send buffer to destination over bearer
+ */
+void tipc_bearer_xmit(struct net *net, u32 bearer_id,
+ struct sk_buff_head *xmitq,
+ struct tipc_media_addr *dst)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_bearer *b;
+ struct sk_buff *skb, *tmp;
+
+ if (skb_queue_empty(xmitq))
+ return;
+
+ rcu_read_lock();
+ b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ if (likely(b)) {
+ skb_queue_walk_safe(xmitq, skb, tmp) {
+ __skb_dequeue(xmitq);
+ b->media->send_msg(net, skb, b, dst);
+ /* Until we remove cloning in tipc_l2_send_msg(): */
+ kfree_skb(skb);
+ }
+ }
+ rcu_read_unlock();
+}
+
/**
* tipc_l2_rcv_msg - handle incoming TIPC message from an interface
* @buf: the received packet
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index dc714d9..6426f24 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -217,5 +217,8 @@ void tipc_bearer_cleanup(void);
void tipc_bearer_stop(struct net *net);
void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf,
struct tipc_media_addr *dest);
+void tipc_bearer_xmit(struct net *net, u32 bearer_id,
+ struct sk_buff_head *xmitq,
+ struct tipc_media_addr *dst);
#endif /* _TIPC_BEARER_H */
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 0fcf133..b96b41e 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -109,6 +109,11 @@ struct tipc_net {
atomic_t subscription_count;
};
+static inline struct tipc_net *tipc_net(struct net *net)
+{
+ return net_generic(net, tipc_net_id);
+}
+
static inline u16 mod(u16 x)
{
return x & 0xffffu;
@@ -129,6 +134,11 @@ static inline int less(u16 left, u16 right)
return less_eq(left, right) && (mod(right) != mod(left));
}
+static inline int in_range(u16 val, u16 min, u16 max)
+{
+ return !less(val, min) && !more(val, max);
+}
+
#ifdef CONFIG_SYSCTL
int tipc_register_sysctl(void);
void tipc_unregister_sysctl(void);
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 967e292..d14e0a4 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -35,7 +35,7 @@
*/
#include "core.h"
-#include "link.h"
+#include "node.h"
#include "discover.h"
/* min delay during bearer start up */
@@ -120,30 +120,24 @@ static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr,
* @buf: buffer containing message
* @bearer: bearer that message arrived on
*/
-void tipc_disc_rcv(struct net *net, struct sk_buff *buf,
+void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
struct tipc_bearer *bearer)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_node *node;
- struct tipc_link *link;
struct tipc_media_addr maddr;
- struct sk_buff *rbuf;
- struct tipc_msg *msg = buf_msg(buf);
- u32 ddom = msg_dest_domain(msg);
- u32 onode = msg_prevnode(msg);
- u32 net_id = msg_bc_netid(msg);
- u32 mtyp = msg_type(msg);
- u32 signature = msg_node_sig(msg);
- u16 caps = msg_node_capabilities(msg);
- bool addr_match = false;
- bool sign_match = false;
- bool link_up = false;
- bool accept_addr = false;
- bool accept_sign = false;
+ struct sk_buff *rskb;
+ struct tipc_msg *hdr = buf_msg(skb);
+ u32 ddom = msg_dest_domain(hdr);
+ u32 onode = msg_prevnode(hdr);
+ u32 net_id = msg_bc_netid(hdr);
+ u32 mtyp = msg_type(hdr);
+ u32 signature = msg_node_sig(hdr);
+ u16 caps = msg_node_capabilities(hdr);
bool respond = false;
+ bool dupl_addr = false;
- bearer->media->msg2addr(bearer, &maddr, msg_media_addr(msg));
- kfree_skb(buf);
+ bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr));
+ kfree_skb(skb);
/* Ensure message from node is valid and communication is permitted */
if (net_id != tn->net_id)
@@ -165,102 +159,20 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf,
if (!tipc_in_scope(bearer->domain, onode))
return;
- node = tipc_node_create(net, onode);
- if (!node)
- return;
- tipc_node_lock(node);
- node->capabilities = caps;
- link = node->links[bearer->identity];
-
- /* Prepare to validate requesting node's signature and media address */
- sign_match = (signature == node->signature);
- addr_match = link && !memcmp(&link->media_addr, &maddr, sizeof(maddr));
- link_up = link && tipc_link_is_up(link);
-
-
- /* These three flags give us eight permutations: */
-
- if (sign_match && addr_match && link_up) {
- /* All is fine. Do nothing. */
- } else if (sign_match && addr_match && !link_up) {
- /* Respond. The link will come up in due time */
- respond = true;
- } else if (sign_match && !addr_match && link_up) {
- /* Peer has changed i/f address without rebooting.
- * If so, the link will reset soon, and the next
- * discovery will be accepted. So we can ignore it.
- * It may also be an cloned or malicious peer having
- * chosen the same node address and signature as an
- * existing one.
- * Ignore requests until the link goes down, if ever.
- */
- disc_dupl_alert(bearer, onode, &maddr);
- } else if (sign_match && !addr_match && !link_up) {
- /* Peer link has changed i/f address without rebooting.
- * It may also be a cloned or malicious peer; we can't
- * distinguish between the two.
- * The signature is correct, so we must accept.
- */
- accept_addr = true;
- respond = true;
- } else if (!sign_match && addr_match && link_up) {
- /* Peer node rebooted. Two possibilities:
- * - Delayed re-discovery; this link endpoint has already
- * reset and re-established contact with the peer, before
- * receiving a discovery message from that node.
- * (The peer happened to receive one from this node first).
- * - The peer came back so fast that our side has not
- * discovered it yet. Probing from this side will soon
- * reset the link, since there can be no working link
- * endpoint at the peer end, and the link will re-establish.
- * Accept the signature, since it comes from a known peer.
- */
- accept_sign = true;
- } else if (!sign_match && addr_match && !link_up) {
- /* The peer node has rebooted.
- * Accept signature, since it is a known peer.
- */
- accept_sign = true;
- respond = true;
- } else if (!sign_match && !addr_match && link_up) {
- /* Peer rebooted with new address, or a new/duplicate peer.
- * Ignore until the link goes down, if ever.
- */
+ tipc_node_check_dest(net, onode, bearer, caps, signature,
+ &maddr, &respond, &dupl_addr);
+ if (dupl_addr)
disc_dupl_alert(bearer, onode, &maddr);
- } else if (!sign_match && !addr_match && !link_up) {
- /* Peer rebooted with new address, or it is a new peer.
- * Accept signature and address.
- */
- accept_sign = true;
- accept_addr = true;
- respond = true;
- }
-
- if (accept_sign)
- node->signature = signature;
-
- if (accept_addr) {
- if (!link)
- link = tipc_link_create(node, bearer, &maddr);
- if (link) {
- memcpy(&link->media_addr, &maddr, sizeof(maddr));
- tipc_link_reset(link);
- } else {
- respond = false;
- }
- }
/* Send response, if necessary */
if (respond && (mtyp == DSC_REQ_MSG)) {
- rbuf = tipc_buf_acquire(MAX_H_SIZE);
- if (rbuf) {
- tipc_disc_init_msg(net, rbuf, DSC_RESP_MSG, bearer);
- tipc_bearer_send(net, bearer->identity, rbuf, &maddr);
- kfree_skb(rbuf);
+ rskb = tipc_buf_acquire(MAX_H_SIZE);
+ if (rskb) {
+ tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer);
+ tipc_bearer_send(net, bearer->identity, rskb, &maddr);
+ kfree_skb(rskb);
}
}
- tipc_node_unlock(node);
- tipc_node_put(node);
}
/**
diff --git a/net/tipc/link.c b/net/tipc/link.c
index eaa9fe5..f067e54 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -48,9 +48,8 @@
/*
* Error message prefixes
*/
-static const char *link_co_err = "Link changeover error, ";
+static const char *link_co_err = "Link tunneling error, ";
static const char *link_rst_msg = "Resetting link ";
-static const char *link_unk_evt = "Unknown link event ";
static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
[TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC },
@@ -77,256 +76,413 @@ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = {
};
/*
+ * Interval between NACKs when packets arrive out of order
+ */
+#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2)
+/*
* Out-of-range value for link session numbers
*/
-#define INVALID_SESSION 0x10000
+#define WILDCARD_SESSION 0x10000
-/*
- * Link state events:
+/* Link FSM states:
*/
-#define STARTING_EVT 856384768 /* link processing trigger */
-#define TRAFFIC_MSG_EVT 560815u /* rx'd ??? */
-#define SILENCE_EVT 560817u /* timer dicovered silence from peer */
+enum {
+ LINK_ESTABLISHED = 0xe,
+ LINK_ESTABLISHING = 0xe << 4,
+ LINK_RESET = 0x1 << 8,
+ LINK_RESETTING = 0x2 << 12,
+ LINK_PEER_RESET = 0xd << 16,
+ LINK_FAILINGOVER = 0xf << 20,
+ LINK_SYNCHING = 0xc << 24
+};
-/*
- * State value stored in 'failover_pkts'
+/* Link FSM state checking routines
*/
-#define FIRST_FAILOVER 0xffffu
-
-static void link_handle_out_of_seq_msg(struct tipc_link *link,
- struct sk_buff *skb);
-static void tipc_link_proto_rcv(struct tipc_link *link,
- struct sk_buff *skb);
-static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol);
-static void link_state_event(struct tipc_link *l_ptr, u32 event);
+static int link_is_up(struct tipc_link *l)
+{
+ return l->state & (LINK_ESTABLISHED | LINK_SYNCHING);
+}
+
+static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq);
+static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
+ u16 rcvgap, int tolerance, int priority,
+ struct sk_buff_head *xmitq);
static void link_reset_statistics(struct tipc_link *l_ptr);
static void link_print(struct tipc_link *l_ptr, const char *str);
-static void tipc_link_sync_xmit(struct tipc_link *l);
static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf);
-static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb);
-static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb);
-static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb);
-static void link_set_timer(struct tipc_link *link, unsigned long time);
+
/*
- * Simple link routines
+ * Simple non-static link routines (i.e. referenced outside this file)
*/
-static unsigned int align(unsigned int i)
+bool tipc_link_is_up(struct tipc_link *l)
{
- return (i + 3) & ~3u;
+ return link_is_up(l);
}
-static void tipc_link_release(struct kref *kref)
+bool tipc_link_is_reset(struct tipc_link *l)
{
- kfree(container_of(kref, struct tipc_link, ref));
+ return l->state & (LINK_RESET | LINK_FAILINGOVER | LINK_ESTABLISHING);
}
-static void tipc_link_get(struct tipc_link *l_ptr)
+bool tipc_link_is_synching(struct tipc_link *l)
{
- kref_get(&l_ptr->ref);
+ return l->state == LINK_SYNCHING;
}
-static void tipc_link_put(struct tipc_link *l_ptr)
+bool tipc_link_is_failingover(struct tipc_link *l)
{
- kref_put(&l_ptr->ref, tipc_link_release);
+ return l->state == LINK_FAILINGOVER;
}
-static struct tipc_link *tipc_parallel_link(struct tipc_link *l)
+bool tipc_link_is_blocked(struct tipc_link *l)
{
- if (l->owner->active_links[0] != l)
- return l->owner->active_links[0];
- return l->owner->active_links[1];
+ return l->state & (LINK_RESETTING | LINK_PEER_RESET | LINK_FAILINGOVER);
}
-/*
- * Simple non-static link routines (i.e. referenced outside this file)
- */
-int tipc_link_is_up(struct tipc_link *l_ptr)
+int tipc_link_is_active(struct tipc_link *l)
{
- if (!l_ptr)
- return 0;
- return link_working_working(l_ptr) || link_working_unknown(l_ptr);
+ struct tipc_node *n = l->owner;
+
+ return (node_active_link(n, 0) == l) || (node_active_link(n, 1) == l);
}
-int tipc_link_is_active(struct tipc_link *l_ptr)
+static u32 link_own_addr(struct tipc_link *l)
{
- return (l_ptr->owner->active_links[0] == l_ptr) ||
- (l_ptr->owner->active_links[1] == l_ptr);
+ return msg_prevnode(l->pmsg);
}
/**
- * link_timeout - handle expiration of link timer
- * @l_ptr: pointer to link
+ * tipc_link_create - create a new link
+ * @n: pointer to associated node
+ * @b: pointer to associated bearer
+ * @ownnode: identity of own node
+ * @peer: identity of peer node
+ * @maddr: media address to be used
+ * @inputq: queue to put messages ready for delivery
+ * @namedq: queue to put binding table update messages ready for delivery
+ * @link: return value, pointer to put the created link
+ *
+ * Returns true if link was created, otherwise false
*/
-static void link_timeout(unsigned long data)
+bool tipc_link_create(struct tipc_node *n, struct tipc_bearer *b, u32 session,
+ u32 ownnode, u32 peer, struct tipc_media_addr *maddr,
+ struct sk_buff_head *inputq, struct sk_buff_head *namedq,
+ struct tipc_link **link)
{
- struct tipc_link *l_ptr = (struct tipc_link *)data;
- struct sk_buff *skb;
+ struct tipc_link *l;
+ struct tipc_msg *hdr;
+ char *if_name;
+
+ l = kzalloc(sizeof(*l), GFP_ATOMIC);
+ if (!l)
+ return false;
+ *link = l;
+
+ /* Note: peer i/f name is completed by reset/activate message */
+ if_name = strchr(b->name, ':') + 1;
+ sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown",
+ tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode),
+ if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer));
+
+ l->addr = peer;
+ l->media_addr = maddr;
+ l->owner = n;
+ l->peer_session = WILDCARD_SESSION;
+ l->bearer_id = b->identity;
+ l->tolerance = b->tolerance;
+ l->net_plane = b->net_plane;
+ l->advertised_mtu = b->mtu;
+ l->mtu = b->mtu;
+ l->priority = b->priority;
+ tipc_link_set_queue_limits(l, b->window);
+ l->inputq = inputq;
+ l->namedq = namedq;
+ l->state = LINK_RESETTING;
+ l->pmsg = (struct tipc_msg *)&l->proto_msg;
+ hdr = l->pmsg;
+ tipc_msg_init(ownnode, hdr, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, peer);
+ msg_set_size(hdr, sizeof(l->proto_msg));
+ msg_set_session(hdr, session);
+ msg_set_bearer_id(hdr, l->bearer_id);
+ strcpy((char *)msg_data(hdr), if_name);
+ __skb_queue_head_init(&l->transmq);
+ __skb_queue_head_init(&l->backlogq);
+ __skb_queue_head_init(&l->deferdq);
+ skb_queue_head_init(&l->wakeupq);
+ skb_queue_head_init(l->inputq);
+ return true;
+}
- tipc_node_lock(l_ptr->owner);
+/* tipc_link_build_bcast_sync_msg() - synchronize broadcast link endpoints.
+ *
+ * Give a newly added peer node the sequence number where it should
+ * start receiving and acking broadcast packets.
+ */
+void tipc_link_build_bcast_sync_msg(struct tipc_link *l,
+ struct sk_buff_head *xmitq)
+{
+ struct sk_buff *skb;
+ struct sk_buff_head list;
+ u16 last_sent;
- /* update counters used in statistical profiling of send traffic */
- l_ptr->stats.accu_queue_sz += skb_queue_len(&l_ptr->transmq);
- l_ptr->stats.queue_sz_counts++;
+ skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE,
+ 0, l->addr, link_own_addr(l), 0, 0, 0);
+ if (!skb)
+ return;
+ last_sent = tipc_bclink_get_last_sent(l->owner->net);
+ msg_set_last_bcast(buf_msg(skb), last_sent);
+ __skb_queue_head_init(&list);
+ __skb_queue_tail(&list, skb);
+ tipc_link_xmit(l, &list, xmitq);
+}
- skb = skb_peek(&l_ptr->transmq);
- if (skb) {
- struct tipc_msg *msg = buf_msg(skb);
- u32 length = msg_size(msg);
+/**
+ * tipc_link_fsm_evt - link finite state machine
+ * @l: pointer to link
+ * @evt: state machine event to be processed
+ */
+int tipc_link_fsm_evt(struct tipc_link *l, int evt)
+{
+ int rc = 0;
- if ((msg_user(msg) == MSG_FRAGMENTER) &&
- (msg_type(msg) == FIRST_FRAGMENT)) {
- length = msg_size(msg_get_wrapped(msg));
+ switch (l->state) {
+ case LINK_RESETTING:
+ switch (evt) {
+ case LINK_PEER_RESET_EVT:
+ l->state = LINK_PEER_RESET;
+ break;
+ case LINK_RESET_EVT:
+ l->state = LINK_RESET;
+ break;
+ case LINK_FAILURE_EVT:
+ case LINK_FAILOVER_BEGIN_EVT:
+ case LINK_ESTABLISH_EVT:
+ case LINK_FAILOVER_END_EVT:
+ case LINK_SYNCH_BEGIN_EVT:
+ case LINK_SYNCH_END_EVT:
+ default:
+ goto illegal_evt;
}
- if (length) {
- l_ptr->stats.msg_lengths_total += length;
- l_ptr->stats.msg_length_counts++;
- if (length <= 64)
- l_ptr->stats.msg_length_profile[0]++;
- else if (length <= 256)
- l_ptr->stats.msg_length_profile[1]++;
- else if (length <= 1024)
- l_ptr->stats.msg_length_profile[2]++;
- else if (length <= 4096)
- l_ptr->stats.msg_length_profile[3]++;
- else if (length <= 16384)
- l_ptr->stats.msg_length_profile[4]++;
- else if (length <= 32768)
- l_ptr->stats.msg_length_profile[5]++;
- else
- l_ptr->stats.msg_length_profile[6]++;
+ break;
+ case LINK_RESET:
+ switch (evt) {
+ case LINK_PEER_RESET_EVT:
+ l->state = LINK_ESTABLISHING;
+ break;
+ case LINK_FAILOVER_BEGIN_EVT:
+ l->state = LINK_FAILINGOVER;
+ case LINK_FAILURE_EVT:
+ case LINK_RESET_EVT:
+ case LINK_ESTABLISH_EVT:
+ case LINK_FAILOVER_END_EVT:
+ break;
+ case LINK_SYNCH_BEGIN_EVT:
+ case LINK_SYNCH_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case LINK_PEER_RESET:
+ switch (evt) {
+ case LINK_RESET_EVT:
+ l->state = LINK_ESTABLISHING;
+ break;
+ case LINK_PEER_RESET_EVT:
+ case LINK_ESTABLISH_EVT:
+ case LINK_FAILURE_EVT:
+ break;
+ case LINK_SYNCH_BEGIN_EVT:
+ case LINK_SYNCH_END_EVT:
+ case LINK_FAILOVER_BEGIN_EVT:
+ case LINK_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
}
+ break;
+ case LINK_FAILINGOVER:
+ switch (evt) {
+ case LINK_FAILOVER_END_EVT:
+ l->state = LINK_RESET;
+ break;
+ case LINK_PEER_RESET_EVT:
+ case LINK_RESET_EVT:
+ case LINK_ESTABLISH_EVT:
+ case LINK_FAILURE_EVT:
+ break;
+ case LINK_FAILOVER_BEGIN_EVT:
+ case LINK_SYNCH_BEGIN_EVT:
+ case LINK_SYNCH_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case LINK_ESTABLISHING:
+ switch (evt) {
+ case LINK_ESTABLISH_EVT:
+ l->state = LINK_ESTABLISHED;
+ rc |= TIPC_LINK_UP_EVT;
+ break;
+ case LINK_FAILOVER_BEGIN_EVT:
+ l->state = LINK_FAILINGOVER;
+ break;
+ case LINK_PEER_RESET_EVT:
+ case LINK_RESET_EVT:
+ case LINK_FAILURE_EVT:
+ case LINK_SYNCH_BEGIN_EVT:
+ case LINK_FAILOVER_END_EVT:
+ break;
+ case LINK_SYNCH_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case LINK_ESTABLISHED:
+ switch (evt) {
+ case LINK_PEER_RESET_EVT:
+ l->state = LINK_PEER_RESET;
+ rc |= TIPC_LINK_DOWN_EVT;
+ break;
+ case LINK_FAILURE_EVT:
+ l->state = LINK_RESETTING;
+ rc |= TIPC_LINK_DOWN_EVT;
+ break;
+ case LINK_RESET_EVT:
+ l->state = LINK_RESET;
+ break;
+ case LINK_ESTABLISH_EVT:
+ break;
+ case LINK_SYNCH_BEGIN_EVT:
+ l->state = LINK_SYNCHING;
+ break;
+ case LINK_SYNCH_END_EVT:
+ case LINK_FAILOVER_BEGIN_EVT:
+ case LINK_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case LINK_SYNCHING:
+ switch (evt) {
+ case LINK_PEER_RESET_EVT:
+ l->state = LINK_PEER_RESET;
+ rc |= TIPC_LINK_DOWN_EVT;
+ break;
+ case LINK_FAILURE_EVT:
+ l->state = LINK_RESETTING;
+ rc |= TIPC_LINK_DOWN_EVT;
+ break;
+ case LINK_RESET_EVT:
+ l->state = LINK_RESET;
+ break;
+ case LINK_ESTABLISH_EVT:
+ case LINK_SYNCH_BEGIN_EVT:
+ break;
+ case LINK_SYNCH_END_EVT:
+ l->state = LINK_ESTABLISHED;
+ break;
+ case LINK_FAILOVER_BEGIN_EVT:
+ case LINK_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ default:
+ pr_err("Unknown FSM state %x in %s\n", l->state, l->name);
}
-
- /* do all other link processing performed on a periodic basis */
- if (l_ptr->silent_intv_cnt || tipc_bclink_acks_missing(l_ptr->owner))
- link_state_event(l_ptr, SILENCE_EVT);
- l_ptr->silent_intv_cnt++;
- if (skb_queue_len(&l_ptr->backlogq))
- tipc_link_push_packets(l_ptr);
- link_set_timer(l_ptr, l_ptr->keepalive_intv);
- tipc_node_unlock(l_ptr->owner);
- tipc_link_put(l_ptr);
-}
-
-static void link_set_timer(struct tipc_link *link, unsigned long time)
-{
- if (!mod_timer(&link->timer, jiffies + time))
- tipc_link_get(link);
+ return rc;
+illegal_evt:
+ pr_err("Illegal FSM event %x in state %x on link %s\n",
+ evt, l->state, l->name);
+ return rc;
}
-/**
- * tipc_link_create - create a new link
- * @n_ptr: pointer to associated node
- * @b_ptr: pointer to associated bearer
- * @media_addr: media address to use when sending messages over link
- *
- * Returns pointer to link.
+/* link_profile_stats - update statistical profiling of traffic
*/
-struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
- struct tipc_bearer *b_ptr,
- const struct tipc_media_addr *media_addr)
+static void link_profile_stats(struct tipc_link *l)
{
- struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id);
- struct tipc_link *l_ptr;
+ struct sk_buff *skb;
struct tipc_msg *msg;
- char *if_name;
- char addr_string[16];
- u32 peer = n_ptr->addr;
+ int length;
- if (n_ptr->link_cnt >= MAX_BEARERS) {
- tipc_addr_string_fill(addr_string, n_ptr->addr);
- pr_err("Cannot establish %uth link to %s. Max %u allowed.\n",
- n_ptr->link_cnt, addr_string, MAX_BEARERS);
- return NULL;
- }
+ /* Update counters used in statistical profiling of send traffic */
+ l->stats.accu_queue_sz += skb_queue_len(&l->transmq);
+ l->stats.queue_sz_counts++;
- if (n_ptr->links[b_ptr->identity]) {
- tipc_addr_string_fill(addr_string, n_ptr->addr);
- pr_err("Attempt to establish second link on <%s> to %s\n",
- b_ptr->name, addr_string);
- return NULL;
- }
+ skb = skb_peek(&l->transmq);
+ if (!skb)
+ return;
+ msg = buf_msg(skb);
+ length = msg_size(msg);
- l_ptr = kzalloc(sizeof(*l_ptr), GFP_ATOMIC);
- if (!l_ptr) {
- pr_warn("Link creation failed, no memory\n");
- return NULL;
+ if (msg_user(msg) == MSG_FRAGMENTER) {
+ if (msg_type(msg) != FIRST_FRAGMENT)
+ return;
+ length = msg_size(msg_get_wrapped(msg));
}
- kref_init(&l_ptr->ref);
- l_ptr->addr = peer;
- if_name = strchr(b_ptr->name, ':') + 1;
- sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:unknown",
- tipc_zone(tn->own_addr), tipc_cluster(tn->own_addr),
- tipc_node(tn->own_addr),
- if_name,
- tipc_zone(peer), tipc_cluster(peer), tipc_node(peer));
- /* note: peer i/f name is updated by reset/activate message */
- memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr));
- l_ptr->owner = n_ptr;
- l_ptr->peer_session = INVALID_SESSION;
- l_ptr->bearer_id = b_ptr->identity;
- link_set_supervision_props(l_ptr, b_ptr->tolerance);
- l_ptr->state = RESET_UNKNOWN;
-
- l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg;
- msg = l_ptr->pmsg;
- tipc_msg_init(tn->own_addr, msg, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE,
- l_ptr->addr);
- msg_set_size(msg, sizeof(l_ptr->proto_msg));
- msg_set_session(msg, (tn->random & 0xffff));
- msg_set_bearer_id(msg, b_ptr->identity);
- strcpy((char *)msg_data(msg), if_name);
- l_ptr->net_plane = b_ptr->net_plane;
- l_ptr->advertised_mtu = b_ptr->mtu;
- l_ptr->mtu = l_ptr->advertised_mtu;
- l_ptr->priority = b_ptr->priority;
- tipc_link_set_queue_limits(l_ptr, b_ptr->window);
- l_ptr->snd_nxt = 1;
- __skb_queue_head_init(&l_ptr->transmq);
- __skb_queue_head_init(&l_ptr->backlogq);
- __skb_queue_head_init(&l_ptr->deferdq);
- skb_queue_head_init(&l_ptr->wakeupq);
- skb_queue_head_init(&l_ptr->inputq);
- skb_queue_head_init(&l_ptr->namedq);
- link_reset_statistics(l_ptr);
- tipc_node_attach_link(n_ptr, l_ptr);
- setup_timer(&l_ptr->timer, link_timeout, (unsigned long)l_ptr);
- link_state_event(l_ptr, STARTING_EVT);
-
- return l_ptr;
+ l->stats.msg_lengths_total += length;
+ l->stats.msg_length_counts++;
+ if (length <= 64)
+ l->stats.msg_length_profile[0]++;
+ else if (length <= 256)
+ l->stats.msg_length_profile[1]++;
+ else if (length <= 1024)
+ l->stats.msg_length_profile[2]++;
+ else if (length <= 4096)
+ l->stats.msg_length_profile[3]++;
+ else if (length <= 16384)
+ l->stats.msg_length_profile[4]++;
+ else if (length <= 32768)
+ l->stats.msg_length_profile[5]++;
+ else
+ l->stats.msg_length_profile[6]++;
}
-/**
- * tipc_link_delete - Delete a link
- * @l: link to be deleted
+/* tipc_link_timeout - perform periodic task as instructed from node timeout
*/
-void tipc_link_delete(struct tipc_link *l)
+int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
{
- tipc_link_reset(l);
- if (del_timer(&l->timer))
- tipc_link_put(l);
- l->flags |= LINK_STOPPED;
- /* Delete link now, or when timer is finished: */
- tipc_link_reset_fragments(l);
- tipc_node_detach_link(l->owner, l);
- tipc_link_put(l);
-}
+ int rc = 0;
+ int mtyp = STATE_MSG;
+ bool xmit = false;
+ bool prb = false;
+
+ link_profile_stats(l);
+
+ switch (l->state) {
+ case LINK_ESTABLISHED:
+ case LINK_SYNCHING:
+ if (!l->silent_intv_cnt) {
+ if (tipc_bclink_acks_missing(l->owner))
+ xmit = true;
+ } else if (l->silent_intv_cnt <= l->abort_limit) {
+ xmit = true;
+ prb = true;
+ } else {
+ rc |= tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
+ }
+ l->silent_intv_cnt++;
+ break;
+ case LINK_RESET:
+ xmit = true;
+ mtyp = RESET_MSG;
+ break;
+ case LINK_ESTABLISHING:
+ xmit = true;
+ mtyp = ACTIVATE_MSG;
+ break;
+ case LINK_PEER_RESET:
+ case LINK_RESETTING:
+ case LINK_FAILINGOVER:
+ break;
+ default:
+ break;
+ }
-void tipc_link_delete_list(struct net *net, unsigned int bearer_id)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_link *link;
- struct tipc_node *node;
+ if (xmit)
+ tipc_link_build_proto_msg(l, mtyp, prb, 0, 0, 0, xmitq);
- rcu_read_lock();
- list_for_each_entry_rcu(node, &tn->node_list, list) {
- tipc_node_lock(node);
- link = node->links[bearer_id];
- if (link)
- tipc_link_delete(link);
- tipc_node_unlock(node);
- }
- rcu_read_unlock();
+ return rc;
}
/**
@@ -334,7 +490,7 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id)
* @link: congested link
* @list: message that was attempted sent
* Create pseudo msg to send back to user when congestion abates
- * Only consumes message if there is an error
+ * Does not consume buffer list
*/
static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
{
@@ -347,8 +503,7 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
/* This really cannot happen... */
if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
- tipc_link_reset(link);
- goto err;
+ return -ENOBUFS;
}
/* Non-blocking sender: */
if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
@@ -358,15 +513,12 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
addr, addr, oport, 0, 0);
if (!skb)
- goto err;
+ return -ENOBUFS;
TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list);
TIPC_SKB_CB(skb)->chain_imp = imp;
skb_queue_tail(&link->wakeupq, skb);
link->stats.link_congs++;
return -ELINKCONG;
-err:
- __skb_queue_purge(list);
- return -ENOBUFS;
}
/**
@@ -388,9 +540,7 @@ void link_prepare_wakeup(struct tipc_link *l)
if ((pnd[imp] + l->backlog[imp].len) >= lim)
break;
skb_unlink(skb, &l->wakeupq);
- skb_queue_tail(&l->inputq, skb);
- l->owner->inputq = &l->inputq;
- l->owner->action_flags |= TIPC_MSG_EVT;
+ skb_queue_tail(l->inputq, skb);
}
}
@@ -426,208 +576,36 @@ void tipc_link_purge_queues(struct tipc_link *l_ptr)
tipc_link_reset_fragments(l_ptr);
}
-void tipc_link_reset(struct tipc_link *l_ptr)
+void tipc_link_reset(struct tipc_link *l)
{
- u32 prev_state = l_ptr->state;
- int was_active_link = tipc_link_is_active(l_ptr);
- struct tipc_node *owner = l_ptr->owner;
- struct tipc_link *pl = tipc_parallel_link(l_ptr);
-
- msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff));
+ tipc_link_fsm_evt(l, LINK_RESET_EVT);
/* Link is down, accept any session */
- l_ptr->peer_session = INVALID_SESSION;
-
- /* Prepare for renewed mtu size negotiation */
- l_ptr->mtu = l_ptr->advertised_mtu;
-
- l_ptr->state = RESET_UNKNOWN;
+ l->peer_session = WILDCARD_SESSION;
- if ((prev_state == RESET_UNKNOWN) || (prev_state == RESET_RESET))
- return;
-
- tipc_node_link_down(l_ptr->owner, l_ptr);
- tipc_bearer_remove_dest(owner->net, l_ptr->bearer_id, l_ptr->addr);
+ /* If peer is up, it only accepts an incremented session number */
+ msg_set_session(l->pmsg, msg_session(l->pmsg) + 1);
- if (was_active_link && tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) {
- l_ptr->flags |= LINK_FAILINGOVER;
- l_ptr->failover_checkpt = l_ptr->rcv_nxt;
- pl->failover_pkts = FIRST_FAILOVER;
- pl->failover_checkpt = l_ptr->rcv_nxt;
- pl->failover_skb = l_ptr->reasm_buf;
- } else {
- kfree_skb(l_ptr->reasm_buf);
- }
- /* Clean up all queues, except inputq: */
- __skb_queue_purge(&l_ptr->transmq);
- __skb_queue_purge(&l_ptr->deferdq);
- if (!owner->inputq)
- owner->inputq = &l_ptr->inputq;
- skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq);
- if (!skb_queue_empty(owner->inputq))
- owner->action_flags |= TIPC_MSG_EVT;
- tipc_link_purge_backlog(l_ptr);
- l_ptr->reasm_buf = NULL;
- l_ptr->rcv_unacked = 0;
- l_ptr->snd_nxt = 1;
- l_ptr->silent_intv_cnt = 0;
- l_ptr->stale_count = 0;
- link_reset_statistics(l_ptr);
-}
-
-static void link_activate(struct tipc_link *link)
-{
- struct tipc_node *node = link->owner;
-
- link->rcv_nxt = 1;
- link->stats.recv_info = 1;
- link->silent_intv_cnt = 0;
- tipc_node_link_up(node, link);
- tipc_bearer_add_dest(node->net, link->bearer_id, link->addr);
-}
-
-/**
- * link_state_event - link finite state machine
- * @l_ptr: pointer to link
- * @event: state machine event to process
- */
-static void link_state_event(struct tipc_link *l_ptr, unsigned int event)
-{
- struct tipc_link *other;
- unsigned long timer_intv = l_ptr->keepalive_intv;
-
- if (l_ptr->flags & LINK_STOPPED)
- return;
-
- if (!(l_ptr->flags & LINK_STARTED) && (event != STARTING_EVT))
- return; /* Not yet. */
-
- if (l_ptr->flags & LINK_FAILINGOVER)
- return;
-
- switch (l_ptr->state) {
- case WORKING_WORKING:
- switch (event) {
- case TRAFFIC_MSG_EVT:
- case ACTIVATE_MSG:
- l_ptr->silent_intv_cnt = 0;
- break;
- case SILENCE_EVT:
- if (!l_ptr->silent_intv_cnt) {
- if (tipc_bclink_acks_missing(l_ptr->owner))
- tipc_link_proto_xmit(l_ptr, STATE_MSG,
- 0, 0, 0, 0);
- break;
- }
- l_ptr->state = WORKING_UNKNOWN;
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0);
- break;
- case RESET_MSG:
- pr_debug("%s<%s>, requested by peer\n",
- link_rst_msg, l_ptr->name);
- tipc_link_reset(l_ptr);
- l_ptr->state = RESET_RESET;
- tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
- 0, 0, 0, 0);
- break;
- default:
- pr_debug("%s%u in WW state\n", link_unk_evt, event);
- }
- break;
- case WORKING_UNKNOWN:
- switch (event) {
- case TRAFFIC_MSG_EVT:
- case ACTIVATE_MSG:
- l_ptr->state = WORKING_WORKING;
- l_ptr->silent_intv_cnt = 0;
- break;
- case RESET_MSG:
- pr_debug("%s<%s>, requested by peer while probing\n",
- link_rst_msg, l_ptr->name);
- tipc_link_reset(l_ptr);
- l_ptr->state = RESET_RESET;
- tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
- 0, 0, 0, 0);
- break;
- case SILENCE_EVT:
- if (!l_ptr->silent_intv_cnt) {
- l_ptr->state = WORKING_WORKING;
- if (tipc_bclink_acks_missing(l_ptr->owner))
- tipc_link_proto_xmit(l_ptr, STATE_MSG,
- 0, 0, 0, 0);
- } else if (l_ptr->silent_intv_cnt <
- l_ptr->abort_limit) {
- tipc_link_proto_xmit(l_ptr, STATE_MSG,
- 1, 0, 0, 0);
- } else { /* Link has failed */
- pr_debug("%s<%s>, peer not responding\n",
- link_rst_msg, l_ptr->name);
- tipc_link_reset(l_ptr);
- l_ptr->state = RESET_UNKNOWN;
- tipc_link_proto_xmit(l_ptr, RESET_MSG,
- 0, 0, 0, 0);
- }
- break;
- default:
- pr_err("%s%u in WU state\n", link_unk_evt, event);
- }
- break;
- case RESET_UNKNOWN:
- switch (event) {
- case TRAFFIC_MSG_EVT:
- break;
- case ACTIVATE_MSG:
- other = l_ptr->owner->active_links[0];
- if (other && link_working_unknown(other))
- break;
- l_ptr->state = WORKING_WORKING;
- link_activate(l_ptr);
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0);
- if (l_ptr->owner->working_links == 1)
- tipc_link_sync_xmit(l_ptr);
- break;
- case RESET_MSG:
- l_ptr->state = RESET_RESET;
- tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
- 1, 0, 0, 0);
- break;
- case STARTING_EVT:
- l_ptr->flags |= LINK_STARTED;
- link_set_timer(l_ptr, timer_intv);
- break;
- case SILENCE_EVT:
- tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0);
- break;
- default:
- pr_err("%s%u in RU state\n", link_unk_evt, event);
- }
- break;
- case RESET_RESET:
- switch (event) {
- case TRAFFIC_MSG_EVT:
- case ACTIVATE_MSG:
- other = l_ptr->owner->active_links[0];
- if (other && link_working_unknown(other))
- break;
- l_ptr->state = WORKING_WORKING;
- link_activate(l_ptr);
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0);
- if (l_ptr->owner->working_links == 1)
- tipc_link_sync_xmit(l_ptr);
- break;
- case RESET_MSG:
- break;
- case SILENCE_EVT:
- tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
- 0, 0, 0, 0);
- break;
- default:
- pr_err("%s%u in RR state\n", link_unk_evt, event);
- }
- break;
- default:
- pr_err("Unknown link state %u/%u\n", l_ptr->state, event);
- }
+ /* Prepare for renewed mtu size negotiation */
+ l->mtu = l->advertised_mtu;
+
+ /* Clean up all queues: */
+ __skb_queue_purge(&l->transmq);
+ __skb_queue_purge(&l->deferdq);
+ skb_queue_splice_init(&l->wakeupq, l->inputq);
+
+ tipc_link_purge_backlog(l);
+ kfree_skb(l->reasm_buf);
+ kfree_skb(l->failover_reasm_skb);
+ l->reasm_buf = NULL;
+ l->failover_reasm_skb = NULL;
+ l->rcv_unacked = 0;
+ l->snd_nxt = 1;
+ l->rcv_nxt = 1;
+ l->silent_intv_cnt = 0;
+ l->stats.recv_info = 0;
+ l->stale_count = 0;
+ link_reset_statistics(l);
}
/**
@@ -635,8 +613,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event)
* @link: link to use
* @list: chain of buffers containing message
*
- * Consumes the buffer chain, except when returning -ELINKCONG,
- * since the caller then may want to make more send attempts.
+ * Consumes the buffer chain, except when returning an error code,
* Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
* Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
*/
@@ -650,7 +627,7 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link,
u16 ack = mod(link->rcv_nxt - 1);
u16 seqno = link->snd_nxt;
u16 bc_last_in = link->owner->bclink.last_in;
- struct tipc_media_addr *addr = &link->media_addr;
+ struct tipc_media_addr *addr = link->media_addr;
struct sk_buff_head *transmq = &link->transmq;
struct sk_buff_head *backlogq = &link->backlogq;
struct sk_buff *skb, *bskb;
@@ -660,10 +637,9 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link,
if (unlikely(link->backlog[i].len >= link->backlog[i].limit))
return link_schedule_user(link, list);
}
- if (unlikely(msg_size(msg) > mtu)) {
- __skb_queue_purge(list);
+ if (unlikely(msg_size(msg) > mtu))
return -EMSGSIZE;
- }
+
/* Prepare each packet for sending, and add to relevant queue: */
while (skb_queue_len(list)) {
skb = skb_peek(list);
@@ -700,101 +676,76 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link,
return 0;
}
-static void skb2list(struct sk_buff *skb, struct sk_buff_head *list)
-{
- skb_queue_head_init(list);
- __skb_queue_tail(list, skb);
-}
-
-static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb)
-{
- struct sk_buff_head head;
-
- skb2list(skb, &head);
- return __tipc_link_xmit(link->owner->net, link, &head);
-}
-
-/* tipc_link_xmit_skb(): send single buffer to destination
- * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE
- * messages, which will not be rejected
- * The only exception is datagram messages rerouted after secondary
- * lookup, which are rare and safe to dispose of anyway.
- * TODO: Return real return value, and let callers use
- * tipc_wait_for_sendpkt() where applicable
- */
-int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
- u32 selector)
-{
- struct sk_buff_head head;
- int rc;
-
- skb2list(skb, &head);
- rc = tipc_link_xmit(net, &head, dnode, selector);
- if (rc == -ELINKCONG)
- kfree_skb(skb);
- return 0;
-}
-
/**
- * tipc_link_xmit() is the general link level function for message sending
- * @net: the applicable net namespace
+ * tipc_link_xmit(): enqueue buffer list according to queue situation
+ * @link: link to use
* @list: chain of buffers containing message
- * @dsz: amount of user data to be sent
- * @dnode: address of destination node
- * @selector: a number used for deterministic link selection
- * Consumes the buffer chain, except when returning -ELINKCONG
- * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
+ * @xmitq: returned list of packets to be sent by caller
+ *
+ * Consumes the buffer chain, except when returning -ELINKCONG,
+ * since the caller then may want to make more send attempts.
+ * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
+ * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
*/
-int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
- u32 selector)
+int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
+ struct sk_buff_head *xmitq)
{
- struct tipc_link *link = NULL;
- struct tipc_node *node;
- int rc = -EHOSTUNREACH;
+ struct tipc_msg *hdr = buf_msg(skb_peek(list));
+ unsigned int maxwin = l->window;
+ unsigned int i, imp = msg_importance(hdr);
+ unsigned int mtu = l->mtu;
+ u16 ack = l->rcv_nxt - 1;
+ u16 seqno = l->snd_nxt;
+ u16 bc_last_in = l->owner->bclink.last_in;
+ struct sk_buff_head *transmq = &l->transmq;
+ struct sk_buff_head *backlogq = &l->backlogq;
+ struct sk_buff *skb, *_skb, *bskb;
- node = tipc_node_find(net, dnode);
- if (node) {
- tipc_node_lock(node);
- link = node->active_links[selector & 1];
- if (link)
- rc = __tipc_link_xmit(net, link, list);
- tipc_node_unlock(node);
- tipc_node_put(node);
- }
- if (link)
- return rc;
-
- if (likely(in_own_node(net, dnode))) {
- tipc_sk_rcv(net, list);
- return 0;
+ /* Match msg importance against this and all higher backlog limits: */
+ for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
+ if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
+ return link_schedule_user(l, list);
}
+ if (unlikely(msg_size(hdr) > mtu))
+ return -EMSGSIZE;
- __skb_queue_purge(list);
- return rc;
-}
-
-/*
- * tipc_link_sync_xmit - synchronize broadcast link endpoints.
- *
- * Give a newly added peer node the sequence number where it should
- * start receiving and acking broadcast packets.
- *
- * Called with node locked
- */
-static void tipc_link_sync_xmit(struct tipc_link *link)
-{
- struct sk_buff *skb;
- struct tipc_msg *msg;
-
- skb = tipc_buf_acquire(INT_H_SIZE);
- if (!skb)
- return;
+ /* Prepare each packet for sending, and add to relevant queue: */
+ while (skb_queue_len(list)) {
+ skb = skb_peek(list);
+ hdr = buf_msg(skb);
+ msg_set_seqno(hdr, seqno);
+ msg_set_ack(hdr, ack);
+ msg_set_bcast_ack(hdr, bc_last_in);
- msg = buf_msg(skb);
- tipc_msg_init(link_own_addr(link), msg, BCAST_PROTOCOL, STATE_MSG,
- INT_H_SIZE, link->addr);
- msg_set_last_bcast(msg, link->owner->bclink.acked);
- __tipc_link_xmit_skb(link, skb);
+ if (likely(skb_queue_len(transmq) < maxwin)) {
+ _skb = skb_clone(skb, GFP_ATOMIC);
+ if (!_skb)
+ return -ENOBUFS;
+ __skb_dequeue(list);
+ __skb_queue_tail(transmq, skb);
+ __skb_queue_tail(xmitq, _skb);
+ l->rcv_unacked = 0;
+ seqno++;
+ continue;
+ }
+ if (tipc_msg_bundle(skb_peek_tail(backlogq), hdr, mtu)) {
+ kfree_skb(__skb_dequeue(list));
+ l->stats.sent_bundled++;
+ continue;
+ }
+ if (tipc_msg_make_bundle(&bskb, hdr, mtu, l->addr)) {
+ kfree_skb(__skb_dequeue(list));
+ __skb_queue_tail(backlogq, bskb);
+ l->backlog[msg_importance(buf_msg(bskb))].len++;
+ l->stats.sent_bundled++;
+ l->stats.sent_bundles++;
+ continue;
+ }
+ l->backlog[imp].len += skb_queue_len(list);
+ skb_queue_splice_tail_init(list, backlogq);
+ }
+ l->snd_nxt = seqno;
+ return 0;
}
/*
@@ -842,29 +793,37 @@ void tipc_link_push_packets(struct tipc_link *link)
link->rcv_unacked = 0;
__skb_queue_tail(&link->transmq, skb);
tipc_bearer_send(link->owner->net, link->bearer_id,
- skb, &link->media_addr);
+ skb, link->media_addr);
}
link->snd_nxt = seqno;
}
-void tipc_link_reset_all(struct tipc_node *node)
+void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
{
- char addr_string[16];
- u32 i;
-
- tipc_node_lock(node);
+ struct sk_buff *skb, *_skb;
+ struct tipc_msg *hdr;
+ u16 seqno = l->snd_nxt;
+ u16 ack = l->rcv_nxt - 1;
- pr_warn("Resetting all links to %s\n",
- tipc_addr_string_fill(addr_string, node->addr));
-
- for (i = 0; i < MAX_BEARERS; i++) {
- if (node->links[i]) {
- link_print(node->links[i], "Resetting link\n");
- tipc_link_reset(node->links[i]);
- }
+ while (skb_queue_len(&l->transmq) < l->window) {
+ skb = skb_peek(&l->backlogq);
+ if (!skb)
+ break;
+ _skb = skb_clone(skb, GFP_ATOMIC);
+ if (!_skb)
+ break;
+ __skb_dequeue(&l->backlogq);
+ hdr = buf_msg(skb);
+ l->backlog[msg_importance(hdr)].len--;
+ __skb_queue_tail(&l->transmq, skb);
+ __skb_queue_tail(xmitq, _skb);
+ msg_set_ack(hdr, ack);
+ msg_set_seqno(hdr, seqno);
+ msg_set_bcast_ack(hdr, l->owner->bclink.last_in);
+ l->rcv_unacked = 0;
+ seqno++;
}
-
- tipc_node_unlock(node);
+ l->snd_nxt = seqno;
}
static void link_retransmit_failure(struct tipc_link *l_ptr,
@@ -877,9 +836,12 @@ static void link_retransmit_failure(struct tipc_link *l_ptr,
if (l_ptr->addr) {
/* Handle failure on standard link */
- link_print(l_ptr, "Resetting link\n");
- tipc_link_reset(l_ptr);
-
+ link_print(l_ptr, "Resetting link ");
+ pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n",
+ msg_user(msg), msg_type(msg), msg_size(msg),
+ msg_errcode(msg));
+ pr_info("sqno %u, prev: %x, src: %x\n",
+ msg_seqno(msg), msg_prevnode(msg), msg_orignode(msg));
} else {
/* Handle failure on broadcast link */
struct tipc_node *n_ptr;
@@ -934,191 +896,45 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb,
msg_set_ack(msg, mod(l_ptr->rcv_nxt - 1));
msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, skb,
- &l_ptr->media_addr);
+ l_ptr->media_addr);
retransmits--;
l_ptr->stats.retransmitted++;
}
}
-/* link_synch(): check if all packets arrived before the synch
- * point have been consumed
- * Returns true if the parallel links are synched, otherwise false
- */
-static bool link_synch(struct tipc_link *l)
+static int tipc_link_retransm(struct tipc_link *l, int retransm,
+ struct sk_buff_head *xmitq)
{
- unsigned int post_synch;
- struct tipc_link *pl;
+ struct sk_buff *_skb, *skb = skb_peek(&l->transmq);
+ struct tipc_msg *hdr;
- pl = tipc_parallel_link(l);
- if (pl == l)
- goto synched;
-
- /* Was last pre-synch packet added to input queue ? */
- if (less_eq(pl->rcv_nxt, l->synch_point))
- return false;
-
- /* Is it still in the input queue ? */
- post_synch = mod(pl->rcv_nxt - l->synch_point) - 1;
- if (skb_queue_len(&pl->inputq) > post_synch)
- return false;
-synched:
- l->flags &= ~LINK_SYNCHING;
- return true;
-}
-
-static void link_retrieve_defq(struct tipc_link *link,
- struct sk_buff_head *list)
-{
- u16 seq_no;
-
- if (skb_queue_empty(&link->deferdq))
- return;
-
- seq_no = buf_seqno(skb_peek(&link->deferdq));
- if (seq_no == link->rcv_nxt)
- skb_queue_splice_tail_init(&link->deferdq, list);
-}
-
-/**
- * tipc_rcv - process TIPC packets/messages arriving from off-node
- * @net: the applicable net namespace
- * @skb: TIPC packet
- * @b_ptr: pointer to bearer message arrived on
- *
- * Invoked with no locks held. Bearer pointer must point to a valid bearer
- * structure (i.e. cannot be NULL), but bearer can be inactive.
- */
-void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct sk_buff_head head;
- struct tipc_node *n_ptr;
- struct tipc_link *l_ptr;
- struct sk_buff *skb1, *tmp;
- struct tipc_msg *msg;
- u16 seq_no;
- u16 ackd;
- u32 released;
-
- skb2list(skb, &head);
-
- while ((skb = __skb_dequeue(&head))) {
- /* Ensure message is well-formed */
- if (unlikely(!tipc_msg_validate(skb)))
- goto discard;
-
- /* Handle arrival of a non-unicast link message */
- msg = buf_msg(skb);
- if (unlikely(msg_non_seq(msg))) {
- if (msg_user(msg) == LINK_CONFIG)
- tipc_disc_rcv(net, skb, b_ptr);
- else
- tipc_bclink_rcv(net, skb);
- continue;
- }
-
- /* Discard unicast link messages destined for another node */
- if (unlikely(!msg_short(msg) &&
- (msg_destnode(msg) != tn->own_addr)))
- goto discard;
-
- /* Locate neighboring node that sent message */
- n_ptr = tipc_node_find(net, msg_prevnode(msg));
- if (unlikely(!n_ptr))
- goto discard;
-
- tipc_node_lock(n_ptr);
- /* Locate unicast link endpoint that should handle message */
- l_ptr = n_ptr->links[b_ptr->identity];
- if (unlikely(!l_ptr))
- goto unlock;
-
- /* Verify that communication with node is currently allowed */
- if ((n_ptr->action_flags & TIPC_WAIT_PEER_LINKS_DOWN) &&
- msg_user(msg) == LINK_PROTOCOL &&
- (msg_type(msg) == RESET_MSG ||
- msg_type(msg) == ACTIVATE_MSG) &&
- !msg_redundant_link(msg))
- n_ptr->action_flags &= ~TIPC_WAIT_PEER_LINKS_DOWN;
-
- if (tipc_node_blocked(n_ptr))
- goto unlock;
-
- /* Validate message sequence number info */
- seq_no = msg_seqno(msg);
- ackd = msg_ack(msg);
-
- /* Release acked messages */
- if (unlikely(n_ptr->bclink.acked != msg_bcast_ack(msg)))
- tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg));
-
- released = 0;
- skb_queue_walk_safe(&l_ptr->transmq, skb1, tmp) {
- if (more(buf_seqno(skb1), ackd))
- break;
- __skb_unlink(skb1, &l_ptr->transmq);
- kfree_skb(skb1);
- released = 1;
- }
-
- /* Try sending any messages link endpoint has pending */
- if (unlikely(skb_queue_len(&l_ptr->backlogq)))
- tipc_link_push_packets(l_ptr);
-
- if (released && !skb_queue_empty(&l_ptr->wakeupq))
- link_prepare_wakeup(l_ptr);
-
- /* Process the incoming packet */
- if (unlikely(!link_working_working(l_ptr))) {
- if (msg_user(msg) == LINK_PROTOCOL) {
- tipc_link_proto_rcv(l_ptr, skb);
- link_retrieve_defq(l_ptr, &head);
- skb = NULL;
- goto unlock;
- }
-
- /* Traffic message. Conditionally activate link */
- link_state_event(l_ptr, TRAFFIC_MSG_EVT);
-
- if (link_working_working(l_ptr)) {
- /* Re-insert buffer in front of queue */
- __skb_queue_head(&head, skb);
- skb = NULL;
- goto unlock;
- }
- goto unlock;
- }
-
- /* Link is now in state WORKING_WORKING */
- if (unlikely(seq_no != l_ptr->rcv_nxt)) {
- link_handle_out_of_seq_msg(l_ptr, skb);
- link_retrieve_defq(l_ptr, &head);
- skb = NULL;
- goto unlock;
- }
- l_ptr->silent_intv_cnt = 0;
+ if (!skb)
+ return 0;
- /* Synchronize with parallel link if applicable */
- if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) {
- if (!link_synch(l_ptr))
- goto unlock;
- }
- l_ptr->rcv_nxt++;
- if (unlikely(!skb_queue_empty(&l_ptr->deferdq)))
- link_retrieve_defq(l_ptr, &head);
- if (unlikely(++l_ptr->rcv_unacked >= TIPC_MIN_LINK_WIN)) {
- l_ptr->stats.sent_acks++;
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0);
- }
- tipc_link_input(l_ptr, skb);
- skb = NULL;
-unlock:
- tipc_node_unlock(n_ptr);
- tipc_node_put(n_ptr);
-discard:
- if (unlikely(skb))
- kfree_skb(skb);
+ /* Detect repeated retransmit failures on same packet */
+ if (likely(l->last_retransm != buf_seqno(skb))) {
+ l->last_retransm = buf_seqno(skb);
+ l->stale_count = 1;
+ } else if (++l->stale_count > 100) {
+ link_retransmit_failure(l, skb);
+ return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
+ }
+ skb_queue_walk(&l->transmq, skb) {
+ if (!retransm)
+ return 0;
+ hdr = buf_msg(skb);
+ _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC);
+ if (!_skb)
+ return 0;
+ hdr = buf_msg(_skb);
+ msg_set_ack(hdr, l->rcv_nxt - 1);
+ msg_set_bcast_ack(hdr, l->owner->bclink.last_in);
+ _skb->priority = TC_PRIO_CONTROL;
+ __skb_queue_tail(xmitq, _skb);
+ retransm--;
+ l->stats.retransmitted++;
}
+ return 0;
}
/* tipc_data_input - deliver data and name distr msgs to upper layer
@@ -1126,29 +942,22 @@ discard:
* Consumes buffer if message is of right type
* Node lock must be held
*/
-static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb)
+static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb,
+ struct sk_buff_head *inputq)
{
struct tipc_node *node = link->owner;
- struct tipc_msg *msg = buf_msg(skb);
- u32 dport = msg_destport(msg);
- switch (msg_user(msg)) {
+ switch (msg_user(buf_msg(skb))) {
case TIPC_LOW_IMPORTANCE:
case TIPC_MEDIUM_IMPORTANCE:
case TIPC_HIGH_IMPORTANCE:
case TIPC_CRITICAL_IMPORTANCE:
case CONN_MANAGER:
- if (tipc_skb_queue_tail(&link->inputq, skb, dport)) {
- node->inputq = &link->inputq;
- node->action_flags |= TIPC_MSG_EVT;
- }
+ __skb_queue_tail(inputq, skb);
return true;
case NAME_DISTRIBUTOR:
node->bclink.recv_permitted = true;
- node->namedq = &link->namedq;
- skb_queue_tail(&link->namedq, skb);
- if (skb_queue_len(&link->namedq) == 1)
- node->action_flags |= TIPC_NAMED_MSG_EVT;
+ skb_queue_tail(link->namedq, skb);
return true;
case MSG_BUNDLER:
case TUNNEL_PROTOCOL:
@@ -1165,54 +974,160 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb)
/* tipc_link_input - process packet that has passed link protocol check
*
* Consumes buffer
- * Node lock must be held
*/
-static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb)
+static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *inputq)
{
- struct tipc_node *node = link->owner;
- struct tipc_msg *msg = buf_msg(skb);
+ struct tipc_node *node = l->owner;
+ struct tipc_msg *hdr = buf_msg(skb);
+ struct sk_buff **reasm_skb = &l->reasm_buf;
struct sk_buff *iskb;
+ int usr = msg_user(hdr);
+ int rc = 0;
int pos = 0;
+ int ipos = 0;
- if (likely(tipc_data_input(link, skb)))
- return;
+ if (unlikely(usr == TUNNEL_PROTOCOL)) {
+ if (msg_type(hdr) == SYNCH_MSG) {
+ __skb_queue_purge(&l->deferdq);
+ goto drop;
+ }
+ if (!tipc_msg_extract(skb, &iskb, &ipos))
+ return rc;
+ kfree_skb(skb);
+ skb = iskb;
+ hdr = buf_msg(skb);
+ if (less(msg_seqno(hdr), l->drop_point))
+ goto drop;
+ if (tipc_data_input(l, skb, inputq))
+ return rc;
+ usr = msg_user(hdr);
+ reasm_skb = &l->failover_reasm_skb;
+ }
- switch (msg_user(msg)) {
- case TUNNEL_PROTOCOL:
- if (msg_dup(msg)) {
- link->flags |= LINK_SYNCHING;
- link->synch_point = msg_seqno(msg_get_wrapped(msg));
- kfree_skb(skb);
- break;
+ if (usr == MSG_BUNDLER) {
+ l->stats.recv_bundles++;
+ l->stats.recv_bundled += msg_msgcnt(hdr);
+ while (tipc_msg_extract(skb, &iskb, &pos))
+ tipc_data_input(l, iskb, inputq);
+ return 0;
+ } else if (usr == MSG_FRAGMENTER) {
+ l->stats.recv_fragments++;
+ if (tipc_buf_append(reasm_skb, &skb)) {
+ l->stats.recv_fragmented++;
+ tipc_data_input(l, skb, inputq);
+ } else if (!*reasm_skb) {
+ return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
}
- if (!tipc_link_failover_rcv(link, &skb))
- break;
- if (msg_user(buf_msg(skb)) != MSG_BUNDLER) {
- tipc_data_input(link, skb);
+ return 0;
+ } else if (usr == BCAST_PROTOCOL) {
+ tipc_link_sync_rcv(node, skb);
+ return 0;
+ }
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked)
+{
+ bool released = false;
+ struct sk_buff *skb, *tmp;
+
+ skb_queue_walk_safe(&l->transmq, skb, tmp) {
+ if (more(buf_seqno(skb), acked))
break;
+ __skb_unlink(skb, &l->transmq);
+ kfree_skb(skb);
+ released = true;
+ }
+ return released;
+}
+
+/* tipc_link_rcv - process TIPC packets/messages arriving from off-node
+ * @link: the link that should handle the message
+ * @skb: TIPC packet
+ * @xmitq: queue to place packets to be sent after this call
+ */
+int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
+{
+ struct sk_buff_head *arrvq = &l->deferdq;
+ struct sk_buff_head tmpq;
+ struct tipc_msg *hdr;
+ u16 seqno, rcv_nxt;
+ int rc = 0;
+
+ __skb_queue_head_init(&tmpq);
+
+ if (unlikely(!__tipc_skb_queue_sorted(arrvq, skb))) {
+ if (!(skb_queue_len(arrvq) % TIPC_NACK_INTV))
+ tipc_link_build_proto_msg(l, STATE_MSG, 0,
+ 0, 0, 0, xmitq);
+ return rc;
+ }
+
+ while ((skb = skb_peek(arrvq))) {
+ hdr = buf_msg(skb);
+
+ /* Verify and update link state */
+ if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) {
+ __skb_dequeue(arrvq);
+ rc = tipc_link_proto_rcv(l, skb, xmitq);
+ continue;
}
- case MSG_BUNDLER:
- link->stats.recv_bundles++;
- link->stats.recv_bundled += msg_msgcnt(msg);
- while (tipc_msg_extract(skb, &iskb, &pos))
- tipc_data_input(link, iskb);
- break;
- case MSG_FRAGMENTER:
- link->stats.recv_fragments++;
- if (tipc_buf_append(&link->reasm_buf, &skb)) {
- link->stats.recv_fragmented++;
- tipc_data_input(link, skb);
- } else if (!link->reasm_buf) {
- tipc_link_reset(link);
+ if (unlikely(!link_is_up(l))) {
+ rc = tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT);
+ if (!link_is_up(l)) {
+ kfree_skb(__skb_dequeue(arrvq));
+ goto exit;
+ }
}
- break;
- case BCAST_PROTOCOL:
- tipc_link_sync_rcv(node, skb);
- break;
- default:
- break;
- };
+
+ l->silent_intv_cnt = 0;
+
+ /* Forward queues and wake up waiting users */
+ if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) {
+ tipc_link_advance_backlog(l, xmitq);
+ if (unlikely(!skb_queue_empty(&l->wakeupq)))
+ link_prepare_wakeup(l);
+ }
+
+ /* Defer reception if there is a gap in the sequence */
+ seqno = msg_seqno(hdr);
+ rcv_nxt = l->rcv_nxt;
+ if (unlikely(less(rcv_nxt, seqno))) {
+ l->stats.deferred_recv++;
+ goto exit;
+ }
+
+ __skb_dequeue(arrvq);
+
+ /* Drop if packet already received */
+ if (unlikely(more(rcv_nxt, seqno))) {
+ l->stats.duplicates++;
+ kfree_skb(skb);
+ goto exit;
+ }
+
+ /* Packet can be delivered */
+ l->rcv_nxt++;
+ l->stats.recv_info++;
+ if (unlikely(!tipc_data_input(l, skb, &tmpq)))
+ rc = tipc_link_input(l, skb, &tmpq);
+
+ /* Ack at regular intervals */
+ if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) {
+ l->rcv_unacked = 0;
+ l->stats.sent_acks++;
+ tipc_link_build_proto_msg(l, STATE_MSG,
+ 0, 0, 0, 0, xmitq);
+ }
+ }
+exit:
+ tipc_skb_queue_splice_tail(&tmpq, l->inputq);
+ return rc;
}
/**
@@ -1255,458 +1170,249 @@ u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *skb)
}
/*
- * link_handle_out_of_seq_msg - handle arrival of out-of-sequence packet
+ * Send protocol message to the other endpoint.
*/
-static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr,
- struct sk_buff *buf)
+void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, int probe_msg,
+ u32 gap, u32 tolerance, u32 priority)
{
- u32 seq_no = buf_seqno(buf);
+ struct sk_buff *skb = NULL;
+ struct sk_buff_head xmitq;
- if (likely(msg_user(buf_msg(buf)) == LINK_PROTOCOL)) {
- tipc_link_proto_rcv(l_ptr, buf);
+ __skb_queue_head_init(&xmitq);
+ tipc_link_build_proto_msg(l, msg_typ, probe_msg, gap,
+ tolerance, priority, &xmitq);
+ skb = __skb_dequeue(&xmitq);
+ if (!skb)
return;
- }
-
- /* Record OOS packet arrival */
- l_ptr->silent_intv_cnt = 0;
+ tipc_bearer_send(l->owner->net, l->bearer_id, skb, l->media_addr);
+ l->rcv_unacked = 0;
+ kfree_skb(skb);
+}
- /*
- * Discard packet if a duplicate; otherwise add it to deferred queue
- * and notify peer of gap as per protocol specification
- */
- if (less(seq_no, l_ptr->rcv_nxt)) {
- l_ptr->stats.duplicates++;
- kfree_skb(buf);
+/* tipc_link_build_proto_msg: prepare link protocol message for transmission
+ */
+static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
+ u16 rcvgap, int tolerance, int priority,
+ struct sk_buff_head *xmitq)
+{
+ struct sk_buff *skb = NULL;
+ struct tipc_msg *hdr = l->pmsg;
+ u16 snd_nxt = l->snd_nxt;
+ u16 rcv_nxt = l->rcv_nxt;
+ u16 rcv_last = rcv_nxt - 1;
+ int node_up = l->owner->bclink.recv_permitted;
+
+ /* Don't send protocol message during reset or link failover */
+ if (tipc_link_is_blocked(l))
return;
- }
- if (tipc_link_defer_pkt(&l_ptr->deferdq, buf)) {
- l_ptr->stats.deferred_recv++;
- if ((skb_queue_len(&l_ptr->deferdq) % TIPC_MIN_LINK_WIN) == 1)
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0);
+ msg_set_type(hdr, mtyp);
+ msg_set_net_plane(hdr, l->net_plane);
+ msg_set_bcast_ack(hdr, l->owner->bclink.last_in);
+ msg_set_last_bcast(hdr, tipc_bclink_get_last_sent(l->owner->net));
+ msg_set_link_tolerance(hdr, tolerance);
+ msg_set_linkprio(hdr, priority);
+ msg_set_redundant_link(hdr, node_up);
+ msg_set_seq_gap(hdr, 0);
+
+ /* Compatibility: created msg must not be in sequence with pkt flow */
+ msg_set_seqno(hdr, snd_nxt + U16_MAX / 2);
+
+ if (mtyp == STATE_MSG) {
+ if (!tipc_link_is_up(l))
+ return;
+ msg_set_next_sent(hdr, snd_nxt);
+
+ /* Override rcvgap if there are packets in deferred queue */
+ if (!skb_queue_empty(&l->deferdq))
+ rcvgap = buf_seqno(skb_peek(&l->deferdq)) - rcv_nxt;
+ if (rcvgap) {
+ msg_set_seq_gap(hdr, rcvgap);
+ l->stats.sent_nacks++;
+ }
+ msg_set_ack(hdr, rcv_last);
+ msg_set_probe(hdr, probe);
+ if (probe)
+ l->stats.sent_probes++;
+ l->stats.sent_states++;
} else {
- l_ptr->stats.duplicates++;
+ /* RESET_MSG or ACTIVATE_MSG */
+ msg_set_max_pkt(hdr, l->advertised_mtu);
+ msg_set_ack(hdr, l->rcv_nxt - 1);
+ msg_set_next_sent(hdr, 1);
}
+ skb = tipc_buf_acquire(msg_size(hdr));
+ if (!skb)
+ return;
+ skb_copy_to_linear_data(skb, hdr, msg_size(hdr));
+ skb->priority = TC_PRIO_CONTROL;
+ __skb_queue_tail(xmitq, skb);
}
-/*
- * Send protocol message to the other endpoint.
+/* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets
+ * with contents of the link's tranmsit and backlog queues.
*/
-void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg,
- u32 gap, u32 tolerance, u32 priority)
+void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
+ int mtyp, struct sk_buff_head *xmitq)
{
- struct sk_buff *buf = NULL;
- struct tipc_msg *msg = l_ptr->pmsg;
- u32 msg_size = sizeof(l_ptr->proto_msg);
- int r_flag;
- u16 last_rcv;
-
- /* Don't send protocol message during link failover */
- if (l_ptr->flags & LINK_FAILINGOVER)
- return;
+ struct sk_buff *skb, *tnlskb;
+ struct tipc_msg *hdr, tnlhdr;
+ struct sk_buff_head *queue = &l->transmq;
+ struct sk_buff_head tmpxq, tnlq;
+ u16 pktlen, pktcnt, seqno = l->snd_nxt;
- /* Abort non-RESET send if communication with node is prohibited */
- if ((tipc_node_blocked(l_ptr->owner)) && (msg_typ != RESET_MSG))
+ if (!tnl)
return;
- /* Create protocol message with "out-of-sequence" sequence number */
- msg_set_type(msg, msg_typ);
- msg_set_net_plane(msg, l_ptr->net_plane);
- msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
- msg_set_last_bcast(msg, tipc_bclink_get_last_sent(l_ptr->owner->net));
-
- if (msg_typ == STATE_MSG) {
- u16 next_sent = l_ptr->snd_nxt;
+ skb_queue_head_init(&tnlq);
+ skb_queue_head_init(&tmpxq);
- if (!tipc_link_is_up(l_ptr))
+ /* At least one packet required for safe algorithm => add dummy */
+ skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG,
+ BASIC_H_SIZE, 0, l->addr, link_own_addr(l),
+ 0, 0, TIPC_ERR_NO_PORT);
+ if (!skb) {
+ pr_warn("%sunable to create tunnel packet\n", link_co_err);
+ return;
+ }
+ skb_queue_tail(&tnlq, skb);
+ tipc_link_xmit(l, &tnlq, &tmpxq);
+ __skb_queue_purge(&tmpxq);
+
+ /* Initialize reusable tunnel packet header */
+ tipc_msg_init(link_own_addr(l), &tnlhdr, TUNNEL_PROTOCOL,
+ mtyp, INT_H_SIZE, l->addr);
+ pktcnt = skb_queue_len(&l->transmq) + skb_queue_len(&l->backlogq);
+ msg_set_msgcnt(&tnlhdr, pktcnt);
+ msg_set_bearer_id(&tnlhdr, l->peer_bearer_id);
+tnl:
+ /* Wrap each packet into a tunnel packet */
+ skb_queue_walk(queue, skb) {
+ hdr = buf_msg(skb);
+ if (queue == &l->backlogq)
+ msg_set_seqno(hdr, seqno++);
+ pktlen = msg_size(hdr);
+ msg_set_size(&tnlhdr, pktlen + INT_H_SIZE);
+ tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE);
+ if (!tnlskb) {
+ pr_warn("%sunable to send packet\n", link_co_err);
return;
- msg_set_next_sent(msg, next_sent);
- if (!skb_queue_empty(&l_ptr->deferdq)) {
- last_rcv = buf_seqno(skb_peek(&l_ptr->deferdq));
- gap = mod(last_rcv - l_ptr->rcv_nxt);
}
- msg_set_seq_gap(msg, gap);
- if (gap)
- l_ptr->stats.sent_nacks++;
- msg_set_link_tolerance(msg, tolerance);
- msg_set_linkprio(msg, priority);
- msg_set_max_pkt(msg, l_ptr->mtu);
- msg_set_ack(msg, mod(l_ptr->rcv_nxt - 1));
- msg_set_probe(msg, probe_msg != 0);
- if (probe_msg)
- l_ptr->stats.sent_probes++;
- l_ptr->stats.sent_states++;
- } else { /* RESET_MSG or ACTIVATE_MSG */
- msg_set_ack(msg, mod(l_ptr->failover_checkpt - 1));
- msg_set_seq_gap(msg, 0);
- msg_set_next_sent(msg, 1);
- msg_set_probe(msg, 0);
- msg_set_link_tolerance(msg, l_ptr->tolerance);
- msg_set_linkprio(msg, l_ptr->priority);
- msg_set_max_pkt(msg, l_ptr->advertised_mtu);
+ skb_copy_to_linear_data(tnlskb, &tnlhdr, INT_H_SIZE);
+ skb_copy_to_linear_data_offset(tnlskb, INT_H_SIZE, hdr, pktlen);
+ __skb_queue_tail(&tnlq, tnlskb);
+ }
+ if (queue != &l->backlogq) {
+ queue = &l->backlogq;
+ goto tnl;
}
- r_flag = (l_ptr->owner->working_links > tipc_link_is_up(l_ptr));
- msg_set_redundant_link(msg, r_flag);
- msg_set_linkprio(msg, l_ptr->priority);
- msg_set_size(msg, msg_size);
-
- msg_set_seqno(msg, mod(l_ptr->snd_nxt + (0xffff / 2)));
-
- buf = tipc_buf_acquire(msg_size);
- if (!buf)
- return;
+ tipc_link_xmit(tnl, &tnlq, xmitq);
- skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg));
- buf->priority = TC_PRIO_CONTROL;
- tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, buf,
- &l_ptr->media_addr);
- l_ptr->rcv_unacked = 0;
- kfree_skb(buf);
+ if (mtyp == FAILOVER_MSG) {
+ tnl->drop_point = l->rcv_nxt;
+ tnl->failover_reasm_skb = l->reasm_buf;
+ l->reasm_buf = NULL;
+ }
}
-/*
- * Receive protocol message :
+/* tipc_link_proto_rcv(): receive link level protocol message :
* Note that network plane id propagates through the network, and may
- * change at any time. The node with lowest address rules
+ * change at any time. The node with lowest numerical id determines
+ * network plane
*/
-static void tipc_link_proto_rcv(struct tipc_link *l_ptr,
- struct sk_buff *buf)
+static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
{
- u32 rec_gap = 0;
- u32 msg_tol;
- struct tipc_msg *msg = buf_msg(buf);
+ struct tipc_msg *hdr = buf_msg(skb);
+ u16 rcvgap = 0;
+ u16 nacked_gap = msg_seq_gap(hdr);
+ u16 peers_snd_nxt = msg_next_sent(hdr);
+ u16 peers_tol = msg_link_tolerance(hdr);
+ u16 peers_prio = msg_linkprio(hdr);
+ char *if_name;
+ int rc = 0;
- if (l_ptr->flags & LINK_FAILINGOVER)
+ if (tipc_link_is_blocked(l))
goto exit;
- if (l_ptr->net_plane != msg_net_plane(msg))
- if (link_own_addr(l_ptr) > msg_prevnode(msg))
- l_ptr->net_plane = msg_net_plane(msg);
-
- switch (msg_type(msg)) {
+ if (link_own_addr(l) > msg_prevnode(hdr))
+ l->net_plane = msg_net_plane(hdr);
+ switch (msg_type(hdr)) {
case RESET_MSG:
- if (!link_working_unknown(l_ptr) &&
- (l_ptr->peer_session != INVALID_SESSION)) {
- if (less_eq(msg_session(msg), l_ptr->peer_session))
- break; /* duplicate or old reset: ignore */
- }
-
- if (!msg_redundant_link(msg) && (link_working_working(l_ptr) ||
- link_working_unknown(l_ptr))) {
- /*
- * peer has lost contact -- don't allow peer's links
- * to reactivate before we recognize loss & clean up
- */
- l_ptr->owner->action_flags |= TIPC_WAIT_OWN_LINKS_DOWN;
- }
-
- link_state_event(l_ptr, RESET_MSG);
+ /* Ignore duplicate RESET with old session number */
+ if ((less_eq(msg_session(hdr), l->peer_session)) &&
+ (l->peer_session != WILDCARD_SESSION))
+ break;
/* fall thru' */
- case ACTIVATE_MSG:
- /* Update link settings according other endpoint's values */
- strcpy((strrchr(l_ptr->name, ':') + 1), (char *)msg_data(msg));
- msg_tol = msg_link_tolerance(msg);
- if (msg_tol > l_ptr->tolerance)
- link_set_supervision_props(l_ptr, msg_tol);
-
- if (msg_linkprio(msg) > l_ptr->priority)
- l_ptr->priority = msg_linkprio(msg);
-
- if (l_ptr->mtu > msg_max_pkt(msg))
- l_ptr->mtu = msg_max_pkt(msg);
-
- /* Synchronize broadcast link info, if not done previously */
- if (!tipc_node_is_up(l_ptr->owner)) {
- l_ptr->owner->bclink.last_sent =
- l_ptr->owner->bclink.last_in =
- msg_last_bcast(msg);
- l_ptr->owner->bclink.oos_state = 0;
- }
-
- l_ptr->peer_session = msg_session(msg);
- l_ptr->peer_bearer_id = msg_bearer_id(msg);
-
- if (msg_type(msg) == ACTIVATE_MSG)
- link_state_event(l_ptr, ACTIVATE_MSG);
- break;
- case STATE_MSG:
+ case ACTIVATE_MSG:
- msg_tol = msg_link_tolerance(msg);
- if (msg_tol)
- link_set_supervision_props(l_ptr, msg_tol);
-
- if (msg_linkprio(msg) &&
- (msg_linkprio(msg) != l_ptr->priority)) {
- pr_debug("%s<%s>, priority change %u->%u\n",
- link_rst_msg, l_ptr->name,
- l_ptr->priority, msg_linkprio(msg));
- l_ptr->priority = msg_linkprio(msg);
- tipc_link_reset(l_ptr); /* Enforce change to take effect */
+ /* Complete own link name with peer's interface name */
+ if_name = strrchr(l->name, ':') + 1;
+ if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME)
break;
- }
-
- /* Record reception; force mismatch at next timeout: */
- l_ptr->silent_intv_cnt = 0;
-
- link_state_event(l_ptr, TRAFFIC_MSG_EVT);
- l_ptr->stats.recv_states++;
- if (link_reset_unknown(l_ptr))
+ if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME)
break;
+ strncpy(if_name, msg_data(hdr), TIPC_MAX_IF_NAME);
- if (less_eq(l_ptr->rcv_nxt, msg_next_sent(msg)))
- rec_gap = mod(msg_next_sent(msg) - l_ptr->rcv_nxt);
+ /* Update own tolerance if peer indicates a non-zero value */
+ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
+ l->tolerance = peers_tol;
- if (msg_probe(msg))
- l_ptr->stats.recv_probes++;
+ /* Update own priority if peer's priority is higher */
+ if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI))
+ l->priority = peers_prio;
- /* Protocol message before retransmits, reduce loss risk */
- if (l_ptr->owner->bclink.recv_permitted)
- tipc_bclink_update_link_state(l_ptr->owner,
- msg_last_bcast(msg));
-
- if (rec_gap || (msg_probe(msg))) {
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 0,
- rec_gap, 0, 0);
- }
- if (msg_seq_gap(msg)) {
- l_ptr->stats.recv_nacks++;
- tipc_link_retransmit(l_ptr, skb_peek(&l_ptr->transmq),
- msg_seq_gap(msg));
+ if (msg_type(hdr) == RESET_MSG) {
+ rc |= tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT);
+ } else if (!link_is_up(l)) {
+ tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT);
+ rc |= tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT);
}
+ l->peer_session = msg_session(hdr);
+ l->peer_bearer_id = msg_bearer_id(hdr);
+ if (l->mtu > msg_max_pkt(hdr))
+ l->mtu = msg_max_pkt(hdr);
break;
- }
-exit:
- kfree_skb(buf);
-}
-
-
-/* tipc_link_tunnel_xmit(): Tunnel one packet via a link belonging to
- * a different bearer. Owner node is locked.
- */
-static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr,
- struct tipc_msg *tunnel_hdr,
- struct tipc_msg *msg,
- u32 selector)
-{
- struct tipc_link *tunnel;
- struct sk_buff *skb;
- u32 length = msg_size(msg);
-
- tunnel = l_ptr->owner->active_links[selector & 1];
- if (!tipc_link_is_up(tunnel)) {
- pr_warn("%stunnel link no longer available\n", link_co_err);
- return;
- }
- msg_set_size(tunnel_hdr, length + INT_H_SIZE);
- skb = tipc_buf_acquire(length + INT_H_SIZE);
- if (!skb) {
- pr_warn("%sunable to send tunnel msg\n", link_co_err);
- return;
- }
- skb_copy_to_linear_data(skb, tunnel_hdr, INT_H_SIZE);
- skb_copy_to_linear_data_offset(skb, INT_H_SIZE, msg, length);
- __tipc_link_xmit_skb(tunnel, skb);
-}
-
-
-/* tipc_link_failover_send_queue(): A link has gone down, but a second
- * link is still active. We can do failover. Tunnel the failing link's
- * whole send queue via the remaining link. This way, we don't lose
- * any packets, and sequence order is preserved for subsequent traffic
- * sent over the remaining link. Owner node is locked.
- */
-void tipc_link_failover_send_queue(struct tipc_link *l_ptr)
-{
- int msgcount;
- struct tipc_link *tunnel = l_ptr->owner->active_links[0];
- struct tipc_msg tunnel_hdr;
- struct sk_buff *skb;
- int split_bundles;
-
- if (!tunnel)
- return;
- tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, TUNNEL_PROTOCOL,
- FAILOVER_MSG, INT_H_SIZE, l_ptr->addr);
-
- skb_queue_walk(&l_ptr->backlogq, skb) {
- msg_set_seqno(buf_msg(skb), l_ptr->snd_nxt);
- l_ptr->snd_nxt = mod(l_ptr->snd_nxt + 1);
- }
- skb_queue_splice_tail_init(&l_ptr->backlogq, &l_ptr->transmq);
- tipc_link_purge_backlog(l_ptr);
- msgcount = skb_queue_len(&l_ptr->transmq);
- msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id);
- msg_set_msgcnt(&tunnel_hdr, msgcount);
-
- if (skb_queue_empty(&l_ptr->transmq)) {
- skb = tipc_buf_acquire(INT_H_SIZE);
- if (skb) {
- skb_copy_to_linear_data(skb, &tunnel_hdr, INT_H_SIZE);
- msg_set_size(&tunnel_hdr, INT_H_SIZE);
- __tipc_link_xmit_skb(tunnel, skb);
- } else {
- pr_warn("%sunable to send changeover msg\n",
- link_co_err);
- }
- return;
- }
-
- split_bundles = (l_ptr->owner->active_links[0] !=
- l_ptr->owner->active_links[1]);
+ case STATE_MSG:
- skb_queue_walk(&l_ptr->transmq, skb) {
- struct tipc_msg *msg = buf_msg(skb);
+ /* Update own tolerance if peer indicates a non-zero value */
+ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
+ l->tolerance = peers_tol;
- if ((msg_user(msg) == MSG_BUNDLER) && split_bundles) {
- struct tipc_msg *m = msg_get_wrapped(msg);
- unchar *pos = (unchar *)m;
+ l->silent_intv_cnt = 0;
+ l->stats.recv_states++;
+ if (msg_probe(hdr))
+ l->stats.recv_probes++;
+ rc = tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT);
+ if (!link_is_up(l))
+ break;
- msgcount = msg_msgcnt(msg);
- while (msgcount--) {
- msg_set_seqno(m, msg_seqno(msg));
- tipc_link_tunnel_xmit(l_ptr, &tunnel_hdr, m,
- msg_link_selector(m));
- pos += align(msg_size(m));
- m = (struct tipc_msg *)pos;
- }
- } else {
- tipc_link_tunnel_xmit(l_ptr, &tunnel_hdr, msg,
- msg_link_selector(msg));
+ /* Send NACK if peer has sent pkts we haven't received yet */
+ if (more(peers_snd_nxt, l->rcv_nxt))
+ rcvgap = peers_snd_nxt - l->rcv_nxt;
+ if (rcvgap || (msg_probe(hdr)))
+ tipc_link_build_proto_msg(l, STATE_MSG, 0, rcvgap,
+ 0, 0, xmitq);
+ tipc_link_release_pkts(l, msg_ack(hdr));
+
+ /* If NACK, retransmit will now start at right position */
+ if (nacked_gap) {
+ rc = tipc_link_retransm(l, nacked_gap, xmitq);
+ l->stats.recv_nacks++;
}
- }
-}
-
-/* tipc_link_dup_queue_xmit(): A second link has become active. Tunnel a
- * duplicate of the first link's send queue via the new link. This way, we
- * are guaranteed that currently queued packets from a socket are delivered
- * before future traffic from the same socket, even if this is using the
- * new link. The last arriving copy of each duplicate packet is dropped at
- * the receiving end by the regular protocol check, so packet cardinality
- * and sequence order is preserved per sender/receiver socket pair.
- * Owner node is locked.
- */
-void tipc_link_dup_queue_xmit(struct tipc_link *link,
- struct tipc_link *tnl)
-{
- struct sk_buff *skb;
- struct tipc_msg tnl_hdr;
- struct sk_buff_head *queue = &link->transmq;
- int mcnt;
- u16 seqno;
-
- tipc_msg_init(link_own_addr(link), &tnl_hdr, TUNNEL_PROTOCOL,
- SYNCH_MSG, INT_H_SIZE, link->addr);
- mcnt = skb_queue_len(&link->transmq) + skb_queue_len(&link->backlogq);
- msg_set_msgcnt(&tnl_hdr, mcnt);
- msg_set_bearer_id(&tnl_hdr, link->peer_bearer_id);
-
-tunnel_queue:
- skb_queue_walk(queue, skb) {
- struct sk_buff *outskb;
- struct tipc_msg *msg = buf_msg(skb);
- u32 len = msg_size(msg);
- msg_set_ack(msg, mod(link->rcv_nxt - 1));
- msg_set_bcast_ack(msg, link->owner->bclink.last_in);
- msg_set_size(&tnl_hdr, len + INT_H_SIZE);
- outskb = tipc_buf_acquire(len + INT_H_SIZE);
- if (outskb == NULL) {
- pr_warn("%sunable to send duplicate msg\n",
- link_co_err);
- return;
- }
- skb_copy_to_linear_data(outskb, &tnl_hdr, INT_H_SIZE);
- skb_copy_to_linear_data_offset(outskb, INT_H_SIZE,
- skb->data, len);
- __tipc_link_xmit_skb(tnl, outskb);
- if (!tipc_link_is_up(link))
- return;
- }
- if (queue == &link->backlogq)
- return;
- seqno = link->snd_nxt;
- skb_queue_walk(&link->backlogq, skb) {
- msg_set_seqno(buf_msg(skb), seqno);
- seqno = mod(seqno + 1);
- }
- queue = &link->backlogq;
- goto tunnel_queue;
-}
-
-/* tipc_link_failover_rcv(): Receive a tunnelled FAILOVER_MSG packet
- * Owner node is locked.
- */
-static bool tipc_link_failover_rcv(struct tipc_link *link,
- struct sk_buff **skb)
-{
- struct tipc_msg *msg = buf_msg(*skb);
- struct sk_buff *iskb = NULL;
- struct tipc_link *pl = NULL;
- int bearer_id = msg_bearer_id(msg);
- int pos = 0;
-
- if (msg_type(msg) != FAILOVER_MSG) {
- pr_warn("%sunknown tunnel pkt received\n", link_co_err);
- goto exit;
- }
- if (bearer_id >= MAX_BEARERS)
- goto exit;
-
- if (bearer_id == link->bearer_id)
- goto exit;
-
- pl = link->owner->links[bearer_id];
- if (pl && tipc_link_is_up(pl))
- tipc_link_reset(pl);
-
- if (link->failover_pkts == FIRST_FAILOVER)
- link->failover_pkts = msg_msgcnt(msg);
-
- /* Should we expect an inner packet? */
- if (!link->failover_pkts)
- goto exit;
-
- if (!tipc_msg_extract(*skb, &iskb, &pos)) {
- pr_warn("%sno inner failover pkt\n", link_co_err);
- *skb = NULL;
- goto exit;
- }
- link->failover_pkts--;
- *skb = NULL;
-
- /* Was this packet already delivered? */
- if (less(buf_seqno(iskb), link->failover_checkpt)) {
- kfree_skb(iskb);
- iskb = NULL;
- goto exit;
- }
- if (msg_user(buf_msg(iskb)) == MSG_FRAGMENTER) {
- link->stats.recv_fragments++;
- tipc_buf_append(&link->failover_skb, &iskb);
+ tipc_link_advance_backlog(l, xmitq);
+ if (unlikely(!skb_queue_empty(&l->wakeupq)))
+ link_prepare_wakeup(l);
}
exit:
- if (!link->failover_pkts && pl)
- pl->flags &= ~LINK_FAILINGOVER;
- kfree_skb(*skb);
- *skb = iskb;
- return *skb;
-}
-
-static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol)
-{
- unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4;
-
- if ((tol < TIPC_MIN_LINK_TOL) || (tol > TIPC_MAX_LINK_TOL))
- return;
-
- l_ptr->tolerance = tol;
- l_ptr->keepalive_intv = msecs_to_jiffies(intv);
- l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->keepalive_intv));
+ kfree_skb(skb);
+ return rc;
}
void tipc_link_set_queue_limits(struct tipc_link *l, u32 win)
@@ -1743,7 +1449,7 @@ static struct tipc_node *tipc_link_find_owner(struct net *net,
list_for_each_entry_rcu(n_ptr, &tn->node_list, list) {
tipc_node_lock(n_ptr);
for (i = 0; i < MAX_BEARERS; i++) {
- l_ptr = n_ptr->links[i];
+ l_ptr = n_ptr->links[i].link;
if (l_ptr && !strcmp(l_ptr->name, link_name)) {
*bearer_id = i;
found_node = n_ptr;
@@ -1770,27 +1476,16 @@ static void link_reset_statistics(struct tipc_link *l_ptr)
l_ptr->stats.recv_info = l_ptr->rcv_nxt;
}
-static void link_print(struct tipc_link *l_ptr, const char *str)
+static void link_print(struct tipc_link *l, const char *str)
{
- struct tipc_net *tn = net_generic(l_ptr->owner->net, tipc_net_id);
- struct tipc_bearer *b_ptr;
-
- rcu_read_lock();
- b_ptr = rcu_dereference_rtnl(tn->bearer_list[l_ptr->bearer_id]);
- if (b_ptr)
- pr_info("%s Link %x<%s>:", str, l_ptr->addr, b_ptr->name);
- rcu_read_unlock();
-
- if (link_working_unknown(l_ptr))
- pr_cont(":WU\n");
- else if (link_reset_reset(l_ptr))
- pr_cont(":RR\n");
- else if (link_reset_unknown(l_ptr))
- pr_cont(":RU\n");
- else if (link_working_working(l_ptr))
- pr_cont(":WW\n");
- else
- pr_cont("\n");
+ struct sk_buff *hskb = skb_peek(&l->transmq);
+ u16 head = hskb ? msg_seqno(buf_msg(hskb)) : l->snd_nxt;
+ u16 tail = l->snd_nxt - 1;
+
+ pr_info("%s Link <%s> state %x\n", str, l->name, l->state);
+ pr_info("XMTQ: %u [%u-%u], BKLGQ: %u, SNDNX: %u, RCVNX: %u\n",
+ skb_queue_len(&l->transmq), head, tail,
+ skb_queue_len(&l->backlogq), l->snd_nxt, l->rcv_nxt);
}
/* Parse and validate nested (link) properties valid for media, bearer and link
@@ -1865,7 +1560,7 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info)
tipc_node_lock(node);
- link = node->links[bearer_id];
+ link = node->links[bearer_id].link;
if (!link) {
res = -EINVAL;
goto out;
@@ -1885,7 +1580,7 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info)
u32 tol;
tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
- link_set_supervision_props(link, tol);
+ link->tolerance = tol;
tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0);
}
if (props[TIPC_NLA_PROP_PRIO]) {
@@ -2055,10 +1750,11 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg,
for (i = *prev_link; i < MAX_BEARERS; i++) {
*prev_link = i;
- if (!node->links[i])
+ if (!node->links[i].link)
continue;
- err = __tipc_nl_add_link(net, msg, node->links[i], NLM_F_MULTI);
+ err = __tipc_nl_add_link(net, msg,
+ node->links[i].link, NLM_F_MULTI);
if (err)
return err;
}
@@ -2172,7 +1868,7 @@ int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
tipc_node_lock(node);
- link = node->links[bearer_id];
+ link = node->links[bearer_id].link;
if (!link) {
tipc_node_unlock(node);
nlmsg_free(msg.skb);
@@ -2227,7 +1923,7 @@ int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info)
tipc_node_lock(node);
- link = node->links[bearer_id];
+ link = node->links[bearer_id].link;
if (!link) {
tipc_node_unlock(node);
return -EINVAL;
diff --git a/net/tipc/link.h b/net/tipc/link.h
index ae0a0ea..39ff8b6 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -49,19 +49,25 @@
*/
#define INVALID_LINK_SEQ 0x10000
-/* Link working states
+/* Link FSM events:
*/
-#define WORKING_WORKING 560810u
-#define WORKING_UNKNOWN 560811u
-#define RESET_UNKNOWN 560812u
-#define RESET_RESET 560813u
+enum {
+ LINK_ESTABLISH_EVT = 0xec1ab1e,
+ LINK_PEER_RESET_EVT = 0x9eed0e,
+ LINK_FAILURE_EVT = 0xfa110e,
+ LINK_RESET_EVT = 0x10ca1d0e,
+ LINK_FAILOVER_BEGIN_EVT = 0xfa110bee,
+ LINK_FAILOVER_END_EVT = 0xfa110ede,
+ LINK_SYNCH_BEGIN_EVT = 0xc1ccbee,
+ LINK_SYNCH_END_EVT = 0xc1ccede
+};
-/* Link endpoint execution states
+/* Events returned from link at packet reception or at timeout
*/
-#define LINK_STARTED 0x0001
-#define LINK_STOPPED 0x0002
-#define LINK_SYNCHING 0x0004
-#define LINK_FAILINGOVER 0x0008
+enum {
+ TIPC_LINK_UP_EVT = 1,
+ TIPC_LINK_DOWN_EVT = (1 << 1)
+};
/* Starting value for maximum packet size negotiation on unicast links
* (unless bearer MTU is less)
@@ -106,7 +112,6 @@ struct tipc_stats {
* @timer: link timer
* @owner: pointer to peer node
* @refcnt: reference counter for permanent references (owner node & timer)
- * @flags: execution state flags for link endpoint instance
* @peer_session: link session # being used by peer end of link
* @peer_bearer_id: bearer id used by link's peer endpoint
* @bearer_id: local bearer id used by link
@@ -143,20 +148,17 @@ struct tipc_stats {
struct tipc_link {
u32 addr;
char name[TIPC_MAX_LINK_NAME];
- struct tipc_media_addr media_addr;
- struct timer_list timer;
+ struct tipc_media_addr *media_addr;
struct tipc_node *owner;
- struct kref ref;
/* Management and link supervision data */
- unsigned int flags;
u32 peer_session;
u32 peer_bearer_id;
u32 bearer_id;
u32 tolerance;
unsigned long keepalive_intv;
u32 abort_limit;
- int state;
+ u32 state;
u32 silent_intv_cnt;
struct {
unchar hdr[INT_H_SIZE];
@@ -165,12 +167,10 @@ struct tipc_link {
struct tipc_msg *pmsg;
u32 priority;
char net_plane;
- u16 synch_point;
- /* Failover */
- u16 failover_pkts;
- u16 failover_checkpt;
- struct sk_buff *failover_skb;
+ /* Failover/synch */
+ u16 drop_point;
+ struct sk_buff *failover_reasm_skb;
/* Max packet negotiation */
u16 mtu;
@@ -192,8 +192,8 @@ struct tipc_link {
u16 rcv_nxt;
u32 rcv_unacked;
struct sk_buff_head deferdq;
- struct sk_buff_head inputq;
- struct sk_buff_head namedq;
+ struct sk_buff_head *inputq;
+ struct sk_buff_head *namedq;
/* Congestion handling */
struct sk_buff_head wakeupq;
@@ -205,28 +205,29 @@ struct tipc_link {
struct tipc_stats stats;
};
-struct tipc_port;
-
-struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
- struct tipc_bearer *b_ptr,
- const struct tipc_media_addr *media_addr);
-void tipc_link_delete(struct tipc_link *link);
-void tipc_link_delete_list(struct net *net, unsigned int bearer_id);
-void tipc_link_failover_send_queue(struct tipc_link *l_ptr);
-void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, struct tipc_link *dest);
+bool tipc_link_create(struct tipc_node *n, struct tipc_bearer *b, u32 session,
+ u32 ownnode, u32 peer, struct tipc_media_addr *maddr,
+ struct sk_buff_head *inputq, struct sk_buff_head *namedq,
+ struct tipc_link **link);
+void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
+ int mtyp, struct sk_buff_head *xmitq);
+void tipc_link_build_bcast_sync_msg(struct tipc_link *l,
+ struct sk_buff_head *xmitq);
+int tipc_link_fsm_evt(struct tipc_link *l, int evt);
void tipc_link_reset_fragments(struct tipc_link *l_ptr);
-int tipc_link_is_up(struct tipc_link *l_ptr);
+bool tipc_link_is_up(struct tipc_link *l);
+bool tipc_link_is_reset(struct tipc_link *l);
+bool tipc_link_is_synching(struct tipc_link *l);
+bool tipc_link_is_failingover(struct tipc_link *l);
+bool tipc_link_is_blocked(struct tipc_link *l);
int tipc_link_is_active(struct tipc_link *l_ptr);
void tipc_link_purge_queues(struct tipc_link *l_ptr);
void tipc_link_purge_backlog(struct tipc_link *l);
-void tipc_link_reset_all(struct tipc_node *node);
void tipc_link_reset(struct tipc_link *l_ptr);
-int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest,
- u32 selector);
-int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dest,
- u32 selector);
int __tipc_link_xmit(struct net *net, struct tipc_link *link,
struct sk_buff_head *list);
+int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list,
+ struct sk_buff_head *xmitq);
void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int prob,
u32 gap, u32 tolerance, u32 priority);
void tipc_link_push_packets(struct tipc_link *l_ptr);
@@ -242,34 +243,8 @@ int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]);
-void link_prepare_wakeup(struct tipc_link *l);
-
-static inline u32 link_own_addr(struct tipc_link *l)
-{
- return msg_prevnode(l->pmsg);
-}
-
-/*
- * Link status checking routines
- */
-static inline int link_working_working(struct tipc_link *l_ptr)
-{
- return l_ptr->state == WORKING_WORKING;
-}
-
-static inline int link_working_unknown(struct tipc_link *l_ptr)
-{
- return l_ptr->state == WORKING_UNKNOWN;
-}
-
-static inline int link_reset_unknown(struct tipc_link *l_ptr)
-{
- return l_ptr->state == RESET_UNKNOWN;
-}
-
-static inline int link_reset_reset(struct tipc_link *l_ptr)
-{
- return l_ptr->state == RESET_RESET;
-}
+int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq);
+int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq);
#endif
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 08b4cc7..562c926 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -463,60 +463,72 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
/**
* tipc_msg_reverse(): swap source and destination addresses and add error code
- * @buf: buffer containing message to be reversed
- * @dnode: return value: node where to send message after reversal
- * @err: error code to be set in message
- * Consumes buffer if failure
+ * @own_node: originating node id for reversed message
+ * @skb: buffer containing message to be reversed; may be replaced.
+ * @err: error code to be set in message, if any
+ * Consumes buffer at failure
* Returns true if success, otherwise false
*/
-bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode,
- int err)
+bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
{
- struct tipc_msg *msg = buf_msg(buf);
+ struct sk_buff *_skb = *skb;
+ struct tipc_msg *hdr = buf_msg(_skb);
struct tipc_msg ohdr;
- uint rdsz = min_t(uint, msg_data_sz(msg), MAX_FORWARD_SIZE);
+ int dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE);
- if (skb_linearize(buf))
+ if (skb_linearize(_skb))
goto exit;
- msg = buf_msg(buf);
- if (msg_dest_droppable(msg))
+ hdr = buf_msg(_skb);
+ if (msg_dest_droppable(hdr))
goto exit;
- if (msg_errcode(msg))
+ if (msg_errcode(hdr))
goto exit;
- memcpy(&ohdr, msg, msg_hdr_sz(msg));
- msg_set_errcode(msg, err);
- msg_set_origport(msg, msg_destport(&ohdr));
- msg_set_destport(msg, msg_origport(&ohdr));
- msg_set_prevnode(msg, own_addr);
- if (!msg_short(msg)) {
- msg_set_orignode(msg, msg_destnode(&ohdr));
- msg_set_destnode(msg, msg_orignode(&ohdr));
+
+ /* Take a copy of original header before altering message */
+ memcpy(&ohdr, hdr, msg_hdr_sz(hdr));
+
+ /* Never return SHORT header; expand by replacing buffer if necessary */
+ if (msg_short(hdr)) {
+ *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen);
+ if (!*skb)
+ goto exit;
+ memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen);
+ kfree_skb(_skb);
+ _skb = *skb;
+ hdr = buf_msg(_skb);
+ memcpy(hdr, &ohdr, BASIC_H_SIZE);
+ msg_set_hdr_sz(hdr, BASIC_H_SIZE);
}
- msg_set_size(msg, msg_hdr_sz(msg) + rdsz);
- skb_trim(buf, msg_size(msg));
- skb_orphan(buf);
- *dnode = msg_orignode(&ohdr);
+
+ /* Now reverse the concerned fields */
+ msg_set_errcode(hdr, err);
+ msg_set_origport(hdr, msg_destport(&ohdr));
+ msg_set_destport(hdr, msg_origport(&ohdr));
+ msg_set_destnode(hdr, msg_prevnode(&ohdr));
+ msg_set_prevnode(hdr, own_node);
+ msg_set_orignode(hdr, own_node);
+ msg_set_size(hdr, msg_hdr_sz(hdr) + dlen);
+ skb_trim(_skb, msg_size(hdr));
+ skb_orphan(_skb);
return true;
exit:
- kfree_skb(buf);
- *dnode = 0;
+ kfree_skb(_skb);
+ *skb = NULL;
return false;
}
/**
* tipc_msg_lookup_dest(): try to find new destination for named message
* @skb: the buffer containing the message.
- * @dnode: return value: next-hop node, if destination found
- * @err: return value: error code to use, if message to be rejected
+ * @err: error code to be used by caller if lookup fails
* Does not consume buffer
* Returns true if a destination is found, false otherwise
*/
-bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb,
- u32 *dnode, int *err)
+bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
{
struct tipc_msg *msg = buf_msg(skb);
- u32 dport;
- u32 own_addr = tipc_own_addr(net);
+ u32 dport, dnode;
+ u32 onode = tipc_own_addr(net);
if (!msg_isdata(msg))
return false;
@@ -529,15 +541,15 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb,
return false;
if (msg_reroute_cnt(msg))
return false;
- *dnode = addr_domain(net, msg_lookup_scope(msg));
+ dnode = addr_domain(net, msg_lookup_scope(msg));
dport = tipc_nametbl_translate(net, msg_nametype(msg),
- msg_nameinst(msg), dnode);
+ msg_nameinst(msg), &dnode);
if (!dport)
return false;
msg_incr_reroute_cnt(msg);
- if (*dnode != own_addr)
- msg_set_prevnode(msg, own_addr);
- msg_set_destnode(msg, *dnode);
+ if (dnode != onode)
+ msg_set_prevnode(msg, onode);
+ msg_set_destnode(msg, dnode);
msg_set_destport(msg, dport);
*err = TIPC_OK;
return true;
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 19c45fb..a82c584 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -38,6 +38,7 @@
#define _TIPC_MSG_H
#include <linux/tipc.h>
+#include "core.h"
/*
* Constants and routines used to read and write TIPC payload message headers
@@ -109,7 +110,6 @@ struct tipc_skb_cb {
struct sk_buff *tail;
bool validated;
bool wakeup_pending;
- bool bundling;
u16 chain_sz;
u16 chain_imp;
};
@@ -558,15 +558,6 @@ static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n)
msg_set_bits(m, 1, 15, 0x1fff, n);
}
-static inline bool msg_dup(struct tipc_msg *m)
-{
- if (likely(msg_user(m) != TUNNEL_PROTOCOL))
- return false;
- if (msg_type(m) != SYNCH_MSG)
- return false;
- return true;
-}
-
/*
* Word 2
*/
@@ -620,12 +611,12 @@ static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n)
}
-static inline u32 msg_next_sent(struct tipc_msg *m)
+static inline u16 msg_next_sent(struct tipc_msg *m)
{
return msg_bits(m, 4, 0, 0xffff);
}
-static inline void msg_set_next_sent(struct tipc_msg *m, u32 n)
+static inline void msg_set_next_sent(struct tipc_msg *m, u16 n)
{
msg_set_bits(m, 4, 0, 0xffff, n);
}
@@ -658,12 +649,12 @@ static inline void msg_set_link_selector(struct tipc_msg *m, u32 n)
/*
* Word 5
*/
-static inline u32 msg_session(struct tipc_msg *m)
+static inline u16 msg_session(struct tipc_msg *m)
{
return msg_bits(m, 5, 16, 0xffff);
}
-static inline void msg_set_session(struct tipc_msg *m, u32 n)
+static inline void msg_set_session(struct tipc_msg *m, u16 n)
{
msg_set_bits(m, 5, 16, 0xffff, n);
}
@@ -726,12 +717,12 @@ static inline char *msg_media_addr(struct tipc_msg *m)
/*
* Word 9
*/
-static inline u32 msg_msgcnt(struct tipc_msg *m)
+static inline u16 msg_msgcnt(struct tipc_msg *m)
{
return msg_bits(m, 9, 16, 0xffff);
}
-static inline void msg_set_msgcnt(struct tipc_msg *m, u32 n)
+static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n)
{
msg_set_bits(m, 9, 16, 0xffff, n);
}
@@ -766,10 +757,25 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n)
msg_set_bits(m, 9, 0, 0xffff, n);
}
+static inline bool msg_peer_link_is_up(struct tipc_msg *m)
+{
+ if (likely(msg_user(m) != LINK_PROTOCOL))
+ return true;
+ if (msg_type(m) == STATE_MSG)
+ return true;
+ return false;
+}
+
+static inline bool msg_peer_node_is_up(struct tipc_msg *m)
+{
+ if (msg_peer_link_is_up(m))
+ return true;
+ return msg_redundant_link(m);
+}
+
struct sk_buff *tipc_buf_acquire(u32 size);
bool tipc_msg_validate(struct sk_buff *skb);
-bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode,
- int err);
+bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err);
void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type,
u32 hsize, u32 destnode);
struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz,
@@ -782,8 +788,7 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos);
int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
int offset, int dsz, int mtu, struct sk_buff_head *list);
-bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, u32 *dnode,
- int *err);
+bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list);
static inline u16 buf_seqno(struct sk_buff *skb)
@@ -857,26 +862,65 @@ static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list,
return skb;
}
-/* tipc_skb_queue_tail(): add buffer to tail of list;
+/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number
* @list: list to be appended to
- * @skb: buffer to append. Always appended
- * @dport: the destination port of the buffer
- * returns true if dport differs from previous destination
+ * @skb: buffer to add
+ * Returns true if queue should treated further, otherwise false
*/
-static inline bool tipc_skb_queue_tail(struct sk_buff_head *list,
- struct sk_buff *skb, u32 dport)
+static inline bool __tipc_skb_queue_sorted(struct sk_buff_head *list,
+ struct sk_buff *skb)
{
- struct sk_buff *_skb = NULL;
- bool rv = false;
+ struct sk_buff *_skb, *tmp;
+ struct tipc_msg *hdr = buf_msg(skb);
+ u16 seqno = msg_seqno(hdr);
- spin_lock_bh(&list->lock);
- _skb = skb_peek_tail(list);
- if (!_skb || (msg_destport(buf_msg(_skb)) != dport) ||
- (skb_queue_len(list) > 32))
- rv = true;
+ if (skb_queue_empty(list) || (msg_user(hdr) == LINK_PROTOCOL)) {
+ __skb_queue_head(list, skb);
+ return true;
+ }
+ if (likely(less(seqno, buf_seqno(skb_peek(list))))) {
+ __skb_queue_head(list, skb);
+ return true;
+ }
+ if (!more(seqno, buf_seqno(skb_peek_tail(list)))) {
+ skb_queue_walk_safe(list, _skb, tmp) {
+ if (likely(less(seqno, buf_seqno(_skb)))) {
+ __skb_queue_before(list, _skb, skb);
+ return true;
+ }
+ }
+ }
__skb_queue_tail(list, skb);
+ return false;
+}
+
+/* tipc_skb_queue_splice_tail - append an skb list to lock protected list
+ * @list: the new list to append. Not lock protected
+ * @head: target list. Lock protected.
+ */
+static inline void tipc_skb_queue_splice_tail(struct sk_buff_head *list,
+ struct sk_buff_head *head)
+{
+ spin_lock_bh(&head->lock);
+ skb_queue_splice_tail(list, head);
+ spin_unlock_bh(&head->lock);
+}
+
+/* tipc_skb_queue_splice_tail_init - merge two lock protected skb lists
+ * @list: the new list to add. Lock protected. Will be reinitialized
+ * @head: target list. Lock protected.
+ */
+static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list,
+ struct sk_buff_head *head)
+{
+ struct sk_buff_head tmp;
+
+ __skb_queue_head_init(&tmp);
+
+ spin_lock_bh(&list->lock);
+ skb_queue_splice_tail_init(list, &tmp);
spin_unlock_bh(&list->lock);
- return rv;
+ tipc_skb_queue_splice_tail(&tmp, head);
}
#endif
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 41e7b7e..e6018b7 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -96,13 +96,13 @@ void named_cluster_distribute(struct net *net, struct sk_buff *skb)
dnode = node->addr;
if (in_own_node(net, dnode))
continue;
- if (!tipc_node_active_links(node))
+ if (!tipc_node_is_up(node))
continue;
oskb = pskb_copy(skb, GFP_ATOMIC);
if (!oskb)
break;
msg_set_destnode(buf_msg(oskb), dnode);
- tipc_link_xmit_skb(net, oskb, dnode, dnode);
+ tipc_node_xmit_skb(net, oskb, dnode, dnode);
}
rcu_read_unlock();
@@ -223,7 +223,7 @@ void tipc_named_node_up(struct net *net, u32 dnode)
&tn->nametbl->publ_list[TIPC_ZONE_SCOPE]);
rcu_read_unlock();
- tipc_link_xmit(net, &head, dnode, dnode);
+ tipc_node_xmit(net, &head, dnode, dnode);
}
static void tipc_publ_subscribe(struct net *net, struct publication *publ,
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 0b1d61a..7c19164 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -40,10 +40,42 @@
#include "name_distr.h"
#include "socket.h"
#include "bcast.h"
+#include "discover.h"
-static void node_lost_contact(struct tipc_node *n_ptr);
+/* Node FSM states and events:
+ */
+enum {
+ SELF_DOWN_PEER_DOWN = 0xdd,
+ SELF_UP_PEER_UP = 0xaa,
+ SELF_DOWN_PEER_LEAVING = 0xd1,
+ SELF_UP_PEER_COMING = 0xac,
+ SELF_COMING_PEER_UP = 0xca,
+ SELF_LEAVING_PEER_DOWN = 0x1d,
+ NODE_FAILINGOVER = 0xf0,
+ NODE_SYNCHING = 0xcc
+};
+
+enum {
+ SELF_ESTABL_CONTACT_EVT = 0xece,
+ SELF_LOST_CONTACT_EVT = 0x1ce,
+ PEER_ESTABL_CONTACT_EVT = 0x9ece,
+ PEER_LOST_CONTACT_EVT = 0x91ce,
+ NODE_FAILOVER_BEGIN_EVT = 0xfbe,
+ NODE_FAILOVER_END_EVT = 0xfee,
+ NODE_SYNCH_BEGIN_EVT = 0xcbe,
+ NODE_SYNCH_END_EVT = 0xcee
+};
+
+static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
+ struct sk_buff_head *xmitq,
+ struct tipc_media_addr **maddr);
+static void tipc_node_link_down(struct tipc_node *n, int bearer_id,
+ bool delete);
+static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq);
static void node_established_contact(struct tipc_node *n_ptr);
static void tipc_node_delete(struct tipc_node *node);
+static void tipc_node_timeout(unsigned long data);
+static void tipc_node_fsm_evt(struct tipc_node *n, int evt);
struct tipc_sock_conn {
u32 port;
@@ -110,7 +142,7 @@ struct tipc_node *tipc_node_find(struct net *net, u32 addr)
return NULL;
}
-struct tipc_node *tipc_node_create(struct net *net, u32 addr)
+struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct tipc_node *n_ptr, *temp_node;
@@ -126,12 +158,14 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr)
}
n_ptr->addr = addr;
n_ptr->net = net;
+ n_ptr->capabilities = capabilities;
kref_init(&n_ptr->kref);
spin_lock_init(&n_ptr->lock);
INIT_HLIST_NODE(&n_ptr->hash);
INIT_LIST_HEAD(&n_ptr->list);
INIT_LIST_HEAD(&n_ptr->publ_list);
INIT_LIST_HEAD(&n_ptr->conn_sks);
+ skb_queue_head_init(&n_ptr->bclink.namedq);
__skb_queue_head_init(&n_ptr->bclink.deferdq);
hlist_add_head_rcu(&n_ptr->hash, &tn->node_htable[tipc_hashfn(addr)]);
list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
@@ -139,14 +173,32 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr)
break;
}
list_add_tail_rcu(&n_ptr->list, &temp_node->list);
- n_ptr->action_flags = TIPC_WAIT_PEER_LINKS_DOWN;
+ n_ptr->state = SELF_DOWN_PEER_LEAVING;
n_ptr->signature = INVALID_NODE_SIG;
+ n_ptr->active_links[0] = INVALID_BEARER_ID;
+ n_ptr->active_links[1] = INVALID_BEARER_ID;
tipc_node_get(n_ptr);
+ setup_timer(&n_ptr->timer, tipc_node_timeout, (unsigned long)n_ptr);
+ n_ptr->keepalive_intv = U32_MAX;
exit:
spin_unlock_bh(&tn->node_list_lock);
return n_ptr;
}
+static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
+{
+ unsigned long tol = l->tolerance;
+ unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4;
+ unsigned long keepalive_intv = msecs_to_jiffies(intv);
+
+ /* Link with lowest tolerance determines timer interval */
+ if (keepalive_intv < n->keepalive_intv)
+ n->keepalive_intv = keepalive_intv;
+
+ /* Ensure link's abort limit corresponds to current interval */
+ l->abort_limit = l->tolerance / jiffies_to_msecs(n->keepalive_intv);
+}
+
static void tipc_node_delete(struct tipc_node *node)
{
list_del_rcu(&node->list);
@@ -160,8 +212,11 @@ void tipc_node_stop(struct net *net)
struct tipc_node *node, *t_node;
spin_lock_bh(&tn->node_list_lock);
- list_for_each_entry_safe(node, t_node, &tn->node_list, list)
+ list_for_each_entry_safe(node, t_node, &tn->node_list, list) {
+ if (del_timer(&node->timer))
+ tipc_node_put(node);
tipc_node_put(node);
+ }
spin_unlock_bh(&tn->node_list_lock);
}
@@ -219,158 +274,547 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port)
tipc_node_put(node);
}
+/* tipc_node_timeout - handle expiration of node timer
+ */
+static void tipc_node_timeout(unsigned long data)
+{
+ struct tipc_node *n = (struct tipc_node *)data;
+ struct tipc_link_entry *le;
+ struct sk_buff_head xmitq;
+ int bearer_id;
+ int rc = 0;
+
+ __skb_queue_head_init(&xmitq);
+
+ for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
+ tipc_node_lock(n);
+ le = &n->links[bearer_id];
+ if (le->link) {
+ /* Link tolerance may change asynchronously: */
+ tipc_node_calculate_timer(n, le->link);
+ rc = tipc_link_timeout(le->link, &xmitq);
+ }
+ tipc_node_unlock(n);
+ tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr);
+ if (rc & TIPC_LINK_DOWN_EVT)
+ tipc_node_link_down(n, bearer_id, false);
+ }
+ if (!mod_timer(&n->timer, jiffies + n->keepalive_intv))
+ tipc_node_get(n);
+ tipc_node_put(n);
+}
+
/**
- * tipc_node_link_up - handle addition of link
- *
+ * __tipc_node_link_up - handle addition of link
+ * Node lock must be held by caller
* Link becomes active (alone or shared) or standby, depending on its priority.
*/
-void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
+ struct sk_buff_head *xmitq)
{
- struct tipc_link **active = &n_ptr->active_links[0];
+ int *slot0 = &n->active_links[0];
+ int *slot1 = &n->active_links[1];
+ struct tipc_link *ol = node_active_link(n, 0);
+ struct tipc_link *nl = n->links[bearer_id].link;
- n_ptr->working_links++;
- n_ptr->action_flags |= TIPC_NOTIFY_LINK_UP;
- n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id;
+ if (!nl || !tipc_link_is_up(nl))
+ return;
- pr_debug("Established link <%s> on network plane %c\n",
- l_ptr->name, l_ptr->net_plane);
+ n->working_links++;
+ n->action_flags |= TIPC_NOTIFY_LINK_UP;
+ n->link_id = nl->peer_bearer_id << 16 | bearer_id;
- if (!active[0]) {
- active[0] = active[1] = l_ptr;
- node_established_contact(n_ptr);
- goto exit;
- }
- if (l_ptr->priority < active[0]->priority) {
- pr_debug("New link <%s> becomes standby\n", l_ptr->name);
- goto exit;
+ /* Leave room for tunnel header when returning 'mtu' to users: */
+ n->links[bearer_id].mtu = nl->mtu - INT_H_SIZE;
+
+ tipc_bearer_add_dest(n->net, bearer_id, n->addr);
+
+ pr_debug("Established link <%s> on network plane %c\n",
+ nl->name, nl->net_plane);
+
+ /* First link? => give it both slots */
+ if (!ol) {
+ *slot0 = bearer_id;
+ *slot1 = bearer_id;
+ tipc_link_build_bcast_sync_msg(nl, xmitq);
+ node_established_contact(n);
+ return;
}
- tipc_link_dup_queue_xmit(active[0], l_ptr);
- if (l_ptr->priority == active[0]->priority) {
- active[0] = l_ptr;
- goto exit;
+
+ /* Second link => redistribute slots */
+ if (nl->priority > ol->priority) {
+ pr_debug("Old link <%s> becomes standby\n", ol->name);
+ *slot0 = bearer_id;
+ *slot1 = bearer_id;
+ } else if (nl->priority == ol->priority) {
+ *slot0 = bearer_id;
+ } else {
+ pr_debug("New link <%s> is standby\n", nl->name);
}
- pr_debug("Old link <%s> becomes standby\n", active[0]->name);
- if (active[1] != active[0])
- pr_debug("Old link <%s> becomes standby\n", active[1]->name);
- active[0] = active[1] = l_ptr;
-exit:
- /* Leave room for changeover header when returning 'mtu' to users: */
- n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE;
- n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE;
+
+ /* Prepare synchronization with first link */
+ tipc_link_tnl_prepare(ol, nl, SYNCH_MSG, xmitq);
}
/**
- * node_select_active_links - select active link
+ * tipc_node_link_up - handle addition of link
+ *
+ * Link becomes active (alone or shared) or standby, depending on its priority.
*/
-static void node_select_active_links(struct tipc_node *n_ptr)
+static void tipc_node_link_up(struct tipc_node *n, int bearer_id,
+ struct sk_buff_head *xmitq)
{
- struct tipc_link **active = &n_ptr->active_links[0];
- u32 i;
- u32 highest_prio = 0;
+ tipc_node_lock(n);
+ __tipc_node_link_up(n, bearer_id, xmitq);
+ tipc_node_unlock(n);
+}
- active[0] = active[1] = NULL;
+/**
+ * __tipc_node_link_down - handle loss of link
+ */
+static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
+ struct sk_buff_head *xmitq,
+ struct tipc_media_addr **maddr)
+{
+ struct tipc_link_entry *le = &n->links[*bearer_id];
+ int *slot0 = &n->active_links[0];
+ int *slot1 = &n->active_links[1];
+ int i, highest = 0;
+ struct tipc_link *l, *_l, *tnl;
+
+ l = n->links[*bearer_id].link;
+ if (!l || tipc_link_is_reset(l))
+ return;
- for (i = 0; i < MAX_BEARERS; i++) {
- struct tipc_link *l_ptr = n_ptr->links[i];
+ n->working_links--;
+ n->action_flags |= TIPC_NOTIFY_LINK_DOWN;
+ n->link_id = l->peer_bearer_id << 16 | *bearer_id;
- if (!l_ptr || !tipc_link_is_up(l_ptr) ||
- (l_ptr->priority < highest_prio))
- continue;
+ tipc_bearer_remove_dest(n->net, *bearer_id, n->addr);
+
+ pr_debug("Lost link <%s> on network plane %c\n",
+ l->name, l->net_plane);
- if (l_ptr->priority > highest_prio) {
- highest_prio = l_ptr->priority;
- active[0] = active[1] = l_ptr;
- } else {
- active[1] = l_ptr;
+ /* Select new active link if any available */
+ *slot0 = INVALID_BEARER_ID;
+ *slot1 = INVALID_BEARER_ID;
+ for (i = 0; i < MAX_BEARERS; i++) {
+ _l = n->links[i].link;
+ if (!_l || !tipc_link_is_up(_l))
+ continue;
+ if (_l == l)
+ continue;
+ if (_l->priority < highest)
+ continue;
+ if (_l->priority > highest) {
+ highest = _l->priority;
+ *slot0 = i;
+ *slot1 = i;
+ continue;
}
+ *slot1 = i;
+ }
+
+ if (!tipc_node_is_up(n)) {
+ tipc_link_reset(l);
+ node_lost_contact(n, &le->inputq);
+ return;
}
+
+ /* There is still a working link => initiate failover */
+ tnl = node_active_link(n, 0);
+ n->sync_point = tnl->rcv_nxt + (U16_MAX / 2 - 1);
+ tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq);
+ tipc_link_reset(l);
+ tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT);
+ tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT);
+ *maddr = &n->links[tnl->bearer_id].maddr;
+ *bearer_id = tnl->bearer_id;
}
-/**
- * tipc_node_link_down - handle loss of link
- */
-void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
{
- struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id);
- struct tipc_link **active;
+ struct tipc_link_entry *le = &n->links[bearer_id];
+ struct tipc_media_addr *maddr;
+ struct sk_buff_head xmitq;
+
+ __skb_queue_head_init(&xmitq);
+
+ tipc_node_lock(n);
+ __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr);
+ if (delete && le->link) {
+ kfree(le->link);
+ le->link = NULL;
+ n->link_cnt--;
+ }
+ tipc_node_unlock(n);
- n_ptr->working_links--;
- n_ptr->action_flags |= TIPC_NOTIFY_LINK_DOWN;
- n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id;
+ tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr);
+ tipc_sk_rcv(n->net, &le->inputq);
+}
- if (!tipc_link_is_active(l_ptr)) {
- pr_debug("Lost standby link <%s> on network plane %c\n",
- l_ptr->name, l_ptr->net_plane);
- return;
- }
- pr_debug("Lost link <%s> on network plane %c\n",
- l_ptr->name, l_ptr->net_plane);
-
- active = &n_ptr->active_links[0];
- if (active[0] == l_ptr)
- active[0] = active[1];
- if (active[1] == l_ptr)
- active[1] = active[0];
- if (active[0] == l_ptr)
- node_select_active_links(n_ptr);
- if (tipc_node_is_up(n_ptr))
- tipc_link_failover_send_queue(l_ptr);
- else
- node_lost_contact(n_ptr);
-
- /* Leave room for changeover header when returning 'mtu' to users: */
- if (active[0]) {
- n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE;
- n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE;
+bool tipc_node_is_up(struct tipc_node *n)
+{
+ return n->active_links[0] != INVALID_BEARER_ID;
+}
+
+void tipc_node_check_dest(struct net *net, u32 onode,
+ struct tipc_bearer *b,
+ u16 capabilities, u32 signature,
+ struct tipc_media_addr *maddr,
+ bool *respond, bool *dupl_addr)
+{
+ struct tipc_node *n;
+ struct tipc_link *l;
+ struct tipc_link_entry *le;
+ bool addr_match = false;
+ bool sign_match = false;
+ bool link_up = false;
+ bool accept_addr = false;
+ bool reset = true;
+
+ *dupl_addr = false;
+ *respond = false;
+
+ n = tipc_node_create(net, onode, capabilities);
+ if (!n)
return;
+
+ tipc_node_lock(n);
+
+ le = &n->links[b->identity];
+
+ /* Prepare to validate requesting node's signature and media address */
+ l = le->link;
+ link_up = l && tipc_link_is_up(l);
+ addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr));
+ sign_match = (signature == n->signature);
+
+ /* These three flags give us eight permutations: */
+
+ if (sign_match && addr_match && link_up) {
+ /* All is fine. Do nothing. */
+ reset = false;
+ } else if (sign_match && addr_match && !link_up) {
+ /* Respond. The link will come up in due time */
+ *respond = true;
+ } else if (sign_match && !addr_match && link_up) {
+ /* Peer has changed i/f address without rebooting.
+ * If so, the link will reset soon, and the next
+ * discovery will be accepted. So we can ignore it.
+ * It may also be an cloned or malicious peer having
+ * chosen the same node address and signature as an
+ * existing one.
+ * Ignore requests until the link goes down, if ever.
+ */
+ *dupl_addr = true;
+ } else if (sign_match && !addr_match && !link_up) {
+ /* Peer link has changed i/f address without rebooting.
+ * It may also be a cloned or malicious peer; we can't
+ * distinguish between the two.
+ * The signature is correct, so we must accept.
+ */
+ accept_addr = true;
+ *respond = true;
+ } else if (!sign_match && addr_match && link_up) {
+ /* Peer node rebooted. Two possibilities:
+ * - Delayed re-discovery; this link endpoint has already
+ * reset and re-established contact with the peer, before
+ * receiving a discovery message from that node.
+ * (The peer happened to receive one from this node first).
+ * - The peer came back so fast that our side has not
+ * discovered it yet. Probing from this side will soon
+ * reset the link, since there can be no working link
+ * endpoint at the peer end, and the link will re-establish.
+ * Accept the signature, since it comes from a known peer.
+ */
+ n->signature = signature;
+ } else if (!sign_match && addr_match && !link_up) {
+ /* The peer node has rebooted.
+ * Accept signature, since it is a known peer.
+ */
+ n->signature = signature;
+ *respond = true;
+ } else if (!sign_match && !addr_match && link_up) {
+ /* Peer rebooted with new address, or a new/duplicate peer.
+ * Ignore until the link goes down, if ever.
+ */
+ *dupl_addr = true;
+ } else if (!sign_match && !addr_match && !link_up) {
+ /* Peer rebooted with new address, or it is a new peer.
+ * Accept signature and address.
+ */
+ n->signature = signature;
+ accept_addr = true;
+ *respond = true;
}
- /* Loopback link went down? No fragmentation needed from now on. */
- if (n_ptr->addr == tn->own_addr) {
- n_ptr->act_mtus[0] = MAX_MSG_SIZE;
- n_ptr->act_mtus[1] = MAX_MSG_SIZE;
+
+ if (!accept_addr)
+ goto exit;
+
+ /* Now create new link if not already existing */
+ if (!l) {
+ if (n->link_cnt == 2) {
+ pr_warn("Cannot establish 3rd link to %x\n", n->addr);
+ goto exit;
+ }
+ if (!tipc_link_create(n, b, mod(tipc_net(net)->random),
+ tipc_own_addr(net), onode, &le->maddr,
+ &le->inputq, &n->bclink.namedq, &l)) {
+ *respond = false;
+ goto exit;
+ }
+ tipc_link_reset(l);
+ le->link = l;
+ n->link_cnt++;
+ tipc_node_calculate_timer(n, l);
+ if (n->link_cnt == 1)
+ if (!mod_timer(&n->timer, jiffies + n->keepalive_intv))
+ tipc_node_get(n);
}
+ memcpy(&le->maddr, maddr, sizeof(*maddr));
+exit:
+ tipc_node_unlock(n);
+ if (reset)
+ tipc_node_link_down(n, b->identity, false);
+ tipc_node_put(n);
}
-int tipc_node_active_links(struct tipc_node *n_ptr)
+void tipc_node_delete_links(struct net *net, int bearer_id)
{
- return n_ptr->active_links[0] != NULL;
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_node *n;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(n, &tn->node_list, list) {
+ tipc_node_link_down(n, bearer_id, true);
+ }
+ rcu_read_unlock();
}
-int tipc_node_is_up(struct tipc_node *n_ptr)
+static void tipc_node_reset_links(struct tipc_node *n)
{
- return tipc_node_active_links(n_ptr);
+ char addr_string[16];
+ int i;
+
+ pr_warn("Resetting all links to %s\n",
+ tipc_addr_string_fill(addr_string, n->addr));
+
+ for (i = 0; i < MAX_BEARERS; i++) {
+ tipc_node_link_down(n, i, false);
+ }
}
-void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+/* tipc_node_fsm_evt - node finite state machine
+ * Determines when contact is allowed with peer node
+ */
+static void tipc_node_fsm_evt(struct tipc_node *n, int evt)
{
- n_ptr->links[l_ptr->bearer_id] = l_ptr;
- n_ptr->link_cnt++;
+ int state = n->state;
+
+ switch (state) {
+ case SELF_DOWN_PEER_DOWN:
+ switch (evt) {
+ case SELF_ESTABL_CONTACT_EVT:
+ state = SELF_UP_PEER_COMING;
+ break;
+ case PEER_ESTABL_CONTACT_EVT:
+ state = SELF_COMING_PEER_UP;
+ break;
+ case SELF_LOST_CONTACT_EVT:
+ case PEER_LOST_CONTACT_EVT:
+ break;
+ case NODE_SYNCH_END_EVT:
+ case NODE_SYNCH_BEGIN_EVT:
+ case NODE_FAILOVER_BEGIN_EVT:
+ case NODE_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case SELF_UP_PEER_UP:
+ switch (evt) {
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_LEAVING;
+ break;
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_LEAVING_PEER_DOWN;
+ break;
+ case NODE_SYNCH_BEGIN_EVT:
+ state = NODE_SYNCHING;
+ break;
+ case NODE_FAILOVER_BEGIN_EVT:
+ state = NODE_FAILINGOVER;
+ break;
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ case NODE_SYNCH_END_EVT:
+ case NODE_FAILOVER_END_EVT:
+ break;
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case SELF_DOWN_PEER_LEAVING:
+ switch (evt) {
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_DOWN;
+ break;
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ case SELF_LOST_CONTACT_EVT:
+ break;
+ case NODE_SYNCH_END_EVT:
+ case NODE_SYNCH_BEGIN_EVT:
+ case NODE_FAILOVER_BEGIN_EVT:
+ case NODE_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case SELF_UP_PEER_COMING:
+ switch (evt) {
+ case PEER_ESTABL_CONTACT_EVT:
+ state = SELF_UP_PEER_UP;
+ break;
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_LEAVING;
+ break;
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_LOST_CONTACT_EVT:
+ break;
+ case NODE_SYNCH_END_EVT:
+ case NODE_SYNCH_BEGIN_EVT:
+ case NODE_FAILOVER_BEGIN_EVT:
+ case NODE_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case SELF_COMING_PEER_UP:
+ switch (evt) {
+ case SELF_ESTABL_CONTACT_EVT:
+ state = SELF_UP_PEER_UP;
+ break;
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_LEAVING_PEER_DOWN;
+ break;
+ case SELF_LOST_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ break;
+ case NODE_SYNCH_END_EVT:
+ case NODE_SYNCH_BEGIN_EVT:
+ case NODE_FAILOVER_BEGIN_EVT:
+ case NODE_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case SELF_LEAVING_PEER_DOWN:
+ switch (evt) {
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_DOWN;
+ break;
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ case PEER_LOST_CONTACT_EVT:
+ break;
+ case NODE_SYNCH_END_EVT:
+ case NODE_SYNCH_BEGIN_EVT:
+ case NODE_FAILOVER_BEGIN_EVT:
+ case NODE_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case NODE_FAILINGOVER:
+ switch (evt) {
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_LEAVING;
+ break;
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_LEAVING_PEER_DOWN;
+ break;
+ case NODE_FAILOVER_END_EVT:
+ state = SELF_UP_PEER_UP;
+ break;
+ case NODE_FAILOVER_BEGIN_EVT:
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ break;
+ case NODE_SYNCH_BEGIN_EVT:
+ case NODE_SYNCH_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ case NODE_SYNCHING:
+ switch (evt) {
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_LEAVING;
+ break;
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_LEAVING_PEER_DOWN;
+ break;
+ case NODE_SYNCH_END_EVT:
+ state = SELF_UP_PEER_UP;
+ break;
+ case NODE_FAILOVER_BEGIN_EVT:
+ state = NODE_FAILINGOVER;
+ break;
+ case NODE_SYNCH_BEGIN_EVT:
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ break;
+ case NODE_FAILOVER_END_EVT:
+ default:
+ goto illegal_evt;
+ }
+ break;
+ default:
+ pr_err("Unknown node fsm state %x\n", state);
+ break;
+ }
+ n->state = state;
+ return;
+
+illegal_evt:
+ pr_err("Illegal node fsm evt %x in state %x\n", evt, state);
}
-void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+bool tipc_node_filter_pkt(struct tipc_node *n, struct tipc_msg *hdr)
{
- int i;
+ int state = n->state;
- for (i = 0; i < MAX_BEARERS; i++) {
- if (l_ptr != n_ptr->links[i])
- continue;
- n_ptr->links[i] = NULL;
- n_ptr->link_cnt--;
+ if (likely(state == SELF_UP_PEER_UP))
+ return true;
+
+ if (state == SELF_LEAVING_PEER_DOWN)
+ return false;
+
+ if (state == SELF_DOWN_PEER_LEAVING) {
+ if (msg_peer_node_is_up(hdr))
+ return false;
}
+
+ return true;
}
static void node_established_contact(struct tipc_node *n_ptr)
{
+ tipc_node_fsm_evt(n_ptr, SELF_ESTABL_CONTACT_EVT);
n_ptr->action_flags |= TIPC_NOTIFY_NODE_UP;
n_ptr->bclink.oos_state = 0;
n_ptr->bclink.acked = tipc_bclink_get_last_sent(n_ptr->net);
tipc_bclink_add_node(n_ptr->net, n_ptr->addr);
}
-static void node_lost_contact(struct tipc_node *n_ptr)
+static void node_lost_contact(struct tipc_node *n_ptr,
+ struct sk_buff_head *inputq)
{
char addr_string[16];
struct tipc_sock_conn *conn, *safe;
+ struct tipc_link *l;
struct list_head *conns = &n_ptr->conn_sks;
struct sk_buff *skb;
struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id);
@@ -396,21 +840,13 @@ static void node_lost_contact(struct tipc_node *n_ptr)
/* Abort any ongoing link failover */
for (i = 0; i < MAX_BEARERS; i++) {
- struct tipc_link *l_ptr = n_ptr->links[i];
- if (!l_ptr)
- continue;
- l_ptr->flags &= ~LINK_FAILINGOVER;
- l_ptr->failover_checkpt = 0;
- l_ptr->failover_pkts = 0;
- kfree_skb(l_ptr->failover_skb);
- l_ptr->failover_skb = NULL;
- tipc_link_reset_fragments(l_ptr);
+ l = n_ptr->links[i].link;
+ if (l)
+ tipc_link_fsm_evt(l, LINK_FAILOVER_END_EVT);
}
- n_ptr->action_flags &= ~TIPC_WAIT_OWN_LINKS_DOWN;
-
/* Prevent re-contact with node until cleanup is done */
- n_ptr->action_flags |= TIPC_WAIT_PEER_LINKS_DOWN;
+ tipc_node_fsm_evt(n_ptr, SELF_LOST_CONTACT_EVT);
/* Notify publications from this node */
n_ptr->action_flags |= TIPC_NOTIFY_NODE_DOWN;
@@ -421,10 +857,8 @@ static void node_lost_contact(struct tipc_node *n_ptr)
SHORT_H_SIZE, 0, tn->own_addr,
conn->peer_node, conn->port,
conn->peer_port, TIPC_ERR_NO_NODE);
- if (likely(skb)) {
- skb_queue_tail(n_ptr->inputq, skb);
- n_ptr->action_flags |= TIPC_MSG_EVT;
- }
+ if (likely(skb))
+ skb_queue_tail(inputq, skb);
list_del(&conn->list);
kfree(conn);
}
@@ -453,7 +887,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr,
goto exit;
tipc_node_lock(node);
- link = node->links[bearer_id];
+ link = node->links[bearer_id].link;
if (link) {
strncpy(linkname, link->name, len);
err = 0;
@@ -471,27 +905,20 @@ void tipc_node_unlock(struct tipc_node *node)
u32 flags = node->action_flags;
u32 link_id = 0;
struct list_head *publ_list;
- struct sk_buff_head *inputq = node->inputq;
- struct sk_buff_head *namedq;
- if (likely(!flags || (flags == TIPC_MSG_EVT))) {
- node->action_flags = 0;
+ if (likely(!flags)) {
spin_unlock_bh(&node->lock);
- if (flags == TIPC_MSG_EVT)
- tipc_sk_rcv(net, inputq);
return;
}
addr = node->addr;
link_id = node->link_id;
- namedq = node->namedq;
publ_list = &node->publ_list;
- node->action_flags &= ~(TIPC_MSG_EVT |
- TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
+ node->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP |
TIPC_WAKEUP_BCAST_USERS | TIPC_BCAST_MSG_EVT |
- TIPC_NAMED_MSG_EVT | TIPC_BCAST_RESET);
+ TIPC_BCAST_RESET);
spin_unlock_bh(&node->lock);
@@ -512,17 +939,11 @@ void tipc_node_unlock(struct tipc_node *node)
tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
link_id, addr);
- if (flags & TIPC_MSG_EVT)
- tipc_sk_rcv(net, inputq);
-
- if (flags & TIPC_NAMED_MSG_EVT)
- tipc_named_rcv(net, namedq);
-
if (flags & TIPC_BCAST_MSG_EVT)
tipc_bclink_input(net);
if (flags & TIPC_BCAST_RESET)
- tipc_link_reset_all(node);
+ tipc_node_reset_links(node);
}
/* Caller should hold node lock for the passed node */
@@ -559,6 +980,279 @@ msg_full:
return -EMSGSIZE;
}
+static struct tipc_link *tipc_node_select_link(struct tipc_node *n, int sel,
+ int *bearer_id,
+ struct tipc_media_addr **maddr)
+{
+ int id = n->active_links[sel & 1];
+
+ if (unlikely(id < 0))
+ return NULL;
+
+ *bearer_id = id;
+ *maddr = &n->links[id].maddr;
+ return n->links[id].link;
+}
+
+/**
+ * tipc_node_xmit() is the general link level function for message sending
+ * @net: the applicable net namespace
+ * @list: chain of buffers containing message
+ * @dnode: address of destination node
+ * @selector: a number used for deterministic link selection
+ * Consumes the buffer chain, except when returning -ELINKCONG
+ * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
+ */
+int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
+ u32 dnode, int selector)
+{
+ struct tipc_link *l = NULL;
+ struct tipc_node *n;
+ struct sk_buff_head xmitq;
+ struct tipc_media_addr *maddr;
+ int bearer_id;
+ int rc = -EHOSTUNREACH;
+
+ __skb_queue_head_init(&xmitq);
+ n = tipc_node_find(net, dnode);
+ if (likely(n)) {
+ tipc_node_lock(n);
+ l = tipc_node_select_link(n, selector, &bearer_id, &maddr);
+ if (likely(l))
+ rc = tipc_link_xmit(l, list, &xmitq);
+ tipc_node_unlock(n);
+ if (unlikely(rc == -ENOBUFS))
+ tipc_node_link_down(n, bearer_id, false);
+ tipc_node_put(n);
+ }
+ if (likely(!rc)) {
+ tipc_bearer_xmit(net, bearer_id, &xmitq, maddr);
+ return 0;
+ }
+ if (likely(in_own_node(net, dnode))) {
+ tipc_sk_rcv(net, list);
+ return 0;
+ }
+ return rc;
+}
+
+/* tipc_node_xmit_skb(): send single buffer to destination
+ * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE
+ * messages, which will not be rejected
+ * The only exception is datagram messages rerouted after secondary
+ * lookup, which are rare and safe to dispose of anyway.
+ * TODO: Return real return value, and let callers use
+ * tipc_wait_for_sendpkt() where applicable
+ */
+int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
+ u32 selector)
+{
+ struct sk_buff_head head;
+ int rc;
+
+ skb_queue_head_init(&head);
+ __skb_queue_tail(&head, skb);
+ rc = tipc_node_xmit(net, &head, dnode, selector);
+ if (rc == -ELINKCONG)
+ kfree_skb(skb);
+ return 0;
+}
+
+/**
+ * tipc_node_check_state - check and if necessary update node state
+ * @skb: TIPC packet
+ * @bearer_id: identity of bearer delivering the packet
+ * Returns true if state is ok, otherwise consumes buffer and returns false
+ */
+static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
+ int bearer_id, struct sk_buff_head *xmitq)
+{
+ struct tipc_msg *hdr = buf_msg(skb);
+ int usr = msg_user(hdr);
+ int mtyp = msg_type(hdr);
+ u16 oseqno = msg_seqno(hdr);
+ u16 iseqno = msg_seqno(msg_get_wrapped(hdr));
+ u16 exp_pkts = msg_msgcnt(hdr);
+ u16 rcv_nxt, syncpt, dlv_nxt;
+ int state = n->state;
+ struct tipc_link *l, *pl = NULL;
+ struct tipc_media_addr *maddr;
+ int i, pb_id;
+
+ l = n->links[bearer_id].link;
+ if (!l)
+ return false;
+ rcv_nxt = l->rcv_nxt;
+
+
+ if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL)))
+ return true;
+
+ /* Find parallel link, if any */
+ for (i = 0; i < MAX_BEARERS; i++) {
+ if ((i != bearer_id) && n->links[i].link) {
+ pl = n->links[i].link;
+ break;
+ }
+ }
+
+ /* Update node accesibility if applicable */
+ if (state == SELF_UP_PEER_COMING) {
+ if (!tipc_link_is_up(l))
+ return true;
+ if (!msg_peer_link_is_up(hdr))
+ return true;
+ tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT);
+ }
+
+ if (state == SELF_DOWN_PEER_LEAVING) {
+ if (msg_peer_node_is_up(hdr))
+ return false;
+ tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT);
+ }
+
+ /* Ignore duplicate packets */
+ if (less(oseqno, rcv_nxt))
+ return true;
+
+ /* Initiate or update failover mode if applicable */
+ if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) {
+ syncpt = oseqno + exp_pkts - 1;
+ if (pl && tipc_link_is_up(pl)) {
+ pb_id = pl->bearer_id;
+ __tipc_node_link_down(n, &pb_id, xmitq, &maddr);
+ tipc_skb_queue_splice_tail_init(pl->inputq, l->inputq);
+ }
+ /* If pkts arrive out of order, use lowest calculated syncpt */
+ if (less(syncpt, n->sync_point))
+ n->sync_point = syncpt;
+ }
+
+ /* Open parallel link when tunnel link reaches synch point */
+ if ((n->state == NODE_FAILINGOVER) && !tipc_link_is_failingover(l)) {
+ if (!more(rcv_nxt, n->sync_point))
+ return true;
+ tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT);
+ if (pl)
+ tipc_link_fsm_evt(pl, LINK_FAILOVER_END_EVT);
+ return true;
+ }
+
+ /* Initiate or update synch mode if applicable */
+ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) {
+ syncpt = iseqno + exp_pkts - 1;
+ if (!tipc_link_is_up(l)) {
+ tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT);
+ __tipc_node_link_up(n, bearer_id, xmitq);
+ }
+ if (n->state == SELF_UP_PEER_UP) {
+ n->sync_point = syncpt;
+ tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT);
+ tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT);
+ }
+ if (less(syncpt, n->sync_point))
+ n->sync_point = syncpt;
+ }
+
+ /* Open tunnel link when parallel link reaches synch point */
+ if ((n->state == NODE_SYNCHING) && tipc_link_is_synching(l)) {
+ if (pl)
+ dlv_nxt = mod(pl->rcv_nxt - skb_queue_len(pl->inputq));
+ if (!pl || more(dlv_nxt, n->sync_point)) {
+ tipc_link_fsm_evt(l, LINK_SYNCH_END_EVT);
+ tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT);
+ return true;
+ }
+ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG))
+ return true;
+ if (usr == LINK_PROTOCOL)
+ return true;
+ return false;
+ }
+ return true;
+}
+
+/**
+ * tipc_rcv - process TIPC packets/messages arriving from off-node
+ * @net: the applicable net namespace
+ * @skb: TIPC packet
+ * @bearer: pointer to bearer message arrived on
+ *
+ * Invoked with no locks held. Bearer pointer must point to a valid bearer
+ * structure (i.e. cannot be NULL), but bearer can be inactive.
+ */
+void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
+{
+ struct sk_buff_head xmitq;
+ struct tipc_node *n;
+ struct tipc_msg *hdr = buf_msg(skb);
+ int usr = msg_user(hdr);
+ int bearer_id = b->identity;
+ struct tipc_link_entry *le;
+ int rc = 0;
+
+ __skb_queue_head_init(&xmitq);
+
+ /* Ensure message is well-formed */
+ if (unlikely(!tipc_msg_validate(skb)))
+ goto discard;
+
+ /* Handle arrival of a non-unicast link packet */
+ if (unlikely(msg_non_seq(hdr))) {
+ if (usr == LINK_CONFIG)
+ tipc_disc_rcv(net, skb, b);
+ else
+ tipc_bclink_rcv(net, skb);
+ return;
+ }
+
+ /* Locate neighboring node that sent packet */
+ n = tipc_node_find(net, msg_prevnode(hdr));
+ if (unlikely(!n))
+ goto discard;
+ le = &n->links[bearer_id];
+
+ tipc_node_lock(n);
+
+ /* Is reception permitted at the moment ? */
+ if (!tipc_node_filter_pkt(n, hdr))
+ goto unlock;
+
+ if (unlikely(msg_user(hdr) == LINK_PROTOCOL))
+ tipc_bclink_sync_state(n, hdr);
+
+ /* Release acked broadcast packets */
+ if (unlikely(n->bclink.acked != msg_bcast_ack(hdr)))
+ tipc_bclink_acknowledge(n, msg_bcast_ack(hdr));
+
+ /* Check and if necessary update node state */
+ if (likely(tipc_node_check_state(n, skb, bearer_id, &xmitq))) {
+ rc = tipc_link_rcv(le->link, skb, &xmitq);
+ skb = NULL;
+ }
+unlock:
+ tipc_node_unlock(n);
+
+ if (unlikely(rc & TIPC_LINK_UP_EVT))
+ tipc_node_link_up(n, bearer_id, &xmitq);
+
+ if (unlikely(rc & TIPC_LINK_DOWN_EVT))
+ tipc_node_link_down(n, bearer_id, false);
+
+ if (unlikely(!skb_queue_empty(&n->bclink.namedq)))
+ tipc_named_rcv(net, &n->bclink.namedq);
+
+ if (!skb_queue_empty(&le->inputq))
+ tipc_sk_rcv(net, &le->inputq);
+
+ if (!skb_queue_empty(&xmitq))
+ tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
+
+ tipc_node_put(n);
+discard:
+ kfree_skb(skb);
+}
+
int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
int err;
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 5a834cf..344b3e7 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -45,23 +45,19 @@
/* Out-of-range value for node signature */
#define INVALID_NODE_SIG 0x10000
+#define INVALID_BEARER_ID -1
+
/* Flags used to take different actions according to flag type
- * TIPC_WAIT_PEER_LINKS_DOWN: wait to see that peer's links are down
- * TIPC_WAIT_OWN_LINKS_DOWN: wait until peer node is declared down
* TIPC_NOTIFY_NODE_DOWN: notify node is down
* TIPC_NOTIFY_NODE_UP: notify node is up
* TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type
*/
enum {
- TIPC_MSG_EVT = 1,
- TIPC_WAIT_PEER_LINKS_DOWN = (1 << 1),
- TIPC_WAIT_OWN_LINKS_DOWN = (1 << 2),
TIPC_NOTIFY_NODE_DOWN = (1 << 3),
TIPC_NOTIFY_NODE_UP = (1 << 4),
TIPC_WAKEUP_BCAST_USERS = (1 << 5),
TIPC_NOTIFY_LINK_UP = (1 << 6),
TIPC_NOTIFY_LINK_DOWN = (1 << 7),
- TIPC_NAMED_MSG_EVT = (1 << 8),
TIPC_BCAST_MSG_EVT = (1 << 9),
TIPC_BCAST_RESET = (1 << 10)
};
@@ -85,10 +81,17 @@ struct tipc_node_bclink {
u32 deferred_size;
struct sk_buff_head deferdq;
struct sk_buff *reasm_buf;
- int inputq_map;
+ struct sk_buff_head namedq;
bool recv_permitted;
};
+struct tipc_link_entry {
+ struct tipc_link *link;
+ u32 mtu;
+ struct sk_buff_head inputq;
+ struct tipc_media_addr maddr;
+};
+
/**
* struct tipc_node - TIPC node structure
* @addr: network address of node
@@ -98,11 +101,12 @@ struct tipc_node_bclink {
* @hash: links to adjacent nodes in unsorted hash chain
* @inputq: pointer to input queue containing messages for msg event
* @namedq: pointer to name table input queue with name table messages
- * @curr_link: the link holding the node lock, if any
- * @active_links: pointers to active links to node
- * @links: pointers to all links to node
+ * @active_links: bearer ids of active links, used as index into links[] array
+ * @links: array containing references to all links to node
* @action_flags: bit mask of different types of node actions
* @bclink: broadcast-related info
+ * @state: connectivity state vs peer node
+ * @sync_point: sequence number where synch/failover is finished
* @list: links to adjacent nodes in sorted list of cluster's nodes
* @working_links: number of working links to node (both active and standby)
* @link_cnt: number of links to node
@@ -118,14 +122,13 @@ struct tipc_node {
spinlock_t lock;
struct net *net;
struct hlist_node hash;
- struct sk_buff_head *inputq;
- struct sk_buff_head *namedq;
- struct tipc_link *active_links[2];
- u32 act_mtus[2];
- struct tipc_link *links[MAX_BEARERS];
+ int active_links[2];
+ struct tipc_link_entry links[MAX_BEARERS];
int action_flags;
struct tipc_node_bclink bclink;
struct list_head list;
+ int state;
+ u16 sync_point;
int link_cnt;
u16 working_links;
u16 capabilities;
@@ -133,25 +136,32 @@ struct tipc_node {
u32 link_id;
struct list_head publ_list;
struct list_head conn_sks;
+ unsigned long keepalive_intv;
+ struct timer_list timer;
struct rcu_head rcu;
};
struct tipc_node *tipc_node_find(struct net *net, u32 addr);
void tipc_node_put(struct tipc_node *node);
-struct tipc_node *tipc_node_create(struct net *net, u32 addr);
void tipc_node_stop(struct net *net);
+void tipc_node_check_dest(struct net *net, u32 onode,
+ struct tipc_bearer *bearer,
+ u16 capabilities, u32 signature,
+ struct tipc_media_addr *maddr,
+ bool *respond, bool *dupl_addr);
+void tipc_node_delete_links(struct net *net, int bearer_id);
void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
-void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
-void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
-int tipc_node_active_links(struct tipc_node *n_ptr);
-int tipc_node_is_up(struct tipc_node *n_ptr);
+bool tipc_node_is_up(struct tipc_node *n);
int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node,
char *linkname, size_t len);
void tipc_node_unlock(struct tipc_node *node);
+int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
+ int selector);
+int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest,
+ u32 selector);
int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port);
void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port);
-
int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb);
static inline void tipc_node_lock(struct tipc_node *node)
@@ -159,26 +169,30 @@ static inline void tipc_node_lock(struct tipc_node *node)
spin_lock_bh(&node->lock);
}
-static inline bool tipc_node_blocked(struct tipc_node *node)
+static inline struct tipc_link *node_active_link(struct tipc_node *n, int sel)
{
- return (node->action_flags & (TIPC_WAIT_PEER_LINKS_DOWN |
- TIPC_NOTIFY_NODE_DOWN | TIPC_WAIT_OWN_LINKS_DOWN));
+ int bearer_id = n->active_links[sel & 1];
+
+ if (unlikely(bearer_id == INVALID_BEARER_ID))
+ return NULL;
+
+ return n->links[bearer_id].link;
}
-static inline uint tipc_node_get_mtu(struct net *net, u32 addr, u32 selector)
+static inline unsigned int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
{
- struct tipc_node *node;
- u32 mtu;
-
- node = tipc_node_find(net, addr);
+ struct tipc_node *n;
+ int bearer_id;
+ unsigned int mtu = MAX_MSG_SIZE;
- if (likely(node)) {
- mtu = node->act_mtus[selector & 1];
- tipc_node_put(node);
- } else {
- mtu = MAX_MSG_SIZE;
- }
+ n = tipc_node_find(net, addr);
+ if (unlikely(!n))
+ return mtu;
+ bearer_id = n->active_links[sel & 1];
+ if (likely(bearer_id != INVALID_BEARER_ID))
+ mtu = n->links[bearer_id].mtu;
+ tipc_node_put(n);
return mtu;
}
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 3a7567f..1060d52 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -248,6 +248,22 @@ static void tsk_advance_rx_queue(struct sock *sk)
kfree_skb(__skb_dequeue(&sk->sk_receive_queue));
}
+/* tipc_sk_respond() : send response message back to sender
+ */
+static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err)
+{
+ u32 selector;
+ u32 dnode;
+ u32 onode = tipc_own_addr(sock_net(sk));
+
+ if (!tipc_msg_reverse(onode, &skb, err))
+ return;
+
+ dnode = msg_destnode(buf_msg(skb));
+ selector = msg_origport(buf_msg(skb));
+ tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector);
+}
+
/**
* tsk_rej_rx_queue - reject all buffers in socket receive queue
*
@@ -256,13 +272,9 @@ static void tsk_advance_rx_queue(struct sock *sk)
static void tsk_rej_rx_queue(struct sock *sk)
{
struct sk_buff *skb;
- u32 dnode;
- u32 own_node = tsk_own_node(tipc_sk(sk));
- while ((skb = __skb_dequeue(&sk->sk_receive_queue))) {
- if (tipc_msg_reverse(own_node, skb, &dnode, TIPC_ERR_NO_PORT))
- tipc_link_xmit_skb(sock_net(sk), skb, dnode, 0);
- }
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)))
+ tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT);
}
/* tsk_peer_msg - verify if message was sent by connected port's peer
@@ -441,9 +453,7 @@ static int tipc_release(struct socket *sock)
tsk->connected = 0;
tipc_node_remove_conn(net, dnode, tsk->portid);
}
- if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode,
- TIPC_ERR_NO_PORT))
- tipc_link_xmit_skb(net, skb, dnode, 0);
+ tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT);
}
}
@@ -456,7 +466,7 @@ static int tipc_release(struct socket *sock)
tsk_own_node(tsk), tsk_peer_port(tsk),
tsk->portid, TIPC_ERR_NO_PORT);
if (skb)
- tipc_link_xmit_skb(net, skb, dnode, tsk->portid);
+ tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
tipc_node_remove_conn(net, dnode, tsk->portid);
}
@@ -686,21 +696,22 @@ new_mtu:
do {
rc = tipc_bclink_xmit(net, pktchain);
- if (likely(rc >= 0)) {
- rc = dsz;
- break;
+ if (likely(!rc))
+ return dsz;
+
+ if (rc == -ELINKCONG) {
+ tsk->link_cong = 1;
+ rc = tipc_wait_for_sndmsg(sock, &timeo);
+ if (!rc)
+ continue;
}
+ __skb_queue_purge(pktchain);
if (rc == -EMSGSIZE) {
msg->msg_iter = save;
goto new_mtu;
}
- if (rc != -ELINKCONG)
- break;
- tipc_sk(sk)->link_cong = 1;
- rc = tipc_wait_for_sndmsg(sock, &timeo);
- if (rc)
- __skb_queue_purge(pktchain);
- } while (!rc);
+ break;
+ } while (1);
return rc;
}
@@ -763,35 +774,35 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
/**
* tipc_sk_proto_rcv - receive a connection mng protocol message
* @tsk: receiving socket
- * @skb: pointer to message buffer. Set to NULL if buffer is consumed.
+ * @skb: pointer to message buffer.
*/
-static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff **skb)
+static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb)
{
- struct tipc_msg *msg = buf_msg(*skb);
+ struct sock *sk = &tsk->sk;
+ struct tipc_msg *hdr = buf_msg(skb);
+ int mtyp = msg_type(hdr);
int conn_cong;
- u32 dnode;
- u32 own_node = tsk_own_node(tsk);
+
/* Ignore if connection cannot be validated: */
- if (!tsk_peer_msg(tsk, msg))
+ if (!tsk_peer_msg(tsk, hdr))
goto exit;
tsk->probing_state = TIPC_CONN_OK;
- if (msg_type(msg) == CONN_ACK) {
+ if (mtyp == CONN_PROBE) {
+ msg_set_type(hdr, CONN_PROBE_REPLY);
+ tipc_sk_respond(sk, skb, TIPC_OK);
+ return;
+ } else if (mtyp == CONN_ACK) {
conn_cong = tsk_conn_cong(tsk);
- tsk->sent_unacked -= msg_msgcnt(msg);
+ tsk->sent_unacked -= msg_msgcnt(hdr);
if (conn_cong)
- tsk->sk.sk_write_space(&tsk->sk);
- } else if (msg_type(msg) == CONN_PROBE) {
- if (tipc_msg_reverse(own_node, *skb, &dnode, TIPC_OK)) {
- msg_set_type(msg, CONN_PROBE_REPLY);
- return;
- }
+ sk->sk_write_space(sk);
+ } else if (mtyp != CONN_PROBE_REPLY) {
+ pr_warn("Received unknown CONN_PROTO msg\n");
}
- /* Do nothing if msg_type() == CONN_PROBE_REPLY */
exit:
- kfree_skb(*skb);
- *skb = NULL;
+ kfree_skb(skb);
}
static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p)
@@ -924,24 +935,25 @@ new_mtu:
do {
skb = skb_peek(pktchain);
TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong;
- rc = tipc_link_xmit(net, pktchain, dnode, tsk->portid);
- if (likely(rc >= 0)) {
+ rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid);
+ if (likely(!rc)) {
if (sock->state != SS_READY)
sock->state = SS_CONNECTING;
- rc = dsz;
- break;
+ return dsz;
}
+ if (rc == -ELINKCONG) {
+ tsk->link_cong = 1;
+ rc = tipc_wait_for_sndmsg(sock, &timeo);
+ if (!rc)
+ continue;
+ }
+ __skb_queue_purge(pktchain);
if (rc == -EMSGSIZE) {
m->msg_iter = save;
goto new_mtu;
}
- if (rc != -ELINKCONG)
- break;
- tsk->link_cong = 1;
- rc = tipc_wait_for_sndmsg(sock, &timeo);
- if (rc)
- __skb_queue_purge(pktchain);
- } while (!rc);
+ break;
+ } while (1);
return rc;
}
@@ -1043,15 +1055,16 @@ next:
return rc;
do {
if (likely(!tsk_conn_cong(tsk))) {
- rc = tipc_link_xmit(net, pktchain, dnode, portid);
+ rc = tipc_node_xmit(net, pktchain, dnode, portid);
if (likely(!rc)) {
tsk->sent_unacked++;
sent += send;
if (sent == dsz)
- break;
+ return dsz;
goto next;
}
if (rc == -EMSGSIZE) {
+ __skb_queue_purge(pktchain);
tsk->max_pkt = tipc_node_get_mtu(net, dnode,
portid);
m->msg_iter = save;
@@ -1059,13 +1072,13 @@ next:
}
if (rc != -ELINKCONG)
break;
+
tsk->link_cong = 1;
}
rc = tipc_wait_for_sndpkt(sock, &timeo);
- if (rc)
- __skb_queue_purge(pktchain);
} while (!rc);
+ __skb_queue_purge(pktchain);
return sent ? sent : rc;
}
@@ -1221,7 +1234,7 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack)
return;
msg = buf_msg(skb);
msg_set_msgcnt(msg, ack);
- tipc_link_xmit_skb(net, skb, dnode, msg_link_selector(msg));
+ tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg));
}
static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
@@ -1507,82 +1520,81 @@ static void tipc_data_ready(struct sock *sk)
* @tsk: TIPC socket
* @skb: pointer to message buffer. Set to NULL if buffer is consumed
*
- * Returns 0 (TIPC_OK) if everything ok, -TIPC_ERR_NO_PORT otherwise
+ * Returns true if everything ok, false otherwise
*/
-static int filter_connect(struct tipc_sock *tsk, struct sk_buff **skb)
+static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
{
struct sock *sk = &tsk->sk;
struct net *net = sock_net(sk);
struct socket *sock = sk->sk_socket;
- struct tipc_msg *msg = buf_msg(*skb);
- int retval = -TIPC_ERR_NO_PORT;
+ struct tipc_msg *hdr = buf_msg(skb);
- if (msg_mcast(msg))
- return retval;
+ if (unlikely(msg_mcast(hdr)))
+ return false;
switch ((int)sock->state) {
case SS_CONNECTED:
+
/* Accept only connection-based messages sent by peer */
- if (tsk_peer_msg(tsk, msg)) {
- if (unlikely(msg_errcode(msg))) {
- sock->state = SS_DISCONNECTING;
- tsk->connected = 0;
- /* let timer expire on it's own */
- tipc_node_remove_conn(net, tsk_peer_node(tsk),
- tsk->portid);
- }
- retval = TIPC_OK;
+ if (unlikely(!tsk_peer_msg(tsk, hdr)))
+ return false;
+
+ if (unlikely(msg_errcode(hdr))) {
+ sock->state = SS_DISCONNECTING;
+ tsk->connected = 0;
+ /* Let timer expire on it's own */
+ tipc_node_remove_conn(net, tsk_peer_node(tsk),
+ tsk->portid);
}
- break;
+ return true;
+
case SS_CONNECTING:
- /* Accept only ACK or NACK message */
- if (unlikely(!msg_connected(msg)))
- break;
+ /* Accept only ACK or NACK message */
+ if (unlikely(!msg_connected(hdr)))
+ return false;
- if (unlikely(msg_errcode(msg))) {
+ if (unlikely(msg_errcode(hdr))) {
sock->state = SS_DISCONNECTING;
sk->sk_err = ECONNREFUSED;
- retval = TIPC_OK;
- break;
+ return true;
}
- if (unlikely(msg_importance(msg) > TIPC_CRITICAL_IMPORTANCE)) {
+ if (unlikely(!msg_isdata(hdr))) {
sock->state = SS_DISCONNECTING;
sk->sk_err = EINVAL;
- retval = TIPC_OK;
- break;
+ return true;
}
- tipc_sk_finish_conn(tsk, msg_origport(msg), msg_orignode(msg));
- msg_set_importance(&tsk->phdr, msg_importance(msg));
+ tipc_sk_finish_conn(tsk, msg_origport(hdr), msg_orignode(hdr));
+ msg_set_importance(&tsk->phdr, msg_importance(hdr));
sock->state = SS_CONNECTED;
- /* If an incoming message is an 'ACK-', it should be
- * discarded here because it doesn't contain useful
- * data. In addition, we should try to wake up
- * connect() routine if sleeping.
- */
- if (msg_data_sz(msg) == 0) {
- kfree_skb(*skb);
- *skb = NULL;
- if (waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible(sk_sleep(sk));
- }
- retval = TIPC_OK;
- break;
+ /* If 'ACK+' message, add to socket receive queue */
+ if (msg_data_sz(hdr))
+ return true;
+
+ /* If empty 'ACK-' message, wake up sleeping connect() */
+ if (waitqueue_active(sk_sleep(sk)))
+ wake_up_interruptible(sk_sleep(sk));
+
+ /* 'ACK-' message is neither accepted nor rejected: */
+ msg_set_dest_droppable(hdr, 1);
+ return false;
+
case SS_LISTENING:
case SS_UNCONNECTED:
+
/* Accept only SYN message */
- if (!msg_connected(msg) && !(msg_errcode(msg)))
- retval = TIPC_OK;
+ if (!msg_connected(hdr) && !(msg_errcode(hdr)))
+ return true;
break;
case SS_DISCONNECTING:
break;
default:
pr_err("Unknown socket state %u\n", sock->state);
}
- return retval;
+ return false;
}
/**
@@ -1617,61 +1629,70 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf)
/**
* filter_rcv - validate incoming message
* @sk: socket
- * @skb: pointer to message. Set to NULL if buffer is consumed.
+ * @skb: pointer to message.
*
* Enqueues message on receive queue if acceptable; optionally handles
* disconnect indication for a connected socket.
*
* Called with socket lock already taken
*
- * Returns 0 (TIPC_OK) if message was ok, -TIPC error code if rejected
+ * Returns true if message was added to socket receive queue, otherwise false
*/
-static int filter_rcv(struct sock *sk, struct sk_buff **skb)
+static bool filter_rcv(struct sock *sk, struct sk_buff *skb)
{
struct socket *sock = sk->sk_socket;
struct tipc_sock *tsk = tipc_sk(sk);
- struct tipc_msg *msg = buf_msg(*skb);
- unsigned int limit = rcvbuf_limit(sk, *skb);
- int rc = TIPC_OK;
+ struct tipc_msg *hdr = buf_msg(skb);
+ unsigned int limit = rcvbuf_limit(sk, skb);
+ int err = TIPC_OK;
+ int usr = msg_user(hdr);
- if (unlikely(msg_user(msg) == CONN_MANAGER)) {
+ if (unlikely(msg_user(hdr) == CONN_MANAGER)) {
tipc_sk_proto_rcv(tsk, skb);
- return TIPC_OK;
+ return false;
}
- if (unlikely(msg_user(msg) == SOCK_WAKEUP)) {
- kfree_skb(*skb);
+ if (unlikely(usr == SOCK_WAKEUP)) {
+ kfree_skb(skb);
tsk->link_cong = 0;
sk->sk_write_space(sk);
- *skb = NULL;
- return TIPC_OK;
+ return false;
}
- /* Reject message if it is wrong sort of message for socket */
- if (msg_type(msg) > TIPC_DIRECT_MSG)
- return -TIPC_ERR_NO_PORT;
+ /* Drop if illegal message type */
+ if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) {
+ kfree_skb(skb);
+ return false;
+ }
- if (sock->state == SS_READY) {
- if (msg_connected(msg))
- return -TIPC_ERR_NO_PORT;
- } else {
- rc = filter_connect(tsk, skb);
- if (rc != TIPC_OK || !*skb)
- return rc;
+ /* Reject if wrong message type for current socket state */
+ if (unlikely(sock->state == SS_READY)) {
+ if (msg_connected(hdr)) {
+ err = TIPC_ERR_NO_PORT;
+ goto reject;
+ }
+ } else if (unlikely(!filter_connect(tsk, skb))) {
+ err = TIPC_ERR_NO_PORT;
+ goto reject;
}
/* Reject message if there isn't room to queue it */
- if (sk_rmem_alloc_get(sk) + (*skb)->truesize >= limit)
- return -TIPC_ERR_OVERLOAD;
+ if (unlikely(sk_rmem_alloc_get(sk) + skb->truesize >= limit)) {
+ err = TIPC_ERR_OVERLOAD;
+ goto reject;
+ }
/* Enqueue message */
- TIPC_SKB_CB(*skb)->handle = NULL;
- __skb_queue_tail(&sk->sk_receive_queue, *skb);
- skb_set_owner_r(*skb, sk);
+ TIPC_SKB_CB(skb)->handle = NULL;
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, sk);
sk->sk_data_ready(sk);
- *skb = NULL;
- return TIPC_OK;
+ return true;
+
+reject:
+ tipc_sk_respond(sk, skb, err);
+ return false;
}
/**
@@ -1685,22 +1706,10 @@ static int filter_rcv(struct sock *sk, struct sk_buff **skb)
*/
static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
- int err;
- atomic_t *dcnt;
- u32 dnode;
- struct tipc_sock *tsk = tipc_sk(sk);
- struct net *net = sock_net(sk);
- uint truesize = skb->truesize;
+ unsigned int truesize = skb->truesize;
- err = filter_rcv(sk, &skb);
- if (likely(!skb)) {
- dcnt = &tsk->dupl_rcvcnt;
- if (atomic_read(dcnt) < TIPC_CONN_OVERLOAD_LIMIT)
- atomic_add(truesize, dcnt);
- return 0;
- }
- if (!err || tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, -err))
- tipc_link_xmit_skb(net, skb, dnode, tsk->portid);
+ if (likely(filter_rcv(sk, skb)))
+ atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt);
return 0;
}
@@ -1710,45 +1719,43 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
* @inputq: list of incoming buffers with potentially different destinations
* @sk: socket where the buffers should be enqueued
* @dport: port number for the socket
- * @_skb: returned buffer to be forwarded or rejected, if applicable
*
* Caller must hold socket lock
- *
- * Returns TIPC_OK if all buffers enqueued, otherwise -TIPC_ERR_OVERLOAD
- * or -TIPC_ERR_NO_PORT
*/
-static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
- u32 dport, struct sk_buff **_skb)
+static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
+ u32 dport)
{
unsigned int lim;
atomic_t *dcnt;
- int err;
struct sk_buff *skb;
unsigned long time_limit = jiffies + 2;
while (skb_queue_len(inputq)) {
if (unlikely(time_after_eq(jiffies, time_limit)))
- return TIPC_OK;
+ return;
+
skb = tipc_skb_dequeue(inputq, dport);
if (unlikely(!skb))
- return TIPC_OK;
+ return;
+
+ /* Add message directly to receive queue if possible */
if (!sock_owned_by_user(sk)) {
- err = filter_rcv(sk, &skb);
- if (likely(!skb))
- continue;
- *_skb = skb;
- return err;
+ filter_rcv(sk, skb);
+ continue;
}
+
+ /* Try backlog, compensating for double-counted bytes */
dcnt = &tipc_sk(sk)->dupl_rcvcnt;
if (sk->sk_backlog.len)
atomic_set(dcnt, 0);
lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt);
if (likely(!sk_add_backlog(sk, skb, lim)))
continue;
- *_skb = skb;
- return -TIPC_ERR_OVERLOAD;
+
+ /* Overload => reject message back to sender */
+ tipc_sk_respond(sk, skb, TIPC_ERR_OVERLOAD);
+ break;
}
- return TIPC_OK;
}
/**
@@ -1756,49 +1763,46 @@ static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
* @inputq: buffer list containing the buffers
* Consumes all buffers in list until inputq is empty
* Note: may be called in multiple threads referring to the same queue
- * Returns 0 if last buffer was accepted, otherwise -EHOSTUNREACH
- * Only node local calls check the return value, sending single-buffer queues
*/
-int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
+void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
{
u32 dnode, dport = 0;
int err;
- struct sk_buff *skb;
struct tipc_sock *tsk;
- struct tipc_net *tn;
struct sock *sk;
+ struct sk_buff *skb;
while (skb_queue_len(inputq)) {
- err = -TIPC_ERR_NO_PORT;
- skb = NULL;
dport = tipc_skb_peek_port(inputq, dport);
tsk = tipc_sk_lookup(net, dport);
+
if (likely(tsk)) {
sk = &tsk->sk;
if (likely(spin_trylock_bh(&sk->sk_lock.slock))) {
- err = tipc_sk_enqueue(inputq, sk, dport, &skb);
+ tipc_sk_enqueue(inputq, sk, dport);
spin_unlock_bh(&sk->sk_lock.slock);
- dport = 0;
}
sock_put(sk);
- } else {
- skb = tipc_skb_dequeue(inputq, dport);
- }
- if (likely(!skb))
continue;
- if (tipc_msg_lookup_dest(net, skb, &dnode, &err))
- goto xmit;
- if (!err) {
- dnode = msg_destnode(buf_msg(skb));
- goto xmit;
}
- tn = net_generic(net, tipc_net_id);
- if (!tipc_msg_reverse(tn->own_addr, skb, &dnode, -err))
+
+ /* No destination socket => dequeue skb if still there */
+ skb = tipc_skb_dequeue(inputq, dport);
+ if (!skb)
+ return;
+
+ /* Try secondary lookup if unresolved named message */
+ err = TIPC_ERR_NO_PORT;
+ if (tipc_msg_lookup_dest(net, skb, &err))
+ goto xmit;
+
+ /* Prepare for message rejection */
+ if (!tipc_msg_reverse(tipc_own_addr(net), &skb, err))
continue;
xmit:
- tipc_link_xmit_skb(net, skb, dnode, dport);
+ dnode = msg_destnode(buf_msg(skb));
+ tipc_node_xmit_skb(net, skb, dnode, dport);
}
- return err ? -EHOSTUNREACH : 0;
}
static int tipc_wait_for_connect(struct socket *sock, long *timeo_p)
@@ -2067,7 +2071,10 @@ static int tipc_shutdown(struct socket *sock, int how)
struct net *net = sock_net(sk);
struct tipc_sock *tsk = tipc_sk(sk);
struct sk_buff *skb;
- u32 dnode;
+ u32 dnode = tsk_peer_node(tsk);
+ u32 dport = tsk_peer_port(tsk);
+ u32 onode = tipc_own_addr(net);
+ u32 oport = tsk->portid;
int res;
if (how != SHUT_RDWR)
@@ -2080,6 +2087,8 @@ static int tipc_shutdown(struct socket *sock, int how)
case SS_CONNECTED:
restart:
+ dnode = tsk_peer_node(tsk);
+
/* Disconnect and send a 'FIN+' or 'FIN-' message to peer */
skb = __skb_dequeue(&sk->sk_receive_queue);
if (skb) {
@@ -2087,19 +2096,13 @@ restart:
kfree_skb(skb);
goto restart;
}
- if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode,
- TIPC_CONN_SHUTDOWN))
- tipc_link_xmit_skb(net, skb, dnode,
- tsk->portid);
+ tipc_sk_respond(sk, skb, TIPC_CONN_SHUTDOWN);
} else {
- dnode = tsk_peer_node(tsk);
-
skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
TIPC_CONN_MSG, SHORT_H_SIZE,
- 0, dnode, tsk_own_node(tsk),
- tsk_peer_port(tsk),
- tsk->portid, TIPC_CONN_SHUTDOWN);
- tipc_link_xmit_skb(net, skb, dnode, tsk->portid);
+ 0, dnode, onode, dport, oport,
+ TIPC_CONN_SHUTDOWN);
+ tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
}
tsk->connected = 0;
sock->state = SS_DISCONNECTING;
@@ -2161,7 +2164,7 @@ static void tipc_sk_timeout(unsigned long data)
}
bh_unlock_sock(sk);
if (skb)
- tipc_link_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid);
+ tipc_node_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid);
exit:
sock_put(sk);
}
diff --git a/net/tipc/socket.h b/net/tipc/socket.h
index bf65513..4241f22 100644
--- a/net/tipc/socket.h
+++ b/net/tipc/socket.h
@@ -44,7 +44,7 @@
SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE))
int tipc_socket_init(void);
void tipc_socket_stop(void);
-int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq);
+void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq);
void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
struct sk_buff_head *inputq);
void tipc_sk_reinit(struct net *net);
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 66deebc..c170d31 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -194,7 +194,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
.saddr = src->ipv6,
.flowi6_proto = IPPROTO_UDP
};
- err = ipv6_stub->ipv6_dst_lookup(ub->ubsock->sk, &ndst, &fl6);
+ err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, &ndst,
+ &fl6);
if (err)
goto tx_error;
ttl = ip6_dst_hoplimit(ndst);
diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c
index 7d73054..477364a 100644
--- a/net/wimax/op-rfkill.c
+++ b/net/wimax/op-rfkill.c
@@ -135,8 +135,7 @@ EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw);
* @state: New state of the RF kill switch. %WIMAX_RF_ON radio on,
* %WIMAX_RF_OFF radio off.
*
- * Reports changes in the software RF switch state to the the WiMAX
- * stack.
+ * Reports changes in the software RF switch state to the WiMAX stack.
*
* The main use is during initialization, so the driver can query the
* device for its current software radio kill switch state and feed it
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index bd16c6c..0cebf1f 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2048,7 +2048,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
xfrm_audit_policy_delete(xp, 1, true);
} else {
// reset the timers here?
- WARN(1, "Dont know what to do with soft policy expire\n");
+ WARN(1, "Don't know what to do with soft policy expire\n");
}
km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid);