summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig18
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c53
-rw-r--r--net/ipv4/ah4.c3
-rw-r--r--net/ipv4/arp.c31
-rw-r--r--net/ipv4/devinet.c332
-rw-r--r--net/ipv4/esp4.c6
-rw-r--r--net/ipv4/fib_frontend.c27
-rw-r--r--net/ipv4/fib_semantics.c23
-rw-r--r--net/ipv4/fib_trie.c64
-rw-r--r--net/ipv4/gre.c109
-rw-r--r--net/ipv4/icmp.c27
-rw-r--r--net/ipv4/igmp.c11
-rw-r--r--net/ipv4/inet_connection_sock.c58
-rw-r--r--net/ipv4/inet_diag.c10
-rw-r--r--net/ipv4/inet_fragment.c148
-rw-r--r--net/ipv4/inet_hashtables.c36
-rw-r--r--net/ipv4/inet_lro.c5
-rw-r--r--net/ipv4/inet_timewait_sock.c7
-rw-r--r--net/ipv4/ip_fragment.c108
-rw-r--r--net/ipv4/ip_gre.c1509
-rw-r--r--net/ipv4/ip_input.c17
-rw-r--r--net/ipv4/ip_options.c7
-rw-r--r--net/ipv4/ip_output.c6
-rw-r--r--net/ipv4/ip_tunnel.c1035
-rw-r--r--net/ipv4/ip_vti.c45
-rw-r--r--net/ipv4/ipcomp.c2
-rw-r--r--net/ipv4/ipconfig.c18
-rw-r--r--net/ipv4/ipip.c746
-rw-r--r--net/ipv4/ipmr.c142
-rw-r--r--net/ipv4/netfilter.c15
-rw-r--r--net/ipv4/netfilter/Kconfig19
-rw-r--r--net/ipv4/netfilter/arp_tables.c11
-rw-r--r--net/ipv4/netfilter/arptable_filter.c4
-rw-r--r--net/ipv4/netfilter/ip_tables.c20
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c10
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c169
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c8
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c9
-rw-r--r--net/ipv4/netfilter/iptable_nat.c23
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c93
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c15
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c1
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c2
-rw-r--r--net/ipv4/ping.c13
-rw-r--r--net/ipv4/proc.c30
-rw-r--r--net/ipv4/protocol.c6
-rw-r--r--net/ipv4/raw.c12
-rw-r--r--net/ipv4/route.c22
-rw-r--r--net/ipv4/syncookies.c14
-rw-r--r--net/ipv4/sysctl_net_ipv4.c56
-rw-r--r--net/ipv4/tcp.c340
-rw-r--r--net/ipv4/tcp_cong.c30
-rw-r--r--net/ipv4/tcp_input.c730
-rw-r--r--net/ipv4/tcp_ipv4.c171
-rw-r--r--net/ipv4/tcp_memcontrol.c3
-rw-r--r--net/ipv4/tcp_metrics.c15
-rw-r--r--net/ipv4/tcp_minisocks.c55
-rw-r--r--net/ipv4/tcp_output.c430
-rw-r--r--net/ipv4/tcp_probe.c6
-rw-r--r--net/ipv4/tcp_timer.c21
-rw-r--r--net/ipv4/tcp_westwood.c2
-rw-r--r--net/ipv4/udp.c212
-rw-r--r--net/ipv4/udp_diag.c10
-rw-r--r--net/ipv4/xfrm4_input.c2
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c12
-rw-r--r--net/ipv4/xfrm4_policy.c58
70 files changed, 3504 insertions, 3762 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 5a19aeb..8603ca8 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -166,6 +166,7 @@ config IP_PNP_RARP
config NET_IPIP
tristate "IP: tunneling"
select INET_TUNNEL
+ select NET_IP_TUNNEL
---help---
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -186,9 +187,14 @@ config NET_IPGRE_DEMUX
This is helper module to demultiplex GRE packets on GRE version field criteria.
Required by ip_gre and pptp modules.
+config NET_IP_TUNNEL
+ tristate
+ default n
+
config NET_IPGRE
tristate "IP: GRE tunnels over IP"
depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
+ select NET_IP_TUNNEL
help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -313,6 +319,7 @@ config SYN_COOKIES
config NET_IPVTI
tristate "Virtual (secure) IP: tunneling"
select INET_TUNNEL
+ select NET_IP_TUNNEL
depends on INET_XFRM_MODE_TUNNEL
---help---
Tunneling means encapsulating data of one protocol type within
@@ -488,7 +495,6 @@ config TCP_CONG_HTCP
config TCP_CONG_HSTCP
tristate "High Speed TCP"
- depends on EXPERIMENTAL
default n
---help---
Sally Floyd's High Speed TCP (RFC 3649) congestion control.
@@ -499,7 +505,6 @@ config TCP_CONG_HSTCP
config TCP_CONG_HYBLA
tristate "TCP-Hybla congestion control algorithm"
- depends on EXPERIMENTAL
default n
---help---
TCP-Hybla is a sender-side only change that eliminates penalization of
@@ -509,7 +514,6 @@ config TCP_CONG_HYBLA
config TCP_CONG_VEGAS
tristate "TCP Vegas"
- depends on EXPERIMENTAL
default n
---help---
TCP Vegas is a sender-side only change to TCP that anticipates
@@ -520,7 +524,6 @@ config TCP_CONG_VEGAS
config TCP_CONG_SCALABLE
tristate "Scalable TCP"
- depends on EXPERIMENTAL
default n
---help---
Scalable TCP is a sender-side only change to TCP which uses a
@@ -530,7 +533,6 @@ config TCP_CONG_SCALABLE
config TCP_CONG_LP
tristate "TCP Low Priority"
- depends on EXPERIMENTAL
default n
---help---
TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
@@ -540,7 +542,6 @@ config TCP_CONG_LP
config TCP_CONG_VENO
tristate "TCP Veno"
- depends on EXPERIMENTAL
default n
---help---
TCP Veno is a sender-side only enhancement of TCP to obtain better
@@ -552,7 +553,6 @@ config TCP_CONG_VENO
config TCP_CONG_YEAH
tristate "YeAH TCP"
- depends on EXPERIMENTAL
select TCP_CONG_VEGAS
default n
---help---
@@ -567,7 +567,6 @@ config TCP_CONG_YEAH
config TCP_CONG_ILLINOIS
tristate "TCP Illinois"
- depends on EXPERIMENTAL
default n
---help---
TCP-Illinois is a sender-side modification of TCP Reno for
@@ -631,8 +630,7 @@ config DEFAULT_TCP_CONG
default "cubic"
config TCP_MD5SIG
- bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "TCP: MD5 Signature Option support (RFC2385)"
select CRYPTO
select CRYPTO_MD5
---help---
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 15ca63e..089cb9f 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -13,6 +13,7 @@ obj-y := route.o inetpeer.o protocol.o \
fib_frontend.o fib_semantics.o fib_trie.o \
inet_fragment.o ping.o
+obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 24b384b..d01be2a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -111,10 +111,10 @@
#include <net/sock.h>
#include <net/raw.h>
#include <net/icmp.h>
-#include <net/ipip.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
+#include <net/secure_seq.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
@@ -248,8 +248,12 @@ EXPORT_SYMBOL(inet_listen);
u32 inet_ehash_secret __read_mostly;
EXPORT_SYMBOL(inet_ehash_secret);
+u32 ipv6_hash_secret __read_mostly;
+EXPORT_SYMBOL(ipv6_hash_secret);
+
/*
- * inet_ehash_secret must be set exactly once
+ * inet_ehash_secret must be set exactly once, and to a non nul value
+ * ipv6_hash_secret must be set exactly once.
*/
void build_ehash_secret(void)
{
@@ -259,24 +263,12 @@ void build_ehash_secret(void)
get_random_bytes(&rnd, sizeof(rnd));
} while (rnd == 0);
- cmpxchg(&inet_ehash_secret, 0, rnd);
-}
-EXPORT_SYMBOL(build_ehash_secret);
-
-static inline int inet_netns_ok(struct net *net, __u8 protocol)
-{
- const struct net_protocol *ipprot;
-
- if (net_eq(net, &init_net))
- return 1;
-
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot == NULL) {
- /* raw IP is OK */
- return 1;
+ if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) {
+ get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
+ net_secret_init();
}
- return ipprot->netns_ok;
}
+EXPORT_SYMBOL(build_ehash_secret);
/*
* Create an inet socket.
@@ -350,10 +342,6 @@ lookup_protocol:
!ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
- err = -EAFNOSUPPORT;
- if (!inet_netns_ok(net, protocol))
- goto out_rcu_unlock;
-
sock->ops = answer->ops;
answer_prot = answer->prot;
answer_no_check = answer->no_check;
@@ -1297,15 +1285,16 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
int ihl;
int id;
unsigned int offset = 0;
-
- if (!(features & NETIF_F_V4_CSUM))
- features &= ~NETIF_F_SG;
+ bool tunnel;
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_TCPV4 |
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP_TUNNEL |
0)))
goto out;
@@ -1320,6 +1309,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
+ tunnel = !!skb->encapsulation;
+
__skb_pull(skb, ihl);
skb_reset_transport_header(skb);
iph = ip_hdr(skb);
@@ -1333,20 +1324,21 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
segs = ops->callbacks.gso_segment(skb, features);
rcu_read_unlock();
- if (!segs || IS_ERR(segs))
+ if (IS_ERR_OR_NULL(segs))
goto out;
skb = segs;
do {
iph = ip_hdr(skb);
- if (proto == IPPROTO_UDP) {
+ if (!tunnel && proto == IPPROTO_UDP) {
iph->id = htons(id);
iph->frag_off = htons(offset >> 3);
if (skb->next != NULL)
iph->frag_off |= htons(IP_MF);
offset += (skb->len - skb->mac_len - iph->ihl * 4);
- } else
+ } else {
iph->id = htons(id++);
+ }
iph->tot_len = htons(skb->len - skb->mac_len);
iph->check = 0;
iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
@@ -1590,7 +1582,7 @@ static const struct net_offload udp_offload = {
static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
- .err_handler = ping_err,
+ .err_handler = icmp_err,
.no_policy = 1,
.netns_ok = 1,
};
@@ -1705,12 +1697,11 @@ static struct packet_type ip_packet_type __read_mostly = {
static int __init inet_init(void)
{
- struct sk_buff *dummy_skb;
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
- BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
+ BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
if (!sysctl_local_reserved_ports)
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index a69b4e4..2e7f194 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -321,8 +321,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
/* We are going to _remove_ AH header to keep sockets happy,
* so... Later this can change. */
- if (skb_cloned(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
goto out;
skb->ip_summed = CHECKSUM_NONE;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index ded146b..247ec19 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -654,11 +654,19 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
arp_ptr += dev->addr_len;
memcpy(arp_ptr, &src_ip, 4);
arp_ptr += 4;
- if (target_hw != NULL)
- memcpy(arp_ptr, target_hw, dev->addr_len);
- else
- memset(arp_ptr, 0, dev->addr_len);
- arp_ptr += dev->addr_len;
+
+ switch (dev->type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ if (target_hw != NULL)
+ memcpy(arp_ptr, target_hw, dev->addr_len);
+ else
+ memset(arp_ptr, 0, dev->addr_len);
+ arp_ptr += dev->addr_len;
+ }
memcpy(arp_ptr, &dest_ip, 4);
return skb;
@@ -781,7 +789,14 @@ static int arp_process(struct sk_buff *skb)
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
arp_ptr += 4;
- arp_ptr += dev->addr_len;
+ switch (dev_type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ arp_ptr += dev->addr_len;
+ }
memcpy(&tip, arp_ptr, 4);
/*
* Check for bad requests for 127.x.x.x and requests for multicast
@@ -1405,14 +1420,14 @@ static const struct file_operations arp_seq_fops = {
static int __net_init arp_net_init(struct net *net)
{
- if (!proc_net_fops_create(net, "arp", S_IRUGO, &arp_seq_fops))
+ if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops))
return -ENOMEM;
return 0;
}
static void __net_exit arp_net_exit(struct net *net)
{
- proc_net_remove(net, "arp");
+ remove_proc_entry("arp", net->proc_net);
}
static struct pernet_operations arp_net_ops = {
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a8e4f26..dfc39d4 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -63,6 +63,7 @@
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
+#include <net/addrconf.h>
#include "fib_lookup.h"
@@ -93,6 +94,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_ADDRESS] = { .type = NLA_U32 },
[IFA_BROADCAST] = { .type = NLA_U32 },
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
};
#define IN4_ADDR_HSIZE_SHIFT 8
@@ -137,10 +139,9 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
u32 hash = inet_addr_hash(net, addr);
struct net_device *result = NULL;
struct in_ifaddr *ifa;
- struct hlist_node *node;
rcu_read_lock();
- hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
+ hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) {
if (ifa->ifa_local == addr) {
struct net_device *dev = ifa->ifa_dev->dev;
@@ -417,6 +418,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
}
+static void check_lifetime(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
+
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
u32 portid)
{
@@ -462,6 +467,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
inet_hash_insert(dev_net(in_dev->dev), ifa);
+ cancel_delayed_work(&check_lifetime_work);
+ schedule_delayed_work(&check_lifetime_work, 0);
+
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
listeners of netlink will know about new ifaddr */
@@ -528,7 +536,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
return NULL;
}
-static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[IFA_MAX+1];
@@ -573,7 +581,131 @@ errout:
return err;
}
-static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
+#define INFINITY_LIFE_TIME 0xFFFFFFFF
+
+static void check_lifetime(struct work_struct *work)
+{
+ unsigned long now, next, next_sec, next_sched;
+ struct in_ifaddr *ifa;
+ struct hlist_node *n;
+ int i;
+
+ now = jiffies;
+ next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++) {
+ bool change_needed = false;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) {
+ unsigned long age;
+
+ if (ifa->ifa_flags & IFA_F_PERMANENT)
+ continue;
+
+ /* We try to batch several events at once. */
+ age = (now - ifa->ifa_tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_valid_lft) {
+ change_needed = true;
+ } else if (ifa->ifa_preferred_lft ==
+ INFINITY_LIFE_TIME) {
+ continue;
+ } else if (age >= ifa->ifa_preferred_lft) {
+ if (time_before(ifa->ifa_tstamp +
+ ifa->ifa_valid_lft * HZ, next))
+ next = ifa->ifa_tstamp +
+ ifa->ifa_valid_lft * HZ;
+
+ if (!(ifa->ifa_flags & IFA_F_DEPRECATED))
+ change_needed = true;
+ } else if (time_before(ifa->ifa_tstamp +
+ ifa->ifa_preferred_lft * HZ,
+ next)) {
+ next = ifa->ifa_tstamp +
+ ifa->ifa_preferred_lft * HZ;
+ }
+ }
+ rcu_read_unlock();
+ if (!change_needed)
+ continue;
+ rtnl_lock();
+ hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) {
+ unsigned long age;
+
+ if (ifa->ifa_flags & IFA_F_PERMANENT)
+ continue;
+
+ /* We try to batch several events at once. */
+ age = (now - ifa->ifa_tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_valid_lft) {
+ struct in_ifaddr **ifap;
+
+ for (ifap = &ifa->ifa_dev->ifa_list;
+ *ifap != NULL; ifap = &(*ifap)->ifa_next) {
+ if (*ifap == ifa) {
+ inet_del_ifa(ifa->ifa_dev,
+ ifap, 1);
+ break;
+ }
+ }
+ } else if (ifa->ifa_preferred_lft !=
+ INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_preferred_lft &&
+ !(ifa->ifa_flags & IFA_F_DEPRECATED)) {
+ ifa->ifa_flags |= IFA_F_DEPRECATED;
+ rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+ }
+ }
+ rtnl_unlock();
+ }
+
+ next_sec = round_jiffies_up(next);
+ next_sched = next;
+
+ /* If rounded timeout is accurate enough, accept it. */
+ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
+ next_sched = next_sec;
+
+ now = jiffies;
+ /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
+ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))
+ next_sched = now + ADDRCONF_TIMER_FUZZ_MAX;
+
+ schedule_delayed_work(&check_lifetime_work, next_sched - now);
+}
+
+static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
+ __u32 prefered_lft)
+{
+ unsigned long timeout;
+
+ ifa->ifa_flags &= ~(IFA_F_PERMANENT | IFA_F_DEPRECATED);
+
+ timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout))
+ ifa->ifa_valid_lft = timeout;
+ else
+ ifa->ifa_flags |= IFA_F_PERMANENT;
+
+ timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ ifa->ifa_flags |= IFA_F_DEPRECATED;
+ ifa->ifa_preferred_lft = timeout;
+ }
+ ifa->ifa_tstamp = jiffies;
+ if (!ifa->ifa_cstamp)
+ ifa->ifa_cstamp = ifa->ifa_tstamp;
+}
+
+static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
+ __u32 *pvalid_lft, __u32 *pprefered_lft)
{
struct nlattr *tb[IFA_MAX+1];
struct in_ifaddr *ifa;
@@ -633,24 +765,77 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (tb[IFA_CACHEINFO]) {
+ struct ifa_cacheinfo *ci;
+
+ ci = nla_data(tb[IFA_CACHEINFO]);
+ if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) {
+ err = -EINVAL;
+ goto errout;
+ }
+ *pvalid_lft = ci->ifa_valid;
+ *pprefered_lft = ci->ifa_prefered;
+ }
+
return ifa;
errout:
return ERR_PTR(err);
}
-static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct in_ifaddr *ifa1, **ifap;
+
+ if (!ifa->ifa_local)
+ return NULL;
+
+ for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
+ ifap = &ifa1->ifa_next) {
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa) &&
+ ifa1->ifa_local == ifa->ifa_local)
+ return ifa1;
+ }
+ return NULL;
+}
+
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct in_ifaddr *ifa;
+ struct in_ifaddr *ifa_existing;
+ __u32 valid_lft = INFINITY_LIFE_TIME;
+ __u32 prefered_lft = INFINITY_LIFE_TIME;
ASSERT_RTNL();
- ifa = rtm_to_ifaddr(net, nlh);
+ ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft);
if (IS_ERR(ifa))
return PTR_ERR(ifa);
- return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
+ ifa_existing = find_matching_ifa(ifa);
+ if (!ifa_existing) {
+ /* It would be best to check for !NLM_F_CREATE here but
+ * userspace alreay relies on not having to provide this.
+ */
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
+ } else {
+ inet_free_ifa(ifa);
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL ||
+ !(nlh->nlmsg_flags & NLM_F_REPLACE))
+ return -EEXIST;
+ ifa = ifa_existing;
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ cancel_delayed_work(&check_lifetime_work);
+ schedule_delayed_work(&check_lifetime_work, 0);
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
+ blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+ }
+ return 0;
}
/*
@@ -852,6 +1037,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
ifa->ifa_prefixlen = 32;
ifa->ifa_mask = inet_make_mask(32);
}
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
ret = inet_set_ifa(dev, ifa);
break;
@@ -1190,6 +1376,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
ifa->ifa_dev = in_dev;
ifa->ifa_scope = RT_SCOPE_HOST;
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,
+ INFINITY_LIFE_TIME);
inet_insert_ifa(ifa);
}
}
@@ -1246,11 +1434,30 @@ static size_t inet_nlmsg_size(void)
+ nla_total_size(IFNAMSIZ); /* IFA_LABEL */
}
+static inline u32 cstamp_delta(unsigned long cstamp)
+{
+ return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
+}
+
+static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
+ unsigned long tstamp, u32 preferred, u32 valid)
+{
+ struct ifa_cacheinfo ci;
+
+ ci.cstamp = cstamp_delta(cstamp);
+ ci.tstamp = cstamp_delta(tstamp);
+ ci.ifa_prefered = preferred;
+ ci.ifa_valid = valid;
+
+ return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
+}
+
static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
u32 portid, u32 seq, int event, unsigned int flags)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
+ u32 preferred, valid;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
if (nlh == NULL)
@@ -1259,10 +1466,31 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
ifm = nlmsg_data(nlh);
ifm->ifa_family = AF_INET;
ifm->ifa_prefixlen = ifa->ifa_prefixlen;
- ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
+ ifm->ifa_flags = ifa->ifa_flags;
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+ if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
+ preferred = ifa->ifa_preferred_lft;
+ valid = ifa->ifa_valid_lft;
+ if (preferred != INFINITY_LIFE_TIME) {
+ long tval = (jiffies - ifa->ifa_tstamp) / HZ;
+
+ if (preferred > tval)
+ preferred -= tval;
+ else
+ preferred = 0;
+ if (valid != INFINITY_LIFE_TIME) {
+ if (valid > tval)
+ valid -= tval;
+ else
+ valid = 0;
+ }
+ }
+ } else {
+ preferred = INFINITY_LIFE_TIME;
+ valid = INFINITY_LIFE_TIME;
+ }
if ((ifa->ifa_address &&
nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) ||
(ifa->ifa_local &&
@@ -1270,7 +1498,9 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
(ifa->ifa_broadcast &&
nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
(ifa->ifa_label[0] &&
- nla_put_string(skb, IFA_LABEL, ifa->ifa_label)))
+ nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
+ put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
+ preferred, valid))
goto nla_put_failure;
return nlmsg_end(skb, nlh);
@@ -1290,7 +1520,6 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
struct in_device *in_dev;
struct in_ifaddr *ifa;
struct hlist_head *head;
- struct hlist_node *node;
s_h = cb->args[0];
s_idx = idx = cb->args[1];
@@ -1300,7 +1529,9 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
idx = 0;
head = &net->dev_index_head[h];
rcu_read_lock();
- hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
if (idx < s_idx)
goto cont;
if (h > s_h || idx > s_idx)
@@ -1320,6 +1551,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_unlock();
goto done;
}
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
}
cont:
idx++;
@@ -1531,8 +1763,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
};
static int inet_netconf_get_devconf(struct sk_buff *in_skb,
- struct nlmsghdr *nlh,
- void *arg)
+ struct nlmsghdr *nlh)
{
struct net *net = sock_net(in_skb->sk);
struct nlattr *tb[NETCONFA_MAX+1];
@@ -1592,6 +1823,77 @@ errout:
return err;
}
+static int inet_netconf_dump_devconf(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ if (inet_netconf_fill_devconf(skb, dev->ifindex,
+ &in_dev->cnf,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF,
+ NLM_F_MULTI,
+ -1) <= 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+ if (h == NETDEV_HASHENTRIES) {
+ if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) <= 0)
+ goto done;
+ else
+ h++;
+ }
+ if (h == NETDEV_HASHENTRIES + 1) {
+ if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) <= 0)
+ goto done;
+ else
+ h++;
+ }
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+
+ return skb->len;
+}
+
#ifdef CONFIG_SYSCTL
static void devinet_copy_dflt_conf(struct net *net, int i)
@@ -1988,12 +2290,14 @@ void __init devinet_init(void)
register_gifconf(PF_INET, inet_gifconf);
register_netdevice_notifier(&ip_netdev_notifier);
+ schedule_delayed_work(&check_lifetime_work, 0);
+
rtnl_af_register(&inet_af_ops);
rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
- NULL, NULL);
+ inet_netconf_dump_devconf, NULL);
}
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 3b4f0cd..4cfe34d 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -139,8 +139,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
/* skb is pure payload to encrypt */
- err = -ENOMEM;
-
esp = x->data;
aead = esp->aead;
alen = crypto_aead_authsize(aead);
@@ -176,8 +174,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
}
tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
- if (!tmp)
+ if (!tmp) {
+ err = -ENOMEM;
goto error;
+ }
seqhi = esp_tmp_seqhi(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 5cd75e2..c7629a2 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -112,7 +112,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct fib_table *tb;
- struct hlist_node *node;
struct hlist_head *head;
unsigned int h;
@@ -122,7 +121,7 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
rcu_read_lock();
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
if (tb->tb_id == id) {
rcu_read_unlock();
return tb;
@@ -137,13 +136,12 @@ static void fib_flush(struct net *net)
{
int flushed = 0;
struct fib_table *tb;
- struct hlist_node *node;
struct hlist_head *head;
unsigned int h;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, node, head, tb_hlist)
+ hlist_for_each_entry(tb, head, tb_hlist)
flushed += fib_table_flush(tb);
}
@@ -606,7 +604,7 @@ errout:
return err;
}
-static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_config cfg;
@@ -628,7 +626,7 @@ errout:
return err;
}
-static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_config cfg;
@@ -656,7 +654,6 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib_table *tb;
- struct hlist_node *node;
struct hlist_head *head;
int dumped = 0;
@@ -670,7 +667,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
e = 0;
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, node, head, tb_hlist) {
+ hlist_for_each_entry(tb, head, tb_hlist) {
if (e < s_e)
goto next;
if (dumped)
@@ -960,8 +957,8 @@ static void nl_fib_input(struct sk_buff *skb)
net = sock_net(skb->sk);
nlh = nlmsg_hdr(skb);
- if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
- nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
+ if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len ||
+ nlmsg_len(nlh) < sizeof(*frn))
return;
skb = skb_clone(skb, GFP_KERNEL);
@@ -969,12 +966,12 @@ static void nl_fib_input(struct sk_buff *skb)
return;
nlh = nlmsg_hdr(skb);
- frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
+ frn = (struct fib_result_nl *) nlmsg_data(nlh);
tb = fib_get_table(net, frn->tb_id_in);
nl_fib_lookup(frn, tb);
- portid = NETLINK_CB(skb).portid; /* pid of sending process */
+ portid = NETLINK_CB(skb).portid; /* netlink portid */
NETLINK_CB(skb).portid = 0; /* from kernel */
NETLINK_CB(skb).dst_group = 0; /* unicast */
netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
@@ -1117,11 +1114,11 @@ static void ip_fib_net_exit(struct net *net)
for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
struct fib_table *tb;
struct hlist_head *head;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
head = &net->ipv4.fib_table_hash[i];
- hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
- hlist_del(node);
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
+ hlist_del(&tb->tb_hlist);
fib_table_flush(tb);
fib_free_table(tb);
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4797a80..8f6cb7a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -298,14 +298,13 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
struct hlist_head *head;
- struct hlist_node *node;
struct fib_info *fi;
unsigned int hash;
hash = fib_info_hashfn(nfi);
head = &fib_info_hash[hash];
- hlist_for_each_entry(fi, node, head, fib_hash) {
+ hlist_for_each_entry(fi, head, fib_hash) {
if (!net_eq(fi->fib_net, nfi->fib_net))
continue;
if (fi->fib_nhs != nfi->fib_nhs)
@@ -331,7 +330,6 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
struct hlist_head *head;
- struct hlist_node *node;
struct fib_nh *nh;
unsigned int hash;
@@ -339,7 +337,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
hash = fib_devindex_hashfn(dev->ifindex);
head = &fib_info_devhash[hash];
- hlist_for_each_entry(nh, node, head, nh_hash) {
+ hlist_for_each_entry(nh, head, nh_hash) {
if (nh->nh_dev == dev &&
nh->nh_gw == gw &&
!(nh->nh_flags & RTNH_F_DEAD)) {
@@ -721,10 +719,10 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
for (i = 0; i < old_size; i++) {
struct hlist_head *head = &fib_info_hash[i];
- struct hlist_node *node, *n;
+ struct hlist_node *n;
struct fib_info *fi;
- hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
+ hlist_for_each_entry_safe(fi, n, head, fib_hash) {
struct hlist_head *dest;
unsigned int new_hash;
@@ -739,10 +737,10 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
for (i = 0; i < old_size; i++) {
struct hlist_head *lhead = &fib_info_laddrhash[i];
- struct hlist_node *node, *n;
+ struct hlist_node *n;
struct fib_info *fi;
- hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
+ hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
struct hlist_head *ldest;
unsigned int new_hash;
@@ -1096,13 +1094,12 @@ int fib_sync_down_addr(struct net *net, __be32 local)
int ret = 0;
unsigned int hash = fib_laddr_hashfn(local);
struct hlist_head *head = &fib_info_laddrhash[hash];
- struct hlist_node *node;
struct fib_info *fi;
if (fib_info_laddrhash == NULL || local == 0)
return 0;
- hlist_for_each_entry(fi, node, head, fib_lhash) {
+ hlist_for_each_entry(fi, head, fib_lhash) {
if (!net_eq(fi->fib_net, net))
continue;
if (fi->fib_prefsrc == local) {
@@ -1120,13 +1117,12 @@ int fib_sync_down_dev(struct net_device *dev, int force)
struct fib_info *prev_fi = NULL;
unsigned int hash = fib_devindex_hashfn(dev->ifindex);
struct hlist_head *head = &fib_info_devhash[hash];
- struct hlist_node *node;
struct fib_nh *nh;
if (force)
scope = -1;
- hlist_for_each_entry(nh, node, head, nh_hash) {
+ hlist_for_each_entry(nh, head, nh_hash) {
struct fib_info *fi = nh->nh_parent;
int dead;
@@ -1232,7 +1228,6 @@ int fib_sync_up(struct net_device *dev)
struct fib_info *prev_fi;
unsigned int hash;
struct hlist_head *head;
- struct hlist_node *node;
struct fib_nh *nh;
int ret;
@@ -1244,7 +1239,7 @@ int fib_sync_up(struct net_device *dev)
head = &fib_info_devhash[hash];
ret = 0;
- hlist_for_each_entry(nh, node, head, nh_hash) {
+ hlist_for_each_entry(nh, head, nh_hash) {
struct fib_info *fi = nh->nh_parent;
int alive;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 31d771c..49616fe 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -125,7 +125,6 @@ struct tnode {
unsigned int empty_children; /* KEYLENGTH bits needed */
union {
struct rcu_head rcu;
- struct work_struct work;
struct tnode *tnode_free;
};
struct rt_trie_node __rcu *child[0];
@@ -383,12 +382,6 @@ static struct tnode *tnode_alloc(size_t size)
return vzalloc(size);
}
-static void __tnode_vfree(struct work_struct *arg)
-{
- struct tnode *tn = container_of(arg, struct tnode, work);
- vfree(tn);
-}
-
static void __tnode_free_rcu(struct rcu_head *head)
{
struct tnode *tn = container_of(head, struct tnode, rcu);
@@ -397,10 +390,8 @@ static void __tnode_free_rcu(struct rcu_head *head)
if (size <= PAGE_SIZE)
kfree(tn);
- else {
- INIT_WORK(&tn->work, __tnode_vfree);
- schedule_work(&tn->work);
- }
+ else
+ vfree(tn);
}
static inline void tnode_free(struct tnode *tn)
@@ -920,10 +911,9 @@ nomem:
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
struct hlist_head *head = &l->list;
- struct hlist_node *node;
struct leaf_info *li;
- hlist_for_each_entry_rcu(li, node, head, hlist)
+ hlist_for_each_entry_rcu(li, head, hlist)
if (li->plen == plen)
return li;
@@ -943,12 +933,11 @@ static inline struct list_head *get_fa_head(struct leaf *l, int plen)
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
struct leaf_info *li = NULL, *last = NULL;
- struct hlist_node *node;
if (hlist_empty(head)) {
hlist_add_head_rcu(&new->hlist, head);
} else {
- hlist_for_each_entry(li, node, head, hlist) {
+ hlist_for_each_entry(li, head, hlist) {
if (new->plen > li->plen)
break;
@@ -1354,9 +1343,8 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
{
struct leaf_info *li;
struct hlist_head *hhead = &l->list;
- struct hlist_node *node;
- hlist_for_each_entry_rcu(li, node, hhead, hlist) {
+ hlist_for_each_entry_rcu(li, hhead, hlist) {
struct fib_alias *fa;
if (l->key != (key & li->mask_plen))
@@ -1740,10 +1728,10 @@ static int trie_flush_leaf(struct leaf *l)
{
int found = 0;
struct hlist_head *lih = &l->list;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
struct leaf_info *li = NULL;
- hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
+ hlist_for_each_entry_safe(li, tmp, lih, hlist) {
found += trie_flush_list(&li->falh);
if (list_empty(&li->falh)) {
@@ -1895,14 +1883,13 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
{
struct leaf_info *li;
- struct hlist_node *node;
int i, s_i;
s_i = cb->args[4];
i = 0;
/* rcu_read_lock is hold by caller */
- hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
if (i < s_i) {
i++;
continue;
@@ -2092,14 +2079,13 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
if (IS_LEAF(n)) {
struct leaf *l = (struct leaf *)n;
struct leaf_info *li;
- struct hlist_node *tmp;
s->leaves++;
s->totdepth += iter.depth;
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
- hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
+ hlist_for_each_entry_rcu(li, &l->list, hlist)
++s->prefixes;
} else {
const struct tnode *tn = (const struct tnode *) n;
@@ -2200,10 +2186,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
- struct hlist_node *node;
struct fib_table *tb;
- hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
struct trie *t = (struct trie *) tb->tb_data;
struct trie_stat stat;
@@ -2245,10 +2230,9 @@ static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
- struct hlist_node *node;
struct fib_table *tb;
- hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
struct rt_trie_node *n;
for (n = fib_trie_get_first(iter,
@@ -2298,7 +2282,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
/* new hash chain */
while (++h < FIB_TABLE_HASHSZ) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
if (n)
goto found;
@@ -2381,13 +2365,12 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
} else {
struct leaf *l = (struct leaf *) n;
struct leaf_info *li;
- struct hlist_node *node;
__be32 val = htonl(l->key);
seq_indent(seq, iter->depth);
seq_printf(seq, " |-- %pI4\n", &val);
- hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
struct fib_alias *fa;
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
@@ -2532,7 +2515,6 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
{
struct leaf *l = v;
struct leaf_info *li;
- struct hlist_node *node;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2541,7 +2523,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
return 0;
}
- hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
struct fib_alias *fa;
__be32 mask, prefix;
@@ -2607,31 +2589,31 @@ static const struct file_operations fib_route_fops = {
int __net_init fib_proc_init(struct net *net)
{
- if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
+ if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
goto out1;
- if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
- &fib_triestat_fops))
+ if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
+ &fib_triestat_fops))
goto out2;
- if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
+ if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
goto out3;
return 0;
out3:
- proc_net_remove(net, "fib_triestat");
+ remove_proc_entry("fib_triestat", net->proc_net);
out2:
- proc_net_remove(net, "fib_trie");
+ remove_proc_entry("fib_trie", net->proc_net);
out1:
return -ENOMEM;
}
void __net_exit fib_proc_exit(struct net *net)
{
- proc_net_remove(net, "fib_trie");
- proc_net_remove(net, "fib_triestat");
- proc_net_remove(net, "route");
+ remove_proc_entry("fib_trie", net->proc_net);
+ remove_proc_entry("fib_triestat", net->proc_net);
+ remove_proc_entry("route", net->proc_net);
}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index 42a4910..7856d16 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -19,6 +19,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
+#include <linux/if_tunnel.h>
#include <linux/spinlock.h>
#include <net/protocol.h>
#include <net/gre.h>
@@ -112,12 +113,113 @@ static void gre_err(struct sk_buff *skb, u32 info)
rcu_read_unlock();
}
+static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ netdev_features_t enc_features;
+ int ghl = GRE_HEADER_SECTION;
+ struct gre_base_hdr *greh;
+ int mac_len = skb->mac_len;
+ __be16 protocol = skb->protocol;
+ int tnl_hlen;
+ bool csum;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE)))
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
+ goto out;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+
+ if (greh->flags & GRE_KEY)
+ ghl += GRE_HEADER_SECTION;
+ if (greh->flags & GRE_SEQ)
+ ghl += GRE_HEADER_SECTION;
+ if (greh->flags & GRE_CSUM) {
+ ghl += GRE_HEADER_SECTION;
+ csum = true;
+ } else
+ csum = false;
+
+ /* setup inner skb. */
+ skb->protocol = greh->protocol;
+ skb->encapsulation = 0;
+
+ if (unlikely(!pskb_may_pull(skb, ghl)))
+ goto out;
+ __skb_pull(skb, ghl);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ segs = skb_mac_gso_segment(skb, enc_features);
+ if (!segs || IS_ERR(segs))
+ goto out;
+
+ skb = segs;
+ tnl_hlen = skb_tnl_header_len(skb);
+ do {
+ __skb_push(skb, ghl);
+ if (csum) {
+ __be32 *pcsum;
+
+ if (skb_has_shared_frag(skb)) {
+ int err;
+
+ err = __skb_linearize(skb);
+ if (err) {
+ kfree_skb_list(segs);
+ segs = ERR_PTR(err);
+ goto out;
+ }
+ }
+
+ greh = (struct gre_base_hdr *)(skb->data);
+ pcsum = (__be32 *)(greh + 1);
+ *pcsum = 0;
+ *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0));
+ }
+ __skb_push(skb, tnl_hlen - ghl);
+
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb->mac_len = mac_len;
+ skb->protocol = protocol;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+static int gre_gso_send_check(struct sk_buff *skb)
+{
+ if (!skb->encapsulation)
+ return -EINVAL;
+ return 0;
+}
+
static const struct net_protocol net_gre_protocol = {
.handler = gre_rcv,
.err_handler = gre_err,
.netns_ok = 1,
};
+static const struct net_offload gre_offload = {
+ .callbacks = {
+ .gso_send_check = gre_gso_send_check,
+ .gso_segment = gre_gso_segment,
+ },
+};
+
static int __init gre_init(void)
{
pr_info("GRE over IPv4 demultiplexor driver\n");
@@ -127,11 +229,18 @@ static int __init gre_init(void)
return -EAGAIN;
}
+ if (inet_add_offload(&gre_offload, IPPROTO_GRE)) {
+ pr_err("can't add protocol offload\n");
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+ return -EAGAIN;
+ }
+
return 0;
}
static void __exit gre_exit(void)
{
+ inet_del_offload(&gre_offload, IPPROTO_GRE);
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 17ff9fd..76e10b4 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -881,7 +881,7 @@ int icmp_rcv(struct sk_buff *skb)
case CHECKSUM_NONE:
skb->csum = 0;
if (__skb_checksum_complete(skb))
- goto error;
+ goto csum_error;
}
if (!pskb_pull(skb, sizeof(*icmph)))
@@ -929,11 +929,36 @@ int icmp_rcv(struct sk_buff *skb)
drop:
kfree_skb(skb);
return 0;
+csum_error:
+ ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS);
error:
ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
goto drop;
}
+void icmp_err(struct sk_buff *skb, u32 info)
+{
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2));
+ int type = icmp_hdr(skb)->type;
+ int code = icmp_hdr(skb)->code;
+ struct net *net = dev_net(skb->dev);
+
+ /*
+ * Use ping_err to handle all icmp errors except those
+ * triggered by ICMP_ECHOREPLY which sent from kernel.
+ */
+ if (icmph->type != ICMP_ECHOREPLY) {
+ ping_err(skb, info);
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
+ else if (type == ICMP_REDIRECT)
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
+}
+
/*
* This table is the definition of how we handle ICMP.
*/
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 736ab70..d8c2327 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2646,24 +2646,25 @@ static int __net_init igmp_net_init(struct net *net)
{
struct proc_dir_entry *pde;
- pde = proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops);
+ pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
if (!pde)
goto out_igmp;
- pde = proc_net_fops_create(net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
+ pde = proc_create("mcfilter", S_IRUGO, net->proc_net,
+ &igmp_mcf_seq_fops);
if (!pde)
goto out_mcfilter;
return 0;
out_mcfilter:
- proc_net_remove(net, "igmp");
+ remove_proc_entry("igmp", net->proc_net);
out_igmp:
return -ENOMEM;
}
static void __net_exit igmp_net_exit(struct net *net)
{
- proc_net_remove(net, "mcfilter");
- proc_net_remove(net, "igmp");
+ remove_proc_entry("mcfilter", net->proc_net);
+ remove_proc_entry("igmp", net->proc_net);
}
static struct pernet_operations igmp_net_ops = {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d0670f0..6acb541 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -57,8 +57,9 @@ int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb, bool relax)
{
struct sock *sk2;
- struct hlist_node *node;
int reuse = sk->sk_reuse;
+ int reuseport = sk->sk_reuseport;
+ kuid_t uid = sock_i_uid((struct sock *)sk);
/*
* Unlike other sk lookup places we do not check
@@ -67,14 +68,17 @@ int inet_csk_bind_conflict(const struct sock *sk,
* one this bucket belongs to.
*/
- sk_for_each_bound(sk2, node, &tb->owners) {
+ sk_for_each_bound(sk2, &tb->owners) {
if (sk != sk2 &&
!inet_v6_ipv6only(sk2) &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if (!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) {
+ if ((!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) &&
+ (!reuseport || !sk2->sk_reuseport ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid, sock_i_uid(sk2))))) {
const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
sk2_rcv_saddr == sk_rcv_saddr(sk))
@@ -90,7 +94,7 @@ int inet_csk_bind_conflict(const struct sock *sk,
}
}
}
- return node != NULL;
+ return sk2 != NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
@@ -101,11 +105,11 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_bind_hashbucket *head;
- struct hlist_node *node;
struct inet_bind_bucket *tb;
int ret, attempts = 5;
struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover;
+ kuid_t uid = sock_i_uid(sk);
local_bh_disable();
if (!snum) {
@@ -123,11 +127,14 @@ again:
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
+ inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == rover) {
- if (tb->fastreuse > 0 &&
- sk->sk_reuse &&
- sk->sk_state != TCP_LISTEN &&
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse &&
+ sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport &&
+ uid_eq(tb->fastuid, uid))) &&
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners;
smallest_rover = rover;
@@ -174,7 +181,7 @@ have_snum:
head = &hashinfo->bhash[inet_bhashfn(net, snum,
hashinfo->bhash_size)];
spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
+ inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == snum)
goto tb_found;
}
@@ -185,14 +192,18 @@ tb_found:
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
- if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size == -1) {
goto success;
} else {
ret = 1;
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+ if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock);
goto again;
@@ -212,9 +223,19 @@ tb_not_found:
tb->fastreuse = 1;
else
tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
+ if (sk->sk_reuseport) {
+ tb->fastreuseport = 1;
+ tb->fastuid = uid;
+ } else
+ tb->fastreuseport = 0;
+ } else {
+ if (tb->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+ tb->fastreuse = 0;
+ if (tb->fastreuseport &&
+ (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
+ tb->fastreuseport = 0;
+ }
success:
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, snum);
@@ -538,7 +559,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
{
- int err = req->rsk_ops->rtx_syn_ack(parent, req, NULL);
+ int err = req->rsk_ops->rtx_syn_ack(parent, req);
if (!err)
req->num_retrans++;
@@ -714,6 +735,7 @@ EXPORT_SYMBOL(inet_csk_destroy_sock);
* tcp/dccp_create_openreq_child().
*/
void inet_csk_prepare_forced_close(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
{
/* sk_clone_lock locked the socket and set refcnt to 2 */
bh_unlock_sock(sk);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7afa2c3..5f64875 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -158,7 +158,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
@@ -322,7 +324,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
}
err = sk_diag_fill(sk, rep, req,
- sk_user_ns(NETLINK_CB(in_skb).ssk),
+ sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, 0, nlh);
if (err < 0) {
@@ -628,7 +630,7 @@ static int inet_csk_diag_dump(struct sock *sk,
return 0;
return inet_csk_diag_fill(sk, skb, r,
- sk_user_ns(NETLINK_CB(cb->skb).ssk),
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}
@@ -803,7 +805,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
}
err = inet_diag_fill_req(skb, sk, req,
- sk_user_ns(NETLINK_CB(cb->skb).ssk),
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, cb->nlh);
if (err < 0) {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 4750d2b..7e06641 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -21,7 +21,30 @@
#include <linux/rtnetlink.h>
#include <linux/slab.h>
+#include <net/sock.h>
#include <net/inet_frag.h>
+#include <net/inet_ecn.h>
+
+/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
+ * Value : 0xff if frame should be dropped.
+ * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
+ */
+const u8 ip_frag_ecn_table[16] = {
+ /* at least one fragment had CE, and others ECT_0 or ECT_1 */
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+
+ /* invalid combinations : drop frame */
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+};
+EXPORT_SYMBOL(ip_frag_ecn_table);
static void inet_frag_secret_rebuild(unsigned long dummy)
{
@@ -29,20 +52,27 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
unsigned long now = jiffies;
int i;
+ /* Per bucket lock NOT needed here, due to write lock protection */
write_lock(&f->lock);
+
get_random_bytes(&f->rnd, sizeof(u32));
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
- struct hlist_node *p, *n;
+ struct hlist_node *n;
- hlist_for_each_entry_safe(q, p, n, &f->hash[i], list) {
+ hb = &f->hash[i];
+ hlist_for_each_entry_safe(q, n, &hb->chain, list) {
unsigned int hval = f->hashfn(q);
if (hval != i) {
+ struct inet_frag_bucket *hb_dest;
+
hlist_del(&q->list);
/* Relink to new hash chain. */
- hlist_add_head(&q->list, &f->hash[hval]);
+ hb_dest = &f->hash[hval];
+ hlist_add_head(&q->list, &hb_dest->chain);
}
}
}
@@ -55,9 +85,12 @@ void inet_frags_init(struct inet_frags *f)
{
int i;
- for (i = 0; i < INETFRAGS_HASHSZ; i++)
- INIT_HLIST_HEAD(&f->hash[i]);
+ for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb = &f->hash[i];
+ spin_lock_init(&hb->chain_lock);
+ INIT_HLIST_HEAD(&hb->chain);
+ }
rwlock_init(&f->lock);
f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
@@ -73,8 +106,9 @@ EXPORT_SYMBOL(inet_frags_init);
void inet_frags_init_net(struct netns_frags *nf)
{
nf->nqueues = 0;
- atomic_set(&nf->mem, 0);
+ init_frag_mem_limit(nf);
INIT_LIST_HEAD(&nf->lru_list);
+ spin_lock_init(&nf->lru_lock);
}
EXPORT_SYMBOL(inet_frags_init_net);
@@ -91,16 +125,26 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
local_bh_disable();
inet_frag_evictor(nf, f, true);
local_bh_enable();
+
+ percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);
static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
- write_lock(&f->lock);
+ struct inet_frag_bucket *hb;
+ unsigned int hash;
+
+ read_lock(&f->lock);
+ hash = f->hashfn(fq);
+ hb = &f->hash[hash];
+
+ spin_lock(&hb->chain_lock);
hlist_del(&fq->list);
- list_del(&fq->lru_list);
- fq->net->nqueues--;
- write_unlock(&f->lock);
+ spin_unlock(&hb->chain_lock);
+
+ read_unlock(&f->lock);
+ inet_frag_lru_del(fq);
}
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
@@ -117,12 +161,8 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
EXPORT_SYMBOL(inet_frag_kill);
static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
- struct sk_buff *skb, int *work)
+ struct sk_buff *skb)
{
- if (work)
- *work -= skb->truesize;
-
- atomic_sub(skb->truesize, &nf->mem);
if (f->skb_free)
f->skb_free(skb);
kfree_skb(skb);
@@ -133,6 +173,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
{
struct sk_buff *fp;
struct netns_frags *nf;
+ unsigned int sum, sum_truesize = 0;
WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
WARN_ON(del_timer(&q->timer) != 0);
@@ -143,13 +184,14 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
while (fp) {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(nf, f, fp, work);
+ sum_truesize += fp->truesize;
+ frag_kfree_skb(nf, f, fp);
fp = xp;
}
-
+ sum = sum_truesize + f->qsize;
if (work)
- *work -= f->qsize;
- atomic_sub(f->qsize, &nf->mem);
+ *work -= sum;
+ sub_frag_mem_limit(q, sum);
if (f->destructor)
f->destructor(q);
@@ -164,22 +206,26 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
int work, evicted = 0;
if (!force) {
- if (atomic_read(&nf->mem) <= nf->high_thresh)
+ if (frag_mem_limit(nf) <= nf->high_thresh)
return 0;
}
- work = atomic_read(&nf->mem) - nf->low_thresh;
+ work = frag_mem_limit(nf) - nf->low_thresh;
while (work > 0) {
- read_lock(&f->lock);
+ spin_lock(&nf->lru_lock);
+
if (list_empty(&nf->lru_list)) {
- read_unlock(&f->lock);
+ spin_unlock(&nf->lru_lock);
break;
}
q = list_first_entry(&nf->lru_list,
struct inet_frag_queue, lru_list);
atomic_inc(&q->refcnt);
- read_unlock(&f->lock);
+ /* Remove q from list to avoid several CPUs grabbing it */
+ list_del_init(&q->lru_list);
+
+ spin_unlock(&nf->lru_lock);
spin_lock(&q->lock);
if (!(q->last_in & INET_FRAG_COMPLETE))
@@ -199,28 +245,32 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
struct inet_frag_queue *qp_in, struct inet_frags *f,
void *arg)
{
+ struct inet_frag_bucket *hb;
struct inet_frag_queue *qp;
#ifdef CONFIG_SMP
- struct hlist_node *n;
#endif
unsigned int hash;
- write_lock(&f->lock);
+ read_lock(&f->lock); /* Protects against hash rebuild */
/*
* While we stayed w/o the lock other CPU could update
* the rnd seed, so we need to re-calculate the hash
* chain. Fortunatelly the qp_in can be used to get one.
*/
hash = f->hashfn(qp_in);
+ hb = &f->hash[hash];
+ spin_lock(&hb->chain_lock);
+
#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
* such entry could be created on other cpu, while we
- * promoted read lock to write lock.
+ * released the hash bucket lock.
*/
- hlist_for_each_entry(qp, n, &f->hash[hash], list) {
+ hlist_for_each_entry(qp, &hb->chain, list) {
if (qp->net == nf && f->match(qp, arg)) {
atomic_inc(&qp->refcnt);
- write_unlock(&f->lock);
+ spin_unlock(&hb->chain_lock);
+ read_unlock(&f->lock);
qp_in->last_in |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in, f);
return qp;
@@ -232,10 +282,10 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
atomic_inc(&qp->refcnt);
atomic_inc(&qp->refcnt);
- hlist_add_head(&qp->list, &f->hash[hash]);
- list_add_tail(&qp->lru_list, &nf->lru_list);
- nf->nqueues++;
- write_unlock(&f->lock);
+ hlist_add_head(&qp->list, &hb->chain);
+ spin_unlock(&hb->chain_lock);
+ read_unlock(&f->lock);
+ inet_frag_lru_add(nf, qp);
return qp;
}
@@ -250,10 +300,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
q->net = nf;
f->constructor(q, arg);
- atomic_add(f->qsize, &nf->mem);
+ add_frag_mem_limit(q, f->qsize);
+
setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock);
atomic_set(&q->refcnt, 1);
+ INIT_LIST_HEAD(&q->lru_list);
return q;
}
@@ -274,18 +326,40 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key, unsigned int hash)
__releases(&f->lock)
{
+ struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
- struct hlist_node *n;
+ int depth = 0;
- hlist_for_each_entry(q, n, &f->hash[hash], list) {
+ hb = &f->hash[hash];
+
+ spin_lock(&hb->chain_lock);
+ hlist_for_each_entry(q, &hb->chain, list) {
if (q->net == nf && f->match(q, key)) {
atomic_inc(&q->refcnt);
+ spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
return q;
}
+ depth++;
}
+ spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
- return inet_frag_create(nf, f, key);
+ if (depth <= INETFRAGS_MAXDEPTH)
+ return inet_frag_create(nf, f, key);
+ else
+ return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);
+
+void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+ const char *prefix)
+{
+ static const char msg[] = "inet_frag_find: Fragment hash bucket"
+ " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
+ ". Dropping fragment.\n";
+
+ if (PTR_ERR(q) == -ENOBUFS)
+ LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
+}
+EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fa3ae81..6af375a 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -39,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
write_pnet(&tb->ib_net, hold_net(net));
tb->port = snum;
tb->fastreuse = 0;
+ tb->fastreuseport = 0;
tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
@@ -119,13 +120,12 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
* that the listener socket's icsk_bind_hash is the same
* as that of the child socket. We have to look up or
* create a new bind bucket for the child here. */
- struct hlist_node *node;
- inet_bind_bucket_for_each(tb, node, &head->chain) {
+ inet_bind_bucket_for_each(tb, &head->chain) {
if (net_eq(ib_net(tb), sock_net(sk)) &&
tb->port == port)
break;
}
- if (!node) {
+ if (!tb) {
tb = inet_bind_bucket_create(table->bind_bucket_cachep,
sock_net(sk), head, port);
if (!tb) {
@@ -151,16 +151,16 @@ static inline int compute_score(struct sock *sk, struct net *net,
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) {
__be32 rcv_saddr = inet->inet_rcv_saddr;
- score = sk->sk_family == PF_INET ? 1 : 0;
+ score = sk->sk_family == PF_INET ? 2 : 1;
if (rcv_saddr) {
if (rcv_saddr != daddr)
return -1;
- score += 2;
+ score += 4;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
return -1;
- score += 2;
+ score += 4;
}
}
return score;
@@ -176,6 +176,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif)
{
@@ -183,17 +184,29 @@ struct sock *__inet_lookup_listener(struct net *net,
struct hlist_nulls_node *node;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore;
+ int score, hiscore, matches = 0, reuseport = 0;
+ u32 phash = 0;
rcu_read_lock();
begin:
result = NULL;
- hiscore = -1;
+ hiscore = 0;
sk_nulls_for_each_rcu(sk, node, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
result = sk;
hiscore = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ phash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == hiscore && reuseport) {
+ matches++;
+ if (((u64)phash * matches) >> 32 == 0)
+ result = sk;
+ phash = next_pseudo_random32(phash);
}
}
/*
@@ -479,7 +492,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
int i, remaining, low, high, port;
static u32 hint;
u32 offset = hint + port_offset;
- struct hlist_node *node;
struct inet_timewait_sock *tw = NULL;
inet_get_local_port_range(&low, &high);
@@ -498,10 +510,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
* because the established check is already
* unique enough.
*/
- inet_bind_bucket_for_each(tb, node, &head->chain) {
+ inet_bind_bucket_for_each(tb, &head->chain) {
if (net_eq(ib_net(tb), net) &&
tb->port == port) {
- if (tb->fastreuse >= 0)
+ if (tb->fastreuse >= 0 ||
+ tb->fastreuseport >= 0)
goto next_port;
WARN_ON(hlist_empty(&tb->owners));
if (!check_established(death_row, sk,
@@ -518,6 +531,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
break;
}
tb->fastreuse = -1;
+ tb->fastreuseport = -1;
goto ok;
next_port:
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index cc280a3..1975f52 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/if_vlan.h>
#include <linux/inet_lro.h>
+#include <net/checksum.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
@@ -114,11 +115,9 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
*(p+2) = lro_desc->tcp_rcv_tsecr;
}
+ csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
iph->tot_len = htons(lro_desc->ip_tot_len);
- iph->check = 0;
- iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
-
tcph->check = 0;
tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2784db3..1f27c9f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -216,7 +216,6 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
const int slot)
{
struct inet_timewait_sock *tw;
- struct hlist_node *node;
unsigned int killed;
int ret;
@@ -229,7 +228,7 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
killed = 0;
ret = 0;
rescan:
- inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
+ inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) {
__inet_twsk_del_dead_node(tw);
spin_unlock(&twdr->death_lock);
__inet_twsk_kill(tw, twdr->hashinfo);
@@ -438,10 +437,10 @@ void inet_twdr_twcal_tick(unsigned long data)
for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
if (time_before_eq(j, now)) {
- struct hlist_node *node, *safe;
+ struct hlist_node *safe;
struct inet_timewait_sock *tw;
- inet_twsk_for_each_inmate_safe(tw, node, safe,
+ inet_twsk_for_each_inmate_safe(tw, safe,
&twdr->twcal_row[slot]) {
__inet_twsk_del_dead_node(tw);
__inet_twsk_kill(tw, twdr->hashinfo);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index eb9d63a..b66910a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -79,40 +79,11 @@ struct ipq {
struct inet_peer *peer;
};
-/* RFC 3168 support :
- * We want to check ECN values of all fragments, do detect invalid combinations.
- * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
- */
-#define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */
-#define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */
-#define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */
-#define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */
-
static inline u8 ip4_frag_ecn(u8 tos)
{
return 1 << (tos & INET_ECN_MASK);
}
-/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
- * Value : 0xff if frame should be dropped.
- * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
- */
-static const u8 ip4_frag_ecn_table[16] = {
- /* at least one fragment had CE, and others ECT_0 or ECT_1 */
- [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
- [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
- [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
-
- /* invalid combinations : drop frame */
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
-};
-
static struct inet_frags ip4_frags;
int ip_frag_nqueues(struct net *net)
@@ -122,7 +93,7 @@ int ip_frag_nqueues(struct net *net)
int ip_frag_mem(struct net *net)
{
- return atomic_read(&net->ipv4.frags.mem);
+ return sum_frag_mem_limit(&net->ipv4.frags);
}
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
@@ -161,13 +132,6 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
qp->user == arg->user;
}
-/* Memory Tracking Functions. */
-static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
-{
- atomic_sub(skb->truesize, &nf->mem);
- kfree_skb(skb);
-}
-
static void ip4_frag_init(struct inet_frag_queue *q, void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
@@ -255,8 +219,7 @@ static void ip_expire(unsigned long arg)
if (!head->dev)
goto out_rcu_unlock;
- /* skb dst is stale, drop it, and perform route lookup again */
- skb_dst_drop(head);
+ /* skb has no dst, perform route lookup again */
iph = ip_hdr(head);
err = ip_route_input_noref(head, iph->daddr, iph->saddr,
iph->tos, head->dev);
@@ -299,14 +262,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
- if (q == NULL)
- goto out_nomem;
-
+ if (IS_ERR_OR_NULL(q)) {
+ inet_frag_maybe_warn_overflow(q, pr_fmt());
+ return NULL;
+ }
return container_of(q, struct ipq, q);
-
-out_nomem:
- LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n"));
- return NULL;
}
/* Is the fragment too far ahead to be part of ipq? */
@@ -340,6 +300,7 @@ static inline int ip_frag_too_far(struct ipq *qp)
static int ip_frag_reinit(struct ipq *qp)
{
struct sk_buff *fp;
+ unsigned int sum_truesize = 0;
if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
atomic_inc(&qp->q.refcnt);
@@ -349,9 +310,12 @@ static int ip_frag_reinit(struct ipq *qp)
fp = qp->q.fragments;
do {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(qp->q.net, fp);
+
+ sum_truesize += fp->truesize;
+ kfree_skb(fp);
fp = xp;
} while (fp);
+ sub_frag_mem_limit(&qp->q, sum_truesize);
qp->q.last_in = 0;
qp->q.len = 0;
@@ -496,7 +460,8 @@ found:
qp->q.fragments = next;
qp->q.meat -= free_it->len;
- frag_kfree_skb(qp->q.net, free_it);
+ sub_frag_mem_limit(&qp->q, free_it->truesize);
+ kfree_skb(free_it);
}
}
@@ -519,7 +484,7 @@ found:
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
- atomic_add(skb->truesize, &qp->q.net->mem);
+ add_frag_mem_limit(&qp->q, skb->truesize);
if (offset == 0)
qp->q.last_in |= INET_FRAG_FIRST_IN;
@@ -528,12 +493,17 @@ found:
qp->q.max_size = skb->len + ihl;
if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
- qp->q.meat == qp->q.len)
- return ip_frag_reasm(qp, prev, dev);
+ qp->q.meat == qp->q.len) {
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+ err = ip_frag_reasm(qp, prev, dev);
+ skb->_skb_refdst = orefdst;
+ return err;
+ }
- write_lock(&ip4_frags.lock);
- list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
- write_unlock(&ip4_frags.lock);
+ skb_dst_drop(skb);
+ inet_frag_lru_move(&qp->q);
return -EINPROGRESS;
err:
@@ -558,7 +528,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
ipq_kill(qp);
- ecn = ip4_frag_ecn_table[qp->ecn];
+ ecn = ip_frag_ecn_table[qp->ecn];
if (unlikely(ecn == 0xff)) {
err = -EINVAL;
goto out_fail;
@@ -594,7 +564,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_oversize;
/* Head of list must not be cloned. */
- if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(head, GFP_ATOMIC))
goto out_nomem;
/* If the first fragment is fragmented itself, we split
@@ -617,7 +587,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
head->len -= clone->len;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
- atomic_add(clone->truesize, &qp->q.net->mem);
+ add_frag_mem_limit(&qp->q, clone->truesize);
}
skb_push(head, head->data - skb_network_header(head));
@@ -645,7 +615,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
}
fp = next;
}
- atomic_sub(sum_truesize, &qp->q.net->mem);
+ sub_frag_mem_limit(&qp->q, sum_truesize);
head->next = NULL;
head->dev = dev;
@@ -851,14 +821,22 @@ static inline void ip4_frags_ctl_register(void)
static int __net_init ipv4_frags_init_net(struct net *net)
{
- /*
- * Fragment cache limits. We will commit 256K at one time. Should we
- * cross that limit we will prune down to 192K. This should cope with
- * even the most extreme cases without allowing an attacker to
- * measurably harm machine performance.
+ /* Fragment cache limits.
+ *
+ * The fragment memory accounting code, (tries to) account for
+ * the real memory usage, by measuring both the size of frag
+ * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
+ * and the SKB's truesize.
+ *
+ * A 64K fragment consumes 129736 bytes (44*2944)+200
+ * (1500 truesize == 2944, sizeof(struct ipq) == 200)
+ *
+ * We will commit 4MB at one time. Should we cross that limit
+ * we will prune down to 3MB, making room for approx 8 big 64K
+ * fragments 8x128k.
*/
- net->ipv4.frags.high_thresh = 256 * 1024;
- net->ipv4.frags.low_thresh = 192 * 1024;
+ net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
+ net->ipv4.frags.low_thresh = 3 * 1024 * 1024;
/*
* Important NOTE! Fragment queue must be destroyed before MSL expires.
* RFC791 is wrong proposing to prolongate timer each fragment arrival
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index e81b1ca..2a83591 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -37,7 +37,7 @@
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
@@ -108,15 +108,6 @@
fatal route to network, even if it were you who configured
fatal static route: you are innocent. :-)
-
-
- 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
- practically identical code. It would be good to glue them
- together, but it is not very evident, how to make them modular.
- sit is integral part of IPv6, ipip and gre are naturally modular.
- We could extract common parts (hash table, ioctl etc)
- to a separate module (ip_tunnel.c).
-
Alexey Kuznetsov.
*/
@@ -126,400 +117,137 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
-static void ipgre_tunnel_setup(struct net_device *dev);
-static int ipgre_tunnel_bind_dev(struct net_device *dev);
-
-/* Fallback tunnel: no source, no destination, no key, no options */
-
-#define HASH_SIZE 16
static int ipgre_net_id __read_mostly;
-struct ipgre_net {
- struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
-
- struct net_device *fb_tunnel_dev;
-};
-
-/* Tunnel hash table */
-
-/*
- 4 hash tables:
-
- 3: (remote,local)
- 2: (remote,*)
- 1: (*,local)
- 0: (*,*)
+static int gre_tap_net_id __read_mostly;
- We require exact key match i.e. if a key is present in packet
- it will match only tunnel with the same key; if it is not present,
- it will match only keyless tunnel.
-
- All keysless packets, if not matched configured keyless tunnels
- will match fallback tunnel.
- */
+static __sum16 check_checksum(struct sk_buff *skb)
+{
+ __sum16 csum = 0;
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ csum = csum_fold(skb->csum);
-#define tunnels_r_l tunnels[3]
-#define tunnels_r tunnels[2]
-#define tunnels_l tunnels[1]
-#define tunnels_wc tunnels[0]
+ if (!csum)
+ break;
+ /* Fall through. */
-static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
-{
- int i;
-
- for_each_possible_cpu(i) {
- const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
- u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
- unsigned int start;
-
- do {
- start = u64_stats_fetch_begin_bh(&tstats->syncp);
- rx_packets = tstats->rx_packets;
- tx_packets = tstats->tx_packets;
- rx_bytes = tstats->rx_bytes;
- tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
-
- tot->rx_packets += rx_packets;
- tot->tx_packets += tx_packets;
- tot->rx_bytes += rx_bytes;
- tot->tx_bytes += tx_bytes;
+ case CHECKSUM_NONE:
+ skb->csum = 0;
+ csum = __skb_checksum_complete(skb);
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ break;
}
- tot->multicast = dev->stats.multicast;
- tot->rx_crc_errors = dev->stats.rx_crc_errors;
- tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
- tot->rx_length_errors = dev->stats.rx_length_errors;
- tot->rx_frame_errors = dev->stats.rx_frame_errors;
- tot->rx_errors = dev->stats.rx_errors;
-
- tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
- tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
- tot->tx_dropped = dev->stats.tx_dropped;
- tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
- tot->tx_errors = dev->stats.tx_errors;
-
- return tot;
+ return csum;
}
-/* Does key in tunnel parameters match packet */
-static bool ipgre_key_match(const struct ip_tunnel_parm *p,
- __be16 flags, __be32 key)
+static int ip_gre_calc_hlen(__be16 o_flags)
{
- if (p->i_flags & GRE_KEY) {
- if (flags & GRE_KEY)
- return key == p->i_key;
- else
- return false; /* key expected, none present */
- } else
- return !(flags & GRE_KEY);
+ int addend = 4;
+
+ if (o_flags&TUNNEL_CSUM)
+ addend += 4;
+ if (o_flags&TUNNEL_KEY)
+ addend += 4;
+ if (o_flags&TUNNEL_SEQ)
+ addend += 4;
+ return addend;
}
-/* Given src, dst and key, find appropriate for input tunnel. */
-
-static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
- __be32 remote, __be32 local,
- __be16 flags, __be32 key,
- __be16 gre_proto)
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err, int *hdr_len)
{
- struct net *net = dev_net(dev);
- int link = dev->ifindex;
- unsigned int h0 = HASH(remote);
- unsigned int h1 = HASH(key);
- struct ip_tunnel *t, *cand = NULL;
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
- int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
- ARPHRD_ETHER : ARPHRD_IPGRE;
- int score, cand_score = 4;
-
- for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
- if (local != t->parms.iph.saddr ||
- remote != t->parms.iph.daddr ||
- !(t->dev->flags & IFF_UP))
- continue;
-
- if (!ipgre_key_match(&t->parms, flags, key))
- continue;
-
- if (t->dev->type != ARPHRD_IPGRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
- }
+ unsigned int ip_hlen = ip_hdrlen(skb);
+ const struct gre_base_hdr *greh;
+ __be32 *options;
- for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
- if (remote != t->parms.iph.daddr ||
- !(t->dev->flags & IFF_UP))
- continue;
-
- if (!ipgre_key_match(&t->parms, flags, key))
- continue;
-
- if (t->dev->type != ARPHRD_IPGRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
- }
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+ return -EINVAL;
- for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) {
- if ((local != t->parms.iph.saddr &&
- (local != t->parms.iph.daddr ||
- !ipv4_is_multicast(local))) ||
- !(t->dev->flags & IFF_UP))
- continue;
-
- if (!ipgre_key_match(&t->parms, flags, key))
- continue;
-
- if (t->dev->type != ARPHRD_IPGRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
- }
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
- for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) {
- if (t->parms.i_key != key ||
- !(t->dev->flags & IFF_UP))
- continue;
-
- if (t->dev->type != ARPHRD_IPGRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
- }
+ tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ *hdr_len = ip_gre_calc_hlen(tpi->flags);
- if (cand != NULL)
- return cand;
+ if (!pskb_may_pull(skb, *hdr_len))
+ return -EINVAL;
- dev = ign->fb_tunnel_dev;
- if (dev->flags & IFF_UP)
- return netdev_priv(dev);
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
- return NULL;
-}
+ tpi->proto = greh->protocol;
-static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
- struct ip_tunnel_parm *parms)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- __be32 key = parms->i_key;
- unsigned int h = HASH(key);
- int prio = 0;
-
- if (local)
- prio |= 1;
- if (remote && !ipv4_is_multicast(remote)) {
- prio |= 2;
- h ^= HASH(remote);
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (check_checksum(skb)) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+ options++;
}
- return &ign->tunnels[prio][h];
-}
-
-static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
- struct ip_tunnel *t)
-{
- return __ipgre_bucket(ign, &t->parms);
-}
-
-static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else
+ tpi->key = 0;
- rcu_assign_pointer(t->next, rtnl_dereference(*tp));
- rcu_assign_pointer(*tp, t);
-}
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else
+ tpi->seq = 0;
-static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp;
- struct ip_tunnel *iter;
-
- for (tp = ipgre_bucket(ign, t);
- (iter = rtnl_dereference(*tp)) != NULL;
- tp = &iter->next) {
- if (t == iter) {
- rcu_assign_pointer(*tp, t->next);
- break;
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IP
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ tpi->proto = htons(ETH_P_IP);
+ if ((*(u8 *)options & 0xF0) != 0x40) {
+ *hdr_len += 4;
+ if (!pskb_may_pull(skb, *hdr_len))
+ return -EINVAL;
}
}
-}
-
-static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
- struct ip_tunnel_parm *parms,
- int type)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- __be32 key = parms->i_key;
- int link = parms->link;
- struct ip_tunnel *t;
- struct ip_tunnel __rcu **tp;
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
- for (tp = __ipgre_bucket(ign, parms);
- (t = rtnl_dereference(*tp)) != NULL;
- tp = &t->next)
- if (local == t->parms.iph.saddr &&
- remote == t->parms.iph.daddr &&
- key == t->parms.i_key &&
- link == t->parms.link &&
- type == t->dev->type)
- break;
-
- return t;
-}
-
-static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
- struct ip_tunnel_parm *parms, int create)
-{
- struct ip_tunnel *t, *nt;
- struct net_device *dev;
- char name[IFNAMSIZ];
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
- t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
- if (t || !create)
- return t;
-
- if (parms->name[0])
- strlcpy(name, parms->name, IFNAMSIZ);
- else
- strcpy(name, "gre%d");
-
- dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
- if (!dev)
- return NULL;
-
- dev_net_set(dev, net);
-
- nt = netdev_priv(dev);
- nt->parms = *parms;
- dev->rtnl_link_ops = &ipgre_link_ops;
- dev->mtu = ipgre_tunnel_bind_dev(dev);
-
- if (register_netdevice(dev) < 0)
- goto failed_free;
-
- /* Can use a lockless transmit, unless we generate output sequences */
- if (!(nt->parms.o_flags & GRE_SEQ))
- dev->features |= NETIF_F_LLTX;
-
- dev_hold(dev);
- ipgre_tunnel_link(ign, nt);
- return nt;
-
-failed_free:
- free_netdev(dev);
- return NULL;
-}
-
-static void ipgre_tunnel_uninit(struct net_device *dev)
-{
- struct net *net = dev_net(dev);
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
- ipgre_tunnel_unlink(ign, netdev_priv(dev));
- dev_put(dev);
+ return 0;
}
-
static void ipgre_err(struct sk_buff *skb, u32 info)
{
-/* All the routers (except for Linux) return only
- 8 bytes of packet payload. It means, that precise relaying of
- ICMP in the real Internet is absolutely infeasible.
-
- Moreover, Cisco "wise men" put GRE key to the third word
- in GRE header. It makes impossible maintaining even soft state for keyed
- GRE tunnels with enabled checksum. Tell them "thank you".
-
- Well, I wonder, rfc1812 was written by Cisco employee,
- what the hell these idiots break standards established
- by themselves???
- */
-
- const struct iphdr *iph = (const struct iphdr *)skb->data;
- __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2));
- int grehlen = (iph->ihl<<2) + 4;
+ /* All the routers (except for Linux) return only
+ 8 bytes of packet payload. It means, that precise relaying of
+ ICMP in the real Internet is absolutely infeasible.
+
+ Moreover, Cisco "wise men" put GRE key to the third word
+ in GRE header. It makes impossible maintaining even soft
+ state for keyed GRE tunnels with enabled checksum. Tell
+ them "thank you".
+
+ Well, I wonder, rfc1812 was written by Cisco employee,
+ what the hell these idiots break standards established
+ by themselves???
+ */
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
+ const struct iphdr *iph;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct ip_tunnel *t;
- __be16 flags;
- __be32 key = 0;
+ struct tnl_ptk_info tpi;
+ int hdr_len;
+ bool csum_err = false;
- flags = p[0];
- if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
- if (flags&(GRE_VERSION|GRE_ROUTING))
+ if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len)) {
+ if (!csum_err) /* ignore csum errors. */
return;
- if (flags&GRE_KEY) {
- grehlen += 4;
- if (flags&GRE_CSUM)
- grehlen += 4;
- }
}
- /* If only 8 bytes returned, keyed message will be dropped here */
- if (skb_headlen(skb) < grehlen)
- return;
-
- if (flags & GRE_KEY)
- key = *(((__be32 *)p) + (grehlen / 4) - 1);
-
switch (type) {
default:
case ICMP_PARAMETERPROB:
@@ -548,8 +276,14 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
break;
}
- t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
- flags, key, p[1]);
+ if (tpi.proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ iph = (const struct iphdr *)skb->data;
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
+ iph->daddr, iph->saddr, tpi.key);
if (t == NULL)
return;
@@ -578,612 +312,214 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
t->err_time = jiffies;
}
-static inline u8
-ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
-{
- u8 inner = 0;
- if (skb->protocol == htons(ETH_P_IP))
- inner = old_iph->tos;
- else if (skb->protocol == htons(ETH_P_IPV6))
- inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
- return INET_ECN_encapsulate(tos, inner);
-}
-
static int ipgre_rcv(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
const struct iphdr *iph;
- u8 *h;
- __be16 flags;
- __sum16 csum = 0;
- __be32 key = 0;
- u32 seqno = 0;
struct ip_tunnel *tunnel;
- int offset = 4;
- __be16 gre_proto;
- int err;
+ struct tnl_ptk_info tpi;
+ int hdr_len;
+ bool csum_err = false;
- if (!pskb_may_pull(skb, 16))
+ if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len) < 0)
goto drop;
- iph = ip_hdr(skb);
- h = skb->data;
- flags = *(__be16 *)h;
-
- if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
- /* - Version must be 0.
- - We do not support routing headers.
- */
- if (flags&(GRE_VERSION|GRE_ROUTING))
- goto drop;
-
- if (flags&GRE_CSUM) {
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- csum = csum_fold(skb->csum);
- if (!csum)
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- csum = __skb_checksum_complete(skb);
- skb->ip_summed = CHECKSUM_COMPLETE;
- }
- offset += 4;
- }
- if (flags&GRE_KEY) {
- key = *(__be32 *)(h + offset);
- offset += 4;
- }
- if (flags&GRE_SEQ) {
- seqno = ntohl(*(__be32 *)(h + offset));
- offset += 4;
- }
- }
+ if (tpi.proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
- gre_proto = *(__be16 *)(h + 2);
+ iph = ip_hdr(skb);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
+ iph->saddr, iph->daddr, tpi.key);
- tunnel = ipgre_tunnel_lookup(skb->dev,
- iph->saddr, iph->daddr, flags, key,
- gre_proto);
if (tunnel) {
- struct pcpu_tstats *tstats;
-
- secpath_reset(skb);
-
- skb->protocol = gre_proto;
- /* WCCP version 1 and 2 protocol decoding.
- * - Change protocol to IP
- * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
- */
- if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
- skb->protocol = htons(ETH_P_IP);
- if ((*(h + offset) & 0xF0) != 0x40)
- offset += 4;
- }
-
- skb->mac_header = skb->network_header;
- __pskb_pull(skb, offset);
- skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
- skb->pkt_type = PACKET_HOST;
-#ifdef CONFIG_NET_IPGRE_BROADCAST
- if (ipv4_is_multicast(iph->daddr)) {
- /* Looped back packet, drop it! */
- if (rt_is_output_route(skb_rtable(skb)))
- goto drop;
- tunnel->dev->stats.multicast++;
- skb->pkt_type = PACKET_BROADCAST;
- }
-#endif
-
- if (((flags&GRE_CSUM) && csum) ||
- (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
- tunnel->dev->stats.rx_crc_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
- if (tunnel->parms.i_flags&GRE_SEQ) {
- if (!(flags&GRE_SEQ) ||
- (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
- tunnel->dev->stats.rx_fifo_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
- tunnel->i_seqno = seqno + 1;
- }
-
- /* Warning: All skb pointers will be invalidated! */
- if (tunnel->dev->type == ARPHRD_ETHER) {
- if (!pskb_may_pull(skb, ETH_HLEN)) {
- tunnel->dev->stats.rx_length_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
-
- iph = ip_hdr(skb);
- skb->protocol = eth_type_trans(skb, tunnel->dev);
- skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
- }
-
- __skb_tunnel_rx(skb, tunnel->dev);
-
- skb_reset_network_header(skb);
- err = IP_ECN_decapsulate(iph, skb);
- if (unlikely(err)) {
- if (log_ecn_error)
- net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
- &iph->saddr, iph->tos);
- if (err > 1) {
- ++tunnel->dev->stats.rx_frame_errors;
- ++tunnel->dev->stats.rx_errors;
- goto drop;
- }
- }
-
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
-
- gro_cells_receive(&tunnel->gro_cells, skb);
+ ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
return 0;
}
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-
drop:
kfree_skb(skb);
return 0;
}
-static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff *skb)
{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- const struct iphdr *old_iph = ip_hdr(skb);
- const struct iphdr *tiph;
- struct flowi4 fl4;
- u8 tos;
- __be16 df;
- struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
- struct iphdr *iph; /* Our new IP header */
- unsigned int max_headroom; /* The extra header space needed */
- int gre_hlen;
- __be32 dst;
- int mtu;
- u8 ttl;
-
- if (skb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(skb))
- goto tx_error;
-
- if (dev->type == ARPHRD_ETHER)
- IPCB(skb)->flags = 0;
-
- if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
- gre_hlen = 0;
- if (skb->protocol == htons(ETH_P_IP))
- tiph = (const struct iphdr *)skb->data;
- else
- tiph = &tunnel->parms.iph;
- } else {
- gre_hlen = tunnel->hlen;
- tiph = &tunnel->parms.iph;
- }
-
- if ((dst = tiph->daddr) == 0) {
- /* NBMA tunnel */
-
- if (skb_dst(skb) == NULL) {
- dev->stats.tx_fifo_errors++;
- goto tx_error;
- }
-
- if (skb->protocol == htons(ETH_P_IP)) {
- rt = skb_rtable(skb);
- dst = rt_nexthop(rt, old_iph->daddr);
- }
-#if IS_ENABLED(CONFIG_IPV6)
- else if (skb->protocol == htons(ETH_P_IPV6)) {
- const struct in6_addr *addr6;
- struct neighbour *neigh;
- bool do_tx_error_icmp;
- int addr_type;
-
- neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
- if (neigh == NULL)
- goto tx_error;
-
- addr6 = (const struct in6_addr *)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
-
- if (addr_type == IPV6_ADDR_ANY) {
- addr6 = &ipv6_hdr(skb)->daddr;
- addr_type = ipv6_addr_type(addr6);
- }
-
- if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
- do_tx_error_icmp = true;
- else {
- do_tx_error_icmp = false;
- dst = addr6->s6_addr32[3];
- }
- neigh_release(neigh);
- if (do_tx_error_icmp)
- goto tx_error_icmp;
- }
-#endif
- else
- goto tx_error;
- }
-
- ttl = tiph->ttl;
- tos = tiph->tos;
- if (tos == 1) {
- tos = 0;
- if (skb->protocol == htons(ETH_P_IP))
- tos = old_iph->tos;
- else if (skb->protocol == htons(ETH_P_IPV6))
- tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
- }
+ int err;
- rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
- tunnel->parms.o_key, RT_TOS(tos),
- tunnel->parms.link);
- if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
- goto tx_error;
- }
- tdev = rt->dst.dev;
+ if (skb_is_gso(skb)) {
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(err))
+ goto error;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
+ return skb;
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ tunnel->parms.o_flags&TUNNEL_CSUM) {
+ err = skb_checksum_help(skb);
+ if (unlikely(err))
+ goto error;
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL)
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return skb;
+
+error:
+ kfree_skb(skb);
+ return ERR_PTR(err);
+}
- if (tdev == dev) {
- ip_rt_put(rt);
- dev->stats.collisions++;
- goto tx_error;
- }
+static struct sk_buff *gre_build_header(struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi,
+ int hdr_len)
+{
+ struct gre_base_hdr *greh;
- df = tiph->frag_off;
- if (df)
- mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
- else
- mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+ skb_push(skb, hdr_len);
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+ greh = (struct gre_base_hdr *)skb->data;
+ greh->flags = tnl_flags_to_gre_flags(tpi->flags);
+ greh->protocol = tpi->proto;
- if (skb->protocol == htons(ETH_P_IP)) {
- df |= (old_iph->frag_off&htons(IP_DF));
+ if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
+ __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
- if ((old_iph->frag_off&htons(IP_DF)) &&
- mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- goto tx_error;
+ if (tpi->flags&TUNNEL_SEQ) {
+ *ptr = tpi->seq;
+ ptr--;
}
- }
-#if IS_ENABLED(CONFIG_IPV6)
- else if (skb->protocol == htons(ETH_P_IPV6)) {
- struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
-
- if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
- if ((tunnel->parms.iph.daddr &&
- !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
- rt6->rt6i_dst.plen == 128) {
- rt6->rt6i_flags |= RTF_MODIFIED;
- dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
- }
+ if (tpi->flags&TUNNEL_KEY) {
+ *ptr = tpi->key;
+ ptr--;
}
-
- if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- ip_rt_put(rt);
- goto tx_error;
+ if (tpi->flags&TUNNEL_CSUM &&
+ !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) {
+ *(__sum16 *)ptr = 0;
+ *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+ skb->len, 0));
}
}
-#endif
- if (tunnel->err_count > 0) {
- if (time_before(jiffies,
- tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
- tunnel->err_count--;
+ return skb;
+}
+
+static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
+ const struct iphdr *tnl_params,
+ __be16 proto)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct tnl_ptk_info tpi;
- dst_link_failure(skb);
- } else
- tunnel->err_count = 0;
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
}
- max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
-
- if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
- (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
- struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
- if (max_headroom > dev->needed_headroom)
- dev->needed_headroom = max_headroom;
- if (!new_skb) {
- ip_rt_put(rt);
- dev->stats.tx_dropped++;
- dev_kfree_skb(skb);
- return NETDEV_TX_OK;
- }
- if (skb->sk)
- skb_set_owner_w(new_skb, skb->sk);
- dev_kfree_skb(skb);
- skb = new_skb;
- old_iph = ip_hdr(skb);
- /* Warning : tiph value might point to freed memory */
+ tpi.flags = tunnel->parms.o_flags;
+ tpi.proto = proto;
+ tpi.key = tunnel->parms.o_key;
+ if (tunnel->parms.o_flags & TUNNEL_SEQ)
+ tunnel->o_seqno++;
+ tpi.seq = htonl(tunnel->o_seqno);
+
+ /* Push GRE header. */
+ skb = gre_build_header(skb, &tpi, tunnel->hlen);
+ if (unlikely(!skb)) {
+ dev->stats.tx_dropped++;
+ return;
}
- skb_push(skb, gre_hlen);
- skb_reset_network_header(skb);
- skb_set_transport_header(skb, sizeof(*iph));
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
- IPSKB_REROUTED);
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
- /*
- * Push down and install the IPIP header.
- */
+ ip_tunnel_xmit(skb, dev, tnl_params);
+}
- iph = ip_hdr(skb);
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr) >> 2;
- iph->frag_off = df;
- iph->protocol = IPPROTO_GRE;
- iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
- iph->daddr = fl4.daddr;
- iph->saddr = fl4.saddr;
- iph->ttl = ttl;
-
- if (ttl == 0) {
- if (skb->protocol == htons(ETH_P_IP))
- iph->ttl = old_iph->ttl;
-#if IS_ENABLED(CONFIG_IPV6)
- else if (skb->protocol == htons(ETH_P_IPV6))
- iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
-#endif
- else
- iph->ttl = ip4_dst_hoplimit(&rt->dst);
- }
+static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *tnl_params;
- ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
- ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
- htons(ETH_P_TEB) : skb->protocol;
+ skb = handle_offloads(tunnel, skb);
+ if (IS_ERR(skb))
+ goto out;
- if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
- __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
+ if (dev->header_ops) {
+ /* Need space for new headers */
+ if (skb_cow_head(skb, dev->needed_headroom -
+ (tunnel->hlen + sizeof(struct iphdr))))
+ goto free_skb;
- if (tunnel->parms.o_flags&GRE_SEQ) {
- ++tunnel->o_seqno;
- *ptr = htonl(tunnel->o_seqno);
- ptr--;
- }
- if (tunnel->parms.o_flags&GRE_KEY) {
- *ptr = tunnel->parms.o_key;
- ptr--;
- }
- if (tunnel->parms.o_flags&GRE_CSUM) {
- int offset = skb_transport_offset(skb);
+ tnl_params = (const struct iphdr *)skb->data;
- *ptr = 0;
- *(__sum16 *)ptr = csum_fold(skb_checksum(skb, offset,
- skb->len - offset,
- 0));
- }
+ /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
+ * to gre header.
+ */
+ skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
+ } else {
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
+
+ tnl_params = &tunnel->parms.iph;
}
- iptunnel_xmit(skb, dev);
+ __gre_xmit(skb, dev, tnl_params, skb->protocol);
+
return NETDEV_TX_OK;
-#if IS_ENABLED(CONFIG_IPV6)
-tx_error_icmp:
- dst_link_failure(skb);
-#endif
-tx_error:
- dev->stats.tx_errors++;
+free_skb:
dev_kfree_skb(skb);
+out:
+ dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
-static int ipgre_tunnel_bind_dev(struct net_device *dev)
+static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
- struct net_device *tdev = NULL;
- struct ip_tunnel *tunnel;
- const struct iphdr *iph;
- int hlen = LL_MAX_HEADER;
- int mtu = ETH_DATA_LEN;
- int addend = sizeof(struct iphdr) + 4;
-
- tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
-
- /* Guess output device to choose reasonable mtu and needed_headroom */
-
- if (iph->daddr) {
- struct flowi4 fl4;
- struct rtable *rt;
-
- rt = ip_route_output_gre(dev_net(dev), &fl4,
- iph->daddr, iph->saddr,
- tunnel->parms.o_key,
- RT_TOS(iph->tos),
- tunnel->parms.link);
- if (!IS_ERR(rt)) {
- tdev = rt->dst.dev;
- ip_rt_put(rt);
- }
-
- if (dev->type != ARPHRD_ETHER)
- dev->flags |= IFF_POINTOPOINT;
- }
+ struct ip_tunnel *tunnel = netdev_priv(dev);
- if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+ skb = handle_offloads(tunnel, skb);
+ if (IS_ERR(skb))
+ goto out;
- if (tdev) {
- hlen = tdev->hard_header_len + tdev->needed_headroom;
- mtu = tdev->mtu;
- }
- dev->iflink = tunnel->parms.link;
-
- /* Precalculate GRE options length */
- if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
- if (tunnel->parms.o_flags&GRE_CSUM)
- addend += 4;
- if (tunnel->parms.o_flags&GRE_KEY)
- addend += 4;
- if (tunnel->parms.o_flags&GRE_SEQ)
- addend += 4;
- }
- dev->needed_headroom = addend + hlen;
- mtu -= dev->hard_header_len + addend;
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
- if (mtu < 68)
- mtu = 68;
+ __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
- tunnel->hlen = addend;
+ return NETDEV_TX_OK;
- return mtu;
+free_skb:
+ dev_kfree_skb(skb);
+out:
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
}
-static int
-ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+static int ipgre_tunnel_ioctl(struct net_device *dev,
+ struct ifreq *ifr, int cmd)
{
int err = 0;
struct ip_tunnel_parm p;
- struct ip_tunnel *t;
- struct net *net = dev_net(dev);
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
- switch (cmd) {
- case SIOCGETTUNNEL:
- t = NULL;
- if (dev == ign->fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
- err = -EFAULT;
- break;
- }
- t = ipgre_tunnel_locate(net, &p, 0);
- }
- if (t == NULL)
- t = netdev_priv(dev);
- memcpy(&p, &t->parms, sizeof(p));
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- err = -EFAULT;
- break;
-
- case SIOCADDTUNNEL:
- case SIOCCHGTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
-
- err = -EINVAL;
- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
- ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
- goto done;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
- if (!(p.i_flags&GRE_KEY))
- p.i_key = 0;
- if (!(p.o_flags&GRE_KEY))
- p.o_key = 0;
-
- t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
-
- if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t != NULL) {
- if (t->dev != dev) {
- err = -EEXIST;
- break;
- }
- } else {
- unsigned int nflags = 0;
-
- t = netdev_priv(dev);
-
- if (ipv4_is_multicast(p.iph.daddr))
- nflags = IFF_BROADCAST;
- else if (p.iph.daddr)
- nflags = IFF_POINTOPOINT;
-
- if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
- err = -EINVAL;
- break;
- }
- ipgre_tunnel_unlink(ign, t);
- synchronize_net();
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- t->parms.i_key = p.i_key;
- t->parms.o_key = p.o_key;
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- ipgre_tunnel_link(ign, t);
- netdev_state_change(dev);
- }
- }
-
- if (t) {
- err = 0;
- if (cmd == SIOCCHGTUNNEL) {
- t->parms.iph.ttl = p.iph.ttl;
- t->parms.iph.tos = p.iph.tos;
- t->parms.iph.frag_off = p.iph.frag_off;
- if (t->parms.link != p.link) {
- t->parms.link = p.link;
- dev->mtu = ipgre_tunnel_bind_dev(dev);
- netdev_state_change(dev);
- }
- }
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
- err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
- break;
- case SIOCDELTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- if (dev == ign->fb_tunnel_dev) {
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
- err = -ENOENT;
- if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
- goto done;
- err = -EPERM;
- if (t == netdev_priv(ign->fb_tunnel_dev))
- goto done;
- dev = t->dev;
- }
- unregister_netdevice(dev);
- err = 0;
- break;
-
- default:
- err = -EINVAL;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
+ ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) {
+ return -EINVAL;
}
+ p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
+ p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
-done:
- return err;
-}
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
-static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- if (new_mtu < 68 ||
- new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
- return -EINVAL;
- dev->mtu = new_mtu;
+ p.i_flags = tnl_flags_to_gre_flags(p.i_flags);
+ p.o_flags = tnl_flags_to_gre_flags(p.o_flags);
+
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
return 0;
}
@@ -1213,25 +549,23 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
...
ftp fec0:6666:6666::193.233.7.65
...
-
*/
-
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
const void *daddr, const void *saddr, unsigned int len)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
- __be16 *p = (__be16 *)(iph+1);
+ struct iphdr *iph;
+ struct gre_base_hdr *greh;
- memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
- p[0] = t->parms.o_flags;
- p[1] = htons(type);
+ iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
+ greh = (struct gre_base_hdr *)(iph+1);
+ greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags);
+ greh->protocol = htons(type);
- /*
- * Set the source hardware address.
- */
+ memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
+ /* Set the source hardware address. */
if (saddr)
memcpy(&iph->saddr, saddr, 4);
if (daddr)
@@ -1239,7 +573,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
if (iph->daddr)
return t->hlen;
- return -t->hlen;
+ return -(t->hlen + sizeof(*iph));
}
static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
@@ -1293,31 +627,21 @@ static int ipgre_close(struct net_device *dev)
}
return 0;
}
-
#endif
static const struct net_device_ops ipgre_netdev_ops = {
.ndo_init = ipgre_tunnel_init,
- .ndo_uninit = ipgre_tunnel_uninit,
+ .ndo_uninit = ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
.ndo_open = ipgre_open,
.ndo_stop = ipgre_close,
#endif
- .ndo_start_xmit = ipgre_tunnel_xmit,
+ .ndo_start_xmit = ipgre_xmit,
.ndo_do_ioctl = ipgre_tunnel_ioctl,
- .ndo_change_mtu = ipgre_tunnel_change_mtu,
- .ndo_get_stats64 = ipgre_get_stats64,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
-static void ipgre_dev_free(struct net_device *dev)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
-
- gro_cells_destroy(&tunnel->gro_cells);
- free_percpu(dev->tstats);
- free_netdev(dev);
-}
-
#define GRE_FEATURES (NETIF_F_SG | \
NETIF_F_FRAGLIST | \
NETIF_F_HIGHDMA | \
@@ -1326,35 +650,48 @@ static void ipgre_dev_free(struct net_device *dev)
static void ipgre_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipgre_netdev_ops;
- dev->destructor = ipgre_dev_free;
+ ip_tunnel_setup(dev, ipgre_net_id);
+}
- dev->type = ARPHRD_IPGRE;
- dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
+static void __gre_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel;
+
+ tunnel = netdev_priv(dev);
+ tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
+ tunnel->parms.iph.protocol = IPPROTO_GRE;
+
+ dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
- dev->flags = IFF_NOARP;
- dev->iflink = 0;
- dev->addr_len = 4;
- dev->features |= NETIF_F_NETNS_LOCAL;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
- dev->features |= GRE_FEATURES;
+ dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
+
+ if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
+ /* TCP offload with GRE SEQ is not supported. */
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ /* Can use a lockless transmit, unless we generate
+ * output sequences
+ */
+ dev->features |= NETIF_F_LLTX;
+ }
}
static int ipgre_tunnel_init(struct net_device *dev)
{
- struct ip_tunnel *tunnel;
- struct iphdr *iph;
- int err;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
- tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
+ __gre_tunnel_init(dev);
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ memcpy(dev->dev_addr, &iph->saddr, 4);
+ memcpy(dev->broadcast, &iph->daddr, 4);
- memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
- memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+ dev->type = ARPHRD_IPGRE;
+ dev->flags = IFF_NOARP;
+ dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ dev->addr_len = 4;
if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
@@ -1368,106 +705,30 @@ static int ipgre_tunnel_init(struct net_device *dev)
} else
dev->header_ops = &ipgre_header_ops;
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- err = gro_cells_init(&tunnel->gro_cells, dev);
- if (err) {
- free_percpu(dev->tstats);
- return err;
- }
-
- return 0;
+ return ip_tunnel_init(dev);
}
-static void ipgre_fb_tunnel_init(struct net_device *dev)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- struct iphdr *iph = &tunnel->parms.iph;
-
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
-
- iph->version = 4;
- iph->protocol = IPPROTO_GRE;
- iph->ihl = 5;
- tunnel->hlen = sizeof(struct iphdr) + 4;
-
- dev_hold(dev);
-}
-
-
static const struct gre_protocol ipgre_protocol = {
.handler = ipgre_rcv,
.err_handler = ipgre_err,
};
-static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
-{
- int prio;
-
- for (prio = 0; prio < 4; prio++) {
- int h;
- for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t;
-
- t = rtnl_dereference(ign->tunnels[prio][h]);
-
- while (t != NULL) {
- unregister_netdevice_queue(t->dev, head);
- t = rtnl_dereference(t->next);
- }
- }
- }
-}
-
static int __net_init ipgre_init_net(struct net *net)
{
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
- int err;
-
- ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
- ipgre_tunnel_setup);
- if (!ign->fb_tunnel_dev) {
- err = -ENOMEM;
- goto err_alloc_dev;
- }
- dev_net_set(ign->fb_tunnel_dev, net);
-
- ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
- ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
-
- if ((err = register_netdev(ign->fb_tunnel_dev)))
- goto err_reg_dev;
-
- rcu_assign_pointer(ign->tunnels_wc[0],
- netdev_priv(ign->fb_tunnel_dev));
- return 0;
-
-err_reg_dev:
- ipgre_dev_free(ign->fb_tunnel_dev);
-err_alloc_dev:
- return err;
+ return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}
static void __net_exit ipgre_exit_net(struct net *net)
{
- struct ipgre_net *ign;
- LIST_HEAD(list);
-
- ign = net_generic(net, ipgre_net_id);
- rtnl_lock();
- ipgre_destroy_tunnels(ign, &list);
- unregister_netdevice_many(&list);
- rtnl_unlock();
+ struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
+ ip_tunnel_delete_net(itn);
}
static struct pernet_operations ipgre_net_ops = {
.init = ipgre_init_net,
.exit = ipgre_exit_net,
.id = &ipgre_net_id,
- .size = sizeof(struct ipgre_net),
+ .size = sizeof(struct ip_tunnel_net),
};
static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -1512,8 +773,8 @@ out:
return ipgre_tunnel_validate(tb, data);
}
-static void ipgre_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms)
+static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
+ struct ip_tunnel_parm *parms)
{
memset(parms, 0, sizeof(*parms));
@@ -1526,10 +787,10 @@ static void ipgre_netlink_parms(struct nlattr *data[],
parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
if (data[IFLA_GRE_IFLAGS])
- parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
if (data[IFLA_GRE_OFLAGS])
- parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
if (data[IFLA_GRE_IKEY])
parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
@@ -1553,145 +814,46 @@ static void ipgre_netlink_parms(struct nlattr *data[],
parms->iph.frag_off = htons(IP_DF);
}
-static int ipgre_tap_init(struct net_device *dev)
+static int gre_tap_init(struct net_device *dev)
{
- struct ip_tunnel *tunnel;
-
- tunnel = netdev_priv(dev);
-
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ __gre_tunnel_init(dev);
- ipgre_tunnel_bind_dev(dev);
-
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- return 0;
+ return ip_tunnel_init(dev);
}
-static const struct net_device_ops ipgre_tap_netdev_ops = {
- .ndo_init = ipgre_tap_init,
- .ndo_uninit = ipgre_tunnel_uninit,
- .ndo_start_xmit = ipgre_tunnel_xmit,
+static const struct net_device_ops gre_tap_netdev_ops = {
+ .ndo_init = gre_tap_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = gre_tap_xmit,
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
- .ndo_change_mtu = ipgre_tunnel_change_mtu,
- .ndo_get_stats64 = ipgre_get_stats64,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
static void ipgre_tap_setup(struct net_device *dev)
{
-
ether_setup(dev);
-
- dev->netdev_ops = &ipgre_tap_netdev_ops;
- dev->destructor = ipgre_dev_free;
-
- dev->iflink = 0;
- dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->netdev_ops = &gre_tap_netdev_ops;
+ ip_tunnel_setup(dev, gre_tap_net_id);
}
-static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
- struct nlattr *data[])
+static int ipgre_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
{
- struct ip_tunnel *nt;
- struct net *net = dev_net(dev);
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
- int mtu;
- int err;
-
- nt = netdev_priv(dev);
- ipgre_netlink_parms(data, &nt->parms);
-
- if (ipgre_tunnel_find(net, &nt->parms, dev->type))
- return -EEXIST;
-
- if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
- eth_hw_addr_random(dev);
-
- mtu = ipgre_tunnel_bind_dev(dev);
- if (!tb[IFLA_MTU])
- dev->mtu = mtu;
-
- /* Can use a lockless transmit, unless we generate output sequences */
- if (!(nt->parms.o_flags & GRE_SEQ))
- dev->features |= NETIF_F_LLTX;
-
- err = register_netdevice(dev);
- if (err)
- goto out;
-
- dev_hold(dev);
- ipgre_tunnel_link(ign, nt);
+ struct ip_tunnel_parm p;
-out:
- return err;
+ ipgre_netlink_parms(data, tb, &p);
+ return ip_tunnel_newlink(dev, tb, &p);
}
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
- struct ip_tunnel *t, *nt;
- struct net *net = dev_net(dev);
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
struct ip_tunnel_parm p;
- int mtu;
-
- if (dev == ign->fb_tunnel_dev)
- return -EINVAL;
-
- nt = netdev_priv(dev);
- ipgre_netlink_parms(data, &p);
-
- t = ipgre_tunnel_locate(net, &p, 0);
-
- if (t) {
- if (t->dev != dev)
- return -EEXIST;
- } else {
- t = nt;
-
- if (dev->type != ARPHRD_ETHER) {
- unsigned int nflags = 0;
-
- if (ipv4_is_multicast(p.iph.daddr))
- nflags = IFF_BROADCAST;
- else if (p.iph.daddr)
- nflags = IFF_POINTOPOINT;
-
- if ((dev->flags ^ nflags) &
- (IFF_POINTOPOINT | IFF_BROADCAST))
- return -EINVAL;
- }
- ipgre_tunnel_unlink(ign, t);
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- t->parms.i_key = p.i_key;
- if (dev->type != ARPHRD_ETHER) {
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- }
- ipgre_tunnel_link(ign, t);
- netdev_state_change(dev);
- }
-
- t->parms.o_key = p.o_key;
- t->parms.iph.ttl = p.iph.ttl;
- t->parms.iph.tos = p.iph.tos;
- t->parms.iph.frag_off = p.iph.frag_off;
-
- if (t->parms.link != p.link) {
- t->parms.link = p.link;
- mtu = ipgre_tunnel_bind_dev(dev);
- if (!tb[IFLA_MTU])
- dev->mtu = mtu;
- netdev_state_change(dev);
- }
-
- return 0;
+ ipgre_netlink_parms(data, tb, &p);
+ return ip_tunnel_changelink(dev, tb, &p);
}
static size_t ipgre_get_size(const struct net_device *dev)
@@ -1726,8 +888,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
struct ip_tunnel_parm *p = &t->parms;
if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
- nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
- nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
@@ -1765,6 +927,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
.validate = ipgre_tunnel_validate,
.newlink = ipgre_newlink,
.changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
.get_size = ipgre_get_size,
.fill_info = ipgre_fill_info,
};
@@ -1778,13 +941,28 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
.validate = ipgre_tap_validate,
.newlink = ipgre_newlink,
.changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
.get_size = ipgre_get_size,
.fill_info = ipgre_fill_info,
};
-/*
- * And now the modules code and kernel interface.
- */
+static int __net_init ipgre_tap_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
+}
+
+static void __net_exit ipgre_tap_exit_net(struct net *net)
+{
+ struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
+ ip_tunnel_delete_net(itn);
+}
+
+static struct pernet_operations ipgre_tap_net_ops = {
+ .init = ipgre_tap_init_net,
+ .exit = ipgre_tap_exit_net,
+ .id = &gre_tap_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
static int __init ipgre_init(void)
{
@@ -1796,6 +974,10 @@ static int __init ipgre_init(void)
if (err < 0)
return err;
+ err = register_pernet_device(&ipgre_tap_net_ops);
+ if (err < 0)
+ goto pnet_tap_faied;
+
err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
if (err < 0) {
pr_info("%s: can't add protocol\n", __func__);
@@ -1810,16 +992,17 @@ static int __init ipgre_init(void)
if (err < 0)
goto tap_ops_failed;
-out:
- return err;
+ return 0;
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
+ unregister_pernet_device(&ipgre_tap_net_ops);
+pnet_tap_faied:
unregister_pernet_device(&ipgre_net_ops);
- goto out;
+ return err;
}
static void __exit ipgre_fini(void)
@@ -1828,6 +1011,7 @@ static void __exit ipgre_fini(void)
rtnl_link_unregister(&ipgre_link_ops);
if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
pr_info("%s: can't remove protocol\n", __func__);
+ unregister_pernet_device(&ipgre_tap_net_ops);
unregister_pernet_device(&ipgre_net_ops);
}
@@ -1837,3 +1021,4 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");
+MODULE_ALIAS_NETDEV("gretap0");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index b007afb..64df5f5 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -208,13 +208,6 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
if (ipprot != NULL) {
int ret;
- if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
- net_info_ratelimited("%s: proto %d isn't netns-ready\n",
- __func__, protocol);
- kfree_skb(skb);
- goto out;
- }
-
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
@@ -235,9 +228,11 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
- } else
+ kfree_skb(skb);
+ } else {
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
- kfree_skb(skb);
+ consume_skb(skb);
+ }
}
}
out:
@@ -425,7 +420,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
- goto inhdr_error;
+ goto csum_error;
len = ntohs(iph->tot_len);
if (skb->len < len) {
@@ -452,6 +447,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
+csum_error:
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS);
inhdr_error:
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index a47fc1d..d1f8e10 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -370,7 +370,6 @@ int ip_options_compile(struct net *net,
}
switch (optptr[3]&0xF) {
case IPOPT_TS_TSONLY:
- opt->ts = optptr - iph;
if (skb)
timeptr = &optptr[optptr[2]-1];
opt->ts_needtime = 1;
@@ -381,7 +380,6 @@ int ip_options_compile(struct net *net,
pp_ptr = optptr + 2;
goto error;
}
- opt->ts = optptr - iph;
if (rt) {
spec_dst_fill(&spec_dst, skb);
memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
@@ -396,7 +394,6 @@ int ip_options_compile(struct net *net,
pp_ptr = optptr + 2;
goto error;
}
- opt->ts = optptr - iph;
{
__be32 addr;
memcpy(&addr, &optptr[optptr[2]-1], 4);
@@ -423,18 +420,18 @@ int ip_options_compile(struct net *net,
put_unaligned_be32(midtime, timeptr);
opt->is_changed = 1;
}
- } else {
+ } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {
unsigned int overflow = optptr[3]>>4;
if (overflow == 15) {
pp_ptr = optptr + 3;
goto error;
}
- opt->ts = optptr - iph;
if (skb) {
optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
opt->is_changed = 1;
}
}
+ opt->ts = optptr - iph;
break;
case IPOPT_RA:
if (optlen < 4) {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 3e98ed2..4bcabf3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -84,7 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
/* Generate a checksum for an outgoing IP datagram. */
-__inline__ void ip_send_check(struct iphdr *iph)
+void ip_send_check(struct iphdr *iph)
{
iph->check = 0;
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
@@ -430,8 +430,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
- defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
@@ -598,6 +597,7 @@ slow_path:
/* for offloaded checksums cleanup checksum before fragmentation */
if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
goto fail;
+ iph = ip_hdr(skb);
left = skb->len - hlen; /* Space per frame */
ptr = hlen; /* Where to start from */
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
new file mode 100644
index 0000000..7fa8f08
--- /dev/null
+++ b/net/ipv4/ip_tunnel.c
@@ -0,0 +1,1035 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/rculist.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn,
+ __be32 key, __be32 remote)
+{
+ return hash_32((__force u32)key ^ (__force u32)remote,
+ IP_TNL_HASH_BITS);
+}
+
+/* Often modified stats are per cpu, other are shared (netdev->stats) */
+struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_bh(&tstats->syncp);
+ rx_packets = tstats->rx_packets;
+ tx_packets = tstats->tx_packets;
+ rx_bytes = tstats->rx_bytes;
+ tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
+
+ tot->rx_packets += rx_packets;
+ tot->tx_packets += tx_packets;
+ tot->rx_bytes += rx_bytes;
+ tot->tx_bytes += tx_bytes;
+ }
+
+ tot->multicast = dev->stats.multicast;
+
+ tot->rx_crc_errors = dev->stats.rx_crc_errors;
+ tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
+ tot->rx_length_errors = dev->stats.rx_length_errors;
+ tot->rx_frame_errors = dev->stats.rx_frame_errors;
+ tot->rx_errors = dev->stats.rx_errors;
+
+ tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
+ tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
+ tot->tx_dropped = dev->stats.tx_dropped;
+ tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
+ tot->tx_errors = dev->stats.tx_errors;
+
+ tot->collisions = dev->stats.collisions;
+
+ return tot;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
+
+static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
+ __be16 flags, __be32 key)
+{
+ if (p->i_flags & TUNNEL_KEY) {
+ if (flags & TUNNEL_KEY)
+ return key == p->i_key;
+ else
+ /* key expected, none present */
+ return false;
+ } else
+ return !(flags & TUNNEL_KEY);
+}
+
+/* Fallback tunnel: no source, no destination, no key, no options
+
+ Tunnel hash table:
+ We require exact key match i.e. if a key is present in packet
+ it will match only tunnel with the same key; if it is not present,
+ it will match only keyless tunnel.
+
+ All keysless packets, if not matched configured keyless tunnels
+ will match fallback tunnel.
+ Given src, dst and key, find appropriate for input tunnel.
+*/
+struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
+ int link, __be16 flags,
+ __be32 remote, __be32 local,
+ __be32 key)
+{
+ unsigned int hash;
+ struct ip_tunnel *t, *cand = NULL;
+ struct hlist_head *head;
+
+ hash = ip_tunnel_hash(itn, key, remote);
+ head = &itn->tunnels[hash];
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (local != t->parms.iph.saddr ||
+ remote != t->parms.iph.daddr ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else
+ cand = t;
+ }
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (remote != t->parms.iph.daddr ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+ hash = ip_tunnel_hash(itn, key, 0);
+ head = &itn->tunnels[hash];
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if ((local != t->parms.iph.saddr &&
+ (local != t->parms.iph.daddr ||
+ !ipv4_is_multicast(local))) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+ if (flags & TUNNEL_NO_KEY)
+ goto skip_key_lookup;
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (t->parms.i_key != key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+skip_key_lookup:
+ if (cand)
+ return cand;
+
+ if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
+ return netdev_priv(itn->fb_tunnel_dev);
+
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
+
+static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms)
+{
+ unsigned int h;
+ __be32 remote;
+
+ if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
+ remote = parms->iph.daddr;
+ else
+ remote = 0;
+
+ h = ip_tunnel_hash(itn, parms->i_key, remote);
+ return &itn->tunnels[h];
+}
+
+static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
+{
+ struct hlist_head *head = ip_bucket(itn, &t->parms);
+
+ hlist_add_head_rcu(&t->hash_node, head);
+}
+
+static void ip_tunnel_del(struct ip_tunnel *t)
+{
+ hlist_del_init_rcu(&t->hash_node);
+}
+
+static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms,
+ int type)
+{
+ __be32 remote = parms->iph.daddr;
+ __be32 local = parms->iph.saddr;
+ __be32 key = parms->i_key;
+ int link = parms->link;
+ struct ip_tunnel *t = NULL;
+ struct hlist_head *head = ip_bucket(itn, parms);
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (local == t->parms.iph.saddr &&
+ remote == t->parms.iph.daddr &&
+ key == t->parms.i_key &&
+ link == t->parms.link &&
+ type == t->dev->type)
+ break;
+ }
+ return t;
+}
+
+static struct net_device *__ip_tunnel_create(struct net *net,
+ const struct rtnl_link_ops *ops,
+ struct ip_tunnel_parm *parms)
+{
+ int err;
+ struct ip_tunnel *tunnel;
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+
+ if (parms->name[0])
+ strlcpy(name, parms->name, IFNAMSIZ);
+ else {
+ if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
+ err = -E2BIG;
+ goto failed;
+ }
+ strlcpy(name, ops->kind, IFNAMSIZ);
+ strncat(name, "%d", 2);
+ }
+
+ ASSERT_RTNL();
+ dev = alloc_netdev(ops->priv_size, name, ops->setup);
+ if (!dev) {
+ err = -ENOMEM;
+ goto failed;
+ }
+ dev_net_set(dev, net);
+
+ dev->rtnl_link_ops = ops;
+
+ tunnel = netdev_priv(dev);
+ tunnel->parms = *parms;
+
+ err = register_netdevice(dev);
+ if (err)
+ goto failed_free;
+
+ return dev;
+
+failed_free:
+ free_netdev(dev);
+failed:
+ return ERR_PTR(err);
+}
+
+static inline struct rtable *ip_route_output_tunnel(struct net *net,
+ struct flowi4 *fl4,
+ int proto,
+ __be32 daddr, __be32 saddr,
+ __be32 key, __u8 tos, int oif)
+{
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->flowi4_oif = oif;
+ fl4->daddr = daddr;
+ fl4->saddr = saddr;
+ fl4->flowi4_tos = tos;
+ fl4->flowi4_proto = proto;
+ fl4->fl4_gre_key = key;
+ return ip_route_output_key(net, fl4);
+}
+
+static int ip_tunnel_bind_dev(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *iph;
+ int hlen = LL_MAX_HEADER;
+ int mtu = ETH_DATA_LEN;
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ iph = &tunnel->parms.iph;
+
+ /* Guess output device to choose reasonable mtu and needed_headroom */
+ if (iph->daddr) {
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ rt = ip_route_output_tunnel(dev_net(dev), &fl4,
+ tunnel->parms.iph.protocol,
+ iph->daddr, iph->saddr,
+ tunnel->parms.o_key,
+ RT_TOS(iph->tos),
+ tunnel->parms.link);
+ if (!IS_ERR(rt)) {
+ tdev = rt->dst.dev;
+ ip_rt_put(rt);
+ }
+ if (dev->type != ARPHRD_ETHER)
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+
+ if (tdev) {
+ hlen = tdev->hard_header_len + tdev->needed_headroom;
+ mtu = tdev->mtu;
+ }
+ dev->iflink = tunnel->parms.link;
+
+ dev->needed_headroom = t_hlen + hlen;
+ mtu -= (dev->hard_header_len + t_hlen);
+
+ if (mtu < 68)
+ mtu = 68;
+
+ return mtu;
+}
+
+static struct ip_tunnel *ip_tunnel_create(struct net *net,
+ struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms)
+{
+ struct ip_tunnel *nt, *fbt;
+ struct net_device *dev;
+
+ BUG_ON(!itn->fb_tunnel_dev);
+ fbt = netdev_priv(itn->fb_tunnel_dev);
+ dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
+ if (IS_ERR(dev))
+ return NULL;
+
+ dev->mtu = ip_tunnel_bind_dev(dev);
+
+ nt = netdev_priv(dev);
+ ip_tunnel_add(itn, nt);
+ return nt;
+}
+
+int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi, bool log_ecn_error)
+{
+ struct pcpu_tstats *tstats;
+ const struct iphdr *iph = ip_hdr(skb);
+ int err;
+
+ secpath_reset(skb);
+
+ skb->protocol = tpi->proto;
+
+ skb->mac_header = skb->network_header;
+ __pskb_pull(skb, tunnel->hlen);
+ skb_postpull_rcsum(skb, skb_transport_header(skb), tunnel->hlen);
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(iph->daddr)) {
+ /* Looped back packet, drop it! */
+ if (rt_is_output_route(skb_rtable(skb)))
+ goto drop;
+ tunnel->dev->stats.multicast++;
+ skb->pkt_type = PACKET_BROADCAST;
+ }
+#endif
+
+ if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
+ ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
+ tunnel->dev->stats.rx_crc_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+
+ if (tunnel->parms.i_flags&TUNNEL_SEQ) {
+ if (!(tpi->flags&TUNNEL_SEQ) ||
+ (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
+ tunnel->dev->stats.rx_fifo_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+ tunnel->i_seqno = ntohl(tpi->seq) + 1;
+ }
+
+ /* Warning: All skb pointers will be invalidated! */
+ if (tunnel->dev->type == ARPHRD_ETHER) {
+ if (!pskb_may_pull(skb, ETH_HLEN)) {
+ tunnel->dev->stats.rx_length_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+
+ iph = ip_hdr(skb);
+ skb->protocol = eth_type_trans(skb, tunnel->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ }
+
+ skb->pkt_type = PACKET_HOST;
+ __skb_tunnel_rx(skb, tunnel->dev);
+
+ skb_reset_network_header(skb);
+ err = IP_ECN_decapsulate(iph, skb);
+ if (unlikely(err)) {
+ if (log_ecn_error)
+ net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+ &iph->saddr, iph->tos);
+ if (err > 1) {
+ ++tunnel->dev->stats.rx_frame_errors;
+ ++tunnel->dev->stats.rx_errors;
+ goto drop;
+ }
+ }
+
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ gro_cells_receive(&tunnel->gro_cells, skb);
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
+
+void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+ const struct iphdr *tnl_params)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *inner_iph;
+ struct iphdr *iph;
+ struct flowi4 fl4;
+ u8 tos, ttl;
+ __be16 df;
+ struct rtable *rt; /* Route to the other host */
+ struct net_device *tdev; /* Device to other host */
+ unsigned int max_headroom; /* The extra header space needed */
+ __be32 dst;
+ int mtu;
+
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ dst = tnl_params->daddr;
+ if (dst == 0) {
+ /* NBMA tunnel */
+
+ if (skb_dst(skb) == NULL) {
+ dev->stats.tx_fifo_errors++;
+ goto tx_error;
+ }
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ rt = skb_rtable(skb);
+ dst = rt_nexthop(rt, inner_iph->daddr);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ const struct in6_addr *addr6;
+ struct neighbour *neigh;
+ bool do_tx_error_icmp;
+ int addr_type;
+
+ neigh = dst_neigh_lookup(skb_dst(skb),
+ &ipv6_hdr(skb)->daddr);
+ if (neigh == NULL)
+ goto tx_error;
+
+ addr6 = (const struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &ipv6_hdr(skb)->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+ do_tx_error_icmp = true;
+ else {
+ do_tx_error_icmp = false;
+ dst = addr6->s6_addr32[3];
+ }
+ neigh_release(neigh);
+ if (do_tx_error_icmp)
+ goto tx_error_icmp;
+ }
+#endif
+ else
+ goto tx_error;
+ }
+
+ tos = tnl_params->tos;
+ if (tos & 0x1) {
+ tos &= ~0x1;
+ if (skb->protocol == htons(ETH_P_IP))
+ tos = inner_iph->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+ }
+
+ rt = ip_route_output_tunnel(dev_net(dev), &fl4,
+ tunnel->parms.iph.protocol,
+ dst, tnl_params->saddr,
+ tunnel->parms.o_key,
+ RT_TOS(tos),
+ tunnel->parms.link);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+ tdev = rt->dst.dev;
+
+ if (tdev == dev) {
+ ip_rt_put(rt);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+
+ df = tnl_params->frag_off;
+
+ if (df)
+ mtu = dst_mtu(&rt->dst) - dev->hard_header_len
+ - sizeof(struct iphdr);
+ else
+ mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+
+ if (skb_dst(skb))
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ df |= (inner_iph->frag_off&htons(IP_DF));
+
+ if (!skb_is_gso(skb) &&
+ (inner_iph->frag_off&htons(IP_DF)) &&
+ mtu < ntohs(inner_iph->tot_len)) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+ if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
+ mtu >= IPV6_MIN_MTU) {
+ if ((tunnel->parms.iph.daddr &&
+ !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
+ rt6->rt6i_dst.plen == 128) {
+ rt6->rt6i_flags |= RTF_MODIFIED;
+ dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
+ }
+ }
+
+ if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
+ mtu < skb->len) {
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ }
+#endif
+
+ if (tunnel->err_count > 0) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+ tunnel->err_count--;
+
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ ttl = tnl_params->ttl;
+ if (ttl == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ ttl = inner_iph->ttl;
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+#endif
+ else
+ ttl = ip4_dst_hoplimit(&rt->dst);
+ }
+
+ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr)
+ + rt->dst.header_len;
+ if (max_headroom > dev->needed_headroom) {
+ dev->needed_headroom = max_headroom;
+ if (skb_cow_head(skb, dev->needed_headroom)) {
+ dev->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+ return;
+ }
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+
+ /* Push down and install the IP header. */
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+
+ iph = ip_hdr(skb);
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->frag_off = df;
+ iph->protocol = tnl_params->protocol;
+ iph->tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+ iph->daddr = fl4.daddr;
+ iph->saddr = fl4.saddr;
+ iph->ttl = ttl;
+ tunnel_ip_select_ident(skb, inner_iph, &rt->dst);
+
+ iptunnel_xmit(skb, dev);
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6)
+tx_error_icmp:
+ dst_link_failure(skb);
+#endif
+tx_error:
+ dev->stats.tx_errors++;
+ dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
+
+static void ip_tunnel_update(struct ip_tunnel_net *itn,
+ struct ip_tunnel *t,
+ struct net_device *dev,
+ struct ip_tunnel_parm *p,
+ bool set_mtu)
+{
+ ip_tunnel_del(t);
+ t->parms.iph.saddr = p->iph.saddr;
+ t->parms.iph.daddr = p->iph.daddr;
+ t->parms.i_key = p->i_key;
+ t->parms.o_key = p->o_key;
+ if (dev->type != ARPHRD_ETHER) {
+ memcpy(dev->dev_addr, &p->iph.saddr, 4);
+ memcpy(dev->broadcast, &p->iph.daddr, 4);
+ }
+ ip_tunnel_add(itn, t);
+
+ t->parms.iph.ttl = p->iph.ttl;
+ t->parms.iph.tos = p->iph.tos;
+ t->parms.iph.frag_off = p->iph.frag_off;
+
+ if (t->parms.link != p->link) {
+ int mtu;
+
+ t->parms.link = p->link;
+ mtu = ip_tunnel_bind_dev(dev);
+ if (set_mtu)
+ dev->mtu = mtu;
+ }
+ netdev_state_change(dev);
+}
+
+int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel *t;
+ struct net *net = dev_net(dev);
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
+
+ BUG_ON(!itn->fb_tunnel_dev);
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ t = NULL;
+ if (dev == itn->fb_tunnel_dev)
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ if (t == NULL)
+ t = netdev_priv(dev);
+ memcpy(p, &t->parms, sizeof(*p));
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+ if (p->iph.ttl)
+ p->iph.frag_off |= htons(IP_DF);
+ if (!(p->i_flags&TUNNEL_KEY))
+ p->i_key = 0;
+ if (!(p->o_flags&TUNNEL_KEY))
+ p->o_key = 0;
+
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+
+ if (!t && (cmd == SIOCADDTUNNEL))
+ t = ip_tunnel_create(net, itn, p);
+
+ if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t != NULL) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ unsigned int nflags = 0;
+
+ if (ipv4_is_multicast(p->iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p->iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+ err = -EINVAL;
+ break;
+ }
+
+ t = netdev_priv(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+ ip_tunnel_update(itn, t, dev, p, true);
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == itn->fb_tunnel_dev) {
+ err = -ENOENT;
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ if (t == NULL)
+ goto done;
+ err = -EPERM;
+ if (t == netdev_priv(itn->fb_tunnel_dev))
+ goto done;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ err = 0;
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
+
+int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ if (new_mtu < 68 ||
+ new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
+
+static void ip_tunnel_dev_free(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ gro_cells_destroy(&tunnel->gro_cells);
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
+void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct net *net = dev_net(dev);
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(net, tunnel->ip_tnl_net_id);
+
+ if (itn->fb_tunnel_dev != dev) {
+ ip_tunnel_del(netdev_priv(dev));
+ unregister_netdevice_queue(dev, head);
+ }
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
+
+int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+ struct rtnl_link_ops *ops, char *devname)
+{
+ struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
+ struct ip_tunnel_parm parms;
+
+ itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL);
+ if (!itn->tunnels)
+ return -ENOMEM;
+
+ if (!ops) {
+ itn->fb_tunnel_dev = NULL;
+ return 0;
+ }
+ memset(&parms, 0, sizeof(parms));
+ if (devname)
+ strlcpy(parms.name, devname, IFNAMSIZ);
+
+ rtnl_lock();
+ itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+ rtnl_unlock();
+ if (IS_ERR(itn->fb_tunnel_dev)) {
+ kfree(itn->tunnels);
+ return PTR_ERR(itn->fb_tunnel_dev);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
+
+static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head)
+{
+ int h;
+
+ for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
+ struct ip_tunnel *t;
+ struct hlist_node *n;
+ struct hlist_head *thead = &itn->tunnels[h];
+
+ hlist_for_each_entry_safe(t, n, thead, hash_node)
+ unregister_netdevice_queue(t->dev, head);
+ }
+ if (itn->fb_tunnel_dev)
+ unregister_netdevice_queue(itn->fb_tunnel_dev, head);
+}
+
+void ip_tunnel_delete_net(struct ip_tunnel_net *itn)
+{
+ LIST_HEAD(list);
+
+ rtnl_lock();
+ ip_tunnel_destroy(itn, &list);
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+ kfree(itn->tunnels);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
+
+int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *nt;
+ struct net *net = dev_net(dev);
+ struct ip_tunnel_net *itn;
+ int mtu;
+ int err;
+
+ nt = netdev_priv(dev);
+ itn = net_generic(net, nt->ip_tnl_net_id);
+
+ if (ip_tunnel_find(itn, p, dev->type))
+ return -EEXIST;
+
+ nt->parms = *p;
+ err = register_netdevice(dev);
+ if (err)
+ goto out;
+
+ if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+ eth_hw_addr_random(dev);
+
+ mtu = ip_tunnel_bind_dev(dev);
+ if (!tb[IFLA_MTU])
+ dev->mtu = mtu;
+
+ ip_tunnel_add(itn, nt);
+
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
+
+int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *t, *nt;
+ struct net *net = dev_net(dev);
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
+
+ if (dev == itn->fb_tunnel_dev)
+ return -EINVAL;
+
+ nt = netdev_priv(dev);
+
+ t = ip_tunnel_find(itn, p, dev->type);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else {
+ t = nt;
+
+ if (dev->type != ARPHRD_ETHER) {
+ unsigned int nflags = 0;
+
+ if (ipv4_is_multicast(p->iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p->iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags ^ nflags) &
+ (IFF_POINTOPOINT | IFF_BROADCAST))
+ return -EINVAL;
+ }
+ }
+
+ ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
+
+int ip_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+ int err;
+
+ dev->destructor = ip_tunnel_dev_free;
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ err = gro_cells_init(&tunnel->gro_cells, dev);
+ if (err) {
+ free_percpu(dev->tstats);
+ return err;
+ }
+
+ tunnel->dev = dev;
+ strcpy(tunnel->parms.name, dev->name);
+ iph->version = 4;
+ iph->ihl = 5;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init);
+
+void ip_tunnel_uninit(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(net, tunnel->ip_tnl_net_id);
+ /* fb_tunnel_dev will be unregisted in net-exit call. */
+ if (itn->fb_tunnel_dev != dev)
+ ip_tunnel_del(netdev_priv(dev));
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
+
+/* Do least required initialization, rest of init is done in tunnel_init call */
+void ip_tunnel_setup(struct net_device *dev, int net_id)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ tunnel->ip_tnl_net_id = net_id;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_setup);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index c3a4233..c118f6b 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -38,7 +38,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
@@ -82,44 +82,6 @@ static int vti_tunnel_bind_dev(struct net_device *dev);
} while (0)
-static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
-{
- int i;
-
- for_each_possible_cpu(i) {
- const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
- u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
- unsigned int start;
-
- do {
- start = u64_stats_fetch_begin_bh(&tstats->syncp);
- rx_packets = tstats->rx_packets;
- tx_packets = tstats->tx_packets;
- rx_bytes = tstats->rx_bytes;
- tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
-
- tot->rx_packets += rx_packets;
- tot->tx_packets += tx_packets;
- tot->rx_bytes += rx_bytes;
- tot->tx_bytes += tx_bytes;
- }
-
- tot->multicast = dev->stats.multicast;
- tot->rx_crc_errors = dev->stats.rx_crc_errors;
- tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
- tot->rx_length_errors = dev->stats.rx_length_errors;
- tot->rx_errors = dev->stats.rx_errors;
- tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
- tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
- tot->tx_dropped = dev->stats.tx_dropped;
- tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
- tot->tx_errors = dev->stats.tx_errors;
-
- return tot;
-}
-
static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
__be32 remote, __be32 local)
{
@@ -399,8 +361,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
tunnel->err_count = 0;
}
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
- IPSKB_REROUTED);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
nf_reset(skb);
@@ -597,7 +558,7 @@ static const struct net_device_ops vti_netdev_ops = {
.ndo_start_xmit = vti_tunnel_xmit,
.ndo_do_ioctl = vti_tunnel_ioctl,
.ndo_change_mtu = vti_tunnel_change_mtu,
- .ndo_get_stats64 = vti_get_stats64,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
static void vti_dev_free(struct net_device *dev)
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 9a46dae..59cb8c7 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -75,6 +75,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
t->props.mode = x->props.mode;
t->props.saddr.a4 = x->props.saddr.a4;
t->props.flags = x->props.flags;
+ t->props.extra_flags = x->props.extra_flags;
memcpy(&t->mark, &x->mark, sizeof(t->mark));
if (xfrm_init_state(t))
@@ -163,6 +164,7 @@ static const struct net_protocol ipcomp4_protocol = {
.handler = xfrm4_rcv,
.err_handler = ipcomp4_err,
.no_policy = 1,
+ .netns_ok = 1,
};
static int __init ipcomp4_init(void)
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index a2e50ae..efa1138 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -206,7 +206,7 @@ static int __init ic_open_devs(void)
struct ic_device *d, **last;
struct net_device *dev;
unsigned short oflags;
- unsigned long start;
+ unsigned long start, next_msg;
last = &ic_first_dev;
rtnl_lock();
@@ -263,12 +263,23 @@ static int __init ic_open_devs(void)
/* wait for a carrier on at least one device */
start = jiffies;
+ next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) {
+ int wait, elapsed;
+
for_each_netdev(&init_net, dev)
if (ic_is_init_dev(dev) && netif_carrier_ok(dev))
goto have_carrier;
msleep(1);
+
+ if time_before(jiffies, next_msg)
+ continue;
+
+ elapsed = jiffies_to_msecs(jiffies - start);
+ wait = (CONF_CARRIER_TIMEOUT - elapsed + 500)/1000;
+ pr_info("Waiting up to %d more seconds for network.\n", wait);
+ next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
}
have_carrier:
rtnl_unlock();
@@ -1394,7 +1405,7 @@ static int __init ip_auto_config(void)
unsigned int i;
#ifdef CONFIG_PROC_FS
- proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
+ proc_create("pnp", S_IRUGO, init_net.proc_net, &pnp_seq_fops);
#endif /* CONFIG_PROC_FS */
if (!ic_enable)
@@ -1522,7 +1533,8 @@ static int __init ip_auto_config(void)
}
for (i++; i < CONF_NAMESERVERS_MAX; i++)
if (ic_nameservers[i] != NONE)
- pr_cont(", nameserver%u=%pI4\n", i, &ic_nameservers[i]);
+ pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]);
+ pr_cont("\n");
#endif /* !SILENT */
return 0;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 191fc24..77bfcce 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -111,227 +111,21 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
-#define HASH_SIZE 16
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
-
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static int ipip_net_id __read_mostly;
-struct ipip_net {
- struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_wc[1];
- struct ip_tunnel __rcu **tunnels[4];
-
- struct net_device *fb_tunnel_dev;
-};
static int ipip_tunnel_init(struct net_device *dev);
-static void ipip_tunnel_setup(struct net_device *dev);
-static void ipip_dev_free(struct net_device *dev);
static struct rtnl_link_ops ipip_link_ops __read_mostly;
-static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
-{
- int i;
-
- for_each_possible_cpu(i) {
- const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
- u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
- unsigned int start;
-
- do {
- start = u64_stats_fetch_begin_bh(&tstats->syncp);
- rx_packets = tstats->rx_packets;
- tx_packets = tstats->tx_packets;
- rx_bytes = tstats->rx_bytes;
- tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
-
- tot->rx_packets += rx_packets;
- tot->tx_packets += tx_packets;
- tot->rx_bytes += rx_bytes;
- tot->tx_bytes += tx_bytes;
- }
-
- tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
- tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
- tot->tx_dropped = dev->stats.tx_dropped;
- tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
- tot->tx_errors = dev->stats.tx_errors;
- tot->collisions = dev->stats.collisions;
-
- return tot;
-}
-
-static struct ip_tunnel *ipip_tunnel_lookup(struct net *net,
- __be32 remote, __be32 local)
-{
- unsigned int h0 = HASH(remote);
- unsigned int h1 = HASH(local);
- struct ip_tunnel *t;
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
- if (local == t->parms.iph.saddr &&
- remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
- return t;
-
- for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
- if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
- return t;
-
- for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
- if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
- return t;
-
- t = rcu_dereference(ipn->tunnels_wc[0]);
- if (t && (t->dev->flags&IFF_UP))
- return t;
- return NULL;
-}
-
-static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
- struct ip_tunnel_parm *parms)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- unsigned int h = 0;
- int prio = 0;
-
- if (remote) {
- prio |= 2;
- h ^= HASH(remote);
- }
- if (local) {
- prio |= 1;
- h ^= HASH(local);
- }
- return &ipn->tunnels[prio][h];
-}
-
-static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
- struct ip_tunnel *t)
-{
- return __ipip_bucket(ipn, &t->parms);
-}
-
-static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp;
- struct ip_tunnel *iter;
-
- for (tp = ipip_bucket(ipn, t);
- (iter = rtnl_dereference(*tp)) != NULL;
- tp = &iter->next) {
- if (t == iter) {
- rcu_assign_pointer(*tp, t->next);
- break;
- }
- }
-}
-
-static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
-
- rcu_assign_pointer(t->next, rtnl_dereference(*tp));
- rcu_assign_pointer(*tp, t);
-}
-
-static int ipip_tunnel_create(struct net_device *dev)
-{
- struct ip_tunnel *t = netdev_priv(dev);
- struct net *net = dev_net(dev);
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
- int err;
-
- err = ipip_tunnel_init(dev);
- if (err < 0)
- goto out;
-
- err = register_netdevice(dev);
- if (err < 0)
- goto out;
-
- strcpy(t->parms.name, dev->name);
- dev->rtnl_link_ops = &ipip_link_ops;
-
- dev_hold(dev);
- ipip_tunnel_link(ipn, t);
- return 0;
-
-out:
- return err;
-}
-
-static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
- struct ip_tunnel_parm *parms, int create)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- struct ip_tunnel *t, *nt;
- struct ip_tunnel __rcu **tp;
- struct net_device *dev;
- char name[IFNAMSIZ];
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- for (tp = __ipip_bucket(ipn, parms);
- (t = rtnl_dereference(*tp)) != NULL;
- tp = &t->next) {
- if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
- return t;
- }
- if (!create)
- return NULL;
-
- if (parms->name[0])
- strlcpy(name, parms->name, IFNAMSIZ);
- else
- strcpy(name, "tunl%d");
-
- dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
- if (dev == NULL)
- return NULL;
-
- dev_net_set(dev, net);
-
- nt = netdev_priv(dev);
- nt->parms = *parms;
-
- if (ipip_tunnel_create(dev) < 0)
- goto failed_free;
-
- return nt;
-
-failed_free:
- ipip_dev_free(dev);
- return NULL;
-}
-
-/* called with RTNL */
-static void ipip_tunnel_uninit(struct net_device *dev)
-{
- struct net *net = dev_net(dev);
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- if (dev == ipn->fb_tunnel_dev)
- RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL);
- else
- ipip_tunnel_unlink(ipn, netdev_priv(dev));
- dev_put(dev);
-}
-
static int ipip_err(struct sk_buff *skb, u32 info)
{
@@ -339,41 +133,17 @@ static int ipip_err(struct sk_buff *skb, u32 info)
8 bytes of packet payload. It means, that precise relaying of
ICMP in the real Internet is absolutely infeasible.
*/
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
const struct iphdr *iph = (const struct iphdr *)skb->data;
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
struct ip_tunnel *t;
int err;
-
- switch (type) {
- default:
- case ICMP_PARAMETERPROB:
- return 0;
-
- case ICMP_DEST_UNREACH:
- switch (code) {
- case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
- /* Impossible event. */
- return 0;
- default:
- /* All others are translated to HOST_UNREACH.
- rfc2003 contains "deep thoughts" about NET_UNREACH,
- I believe they are just ether pollution. --ANK
- */
- break;
- }
- break;
- case ICMP_TIME_EXCEEDED:
- if (code != ICMP_EXC_TTL)
- return 0;
- break;
- case ICMP_REDIRECT:
- break;
- }
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
err = -ENOENT;
- t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
if (t == NULL)
goto out;
@@ -403,53 +173,29 @@ static int ipip_err(struct sk_buff *skb, u32 info)
else
t->err_count = 1;
t->err_time = jiffies;
-out:
+out:
return err;
}
+static const struct tnl_ptk_info tpi = {
+ /* no tunnel info required for ipip. */
+ .proto = htons(ETH_P_IP),
+};
+
static int ipip_rcv(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
struct ip_tunnel *tunnel;
const struct iphdr *iph = ip_hdr(skb);
- int err;
-
- tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
- if (tunnel != NULL) {
- struct pcpu_tstats *tstats;
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->saddr, iph->daddr, 0);
+ if (tunnel) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
-
- secpath_reset(skb);
-
- skb->mac_header = skb->network_header;
- skb_reset_network_header(skb);
- skb->protocol = htons(ETH_P_IP);
- skb->pkt_type = PACKET_HOST;
-
- __skb_tunnel_rx(skb, tunnel->dev);
-
- err = IP_ECN_decapsulate(iph, skb);
- if (unlikely(err)) {
- if (log_ecn_error)
- net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
- &iph->saddr, iph->tos);
- if (err > 1) {
- ++tunnel->dev->stats.rx_frame_errors;
- ++tunnel->dev->stats.rx_errors;
- goto drop;
- }
- }
-
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
-
- netif_rx(skb);
- return 0;
+ return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
}
return -1;
@@ -463,327 +209,64 @@ drop:
* This function assumes it is being called from dev_queue_xmit()
* and that skb is filled properly by that function.
*/
-
static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tiph = &tunnel->parms.iph;
- u8 tos = tunnel->parms.iph.tos;
- __be16 df = tiph->frag_off;
- struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
- const struct iphdr *old_iph = ip_hdr(skb);
- struct iphdr *iph; /* Our new IP header */
- unsigned int max_headroom; /* The extra header space needed */
- __be32 dst = tiph->daddr;
- struct flowi4 fl4;
- int mtu;
-
- if (skb->protocol != htons(ETH_P_IP))
- goto tx_error;
-
- if (skb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(skb))
- goto tx_error;
- if (tos & 1)
- tos = old_iph->tos;
-
- if (!dst) {
- /* NBMA tunnel */
- if ((rt = skb_rtable(skb)) == NULL) {
- dev->stats.tx_fifo_errors++;
- goto tx_error;
- }
- dst = rt_nexthop(rt, old_iph->daddr);
- }
-
- rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
- dst, tiph->saddr,
- 0, 0,
- IPPROTO_IPIP, RT_TOS(tos),
- tunnel->parms.link);
- if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
- goto tx_error_icmp;
- }
- tdev = rt->dst.dev;
-
- if (tdev == dev) {
- ip_rt_put(rt);
- dev->stats.collisions++;
+ if (unlikely(skb->protocol != htons(ETH_P_IP)))
goto tx_error;
- }
- df |= old_iph->frag_off & htons(IP_DF);
-
- if (df) {
- mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
-
- if (mtu < 68) {
- dev->stats.collisions++;
- ip_rt_put(rt);
- goto tx_error;
- }
-
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
-
- if ((old_iph->frag_off & htons(IP_DF)) &&
- mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
- ip_rt_put(rt);
- goto tx_error;
- }
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
}
- if (tunnel->err_count > 0) {
- if (time_before(jiffies,
- tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
- tunnel->err_count--;
- dst_link_failure(skb);
- } else
- tunnel->err_count = 0;
- }
-
- /*
- * Okay, now see if we can stuff it in the buffer as-is.
- */
- max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
-
- if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
- (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
- struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- ip_rt_put(rt);
- dev->stats.tx_dropped++;
- dev_kfree_skb(skb);
- return NETDEV_TX_OK;
- }
- if (skb->sk)
- skb_set_owner_w(new_skb, skb->sk);
- dev_kfree_skb(skb);
- skb = new_skb;
- old_iph = ip_hdr(skb);
- }
-
- skb->transport_header = skb->network_header;
- skb_push(skb, sizeof(struct iphdr));
- skb_reset_network_header(skb);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
- IPSKB_REROUTED);
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
- /*
- * Push down and install the IPIP header.
- */
-
- iph = ip_hdr(skb);
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr)>>2;
- iph->frag_off = df;
- iph->protocol = IPPROTO_IPIP;
- iph->tos = INET_ECN_encapsulate(tos, old_iph->tos);
- iph->daddr = fl4.daddr;
- iph->saddr = fl4.saddr;
-
- if ((iph->ttl = tiph->ttl) == 0)
- iph->ttl = old_iph->ttl;
-
- iptunnel_xmit(skb, dev);
+ ip_tunnel_xmit(skb, dev, tiph);
return NETDEV_TX_OK;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
dev->stats.tx_errors++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
-static void ipip_tunnel_bind_dev(struct net_device *dev)
-{
- struct net_device *tdev = NULL;
- struct ip_tunnel *tunnel;
- const struct iphdr *iph;
-
- tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
-
- if (iph->daddr) {
- struct rtable *rt;
- struct flowi4 fl4;
-
- rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
- iph->daddr, iph->saddr,
- 0, 0,
- IPPROTO_IPIP,
- RT_TOS(iph->tos),
- tunnel->parms.link);
- if (!IS_ERR(rt)) {
- tdev = rt->dst.dev;
- ip_rt_put(rt);
- }
- dev->flags |= IFF_POINTOPOINT;
- }
-
- if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
-
- if (tdev) {
- dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
- dev->mtu = tdev->mtu - sizeof(struct iphdr);
- }
- dev->iflink = tunnel->parms.link;
-}
-
-static void ipip_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p)
-{
- struct net *net = dev_net(t->dev);
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- ipip_tunnel_unlink(ipn, t);
- synchronize_net();
- t->parms.iph.saddr = p->iph.saddr;
- t->parms.iph.daddr = p->iph.daddr;
- memcpy(t->dev->dev_addr, &p->iph.saddr, 4);
- memcpy(t->dev->broadcast, &p->iph.daddr, 4);
- ipip_tunnel_link(ipn, t);
- t->parms.iph.ttl = p->iph.ttl;
- t->parms.iph.tos = p->iph.tos;
- t->parms.iph.frag_off = p->iph.frag_off;
- if (t->parms.link != p->link) {
- t->parms.link = p->link;
- ipip_tunnel_bind_dev(t->dev);
- }
- netdev_state_change(t->dev);
-}
-
static int
-ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
int err = 0;
struct ip_tunnel_parm p;
- struct ip_tunnel *t;
- struct net *net = dev_net(dev);
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- switch (cmd) {
- case SIOCGETTUNNEL:
- t = NULL;
- if (dev == ipn->fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
- err = -EFAULT;
- break;
- }
- t = ipip_tunnel_locate(net, &p, 0);
- }
- if (t == NULL)
- t = netdev_priv(dev);
- memcpy(&p, &t->parms, sizeof(p));
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- err = -EFAULT;
- break;
-
- case SIOCADDTUNNEL:
- case SIOCCHGTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
-
- err = -EINVAL;
- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
- goto done;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
- t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
-
- if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t != NULL) {
- if (t->dev != dev) {
- err = -EEXIST;
- break;
- }
- } else {
- if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
- (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
- err = -EINVAL;
- break;
- }
- t = netdev_priv(dev);
- }
-
- ipip_tunnel_update(t, &p);
- }
-
- if (t) {
- err = 0;
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
- err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
- break;
-
- case SIOCDELTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- if (dev == ipn->fb_tunnel_dev) {
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
- err = -ENOENT;
- if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL)
- goto done;
- err = -EPERM;
- if (t->dev == ipn->fb_tunnel_dev)
- goto done;
- dev = t->dev;
- }
- unregister_netdevice(dev);
- err = 0;
- break;
- default:
- err = -EINVAL;
- }
-
-done:
- return err;
-}
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
-static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
- if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
+ return -EINVAL;
+ if (p.i_key || p.o_key || p.i_flags || p.o_flags)
return -EINVAL;
- dev->mtu = new_mtu;
+ if (p.iph.ttl)
+ p.iph.frag_off |= htons(IP_DF);
+
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
+
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
+
return 0;
}
static const struct net_device_ops ipip_netdev_ops = {
- .ndo_uninit = ipip_tunnel_uninit,
+ .ndo_init = ipip_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
.ndo_start_xmit = ipip_tunnel_xmit,
.ndo_do_ioctl = ipip_tunnel_ioctl,
- .ndo_change_mtu = ipip_tunnel_change_mtu,
- .ndo_get_stats64 = ipip_get_stats64,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
-static void ipip_dev_free(struct net_device *dev)
-{
- free_percpu(dev->tstats);
- free_netdev(dev);
-}
-
#define IPIP_FEATURES (NETIF_F_SG | \
NETIF_F_FRAGLIST | \
NETIF_F_HIGHDMA | \
@@ -792,11 +275,8 @@ static void ipip_dev_free(struct net_device *dev)
static void ipip_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipip_netdev_ops;
- dev->destructor = ipip_dev_free;
dev->type = ARPHRD_TUNNEL;
- dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
- dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr);
dev->flags = IFF_NOARP;
dev->iflink = 0;
dev->addr_len = 4;
@@ -806,46 +286,19 @@ static void ipip_tunnel_setup(struct net_device *dev)
dev->features |= IPIP_FEATURES;
dev->hw_features |= IPIP_FEATURES;
+ ip_tunnel_setup(dev, ipip_net_id);
}
static int ipip_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- tunnel->dev = dev;
-
memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
- ipip_tunnel_bind_dev(dev);
-
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- return 0;
-}
-
-static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- struct iphdr *iph = &tunnel->parms.iph;
- struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id);
-
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
-
- iph->version = 4;
- iph->protocol = IPPROTO_IPIP;
- iph->ihl = 5;
-
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- dev_hold(dev);
- rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
- return 0;
+ tunnel->hlen = 0;
+ tunnel->parms.iph.protocol = IPPROTO_IPIP;
+ return ip_tunnel_init(dev);
}
static void ipip_netlink_parms(struct nlattr *data[],
@@ -885,28 +338,16 @@ static void ipip_netlink_parms(struct nlattr *data[],
static int ipip_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
- struct net *net = dev_net(dev);
- struct ip_tunnel *nt;
-
- nt = netdev_priv(dev);
- ipip_netlink_parms(data, &nt->parms);
-
- if (ipip_tunnel_locate(net, &nt->parms, 0))
- return -EEXIST;
+ struct ip_tunnel_parm p;
- return ipip_tunnel_create(dev);
+ ipip_netlink_parms(data, &p);
+ return ip_tunnel_newlink(dev, tb, &p);
}
static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
- struct ip_tunnel *t;
struct ip_tunnel_parm p;
- struct net *net = dev_net(dev);
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- if (dev == ipn->fb_tunnel_dev)
- return -EINVAL;
ipip_netlink_parms(data, &p);
@@ -914,16 +355,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
(!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
return -EINVAL;
- t = ipip_tunnel_locate(net, &p, 0);
-
- if (t) {
- if (t->dev != dev)
- return -EEXIST;
- } else
- t = netdev_priv(dev);
-
- ipip_tunnel_update(t, &p);
- return 0;
+ return ip_tunnel_changelink(dev, tb, &p);
}
static size_t ipip_get_size(const struct net_device *dev)
@@ -980,6 +412,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = {
.setup = ipip_tunnel_setup,
.newlink = ipip_newlink,
.changelink = ipip_changelink,
+ .dellink = ip_tunnel_dellink,
.get_size = ipip_get_size,
.fill_info = ipip_fill_info,
};
@@ -990,90 +423,29 @@ static struct xfrm_tunnel ipip_handler __read_mostly = {
.priority = 1,
};
-static const char banner[] __initconst =
- KERN_INFO "IPv4 over IPv4 tunneling driver\n";
-
-static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
-{
- int prio;
-
- for (prio = 1; prio < 4; prio++) {
- int h;
- for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t;
-
- t = rtnl_dereference(ipn->tunnels[prio][h]);
- while (t != NULL) {
- unregister_netdevice_queue(t->dev, head);
- t = rtnl_dereference(t->next);
- }
- }
- }
-}
-
static int __net_init ipip_init_net(struct net *net)
{
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
- struct ip_tunnel *t;
- int err;
-
- ipn->tunnels[0] = ipn->tunnels_wc;
- ipn->tunnels[1] = ipn->tunnels_l;
- ipn->tunnels[2] = ipn->tunnels_r;
- ipn->tunnels[3] = ipn->tunnels_r_l;
-
- ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
- "tunl0",
- ipip_tunnel_setup);
- if (!ipn->fb_tunnel_dev) {
- err = -ENOMEM;
- goto err_alloc_dev;
- }
- dev_net_set(ipn->fb_tunnel_dev, net);
-
- err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
- if (err)
- goto err_reg_dev;
-
- if ((err = register_netdev(ipn->fb_tunnel_dev)))
- goto err_reg_dev;
-
- t = netdev_priv(ipn->fb_tunnel_dev);
-
- strcpy(t->parms.name, ipn->fb_tunnel_dev->name);
- return 0;
-
-err_reg_dev:
- ipip_dev_free(ipn->fb_tunnel_dev);
-err_alloc_dev:
- /* nothing */
- return err;
+ return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
}
static void __net_exit ipip_exit_net(struct net *net)
{
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
- LIST_HEAD(list);
-
- rtnl_lock();
- ipip_destroy_tunnels(ipn, &list);
- unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
- unregister_netdevice_many(&list);
- rtnl_unlock();
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ ip_tunnel_delete_net(itn);
}
static struct pernet_operations ipip_net_ops = {
.init = ipip_init_net,
.exit = ipip_exit_net,
.id = &ipip_net_id,
- .size = sizeof(struct ipip_net),
+ .size = sizeof(struct ip_tunnel_net),
};
static int __init ipip_init(void)
{
int err;
- printk(banner);
+ pr_info("ipip: IPv4 over IPv4 tunneling driver\n");
err = register_pernet_device(&ipip_net_ops);
if (err < 0)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a9454cb..9d9610a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -61,7 +61,7 @@
#include <linux/netfilter_ipv4.h>
#include <linux/compat.h>
#include <linux/export.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>
@@ -626,9 +626,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
if (ip_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
nlh->nlmsg_type = NLMSG_ERROR;
- nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
- e = NLMSG_DATA(nlh);
+ e = nlmsg_data(nlh);
e->error = -ETIMEDOUT;
memset(&e->msg, 0, sizeof(e->msg));
@@ -828,6 +828,49 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
return NULL;
}
+/* Look for a (*,*,oif) entry */
+static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
+ int vifi)
+{
+ int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY));
+ struct mfc_cache *c;
+
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == htonl(INADDR_ANY) &&
+ c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ return NULL;
+}
+
+/* Look for a (*,G) entry */
+static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
+ __be32 mcastgrp, int vifi)
+{
+ int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY));
+ struct mfc_cache *c, *proxy;
+
+ if (mcastgrp == htonl(INADDR_ANY))
+ goto skip;
+
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == mcastgrp) {
+ if (c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ /* It's ok if the vifi is part of the static tree */
+ proxy = ipmr_cache_find_any_parent(mrt,
+ c->mfc_parent);
+ if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+ return c;
+ }
+
+skip:
+ return ipmr_cache_find_any_parent(mrt, vifi);
+}
+
/*
* Allocate a multicast cache entry
*/
@@ -867,14 +910,14 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
if (ip_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
- if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
+ if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {
nlh->nlmsg_len = skb_tail_pointer(skb) -
(u8 *)nlh;
} else {
nlh->nlmsg_type = NLMSG_ERROR;
- nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
- e = NLMSG_DATA(nlh);
+ e = nlmsg_data(nlh);
e->error = -EMSGSIZE;
memset(&e->msg, 0, sizeof(e->msg));
}
@@ -1053,7 +1096,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
* MFC cache manipulation by user space mroute daemon
*/
-static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
+static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
int line;
struct mfc_cache *c, *next;
@@ -1062,7 +1105,8 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
+ (parent == -1 || parent == c->mfc_parent)) {
list_del_rcu(&c->list);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_cache_free(c);
@@ -1073,7 +1117,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
}
static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
- struct mfcctl *mfc, int mrtsock)
+ struct mfcctl *mfc, int mrtsock, int parent)
{
bool found = false;
int line;
@@ -1086,7 +1130,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
+ (parent == -1 || parent == c->mfc_parent)) {
found = true;
break;
}
@@ -1103,7 +1148,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
return 0;
}
- if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
+ if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) &&
+ !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
return -EINVAL;
c = ipmr_cache_alloc();
@@ -1218,7 +1264,7 @@ static void mrtsock_destruct(struct sock *sk)
int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
{
- int ret;
+ int ret, parent = 0;
struct vifctl vif;
struct mfcctl mfc;
struct net *net = sock_net(sk);
@@ -1287,16 +1333,22 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
*/
case MRT_ADD_MFC:
case MRT_DEL_MFC:
+ parent = -1;
+ case MRT_ADD_MFC_PROXY:
+ case MRT_DEL_MFC_PROXY:
if (optlen != sizeof(mfc))
return -EINVAL;
if (copy_from_user(&mfc, optval, sizeof(mfc)))
return -EFAULT;
+ if (parent == 0)
+ parent = mfc.mfcc_parent;
rtnl_lock();
- if (optname == MRT_DEL_MFC)
- ret = ipmr_mfc_delete(mrt, &mfc);
+ if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
+ ret = ipmr_mfc_delete(mrt, &mfc, parent);
else
ret = ipmr_mfc_add(net, mrt, &mfc,
- sk == rtnl_dereference(mrt->mroute_sk));
+ sk == rtnl_dereference(mrt->mroute_sk),
+ parent);
rtnl_unlock();
return ret;
/*
@@ -1749,17 +1801,28 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
{
int psend = -1;
int vif, ct;
+ int true_vifi = ipmr_find_vif(mrt, skb->dev);
vif = cache->mfc_parent;
cache->mfc_un.res.pkt++;
cache->mfc_un.res.bytes += skb->len;
+ if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
+ struct mfc_cache *cache_proxy;
+
+ /* For an (*,G) entry, we only check that the incomming
+ * interface is part of the static tree.
+ */
+ cache_proxy = ipmr_cache_find_any_parent(mrt, vif);
+ if (cache_proxy &&
+ cache_proxy->mfc_un.res.ttls[true_vifi] < 255)
+ goto forward;
+ }
+
/*
* Wrong interface: drop packet and (maybe) send PIM assert.
*/
if (mrt->vif_table[vif].dev != skb->dev) {
- int true_vifi;
-
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
* Very complicated situation...
@@ -1776,7 +1839,6 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
}
cache->mfc_un.res.wrong_if++;
- true_vifi = ipmr_find_vif(mrt, skb->dev);
if (true_vifi >= 0 && mrt->mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
@@ -1794,15 +1856,34 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
goto dont_forward;
}
+forward:
mrt->vif_table[vif].pkt_in++;
mrt->vif_table[vif].bytes_in += skb->len;
/*
* Forward the frame
*/
+ if (cache->mfc_origin == htonl(INADDR_ANY) &&
+ cache->mfc_mcastgrp == htonl(INADDR_ANY)) {
+ if (true_vifi >= 0 &&
+ true_vifi != cache->mfc_parent &&
+ ip_hdr(skb)->ttl >
+ cache->mfc_un.res.ttls[cache->mfc_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = cache->mfc_parent;
+ goto last_forward;
+ }
+ goto dont_forward;
+ }
for (ct = cache->mfc_un.res.maxvif - 1;
ct >= cache->mfc_un.res.minvif; ct--) {
- if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
+ /* For (*,G) entry, don't forward to the incoming interface */
+ if ((cache->mfc_origin != htonl(INADDR_ANY) ||
+ ct != true_vifi) &&
+ ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
if (psend != -1) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -1813,6 +1894,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
psend = ct;
}
}
+last_forward:
if (psend != -1) {
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -1902,6 +1984,13 @@ int ip_mr_input(struct sk_buff *skb)
/* already under rcu_read_lock() */
cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+ if (cache == NULL) {
+ int vif = ipmr_find_vif(mrt, skb->dev);
+
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
+ vif);
+ }
/*
* No usable cache entry
@@ -2107,7 +2196,12 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
rcu_read_lock();
cache = ipmr_cache_find(mrt, saddr, daddr);
+ if (cache == NULL && skb->dev) {
+ int vif = ipmr_find_vif(mrt, skb->dev);
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, daddr, vif);
+ }
if (cache == NULL) {
struct sk_buff *skb2;
struct iphdr *iph;
@@ -2609,16 +2703,16 @@ static int __net_init ipmr_net_init(struct net *net)
#ifdef CONFIG_PROC_FS
err = -ENOMEM;
- if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
+ if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops))
goto proc_vif_fail;
- if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
+ if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops))
goto proc_cache_fail;
#endif
return 0;
#ifdef CONFIG_PROC_FS
proc_cache_fail:
- proc_net_remove(net, "ip_mr_vif");
+ remove_proc_entry("ip_mr_vif", net->proc_net);
proc_vif_fail:
ipmr_rules_exit(net);
#endif
@@ -2629,8 +2723,8 @@ fail:
static void __net_exit ipmr_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
- proc_net_remove(net, "ip_mr_cache");
- proc_net_remove(net, "ip_mr_vif");
+ remove_proc_entry("ip_mr_cache", net->proc_net);
+ remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
ipmr_rules_exit(net);
}
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 4c0cf63..c3e0ade 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -1,4 +1,9 @@
-/* IPv4 specific functions of netfilter core */
+/*
+ * IPv4 specific functions of netfilter core
+ *
+ * Rusty Russell (C) 2000 -- This code is GPL.
+ * Patrick McHardy (C) 2006-2012
+ */
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
@@ -40,14 +45,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
fl4.flowi4_flags = flags;
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
- return -1;
+ return PTR_ERR(rt);
/* Drop old route. */
skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
if (skb_dst(skb)->error)
- return -1;
+ return skb_dst(skb)->error;
#ifdef CONFIG_XFRM
if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -56,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
skb_dst_set(skb, NULL);
dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
if (IS_ERR(dst))
- return -1;
+ return PTR_ERR(dst);;
skb_dst_set(skb, dst);
}
#endif
@@ -66,7 +71,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
if (skb_headroom(skb) < hh_len &&
pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
0, GFP_ATOMIC))
- return -1;
+ return -ENOMEM;
return 0;
}
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d8d6f2a..e7916c1 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,19 +36,6 @@ config NF_CONNTRACK_PROC_COMPAT
If unsure, say Y.
-config IP_NF_QUEUE
- tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
- depends on NETFILTER_ADVANCED
- help
- Netfilter has the ability to queue packets to user space: the
- netlink device can be used to access them using this driver.
-
- This option enables the old IPv4-only "ip_queue" implementation
- which has been obsoleted by the new "nfnetlink_queue" code (see
- CONFIG_NETFILTER_NETLINK_QUEUE).
-
- To compile it as a module, choose M here. If unsure, say N.
-
config IP_NF_IPTABLES
tristate "IP tables support (required for filtering/masq/NAT)"
default m if NETFILTER_ADVANCED=n
@@ -84,7 +71,7 @@ config IP_NF_MATCH_ECN
config IP_NF_MATCH_RPFILTER
tristate '"rpfilter" reverse path filter match support'
- depends on NETFILTER_ADVANCED
+ depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW)
---help---
This option allows you to match packets whose replies would
go out via the interface the packet came in.
@@ -241,8 +228,8 @@ config IP_NF_MANGLE
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_TARGET_CLUSTERIP
- tristate "CLUSTERIP target support (EXPERIMENTAL)"
- depends on IP_NF_MANGLE && EXPERIMENTAL
+ tristate "CLUSTERIP target support"
+ depends on IP_NF_MANGLE
depends on NF_CONNTRACK_IPV4
depends on NETFILTER_ADVANCED
select NF_CONNTRACK_MARK
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3ea4127..85a4f21 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -6,6 +6,7 @@
* Some ARP specific bits are:
*
* Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (C) 2006-2009 Patrick McHardy <kaber@trash.net>
*
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -901,7 +902,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
"arptable_%s", name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
struct arpt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -958,7 +959,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
}
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
duprintf("t->private->number = %u\n",
@@ -1001,7 +1002,7 @@ static int __do_replace(struct net *net, const char *name,
t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
"arptable_%s", name);
- if (!t || IS_ERR(t)) {
+ if (IS_ERR_OR_NULL(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1158,7 +1159,7 @@ static int do_add_counters(struct net *net, const void __user *user,
}
t = xt_find_table_lock(net, NFPROTO_ARP, name);
- if (!t || IS_ERR(t)) {
+ if (IS_ERR_OR_NULL(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free;
}
@@ -1646,7 +1647,7 @@ static int compat_get_entries(struct net *net,
xt_compat_lock(NFPROTO_ARP);
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 79ca5e7..eadab1e 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -48,9 +48,7 @@ static int __net_init arptable_filter_net_init(struct net *net)
net->ipv4.arptable_filter =
arpt_register_table(net, &packet_filter, repl);
kfree(repl);
- if (IS_ERR(net->ipv4.arptable_filter))
- return PTR_ERR(net->ipv4.arptable_filter);
- return 0;
+ return PTR_RET(net->ipv4.arptable_filter);
}
static void __net_exit arptable_filter_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 537b249..cb91101 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -3,6 +3,7 @@
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -195,8 +196,7 @@ ipt_get_target_c(const struct ipt_entry *e)
return ipt_get_target((struct ipt_entry *)e);
}
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
- defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
static const char *const hooknames[] = {
[NF_INET_PRE_ROUTING] = "PREROUTING",
[NF_INET_LOCAL_IN] = "INPUT",
@@ -272,6 +272,7 @@ static void trace_packet(const struct sk_buff *skb,
const char *hookname, *chainname, *comment;
const struct ipt_entry *iter;
unsigned int rulenum = 0;
+ struct net *net = dev_net(in ? in : out);
table_base = private->entries[smp_processor_id()];
root = get_entry(table_base, private->hook_entry[hook]);
@@ -284,7 +285,7 @@ static void trace_packet(const struct sk_buff *skb,
&chainname, &comment, &rulenum) != 0)
break;
- nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo,
+ nf_log_packet(net, AF_INET, hook, skb, in, out, &trace_loginfo,
"TRACE: %s:%s:%s:%u ",
tablename, chainname, comment, rulenum);
}
@@ -374,8 +375,7 @@ ipt_do_table(struct sk_buff *skb,
t = ipt_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
- defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
if (unlikely(skb->nf_trace))
trace_packet(skb, hook, in, out,
@@ -1162,7 +1162,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
"iptable_%s", name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
struct ipt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -1221,7 +1221,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
}
t = xt_find_table_lock(net, AF_INET, get.name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
duprintf("t->private->number = %u\n", private->number);
if (get.size == private->size)
@@ -1271,7 +1271,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
"iptable_%s", name);
- if (!t || IS_ERR(t)) {
+ if (IS_ERR_OR_NULL(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1436,7 +1436,7 @@ do_add_counters(struct net *net, const void __user *user,
}
t = xt_find_table_lock(net, AF_INET, name);
- if (!t || IS_ERR(t)) {
+ if (IS_ERR_OR_NULL(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free;
}
@@ -2020,7 +2020,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
xt_compat_lock(AF_INET);
t = xt_find_table_lock(net, AF_INET, get.name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
duprintf("t->private->number = %u\n", private->number);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 75e33a7..0b732ef 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -105,7 +105,7 @@ clusterip_config_entry_put(struct clusterip_config *c)
* functions are also incrementing the refcount on their own,
* so it's safe to remove the entry even if it's in use. */
#ifdef CONFIG_PROC_FS
- remove_proc_entry(c->pde->name, c->pde->parent);
+ proc_remove(c->pde);
#endif
return;
}
@@ -631,7 +631,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
if (!ret) {
struct seq_file *sf = file->private_data;
- struct clusterip_config *c = PDE(inode)->data;
+ struct clusterip_config *c = PDE_DATA(inode);
sf->private = c;
@@ -643,7 +643,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
static int clusterip_proc_release(struct inode *inode, struct file *file)
{
- struct clusterip_config *c = PDE(inode)->data;
+ struct clusterip_config *c = PDE_DATA(inode);
int ret;
ret = seq_release(inode, file);
@@ -657,7 +657,7 @@ static int clusterip_proc_release(struct inode *inode, struct file *file)
static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *ofs)
{
- struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data;
+ struct clusterip_config *c = PDE_DATA(file_inode(file));
#define PROC_WRITELEN 10
char buffer[PROC_WRITELEN+1];
unsigned long nodenum;
@@ -736,7 +736,7 @@ static void __exit clusterip_tg_exit(void)
{
pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
#ifdef CONFIG_PROC_FS
- remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
+ proc_remove(clusterip_procdir);
#endif
nf_unregister_hook(&cip_arp_ops);
xt_unregister_target(&clusterip_tg_reg);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index b5ef3cb..32b0e97 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -4,6 +4,7 @@
* (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2005-2007 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -37,7 +38,7 @@
#include <linux/skbuff.h>
#include <linux/kernel.h>
#include <linux/timer.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netdevice.h>
#include <linux/mm.h>
#include <linux/moduleparam.h>
@@ -45,6 +46,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ipt_ULOG.h>
#include <net/netfilter/nf_log.h>
+#include <net/netns/generic.h>
#include <net/sock.h>
#include <linux/bitops.h>
#include <asm/unaligned.h>
@@ -78,20 +80,26 @@ typedef struct {
struct timer_list timer; /* the timer function */
} ulog_buff_t;
-static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
+static int ulog_net_id __read_mostly;
+struct ulog_net {
+ unsigned int nlgroup[ULOG_MAXNLGROUPS];
+ ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];
+ struct sock *nflognl;
+ spinlock_t lock;
+};
-static struct sock *nflognl; /* our socket */
-static DEFINE_SPINLOCK(ulog_lock); /* spinlock */
+static struct ulog_net *ulog_pernet(struct net *net)
+{
+ return net_generic(net, ulog_net_id);
+}
/* send one ulog_buff_t to userspace */
-static void ulog_send(unsigned int nlgroupnum)
+static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum)
{
- ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
+ ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum];
- if (timer_pending(&ub->timer)) {
- pr_debug("ulog_send: timer was pending, deleting\n");
- del_timer(&ub->timer);
- }
+ pr_debug("ulog_send: timer is deleting\n");
+ del_timer(&ub->timer);
if (!ub->skb) {
pr_debug("ulog_send: nothing to send\n");
@@ -105,7 +113,8 @@ static void ulog_send(unsigned int nlgroupnum)
NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
pr_debug("throwing %d packets to netlink group %u\n",
ub->qlen, nlgroupnum + 1);
- netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
+ netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1,
+ GFP_ATOMIC);
ub->qlen = 0;
ub->skb = NULL;
@@ -116,13 +125,17 @@ static void ulog_send(unsigned int nlgroupnum)
/* timer function to flush queue in flushtimeout time */
static void ulog_timer(unsigned long data)
{
+ unsigned int groupnum = *((unsigned int *)data);
+ struct ulog_net *ulog = container_of((void *)data,
+ struct ulog_net,
+ nlgroup[groupnum]);
pr_debug("timer function called, calling ulog_send\n");
/* lock to protect against somebody modifying our structure
* from ipt_ulog_target at the same time */
- spin_lock_bh(&ulog_lock);
- ulog_send(data);
- spin_unlock_bh(&ulog_lock);
+ spin_lock_bh(&ulog->lock);
+ ulog_send(ulog, groupnum);
+ spin_unlock_bh(&ulog->lock);
}
static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -150,7 +163,8 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
return skb;
}
-static void ipt_ulog_packet(unsigned int hooknum,
+static void ipt_ulog_packet(struct net *net,
+ unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -162,6 +176,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
size_t size, copy_len;
struct nlmsghdr *nlh;
struct timeval tv;
+ struct ulog_net *ulog = ulog_pernet(net);
/* ffs == find first bit set, necessary because userspace
* is already shifting groupnumber, but we need unshifted.
@@ -174,11 +189,11 @@ static void ipt_ulog_packet(unsigned int hooknum,
else
copy_len = loginfo->copy_range;
- size = NLMSG_SPACE(sizeof(*pm) + copy_len);
+ size = nlmsg_total_size(sizeof(*pm) + copy_len);
- ub = &ulog_buffers[groupnum];
+ ub = &ulog->ulog_buffers[groupnum];
- spin_lock_bh(&ulog_lock);
+ spin_lock_bh(&ulog->lock);
if (!ub->skb) {
if (!(ub->skb = ulog_alloc_skb(size)))
@@ -188,7 +203,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
/* either the queue len is too high or we don't have
* enough room in nlskb left. send it to userspace. */
- ulog_send(groupnum);
+ ulog_send(ulog, groupnum);
if (!(ub->skb = ulog_alloc_skb(size)))
goto alloc_failure;
@@ -217,8 +232,10 @@ static void ipt_ulog_packet(unsigned int hooknum,
put_unaligned(tv.tv_usec, &pm->timestamp_usec);
put_unaligned(skb->mark, &pm->mark);
pm->hook = hooknum;
- if (prefix != NULL)
- strncpy(pm->prefix, prefix, sizeof(pm->prefix));
+ if (prefix != NULL) {
+ strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1);
+ pm->prefix[sizeof(pm->prefix) - 1] = '\0';
+ }
else if (loginfo->prefix[0] != '\0')
strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
else
@@ -262,27 +279,30 @@ static void ipt_ulog_packet(unsigned int hooknum,
if (ub->qlen >= loginfo->qthreshold) {
if (loginfo->qthreshold > 1)
nlh->nlmsg_type = NLMSG_DONE;
- ulog_send(groupnum);
+ ulog_send(ulog, groupnum);
}
out_unlock:
- spin_unlock_bh(&ulog_lock);
+ spin_unlock_bh(&ulog->lock);
return;
alloc_failure:
pr_debug("Error building netlink message\n");
- spin_unlock_bh(&ulog_lock);
+ spin_unlock_bh(&ulog->lock);
}
static unsigned int
ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- ipt_ulog_packet(par->hooknum, skb, par->in, par->out,
+ struct net *net = dev_net(par->in ? par->in : par->out);
+
+ ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out,
par->targinfo, NULL);
return XT_CONTINUE;
}
-static void ipt_logfn(u_int8_t pf,
+static void ipt_logfn(struct net *net,
+ u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
@@ -304,7 +324,7 @@ static void ipt_logfn(u_int8_t pf,
strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
}
- ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
+ ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix);
}
static int ulog_tg_check(const struct xt_tgchk_param *par)
@@ -378,58 +398,48 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
.me = THIS_MODULE,
};
-static int __init ulog_tg_init(void)
+static int __net_init ulog_tg_net_init(struct net *net)
{
- int ret, i;
+ int i;
+ struct ulog_net *ulog = ulog_pernet(net);
struct netlink_kernel_cfg cfg = {
.groups = ULOG_MAXNLGROUPS,
};
- pr_debug("init module\n");
-
- if (nlbufsiz > 128*1024) {
- pr_warning("Netlink buffer has to be <= 128kB\n");
- return -EINVAL;
- }
-
+ spin_lock_init(&ulog->lock);
/* initialize ulog_buffers */
- for (i = 0; i < ULOG_MAXNLGROUPS; i++)
- setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
+ for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
+ ulog->nlgroup[i] = i;
+ setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer,
+ (unsigned long)&ulog->nlgroup[i]);
+ }
- nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg);
- if (!nflognl)
+ ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
+ if (!ulog->nflognl)
return -ENOMEM;
- ret = xt_register_target(&ulog_tg_reg);
- if (ret < 0) {
- netlink_kernel_release(nflognl);
- return ret;
- }
if (nflog)
- nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
+ nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger);
return 0;
}
-static void __exit ulog_tg_exit(void)
+static void __net_exit ulog_tg_net_exit(struct net *net)
{
ulog_buff_t *ub;
int i;
-
- pr_debug("cleanup_module\n");
+ struct ulog_net *ulog = ulog_pernet(net);
if (nflog)
- nf_log_unregister(&ipt_ulog_logger);
- xt_unregister_target(&ulog_tg_reg);
- netlink_kernel_release(nflognl);
+ nf_log_unset(net, &ipt_ulog_logger);
+
+ netlink_kernel_release(ulog->nflognl);
/* remove pending timers and free allocated skb's */
for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
- ub = &ulog_buffers[i];
- if (timer_pending(&ub->timer)) {
- pr_debug("timer was pending, deleting\n");
- del_timer(&ub->timer);
- }
+ ub = &ulog->ulog_buffers[i];
+ pr_debug("timer is deleting\n");
+ del_timer(&ub->timer);
if (ub->skb) {
kfree_skb(ub->skb);
@@ -438,5 +448,50 @@ static void __exit ulog_tg_exit(void)
}
}
+static struct pernet_operations ulog_tg_net_ops = {
+ .init = ulog_tg_net_init,
+ .exit = ulog_tg_net_exit,
+ .id = &ulog_net_id,
+ .size = sizeof(struct ulog_net),
+};
+
+static int __init ulog_tg_init(void)
+{
+ int ret;
+ pr_debug("init module\n");
+
+ if (nlbufsiz > 128*1024) {
+ pr_warn("Netlink buffer has to be <= 128kB\n");
+ return -EINVAL;
+ }
+
+ ret = register_pernet_subsys(&ulog_tg_net_ops);
+ if (ret)
+ goto out_pernet;
+
+ ret = xt_register_target(&ulog_tg_reg);
+ if (ret < 0)
+ goto out_target;
+
+ if (nflog)
+ nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
+
+ return 0;
+
+out_target:
+ unregister_pernet_subsys(&ulog_tg_net_ops);
+out_pernet:
+ return ret;
+}
+
+static void __exit ulog_tg_exit(void)
+{
+ pr_debug("cleanup_module\n");
+ if (nflog)
+ nf_log_unregister(&ipt_ulog_logger);
+ xt_unregister_target(&ulog_tg_reg);
+ unregister_pernet_subsys(&ulog_tg_net_ops);
+}
+
module_init(ulog_tg_init);
module_exit(ulog_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index c301300..c49dcd0 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -66,6 +66,12 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
return dev_match;
}
+static bool rpfilter_is_local(const struct sk_buff *skb)
+{
+ const struct rtable *rt = skb_rtable(skb);
+ return rt && (rt->rt_flags & RTCF_LOCAL);
+}
+
static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_rpfilter_info *info;
@@ -76,7 +82,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
info = par->matchinfo;
invert = info->flags & XT_RPFILTER_INVERT;
- if (par->in->flags & IFF_LOOPBACK)
+ if (rpfilter_is_local(skb))
return true ^ invert;
iph = ip_hdr(skb);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 85d88f2..cba5658 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -44,6 +44,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
u_int8_t tos;
__be32 saddr, daddr;
u_int32_t mark;
+ int err;
/* root is playing with raw sockets. */
if (skb->len < sizeof(struct iphdr) ||
@@ -66,9 +67,11 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
if (iph->saddr != saddr ||
iph->daddr != daddr ||
skb->mark != mark ||
- iph->tos != tos)
- if (ip_route_me_harder(skb, RTN_UNSPEC))
- ret = NF_DROP;
+ iph->tos != tos) {
+ err = ip_route_me_harder(skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
}
return ret;
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index eeaff7e..6383273 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -176,6 +176,7 @@ nf_nat_ipv4_out(unsigned int hooknum,
#ifdef CONFIG_XFRM
const struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
+ int err;
#endif
unsigned int ret;
@@ -195,9 +196,11 @@ nf_nat_ipv4_out(unsigned int hooknum,
ct->tuplehash[!dir].tuple.dst.u3.ip) ||
(ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
ct->tuplehash[dir].tuple.src.u.all !=
- ct->tuplehash[!dir].tuple.dst.u.all))
- if (nf_xfrm_me_harder(skb, AF_INET) < 0)
- ret = NF_DROP;
+ ct->tuplehash[!dir].tuple.dst.u.all)) {
+ err = nf_xfrm_me_harder(skb, AF_INET);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
}
#endif
return ret;
@@ -213,6 +216,7 @@ nf_nat_ipv4_local_fn(unsigned int hooknum,
const struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
unsigned int ret;
+ int err;
/* root is playing with raw sockets. */
if (skb->len < sizeof(struct iphdr) ||
@@ -226,16 +230,19 @@ nf_nat_ipv4_local_fn(unsigned int hooknum,
if (ct->tuplehash[dir].tuple.dst.u3.ip !=
ct->tuplehash[!dir].tuple.src.u3.ip) {
- if (ip_route_me_harder(skb, RTN_UNSPEC))
- ret = NF_DROP;
+ err = ip_route_me_harder(skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
}
#ifdef CONFIG_XFRM
else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
ct->tuplehash[dir].tuple.dst.u.all !=
- ct->tuplehash[!dir].tuple.src.u.all)
- if (nf_xfrm_me_harder(skb, AF_INET) < 0)
- ret = NF_DROP;
+ ct->tuplehash[!dir].tuple.src.u.all) {
+ err = nf_xfrm_me_harder(skb, AF_INET);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
#endif
}
return ret;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index fcdd0c2..567d841 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -1,6 +1,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -100,7 +101,6 @@ static unsigned int ipv4_helper(unsigned int hooknum,
enum ip_conntrack_info ctinfo;
const struct nf_conn_help *help;
const struct nf_conntrack_helper *helper;
- unsigned int ret;
/* This is where we call the helper: as the packet goes out. */
ct = nf_ct_get(skb, &ctinfo);
@@ -116,13 +116,8 @@ static unsigned int ipv4_helper(unsigned int hooknum,
if (!helper)
return NF_ACCEPT;
- ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
- ct, ctinfo);
- if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) {
- nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL,
- "nf_ct_%s: dropping packet", helper->name);
- }
- return ret;
+ return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
+ ct, ctinfo);
}
static unsigned int ipv4_confirm(unsigned int hooknum,
@@ -420,54 +415,43 @@ static int ipv4_net_init(struct net *net)
{
int ret = 0;
- ret = nf_conntrack_l4proto_register(net,
- &nf_conntrack_l4proto_tcp4);
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4);
if (ret < 0) {
- pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n");
+ pr_err("nf_conntrack_tcp4: pernet registration failed\n");
goto out_tcp;
}
- ret = nf_conntrack_l4proto_register(net,
- &nf_conntrack_l4proto_udp4);
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4);
if (ret < 0) {
- pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n");
+ pr_err("nf_conntrack_udp4: pernet registration failed\n");
goto out_udp;
}
- ret = nf_conntrack_l4proto_register(net,
- &nf_conntrack_l4proto_icmp);
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp);
if (ret < 0) {
- pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n");
+ pr_err("nf_conntrack_icmp4: pernet registration failed\n");
goto out_icmp;
}
- ret = nf_conntrack_l3proto_register(net,
- &nf_conntrack_l3proto_ipv4);
+ ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4);
if (ret < 0) {
- pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n");
+ pr_err("nf_conntrack_ipv4: pernet registration failed\n");
goto out_ipv4;
}
return 0;
out_ipv4:
- nf_conntrack_l4proto_unregister(net,
- &nf_conntrack_l4proto_icmp);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
out_icmp:
- nf_conntrack_l4proto_unregister(net,
- &nf_conntrack_l4proto_udp4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
out_udp:
- nf_conntrack_l4proto_unregister(net,
- &nf_conntrack_l4proto_tcp4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
out_tcp:
return ret;
}
static void ipv4_net_exit(struct net *net)
{
- nf_conntrack_l3proto_unregister(net,
- &nf_conntrack_l3proto_ipv4);
- nf_conntrack_l4proto_unregister(net,
- &nf_conntrack_l4proto_icmp);
- nf_conntrack_l4proto_unregister(net,
- &nf_conntrack_l4proto_udp4);
- nf_conntrack_l4proto_unregister(net,
- &nf_conntrack_l4proto_tcp4);
+ nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
}
static struct pernet_operations ipv4_net_ops = {
@@ -500,16 +484,49 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
pr_err("nf_conntrack_ipv4: can't register hooks.\n");
goto cleanup_pernet;
}
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n");
+ goto cleanup_hooks;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n");
+ goto cleanup_tcp4;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n");
+ goto cleanup_udp4;
+ }
+
+ ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
+ goto cleanup_icmpv4;
+ }
+
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
ret = nf_conntrack_ipv4_compat_init();
if (ret < 0)
- goto cleanup_hooks;
+ goto cleanup_proto;
#endif
return ret;
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ cleanup_proto:
+ nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+#endif
+ cleanup_icmpv4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+ cleanup_udp4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+ cleanup_tcp4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
cleanup_hooks:
nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
-#endif
cleanup_pernet:
unregister_pernet_subsys(&ipv4_net_ops);
cleanup_sockopt:
@@ -523,6 +540,10 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
nf_conntrack_ipv4_compat_fini();
#endif
+ nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
unregister_pernet_subsys(&ipv4_net_ops);
nf_unregister_sockopt(&so_getorigdst);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 9682b36..4c48e43 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -2,6 +2,7 @@
*
* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -417,12 +418,12 @@ static int __net_init ip_conntrack_net_init(struct net *net)
{
struct proc_dir_entry *proc, *proc_exp, *proc_stat;
- proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops);
+ proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops);
if (!proc)
goto err1;
- proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440,
- &ip_exp_file_ops);
+ proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net,
+ &ip_exp_file_ops);
if (!proc_exp)
goto err2;
@@ -433,9 +434,9 @@ static int __net_init ip_conntrack_net_init(struct net *net)
return 0;
err3:
- proc_net_remove(net, "ip_conntrack_expect");
+ remove_proc_entry("ip_conntrack_expect", net->proc_net);
err2:
- proc_net_remove(net, "ip_conntrack");
+ remove_proc_entry("ip_conntrack", net->proc_net);
err1:
return -ENOMEM;
}
@@ -443,8 +444,8 @@ err1:
static void __net_exit ip_conntrack_net_exit(struct net *net)
{
remove_proc_entry("ip_conntrack", net->proc_net_stat);
- proc_net_remove(net, "ip_conntrack_expect");
- proc_net_remove(net, "ip_conntrack");
+ remove_proc_entry("ip_conntrack_expect", net->proc_net);
+ remove_proc_entry("ip_conntrack", net->proc_net);
}
static struct pernet_operations ip_conntrack_net_ops = {
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 5241d99..a338dad 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -1,5 +1,6 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -187,8 +188,8 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
if (icmph == NULL) {
if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmp: short packet ");
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL,
+ NULL, "nf_ct_icmp: short packet ");
return -NF_ACCEPT;
}
@@ -196,7 +197,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_ip_checksum(skb, hooknum, dataoff, 0)) {
if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
"nf_ct_icmp: bad HW ICMP checksum ");
return -NF_ACCEPT;
}
@@ -209,7 +210,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
*/
if (icmph->type > NR_ICMP_TYPES) {
if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
"nf_ct_icmp: invalid ICMP type ");
return -NF_ACCEPT;
}
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 9c3db10..9eea059d 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -2,6 +2,7 @@
* H.323 extension for NAT alteration.
*
* Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This source code is licensed under General Public License version 2.
*
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index a06d7d7..657d230 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -13,6 +13,8 @@
*
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
* TODO: - NAT to a unique tuple, not to TCP source port
* (needs netfilter tuple reservation)
*/
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index ea44f02..690d8901 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -21,6 +21,8 @@
*
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
*/
#include <linux/module.h>
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index bac7122..5f011cc 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -38,6 +38,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: James Morris <jmorris@intercode.com.au>
+ *
+ * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 6f9c072..7d93d62 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -322,8 +322,8 @@ void ping_err(struct sk_buff *skb, u32 info)
struct iphdr *iph = (struct iphdr *)skb->data;
struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2));
struct inet_sock *inet_sock;
- int type = icmph->type;
- int code = icmph->code;
+ int type = icmp_hdr(skb)->type;
+ int code = icmp_hdr(skb)->code;
struct net *net = dev_net(skb->dev);
struct sock *sk;
int harderr;
@@ -514,9 +514,8 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.opt = NULL;
ipc.oif = sk->sk_bound_dev_if;
ipc.tx_flags = 0;
- err = sock_tx_timestamp(sk, &ipc.tx_flags);
- if (err)
- return err;
+
+ sock_tx_timestamp(sk, &ipc.tx_flags);
if (msg->msg_controllen) {
err = ip_cmsg_send(sock_net(sk), msg, &ipc);
@@ -889,7 +888,7 @@ static int ping_proc_register(struct net *net)
struct proc_dir_entry *p;
int rc = 0;
- p = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops);
+ p = proc_create("icmp", S_IRUGO, net->proc_net, &ping_seq_fops);
if (!p)
rc = -ENOMEM;
return rc;
@@ -897,7 +896,7 @@ static int ping_proc_register(struct net *net)
static void ping_proc_unregister(struct net *net)
{
- proc_net_remove(net, "icmp");
+ remove_proc_entry("icmp", net->proc_net);
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8de53e1..2a5bf86 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -125,6 +125,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+ SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
SNMP_MIB_SENTINEL
};
@@ -162,6 +163,7 @@ static const struct snmp_mib snmp4_tcp_list[] = {
SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
+ SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),
SNMP_MIB_SENTINEL
};
@@ -172,6 +174,7 @@ static const struct snmp_mib snmp4_udp_list[] = {
SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS),
SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
+ SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_SENTINEL
};
@@ -224,6 +227,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
+ SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
+ SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
@@ -267,6 +272,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
+ SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
SNMP_MIB_SENTINEL
};
@@ -319,15 +325,16 @@ static void icmp_put(struct seq_file *seq)
struct net *net = seq->private;
atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
- seq_puts(seq, "\nIcmp: InMsgs InErrors");
+ seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
for (i=0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " In%s", icmpmibmap[i].name);
seq_printf(seq, " OutMsgs OutErrors");
for (i=0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " Out%s", icmpmibmap[i].name);
- seq_printf(seq, "\nIcmp: %lu %lu",
+ seq_printf(seq, "\nIcmp: %lu %lu %lu",
snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
+ snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS),
+ snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
for (i=0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " %lu",
atomic_long_read(ptr + icmpmibmap[i].index));
@@ -471,28 +478,29 @@ static const struct file_operations netstat_seq_fops = {
static __net_init int ip_proc_init_net(struct net *net)
{
- if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops))
+ if (!proc_create("sockstat", S_IRUGO, net->proc_net,
+ &sockstat_seq_fops))
goto out_sockstat;
- if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops))
+ if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops))
goto out_netstat;
- if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops))
+ if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops))
goto out_snmp;
return 0;
out_snmp:
- proc_net_remove(net, "netstat");
+ remove_proc_entry("netstat", net->proc_net);
out_netstat:
- proc_net_remove(net, "sockstat");
+ remove_proc_entry("sockstat", net->proc_net);
out_sockstat:
return -ENOMEM;
}
static __net_exit void ip_proc_exit_net(struct net *net)
{
- proc_net_remove(net, "snmp");
- proc_net_remove(net, "netstat");
- proc_net_remove(net, "sockstat");
+ remove_proc_entry("snmp", net->proc_net);
+ remove_proc_entry("netstat", net->proc_net);
+ remove_proc_entry("sockstat", net->proc_net);
}
static __net_initdata struct pernet_operations ip_proc_ops = {
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 0f9d09f..ce84846 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -37,6 +37,12 @@ const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
+ if (!prot->netns_ok) {
+ pr_err("Protocol %u is not namespace aware, cannot register.\n",
+ protocol);
+ return -EINVAL;
+ }
+
return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
NULL, prot) ? 0 : -1;
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 6f08991..dd44e0a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -111,9 +111,7 @@ EXPORT_SYMBOL_GPL(raw_unhash_sk);
static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
unsigned short num, __be32 raddr, __be32 laddr, int dif)
{
- struct hlist_node *node;
-
- sk_for_each_from(sk, node) {
+ sk_for_each_from(sk) {
struct inet_sock *inet = inet_sk(sk);
if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
@@ -914,9 +912,7 @@ static struct sock *raw_get_first(struct seq_file *seq)
for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
++state->bucket) {
- struct hlist_node *node;
-
- sk_for_each(sk, node, &state->h->ht[state->bucket])
+ sk_for_each(sk, &state->h->ht[state->bucket])
if (sock_net(sk) == seq_file_net(seq))
goto found;
}
@@ -1050,7 +1046,7 @@ static const struct file_operations raw_seq_fops = {
static __net_init int raw_init_net(struct net *net)
{
- if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops))
+ if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops))
return -ENOMEM;
return 0;
@@ -1058,7 +1054,7 @@ static __net_init int raw_init_net(struct net *net)
static __net_exit void raw_exit_net(struct net *net)
{
- proc_net_remove(net, "raw");
+ remove_proc_entry("raw", net->proc_net);
}
static __net_initdata struct pernet_operations raw_net_ops = {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 63354f5..961a718 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -117,15 +117,11 @@
#define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_max_size;
-static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
-static int ip_rt_gc_interval __read_mostly = 60 * HZ;
-static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
-static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
@@ -388,8 +384,8 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
{
struct proc_dir_entry *pde;
- pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
- &rt_cache_seq_fops);
+ pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
+ &rt_cache_seq_fops);
if (!pde)
goto err1;
@@ -749,10 +745,15 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
{
struct rtable *rt;
struct flowi4 fl4;
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ int oif = skb->dev->ifindex;
+ u8 tos = RT_TOS(iph->tos);
+ u8 prot = iph->protocol;
+ u32 mark = skb->mark;
rt = (struct rtable *) dst;
- ip_rt_build_flow_key(&fl4, sk, skb);
+ __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
__ip_do_redirect(rt, skb, &fl4, true);
}
@@ -2331,7 +2332,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(in_skb->sk);
struct rtmsg *rtm;
@@ -2439,6 +2440,11 @@ void ip_rt_multicast_event(struct in_device *in_dev)
}
#ifdef CONFIG_SYSCTL
+static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
+static int ip_rt_gc_interval __read_mostly = 60 * HZ;
+static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
+static int ip_rt_gc_elasticity __read_mostly = 8;
+
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b236ef0..b05c96e 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -232,7 +232,8 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
*
* return false if we decode an option that should not be.
*/
-bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
+bool cookie_check_timestamp(struct tcp_options_received *tcp_opt,
+ struct net *net, bool *ecn_ok)
{
/* echoed timestamp, lowest bits contain options */
u32 options = tcp_opt->rcv_tsecr & TSMASK;
@@ -247,7 +248,7 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0;
*ecn_ok = (options >> 5) & 1;
- if (*ecn_ok && !sysctl_tcp_ecn)
+ if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn)
return false;
if (tcp_opt->sack_ok && !sysctl_tcp_sack)
@@ -266,7 +267,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt)
{
struct tcp_options_received tcp_opt;
- const u8 *hash_location;
struct inet_request_sock *ireq;
struct tcp_request_sock *treq;
struct tcp_sock *tp = tcp_sk(sk);
@@ -293,9 +293,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
+ tcp_parse_options(skb, &tcp_opt, 0, NULL);
- if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
+ if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
goto out;
ret = NULL;
@@ -348,8 +348,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
* hasn't changed since we received the original syn, but I see
* no easy way to do this.
*/
- flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk),
- RT_SCOPE_UNIVERSE, IPPROTO_TCP,
+ flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
inet_sk_flowi_flags(sk),
(opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
ireq->loc_addr, th->source, th->dest);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d84400b..fa2f63f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -27,7 +27,8 @@
#include <net/tcp_memcontrol.h>
static int zero;
-static int two = 2;
+static int one = 1;
+static int four = 4;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -232,8 +233,8 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
return 0;
}
-int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
{
ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
struct tcp_fastopen_context *ctxt;
@@ -538,13 +539,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_ecn",
- .data = &sysctl_tcp_ecn,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_dsack",
.data = &sysctl_tcp_dsack,
.maxlen = sizeof(int),
@@ -556,14 +550,16 @@ static struct ctl_table ipv4_table[] = {
.data = &sysctl_tcp_wmem,
.maxlen = sizeof(sysctl_tcp_wmem),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "tcp_rmem",
.data = &sysctl_tcp_rmem,
.maxlen = sizeof(sysctl_tcp_rmem),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "tcp_app_win",
@@ -596,13 +592,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_frto_response",
- .data = &sysctl_tcp_frto_response,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency,
.maxlen = sizeof(int),
@@ -637,13 +626,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_tcp_congestion_control,
},
{
- .procname = "tcp_abc",
- .data = &sysctl_tcp_abc,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "tcp_mtu_probing",
.data = &sysctl_tcp_mtu_probing,
.maxlen = sizeof(int),
@@ -744,13 +726,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "tcp_cookie_size",
- .data = &sysctl_tcp_cookie_size,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_thin_linear_timeouts",
.data = &sysctl_tcp_thin_linear_timeouts,
.maxlen = sizeof(int),
@@ -771,7 +746,7 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
- .extra2 = &two,
+ .extra2 = &four,
},
{
.procname = "udp_mem",
@@ -786,7 +761,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_udp_rmem_min),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero
+ .extra1 = &one
},
{
.procname = "udp_wmem_min",
@@ -794,7 +769,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_udp_wmem_min),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero
+ .extra1 = &one
},
{ }
};
@@ -850,6 +825,13 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = ipv4_ping_group_range,
},
{
+ .procname = "tcp_ecn",
+ .data = &init_net.ipv4.sysctl_tcp_ecn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "tcp_mem",
.maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem),
.mode = 0644,
@@ -882,6 +864,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
&net->ipv4.sysctl_icmp_ratemask;
table[6].data =
&net->ipv4.sysctl_ping_group_range;
+ table[7].data =
+ &net->ipv4.sysctl_tcp_ecn;
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2aa69c8..ab450c0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -400,6 +400,8 @@ void tcp_init_sock(struct sock *sk)
tcp_enable_early_retrans(tp);
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+ tp->tsoffset = 0;
+
sk->sk_state = TCP_CLOSE;
sk->sk_write_space = sk_stream_write_space;
@@ -407,15 +409,6 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_sync_mss = tcp_sync_mss;
- /* TCP Cookie Transactions */
- if (sysctl_tcp_cookie_size > 0) {
- /* Default, cookies without s_data_payload. */
- tp->cookie_values =
- kzalloc(sizeof(*tp->cookie_values),
- sk->sk_allocation);
- if (tp->cookie_values != NULL)
- kref_init(&tp->cookie_values->kref);
- }
/* Presumed zeroed, in order of appearance:
* cookie_in_always, cookie_out_never,
* s_data_constant, s_data_in, s_data_out
@@ -773,7 +766,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
* Make sure that we have exactly size bytes
* available to the caller, no more, no less.
*/
- skb->avail_size = size;
+ skb->reserved_tailroom = skb->end - skb->tail - size;
return skb;
}
__kfree_skb(skb);
@@ -895,6 +888,7 @@ new_segment:
get_page(page);
skb_fill_page_desc(skb, i, page, offset, copy);
}
+ skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
skb->len += copy;
skb->data_len += copy;
@@ -1406,10 +1400,10 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
return;
last_issued = tp->ucopy.dma_cookie;
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+ dma_async_issue_pending(tp->ucopy.dma_chan);
do {
- if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+ if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
last_issued, &done,
&used) == DMA_SUCCESS) {
/* Safe to free early-copied skbs now */
@@ -1751,7 +1745,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
tcp_service_net_dma(sk, true);
tcp_cleanup_rbuf(sk, copied);
} else
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+ dma_async_issue_pending(tp->ucopy.dma_chan);
}
#endif
if (copied >= target) {
@@ -1844,7 +1838,7 @@ do_prequeue:
break;
}
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+ dma_async_issue_pending(tp->ucopy.dma_chan);
if ((offset + used) == skb->len)
copied_early = true;
@@ -2287,7 +2281,6 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->packets_out = 0;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_cnt = 0;
- tp->bytes_acked = 0;
tp->window_clamp = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
@@ -2395,92 +2388,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
release_sock(sk);
return err;
}
- case TCP_COOKIE_TRANSACTIONS: {
- struct tcp_cookie_transactions ctd;
- struct tcp_cookie_values *cvp = NULL;
-
- if (sizeof(ctd) > optlen)
- return -EINVAL;
- if (copy_from_user(&ctd, optval, sizeof(ctd)))
- return -EFAULT;
-
- if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
- ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
- return -EINVAL;
-
- if (ctd.tcpct_cookie_desired == 0) {
- /* default to global value */
- } else if ((0x1 & ctd.tcpct_cookie_desired) ||
- ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
- ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
- return -EINVAL;
- }
-
- if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
- /* Supercedes all other values */
- lock_sock(sk);
- if (tp->cookie_values != NULL) {
- kref_put(&tp->cookie_values->kref,
- tcp_cookie_values_release);
- tp->cookie_values = NULL;
- }
- tp->rx_opt.cookie_in_always = 0; /* false */
- tp->rx_opt.cookie_out_never = 1; /* true */
- release_sock(sk);
- return err;
- }
-
- /* Allocate ancillary memory before locking.
- */
- if (ctd.tcpct_used > 0 ||
- (tp->cookie_values == NULL &&
- (sysctl_tcp_cookie_size > 0 ||
- ctd.tcpct_cookie_desired > 0 ||
- ctd.tcpct_s_data_desired > 0))) {
- cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
- GFP_KERNEL);
- if (cvp == NULL)
- return -ENOMEM;
-
- kref_init(&cvp->kref);
- }
- lock_sock(sk);
- tp->rx_opt.cookie_in_always =
- (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
- tp->rx_opt.cookie_out_never = 0; /* false */
-
- if (tp->cookie_values != NULL) {
- if (cvp != NULL) {
- /* Changed values are recorded by a changed
- * pointer, ensuring the cookie will differ,
- * without separately hashing each value later.
- */
- kref_put(&tp->cookie_values->kref,
- tcp_cookie_values_release);
- } else {
- cvp = tp->cookie_values;
- }
- }
-
- if (cvp != NULL) {
- cvp->cookie_desired = ctd.tcpct_cookie_desired;
-
- if (ctd.tcpct_used > 0) {
- memcpy(cvp->s_data_payload, ctd.tcpct_value,
- ctd.tcpct_used);
- cvp->s_data_desired = ctd.tcpct_used;
- cvp->s_data_constant = 1; /* true */
- } else {
- /* No constant payload data. */
- cvp->s_data_desired = ctd.tcpct_s_data_desired;
- cvp->s_data_constant = 0; /* false */
- }
-
- tp->cookie_values = cvp;
- }
- release_sock(sk);
- return err;
- }
default:
/* fallthru */
break;
@@ -2711,6 +2618,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
err = -EINVAL;
break;
+ case TCP_TIMESTAMP:
+ if (!tp->repair)
+ err = -EPERM;
+ else
+ tp->tsoffset = val - tcp_time_stamp;
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -2894,41 +2807,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
return 0;
- case TCP_COOKIE_TRANSACTIONS: {
- struct tcp_cookie_transactions ctd;
- struct tcp_cookie_values *cvp = tp->cookie_values;
-
- if (get_user(len, optlen))
- return -EFAULT;
- if (len < sizeof(ctd))
- return -EINVAL;
-
- memset(&ctd, 0, sizeof(ctd));
- ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
- TCP_COOKIE_IN_ALWAYS : 0)
- | (tp->rx_opt.cookie_out_never ?
- TCP_COOKIE_OUT_NEVER : 0);
-
- if (cvp != NULL) {
- ctd.tcpct_flags |= (cvp->s_data_in ?
- TCP_S_DATA_IN : 0)
- | (cvp->s_data_out ?
- TCP_S_DATA_OUT : 0);
-
- ctd.tcpct_cookie_desired = cvp->cookie_desired;
- ctd.tcpct_s_data_desired = cvp->s_data_desired;
-
- memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
- cvp->cookie_pair_size);
- ctd.tcpct_used = cvp->cookie_pair_size;
- }
-
- if (put_user(sizeof(ctd), optlen))
- return -EFAULT;
- if (copy_to_user(optval, &ctd, sizeof(ctd)))
- return -EFAULT;
- return 0;
- }
case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto;
break;
@@ -2959,6 +2837,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_USER_TIMEOUT:
val = jiffies_to_msecs(icsk->icsk_user_timeout);
break;
+ case TCP_TIMESTAMP:
+ val = tcp_time_stamp + tp->tsoffset;
+ break;
default:
return -ENOPROTOOPT;
}
@@ -3004,6 +2885,9 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
__be32 delta;
unsigned int oldlen;
unsigned int mss;
+ struct sk_buff *gso_skb = skb;
+ __sum16 newcheck;
+ bool ooo_okay, copy_destructor;
if (!pskb_may_pull(skb, sizeof(*th)))
goto out;
@@ -3032,6 +2916,8 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_TCPV6 |
+ SKB_GSO_GRE |
+ SKB_GSO_UDP_TUNNEL |
0) ||
!(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
goto out;
@@ -3042,27 +2928,48 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
goto out;
}
+ copy_destructor = gso_skb->destructor == tcp_wfree;
+ ooo_okay = gso_skb->ooo_okay;
+ /* All segments but the first should have ooo_okay cleared */
+ skb->ooo_okay = 0;
+
segs = skb_segment(skb, features);
if (IS_ERR(segs))
goto out;
+ /* Only first segment might have ooo_okay set */
+ segs->ooo_okay = ooo_okay;
+
delta = htonl(oldlen + (thlen + mss));
skb = segs;
th = tcp_hdr(skb);
seq = ntohl(th->seq);
+ newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
+ (__force u32)delta));
+
do {
th->fin = th->psh = 0;
+ th->check = newcheck;
- th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
- (__force u32)delta));
if (skb->ip_summed != CHECKSUM_PARTIAL)
th->check =
csum_fold(csum_partial(skb_transport_header(skb),
thlen, skb->csum));
seq += mss;
+ if (copy_destructor) {
+ skb->destructor = gso_skb->destructor;
+ skb->sk = gso_skb->sk;
+ /* {tcp|sock}_wfree() use exact truesize accounting :
+ * sum(skb->truesize) MUST be exactly be gso_skb->truesize
+ * So we account mss bytes of 'true size' for each segment.
+ * The last segment will contain the remaining.
+ */
+ skb->truesize = mss;
+ gso_skb->truesize -= mss;
+ }
skb = skb->next;
th = tcp_hdr(skb);
@@ -3070,6 +2977,17 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
th->cwr = 0;
} while (skb->next);
+ /* Following permits TCP Small Queues to work well with GSO :
+ * The callback to TCP stack will be called at the time last frag
+ * is freed at TX completion, and not right now when gso_skb
+ * is freed by GSO engine
+ */
+ if (copy_destructor) {
+ swap(gso_skb->sk, skb->sk);
+ swap(gso_skb->destructor, skb->destructor);
+ swap(gso_skb->truesize, skb->truesize);
+ }
+
delta = htonl(oldlen + (skb->tail - skb->transport_header) +
skb->data_len);
th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
@@ -3243,7 +3161,7 @@ __tcp_alloc_md5sig_pool(struct sock *sk)
struct crypto_hash *hash;
hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (!hash || IS_ERR(hash))
+ if (IS_ERR_OR_NULL(hash))
goto out_free;
per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
@@ -3371,8 +3289,11 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
for (i = 0; i < shi->nr_frags; ++i) {
const struct skb_frag_struct *f = &shi->frags[i];
- struct page *page = skb_frag_page(f);
- sg_set_page(&sg, page, skb_frag_size(f), f->page_offset);
+ unsigned int offset = f->page_offset;
+ struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
+
+ sg_set_page(&sg, page, skb_frag_size(f),
+ offset_in_page(offset));
if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
return 1;
}
@@ -3396,134 +3317,6 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
#endif
-/* Each Responder maintains up to two secret values concurrently for
- * efficient secret rollover. Each secret value has 4 states:
- *
- * Generating. (tcp_secret_generating != tcp_secret_primary)
- * Generates new Responder-Cookies, but not yet used for primary
- * verification. This is a short-term state, typically lasting only
- * one round trip time (RTT).
- *
- * Primary. (tcp_secret_generating == tcp_secret_primary)
- * Used both for generation and primary verification.
- *
- * Retiring. (tcp_secret_retiring != tcp_secret_secondary)
- * Used for verification, until the first failure that can be
- * verified by the newer Generating secret. At that time, this
- * cookie's state is changed to Secondary, and the Generating
- * cookie's state is changed to Primary. This is a short-term state,
- * typically lasting only one round trip time (RTT).
- *
- * Secondary. (tcp_secret_retiring == tcp_secret_secondary)
- * Used for secondary verification, after primary verification
- * failures. This state lasts no more than twice the Maximum Segment
- * Lifetime (2MSL). Then, the secret is discarded.
- */
-struct tcp_cookie_secret {
- /* The secret is divided into two parts. The digest part is the
- * equivalent of previously hashing a secret and saving the state,
- * and serves as an initialization vector (IV). The message part
- * serves as the trailing secret.
- */
- u32 secrets[COOKIE_WORKSPACE_WORDS];
- unsigned long expires;
-};
-
-#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
-#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
-#define TCP_SECRET_LIFE (HZ * 600)
-
-static struct tcp_cookie_secret tcp_secret_one;
-static struct tcp_cookie_secret tcp_secret_two;
-
-/* Essentially a circular list, without dynamic allocation. */
-static struct tcp_cookie_secret *tcp_secret_generating;
-static struct tcp_cookie_secret *tcp_secret_primary;
-static struct tcp_cookie_secret *tcp_secret_retiring;
-static struct tcp_cookie_secret *tcp_secret_secondary;
-
-static DEFINE_SPINLOCK(tcp_secret_locker);
-
-/* Select a pseudo-random word in the cookie workspace.
- */
-static inline u32 tcp_cookie_work(const u32 *ws, const int n)
-{
- return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
-}
-
-/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
- * Called in softirq context.
- * Returns: 0 for success.
- */
-int tcp_cookie_generator(u32 *bakery)
-{
- unsigned long jiffy = jiffies;
-
- if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
- spin_lock_bh(&tcp_secret_locker);
- if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
- /* refreshed by another */
- memcpy(bakery,
- &tcp_secret_generating->secrets[0],
- COOKIE_WORKSPACE_WORDS);
- } else {
- /* still needs refreshing */
- get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
-
- /* The first time, paranoia assumes that the
- * randomization function isn't as strong. But,
- * this secret initialization is delayed until
- * the last possible moment (packet arrival).
- * Although that time is observable, it is
- * unpredictably variable. Mash in the most
- * volatile clock bits available, and expire the
- * secret extra quickly.
- */
- if (unlikely(tcp_secret_primary->expires ==
- tcp_secret_secondary->expires)) {
- struct timespec tv;
-
- getnstimeofday(&tv);
- bakery[COOKIE_DIGEST_WORDS+0] ^=
- (u32)tv.tv_nsec;
-
- tcp_secret_secondary->expires = jiffy
- + TCP_SECRET_1MSL
- + (0x0f & tcp_cookie_work(bakery, 0));
- } else {
- tcp_secret_secondary->expires = jiffy
- + TCP_SECRET_LIFE
- + (0xff & tcp_cookie_work(bakery, 1));
- tcp_secret_primary->expires = jiffy
- + TCP_SECRET_2MSL
- + (0x1f & tcp_cookie_work(bakery, 2));
- }
- memcpy(&tcp_secret_secondary->secrets[0],
- bakery, COOKIE_WORKSPACE_WORDS);
-
- rcu_assign_pointer(tcp_secret_generating,
- tcp_secret_secondary);
- rcu_assign_pointer(tcp_secret_retiring,
- tcp_secret_primary);
- /*
- * Neither call_rcu() nor synchronize_rcu() needed.
- * Retiring data is not freed. It is replaced after
- * further (locked) pointer updates, and a quiet time
- * (minimum 1MSL, maximum LIFE - 2MSL).
- */
- }
- spin_unlock_bh(&tcp_secret_locker);
- } else {
- rcu_read_lock_bh();
- memcpy(bakery,
- &rcu_dereference(tcp_secret_generating)->secrets[0],
- COOKIE_WORKSPACE_WORDS);
- rcu_read_unlock_bh();
- }
- return 0;
-}
-EXPORT_SYMBOL(tcp_cookie_generator);
-
void tcp_done(struct sock *sk)
{
struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
@@ -3578,7 +3371,6 @@ void __init tcp_init(void)
unsigned long limit;
int max_rshare, max_wshare, cnt;
unsigned int i;
- unsigned long jiffy = jiffies;
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3654,13 +3446,5 @@ void __init tcp_init(void)
tcp_register_congestion_control(&tcp_reno);
- memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
- memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
- tcp_secret_one.expires = jiffy; /* past due */
- tcp_secret_two.expires = jiffy; /* past due */
- tcp_secret_generating = &tcp_secret_one;
- tcp_secret_primary = &tcp_secret_one;
- tcp_secret_retiring = &tcp_secret_two;
- tcp_secret_secondary = &tcp_secret_two;
tcp_tasklet_init();
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index cdf2e70..019c238 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -317,28 +317,11 @@ void tcp_slow_start(struct tcp_sock *tp)
snd_cwnd = 1U;
}
- /* RFC3465: ABC Slow start
- * Increase only after a full MSS of bytes is acked
- *
- * TCP sender SHOULD increase cwnd by the number of
- * previously unacknowledged bytes ACKed by each incoming
- * acknowledgment, provided the increase is not more than L
- */
- if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
- return;
-
if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */
else
cnt = snd_cwnd; /* exponential increase */
- /* RFC3465: ABC
- * We MAY increase by 2 if discovered delayed ack
- */
- if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache)
- cnt <<= 1;
- tp->bytes_acked = 0;
-
tp->snd_cwnd_cnt += cnt;
while (tp->snd_cwnd_cnt >= snd_cwnd) {
tp->snd_cwnd_cnt -= snd_cwnd;
@@ -378,20 +361,9 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* In "safe" area, increase. */
if (tp->snd_cwnd <= tp->snd_ssthresh)
tcp_slow_start(tp);
-
/* In dangerous area, increase slowly. */
- else if (sysctl_tcp_abc) {
- /* RFC3465: Appropriate Byte Count
- * increase once for each full cwnd acked
- */
- if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
- tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- }
- } else {
+ else
tcp_cong_avoid_ai(tp, tp->snd_cwnd);
- }
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ad70a96..9c62257 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -81,8 +81,6 @@ int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
EXPORT_SYMBOL(sysctl_tcp_reordering);
-int sysctl_tcp_ecn __read_mostly = 2;
-EXPORT_SYMBOL(sysctl_tcp_ecn);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 1;
@@ -95,13 +93,11 @@ int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
-int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
-int sysctl_tcp_abc __read_mostly;
-int sysctl_tcp_early_retrans __read_mostly = 2;
+int sysctl_tcp_early_retrans __read_mostly = 3;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -111,17 +107,16 @@ int sysctl_tcp_early_retrans __read_mostly = 2;
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
-#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
+#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
-#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
+#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
-#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
@@ -1162,10 +1157,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
tcp_highest_sack_seq(tp)))
state->reord = min(fack_count,
state->reord);
-
- /* SACK enhanced F-RTO (RFC4138; Appendix B) */
- if (!after(end_seq, tp->frto_highmark))
- state->flag |= FLAG_ONLY_ORIG_SACKED;
+ if (!after(end_seq, tp->high_seq))
+ state->flag |= FLAG_ORIG_SACK_ACKED;
}
if (sacked & TCPCB_LOST) {
@@ -1558,7 +1551,6 @@ static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
u32 prior_snd_una)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
const unsigned char *ptr = (skb_transport_header(ack_skb) +
TCP_SKB_CB(ack_skb)->sacked);
@@ -1731,12 +1723,6 @@ walk:
start_seq, end_seq, dup_sack);
advance_sp:
- /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
- * due to in-order walk
- */
- if (after(end_seq, tp->frto_highmark))
- state.flag &= ~FLAG_ONLY_ORIG_SACKED;
-
i++;
}
@@ -1753,8 +1739,7 @@ advance_sp:
tcp_verify_left_out(tp);
if ((state.reord < tp->fackets_out) &&
- ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) &&
- (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
+ ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
out:
@@ -1828,198 +1813,6 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
tp->sacked_out = 0;
}
-static int tcp_is_sackfrto(const struct tcp_sock *tp)
-{
- return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp);
-}
-
-/* F-RTO can only be used if TCP has never retransmitted anything other than
- * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
- */
-bool tcp_use_frto(struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct sk_buff *skb;
-
- if (!sysctl_tcp_frto)
- return false;
-
- /* MTU probe and F-RTO won't really play nicely along currently */
- if (icsk->icsk_mtup.probe_size)
- return false;
-
- if (tcp_is_sackfrto(tp))
- return true;
-
- /* Avoid expensive walking of rexmit queue if possible */
- if (tp->retrans_out > 1)
- return false;
-
- skb = tcp_write_queue_head(sk);
- if (tcp_skb_is_last(sk, skb))
- return true;
- skb = tcp_write_queue_next(sk, skb); /* Skips head */
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
- return false;
- /* Short-circuit when first non-SACKed skb has been checked */
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
- break;
- }
- return true;
-}
-
-/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
- * recovery a bit and use heuristics in tcp_process_frto() to detect if
- * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
- * keep retrans_out counting accurate (with SACK F-RTO, other than head
- * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
- * bits are handled if the Loss state is really to be entered (in
- * tcp_enter_frto_loss).
- *
- * Do like tcp_enter_loss() would; when RTO expires the second time it
- * does:
- * "Reduce ssthresh if it has not yet been made inside this window."
- */
-void tcp_enter_frto(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
-
- if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
- tp->snd_una == tp->high_seq ||
- ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
- !icsk->icsk_retransmits)) {
- tp->prior_ssthresh = tcp_current_ssthresh(sk);
- /* Our state is too optimistic in ssthresh() call because cwnd
- * is not reduced until tcp_enter_frto_loss() when previous F-RTO
- * recovery has not yet completed. Pattern would be this: RTO,
- * Cumulative ACK, RTO (2xRTO for the same segment does not end
- * up here twice).
- * RFC4138 should be more specific on what to do, even though
- * RTO is quite unlikely to occur after the first Cumulative ACK
- * due to back-off and complexity of triggering events ...
- */
- if (tp->frto_counter) {
- u32 stored_cwnd;
- stored_cwnd = tp->snd_cwnd;
- tp->snd_cwnd = 2;
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- tp->snd_cwnd = stored_cwnd;
- } else {
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- }
- /* ... in theory, cong.control module could do "any tricks" in
- * ssthresh(), which means that ca_state, lost bits and lost_out
- * counter would have to be faked before the call occurs. We
- * consider that too expensive, unlikely and hacky, so modules
- * using these in ssthresh() must deal these incompatibility
- * issues if they receives CA_EVENT_FRTO and frto_counter != 0
- */
- tcp_ca_event(sk, CA_EVENT_FRTO);
- }
-
- tp->undo_marker = tp->snd_una;
- tp->undo_retrans = 0;
-
- skb = tcp_write_queue_head(sk);
- if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
- tp->undo_marker = 0;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- }
- tcp_verify_left_out(tp);
-
- /* Too bad if TCP was application limited */
- tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
-
- /* Earlier loss recovery underway (see RFC4138; Appendix B).
- * The last condition is necessary at least in tp->frto_counter case.
- */
- if (tcp_is_sackfrto(tp) && (tp->frto_counter ||
- ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
- after(tp->high_seq, tp->snd_una)) {
- tp->frto_highmark = tp->high_seq;
- } else {
- tp->frto_highmark = tp->snd_nxt;
- }
- tcp_set_ca_state(sk, TCP_CA_Disorder);
- tp->high_seq = tp->snd_nxt;
- tp->frto_counter = 1;
-}
-
-/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
- * which indicates that we should follow the traditional RTO recovery,
- * i.e. mark everything lost and do go-back-N retransmission.
- */
-static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
-
- tp->lost_out = 0;
- tp->retrans_out = 0;
- if (tcp_is_reno(tp))
- tcp_reset_reno_sack(tp);
-
- tcp_for_write_queue(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
-
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
- /*
- * Count the retransmission made on RTO correctly (only when
- * waiting for the first ACK and did not get it)...
- */
- if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) {
- /* For some reason this R-bit might get cleared? */
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out += tcp_skb_pcount(skb);
- /* ...enter this if branch just for the first segment */
- flag |= FLAG_DATA_ACKED;
- } else {
- if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
- tp->undo_marker = 0;
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- }
-
- /* Marking forward transmissions that were made after RTO lost
- * can cause unnecessary retransmissions in some scenarios,
- * SACK blocks will mitigate that in some but not in all cases.
- * We used to not mark them but it was causing break-ups with
- * receivers that do only in-order receival.
- *
- * TODO: we could detect presence of such receiver and select
- * different behavior per flow.
- */
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
- tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
- }
- }
- tcp_verify_left_out(tp);
-
- tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
- tp->snd_cwnd_cnt = 0;
- tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->frto_counter = 0;
- tp->bytes_acked = 0;
-
- tp->reordering = min_t(unsigned int, tp->reordering,
- sysctl_tcp_reordering);
- tcp_set_ca_state(sk, TCP_CA_Loss);
- tp->high_seq = tp->snd_nxt;
- TCP_ECN_queue_cwr(tp);
-
- tcp_clear_all_retrans_hints(tp);
-}
-
static void tcp_clear_retrans_partial(struct tcp_sock *tp)
{
tp->retrans_out = 0;
@@ -2046,10 +1839,13 @@ void tcp_enter_loss(struct sock *sk, int how)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
+ bool new_recovery = false;
/* Reduce ssthresh if it has not yet been made inside this window. */
- if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
+ !after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+ new_recovery = true;
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
@@ -2058,17 +1854,13 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->bytes_acked = 0;
tcp_clear_retrans_partial(tp);
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
- if (!how) {
- /* Push undo marker, if it was plain RTO and nothing
- * was retransmitted. */
- tp->undo_marker = tp->snd_una;
- } else {
+ tp->undo_marker = tp->snd_una;
+ if (how) {
tp->sacked_out = 0;
tp->fackets_out = 0;
}
@@ -2095,8 +1887,14 @@ void tcp_enter_loss(struct sock *sk, int how)
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
- /* Abort F-RTO algorithm if one is in progress */
- tp->frto_counter = 0;
+
+ /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+ * loss recovery is underway except recurring timeout(s) on
+ * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+ */
+ tp->frto = sysctl_tcp_frto &&
+ (new_recovery || icsk->icsk_retransmits) &&
+ !inet_csk(sk)->icsk_mtup.probe_size;
}
/* If ACK arrived pointing to a remembered SACK, it means that our
@@ -2155,15 +1953,16 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
* max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
* available, or RTO is scheduled to fire first.
*/
- if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt)
+ if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
+ (flag & FLAG_ECE) || !tp->srtt)
return false;
delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
return false;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
- tp->early_retrans_delayed = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
+ TCP_RTO_MAX);
return true;
}
@@ -2279,10 +2078,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
struct tcp_sock *tp = tcp_sk(sk);
__u32 packets_out;
- /* Do not perform any recovery during F-RTO algorithm */
- if (tp->frto_counter)
- return false;
-
/* Trick#1: The loss is proven. */
if (tp->lost_out)
return true;
@@ -2326,7 +2121,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
* interval if appropriate.
*/
if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
- (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
+ (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
!tcp_may_send_now(sk))
return !tcp_pause_early_retransmit(sk, flag);
@@ -2643,12 +2438,12 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
return failed;
}
-/* Undo during loss recovery after partial ACK. */
-static bool tcp_try_undo_loss(struct sock *sk)
+/* Undo during loss recovery after partial ACK or using F-RTO. */
+static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_may_undo(tp)) {
+ if (frto_undo || tcp_may_undo(tp)) {
struct sk_buff *skb;
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
@@ -2662,9 +2457,12 @@ static bool tcp_try_undo_loss(struct sock *sk)
tp->lost_out = 0;
tcp_undo_cwr(sk, true);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+ if (frto_undo)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUSRTOS);
inet_csk(sk)->icsk_retransmits = 0;
tp->undo_marker = 0;
- if (tcp_is_sack(tp))
+ if (frto_undo || tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open);
return true;
}
@@ -2686,7 +2484,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
struct tcp_sock *tp = tcp_sk(sk);
tp->high_seq = tp->snd_nxt;
- tp->bytes_acked = 0;
+ tp->tlp_high_seq = 0;
tp->snd_cwnd_cnt = 0;
tp->prior_cwnd = tp->snd_cwnd;
tp->prr_delivered = 0;
@@ -2737,7 +2535,6 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
struct tcp_sock *tp = tcp_sk(sk);
tp->prior_ssthresh = 0;
- tp->bytes_acked = 0;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
tp->undo_marker = 0;
tcp_init_cwnd_reduction(sk, set_ssthresh);
@@ -2765,7 +2562,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
tcp_verify_left_out(tp);
- if (!tp->frto_counter && !tcp_any_retrans_done(sk))
+ if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
if (flag & FLAG_ECE)
@@ -2882,6 +2679,58 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
+/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
+ * recovered or spurious. Otherwise retransmits more on partial ACKs.
+ */
+static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool recovered = !before(tp->snd_una, tp->high_seq);
+
+ if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
+ if (flag & FLAG_ORIG_SACK_ACKED) {
+ /* Step 3.b. A timeout is spurious if not all data are
+ * lost, i.e., never-retransmitted data are (s)acked.
+ */
+ tcp_try_undo_loss(sk, true);
+ return;
+ }
+ if (after(tp->snd_nxt, tp->high_seq) &&
+ (flag & FLAG_DATA_SACKED || is_dupack)) {
+ tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
+ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
+ tp->high_seq = tp->snd_nxt;
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk),
+ TCP_NAGLE_OFF);
+ if (after(tp->snd_nxt, tp->high_seq))
+ return; /* Step 2.b */
+ tp->frto = 0;
+ }
+ }
+
+ if (recovered) {
+ /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
+ icsk->icsk_retransmits = 0;
+ tcp_try_undo_recovery(sk);
+ return;
+ }
+ if (flag & FLAG_DATA_ACKED)
+ icsk->icsk_retransmits = 0;
+ if (tcp_is_reno(tp)) {
+ /* A Reno DUPACK means new data in F-RTO step 2.b above are
+ * delivered. Lower inflight to clock out (re)tranmissions.
+ */
+ if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
+ tcp_add_reno_sack(sk);
+ else if (flag & FLAG_SND_UNA_ADVANCED)
+ tcp_reset_reno_sack(tp);
+ }
+ if (tcp_try_undo_loss(sk, false))
+ return;
+ tcp_xmit_retransmit_queue(sk);
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2894,8 +2743,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
- int prior_sacked, bool is_dupack,
- int flag)
+ int prior_sacked, int prior_packets,
+ bool is_dupack, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2928,12 +2777,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
switch (icsk->icsk_ca_state) {
- case TCP_CA_Loss:
- icsk->icsk_retransmits = 0;
- if (tcp_try_undo_recovery(sk))
- return;
- break;
-
case TCP_CA_CWR:
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
@@ -2961,21 +2804,14 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
tcp_add_reno_sack(sk);
} else
do_lost = tcp_try_undo_partial(sk, pkts_acked);
- newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
+ newly_acked_sacked = prior_packets - tp->packets_out +
+ tp->sacked_out - prior_sacked;
break;
case TCP_CA_Loss:
- if (flag & FLAG_DATA_ACKED)
- icsk->icsk_retransmits = 0;
- if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
- tcp_reset_reno_sack(tp);
- if (!tcp_try_undo_loss(sk)) {
- tcp_moderate_cwnd(tp);
- tcp_xmit_retransmit_queue(sk);
- return;
- }
+ tcp_process_loss(sk, flag, is_dupack);
if (icsk->icsk_ca_state != TCP_CA_Open)
return;
- /* Loss is undone; fall through to processing in Open state. */
+ /* Fall through to processing in Open state. */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
@@ -2983,7 +2819,8 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
if (is_dupack)
tcp_add_reno_sack(sk);
}
- newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
+ newly_acked_sacked = prior_packets - tp->packets_out +
+ tp->sacked_out - prior_sacked;
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
@@ -3088,6 +2925,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
*/
void tcp_rearm_rto(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* If the retrans timer is currently being used by Fast Open
@@ -3101,12 +2939,13 @@ void tcp_rearm_rto(struct sock *sk)
} else {
u32 rto = inet_csk(sk)->icsk_rto;
/* Offset the time elapsed after installing regular RTO */
- if (tp->early_retrans_delayed) {
+ if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
struct sk_buff *skb = tcp_write_queue_head(sk);
const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
/* delta may not be positive if the socket is locked
- * when the delayed ER timer fires and is rescheduled.
+ * when the retrans timer fires and is rescheduled.
*/
if (delta > 0)
rto = delta;
@@ -3114,7 +2953,6 @@ void tcp_rearm_rto(struct sock *sk)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
TCP_RTO_MAX);
}
- tp->early_retrans_delayed = 0;
}
/* This function is called when the delayed ER timer fires. TCP enters
@@ -3202,8 +3040,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_RETRANS_DATA_ACKED;
ca_seq_rtt = -1;
seq_rtt = -1;
- if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
- flag |= FLAG_NONHEAD_RETRANS_ACKED;
} else {
ca_seq_rtt = now - scb->when;
last_ackt = skb->tstamp;
@@ -3212,6 +3048,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
}
if (!(sacked & TCPCB_SACKED_ACKED))
reord = min(pkts_acked, reord);
+ if (!after(scb->end_seq, tp->high_seq))
+ flag |= FLAG_ORIG_SACK_ACKED;
}
if (sacked & TCPCB_SACKED_ACKED)
@@ -3412,166 +3250,74 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
return flag;
}
-/* A very conservative spurious RTO response algorithm: reduce cwnd and
- * continue in congestion avoidance.
- */
-static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
+/* RFC 5961 7 [ACK Throttling] */
+static void tcp_send_challenge_ack(struct sock *sk)
{
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
- tp->snd_cwnd_cnt = 0;
- tp->bytes_acked = 0;
- TCP_ECN_queue_cwr(tp);
- tcp_moderate_cwnd(tp);
-}
+ /* unprotected vars, we dont care of overwrites */
+ static u32 challenge_timestamp;
+ static unsigned int challenge_count;
+ u32 now = jiffies / HZ;
-/* A conservative spurious RTO response algorithm: reduce cwnd using
- * PRR and continue in congestion avoidance.
- */
-static void tcp_cwr_spur_to_response(struct sock *sk)
-{
- tcp_enter_cwr(sk, 0);
+ if (now != challenge_timestamp) {
+ challenge_timestamp = now;
+ challenge_count = 0;
+ }
+ if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+ tcp_send_ack(sk);
+ }
}
-static void tcp_undo_spur_to_response(struct sock *sk, int flag)
+static void tcp_store_ts_recent(struct tcp_sock *tp)
{
- if (flag & FLAG_ECE)
- tcp_cwr_spur_to_response(sk);
- else
- tcp_undo_cwr(sk, true);
+ tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
+ tp->rx_opt.ts_recent_stamp = get_seconds();
}
-/* F-RTO spurious RTO detection algorithm (RFC4138)
- *
- * F-RTO affects during two new ACKs following RTO (well, almost, see inline
- * comments). State (ACK number) is kept in frto_counter. When ACK advances
- * window (but not to or beyond highest sequence sent before RTO):
- * On First ACK, send two new segments out.
- * On Second ACK, RTO was likely spurious. Do spurious response (response
- * algorithm is not part of the F-RTO detection algorithm
- * given in RFC4138 but can be selected separately).
- * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
- * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
- * of Nagle, this is done using frto_counter states 2 and 3, when a new data
- * segment of any size sent during F-RTO, state 2 is upgraded to 3.
- *
- * Rationale: if the RTO was spurious, new ACKs should arrive from the
- * original window even after we transmit two new data segments.
- *
- * SACK version:
- * on first step, wait until first cumulative ACK arrives, then move to
- * the second step. In second step, the next ACK decides.
- *
- * F-RTO is implemented (mainly) in four functions:
- * - tcp_use_frto() is used to determine if TCP is can use F-RTO
- * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
- * called when tcp_use_frto() showed green light
- * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
- * - tcp_enter_frto_loss() is called if there is not enough evidence
- * to prove that the RTO is indeed spurious. It transfers the control
- * from F-RTO to the conventional RTO recovery
- */
-static bool tcp_process_frto(struct sock *sk, int flag)
+static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_verify_left_out(tp);
-
- /* Duplicate the behavior from Loss state (fastretrans_alert) */
- if (flag & FLAG_DATA_ACKED)
- inet_csk(sk)->icsk_retransmits = 0;
-
- if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
- ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
- tp->undo_marker = 0;
-
- if (!before(tp->snd_una, tp->frto_highmark)) {
- tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
- return true;
- }
-
- if (!tcp_is_sackfrto(tp)) {
- /* RFC4138 shortcoming in step 2; should also have case c):
- * ACK isn't duplicate nor advances window, e.g., opposite dir
- * data, winupdate
+ if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
+ /* PAWS bug workaround wrt. ACK frames, the PAWS discard
+ * extra check below makes sure this can only happen
+ * for pure ACK frames. -DaveM
+ *
+ * Not only, also it occurs for expired timestamps.
*/
- if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
- return true;
-
- if (!(flag & FLAG_DATA_ACKED)) {
- tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
- flag);
- return true;
- }
- } else {
- if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
- if (!tcp_packets_in_flight(tp)) {
- tcp_enter_frto_loss(sk, 2, flag);
- return true;
- }
-
- /* Prevent sending of new data. */
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp));
- return true;
- }
-
- if ((tp->frto_counter >= 2) &&
- (!(flag & FLAG_FORWARD_PROGRESS) ||
- ((flag & FLAG_DATA_SACKED) &&
- !(flag & FLAG_ONLY_ORIG_SACKED)))) {
- /* RFC4138 shortcoming (see comment above) */
- if (!(flag & FLAG_FORWARD_PROGRESS) &&
- (flag & FLAG_NOT_DUP))
- return true;
-
- tcp_enter_frto_loss(sk, 3, flag);
- return true;
- }
- }
- if (tp->frto_counter == 1) {
- /* tcp_may_send_now needs to see updated state */
- tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
- tp->frto_counter = 2;
-
- if (!tcp_may_send_now(sk))
- tcp_enter_frto_loss(sk, 2, flag);
-
- return true;
- } else {
- switch (sysctl_tcp_frto_response) {
- case 2:
- tcp_undo_spur_to_response(sk, flag);
- break;
- case 1:
- tcp_conservative_spur_to_response(tp);
- break;
- default:
- tcp_cwr_spur_to_response(sk);
- break;
- }
- tp->frto_counter = 0;
- tp->undo_marker = 0;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
+ if (tcp_paws_check(&tp->rx_opt, 0))
+ tcp_store_ts_recent(tp);
}
- return false;
}
-/* RFC 5961 7 [ACK Throttling] */
-static void tcp_send_challenge_ack(struct sock *sk)
+/* This routine deals with acks during a TLP episode.
+ * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
+ */
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
{
- /* unprotected vars, we dont care of overwrites */
- static u32 challenge_timestamp;
- static unsigned int challenge_count;
- u32 now = jiffies / HZ;
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
+ !(flag & (FLAG_SND_UNA_ADVANCED |
+ FLAG_NOT_DUP | FLAG_DATA_SACKED));
- if (now != challenge_timestamp) {
- challenge_timestamp = now;
- challenge_count = 0;
+ /* Mark the end of TLP episode on receiving TLP dupack or when
+ * ack is after tlp_high_seq.
+ */
+ if (is_tlp_dupack) {
+ tp->tlp_high_seq = 0;
+ return;
}
- if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
- tcp_send_ack(sk);
+
+ if (after(ack, tp->tlp_high_seq)) {
+ tp->tlp_high_seq = 0;
+ /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
+ if (!(flag & FLAG_DSACKING_ACK)) {
+ tcp_init_cwnd_reduction(sk, true);
+ tcp_set_ca_state(sk, TCP_CA_CWR);
+ tcp_end_cwnd_reduction(sk);
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBERECOVERY);
+ }
}
}
@@ -3586,10 +3332,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
bool is_dupack = false;
u32 prior_in_flight;
u32 prior_fackets;
- int prior_packets;
+ int prior_packets = tp->packets_out;
int prior_sacked = tp->sacked_out;
int pkts_acked = 0;
- bool frto_cwnd = false;
+ int previous_packets_out = 0;
/* If the ack is older than previous acks
* then we can probably ignore it.
@@ -3609,24 +3355,22 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
- if (tp->early_retrans_delayed)
+ if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
if (after(ack, prior_snd_una))
flag |= FLAG_SND_UNA_ADVANCED;
- if (sysctl_tcp_abc) {
- if (icsk->icsk_ca_state < TCP_CA_CWR)
- tp->bytes_acked += ack - prior_snd_una;
- else if (icsk->icsk_ca_state == TCP_CA_Loss)
- /* we assume just one segment left network */
- tp->bytes_acked += min(ack - prior_snd_una,
- tp->mss_cache);
- }
-
prior_fackets = tp->fackets_out;
prior_in_flight = tcp_packets_in_flight(tp);
+ /* ts_recent update must be made after we are sure that the packet
+ * is in window.
+ */
+ if (flag & FLAG_UPDATE_TS_RECENT)
+ tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
* No more checks are required.
@@ -3662,52 +3406,54 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
sk->sk_err_soft = 0;
icsk->icsk_probes_out = 0;
tp->rcv_tstamp = tcp_time_stamp;
- prior_packets = tp->packets_out;
if (!prior_packets)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
+ previous_packets_out = tp->packets_out;
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
- pkts_acked = prior_packets - tp->packets_out;
-
- if (tp->frto_counter)
- frto_cwnd = tcp_process_frto(sk, flag);
- /* Guarantee sacktag reordering detection against wrap-arounds */
- if (before(tp->frto_highmark, tp->snd_una))
- tp->frto_highmark = 0;
+ pkts_acked = previous_packets_out - tp->packets_out;
if (tcp_ack_is_dubious(sk, flag)) {
/* Advance CWND, if state allows this. */
- if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
- tcp_may_raise_cwnd(sk, flag))
+ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, prior_in_flight);
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
- is_dupack, flag);
+ prior_packets, is_dupack, flag);
} else {
- if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
+ if (flag & FLAG_DATA_ACKED)
tcp_cong_avoid(sk, ack, prior_in_flight);
}
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
+
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
struct dst_entry *dst = __sk_dst_get(sk);
if (dst)
dst_confirm(dst);
}
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS)
+ tcp_schedule_loss_probe(sk);
return 1;
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
- is_dupack, flag);
+ prior_packets, is_dupack, flag);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
if (tcp_send_head(sk))
tcp_ack_probe(sk);
+
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
return 1;
invalid_ack:
@@ -3721,7 +3467,7 @@ old_ack:
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
- is_dupack, flag);
+ prior_packets, is_dupack, flag);
}
SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
@@ -3732,8 +3478,8 @@ old_ack:
* But, this can also be called on packets in the established flow when
* the fast version below fails.
*/
-void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
- const u8 **hvpp, int estab,
+void tcp_parse_options(const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx, int estab,
struct tcp_fastopen_cookie *foc)
{
const unsigned char *ptr;
@@ -3817,31 +3563,6 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
*/
break;
#endif
- case TCPOPT_COOKIE:
- /* This option is variable length.
- */
- switch (opsize) {
- case TCPOLEN_COOKIE_BASE:
- /* not yet implemented */
- break;
- case TCPOLEN_COOKIE_PAIR:
- /* not yet implemented */
- break;
- case TCPOLEN_COOKIE_MIN+0:
- case TCPOLEN_COOKIE_MIN+2:
- case TCPOLEN_COOKIE_MIN+4:
- case TCPOLEN_COOKIE_MIN+6:
- case TCPOLEN_COOKIE_MAX:
- /* 16-bit multiple */
- opt_rx->cookie_plus = opsize;
- *hvpp = ptr;
- break;
- default:
- /* ignore option */
- break;
- }
- break;
-
case TCPOPT_EXP:
/* Fast Open option shares code 254 using a
* 16 bits magic number. It's valid only in
@@ -3877,7 +3598,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
++ptr;
tp->rx_opt.rcv_tsval = ntohl(*ptr);
++ptr;
- tp->rx_opt.rcv_tsecr = ntohl(*ptr);
+ tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
return true;
}
return false;
@@ -3887,8 +3608,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
* If it is wrong it falls back on tcp_parse_options().
*/
static bool tcp_fast_parse_options(const struct sk_buff *skb,
- const struct tcphdr *th,
- struct tcp_sock *tp, const u8 **hvpp)
+ const struct tcphdr *th, struct tcp_sock *tp)
{
/* In the spirit of fast parsing, compare doff directly to constant
* values. Because equality is used, short doff can be ignored here.
@@ -3901,7 +3621,11 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
if (tcp_parse_aligned_timestamp(tp, th))
return true;
}
- tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
+
+ tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
+ if (tp->rx_opt.saw_tstamp)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
return true;
}
@@ -3943,27 +3667,6 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif
-static inline void tcp_store_ts_recent(struct tcp_sock *tp)
-{
- tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
- tp->rx_opt.ts_recent_stamp = get_seconds();
-}
-
-static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
-{
- if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
- /* PAWS bug workaround wrt. ACK frames, the PAWS discard
- * extra check below makes sure this can only happen
- * for pure ACK frames. -DaveM
- *
- * Not only, also it occurs for expired timestamps.
- */
-
- if (tcp_paws_check(&tp->rx_opt, 0))
- tcp_store_ts_recent(tp);
- }
-}
-
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
*
* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
@@ -5279,12 +4982,10 @@ out:
static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, int syn_inerr)
{
- const u8 *hash_location;
struct tcp_sock *tp = tcp_sk(sk);
/* RFC1323: H1. Apply PAWS check first. */
- if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
- tp->rx_opt.saw_tstamp &&
+ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5498,6 +5199,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
if (tcp_checksum_complete_user(sk, skb))
goto csum_error;
+ if ((int)skb->truesize > sk->sk_forward_alloc)
+ goto step5;
+
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
@@ -5509,9 +5213,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_rcv_rtt_measure_ts(sk, skb);
- if ((int)skb->truesize > sk->sk_forward_alloc)
- goto step5;
-
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
@@ -5559,14 +5260,9 @@ slow_path:
return 0;
step5:
- if (tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
+ if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
goto discard;
- /* ts_recent update must be made after we are sure that the packet
- * is in window.
- */
- tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
@@ -5580,6 +5276,7 @@ step5:
return 0;
csum_error:
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
discard:
@@ -5638,12 +5335,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
if (mss == tp->rx_opt.user_mss) {
struct tcp_options_received opt;
- const u8 *hash_location;
/* Get original SYNACK MSS value if user MSS sets mss_clamp */
tcp_clear_options(&opt);
opt.user_mss = opt.mss_clamp = 0;
- tcp_parse_options(synack, &opt, &hash_location, 0, NULL);
+ tcp_parse_options(synack, &opt, 0, NULL);
mss = opt.mss_clamp;
}
@@ -5674,14 +5370,14 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
- const u8 *hash_location;
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_cookie_values *cvp = tp->cookie_values;
struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
- tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc);
+ tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
+ if (tp->rx_opt.saw_tstamp)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
if (th->ack) {
/* rfc793:
@@ -5776,30 +5472,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* is initialized. */
tp->copied_seq = tp->rcv_nxt;
- if (cvp != NULL &&
- cvp->cookie_pair_size > 0 &&
- tp->rx_opt.cookie_plus > 0) {
- int cookie_size = tp->rx_opt.cookie_plus
- - TCPOLEN_COOKIE_BASE;
- int cookie_pair_size = cookie_size
- + cvp->cookie_desired;
-
- /* A cookie extension option was sent and returned.
- * Note that each incoming SYNACK replaces the
- * Responder cookie. The initial exchange is most
- * fragile, as protection against spoofing relies
- * entirely upon the sequence and timestamp (above).
- * This replacement strategy allows the correct pair to
- * pass through, while any others will be filtered via
- * Responder verification later.
- */
- if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
- memcpy(&cvp->cookie_pair[cvp->cookie_desired],
- hash_location, cookie_size);
- cvp->cookie_pair_size = cookie_pair_size;
- }
- }
-
smp_mb();
tcp_finish_connect(sk, skb);
@@ -6000,7 +5672,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* step 5: check the ACK field */
if (true) {
- int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
+ int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+ FLAG_UPDATE_TS_RECENT) > 0;
switch (sk->sk_state) {
case TCP_SYN_RECV:
@@ -6151,11 +5824,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
}
}
- /* ts_recent update must be made after we are sure that the packet
- * is in window.
- */
- tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
/* step 6: check the URG bit */
tcp_urg(sk, skb, th);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index eadb693..7999fc5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -274,13 +274,6 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
struct inet_sock *inet = inet_sk(sk);
u32 mtu = tcp_sk(sk)->mtu_info;
- /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
- * send out by Linux are always <576bytes so they should go through
- * unfragmented).
- */
- if (sk->sk_state == TCP_LISTEN)
- return;
-
dst = inet_csk_update_pmtu(sk, mtu);
if (!dst)
return;
@@ -408,6 +401,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
goto out;
if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+ /* We are not interested in TCP_LISTEN and open_requests
+ * (SYN-ACKs send out by Linux are always <576bytes so
+ * they should go through unfragmented).
+ */
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
tp->mtu_info = info;
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
@@ -657,7 +657,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
* no RST generated if md5 hash doesn't match.
*/
sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
- &tcp_hashinfo, ip_hdr(skb)->daddr,
+ &tcp_hashinfo, ip_hdr(skb)->saddr,
+ th->source, ip_hdr(skb)->daddr,
ntohs(th->source), inet_iif(skb));
/* don't send rst if it can't find key */
if (!sk1)
@@ -725,7 +726,7 @@ release_sk1:
*/
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
- u32 win, u32 ts, int oif,
+ u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
int reply_flags, u8 tos)
{
@@ -746,12 +747,12 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
arg.iov[0].iov_base = (unsigned char *)&rep;
arg.iov[0].iov_len = sizeof(rep.th);
- if (ts) {
+ if (tsecr) {
rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
- rep.opt[1] = htonl(tcp_time_stamp);
- rep.opt[2] = htonl(ts);
+ rep.opt[1] = htonl(tsval);
+ rep.opt[2] = htonl(tsecr);
arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
}
@@ -766,7 +767,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
#ifdef CONFIG_TCP_MD5SIG
if (key) {
- int offset = (ts) ? 3 : 0;
+ int offset = (tsecr) ? 3 : 0;
rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -801,6 +802,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+ tcp_time_stamp + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
tw->tw_bound_dev_if,
tcp_twsk_md5_key(tcptw),
@@ -820,6 +822,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+ tcp_time_stamp,
req->ts_recent,
0,
tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -835,7 +838,6 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- struct request_values *rvp,
u16 queue_mapping,
bool nocache)
{
@@ -848,7 +850,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, rvp, NULL);
+ skb = tcp_make_synack(sk, dst, req, NULL);
if (skb) {
__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -865,10 +867,9 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
return err;
}
-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
- struct request_values *rvp)
+static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
{
- int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
+ int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
if (!res)
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
@@ -951,7 +952,6 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
- struct hlist_node *pos;
unsigned int size = sizeof(struct in_addr);
struct tcp_md5sig_info *md5sig;
@@ -965,7 +965,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
if (family == AF_INET6)
size = sizeof(struct in6_addr);
#endif
- hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
+ hlist_for_each_entry_rcu(key, &md5sig->head, node) {
if (key->family != family)
continue;
if (!memcmp(&key->addr, addr, size))
@@ -1003,7 +1003,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_info *md5sig;
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
+ key = tcp_md5_do_lookup(sk, addr, family);
if (key) {
/* Pre-existing entry - just update that one. */
memcpy(key->key, newkey, newkeylen);
@@ -1048,7 +1048,7 @@ int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
struct tcp_md5sig_key *key;
struct tcp_md5sig_info *md5sig;
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
+ key = tcp_md5_do_lookup(sk, addr, family);
if (!key)
return -ENOENT;
hlist_del_rcu(&key->node);
@@ -1066,14 +1066,14 @@ static void tcp_clear_md5_list(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
- struct hlist_node *pos, *n;
+ struct hlist_node *n;
struct tcp_md5sig_info *md5sig;
md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
if (!hlist_empty(&md5sig->head))
tcp_free_md5sig_pool();
- hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
+ hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
hlist_del_rcu(&key->node);
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
kfree_rcu(key, rcu);
@@ -1369,8 +1369,7 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
static int tcp_v4_conn_req_fastopen(struct sock *sk,
struct sk_buff *skb,
struct sk_buff *skb_synack,
- struct request_sock *req,
- struct request_values *rvp)
+ struct request_sock *req)
{
struct tcp_sock *tp = tcp_sk(sk);
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
@@ -1465,9 +1464,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_extend_values tmp_ext;
struct tcp_options_received tmp_opt;
- const u8 *hash_location;
struct request_sock *req;
struct inet_request_sock *ireq;
struct tcp_sock *tp = tcp_sk(sk);
@@ -1517,42 +1514,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
tmp_opt.user_mss = tp->rx_opt.user_mss;
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
- want_cookie ? NULL : &foc);
-
- if (tmp_opt.cookie_plus > 0 &&
- tmp_opt.saw_tstamp &&
- !tp->rx_opt.cookie_out_never &&
- (sysctl_tcp_cookie_size > 0 ||
- (tp->cookie_values != NULL &&
- tp->cookie_values->cookie_desired > 0))) {
- u8 *c;
- u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
- int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
-
- if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
- goto drop_and_release;
-
- /* Secret recipe starts with IP addresses */
- *mess++ ^= (__force u32)daddr;
- *mess++ ^= (__force u32)saddr;
-
- /* plus variable length Initiator Cookie */
- c = (u8 *)mess;
- while (l-- > 0)
- *c++ ^= *hash_location++;
-
- want_cookie = false; /* not our kind of cookie */
- tmp_ext.cookie_out_never = 0; /* false */
- tmp_ext.cookie_plus = tmp_opt.cookie_plus;
- } else if (!tp->rx_opt.cookie_in_always) {
- /* redundant indications, but ensure initialization. */
- tmp_ext.cookie_out_never = 1; /* true */
- tmp_ext.cookie_plus = 0;
- } else {
- goto drop_and_release;
- }
- tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
+ tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
@@ -1570,7 +1532,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
goto drop_and_free;
if (!want_cookie || tmp_opt.tstamp_ok)
- TCP_ECN_create_request(req, skb);
+ TCP_ECN_create_request(req, skb, sock_net(sk));
if (want_cookie) {
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
@@ -1634,7 +1596,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* of tcp_v4_send_synack()->tcp_select_initial_window().
*/
skb_synack = tcp_make_synack(sk, dst, req,
- (struct request_values *)&tmp_ext,
fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
if (skb_synack) {
@@ -1658,8 +1619,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (fastopen_cookie_present(&foc) && foc.len != 0)
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
- } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
- (struct request_values *)&tmp_ext))
+ } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
goto drop_and_free;
return 0;
@@ -1906,6 +1866,7 @@ discard:
return 0;
csum_err:
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
@@ -1948,6 +1909,51 @@ void tcp_v4_early_demux(struct sk_buff *skb)
}
}
+/* Packet is added to VJ-style prequeue for processing in process
+ * context, if a reader task is waiting. Apparently, this exciting
+ * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
+ * failed somewhere. Latency? Burstiness? Well, at least now we will
+ * see, why it failed. 8)8) --ANK
+ *
+ */
+bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (sysctl_tcp_low_latency || !tp->ucopy.task)
+ return false;
+
+ if (skb->len <= tcp_hdrlen(skb) &&
+ skb_queue_len(&tp->ucopy.prequeue) == 0)
+ return false;
+
+ skb_dst_force(skb);
+ __skb_queue_tail(&tp->ucopy.prequeue, skb);
+ tp->ucopy.memory += skb->truesize;
+ if (tp->ucopy.memory > sk->sk_rcvbuf) {
+ struct sk_buff *skb1;
+
+ BUG_ON(sock_owned_by_user(sk));
+
+ while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
+ sk_backlog_rcv(sk, skb1);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPPREQUEUEDROPPED);
+ }
+
+ tp->ucopy.memory = 0;
+ } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
+ wake_up_interruptible_sync_poll(sk_sleep(sk),
+ POLLIN | POLLRDNORM | POLLRDBAND);
+ if (!inet_csk_ack_scheduled(sk))
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ (3 * tcp_rto_min(sk)) / 4,
+ TCP_RTO_MAX);
+ }
+ return true;
+}
+EXPORT_SYMBOL(tcp_prequeue);
+
/*
* From tcp_input.c
*/
@@ -1981,7 +1987,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
* provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
- goto bad_packet;
+ goto csum_error;
th = tcp_hdr(skb);
iph = ip_hdr(skb);
@@ -2047,6 +2053,8 @@ no_tcp_socket:
goto discard_it;
if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+csum_error:
+ TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
} else {
@@ -2068,15 +2076,19 @@ do_time_wait:
goto discard_it;
}
- if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
- TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
+ if (skb->len < (th->doff << 2)) {
inet_twsk_put(inet_twsk(sk));
- goto discard_it;
+ goto bad_packet;
+ }
+ if (tcp_checksum_complete(skb)) {
+ inet_twsk_put(inet_twsk(sk));
+ goto csum_error;
}
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
&tcp_hashinfo,
+ iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb));
if (sk2) {
@@ -2194,12 +2206,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
- /* TCP Cookie Transactions */
- if (tp->cookie_values != NULL) {
- kref_put(&tp->cookie_values->kref,
- tcp_cookie_values_release);
- tp->cookie_values = NULL;
- }
BUG_ON(tp->fastopen_rsk != NULL);
/* If socket is aborted during connect operation */
@@ -2577,7 +2583,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
int tcp_seq_open(struct inode *inode, struct file *file)
{
- struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
+ struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
struct tcp_iter_state *s;
int err;
@@ -2612,7 +2618,7 @@ EXPORT_SYMBOL(tcp_proc_register);
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
- proc_net_remove(net, afinfo->name);
+ remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
@@ -2656,7 +2662,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
__u16 srcp = ntohs(inet->inet_sport);
int rx_queue;
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
@@ -2891,6 +2899,7 @@ EXPORT_SYMBOL(tcp_prot);
static int __net_init tcp_sk_init(struct net *net)
{
+ net->ipv4.sysctl_tcp_ecn = 2;
return 0;
}
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index b6f3583..da14436 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -64,7 +64,6 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
{
struct cg_proto *cg_proto;
struct tcp_memcontrol *tcp;
- u64 val;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
@@ -72,8 +71,6 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
tcp = tcp_from_cgproto(cg_proto);
percpu_counter_destroy(&tcp->tcp_sockets_allocated);
-
- val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
}
EXPORT_SYMBOL(tcp_destroy_cgroup);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index f696d7c..f6a005c 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -96,7 +96,8 @@ struct tcpm_hash_bucket {
static DEFINE_SPINLOCK(tcp_metrics_lock);
-static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
+static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
+ bool fastopen_clear)
{
u32 val;
@@ -122,9 +123,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
tm->tcpm_ts = 0;
tm->tcpm_ts_stamp = 0;
- tm->tcpm_fastopen.mss = 0;
- tm->tcpm_fastopen.syn_loss = 0;
- tm->tcpm_fastopen.cookie.len = 0;
+ if (fastopen_clear) {
+ tm->tcpm_fastopen.mss = 0;
+ tm->tcpm_fastopen.syn_loss = 0;
+ tm->tcpm_fastopen.cookie.len = 0;
+ }
}
static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
@@ -154,7 +157,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
}
tm->tcpm_addr = *addr;
- tcpm_suck_dst(tm, dst);
+ tcpm_suck_dst(tm, dst, true);
if (likely(!reclaim)) {
tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
@@ -171,7 +174,7 @@ out_unlock:
static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
{
if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
- tcpm_suck_dst(tm, dst);
+ tcpm_suck_dst(tm, dst, false);
}
#define TCP_METRICS_RECLAIM_DEPTH 5
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f35f2df..0f01788 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -93,15 +93,15 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
const struct tcphdr *th)
{
struct tcp_options_received tmp_opt;
- const u8 *hash_location;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
bool paws_reject = false;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
+ tcp_parse_options(skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
+ tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
tmp_opt.ts_recent = tcptw->tw_ts_recent;
tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
@@ -288,6 +288,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tcptw->tw_rcv_wnd = tcp_receive_window(tp);
tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+ tcptw->tw_ts_offset = tp->tsoffset;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
@@ -386,32 +387,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
struct tcp_sock *newtp = tcp_sk(newsk);
- struct tcp_sock *oldtp = tcp_sk(sk);
- struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
-
- /* TCP Cookie Transactions require space for the cookie pair,
- * as it differs for each connection. There is no need to
- * copy any s_data_payload stored at the original socket.
- * Failure will prevent resuming the connection.
- *
- * Presumed copied, in order of appearance:
- * cookie_in_always, cookie_out_never
- */
- if (oldcvp != NULL) {
- struct tcp_cookie_values *newcvp =
- kzalloc(sizeof(*newtp->cookie_values),
- GFP_ATOMIC);
-
- if (newcvp != NULL) {
- kref_init(&newcvp->kref);
- newcvp->cookie_desired =
- oldcvp->cookie_desired;
- newtp->cookie_values = newcvp;
- } else {
- /* Not Yet Implemented */
- newtp->cookie_values = NULL;
- }
- }
/* Now setup tcp_sock */
newtp->pred_flags = 0;
@@ -420,8 +395,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rcv_nxt = treq->rcv_isn + 1;
newtp->snd_sml = newtp->snd_una =
- newtp->snd_nxt = newtp->snd_up =
- treq->snt_isn + 1 + tcp_s_data_size(oldtp);
+ newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
tcp_prequeue_init(newtp);
INIT_LIST_HEAD(&newtp->tsq_node);
@@ -438,6 +412,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tcp_enable_early_retrans(newtp);
+ newtp->tlp_high_seq = 0;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -446,10 +421,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
*/
newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
- newtp->bytes_acked = 0;
-
- newtp->frto_counter = 0;
- newtp->frto_highmark = 0;
if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
!try_module_get(newicsk->icsk_ca_ops->owner))
@@ -458,8 +429,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
skb_queue_head_init(&newtp->out_of_order_queue);
- newtp->write_seq = newtp->pushed_seq =
- treq->snt_isn + 1 + tcp_s_data_size(oldtp);
+ newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
newtp->rx_opt.saw_tstamp = 0;
@@ -500,6 +470,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rx_opt.ts_recent_stamp = 0;
newtp->tcp_header_len = sizeof(struct tcphdr);
}
+ newtp->tsoffset = 0;
#ifdef CONFIG_TCP_MD5SIG
newtp->md5sig_info = NULL; /*XXX*/
if (newtp->af_specific->md5_lookup(sk, newsk))
@@ -535,7 +506,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
bool fastopen)
{
struct tcp_options_received tmp_opt;
- const u8 *hash_location;
struct sock *child;
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
@@ -545,7 +515,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
+ tcp_parse_options(skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
@@ -581,8 +551,13 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
*
* Note that even if there is new data in the SYN packet
* they will be thrown away too.
+ *
+ * Reset timer after retransmitting SYNACK, similar to
+ * the idea of fast retransmit in recovery.
*/
- inet_rtx_syn_ack(sk, req);
+ if (!inet_rtx_syn_ack(sk, req))
+ req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
+ TCP_RTO_MAX) + jiffies;
return NULL;
}
@@ -645,7 +620,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
*/
if ((flg & TCP_FLAG_ACK) && !fastopen &&
(TCP_SKB_CB(skb)->ack_seq !=
- tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
+ tcp_rsk(req)->snt_isn + 1))
return sk;
/* Also, it would be not so bad idea to check rcv_tsecr, which
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5d45159..ec335fa 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,28 +65,24 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
-int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
-EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
-
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned int prior_packets = tp->packets_out;
tcp_advance_send_head(sk, skb);
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- /* Don't override Nagle indefinitely with F-RTO */
- if (tp->frto_counter == 2)
- tp->frto_counter = 3;
-
tp->packets_out += tcp_skb_pcount(skb);
- if (!prior_packets || tp->early_retrans_delayed)
+ if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
tcp_rearm_rto(sk);
+ }
}
/* SND.NXT, if window was not shrunk.
@@ -314,7 +310,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
tp->ecn_flags = 0;
- if (sysctl_tcp_ecn == 1) {
+ if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
}
@@ -384,7 +380,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_TS (1 << 1)
#define OPTION_MD5 (1 << 2)
#define OPTION_WSCALE (1 << 3)
-#define OPTION_COOKIE_EXTENSION (1 << 4)
#define OPTION_FAST_OPEN_COOKIE (1 << 8)
struct tcp_out_options {
@@ -398,36 +393,6 @@ struct tcp_out_options {
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
};
-/* The sysctl int routines are generic, so check consistency here.
- */
-static u8 tcp_cookie_size_check(u8 desired)
-{
- int cookie_size;
-
- if (desired > 0)
- /* previously specified */
- return desired;
-
- cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size);
- if (cookie_size <= 0)
- /* no default specified */
- return 0;
-
- if (cookie_size <= TCP_COOKIE_MIN)
- /* value too small, specify minimum */
- return TCP_COOKIE_MIN;
-
- if (cookie_size >= TCP_COOKIE_MAX)
- /* value too large, specify maximum */
- return TCP_COOKIE_MAX;
-
- if (cookie_size & 1)
- /* 8-bit multiple, illegal, fix it */
- cookie_size++;
-
- return (u8)cookie_size;
-}
-
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -446,27 +411,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
{
u16 options = opts->options; /* mungable copy */
- /* Having both authentication and cookies for security is redundant,
- * and there's certainly not enough room. Instead, the cookie-less
- * extension variant is proposed.
- *
- * Consider the pessimal case with authentication. The options
- * could look like:
- * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
- */
if (unlikely(OPTION_MD5 & options)) {
- if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
- *ptr++ = htonl((TCPOPT_COOKIE << 24) |
- (TCPOLEN_COOKIE_BASE << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
- } else {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
- }
- options &= ~OPTION_COOKIE_EXTENSION;
+ *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
/* overload cookie hash location */
opts->hash_location = (__u8 *)ptr;
ptr += 4;
@@ -495,44 +442,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
*ptr++ = htonl(opts->tsecr);
}
- /* Specification requires after timestamp, so do it now.
- *
- * Consider the pessimal case without authentication. The options
- * could look like:
- * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
- */
- if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
- __u8 *cookie_copy = opts->hash_location;
- u8 cookie_size = opts->hash_size;
-
- /* 8-bit multiple handled in tcp_cookie_size_check() above,
- * and elsewhere.
- */
- if (0x2 & cookie_size) {
- __u8 *p = (__u8 *)ptr;
-
- /* 16-bit multiple */
- *p++ = TCPOPT_COOKIE;
- *p++ = TCPOLEN_COOKIE_BASE + cookie_size;
- *p++ = *cookie_copy++;
- *p++ = *cookie_copy++;
- ptr++;
- cookie_size -= 2;
- } else {
- /* 32-bit multiple */
- *ptr++ = htonl(((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_COOKIE << 8) |
- TCPOLEN_COOKIE_BASE) +
- cookie_size);
- }
-
- if (cookie_size > 0) {
- memcpy(ptr, cookie_copy, cookie_size);
- ptr += (cookie_size / 4);
- }
- }
-
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -591,11 +500,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
struct tcp_md5sig_key **md5)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_cookie_values *cvp = tp->cookie_values;
unsigned int remaining = MAX_TCP_OPTION_SPACE;
- u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
- tcp_cookie_size_check(cvp->cookie_desired) :
- 0;
struct tcp_fastopen_request *fastopen = tp->fastopen_req;
#ifdef CONFIG_TCP_MD5SIG
@@ -622,7 +527,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
opts->options |= OPTION_TS;
- opts->tsval = TCP_SKB_CB(skb)->when;
+ opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
@@ -647,52 +552,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
tp->syn_fastopen = 1;
}
}
- /* Note that timestamps are required by the specification.
- *
- * Odd numbers of bytes are prohibited by the specification, ensuring
- * that the cookie is 16-bit aligned, and the resulting cookie pair is
- * 32-bit aligned.
- */
- if (*md5 == NULL &&
- (OPTION_TS & opts->options) &&
- cookie_size > 0) {
- int need = TCPOLEN_COOKIE_BASE + cookie_size;
-
- if (0x2 & need) {
- /* 32-bit multiple */
- need += 2; /* NOPs */
-
- if (need > remaining) {
- /* try shrinking cookie to fit */
- cookie_size -= 2;
- need -= 4;
- }
- }
- while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
- cookie_size -= 4;
- need -= 4;
- }
- if (TCP_COOKIE_MIN <= cookie_size) {
- opts->options |= OPTION_COOKIE_EXTENSION;
- opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
- opts->hash_size = cookie_size;
-
- /* Remember for future incarnations. */
- cvp->cookie_desired = cookie_size;
-
- if (cvp->cookie_desired != cvp->cookie_pair_size) {
- /* Currently use random bytes as a nonce,
- * assuming these are completely unpredictable
- * by hostile users of the same system.
- */
- get_random_bytes(&cvp->cookie_pair[0],
- cookie_size);
- cvp->cookie_pair_size = cookie_size;
- }
- remaining -= need;
- }
- }
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -702,14 +562,10 @@ static unsigned int tcp_synack_options(struct sock *sk,
unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5,
- struct tcp_extend_values *xvp,
struct tcp_fastopen_cookie *foc)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
- u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
- xvp->cookie_plus :
- 0;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
@@ -757,28 +613,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
remaining -= need;
}
}
- /* Similar rationale to tcp_syn_options() applies here, too.
- * If the <SYN> options fit, the same options should fit now!
- */
- if (*md5 == NULL &&
- ireq->tstamp_ok &&
- cookie_plus > TCPOLEN_COOKIE_BASE) {
- int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
-
- if (0x2 & need) {
- /* 32-bit multiple */
- need += 2; /* NOPs */
- }
- if (need <= remaining) {
- opts->options |= OPTION_COOKIE_EXTENSION;
- opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
- remaining -= need;
- } else {
- /* There's no error return, so flag it. */
- xvp->cookie_out_never = 1; /* true */
- opts->hash_size = 0;
- }
- }
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -806,7 +641,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
if (likely(tp->rx_opt.tstamp_ok)) {
opts->options |= OPTION_TS;
- opts->tsval = tcb ? tcb->when : 0;
+ opts->tsval = tcb ? tcb->when + tp->tsoffset : 0;
opts->tsecr = tp->rx_opt.ts_recent;
size += TCPOLEN_TSTAMP_ALIGNED;
}
@@ -953,7 +788,7 @@ void __init tcp_tasklet_init(void)
* We cant xmit new skbs from this context, as we might already
* hold qdisc lock.
*/
-static void tcp_wfree(struct sk_buff *skb)
+void tcp_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
@@ -1012,6 +847,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
__net_timestamp(skb);
if (likely(clone_it)) {
+ const struct sk_buff *fclone = skb + 1;
+
+ if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
+ fclone->fclone == SKB_FCLONE_CLONE))
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
else
@@ -1032,11 +874,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
&md5);
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
- if (tcp_packets_in_flight(tp) == 0) {
+ if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
- skb->ooo_okay = 1;
- } else
- skb->ooo_okay = 0;
+
+ /* if no packet is in qdisc/device queue, then allow XPS to select
+ * another queue.
+ */
+ skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -1298,7 +1142,6 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
eat = min_t(int, len, skb_headlen(skb));
if (eat) {
__skb_pull(skb, eat);
- skb->avail_size -= eat;
len -= eat;
if (!len)
return;
@@ -1331,7 +1174,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
/* Remove acked data from a packet in the transmit queue. */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
- if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
return -ENOMEM;
__pskb_trim_head(skb, len);
@@ -1351,8 +1194,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
return 0;
}
-/* Calculate MSS. Not accounting for SACKs here. */
-int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+/* Calculate MSS not accounting any TCP options. */
+static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1381,13 +1224,17 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
/* Then reserve room for full set of TCP options and 8 bytes of data */
if (mss_now < 48)
mss_now = 48;
-
- /* Now subtract TCP options size, not including SACKs */
- mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
-
return mss_now;
}
+/* Calculate MSS. Not accounting for SACKs here. */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+ /* Subtract TCP options size, not including SACKs */
+ return __tcp_mtu_to_mss(sk, pmtu) -
+ (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
+}
+
/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
@@ -1629,11 +1476,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
if (nonagle & TCP_NAGLE_PUSH)
return true;
- /* Don't use the nagle rule for urgent data (or for the final FIN).
- * Nagle can be ignored during F-RTO too (see RFC4138).
- */
- if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
- (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ /* Don't use the nagle rule for urgent data (or for the final FIN). */
+ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
return true;
if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
@@ -1806,8 +1650,11 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
goto send_now;
}
- /* Ok, it looks like it is advisable to defer. */
- tp->tso_deferred = 1 | (jiffies << 1);
+ /* Ok, it looks like it is advisable to defer.
+ * Do not rearm the timer if already set to not break TCP ACK clocking.
+ */
+ if (!tp->tso_deferred)
+ tp->tso_deferred = 1 | (jiffies << 1);
return true;
@@ -1955,6 +1802,9 @@ static int tcp_mtu_probe(struct sock *sk)
* snd_up-64k-mss .. snd_up cannot be large. However, taking into
* account rare use of URG, this is not a big flaw.
*
+ * Send at most one packet when push_one > 0. Temporarily ignore
+ * cwnd limit to force at most one packet out when push_one == 2.
+
* Returns true, if no segments are in flight and we have queued segments,
* but cannot send anything now because of SWS or another problem.
*/
@@ -1990,8 +1840,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
goto repair; /* Skip network transmission */
cwnd_quota = tcp_cwnd_test(tp, skb);
- if (!cwnd_quota)
- break;
+ if (!cwnd_quota) {
+ if (push_one == 2)
+ /* Force out a loss probe pkt. */
+ cwnd_quota = 1;
+ else
+ break;
+ }
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
break;
@@ -2045,10 +1900,129 @@ repair:
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
+
+ /* Send one loss probe per tail loss episode. */
+ if (push_one != 2)
+ tcp_schedule_loss_probe(sk);
tcp_cwnd_validate(sk);
return false;
}
- return !tp->packets_out && tcp_send_head(sk);
+ return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
+}
+
+bool tcp_schedule_loss_probe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 timeout, tlp_time_stamp, rto_time_stamp;
+ u32 rtt = tp->srtt >> 3;
+
+ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
+ return false;
+ /* No consecutive loss probes. */
+ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
+ tcp_rearm_rto(sk);
+ return false;
+ }
+ /* Don't do any loss probe on a Fast Open connection before 3WHS
+ * finishes.
+ */
+ if (sk->sk_state == TCP_SYN_RECV)
+ return false;
+
+ /* TLP is only scheduled when next timer event is RTO. */
+ if (icsk->icsk_pending != ICSK_TIME_RETRANS)
+ return false;
+
+ /* Schedule a loss probe in 2*RTT for SACK capable connections
+ * in Open state, that are either limited by cwnd or application.
+ */
+ if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
+ !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+ return false;
+
+ if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
+ tcp_send_head(sk))
+ return false;
+
+ /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
+ * for delayed ack when there's one outstanding packet.
+ */
+ timeout = rtt << 1;
+ if (tp->packets_out == 1)
+ timeout = max_t(u32, timeout,
+ (rtt + (rtt >> 1) + TCP_DELACK_MAX));
+ timeout = max_t(u32, timeout, msecs_to_jiffies(10));
+
+ /* If RTO is shorter, just schedule TLP in its place. */
+ tlp_time_stamp = tcp_time_stamp + timeout;
+ rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
+ if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
+ s32 delta = rto_time_stamp - tcp_time_stamp;
+ if (delta > 0)
+ timeout = delta;
+ }
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+ TCP_RTO_MAX);
+ return true;
+}
+
+/* When probe timeout (PTO) fires, send a new segment if one exists, else
+ * retransmit the last segment.
+ */
+void tcp_send_loss_probe(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int pcount;
+ int mss = tcp_current_mss(sk);
+ int err = -1;
+
+ if (tcp_send_head(sk) != NULL) {
+ err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+ goto rearm_timer;
+ }
+
+ /* At most one outstanding TLP retransmission. */
+ if (tp->tlp_high_seq)
+ goto rearm_timer;
+
+ /* Retransmit last segment. */
+ skb = tcp_write_queue_tail(sk);
+ if (WARN_ON(!skb))
+ goto rearm_timer;
+
+ pcount = tcp_skb_pcount(skb);
+ if (WARN_ON(!pcount))
+ goto rearm_timer;
+
+ if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
+ if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
+ goto rearm_timer;
+ skb = tcp_write_queue_tail(sk);
+ }
+
+ if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
+ goto rearm_timer;
+
+ /* Probe with zero data doesn't trigger fast recovery. */
+ if (skb->len > 0)
+ err = __tcp_retransmit_skb(sk, skb);
+
+ /* Record snd_nxt for loss detection. */
+ if (likely(!err))
+ tp->tlp_high_seq = tp->snd_nxt;
+
+rearm_timer:
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
+
+ if (likely(!err))
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBES);
+ return;
}
/* Push out any pending frames which were held back due to
@@ -2382,8 +2356,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
*/
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- /* make sure skb->data is aligned on arches that require it */
- if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) {
+ /* make sure skb->data is aligned on arches that require it
+ * and check if ack-trimming & collapsing extended the headroom
+ * beyond what csum_start can cover.
+ */
+ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
+ skb_headroom(skb) >= 0xFFFF)) {
struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
GFP_ATOMIC);
return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
@@ -2669,32 +2647,24 @@ int tcp_send_synack(struct sock *sk)
* sk: listener socket
* dst: dst entry attached to the SYNACK
* req: request_sock pointer
- * rvp: request_values pointer
*
* Allocate one skb and build a SYNACK packet.
* @dst is consumed : Caller should not use it again.
*/
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- struct request_values *rvp,
struct tcp_fastopen_cookie *foc)
{
struct tcp_out_options opts;
- struct tcp_extend_values *xvp = tcp_xv(rvp);
struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_sock *tp = tcp_sk(sk);
- const struct tcp_cookie_values *cvp = tp->cookie_values;
struct tcphdr *th;
struct sk_buff *skb;
struct tcp_md5sig_key *md5;
int tcp_header_size;
int mss;
- int s_data_desired = 0;
- if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
- s_data_desired = cvp->s_data_desired;
- skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired,
- sk_gfp_atomic(sk, GFP_ATOMIC));
+ skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC));
if (unlikely(!skb)) {
dst_release(dst);
return NULL;
@@ -2703,6 +2673,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
skb_reserve(skb, MAX_TCP_HEADER);
skb_dst_set(skb, dst);
+ security_skb_owned_by(skb, sk);
mss = dst_metric_advmss(dst);
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
@@ -2736,9 +2707,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
else
#endif
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_header_size = tcp_synack_options(sk, req, mss,
- skb, &opts, &md5, xvp, foc)
- + sizeof(*th);
+ tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
+ foc) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -2756,40 +2726,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
TCPHDR_SYN | TCPHDR_ACK);
- if (OPTION_COOKIE_EXTENSION & opts.options) {
- if (s_data_desired) {
- u8 *buf = skb_put(skb, s_data_desired);
-
- /* copy data directly from the listening socket. */
- memcpy(buf, cvp->s_data_payload, s_data_desired);
- TCP_SKB_CB(skb)->end_seq += s_data_desired;
- }
-
- if (opts.hash_size > 0) {
- __u32 workspace[SHA_WORKSPACE_WORDS];
- u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
- u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
-
- /* Secret recipe depends on the Timestamp, (future)
- * Sequence and Acknowledgment Numbers, Initiator
- * Cookie, and others handled by IP variant caller.
- */
- *tail-- ^= opts.tsval;
- *tail-- ^= tcp_rsk(req)->rcv_isn + 1;
- *tail-- ^= TCP_SKB_CB(skb)->seq + 1;
-
- /* recommended */
- *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
- *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
-
- sha_transform((__u32 *)&xvp->cookie_bakery[0],
- (char *)mess,
- &workspace[0]);
- opts.hash_location =
- (__u8 *)&xvp->cookie_bakery[0];
- }
- }
-
th->seq = htonl(TCP_SKB_CB(skb)->seq);
/* XXX data is queued and acked as is. No buffer/window check */
th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
@@ -2930,7 +2866,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
*/
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
- space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+ space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
MAX_TCP_OPTION_SPACE;
syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 4526fe6..d4943f6 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -234,7 +234,7 @@ static __init int tcpprobe_init(void)
if (!tcp_probe.log)
goto err0;
- if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &tcpprobe_fops))
+ if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops))
goto err0;
ret = register_jprobe(&tcp_jprobe);
@@ -244,7 +244,7 @@ static __init int tcpprobe_init(void)
pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize);
return 0;
err1:
- proc_net_remove(&init_net, procname);
+ remove_proc_entry(procname, init_net.proc_net);
err0:
kfree(tcp_probe.log);
return ret;
@@ -253,7 +253,7 @@ module_init(tcpprobe_init);
static __exit void tcpprobe_exit(void)
{
- proc_net_remove(&init_net, procname);
+ remove_proc_entry(procname, init_net.proc_net);
unregister_jprobe(&tcp_jprobe);
kfree(tcp_probe.log);
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b78aac3..4b85e6f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -342,10 +342,6 @@ void tcp_retransmit_timer(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- if (tp->early_retrans_delayed) {
- tcp_resume_early_retransmit(sk);
- return;
- }
if (tp->fastopen_rsk) {
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
@@ -360,6 +356,8 @@ void tcp_retransmit_timer(struct sock *sk)
WARN_ON(tcp_write_queue_empty(sk));
+ tp->tlp_high_seq = 0;
+
if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
!((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
/* Receiver dastardly shrinks window. Our retransmits
@@ -418,11 +416,7 @@ void tcp_retransmit_timer(struct sock *sk)
NET_INC_STATS_BH(sock_net(sk), mib_idx);
}
- if (tcp_use_frto(sk)) {
- tcp_enter_frto(sk);
- } else {
- tcp_enter_loss(sk, 0);
- }
+ tcp_enter_loss(sk, 0);
if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
/* Retransmission failed because of local congestion,
@@ -495,13 +489,20 @@ void tcp_write_timer_handler(struct sock *sk)
}
event = icsk->icsk_pending;
- icsk->icsk_pending = 0;
switch (event) {
+ case ICSK_TIME_EARLY_RETRANS:
+ tcp_resume_early_retransmit(sk);
+ break;
+ case ICSK_TIME_LOSS_PROBE:
+ tcp_send_loss_probe(sk);
+ break;
case ICSK_TIME_RETRANS:
+ icsk->icsk_pending = 0;
tcp_retransmit_timer(sk);
break;
case ICSK_TIME_PROBE0:
+ icsk->icsk_pending = 0;
tcp_probe_timer(sk);
break;
}
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 1b91bf4..76a1e23 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -236,7 +236,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
break;
- case CA_EVENT_FRTO:
+ case CA_EVENT_LOSS:
tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
/* Update RTT_min when next ack arrives */
w->reset_rtt_min = 1;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1f4d405..0bf5d39 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -139,6 +139,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
{
struct sock *sk2;
struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
sk_nulls_for_each(sk2, node, &hslot->head)
if (net_eq(sock_net(sk2), net) &&
@@ -147,6 +148,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (!sk2->sk_reuseport || !sk->sk_reuseport ||
+ !uid_eq(uid, sock_i_uid(sk2))) &&
(*saddr_comp)(sk, sk2)) {
if (bitmap)
__set_bit(udp_sk(sk2)->udp_port_hash >> log,
@@ -169,6 +172,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
{
struct sock *sk2;
struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
int res = 0;
spin_lock(&hslot2->lock);
@@ -179,6 +183,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (!sk2->sk_reuseport || !sk->sk_reuseport ||
+ !uid_eq(uid, sock_i_uid(sk2))) &&
(*saddr_comp)(sk, sk2)) {
res = 1;
break;
@@ -337,26 +343,26 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
!ipv6_only_sock(sk)) {
struct inet_sock *inet = inet_sk(sk);
- score = (sk->sk_family == PF_INET ? 1 : 0);
+ score = (sk->sk_family == PF_INET ? 2 : 1);
if (inet->inet_rcv_saddr) {
if (inet->inet_rcv_saddr != daddr)
return -1;
- score += 2;
+ score += 4;
}
if (inet->inet_daddr) {
if (inet->inet_daddr != saddr)
return -1;
- score += 2;
+ score += 4;
}
if (inet->inet_dport) {
if (inet->inet_dport != sport)
return -1;
- score += 2;
+ score += 4;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
return -1;
- score += 2;
+ score += 4;
}
}
return score;
@@ -365,7 +371,6 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
/*
* In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
*/
-#define SCORE2_MAX (1 + 2 + 2 + 2)
static inline int compute_score2(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum, int dif)
@@ -380,21 +385,21 @@ static inline int compute_score2(struct sock *sk, struct net *net,
if (inet->inet_num != hnum)
return -1;
- score = (sk->sk_family == PF_INET ? 1 : 0);
+ score = (sk->sk_family == PF_INET ? 2 : 1);
if (inet->inet_daddr) {
if (inet->inet_daddr != saddr)
return -1;
- score += 2;
+ score += 4;
}
if (inet->inet_dport) {
if (inet->inet_dport != sport)
return -1;
- score += 2;
+ score += 4;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
return -1;
- score += 2;
+ score += 4;
}
}
return score;
@@ -409,19 +414,29 @@ static struct sock *udp4_lib_lookup2(struct net *net,
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
- int score, badness;
+ int score, badness, matches = 0, reuseport = 0;
+ u32 hash = 0;
begin:
result = NULL;
- badness = -1;
+ badness = 0;
udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
score = compute_score2(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
result = sk;
badness = score;
- if (score == SCORE2_MAX)
- goto exact_match;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ hash = inet_ehashfn(net, daddr, hnum,
+ saddr, htons(sport));
+ matches = 1;
+ }
+ } else if (score == badness && reuseport) {
+ matches++;
+ if (((u64)hash * matches) >> 32 == 0)
+ result = sk;
+ hash = next_pseudo_random32(hash);
}
}
/*
@@ -431,9 +446,7 @@ begin:
*/
if (get_nulls_value(node) != slot2)
goto begin;
-
if (result) {
-exact_match:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score2(result, net, saddr, sport,
@@ -457,7 +470,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
- int score, badness;
+ int score, badness, matches = 0, reuseport = 0;
+ u32 hash = 0;
rcu_read_lock();
if (hslot->count > 10) {
@@ -486,13 +500,24 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
}
begin:
result = NULL;
- badness = -1;
+ badness = 0;
sk_nulls_for_each_rcu(sk, node, &hslot->head) {
score = compute_score(sk, net, saddr, hnum, sport,
daddr, dport, dif);
if (score > badness) {
result = sk;
badness = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ hash = inet_ehashfn(net, daddr, hnum,
+ saddr, htons(sport));
+ matches = 1;
+ }
+ } else if (score == badness && reuseport) {
+ matches++;
+ if (((u64)hash * matches) >> 32 == 0)
+ result = sk;
+ hash = next_pseudo_random32(hash);
}
}
/*
@@ -877,9 +902,9 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
- err = sock_tx_timestamp(sk, &ipc.tx_flags);
- if (err)
- return err;
+
+ sock_tx_timestamp(sk, &ipc.tx_flags);
+
if (msg->msg_controllen) {
err = ip_cmsg_send(sock_net(sk), msg, &ipc);
if (err)
@@ -971,7 +996,7 @@ back_from_confirm:
sizeof(struct udphdr), &ipc, &rt,
msg->msg_flags);
err = PTR_ERR(skb);
- if (skb && !IS_ERR(skb))
+ if (!IS_ERR_OR_NULL(skb))
err = udp_send_skb(skb, fl4);
goto out;
}
@@ -1106,6 +1131,8 @@ static unsigned int first_packet_length(struct sock *sk)
spin_lock_bh(&rcvq->lock);
while ((skb = skb_peek(rcvq)) != NULL &&
udp_lib_checksum_complete(skb)) {
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS,
+ IS_UDPLITE(sk));
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
IS_UDPLITE(sk));
atomic_inc(&sk->sk_drops);
@@ -1261,8 +1288,10 @@ out:
csum_copy_err:
slow = lock_sock_fast(sk);
- if (!skb_kill_datagram(sk, skb, flags))
+ if (!skb_kill_datagram(sk, skb, flags)) {
+ UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ }
unlock_sock_fast(sk, slow);
if (noblock)
@@ -1488,7 +1517,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (rcu_access_pointer(sk->sk_filter) &&
udp_lib_checksum_complete(skb))
- goto drop;
+ goto csum_error;
if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf))
@@ -1508,6 +1537,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return rc;
+csum_error:
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
@@ -1724,6 +1755,7 @@ csum_error:
proto == IPPROTO_UDPLITE ? "Lite" : "",
&saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
ulen);
+ UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
kfree_skb(skb);
@@ -1737,9 +1769,16 @@ int udp_rcv(struct sk_buff *skb)
void udp_destroy_sock(struct sock *sk)
{
+ struct udp_sock *up = udp_sk(sk);
bool slow = lock_sock_fast(sk);
udp_flush_pending_frames(sk);
unlock_sock_fast(sk, slow);
+ if (static_key_false(&udp_encap_needed) && up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = ACCESS_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
}
/*
@@ -2061,7 +2100,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v)
int udp_seq_open(struct inode *inode, struct file *file)
{
- struct udp_seq_afinfo *afinfo = PDE(inode)->data;
+ struct udp_seq_afinfo *afinfo = PDE_DATA(inode);
struct udp_iter_state *s;
int err;
@@ -2097,7 +2136,7 @@ EXPORT_SYMBOL(udp_proc_register);
void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo)
{
- proc_net_remove(net, afinfo->name);
+ remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(udp_proc_unregister);
@@ -2247,31 +2286,91 @@ void __init udp_init(void)
int udp4_ufo_send_check(struct sk_buff *skb)
{
- const struct iphdr *iph;
- struct udphdr *uh;
-
- if (!pskb_may_pull(skb, sizeof(*uh)))
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
return -EINVAL;
- iph = ip_hdr(skb);
- uh = udp_hdr(skb);
+ if (likely(!skb->encapsulation)) {
+ const struct iphdr *iph;
+ struct udphdr *uh;
- uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
- IPPROTO_UDP, 0);
- skb->csum_start = skb_transport_header(skb) - skb->head;
- skb->csum_offset = offsetof(struct udphdr, check);
- skb->ip_summed = CHECKSUM_PARTIAL;
+ iph = ip_hdr(skb);
+ uh = udp_hdr(skb);
+
+ uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
+ IPPROTO_UDP, 0);
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ }
return 0;
}
+static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ int mac_len = skb->mac_len;
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ __be16 protocol = skb->protocol;
+ netdev_features_t enc_features;
+ int outer_hlen;
+
+ if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+ goto out;
+
+ skb->encapsulation = 0;
+ __skb_pull(skb, tnl_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+ skb->protocol = htons(ETH_P_TEB);
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ segs = skb_mac_gso_segment(skb, enc_features);
+ if (!segs || IS_ERR(segs))
+ goto out;
+
+ outer_hlen = skb_tnl_header_len(skb);
+ skb = segs;
+ do {
+ struct udphdr *uh;
+ int udp_offset = outer_hlen - tnl_hlen;
+
+ skb->mac_len = mac_len;
+
+ skb_push(skb, outer_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb_set_transport_header(skb, udp_offset);
+ uh = udp_hdr(skb);
+ uh->len = htons(skb->len - udp_offset);
+
+ /* csum segment if tunnel sets skb with csum. */
+ if (unlikely(uh->check)) {
+ struct iphdr *iph = ip_hdr(skb);
+
+ uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ skb->len - udp_offset,
+ IPPROTO_UDP, 0);
+ uh->check = csum_fold(skb_checksum(skb, udp_offset,
+ skb->len - udp_offset, 0));
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ }
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->protocol = protocol;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int mss;
- int offset;
- __wsum csum;
-
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;
@@ -2280,7 +2379,9 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
/* Packet is from an untrusted source, reset gso_segs. */
int type = skb_shinfo(skb)->gso_type;
- if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) ||
+ if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_GRE) ||
!(type & (SKB_GSO_UDP))))
goto out;
@@ -2290,20 +2391,27 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
goto out;
}
- /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
- * do checksum of UDP packets sent as multiple IP fragments.
- */
- offset = skb_checksum_start_offset(skb);
- csum = skb_checksum(skb, offset, skb->len - offset, 0);
- offset += skb->csum_offset;
- *(__sum16 *)(skb->data + offset) = csum_fold(csum);
- skb->ip_summed = CHECKSUM_NONE;
-
/* Fragment the skb. IP headers of the fragments are updated in
* inet_gso_segment()
*/
- segs = skb_segment(skb, features);
+ if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL)
+ segs = skb_udp_tunnel_segment(skb, features);
+ else {
+ int offset;
+ __wsum csum;
+
+ /* Do software UFO. Complete and fill in the UDP checksum as
+ * HW cannot do checksum of UDP packets sent as multiple
+ * IP fragments.
+ */
+ offset = skb_checksum_start_offset(skb);
+ csum = skb_checksum(skb, offset, skb->len - offset, 0);
+ offset += skb->csum_offset;
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ segs = skb_segment(skb, features);
+ }
out:
return segs;
}
-
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 505b30a..7927db0 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -25,7 +25,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
return 0;
return inet_sk_diag_fill(sk, NULL, skb, req,
- sk_user_ns(NETLINK_CB(cb->skb).ssk),
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}
@@ -64,14 +64,14 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
goto out;
err = -ENOMEM;
- rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
- sizeof(struct inet_diag_meminfo) +
- 64)), GFP_KERNEL);
+ rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) + 64,
+ GFP_KERNEL);
if (!rep)
goto out;
err = inet_sk_diag_fill(sk, NULL, rep, req,
- sk_user_ns(NETLINK_CB(in_skb).ssk),
+ sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, 0, nlh);
if (err < 0) {
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6..1f12c8b 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -132,7 +132,7 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
* header and optional ESP marker bytes) and then modify the
* protocol to ESP, and then call into the transform receiver.
*/
- if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
goto drop;
/* Now we can update and verify the packet length... */
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index ddee0a0..eb1dd4d 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -103,8 +103,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family);
- /* DS disclosed */
- top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos,
+ /* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */
+ if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP)
+ top_iph->tos = 0;
+ else
+ top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos;
+ top_iph->tos = INET_ECN_encapsulate(top_iph->tos,
XFRM_MODE_SKB_CB(skb)->tos);
flags = x->props.flags;
@@ -142,8 +146,8 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
for_each_input_rcu(rcv_notify_handlers, handler)
handler->handler(skb);
- if (skb_cloned(skb) &&
- (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err)
goto out;
if (x->props.flags & XFRM_STATE_DECAP_DSCP)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 3be0ac2..9a459be 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -262,21 +262,56 @@ static struct ctl_table xfrm4_policy_table[] = {
{ }
};
-static struct ctl_table_header *sysctl_hdr;
-#endif
-
-static void __init xfrm4_policy_init(void)
+static int __net_init xfrm4_net_init(struct net *net)
{
- xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = xfrm4_policy_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh;
+ }
+
+ hdr = register_net_sysctl(net, "net/ipv4", table);
+ if (!hdr)
+ goto err_reg;
+
+ net->ipv4.xfrm4_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
}
-static void __exit xfrm4_policy_fini(void)
+static void __net_exit xfrm4_net_exit(struct net *net)
{
-#ifdef CONFIG_SYSCTL
- if (sysctl_hdr)
- unregister_net_sysctl_table(sysctl_hdr);
+ struct ctl_table *table;
+
+ if (net->ipv4.xfrm4_hdr == NULL)
+ return;
+
+ table = net->ipv4.xfrm4_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.xfrm4_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static struct pernet_operations __net_initdata xfrm4_net_ops = {
+ .init = xfrm4_net_init,
+ .exit = xfrm4_net_exit,
+};
#endif
- xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
+
+static void __init xfrm4_policy_init(void)
+{
+ xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
}
void __init xfrm4_init(void)
@@ -286,8 +321,7 @@ void __init xfrm4_init(void)
xfrm4_state_init();
xfrm4_policy_init();
#ifdef CONFIG_SYSCTL
- sysctl_hdr = register_net_sysctl(&init_net, "net/ipv4",
- xfrm4_policy_table);
+ register_pernet_subsys(&xfrm4_net_ops);
#endif
}