summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/devinet.c17
-rw-r--r--net/ipv4/fib_rules.c25
-rw-r--r--net/ipv4/igmp.c80
-rw-r--r--net/ipv4/ip_gre.c4
-rw-r--r--net/ipv4/ip_input.c8
-rw-r--r--net/ipv4/ip_tunnel.c67
-rw-r--r--net/ipv4/ip_vti.c528
-rw-r--r--net/ipv4/ipip.c3
-rw-r--r--net/ipv4/ipmr.c15
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c2
-rw-r--r--net/ipv4/ping.c2
-rw-r--r--net/ipv4/proc.c7
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/route.c24
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp.c15
-rw-r--r--net/ipv4/tcp_fastopen.c13
-rw-r--r--net/ipv4/tcp_input.c156
-rw-r--r--net/ipv4/tcp_ipv4.c27
-rw-r--r--net/ipv4/tcp_minisocks.c8
-rw-r--r--net/ipv4/tcp_output.c3
-rw-r--r--net/ipv4/tcp_probe.c67
-rw-r--r--net/ipv4/udp.c5
23 files changed, 418 insertions, 667 deletions
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 34ca6d5..a1b5bcb 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -73,6 +73,8 @@ static struct ipv4_devconf ipv4_devconf = {
[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+ [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
},
};
@@ -83,6 +85,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = {
[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+ [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+ [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
},
};
@@ -1126,10 +1130,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
if (len < (int) sizeof(ifr))
break;
memset(&ifr, 0, sizeof(struct ifreq));
- if (ifa->ifa_label)
- strcpy(ifr.ifr_name, ifa->ifa_label);
- else
- strcpy(ifr.ifr_name, dev->name);
+ strcpy(ifr.ifr_name, ifa->ifa_label);
(*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
(*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
@@ -2097,11 +2098,15 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+ DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
+ "force_igmp_version"),
+ DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL,
+ "igmpv2_unsolicited_report_interval"),
+ DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
+ "igmpv3_unsolicited_report_interval"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
- DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
- "force_igmp_version"),
DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
"promote_secondaries"),
DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 26aa65d..523be38 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -101,6 +101,30 @@ errout:
return err;
}
+static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+{
+ struct fib_result *result = (struct fib_result *) arg->result;
+ struct net_device *dev = result->fi->fib_dev;
+
+ /* do not accept result if the route does
+ * not meet the required prefix length
+ */
+ if (result->prefixlen <= rule->suppress_prefixlen)
+ goto suppress_route;
+
+ /* do not accept result if the route uses a device
+ * belonging to a forbidden interface group
+ */
+ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+ goto suppress_route;
+
+ return false;
+
+suppress_route:
+ if (!(arg->flags & FIB_LOOKUP_NOREF))
+ fib_info_put(result->fi);
+ return true;
+}
static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
@@ -267,6 +291,7 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
.rule_size = sizeof(struct fib4_rule),
.addr_size = sizeof(u32),
.action = fib4_rule_action,
+ .suppress = fib4_rule_suppress,
.match = fib4_rule_match,
.configure = fib4_rule_configure,
.delete = fib4_rule_delete,
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index cd71190..d6c0e64 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -88,6 +88,7 @@
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/times.h>
+#include <linux/pkt_sched.h>
#include <net/net_namespace.h>
#include <net/arp.h>
@@ -113,7 +114,8 @@
#define IGMP_V1_Router_Present_Timeout (400*HZ)
#define IGMP_V2_Router_Present_Timeout (400*HZ)
-#define IGMP_Unsolicited_Report_Interval (10*HZ)
+#define IGMP_V2_Unsolicited_Report_Interval (10*HZ)
+#define IGMP_V3_Unsolicited_Report_Interval (1*HZ)
#define IGMP_Query_Response_Interval (10*HZ)
#define IGMP_Unsolicited_Report_Count 2
@@ -138,6 +140,29 @@
((in_dev)->mr_v2_seen && \
time_before(jiffies, (in_dev)->mr_v2_seen)))
+static int unsolicited_report_interval(struct in_device *in_dev)
+{
+ int interval_ms, interval_jiffies;
+
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+ interval_ms = IN_DEV_CONF_GET(
+ in_dev,
+ IGMPV2_UNSOLICITED_REPORT_INTERVAL);
+ else /* v3 */
+ interval_ms = IN_DEV_CONF_GET(
+ in_dev,
+ IGMPV3_UNSOLICITED_REPORT_INTERVAL);
+
+ interval_jiffies = msecs_to_jiffies(interval_ms);
+
+ /* _timer functions can't handle a delay of 0 jiffies so ensure
+ * we always return a positive value.
+ */
+ if (interval_jiffies <= 0)
+ interval_jiffies = 1;
+ return interval_jiffies;
+}
+
static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
static void igmpv3_clear_delrec(struct in_device *in_dev);
@@ -315,6 +340,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
if (size < 256)
return NULL;
}
+ skb->priority = TC_PRIO_CONTROL;
igmp_skb_size(skb) = size;
rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
@@ -670,6 +696,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
ip_rt_put(rt);
return -1;
}
+ skb->priority = TC_PRIO_CONTROL;
skb_dst_set(skb, &rt->dst);
@@ -719,7 +746,8 @@ static void igmp_ifc_timer_expire(unsigned long data)
igmpv3_send_cr(in_dev);
if (in_dev->mr_ifc_count) {
in_dev->mr_ifc_count--;
- igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
+ igmp_ifc_start_timer(in_dev,
+ unsolicited_report_interval(in_dev));
}
__in_dev_put(in_dev);
}
@@ -744,7 +772,7 @@ static void igmp_timer_expire(unsigned long data)
if (im->unsolicit_count) {
im->unsolicit_count--;
- igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+ igmp_start_timer(im, unsolicited_report_interval(in_dev));
}
im->reporter = 1;
spin_unlock(&im->lock);
@@ -1323,16 +1351,17 @@ out:
EXPORT_SYMBOL(ip_mc_inc_group);
/*
- * Resend IGMP JOIN report; used for bonding.
- * Called with rcu_read_lock()
+ * Resend IGMP JOIN report; used by netdev notifier.
*/
-void ip_mc_rejoin_groups(struct in_device *in_dev)
+static void ip_mc_rejoin_groups(struct in_device *in_dev)
{
#ifdef CONFIG_IP_MULTICAST
struct ip_mc_list *im;
int type;
- for_each_pmc_rcu(in_dev, im) {
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, im) {
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
@@ -1349,7 +1378,6 @@ void ip_mc_rejoin_groups(struct in_device *in_dev)
}
#endif
}
-EXPORT_SYMBOL(ip_mc_rejoin_groups);
/*
* A socket has left a multicast group on device dev
@@ -2735,8 +2763,42 @@ static struct pernet_operations igmp_net_ops = {
.exit = igmp_net_exit,
};
+static int igmp_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct in_device *in_dev;
+
+ switch (event) {
+ case NETDEV_RESEND_IGMP:
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev)
+ ip_mc_rejoin_groups(in_dev);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block igmp_notifier = {
+ .notifier_call = igmp_netdev_event,
+};
+
int __init igmp_mc_proc_init(void)
{
- return register_pernet_subsys(&igmp_net_ops);
+ int err;
+
+ err = register_pernet_subsys(&igmp_net_ops);
+ if (err)
+ return err;
+ err = register_netdevice_notifier(&igmp_notifier);
+ if (err)
+ goto reg_notif_fail;
+ return 0;
+
+reg_notif_fail:
+ unregister_pernet_subsys(&igmp_net_ops);
+ return err;
}
#endif
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8d6939e..d7aea4c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -534,7 +534,7 @@ static int __net_init ipgre_init_net(struct net *net)
static void __net_exit ipgre_exit_net(struct net *net)
{
struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
- ip_tunnel_delete_net(itn);
+ ip_tunnel_delete_net(itn, &ipgre_link_ops);
}
static struct pernet_operations ipgre_net_ops = {
@@ -767,7 +767,7 @@ static int __net_init ipgre_tap_init_net(struct net *net)
static void __net_exit ipgre_tap_exit_net(struct net *net)
{
struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
- ip_tunnel_delete_net(itn);
+ ip_tunnel_delete_net(itn, &ipgre_tap_ops);
}
static struct pernet_operations ipgre_tap_net_ops = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 15e3e68..054a3e9 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -141,6 +141,7 @@
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
+#include <net/inet_ecn.h>
#include <linux/netfilter_ipv4.h>
#include <net/xfrm.h>
#include <linux/mroute.h>
@@ -410,6 +411,13 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
+ BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
+ BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
+ BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
+ IP_ADD_STATS_BH(dev_net(dev),
+ IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
+ max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index ca1cb2d..830de3f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -350,7 +350,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
struct flowi4 fl4;
struct rtable *rt;
- rt = ip_route_output_tunnel(dev_net(dev), &fl4,
+ rt = ip_route_output_tunnel(tunnel->net, &fl4,
tunnel->parms.iph.protocol,
iph->daddr, iph->saddr,
tunnel->parms.o_key,
@@ -365,7 +365,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
}
if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+ tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
if (tdev) {
hlen = tdev->hard_header_len + tdev->needed_headroom;
@@ -454,15 +454,16 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
tstats->rx_bytes += skb->len;
u64_stats_update_end(&tstats->syncp);
- if (tunnel->net != dev_net(tunnel->dev))
- skb_scrub_packet(skb);
-
if (tunnel->dev->type == ARPHRD_ETHER) {
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
} else {
skb->dev = tunnel->dev;
}
+
+ if (!net_eq(tunnel->net, dev_net(tunnel->dev)))
+ skb_scrub_packet(skb);
+
gro_cells_receive(&tunnel->gro_cells, skb);
return 0;
@@ -613,7 +614,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
- if (tunnel->net != dev_net(dev))
+ if (!net_eq(tunnel->net, dev_net(dev)))
skb_scrub_packet(skb);
if (tunnel->err_count > 0) {
@@ -653,7 +654,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
}
- err = iptunnel_xmit(dev_net(dev), rt, skb,
+ err = iptunnel_xmit(tunnel->net, rt, skb,
fl4.saddr, fl4.daddr, protocol,
ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df);
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
@@ -820,11 +821,10 @@ static void ip_tunnel_dev_free(struct net_device *dev)
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
- struct net *net = dev_net(dev);
struct ip_tunnel *tunnel = netdev_priv(dev);
struct ip_tunnel_net *itn;
- itn = net_generic(net, tunnel->ip_tnl_net_id);
+ itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
if (itn->fb_tunnel_dev != dev) {
ip_tunnel_del(netdev_priv(dev));
@@ -838,56 +838,68 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
{
struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
struct ip_tunnel_parm parms;
+ unsigned int i;
- itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL);
- if (!itn->tunnels)
- return -ENOMEM;
+ for (i = 0; i < IP_TNL_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&itn->tunnels[i]);
if (!ops) {
itn->fb_tunnel_dev = NULL;
return 0;
}
+
memset(&parms, 0, sizeof(parms));
if (devname)
strlcpy(parms.name, devname, IFNAMSIZ);
rtnl_lock();
itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+ /* FB netdevice is special: we have one, and only one per netns.
+ * Allowing to move it to another netns is clearly unsafe.
+ */
+ if (!IS_ERR(itn->fb_tunnel_dev))
+ itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
rtnl_unlock();
- if (IS_ERR(itn->fb_tunnel_dev)) {
- kfree(itn->tunnels);
- return PTR_ERR(itn->fb_tunnel_dev);
- }
- return 0;
+ return PTR_RET(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
-static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head)
+static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
+ struct rtnl_link_ops *ops)
{
+ struct net *net = dev_net(itn->fb_tunnel_dev);
+ struct net_device *dev, *aux;
int h;
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == ops)
+ unregister_netdevice_queue(dev, head);
+
for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
struct ip_tunnel *t;
struct hlist_node *n;
struct hlist_head *thead = &itn->tunnels[h];
hlist_for_each_entry_safe(t, n, thead, hash_node)
- unregister_netdevice_queue(t->dev, head);
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev, head);
}
if (itn->fb_tunnel_dev)
unregister_netdevice_queue(itn->fb_tunnel_dev, head);
}
-void ip_tunnel_delete_net(struct ip_tunnel_net *itn)
+void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
LIST_HEAD(list);
rtnl_lock();
- ip_tunnel_destroy(itn, &list);
+ ip_tunnel_destroy(itn, &list, ops);
unregister_netdevice_many(&list);
rtnl_unlock();
- kfree(itn->tunnels);
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
@@ -929,23 +941,21 @@ EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
struct ip_tunnel_parm *p)
{
- struct ip_tunnel *t, *nt;
- struct net *net = dev_net(dev);
+ struct ip_tunnel *t;
struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct net *net = tunnel->net;
struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
if (dev == itn->fb_tunnel_dev)
return -EINVAL;
- nt = netdev_priv(dev);
-
t = ip_tunnel_find(itn, p, dev->type);
if (t) {
if (t->dev != dev)
return -EEXIST;
} else {
- t = nt;
+ t = tunnel;
if (dev->type != ARPHRD_ETHER) {
unsigned int nflags = 0;
@@ -984,6 +994,7 @@ int ip_tunnel_init(struct net_device *dev)
}
tunnel->dev = dev;
+ tunnel->net = dev_net(dev);
strcpy(tunnel->parms.name, dev->name);
iph->version = 4;
iph->ihl = 5;
@@ -994,8 +1005,8 @@ EXPORT_SYMBOL_GPL(ip_tunnel_init);
void ip_tunnel_uninit(struct net_device *dev)
{
- struct net *net = dev_net(dev);
struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct net *net = tunnel->net;
struct ip_tunnel_net *itn;
itn = net_generic(net, tunnel->ip_tnl_net_id);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 17cc0ff..e805e7b 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -44,176 +44,10 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
-#define HASH_SIZE 16
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1))
-
static struct rtnl_link_ops vti_link_ops __read_mostly;
static int vti_net_id __read_mostly;
-struct vti_net {
- struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_wc[1];
- struct ip_tunnel __rcu **tunnels[4];
-
- struct net_device *fb_tunnel_dev;
-};
-
-static int vti_fb_tunnel_init(struct net_device *dev);
static int vti_tunnel_init(struct net_device *dev);
-static void vti_tunnel_setup(struct net_device *dev);
-static void vti_dev_free(struct net_device *dev);
-static int vti_tunnel_bind_dev(struct net_device *dev);
-
-#define VTI_XMIT(stats1, stats2) do { \
- int err; \
- int pkt_len = skb->len; \
- err = dst_output(skb); \
- if (net_xmit_eval(err) == 0) { \
- u64_stats_update_begin(&(stats1)->syncp); \
- (stats1)->tx_bytes += pkt_len; \
- (stats1)->tx_packets++; \
- u64_stats_update_end(&(stats1)->syncp); \
- } else { \
- (stats2)->tx_errors++; \
- (stats2)->tx_aborted_errors++; \
- } \
-} while (0)
-
-
-static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
- __be32 remote, __be32 local)
-{
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(local);
- struct ip_tunnel *t;
- struct vti_net *ipn = net_generic(net, vti_net_id);
-
- for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
- if (local == t->parms.iph.saddr &&
- remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
- return t;
- for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
- if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
- return t;
-
- for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
- if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
- return t;
-
- for_each_ip_tunnel_rcu(t, ipn->tunnels_wc[0])
- if (t && (t->dev->flags&IFF_UP))
- return t;
- return NULL;
-}
-
-static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn,
- struct ip_tunnel_parm *parms)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- unsigned h = 0;
- int prio = 0;
-
- if (remote) {
- prio |= 2;
- h ^= HASH(remote);
- }
- if (local) {
- prio |= 1;
- h ^= HASH(local);
- }
- return &ipn->tunnels[prio][h];
-}
-
-static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn,
- struct ip_tunnel *t)
-{
- return __vti_bucket(ipn, &t->parms);
-}
-
-static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp;
- struct ip_tunnel *iter;
-
- for (tp = vti_bucket(ipn, t);
- (iter = rtnl_dereference(*tp)) != NULL;
- tp = &iter->next) {
- if (t == iter) {
- rcu_assign_pointer(*tp, t->next);
- break;
- }
- }
-}
-
-static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp = vti_bucket(ipn, t);
-
- rcu_assign_pointer(t->next, rtnl_dereference(*tp));
- rcu_assign_pointer(*tp, t);
-}
-
-static struct ip_tunnel *vti_tunnel_locate(struct net *net,
- struct ip_tunnel_parm *parms,
- int create)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- struct ip_tunnel *t, *nt;
- struct ip_tunnel __rcu **tp;
- struct net_device *dev;
- char name[IFNAMSIZ];
- struct vti_net *ipn = net_generic(net, vti_net_id);
-
- for (tp = __vti_bucket(ipn, parms);
- (t = rtnl_dereference(*tp)) != NULL;
- tp = &t->next) {
- if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
- return t;
- }
- if (!create)
- return NULL;
-
- if (parms->name[0])
- strlcpy(name, parms->name, IFNAMSIZ);
- else
- strcpy(name, "vti%d");
-
- dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup);
- if (dev == NULL)
- return NULL;
-
- dev_net_set(dev, net);
-
- nt = netdev_priv(dev);
- nt->parms = *parms;
- dev->rtnl_link_ops = &vti_link_ops;
-
- vti_tunnel_bind_dev(dev);
-
- if (register_netdevice(dev) < 0)
- goto failed_free;
-
- dev_hold(dev);
- vti_tunnel_link(ipn, nt);
- return nt;
-
-failed_free:
- free_netdev(dev);
- return NULL;
-}
-
-static void vti_tunnel_uninit(struct net_device *dev)
-{
- struct net *net = dev_net(dev);
- struct vti_net *ipn = net_generic(net, vti_net_id);
-
- vti_tunnel_unlink(ipn, netdev_priv(dev));
- dev_put(dev);
-}
static int vti_err(struct sk_buff *skb, u32 info)
{
@@ -222,6 +56,8 @@ static int vti_err(struct sk_buff *skb, u32 info)
* 8 bytes of packet payload. It means, that precise relaying of
* ICMP in the real Internet is absolutely infeasible.
*/
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
struct iphdr *iph = (struct iphdr *)skb->data;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
@@ -252,7 +88,8 @@ static int vti_err(struct sk_buff *skb, u32 info)
err = -ENOENT;
- t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
if (t == NULL)
goto out;
@@ -281,8 +118,11 @@ static int vti_rcv(struct sk_buff *skb)
{
struct ip_tunnel *tunnel;
const struct iphdr *iph = ip_hdr(skb);
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
- tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->saddr, iph->daddr, 0);
if (tunnel != NULL) {
struct pcpu_tstats *tstats;
@@ -311,7 +151,6 @@ static int vti_rcv(struct sk_buff *skb)
static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct pcpu_tstats *tstats;
struct iphdr *tiph = &tunnel->parms.iph;
u8 tos;
struct rtable *rt; /* Route to the other host */
@@ -319,6 +158,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
struct iphdr *old_iph = ip_hdr(skb);
__be32 dst = tiph->daddr;
struct flowi4 fl4;
+ int err;
if (skb->protocol != htons(ETH_P_IP))
goto tx_error;
@@ -367,8 +207,10 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
nf_reset(skb);
skb->dev = skb_dst(skb)->dev;
- tstats = this_cpu_ptr(dev->tstats);
- VTI_XMIT(tstats, &dev->stats);
+ err = dst_output(skb);
+ if (net_xmit_eval(err) == 0)
+ err = skb->len;
+ iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
return NETDEV_TX_OK;
tx_error_icmp:
@@ -379,198 +221,57 @@ tx_error:
return NETDEV_TX_OK;
}
-static int vti_tunnel_bind_dev(struct net_device *dev)
-{
- struct net_device *tdev = NULL;
- struct ip_tunnel *tunnel;
- struct iphdr *iph;
-
- tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
-
- if (iph->daddr) {
- struct rtable *rt;
- struct flowi4 fl4;
- memset(&fl4, 0, sizeof(fl4));
- flowi4_init_output(&fl4, tunnel->parms.link,
- be32_to_cpu(tunnel->parms.i_key),
- RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
- IPPROTO_IPIP, 0,
- iph->daddr, iph->saddr, 0, 0);
- rt = ip_route_output_key(dev_net(dev), &fl4);
- if (!IS_ERR(rt)) {
- tdev = rt->dst.dev;
- ip_rt_put(rt);
- }
- dev->flags |= IFF_POINTOPOINT;
- }
-
- if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
-
- if (tdev) {
- dev->hard_header_len = tdev->hard_header_len +
- sizeof(struct iphdr);
- dev->mtu = tdev->mtu;
- }
- dev->iflink = tunnel->parms.link;
- return dev->mtu;
-}
-
static int
vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
int err = 0;
struct ip_tunnel_parm p;
- struct ip_tunnel *t;
- struct net *net = dev_net(dev);
- struct vti_net *ipn = net_generic(net, vti_net_id);
-
- switch (cmd) {
- case SIOCGETTUNNEL:
- t = NULL;
- if (dev == ipn->fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
- sizeof(p))) {
- err = -EFAULT;
- break;
- }
- t = vti_tunnel_locate(net, &p, 0);
- }
- if (t == NULL)
- t = netdev_priv(dev);
- memcpy(&p, &t->parms, sizeof(p));
- p.i_flags |= GRE_KEY | VTI_ISVTI;
- p.o_flags |= GRE_KEY;
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- err = -EFAULT;
- break;
-
- case SIOCADDTUNNEL:
- case SIOCCHGTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
- err = -EINVAL;
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
p.iph.ihl != 5)
- goto done;
-
- t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
-
- if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t != NULL) {
- if (t->dev != dev) {
- err = -EEXIST;
- break;
- }
- } else {
- if (((dev->flags&IFF_POINTOPOINT) &&
- !p.iph.daddr) ||
- (!(dev->flags&IFF_POINTOPOINT) &&
- p.iph.daddr)) {
- err = -EINVAL;
- break;
- }
- t = netdev_priv(dev);
- vti_tunnel_unlink(ipn, t);
- synchronize_net();
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- t->parms.i_key = p.i_key;
- t->parms.o_key = p.o_key;
- t->parms.iph.protocol = IPPROTO_IPIP;
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- vti_tunnel_link(ipn, t);
- netdev_state_change(dev);
- }
- }
-
- if (t) {
- err = 0;
- if (cmd == SIOCCHGTUNNEL) {
- t->parms.i_key = p.i_key;
- t->parms.o_key = p.o_key;
- if (t->parms.link != p.link) {
- t->parms.link = p.link;
- vti_tunnel_bind_dev(dev);
- netdev_state_change(dev);
- }
- }
- p.i_flags |= GRE_KEY | VTI_ISVTI;
- p.o_flags |= GRE_KEY;
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms,
- sizeof(p)))
- err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
- break;
+ return -EINVAL;
+ }
- case SIOCDELTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- if (dev == ipn->fb_tunnel_dev) {
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
- sizeof(p)))
- goto done;
- err = -ENOENT;
-
- t = vti_tunnel_locate(net, &p, 0);
- if (t == NULL)
- goto done;
- err = -EPERM;
- if (t->dev == ipn->fb_tunnel_dev)
- goto done;
- dev = t->dev;
- }
- unregister_netdevice(dev);
- err = 0;
- break;
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
- default:
- err = -EINVAL;
+ if (cmd != SIOCDELTUNNEL) {
+ p.i_flags |= GRE_KEY | VTI_ISVTI;
+ p.o_flags |= GRE_KEY;
}
-done:
- return err;
-}
-
-static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
- if (new_mtu < 68 || new_mtu > 0xFFF8)
- return -EINVAL;
- dev->mtu = new_mtu;
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
return 0;
}
static const struct net_device_ops vti_netdev_ops = {
.ndo_init = vti_tunnel_init,
- .ndo_uninit = vti_tunnel_uninit,
+ .ndo_uninit = ip_tunnel_uninit,
.ndo_start_xmit = vti_tunnel_xmit,
.ndo_do_ioctl = vti_tunnel_ioctl,
- .ndo_change_mtu = vti_tunnel_change_mtu,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
};
-static void vti_dev_free(struct net_device *dev)
+static void vti_tunnel_setup(struct net_device *dev)
{
- free_percpu(dev->tstats);
- free_netdev(dev);
+ dev->netdev_ops = &vti_netdev_ops;
+ ip_tunnel_setup(dev, vti_net_id);
}
-static void vti_tunnel_setup(struct net_device *dev)
+static int vti_tunnel_init(struct net_device *dev)
{
- dev->netdev_ops = &vti_netdev_ops;
- dev->destructor = vti_dev_free;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ memcpy(dev->dev_addr, &iph->saddr, 4);
+ memcpy(dev->broadcast, &iph->daddr, 4);
dev->type = ARPHRD_TUNNEL;
dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
@@ -581,38 +282,18 @@ static void vti_tunnel_setup(struct net_device *dev)
dev->features |= NETIF_F_NETNS_LOCAL;
dev->features |= NETIF_F_LLTX;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
-}
-static int vti_tunnel_init(struct net_device *dev)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
-
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
-
- memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
- memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
-
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- return 0;
+ return ip_tunnel_init(dev);
}
-static int __net_init vti_fb_tunnel_init(struct net_device *dev)
+static void __net_init vti_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
- struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id);
iph->version = 4;
iph->protocol = IPPROTO_IPIP;
iph->ihl = 5;
-
- dev_hold(dev);
- rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
- return 0;
}
static struct xfrm_tunnel vti_handler __read_mostly = {
@@ -621,76 +302,30 @@ static struct xfrm_tunnel vti_handler __read_mostly = {
.priority = 1,
};
-static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head)
-{
- int prio;
-
- for (prio = 1; prio < 4; prio++) {
- int h;
- for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t;
-
- t = rtnl_dereference(ipn->tunnels[prio][h]);
- while (t != NULL) {
- unregister_netdevice_queue(t->dev, head);
- t = rtnl_dereference(t->next);
- }
- }
- }
-}
-
static int __net_init vti_init_net(struct net *net)
{
int err;
- struct vti_net *ipn = net_generic(net, vti_net_id);
-
- ipn->tunnels[0] = ipn->tunnels_wc;
- ipn->tunnels[1] = ipn->tunnels_l;
- ipn->tunnels[2] = ipn->tunnels_r;
- ipn->tunnels[3] = ipn->tunnels_r_l;
-
- ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
- "ip_vti0",
- vti_tunnel_setup);
- if (!ipn->fb_tunnel_dev) {
- err = -ENOMEM;
- goto err_alloc_dev;
- }
- dev_net_set(ipn->fb_tunnel_dev, net);
-
- err = vti_fb_tunnel_init(ipn->fb_tunnel_dev);
- if (err)
- goto err_reg_dev;
- ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops;
+ struct ip_tunnel_net *itn;
- err = register_netdev(ipn->fb_tunnel_dev);
+ err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
if (err)
- goto err_reg_dev;
+ return err;
+ itn = net_generic(net, vti_net_id);
+ vti_fb_tunnel_init(itn->fb_tunnel_dev);
return 0;
-
-err_reg_dev:
- vti_dev_free(ipn->fb_tunnel_dev);
-err_alloc_dev:
- /* nothing */
- return err;
}
static void __net_exit vti_exit_net(struct net *net)
{
- struct vti_net *ipn = net_generic(net, vti_net_id);
- LIST_HEAD(list);
-
- rtnl_lock();
- vti_destroy_tunnels(ipn, &list);
- unregister_netdevice_many(&list);
- rtnl_unlock();
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+ ip_tunnel_delete_net(itn, &vti_link_ops);
}
static struct pernet_operations vti_net_ops = {
.init = vti_init_net,
.exit = vti_exit_net,
.id = &vti_net_id,
- .size = sizeof(struct vti_net),
+ .size = sizeof(struct ip_tunnel_net),
};
static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -728,78 +363,19 @@ static void vti_netlink_parms(struct nlattr *data[],
static int vti_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
- struct ip_tunnel *nt;
- struct net *net = dev_net(dev);
- struct vti_net *ipn = net_generic(net, vti_net_id);
- int mtu;
- int err;
-
- nt = netdev_priv(dev);
- vti_netlink_parms(data, &nt->parms);
-
- if (vti_tunnel_locate(net, &nt->parms, 0))
- return -EEXIST;
+ struct ip_tunnel_parm parms;
- mtu = vti_tunnel_bind_dev(dev);
- if (!tb[IFLA_MTU])
- dev->mtu = mtu;
-
- err = register_netdevice(dev);
- if (err)
- goto out;
-
- dev_hold(dev);
- vti_tunnel_link(ipn, nt);
-
-out:
- return err;
+ vti_netlink_parms(data, &parms);
+ return ip_tunnel_newlink(dev, tb, &parms);
}
static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
- struct ip_tunnel *t, *nt;
- struct net *net = dev_net(dev);
- struct vti_net *ipn = net_generic(net, vti_net_id);
struct ip_tunnel_parm p;
- int mtu;
-
- if (dev == ipn->fb_tunnel_dev)
- return -EINVAL;
- nt = netdev_priv(dev);
vti_netlink_parms(data, &p);
-
- t = vti_tunnel_locate(net, &p, 0);
-
- if (t) {
- if (t->dev != dev)
- return -EEXIST;
- } else {
- t = nt;
-
- vti_tunnel_unlink(ipn, t);
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- t->parms.i_key = p.i_key;
- t->parms.o_key = p.o_key;
- if (dev->type != ARPHRD_ETHER) {
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- }
- vti_tunnel_link(ipn, t);
- netdev_state_change(dev);
- }
-
- if (t->parms.link != p.link) {
- t->parms.link = p.link;
- mtu = vti_tunnel_bind_dev(dev);
- if (!tb[IFLA_MTU])
- dev->mtu = mtu;
- netdev_state_change(dev);
- }
-
- return 0;
+ return ip_tunnel_changelink(dev, tb, &p);
}
static size_t vti_get_size(const struct net_device *dev)
@@ -865,7 +441,7 @@ static int __init vti_init(void)
err = xfrm4_mode_tunnel_input_register(&vti_handler);
if (err < 0) {
unregister_pernet_device(&vti_net_ops);
- pr_info(KERN_INFO "vti init: can't register tunnel\n");
+ pr_info("vti init: can't register tunnel\n");
}
err = rtnl_link_register(&vti_link_ops);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 51fc2a1..87bd295 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -286,7 +286,6 @@ static void ipip_tunnel_setup(struct net_device *dev)
dev->flags = IFF_NOARP;
dev->iflink = 0;
dev->addr_len = 4;
- dev->features |= NETIF_F_NETNS_LOCAL;
dev->features |= NETIF_F_LLTX;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
@@ -437,7 +436,7 @@ static int __net_init ipip_init_net(struct net *net)
static void __net_exit ipip_exit_net(struct net *net)
{
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
- ip_tunnel_delete_net(itn);
+ ip_tunnel_delete_net(itn, &ipip_link_ops);
}
static struct pernet_operations ipip_net_ops = {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 132a096..bacc0bc 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -127,9 +127,9 @@ static struct kmem_cache *mrt_cachep __read_mostly;
static struct mr_table *ipmr_new_table(struct net *net, u32 id);
static void ipmr_free_table(struct mr_table *mrt);
-static int ip_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc_cache *cache,
- int local);
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *cache,
+ int local);
static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert);
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
@@ -1795,9 +1795,9 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
/* "local" means that we should preserve one skb (for local delivery) */
-static int ip_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc_cache *cache,
- int local)
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *cache,
+ int local)
{
int psend = -1;
int vif, ct;
@@ -1903,14 +1903,13 @@ last_forward:
ipmr_queue_xmit(net, mrt, skb2, cache, psend);
} else {
ipmr_queue_xmit(net, mrt, skb, cache, psend);
- return 0;
+ return;
}
}
dont_forward:
if (!local)
kfree_skb(skb);
- return 0;
}
static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 30e4de9..00352ce 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -118,7 +118,7 @@ static int masq_device_event(struct notifier_block *this,
NF_CT_ASSERT(dev->ifindex != 0);
nf_ct_iterate_cleanup(net, device_cmp,
- (void *)(long)dev->ifindex);
+ (void *)(long)dev->ifindex, 0, 0);
}
return NOTIFY_DONE;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 746427c..d7d9882 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1082,7 +1082,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
__u16 srcp = ntohs(inet->inet_sport);
seq_printf(f, "%5d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
bucket, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 463bd12..4a03358 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -111,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
SNMP_MIB_SENTINEL
};
-/* Following RFC4293 items are displayed in /proc/net/netstat */
+/* Following items are displayed in /proc/net/netstat */
static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
@@ -125,7 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+ /* Non RFC4293 fields */
SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
+ SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+ SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+ SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index dd44e0a..41d8450 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -987,7 +987,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
srcp = inet->inet_num;
seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
i, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a9a54a2..727f436 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,7 +112,8 @@
#define RT_FL_TOS(oldflp4) \
((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
-#define IP_MAX_MTU 0xFFF0
+/* IPv4 datagram length is stored into 16bit field (tot_len) */
+#define IP_MAX_MTU 0xFFFF
#define RT_GC_TIMEOUT (300*HZ)
@@ -435,12 +436,12 @@ static inline int ip_rt_proc_init(void)
static inline bool rt_is_expired(const struct rtable *rth)
{
- return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
+ return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}
void rt_cache_flush(struct net *net)
{
- rt_genid_bump(net);
+ rt_genid_bump_ipv4(net);
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -1227,10 +1228,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
mtu = 576;
}
- if (mtu > IP_MAX_MTU)
- mtu = IP_MAX_MTU;
-
- return mtu;
+ return min_t(unsigned int, mtu, IP_MAX_MTU);
}
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
@@ -1458,7 +1456,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
#endif
rth->dst.output = ip_rt_bug;
- rth->rt_genid = rt_genid(dev_net(dev));
+ rth->rt_genid = rt_genid_ipv4(dev_net(dev));
rth->rt_flags = RTCF_MULTICAST;
rth->rt_type = RTN_MULTICAST;
rth->rt_is_input= 1;
@@ -1589,7 +1587,7 @@ static int __mkroute_input(struct sk_buff *skb,
goto cleanup;
}
- rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
+ rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
rth->rt_flags = flags;
rth->rt_type = res->type;
rth->rt_is_input = 1;
@@ -1760,7 +1758,7 @@ local_input:
rth->dst.tclassid = itag;
#endif
- rth->rt_genid = rt_genid(net);
+ rth->rt_genid = rt_genid_ipv4(net);
rth->rt_flags = flags|RTCF_LOCAL;
rth->rt_type = res.type;
rth->rt_is_input = 1;
@@ -1945,7 +1943,7 @@ add:
rth->dst.output = ip_output;
- rth->rt_genid = rt_genid(dev_net(dev_out));
+ rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
rth->rt_flags = flags;
rth->rt_type = type;
rth->rt_is_input = 0;
@@ -2227,7 +2225,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_iif = ort->rt_iif;
rt->rt_pmtu = ort->rt_pmtu;
- rt->rt_genid = rt_genid(net);
+ rt->rt_genid = rt_genid_ipv4(net);
rt->rt_flags = ort->rt_flags;
rt->rt_type = ort->rt_type;
rt->rt_gateway = ort->rt_gateway;
@@ -2665,7 +2663,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
static __net_init int rt_genid_init(struct net *net)
{
- atomic_set(&net->rt_genid, 0);
+ atomic_set(&net->ipv4.rt_genid, 0);
atomic_set(&net->fnhe_genid, 0);
get_random_bytes(&net->ipv4.dev_addr_genid,
sizeof(net->ipv4.dev_addr_genid));
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 610e324..8ed7c32 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -559,6 +559,13 @@ static struct ctl_table ipv4_table[] = {
.extra1 = &one,
},
{
+ .procname = "tcp_notsent_lowat",
+ .data = &sysctl_tcp_notsent_lowat,
+ .maxlen = sizeof(sysctl_tcp_notsent_lowat),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "tcp_rmem",
.data = &sysctl_tcp_rmem,
.maxlen = sizeof(sysctl_tcp_rmem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b2f6c74..4e42c03 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -410,10 +410,6 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_sync_mss = tcp_sync_mss;
- /* Presumed zeroed, in order of appearance:
- * cookie_in_always, cookie_out_never,
- * s_data_constant, s_data_in, s_data_out
- */
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
@@ -499,7 +495,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
mask |= POLLIN | POLLRDNORM;
if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+ if (sk_stream_is_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else { /* send SIGIO later */
set_bit(SOCK_ASYNC_NOSPACE,
@@ -510,7 +506,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
* wspace test but before the flags are set,
* IO signal will be lost.
*/
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+ if (sk_stream_is_writeable(sk))
mask |= POLLOUT | POLLWRNORM;
}
} else
@@ -2638,6 +2634,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
tp->tsoffset = val - tcp_time_stamp;
break;
+ case TCP_NOTSENT_LOWAT:
+ tp->notsent_lowat = val;
+ sk->sk_write_space(sk);
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -2854,6 +2854,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_TIMESTAMP:
val = tcp_time_stamp + tp->tsoffset;
break;
+ case TCP_NOTSENT_LOWAT:
+ val = tp->notsent_lowat;
+ break;
default:
return -ENOPROTOOPT;
}
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 8f7ef0a..ab7bd35 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -58,23 +58,22 @@ error: kfree(ctx);
return err;
}
-/* Computes the fastopen cookie for the peer.
- * The peer address is a 128 bits long (pad with zeros for IPv4).
+/* Computes the fastopen cookie for the IP path.
+ * The path is a 128 bits long (pad with zeros for IPv4).
*
* The caller must check foc->len to determine if a valid cookie
* has been generated successfully.
*/
-void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc)
+void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
+ struct tcp_fastopen_cookie *foc)
{
- __be32 peer_addr[4] = { addr, 0, 0, 0 };
+ __be32 path[4] = { src, dst, 0, 0 };
struct tcp_fastopen_context *ctx;
rcu_read_lock();
ctx = rcu_dereference(tcp_fastopen_ctx);
if (ctx) {
- crypto_cipher_encrypt_one(ctx->tfm,
- foc->val,
- (__u8 *)peer_addr);
+ crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path);
foc->len = TCP_FASTOPEN_COOKIE_SIZE;
}
rcu_read_unlock();
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 28af45a..ec492ea 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1048,6 +1048,7 @@ struct tcp_sacktag_state {
int reord;
int fack_count;
int flag;
+ s32 rtt; /* RTT measured by SACKing never-retransmitted data */
};
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1108,7 +1109,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
- bool dup_sack, int pcount)
+ int dup_sack, int pcount, u32 xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
int fack_count = state->fack_count;
@@ -1148,6 +1149,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
state->reord);
if (!after(end_seq, tp->high_seq))
state->flag |= FLAG_ORIG_SACK_ACKED;
+ /* Pick the earliest sequence sacked for RTT */
+ if (state->rtt < 0)
+ state->rtt = tcp_time_stamp - xmit_time;
}
if (sacked & TCPCB_LOST) {
@@ -1205,7 +1209,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
* tcp_highest_sack_seq() when skb is highest_sack.
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
- start_seq, end_seq, dup_sack, pcount);
+ start_seq, end_seq, dup_sack, pcount,
+ TCP_SKB_CB(skb)->when);
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
@@ -1479,7 +1484,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
- tcp_skb_pcount(skb));
+ tcp_skb_pcount(skb),
+ TCP_SKB_CB(skb)->when);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
@@ -1536,7 +1542,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
- u32 prior_snd_una)
+ u32 prior_snd_una, s32 *sack_rtt)
{
struct tcp_sock *tp = tcp_sk(sk);
const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1554,6 +1560,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
state.flag = 0;
state.reord = tp->packets_out;
+ state.rtt = -1;
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
@@ -1737,6 +1744,7 @@ out:
WARN_ON((int)tp->retrans_out < 0);
WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
+ *sack_rtt = state.rtt;
return state.flag;
}
@@ -1869,8 +1877,13 @@ void tcp_enter_loss(struct sock *sk, int how)
}
tcp_verify_left_out(tp);
- tp->reordering = min_t(unsigned int, tp->reordering,
- sysctl_tcp_reordering);
+ /* Timeout in disordered state after receiving substantial DUPACKs
+ * suggests that the degree of reordering is over-estimated.
+ */
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
+ tp->sacked_out >= sysctl_tcp_reordering)
+ tp->reordering = min_t(unsigned int, tp->reordering,
+ sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
@@ -2472,8 +2485,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
- if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
- tcp_moderate_cwnd(tp);
} else {
tcp_cwnd_reduction(sk, prior_unsacked, 0);
}
@@ -2792,65 +2803,51 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_xmit_retransmit_queue(sk);
}
-void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
+static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+ s32 seq_rtt, s32 sack_rtt)
{
- tcp_rtt_estimator(sk, seq_rtt);
- tcp_set_rto(sk);
- inet_csk(sk)->icsk_backoff = 0;
-}
-EXPORT_SYMBOL(tcp_valid_rtt_meas);
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
+ * broken middle-boxes or peers may corrupt TS-ECR fields. But
+ * Karn's algorithm forbids taking RTT if some retransmitted data
+ * is acked (RFC6298).
+ */
+ if (flag & FLAG_RETRANS_DATA_ACKED)
+ seq_rtt = -1;
+
+ if (seq_rtt < 0)
+ seq_rtt = sack_rtt;
-/* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Supersedes RFC1323)
- */
-static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
-{
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
* acknowledges some new data, i.e., only if it advances the
* left edge of the send window.
- *
* See draft-ietf-tcplw-high-performance-00, section 3.3.
- * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
- *
- * Changed: reset backoff as soon as we see the first valid sample.
- * If we do not, we get strongly overestimated rto. With timestamps
- * samples are accepted even from very old segments: f.e., when rtt=1
- * increases to 8, we retransmit 5 times and after 8 seconds delayed
- * answer arrives rto becomes 120 seconds! If at least one of segments
- * in window is lost... Voila. --ANK (010210)
*/
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-}
+ if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
-{
- /* We don't have a timestamp. Can only use
- * packets that are not retransmitted to determine
- * rtt estimates. Also, we must not reset the
- * backoff for rto until we get a non-retransmitted
- * packet. This allows us to deal with a situation
- * where the network delay has increased suddenly.
- * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
- */
+ if (seq_rtt < 0)
+ return false;
- if (flag & FLAG_RETRANS_DATA_ACKED)
- return;
+ tcp_rtt_estimator(sk, seq_rtt);
+ tcp_set_rto(sk);
- tcp_valid_rtt_meas(sk, seq_rtt);
+ /* RFC6298: only reset backoff on valid RTT measurement. */
+ inet_csk(sk)->icsk_backoff = 0;
+ return true;
}
-static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
- const s32 seq_rtt)
+/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
+static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
{
- const struct tcp_sock *tp = tcp_sk(sk);
- /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
- if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
- tcp_ack_saw_tstamp(sk, flag);
- else if (seq_rtt >= 0)
- tcp_ack_no_tstamp(sk, seq_rtt, flag);
+ struct tcp_sock *tp = tcp_sk(sk);
+ s32 seq_rtt = -1;
+
+ if (tp->lsndtime && !tp->total_retrans)
+ seq_rtt = tcp_time_stamp - tp->lsndtime;
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
}
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
@@ -2939,7 +2936,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
- u32 prior_snd_una)
+ u32 prior_snd_una, s32 sack_rtt)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2978,8 +2975,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
- ca_seq_rtt = -1;
- seq_rtt = -1;
} else {
ca_seq_rtt = now - scb->when;
last_ackt = skb->tstamp;
@@ -3031,6 +3026,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
+ if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
+ (flag & FLAG_ACKED))
+ tcp_rearm_rto(sk);
+
if (flag & FLAG_ACKED) {
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
@@ -3040,9 +3039,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_mtup_probe_success(sk);
}
- tcp_ack_update_rtt(sk, flag, seq_rtt);
- tcp_rearm_rto(sk);
-
if (tcp_is_reno(tp)) {
tcp_remove_reno_sacks(sk, pkts_acked);
} else {
@@ -3130,11 +3126,24 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
}
+/* Decide wheather to run the increase function of congestion control. */
static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
- const struct tcp_sock *tp = tcp_sk(sk);
- return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
- !tcp_in_cwnd_reduction(sk);
+ if (tcp_in_cwnd_reduction(sk))
+ return false;
+
+ /* If reordering is high then always grow cwnd whenever data is
+ * delivered regardless of its ordering. Otherwise stay conservative
+ * and only grow cwnd on in-order delivery in Open state, and retain
+ * cwnd in Disordered state (RFC5681). A stretched ACK with
+ * new SACK or ECE mark may first advance cwnd here and later reduce
+ * cwnd in tcp_fastretrans_alert() based on more states.
+ */
+ if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
+ return flag & FLAG_FORWARD_PROGRESS;
+
+ return inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+ flag & FLAG_DATA_ACKED;
}
/* Check that window update is acceptable.
@@ -3274,6 +3283,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
int prior_packets = tp->packets_out;
const int prior_unsacked = tp->packets_out - tp->sacked_out;
int acked = 0; /* Number of packets newly acked */
+ s32 sack_rtt = -1;
/* If the ack is older than previous acks
* then we can probably ignore it.
@@ -3330,7 +3340,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
if (TCP_SKB_CB(skb)->sacked)
- flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_rtt);
if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
flag |= FLAG_ECE;
@@ -3349,21 +3360,18 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
acked = tp->packets_out;
- flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
+ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
acked -= tp->packets_out;
+ /* Advance cwnd if state allows */
+ if (tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, prior_in_flight);
+
if (tcp_ack_is_dubious(sk, flag)) {
- /* Advance CWND, if state allows this. */
- if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, prior_in_flight);
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
- } else {
- if (flag & FLAG_DATA_ACKED)
- tcp_cong_avoid(sk, ack, prior_in_flight);
}
-
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
@@ -3402,7 +3410,8 @@ old_ack:
* If data was DSACKed, see if we can undo a cwnd reduction.
*/
if (TCP_SKB_CB(skb)->sacked) {
- flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_rtt);
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
}
@@ -5624,9 +5633,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* so release it.
*/
if (req) {
- tcp_synack_rtt_meas(sk, req);
tp->total_retrans = req->num_retrans;
-
reqsk_fastopen_remove(sk, req, false);
} else {
/* Make sure socket is routed, for correct metrics. */
@@ -5651,6 +5658,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+ tcp_synack_rtt_meas(sk, req);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b299da5..09d45d7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -821,8 +821,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- u16 queue_mapping,
- bool nocache)
+ u16 queue_mapping)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
@@ -852,7 +851,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
{
- int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
+ int res = tcp_v4_send_synack(sk, NULL, req, 0);
if (!res)
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
@@ -890,7 +889,7 @@ bool tcp_syn_flood_action(struct sock *sk,
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
- if (!lopt->synflood_warned) {
+ if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
lopt->synflood_warned = 1;
pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
proto, ntohs(tcp_hdr(skb)->dest), msg);
@@ -1316,9 +1315,11 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
return true;
}
+
if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
- tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+ tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, valid_foc);
if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
memcmp(&foc->val[0], &valid_foc->val[0],
TCP_FASTOPEN_COOKIE_SIZE) != 0)
@@ -1329,14 +1330,16 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
return true;
} else if (foc->len == 0) { /* Client requesting a cookie */
- tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+ tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, valid_foc);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENCOOKIEREQD);
} else {
/* Client sent a cookie with wrong size. Treat it
* the same as invalid and return a valid one.
*/
- tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+ tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, valid_foc);
}
return false;
}
@@ -1462,7 +1465,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* limitations, they conserve resources and peer is
* evidently real one.
*/
- if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
+ if ((sysctl_tcp_syncookies == 2 ||
+ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
if (!want_cookie)
goto drop;
@@ -1671,8 +1675,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
tcp_initialize_rcv_mss(newsk);
- tcp_synack_rtt_meas(newsk, req);
- newtp->total_retrans = req->num_retrans;
#ifdef CONFIG_TCP_MD5SIG
/* Copy over the MD5 key from the original socket */
@@ -2605,7 +2607,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
long delta = req->expires - jiffies;
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n",
i,
ireq->loc_addr,
ntohs(inet_sk(sk)->inet_sport),
@@ -2663,7 +2665,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
- "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
+ "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n",
i, src, srcp, dest, destp, sk->sk_state,
tp->write_seq - tp->snd_una,
rx_queue,
@@ -2802,6 +2804,7 @@ struct proto tcp_prot = {
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
+ .stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ab1c086..58a3e69 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -411,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tcp_enable_early_retrans(newtp);
newtp->tlp_high_seq = 0;
+ newtp->lsndtime = treq->snt_synack;
+ newtp->total_retrans = req->num_retrans;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -666,12 +668,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!(flg & TCP_FLAG_ACK))
return NULL;
- /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
- if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
- tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
- else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
- tcp_rsk(req)->snt_synack = 0;
-
/* For Fast Open no more processing is needed (sk is the
* child socket).
*/
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 92fde8d..884efff 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
+EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
+
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index d4943f6..301a3ef 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -54,12 +54,16 @@ static const char procname[] = "tcpprobe";
struct tcp_log {
ktime_t tstamp;
- __be32 saddr, daddr;
- __be16 sport, dport;
+ union {
+ struct sockaddr raw;
+ struct sockaddr_in v4;
+ struct sockaddr_in6 v6;
+ } src, dst;
u16 length;
u32 snd_nxt;
u32 snd_una;
u32 snd_wnd;
+ u32 rcv_wnd;
u32 snd_cwnd;
u32 ssthresh;
u32 srtt;
@@ -86,12 +90,36 @@ static inline int tcp_probe_avail(void)
return bufsize - tcp_probe_used() - 1;
}
+#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \
+ do { \
+ si4.sin_family = AF_INET; \
+ si4.sin_port = inet->inet_##mem##port; \
+ si4.sin_addr.s_addr = inet->inet_##mem##addr; \
+ } while (0) \
+
+#if IS_ENABLED(CONFIG_IPV6)
+#define tcp_probe_copy_fl_to_si6(inet, si6, mem) \
+ do { \
+ struct ipv6_pinfo *pi6 = inet->pinet6; \
+ si6.sin6_family = AF_INET6; \
+ si6.sin6_port = inet->inet_##mem##port; \
+ si6.sin6_addr = pi6->mem##addr; \
+ si6.sin6_flowinfo = 0; /* No need here. */ \
+ si6.sin6_scope_id = 0; /* No need here. */ \
+ } while (0)
+#else
+#define tcp_probe_copy_fl_to_si6(fl, si6, mem) \
+ do { \
+ memset(&si6, 0, sizeof(si6)); \
+ } while (0)
+#endif
+
/*
* Hook inserted to be called before each receive packet.
* Note: arguments must match tcp_rcv_established()!
*/
static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned int len)
+ const struct tcphdr *th, unsigned int len)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_sock *inet = inet_sk(sk);
@@ -107,15 +135,25 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
struct tcp_log *p = tcp_probe.log + tcp_probe.head;
p->tstamp = ktime_get();
- p->saddr = inet->inet_saddr;
- p->sport = inet->inet_sport;
- p->daddr = inet->inet_daddr;
- p->dport = inet->inet_dport;
+ switch (sk->sk_family) {
+ case AF_INET:
+ tcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
+ tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
+ break;
+ case AF_INET6:
+ tcp_probe_copy_fl_to_si6(inet, p->src.v6, s);
+ tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d);
+ break;
+ default:
+ BUG();
+ }
+
p->length = skb->len;
p->snd_nxt = tp->snd_nxt;
p->snd_una = tp->snd_una;
p->snd_cwnd = tp->snd_cwnd;
p->snd_wnd = tp->snd_wnd;
+ p->rcv_wnd = tp->rcv_wnd;
p->ssthresh = tcp_current_ssthresh(sk);
p->srtt = tp->srtt >> 3;
@@ -157,13 +195,11 @@ static int tcpprobe_sprint(char *tbuf, int n)
= ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
return scnprintf(tbuf, n,
- "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n",
+ "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
(unsigned long) tv.tv_sec,
(unsigned long) tv.tv_nsec,
- &p->saddr, ntohs(p->sport),
- &p->daddr, ntohs(p->dport),
- p->length, p->snd_nxt, p->snd_una,
- p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt);
+ &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
+ p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
}
static ssize_t tcpprobe_read(struct file *file, char __user *buf,
@@ -223,6 +259,13 @@ static __init int tcpprobe_init(void)
{
int ret = -ENOMEM;
+ /* Warning: if the function signature of tcp_rcv_established,
+ * has been changed, you also have to change the signature of
+ * jtcp_rcv_established, otherwise you end up right here!
+ */
+ BUILD_BUG_ON(__same_type(tcp_rcv_established,
+ jtcp_rcv_established) == 0);
+
init_waitqueue_head(&tcp_probe.wait);
spin_lock_init(&tcp_probe.lock);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 766e6ba..0b24508 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -704,7 +704,7 @@ EXPORT_SYMBOL(udp_flush_pending_frames);
* @src: source IP address
* @dst: destination IP address
*/
-static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
+void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
{
struct udphdr *uh = udp_hdr(skb);
struct sk_buff *frags = skb_shinfo(skb)->frag_list;
@@ -740,6 +740,7 @@ static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
uh->check = CSUM_MANGLED_0;
}
}
+EXPORT_SYMBOL_GPL(udp4_hwcsum);
static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
{
@@ -2158,7 +2159,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
__u16 srcp = ntohs(inet->inet_sport);
seq_printf(f, "%5d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
bucket, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),