author     Scott Wood <scottwood@freescale.com>  2014-04-08 01:00:49 (GMT)
committer  Scott Wood <scottwood@freescale.com>  2014-04-08 19:58:35 (GMT)
commit     47d2261a3fa71cde24263559a4219a25e50d8c89 (patch)
tree       28774d5b330ccf1b777a3af222d8356918328013 /net/ipv4
parent     fb7f27080adc65cd5f341bdf56a1d0c14f316c1b (diff)
parent     5fb9d37f27351e42f002e372074249f92cbdf815 (diff)
Merge branch 'merge' into sdk-v1.6.x
This reverts v3.13-rc3+ (78fd82238d0e5716) to v3.12, except for commits
that I noticed appear relevant to the SDK.

Signed-off-by: Scott Wood <scottwood@freescale.com>

Conflicts:
    arch/powerpc/include/asm/kvm_host.h
    arch/powerpc/kvm/book3s_hv_rmhandlers.S
    arch/powerpc/kvm/book3s_interrupts.S
    arch/powerpc/kvm/e500.c
    arch/powerpc/kvm/e500mc.c
    arch/powerpc/sysdev/fsl_soc.h
    drivers/Kconfig
    drivers/cpufreq/ppc-corenet-cpufreq.c
    drivers/dma/fsldma.c
    drivers/dma/s3c24xx-dma.c
    drivers/misc/Makefile
    drivers/mmc/host/sdhci-of-esdhc.c
    drivers/mtd/devices/m25p80.c
    drivers/net/ethernet/freescale/gianfar.h
    drivers/platform/Kconfig
    drivers/platform/Makefile
    drivers/spi/spi-fsl-espi.c
    include/crypto/algapi.h
    include/linux/netdev_features.h
    include/linux/skbuff.h
    include/net/ip.h
    net/core/ethtool.c
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c | 108
-rw-r--r--  net/ipv4/datagram.c | 2
-rw-r--r--  net/ipv4/esp4.c | 49
-rw-r--r--  net/ipv4/fib_frontend.c | 2
-rw-r--r--  net/ipv4/fib_lookup.h | 26
-rw-r--r--  net/ipv4/fib_semantics.c | 2
-rw-r--r--  net/ipv4/fib_trie.c | 28
-rw-r--r--  net/ipv4/gre_demux.c | 29
-rw-r--r--  net/ipv4/gre_offload.c | 3
-rw-r--r--  net/ipv4/icmp.c | 5
-rw-r--r--  net/ipv4/inet_connection_sock.c | 54
-rw-r--r--  net/ipv4/inet_diag.c | 120
-rw-r--r--  net/ipv4/inet_fragment.c | 3
-rw-r--r--  net/ipv4/inet_hashtables.c | 110
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 59
-rw-r--r--  net/ipv4/ip_fragment.c | 1
-rw-r--r--  net/ipv4/ip_output.c | 33
-rw-r--r--  net/ipv4/ip_sockglue.c | 30
-rw-r--r--  net/ipv4/ip_tunnel.c | 12
-rw-r--r--  net/ipv4/ip_tunnel_core.c | 33
-rw-r--r--  net/ipv4/ip_vti.c | 68
-rw-r--r--  net/ipv4/ipip.c | 11
-rw-r--r--  net/ipv4/netfilter/Kconfig | 21
-rw-r--r--  net/ipv4/netfilter/Makefile | 6
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 5
-rw-r--r--  net/ipv4/netfilter/arptable_filter.c | 5
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 5
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 112
-rw-r--r--  net/ipv4/netfilter/ipt_SYNPROXY.c | 3
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 7
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c | 7
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 10
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c | 26
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c | 6
-rw-r--r--  net/ipv4/netfilter/iptable_security.c | 7
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 12
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_tables_arp.c | 102
-rw-r--r--  net/ipv4/netfilter/nf_tables_ipv4.c | 127
-rw-r--r--  net/ipv4/netfilter/nft_chain_nat_ipv4.c | 205
-rw-r--r--  net/ipv4/netfilter/nft_chain_route_ipv4.c | 90
-rw-r--r--  net/ipv4/netfilter/nft_reject_ipv4.c | 123
-rw-r--r--  net/ipv4/ping.c | 100
-rw-r--r--  net/ipv4/protocol.c | 8
-rw-r--r--  net/ipv4/raw.c | 12
-rw-r--r--  net/ipv4/route.c | 28
-rw-r--r--  net/ipv4/syncookies.c | 80
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 137
-rw-r--r--  net/ipv4/tcp.c | 27
-rw-r--r--  net/ipv4/tcp_bic.c | 5
-rw-r--r--  net/ipv4/tcp_cong.c | 47
-rw-r--r--  net/ipv4/tcp_cubic.c | 5
-rw-r--r--  net/ipv4/tcp_fastopen.c | 29
-rw-r--r--  net/ipv4/tcp_highspeed.c | 4
-rw-r--r--  net/ipv4/tcp_htcp.c | 4
-rw-r--r--  net/ipv4/tcp_hybla.c | 5
-rw-r--r--  net/ipv4/tcp_illinois.c | 5
-rw-r--r--  net/ipv4/tcp_input.c | 171
-rw-r--r--  net/ipv4/tcp_ipv4.c | 156
-rw-r--r--  net/ipv4/tcp_lp.c | 5
-rw-r--r--  net/ipv4/tcp_memcontrol.c | 90
-rw-r--r--  net/ipv4/tcp_metrics.c | 37
-rw-r--r--  net/ipv4/tcp_minisocks.c | 7
-rw-r--r--  net/ipv4/tcp_offload.c | 52
-rw-r--r--  net/ipv4/tcp_output.c | 40
-rw-r--r--  net/ipv4/tcp_probe.c | 29
-rw-r--r--  net/ipv4/tcp_scalable.c | 5
-rw-r--r--  net/ipv4/tcp_timer.c | 9
-rw-r--r--  net/ipv4/tcp_vegas.c | 11
-rw-r--r--  net/ipv4/tcp_vegas.h | 10
-rw-r--r--  net/ipv4/tcp_veno.c | 9
-rw-r--r--  net/ipv4/tcp_yeah.c | 5
-rw-r--r--  net/ipv4/udp.c | 264
-rw-r--r--  net/ipv4/udp_impl.h | 36
-rw-r--r--  net/ipv4/udp_offload.c | 1
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 16
-rw-r--r--  net/ipv4/xfrm4_policy.c | 8
77 files changed, 1220 insertions(+), 1910 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 70011e0..cfeb85c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -245,6 +245,29 @@ out:
}
EXPORT_SYMBOL(inet_listen);
+u32 inet_ehash_secret __read_mostly;
+EXPORT_SYMBOL(inet_ehash_secret);
+
+u32 ipv6_hash_secret __read_mostly;
+EXPORT_SYMBOL(ipv6_hash_secret);
+
+/*
+ * inet_ehash_secret must be set exactly once, and to a non nul value
+ * ipv6_hash_secret must be set exactly once.
+ */
+void build_ehash_secret(void)
+{
+ u32 rnd;
+
+ do {
+ get_random_bytes(&rnd, sizeof(rnd));
+ } while (rnd == 0);
+
+ if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0)
+ get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
+}
+EXPORT_SYMBOL(build_ehash_secret);
+
/*
* Create an inet socket.
*/
@@ -261,6 +284,10 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
int try_loading_module = 0;
int err;
+ if (unlikely(!inet_ehash_secret))
+ if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+ build_ehash_secret();
+
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
@@ -1227,36 +1254,36 @@ static int inet_gso_send_check(struct sk_buff *skb)
if (ihl < sizeof(*iph))
goto out;
- proto = iph->protocol;
-
- /* Warning: after this point, iph might be no longer valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
- __skb_pull(skb, ihl);
+ __skb_pull(skb, ihl);
skb_reset_transport_header(skb);
+ iph = ip_hdr(skb);
+ proto = iph->protocol;
err = -EPROTONOSUPPORT;
+ rcu_read_lock();
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_send_check))
err = ops->callbacks.gso_send_check(skb);
+ rcu_read_unlock();
out:
return err;
}
static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
- unsigned int offset = 0;
- bool udpfrag, encap;
struct iphdr *iph;
int proto;
- int nhoff;
int ihl;
int id;
+ unsigned int offset = 0;
+ bool tunnel;
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_TCPV4 |
@@ -1264,16 +1291,12 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_GRE |
- SKB_GSO_IPIP |
- SKB_GSO_SIT |
SKB_GSO_TCPV6 |
SKB_GSO_UDP_TUNNEL |
SKB_GSO_MPLS |
0)))
goto out;
- skb_reset_network_header(skb);
- nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
goto out;
@@ -1282,50 +1305,42 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
if (ihl < sizeof(*iph))
goto out;
- id = ntohs(iph->id);
- proto = iph->protocol;
-
- /* Warning: after this point, iph might be no longer valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
- __skb_pull(skb, ihl);
- encap = SKB_GSO_CB(skb)->encap_level > 0;
- if (encap)
- features = skb->dev->hw_enc_features & netif_skb_features(skb);
- SKB_GSO_CB(skb)->encap_level += ihl;
+ tunnel = !!skb->encapsulation;
+ __skb_pull(skb, ihl);
skb_reset_transport_header(skb);
-
+ iph = ip_hdr(skb);
+ id = ntohs(iph->id);
+ proto = iph->protocol;
segs = ERR_PTR(-EPROTONOSUPPORT);
- /* Note : following gso_segment() might change skb->encapsulation */
- udpfrag = !skb->encapsulation && proto == IPPROTO_UDP;
-
+ rcu_read_lock();
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment))
segs = ops->callbacks.gso_segment(skb, features);
+ rcu_read_unlock();
if (IS_ERR_OR_NULL(segs))
goto out;
skb = segs;
do {
- iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
- if (udpfrag) {
+ iph = ip_hdr(skb);
+ if (!tunnel && proto == IPPROTO_UDP) {
iph->id = htons(id);
iph->frag_off = htons(offset >> 3);
if (skb->next != NULL)
iph->frag_off |= htons(IP_MF);
- offset += skb->len - nhoff - ihl;
- } else {
+ offset += (skb->len - skb->mac_len - iph->ihl * 4);
+ } else {
iph->id = htons(id++);
}
- iph->tot_len = htons(skb->len - nhoff);
- ip_send_check(iph);
- if (encap)
- skb_reset_inner_headers(skb);
- skb->network_header = (u8 *)iph - skb->head;
+ iph->tot_len = htons(skb->len - skb->mac_len);
+ iph->check = 0;
+ iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
} while ((skb = skb->next));
out:
@@ -1503,7 +1518,6 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
ptr[0] = __alloc_percpu(mibsize, align);
if (!ptr[0])
return -ENOMEM;
-
#if SNMP_ARRAY_SZ == 2
ptr[1] = __alloc_percpu(mibsize, align);
if (!ptr[1]) {
@@ -1532,7 +1546,6 @@ static const struct net_protocol tcp_protocol = {
};
static const struct net_protocol udp_protocol = {
- .early_demux = udp_v4_early_demux,
.handler = udp_rcv,
.err_handler = udp_err,
.no_policy = 1,
@@ -1548,8 +1561,6 @@ static const struct net_protocol icmp_protocol = {
static __net_init int ipv4_mib_init_net(struct net *net)
{
- int i;
-
if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
sizeof(struct tcp_mib),
__alignof__(struct tcp_mib)) < 0)
@@ -1558,17 +1569,6 @@ static __net_init int ipv4_mib_init_net(struct net *net)
sizeof(struct ipstats_mib),
__alignof__(struct ipstats_mib)) < 0)
goto err_ip_mib;
-
- for_each_possible_cpu(i) {
- struct ipstats_mib *af_inet_stats;
- af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[0], i);
- u64_stats_init(&af_inet_stats->syncp);
-#if SNMP_ARRAY_SZ == 2
- af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[1], i);
- u64_stats_init(&af_inet_stats->syncp);
-#endif
- }
-
if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
sizeof(struct linux_mib),
__alignof__(struct linux_mib)) < 0)
@@ -1646,13 +1646,6 @@ static struct packet_offload ip_packet_offload __read_mostly = {
},
};
-static const struct net_offload ipip_offload = {
- .callbacks = {
- .gso_send_check = inet_gso_send_check,
- .gso_segment = inet_gso_segment,
- },
-};
-
static int __init ipv4_offload_init(void)
{
/*
@@ -1664,7 +1657,6 @@ static int __init ipv4_offload_init(void)
pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
dev_add_offload(&ip_packet_offload);
- inet_add_offload(&ipip_offload, IPPROTO_IPIP);
return 0;
}
@@ -1713,6 +1705,8 @@ static int __init inet_init(void)
ip_static_sysctl_init();
#endif
+ tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
+
/*
* Add all the base protocols.
*/
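
The build_ehash_secret() hunk above restores the pre-3.13 once-only initialization idiom: loop until the RNG yields a non-zero word, then publish it with a single cmpxchg() so racing callers cannot replace a secret already in use (the winning caller also seeds ipv6_hash_secret). A minimal userspace sketch of the same publish-once pattern, using GCC's __atomic builtins in place of the kernel's cmpxchg() (names and the rand() stand-in are illustrative, not kernel API):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t hash_secret; /* 0 means "not yet initialized" */

static uint32_t get_random_u32(void)
{
	return (uint32_t)rand(); /* stand-in for get_random_bytes() */
}

/* Safe to call from many threads; the secret is set exactly once. */
static void build_secret_once(void)
{
	uint32_t expected = 0, rnd;

	do {
		rnd = get_random_u32();
	} while (rnd == 0); /* secret must be non-zero */

	/* Only the first caller's CAS succeeds; later callers lose
	 * the race and leave the published value untouched. */
	__atomic_compare_exchange_n(&hash_secret, &expected, rnd, 0,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
	build_secret_once();
	build_secret_once(); /* second call is a no-op */
	printf("secret=%#x\n", hash_secret);
	return 0;
}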
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 19e3637..b28e863 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -57,7 +57,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
- IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
goto out;
}
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 7785b28..109ee89 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -121,6 +121,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
struct aead_givcrypt_request *req;
struct scatterlist *sg;
struct scatterlist *asg;
+ struct esp_data *esp;
struct sk_buff *trailer;
void *tmp;
u8 *iv;
@@ -138,7 +139,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
/* skb is pure payload to encrypt */
- aead = x->data;
+ esp = x->data;
+ aead = esp->aead;
alen = crypto_aead_authsize(aead);
tfclen = 0;
@@ -152,6 +154,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
}
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
clen = ALIGN(skb->len + 2 + tfclen, blksize);
+ if (esp->padlen)
+ clen = ALIGN(clen, esp->padlen);
plen = clen - skb->len - tfclen;
err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
@@ -276,7 +280,8 @@ static int esp_input_done2(struct sk_buff *skb, int err)
{
const struct iphdr *iph;
struct xfrm_state *x = xfrm_input_state(skb);
- struct crypto_aead *aead = x->data;
+ struct esp_data *esp = x->data;
+ struct crypto_aead *aead = esp->aead;
int alen = crypto_aead_authsize(aead);
int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
int elen = skb->len - hlen;
@@ -371,7 +376,8 @@ static void esp_input_done(struct crypto_async_request *base, int err)
static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
{
struct ip_esp_hdr *esph;
- struct crypto_aead *aead = x->data;
+ struct esp_data *esp = x->data;
+ struct crypto_aead *aead = esp->aead;
struct aead_request *req;
struct sk_buff *trailer;
int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
@@ -453,8 +459,9 @@ out:
static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
{
- struct crypto_aead *aead = x->data;
- u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ struct esp_data *esp = x->data;
+ u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
+ u32 align = max_t(u32, blksize, esp->padlen);
unsigned int net_adj;
switch (x->props.mode) {
@@ -469,8 +476,8 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
BUG();
}
- return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
- net_adj) & ~(blksize - 1)) + net_adj - 2;
+ return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
+ net_adj) & ~(align - 1)) + net_adj - 2;
}
static void esp4_err(struct sk_buff *skb, u32 info)
@@ -504,16 +511,18 @@ static void esp4_err(struct sk_buff *skb, u32 info)
static void esp_destroy(struct xfrm_state *x)
{
- struct crypto_aead *aead = x->data;
+ struct esp_data *esp = x->data;
- if (!aead)
+ if (!esp)
return;
- crypto_free_aead(aead);
+ crypto_free_aead(esp->aead);
+ kfree(esp);
}
static int esp_init_aead(struct xfrm_state *x)
{
+ struct esp_data *esp = x->data;
struct crypto_aead *aead;
int err;
@@ -522,7 +531,7 @@ static int esp_init_aead(struct xfrm_state *x)
if (IS_ERR(aead))
goto error;
- x->data = aead;
+ esp->aead = aead;
err = crypto_aead_setkey(aead, x->aead->alg_key,
(x->aead->alg_key_len + 7) / 8);
@@ -539,6 +548,7 @@ error:
static int esp_init_authenc(struct xfrm_state *x)
{
+ struct esp_data *esp = x->data;
struct crypto_aead *aead;
struct crypto_authenc_key_param *param;
struct rtattr *rta;
@@ -573,7 +583,7 @@ static int esp_init_authenc(struct xfrm_state *x)
if (IS_ERR(aead))
goto error;
- x->data = aead;
+ esp->aead = aead;
keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
(x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
@@ -628,11 +638,16 @@ error:
static int esp_init_state(struct xfrm_state *x)
{
+ struct esp_data *esp;
struct crypto_aead *aead;
u32 align;
int err;
- x->data = NULL;
+ esp = kzalloc(sizeof(*esp), GFP_KERNEL);
+ if (esp == NULL)
+ return -ENOMEM;
+
+ x->data = esp;
if (x->aead)
err = esp_init_aead(x);
@@ -642,7 +657,9 @@ static int esp_init_state(struct xfrm_state *x)
if (err)
goto error;
- aead = x->data;
+ aead = esp->aead;
+
+ esp->padlen = 0;
x->props.header_len = sizeof(struct ip_esp_hdr) +
crypto_aead_ivsize(aead);
@@ -666,7 +683,9 @@ static int esp_init_state(struct xfrm_state *x)
}
align = ALIGN(crypto_aead_blocksize(aead), 4);
- x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
+ if (esp->padlen)
+ align = max_t(u32, align, esp->padlen);
+ x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
error:
return err;
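
The esp4_get_mtu() hunk brings back esp->padlen as an extra alignment constraint: the usable payload is the MTU minus the ESP header and ICV, rounded down to the alignment, plus the mode-specific adjustment, minus the two trailer bytes (pad length and next header). A standalone sketch of that arithmetic; the sample values below are illustrative, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the reverted esp4_get_mtu() arithmetic. */
static uint32_t esp_max_payload(uint32_t mtu, uint32_t header_len,
				uint32_t auth_len, uint32_t blksize,
				uint32_t padlen, uint32_t net_adj)
{
	uint32_t align = blksize > padlen ? blksize : padlen; /* max_t() */

	return ((mtu - header_len - auth_len - net_adj) & ~(align - 1))
	       + net_adj - 2; /* 2 = pad-length + next-header bytes */
}

int main(void)
{
	/* e.g. tunnel mode (net_adj = 0), 8-byte ESP header + 16-byte IV,
	 * 12-byte ICV, AES block size rounded to 16, no extra padlen */
	printf("max payload: %u\n",
	       esp_max_payload(1500, 24, 12, 16, 0, 0)); /* -> 1454 */
	return 0;
}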
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d846304..b3f627a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -933,6 +933,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
local_bh_disable();
frn->tb_id = tb->tb_id;
+ rcu_read_lock();
frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
if (!frn->err) {
@@ -941,6 +942,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
frn->type = res.type;
frn->scope = res.scope;
}
+ rcu_read_unlock();
local_bh_enable();
}
}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 388d113..af0f14a 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -24,17 +24,21 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
}
/* Exported by fib_semantics.c */
-void fib_release_info(struct fib_info *);
-struct fib_info *fib_create_info(struct fib_config *cfg);
-int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
-int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
- u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
- unsigned int);
-void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
- u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
-struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
-int fib_detect_death(struct fib_info *fi, int order,
- struct fib_info **last_resort, int *last_idx, int dflt);
+extern void fib_release_info(struct fib_info *);
+extern struct fib_info *fib_create_info(struct fib_config *cfg);
+extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
+extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ u32 tb_id, u8 type, __be32 dst,
+ int dst_len, u8 tos, struct fib_info *fi,
+ unsigned int);
+extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
+ int dst_len, u32 tb_id, struct nl_info *info,
+ unsigned int nlm_flags);
+extern struct fib_alias *fib_find_alias(struct list_head *fah,
+ u8 tos, u32 prio);
+extern int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort,
+ int *last_idx, int dflt);
static inline void fib_result_assign(struct fib_result *res,
struct fib_info *fi)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e63f47a..d5dbca5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -380,7 +380,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
}
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
- int dst_len, u32 tb_id, const struct nl_info *info,
+ int dst_len, u32 tb_id, struct nl_info *info,
unsigned int nlm_flags)
{
struct sk_buff *skb;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5afeb5a..3df6d3e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -762,9 +762,12 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
if (IS_LEAF(node) || ((struct tnode *) node)->pos >
tn->pos + tn->bits - 1) {
- put_child(tn,
- tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
- node);
+ if (tkey_extract_bits(node->key,
+ oldtnode->pos + oldtnode->bits,
+ 1) == 0)
+ put_child(tn, 2*i, node);
+ else
+ put_child(tn, 2*i+1, node);
continue;
}
@@ -1117,8 +1120,12 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
* first tnode need some special handling
*/
+ if (tp)
+ pos = tp->pos+tp->bits;
+ else
+ pos = 0;
+
if (n) {
- pos = tp ? tp->pos+tp->bits : 0;
newpos = tkey_mismatch(key, pos, n->key);
tn = tnode_new(n->key, newpos, 1);
} else {
@@ -2523,17 +2530,16 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
const struct fib_info *fi = fa->fa_info;
unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
+ int len;
if (fa->fa_type == RTN_BROADCAST
|| fa->fa_type == RTN_MULTICAST)
continue;
- seq_setwidth(seq, 127);
-
if (fi)
seq_printf(seq,
"%s\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
+ "%d\t%08X\t%d\t%u\t%u%n",
fi->fib_dev ? fi->fib_dev->name : "*",
prefix,
fi->fib_nh->nh_gw, flags, 0, 0,
@@ -2542,15 +2548,15 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
(fi->fib_advmss ?
fi->fib_advmss + 40 : 0),
fi->fib_window,
- fi->fib_rtt >> 3);
+ fi->fib_rtt >> 3, &len);
else
seq_printf(seq,
"*\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
+ "%d\t%08X\t%d\t%u\t%u%n",
prefix, 0, flags, 0, 0, 0,
- mask, 0, 0, 0);
+ mask, 0, 0, 0, &len);
- seq_pad(seq, '\n');
+ seq_printf(seq, "%*s\n", 127 - len, "");
}
}
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 5893e99..736c9fc3 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -93,6 +93,35 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
}
EXPORT_SYMBOL_GPL(gre_build_header);
+struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum)
+{
+ int err;
+
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ if (skb_is_gso(skb)) {
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(err))
+ goto error;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
+ return skb;
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL && gre_csum) {
+ err = skb_checksum_help(skb);
+ if (unlikely(err))
+ goto error;
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL)
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return skb;
+error:
+ kfree_skb(skb);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(gre_handle_offloads);
+
static __sum16 check_checksum(struct sk_buff *skb)
{
__sum16 csum = 0;
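
gre_handle_offloads(), restored above, picks one of three paths before the outer GRE header is pushed: tag GSO packets with SKB_GSO_GRE and defer checksums to segmentation, resolve a pending partial checksum in software when the tunnel adds its own GRE checksum, or otherwise normalize the checksum state. A toy decision-ladder sketch of that branch structure (plain C stand-ins for the skb fields, not kernel code):

#include <stdbool.h>
#include <stdio.h>

enum csum { CHECKSUM_NONE, CHECKSUM_PARTIAL };

struct pkt { bool gso; enum csum ip_summed; };

/* Mirrors the branch order of gre_handle_offloads() above. */
static const char *offload_action(struct pkt *p, bool gre_csum)
{
	if (p->gso)
		return "mark SKB_GSO_GRE, defer to segmentation";
	if (p->ip_summed == CHECKSUM_PARTIAL && gre_csum)
		return "resolve checksum in software (skb_checksum_help)";
	if (p->ip_summed != CHECKSUM_PARTIAL)
		p->ip_summed = CHECKSUM_NONE; /* normalize */
	return "send as-is";
}

int main(void)
{
	struct pkt p = { .gso = false, .ip_summed = CHECKSUM_PARTIAL };

	puts(offload_action(&p, true));
	return 0;
}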
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index e5d4361..55e6bfb 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -39,8 +39,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
- SKB_GSO_GRE |
- SKB_GSO_IPIP)))
+ SKB_GSO_GRE)))
goto out;
if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 5c0e8bc..5f7d11a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -353,9 +353,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
saddr = fib_compute_spec_dst(skb);
ipc.opt = NULL;
ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
-
if (icmp_param->replyopts.opt.opt.optlen) {
ipc.opt = &icmp_param->replyopts.opt;
if (ipc.opt->opt.srr)
@@ -611,8 +608,6 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
ipc.addr = iph->saddr;
ipc.opt = &icmp_param->replyopts.opt;
ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos,
type, code, icmp_param);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fc0e649..6acb541 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -29,19 +29,27 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif
+/*
+ * This struct holds the first and last local port number.
+ */
+struct local_ports sysctl_local_ports __read_mostly = {
+ .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock),
+ .range = { 32768, 61000 },
+};
+
unsigned long *sysctl_local_reserved_ports;
EXPORT_SYMBOL(sysctl_local_reserved_ports);
-void inet_get_local_port_range(struct net *net, int *low, int *high)
+void inet_get_local_port_range(int *low, int *high)
{
unsigned int seq;
do {
- seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock);
+ seq = read_seqbegin(&sysctl_local_ports.lock);
- *low = net->ipv4.sysctl_local_ports.range[0];
- *high = net->ipv4.sysctl_local_ports.range[1];
- } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq));
+ *low = sysctl_local_ports.range[0];
+ *high = sysctl_local_ports.range[1];
+ } while (read_seqretry(&sysctl_local_ports.lock, seq));
}
EXPORT_SYMBOL(inet_get_local_port_range);
@@ -71,16 +79,17 @@ int inet_csk_bind_conflict(const struct sock *sk,
(!reuseport || !sk2->sk_reuseport ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2))))) {
-
- if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
- sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
+ if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
+ sk2_rcv_saddr == sk_rcv_saddr(sk))
break;
}
if (!relax && reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
+ const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
- if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
- sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
+ sk2_rcv_saddr == sk_rcv_saddr(sk))
break;
}
}
@@ -107,7 +116,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
int remaining, rover, low, high;
again:
- inet_get_local_port_range(net, &low, &high);
+ inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;
smallest_rover = rover = net_random() % remaining + low;
@@ -412,8 +421,8 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol,
flags,
- (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
+ ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -448,8 +457,8 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol, inet_sk_flowi_flags(sk),
- (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
+ ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -495,9 +504,9 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
prev = &req->dl_next) {
const struct inet_request_sock *ireq = inet_rsk(req);
- if (ireq->ir_rmt_port == rport &&
- ireq->ir_rmt_addr == raddr &&
- ireq->ir_loc_addr == laddr &&
+ if (ireq->rmt_port == rport &&
+ ireq->rmt_addr == raddr &&
+ ireq->loc_addr == laddr &&
AF_INET_FAMILY(req->rsk_ops->family)) {
WARN_ON(req->sk);
*prevp = prev;
@@ -514,8 +523,7 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
- inet_rsk(req)->ir_rmt_port,
+ const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
lopt->hash_rnd, lopt->nr_table_entries);
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
@@ -675,9 +683,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
newsk->sk_state = TCP_SYN_RECV;
newicsk->icsk_bind_hash = NULL;
- inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
- inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
- inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
+ inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
+ inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
+ inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
newsk->sk_write_space = sk_stream_write_space;
newicsk->icsk_retransmits = 0;
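
The inet_get_local_port_range() hunk reverts to a single global sysctl_local_ports protected by a seqlock: readers copy the two-element range and retry if a writer was active (odd sequence) or slipped in between the copies. A simplified, runnable model of that reader in C11 atomics; the fences stand in for the kernel's smp_rmb(), and the data reads stay plain since this sketch has no concurrent writer:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;
static int range[2] = { 32768, 61000 };

static unsigned int read_seqbegin(void)
{
	unsigned int s;

	while ((s = atomic_load_explicit(&seq, memory_order_relaxed)) & 1)
		; /* writer in progress, spin */
	atomic_thread_fence(memory_order_acquire);
	return s;
}

static int read_seqretry(unsigned int s)
{
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&seq, memory_order_relaxed) != s;
}

static void get_local_port_range(int *low, int *high)
{
	unsigned int s;

	do {
		s = read_seqbegin();
		*low = range[0];
		*high = range[1];
	} while (read_seqretry(s)); /* retry if a writer intervened */
}

int main(void)
{
	int low, high;

	get_local_port_range(&low, &high);
	printf("port range: %d-%d\n", low, high);
	return 0;
}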
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 56a964a..5f64875 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -121,13 +121,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
#if IS_ENABLED(CONFIG_IPV6)
if (r->idiag_family == AF_INET6) {
+ const struct ipv6_pinfo *np = inet6_sk(sk);
- *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
- *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
+ *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr;
+ *(struct in6_addr *)r->id.idiag_dst = np->daddr;
if (ext & (1 << (INET_DIAG_TCLASS - 1)))
- if (nla_put_u8(skb, INET_DIAG_TCLASS,
- inet6_sk(sk)->tclass) < 0)
+ if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0)
goto errout;
}
#endif
@@ -222,7 +222,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
- s32 tmo;
+ long tmo;
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
@@ -234,7 +234,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
r = nlmsg_data(nlh);
BUG_ON(tw->tw_state != TCP_TIME_WAIT);
- tmo = tw->tw_ttd - inet_tw_time_stamp();
+ tmo = tw->tw_ttd - jiffies;
if (tmo < 0)
tmo = 0;
@@ -248,15 +248,18 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
r->id.idiag_dst[0] = tw->tw_daddr;
r->idiag_state = tw->tw_substate;
r->idiag_timer = 3;
- r->idiag_expires = jiffies_to_msecs(tmo);
+ r->idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
r->idiag_uid = 0;
r->idiag_inode = 0;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
- *(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr;
- *(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr;
+ const struct inet6_timewait_sock *tw6 =
+ inet6_twsk((struct sock *)tw);
+
+ *(struct in6_addr *)r->id.idiag_src = tw6->tw_v6_rcv_saddr;
+ *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr;
}
#endif
@@ -270,11 +273,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
const struct nlmsghdr *unlh)
{
if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
- nlmsg_flags, unlh);
-
- return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
- nlmsg_flags, unlh);
+ return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
+ skb, r, portid, seq, nlmsg_flags,
+ unlh);
+ return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh);
}
int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
@@ -336,9 +338,12 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
err = 0;
out:
- if (sk)
- sock_gen_put(sk);
-
+ if (sk) {
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_put((struct inet_timewait_sock *)sk);
+ else
+ sock_put(sk);
+ }
out_nosk:
return err;
}
@@ -484,9 +489,10 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
entry.family = sk->sk_family;
#if IS_ENABLED(CONFIG_IPV6)
if (entry.family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
- entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32;
- entry.daddr = sk->sk_v6_daddr.s6_addr32;
+ entry.saddr = np->rcv_saddr.s6_addr32;
+ entry.daddr = np->daddr.s6_addr32;
} else
#endif
{
@@ -629,22 +635,22 @@ static int inet_csk_diag_dump(struct sock *sk,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}
-static int inet_twsk_diag_dump(struct sock *sk,
+static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
struct sk_buff *skb,
struct netlink_callback *cb,
struct inet_diag_req_v2 *r,
const struct nlattr *bc)
{
- struct inet_timewait_sock *tw = inet_twsk(sk);
-
if (bc != NULL) {
struct inet_diag_entry entry;
entry.family = tw->tw_family;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
- entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32;
- entry.daddr = tw->tw_v6_daddr.s6_addr32;
+ struct inet6_timewait_sock *tw6 =
+ inet6_twsk((struct sock *)tw);
+ entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32;
+ entry.daddr = tw6->tw_v6_daddr.s6_addr32;
} else
#endif
{
@@ -676,12 +682,12 @@ static inline void inet_diag_req_addrs(const struct sock *sk,
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6) {
if (req->rsk_ops->family == AF_INET6) {
- entry->saddr = ireq->ir_v6_loc_addr.s6_addr32;
- entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32;
+ entry->saddr = inet6_rsk(req)->loc_addr.s6_addr32;
+ entry->daddr = inet6_rsk(req)->rmt_addr.s6_addr32;
} else if (req->rsk_ops->family == AF_INET) {
- ipv6_addr_set_v4mapped(ireq->ir_loc_addr,
+ ipv6_addr_set_v4mapped(ireq->loc_addr,
&entry->saddr_storage);
- ipv6_addr_set_v4mapped(ireq->ir_rmt_addr,
+ ipv6_addr_set_v4mapped(ireq->rmt_addr,
&entry->daddr_storage);
entry->saddr = entry->saddr_storage.s6_addr32;
entry->daddr = entry->daddr_storage.s6_addr32;
@@ -689,8 +695,8 @@ static inline void inet_diag_req_addrs(const struct sock *sk,
} else
#endif
{
- entry->saddr = &ireq->ir_loc_addr;
- entry->daddr = &ireq->ir_rmt_addr;
+ entry->saddr = &ireq->loc_addr;
+ entry->daddr = &ireq->rmt_addr;
}
}
@@ -725,9 +731,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
tmo = 0;
r->id.idiag_sport = inet->inet_sport;
- r->id.idiag_dport = ireq->ir_rmt_port;
- r->id.idiag_src[0] = ireq->ir_loc_addr;
- r->id.idiag_dst[0] = ireq->ir_rmt_addr;
+ r->id.idiag_dport = ireq->rmt_port;
+ r->id.idiag_src[0] = ireq->loc_addr;
+ r->id.idiag_dst[0] = ireq->rmt_addr;
r->idiag_expires = jiffies_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
@@ -786,13 +792,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
if (reqnum < s_reqnum)
continue;
- if (r->id.idiag_dport != ireq->ir_rmt_port &&
+ if (r->id.idiag_dport != ireq->rmt_port &&
r->id.idiag_dport)
continue;
if (bc) {
inet_diag_req_addrs(sk, req, &entry);
- entry.dport = ntohs(ireq->ir_rmt_port);
+ entry.dport = ntohs(ireq->rmt_port);
if (!inet_diag_bc_run(bc, &entry))
continue;
@@ -905,7 +911,8 @@ skip_listen_ht:
num = 0;
- if (hlist_nulls_empty(&head->chain))
+ if (hlist_nulls_empty(&head->chain) &&
+ hlist_nulls_empty(&head->twchain))
continue;
if (i > s_i)
@@ -913,7 +920,7 @@ skip_listen_ht:
spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &head->chain) {
- int res;
+ struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
continue;
@@ -922,19 +929,15 @@ skip_listen_ht:
if (!(r->idiag_states & (1 << sk->sk_state)))
goto next_normal;
if (r->sdiag_family != AF_UNSPEC &&
- sk->sk_family != r->sdiag_family)
+ sk->sk_family != r->sdiag_family)
goto next_normal;
- if (r->id.idiag_sport != htons(sk->sk_num) &&
+ if (r->id.idiag_sport != inet->inet_sport &&
r->id.idiag_sport)
goto next_normal;
- if (r->id.idiag_dport != sk->sk_dport &&
+ if (r->id.idiag_dport != inet->inet_dport &&
r->id.idiag_dport)
goto next_normal;
- if (sk->sk_state == TCP_TIME_WAIT)
- res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
- else
- res = inet_csk_diag_dump(sk, skb, cb, r, bc);
- if (res < 0) {
+ if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
spin_unlock_bh(lock);
goto done;
}
@@ -942,6 +945,33 @@ next_normal:
++num;
}
+ if (r->idiag_states & TCPF_TIME_WAIT) {
+ struct inet_timewait_sock *tw;
+
+ inet_twsk_for_each(tw, node,
+ &head->twchain) {
+ if (!net_eq(twsk_net(tw), net))
+ continue;
+
+ if (num < s_num)
+ goto next_dying;
+ if (r->sdiag_family != AF_UNSPEC &&
+ tw->tw_family != r->sdiag_family)
+ goto next_dying;
+ if (r->id.idiag_sport != tw->tw_sport &&
+ r->id.idiag_sport)
+ goto next_dying;
+ if (r->id.idiag_dport != tw->tw_dport &&
+ r->id.idiag_dport)
+ goto next_dying;
+ if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) {
+ spin_unlock_bh(lock);
+ goto done;
+ }
+next_dying:
+ ++num;
+ }
+ }
spin_unlock_bh(lock);
}
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index bb075fc..c5313a9 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -93,6 +93,9 @@ void inet_frags_init(struct inet_frags *f)
}
rwlock_init(&f->lock);
+ f->rnd = (u32) ((totalram_pages ^ (totalram_pages >> 7)) ^
+ (jiffies ^ (jiffies >> 6)));
+
setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
(unsigned long)f);
f->secret_timer.expires = jiffies + f->secret_interval;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 8b9cf27..96da9c7 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -24,31 +24,6 @@
#include <net/secure_seq.h>
#include <net/ip.h>
-static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
- const __u16 lport, const __be32 faddr,
- const __be16 fport)
-{
- static u32 inet_ehash_secret __read_mostly;
-
- net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
-
- return __inet_ehashfn(laddr, lport, faddr, fport,
- inet_ehash_secret + net_hash_mix(net));
-}
-
-
-static unsigned int inet_sk_ehashfn(const struct sock *sk)
-{
- const struct inet_sock *inet = inet_sk(sk);
- const __be32 laddr = inet->inet_rcv_saddr;
- const __u16 lport = inet->inet_num;
- const __be32 faddr = inet->inet_daddr;
- const __be16 fport = inet->inet_dport;
- struct net *net = sock_net(sk);
-
- return inet_ehashfn(net, laddr, lport, faddr, fport);
-}
-
/*
* Allocate and initialize a new local port bind bucket.
* The bindhash mutex for snum's hash chain must be held here.
@@ -255,19 +230,6 @@ begin:
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
-/* All sockets share common refcount, but have different destructors */
-void sock_gen_put(struct sock *sk)
-{
- if (!atomic_dec_and_test(&sk->sk_refcnt))
- return;
-
- if (sk->sk_state == TCP_TIME_WAIT)
- inet_twsk_free(inet_twsk(sk));
- else
- sk_free(sk);
-}
-EXPORT_SYMBOL_GPL(sock_gen_put);
-
struct sock *__inet_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
@@ -293,13 +255,13 @@ begin:
if (likely(INET_MATCH(sk, net, acookie,
saddr, daddr, ports, dif))) {
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
- goto out;
+ goto begintw;
if (unlikely(!INET_MATCH(sk, net, acookie,
saddr, daddr, ports, dif))) {
- sock_gen_put(sk);
+ sock_put(sk);
goto begin;
}
- goto found;
+ goto out;
}
}
/*
@@ -309,9 +271,37 @@ begin:
*/
if (get_nulls_value(node) != slot)
goto begin;
-out:
+
+begintw:
+ /* Must check for a TIME_WAIT'er before going to listener hash. */
+ sk_nulls_for_each_rcu(sk, node, &head->twchain) {
+ if (sk->sk_hash != hash)
+ continue;
+ if (likely(INET_TW_MATCH(sk, net, acookie,
+ saddr, daddr, ports,
+ dif))) {
+ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
+ sk = NULL;
+ goto out;
+ }
+ if (unlikely(!INET_TW_MATCH(sk, net, acookie,
+ saddr, daddr, ports,
+ dif))) {
+ inet_twsk_put(inet_twsk(sk));
+ goto begintw;
+ }
+ goto out;
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begintw;
sk = NULL;
-found:
+out:
rcu_read_unlock();
return sk;
}
@@ -336,29 +326,39 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
struct sock *sk2;
const struct hlist_nulls_node *node;
- struct inet_timewait_sock *tw = NULL;
+ struct inet_timewait_sock *tw;
int twrefcnt = 0;
spin_lock(lock);
- sk_nulls_for_each(sk2, node, &head->chain) {
+ /* Check TIME-WAIT sockets first. */
+ sk_nulls_for_each(sk2, node, &head->twchain) {
if (sk2->sk_hash != hash)
continue;
- if (likely(INET_MATCH(sk2, net, acookie,
+ if (likely(INET_TW_MATCH(sk2, net, acookie,
saddr, daddr, ports, dif))) {
- if (sk2->sk_state == TCP_TIME_WAIT) {
- tw = inet_twsk(sk2);
- if (twsk_unique(sk, sk2, twp))
- break;
- }
- goto not_unique;
+ tw = inet_twsk(sk2);
+ if (twsk_unique(sk, sk2, twp))
+ goto unique;
+ else
+ goto not_unique;
}
}
+ tw = NULL;
+ /* And established part... */
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash)
+ continue;
+ if (likely(INET_MATCH(sk2, net, acookie,
+ saddr, daddr, ports, dif)))
+ goto not_unique;
+ }
+
+unique:
/* Must record num and sport now. Otherwise we will see
- * in hash table socket with a funny identity.
- */
+ * in hash table socket with a funny identity. */
inet->inet_num = lport;
inet->inet_sport = htons(lport);
sk->sk_hash = hash;
@@ -494,7 +494,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
u32 offset = hint + port_offset;
struct inet_timewait_sock *tw = NULL;
- inet_get_local_port_range(net, &low, &high);
+ inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;
local_bh_disable();
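
The __inet_lookup_established() hunks above lean on the hlist_nulls trick: each chain ends in a "nulls" marker that encodes its own bucket, so a lockless walker carried onto another chain (entries are recycled under SLAB_DESTROY_BY_RCU) detects the mismatch at the end and restarts, exactly what the goto begin/begintw labels do. A single-threaded toy model of the marker encoding and restart check (illustrative, not the kernel's hlist_nulls API):

#include <stdint.h>
#include <stdio.h>

#define NULLS_MARKER(slot) ((void *)(((uintptr_t)(slot) << 1) | 1))
#define IS_NULLS(p)        (((uintptr_t)(p)) & 1)
#define NULLS_VALUE(p)     (((uintptr_t)(p)) >> 1)

struct node {
	int key;
	void *next; /* struct node *, or NULLS_MARKER(slot) at chain end */
};

static struct node *lookup(struct node *head, unsigned int slot, int key)
{
	void *p;
begin:
	p = head;
	while (!IS_NULLS(p)) {
		struct node *n = p;

		if (n->key == key)
			return n;
		p = n->next;
	}
	if (NULLS_VALUE(p) != slot)
		goto begin; /* walked onto another chain: restart */
	return NULL;	    /* genuine end of the right chain */
}

int main(void)
{
	struct node b = { .key = 2, .next = NULLS_MARKER(0) };
	struct node a = { .key = 1, .next = &b };

	printf("found key=%d\n", lookup(&a, 0, 2)->key);
	return 0;
}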
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 6d592f8..1f27c9f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -87,11 +87,19 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
refcnt += inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
- BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
- atomic_sub(refcnt, &tw->tw_refcnt);
+#ifdef SOCK_REFCNT_DEBUG
+ if (atomic_read(&tw->tw_refcnt) != 1) {
+ pr_debug("%s timewait_sock %p refcnt=%d\n",
+ tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
+ }
+#endif
+ while (refcnt) {
+ inet_twsk_put(tw);
+ refcnt--;
+ }
}
-void inet_twsk_free(struct inet_timewait_sock *tw)
+static noinline void inet_twsk_free(struct inet_timewait_sock *tw)
{
struct module *owner = tw->tw_prot->owner;
twsk_destructor((struct sock *)tw);
@@ -110,18 +118,6 @@ void inet_twsk_put(struct inet_timewait_sock *tw)
}
EXPORT_SYMBOL_GPL(inet_twsk_put);
-static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
- struct hlist_nulls_head *list)
-{
- hlist_nulls_add_head_rcu(&tw->tw_node, list);
-}
-
-static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
- struct hlist_head *list)
-{
- hlist_add_head(&tw->tw_bind_node, list);
-}
-
/*
* Enter the time wait state. This is called with locally disabled BH.
* Essentially we whip up a timewait bucket, copy the relevant info into it
@@ -150,21 +146,26 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
spin_lock(lock);
/*
- * Step 2: Hash TW into tcp ehash chain.
- * Notes :
- * - tw_refcnt is set to 3 because :
- * - We have one reference from bhash chain.
- * - We have one reference from ehash chain.
- * We can use atomic_set() because prior spin_lock()/spin_unlock()
- * committed into memory all tw fields.
+ * Step 2: Hash TW into TIMEWAIT chain.
+ * Should be done before removing sk from established chain
+ * because readers are lockless and search established first.
*/
- atomic_set(&tw->tw_refcnt, 1 + 1 + 1);
- inet_twsk_add_node_rcu(tw, &ehead->chain);
+ inet_twsk_add_node_rcu(tw, &ehead->twchain);
- /* Step 3: Remove SK from hash chain */
+ /* Step 3: Remove SK from established hash. */
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ /*
+ * Notes :
+ * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
+ * - We add one reference for the bhash link
+ * - We add one reference for the ehash link
+ * - We want this refcnt update done before allowing other
+ * threads to find this tw in ehash chain.
+ */
+ atomic_add(1 + 1 + 1, &tw->tw_refcnt);
+
spin_unlock(lock);
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
@@ -386,11 +387,11 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
if (slot >= INET_TWDR_TWKILL_SLOTS)
slot = INET_TWDR_TWKILL_SLOTS - 1;
}
- tw->tw_ttd = inet_tw_time_stamp() + timeo;
+ tw->tw_ttd = jiffies + timeo;
slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
list = &twdr->cells[slot];
} else {
- tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK);
+ tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
if (twdr->twcal_hand < 0) {
twdr->twcal_hand = 0;
@@ -489,9 +490,7 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo,
restart_rcu:
rcu_read_lock();
restart:
- sk_nulls_for_each_rcu(sk, node, &head->chain) {
- if (sk->sk_state != TCP_TIME_WAIT)
- continue;
+ sk_nulls_for_each_rcu(sk, node, &head->twchain) {
tw = inet_twsk(sk);
if ((tw->tw_family != family) ||
atomic_read(&twsk_net(tw)->count))
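
The __inet_twsk_hashdance() comment above is the crux of the revert in this file: tw_refcnt starts at 0, the bucket is first made visible on the timewait chain, and only then are all three references (bhash link, ehash link, caller) accounted in one atomic add, so a lockless reader using atomic_inc_not_zero() can never grab a half-initialized bucket. A toy sketch of that publish-then-account pattern in C11 atomics (the ordering the real code gets from the ehash spinlock is elided):

#include <stdatomic.h>
#include <stdio.h>

struct tw { atomic_int refcnt; int linked; };

/* Publish first with refcnt 0: readers use inc-not-zero, so they
 * cannot take a reference until all refs are accounted in one step. */
static void hashdance(struct tw *tw)
{
	tw->linked = 1;			  /* step 2: add to tw chain */
	/* step 3: unlink the established socket ... */
	atomic_fetch_add(&tw->refcnt, 3); /* bhash + ehash + caller  */
}

static int inc_not_zero(atomic_int *r)
{
	int v = atomic_load(r);

	while (v != 0)
		if (atomic_compare_exchange_weak(r, &v, v + 1))
			return 1;
	return 0; /* object is being freed; do not touch it */
}

int main(void)
{
	struct tw tw = { .refcnt = 0 };

	printf("before: grab=%d\n", inc_not_zero(&tw.refcnt)); /* 0 */
	hashdance(&tw);
	printf("after:  grab=%d refcnt=%d\n", inc_not_zero(&tw.refcnt),
	       atomic_load(&tw.refcnt));			/* 1, 4 */
	return 0;
}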
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 2481993..b66910a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -106,7 +106,6 @@ struct ip4_create_arg {
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
- net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
return jhash_3words((__force u32)id << 16 | prot,
(__force u32)saddr, (__force u32)daddr,
ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
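
With net_get_random_once() gone, ipqhashfn() relies on ip4_frags.rnd being seeded eagerly in inet_frags_init() (see the inet_fragment.c hunk above); the hash then mixes (id, protocol, saddr, daddr) with that salt and masks down to a bucket. A standalone sketch of the bucket computation, using a Jenkins-style final mix in place of the kernel-internal jhash_3words(); the exact rotate schedule and the 64-bucket table size are illustrative assumptions:

#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t w, int s) { return (w << s) | (w >> (32 - s)); }

/* Illustrative stand-in for jhash_3words(): mix three 32-bit words
 * and a random seed into one hash value. */
static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
	a += seed; b += seed; c += seed;
	c ^= b; c -= rol32(b, 14);
	a ^= c; a -= rol32(c, 11);
	b ^= a; b -= rol32(a, 25);
	c ^= b; c -= rol32(b, 16);
	return c;
}

#define HASHSZ 64 /* INETFRAGS_HASHSZ in this era's trees (assumption) */

static unsigned int ipqhash(uint16_t id, uint8_t prot,
			    uint32_t saddr, uint32_t daddr, uint32_t rnd)
{
	return mix3((uint32_t)id << 16 | prot, saddr, daddr, rnd)
	       & (HASHSZ - 1);
}

int main(void)
{
	printf("bucket=%u\n",
	       ipqhash(0x1234, 17, 0x0a000001, 0x0a000002, 0xdeadbeef));
	return 0;
}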
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9124027..3982eab 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -810,7 +810,7 @@ static int __ip_append_data(struct sock *sk,
int copy;
int err;
int offset = 0;
- unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
+ unsigned int maxfraglen, fragheaderlen;
int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst;
@@ -823,10 +823,8 @@ static int __ip_append_data(struct sock *sk,
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
- maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ?
- mtu : 0xFFFF;
- if (cork->length + length > maxnonfragsize - fragheaderlen) {
+ if (cork->length + length > 0xFFFF - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
mtu-exthdrlen);
return -EMSGSIZE;
@@ -1037,6 +1035,7 @@ error:
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
struct ipcm_cookie *ipc, struct rtable **rtp)
{
+ struct inet_sock *inet = inet_sk(sk);
struct ip_options_rcu *opt;
struct rtable *rt;
@@ -1062,13 +1061,10 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
* We steal reference to this route, caller should not release it
*/
*rtp = NULL;
- cork->fragsize = ip_sk_use_pmtu(sk) ?
- dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+ cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
+ rt->dst.dev->mtu : dst_mtu(&rt->dst);
cork->dst = &rt->dst;
cork->length = 0;
- cork->ttl = ipc->ttl;
- cork->tos = ipc->tos;
- cork->priority = ipc->priority;
cork->tx_flags = ipc->tx_flags;
return 0;
@@ -1123,7 +1119,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
int mtu;
int len;
int err;
- unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
+ unsigned int maxfraglen, fragheaderlen, fraggap;
if (inet->hdrincl)
return -EPERM;
@@ -1147,10 +1143,8 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
- maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ?
- mtu : 0xFFFF;
- if (cork->length + size > maxnonfragsize - fragheaderlen) {
+ if (cork->length + size > 0xFFFF - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
return -EMSGSIZE;
}
@@ -1314,8 +1308,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
/* DF bit is set when we want to see DF on outgoing frames.
* If local_df is set too, we still allow to fragment this frame
* locally. */
- if (inet->pmtudisc == IP_PMTUDISC_DO ||
- inet->pmtudisc == IP_PMTUDISC_PROBE ||
+ if (inet->pmtudisc >= IP_PMTUDISC_DO ||
(skb->len <= dst_mtu(&rt->dst) &&
ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
@@ -1323,9 +1316,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
if (cork->flags & IPCORK_OPT)
opt = cork->opt;
- if (cork->ttl != 0)
- ttl = cork->ttl;
- else if (rt->rt_type == RTN_MULTICAST)
+ if (rt->rt_type == RTN_MULTICAST)
ttl = inet->mc_ttl;
else
ttl = ip_select_ttl(inet, &rt->dst);
@@ -1333,7 +1324,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
- iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
+ iph->tos = inet->tos;
iph->frag_off = df;
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
@@ -1345,7 +1336,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
ip_options_build(skb, opt, cork->addr, rt, 0);
}
- skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
+ skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
/*
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
@@ -1495,8 +1486,6 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
ipc.addr = daddr;
ipc.opt = NULL;
ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
if (replyopts.opt.opt.optlen) {
ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ddf32a6..d9c4f11 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -189,7 +189,7 @@ EXPORT_SYMBOL(ip_cmsg_recv);
int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
{
- int err, val;
+ int err;
struct cmsghdr *cmsg;
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
@@ -215,24 +215,6 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
ipc->addr = info->ipi_spec_dst.s_addr;
break;
}
- case IP_TTL:
- if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
- return -EINVAL;
- val = *(int *)CMSG_DATA(cmsg);
- if (val < 1 || val > 255)
- return -EINVAL;
- ipc->ttl = val;
- break;
- case IP_TOS:
- if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
- return -EINVAL;
- val = *(int *)CMSG_DATA(cmsg);
- if (val < 0 || val > 255)
- return -EINVAL;
- ipc->tos = val;
- ipc->priority = rt_tos2priority(ipc->tos);
- break;
-
default:
return -EINVAL;
}
@@ -386,7 +368,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
/*
* Handle MSG_ERRQUEUE
*/
-int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
{
struct sock_exterr_skb *serr;
struct sk_buff *skb, *skb2;
@@ -423,7 +405,6 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
serr->addr_offset);
sin->sin_port = serr->port;
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
- *addr_len = sizeof(*sin);
}
memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
@@ -628,7 +609,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
inet->nodefrag = val ? 1 : 0;
break;
case IP_MTU_DISCOVER:
- if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE)
+ if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
goto e_inval;
inet->pmtudisc = val;
break;
@@ -1053,12 +1034,11 @@ e_inval:
* destination in skb->cb[] before dst drop.
* This way, receiver doesnt make cache line misses to read rtable.
*/
-void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
+void ipv4_pktinfo_prepare(struct sk_buff *skb)
{
struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
- if ((inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) &&
- skb_rtable(skb)) {
+ if (skb_rtable(skb)) {
pktinfo->ipi_ifindex = inet_iif(skb);
pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
} else {
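
The ip_cmsg_send() hunk above removes the per-message IP_TTL and IP_TOS controls, so a sender attaching them as ancillary data now falls through to the default case and gets -EINVAL. A hedged userspace sketch of such a sender; on kernels that kept the v3.13 code this succeeds, while on this reverted tree sendmsg() fails with EINVAL:

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port = htons(9), /* discard service */
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
	};
	char payload[] = "hi";
	struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
	union { /* keep the control buffer aligned for cmsghdr */
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_name = &dst, .msg_namelen = sizeof(dst),
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
	int ttl = 5;

	cm->cmsg_level = IPPROTO_IP;
	cm->cmsg_type = IP_TTL; /* rejected by the reverted ip_cmsg_send() */
	cm->cmsg_len = CMSG_LEN(sizeof(ttl));
	memcpy(CMSG_DATA(cm), &ttl, sizeof(ttl));

	if (sendmsg(fd, &msg, 0) < 0)
		perror("sendmsg"); /* EINVAL on this tree */
	close(fd);
	return 0;
}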
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 90ff957..63a6d6d 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -454,8 +454,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
tstats->rx_bytes += skb->len;
u64_stats_update_end(&tstats->syncp);
- skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
-
if (tunnel->dev->type == ARPHRD_ETHER) {
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
@@ -463,6 +461,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
skb->dev = tunnel->dev;
}
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
+
gro_cells_receive(&tunnel->gro_cells, skb);
return 0;
@@ -976,19 +976,13 @@ int ip_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
- int i, err;
+ int err;
dev->destructor = ip_tunnel_dev_free;
dev->tstats = alloc_percpu(struct pcpu_tstats);
if (!dev->tstats)
return -ENOMEM;
- for_each_possible_cpu(i) {
- struct pcpu_tstats *ipt_stats;
- ipt_stats = per_cpu_ptr(dev->tstats, i);
- u64_stats_init(&ipt_stats->syncp);
- }
-
err = gro_cells_init(&tunnel->gro_cells, dev);
if (err) {
free_percpu(dev->tstats);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 42ffbc8..c31e3ad 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -116,36 +116,3 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
return 0;
}
EXPORT_SYMBOL_GPL(iptunnel_pull_header);
-
-struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
- bool csum_help,
- int gso_type_mask)
-{
- int err;
-
- if (likely(!skb->encapsulation)) {
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
- }
-
- if (skb_is_gso(skb)) {
- err = skb_unclone(skb, GFP_ATOMIC);
- if (unlikely(err))
- goto error;
- skb_shinfo(skb)->gso_type |= gso_type_mask;
- return skb;
- }
-
- if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
- err = skb_checksum_help(skb);
- if (unlikely(err))
- goto error;
- } else if (skb->ip_summed != CHECKSUM_PARTIAL)
- skb->ip_summed = CHECKSUM_NONE;
-
- return skb;
-error:
- kfree_skb(skb);
- return ERR_PTR(err);
-}
-EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 52b802a..6e87f85 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -49,6 +49,70 @@ static struct rtnl_link_ops vti_link_ops __read_mostly;
static int vti_net_id __read_mostly;
static int vti_tunnel_init(struct net_device *dev);
+static int vti_err(struct sk_buff *skb, u32 info)
+{
+
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ */
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct ip_tunnel *t;
+ int err;
+
+ switch (type) {
+ default:
+ case ICMP_PARAMETERPROB:
+ return 0;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return 0;
+ default:
+ /* All others are translated to HOST_UNREACH. */
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return 0;
+ break;
+ }
+
+ err = -ENOENT;
+
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
+ if (t == NULL)
+ goto out;
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ t->parms.link, 0, IPPROTO_IPIP, 0);
+ err = 0;
+ goto out;
+ }
+
+ err = 0;
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ goto out;
+
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+out:
+ return err;
+}
+
/* We dont digest the packet therefore let the packet pass */
static int vti_rcv(struct sk_buff *skb)
{
@@ -126,7 +190,6 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (!rt->dst.xfrm ||
rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
dev->stats.tx_carrier_errors++;
- ip_rt_put(rt);
goto tx_error_icmp;
}
tdev = rt->dst.dev;
@@ -241,8 +304,9 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev)
iph->ihl = 5;
}
-static struct xfrm_tunnel_notifier vti_handler __read_mostly = {
+static struct xfrm_tunnel vti_handler __read_mostly = {
.handler = vti_rcv,
+ .err_handler = vti_err,
.priority = 1,
};
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index fe3e9f7..7f80fb4 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -220,17 +220,17 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (unlikely(skb->protocol != htons(ETH_P_IP)))
goto tx_error;
- skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
- if (IS_ERR(skb))
- goto out;
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
return NETDEV_TX_OK;
tx_error:
- dev_kfree_skb(skb);
-out:
dev->stats.tx_errors++;
+ dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -275,7 +275,6 @@ static const struct net_device_ops ipip_netdev_ops = {
#define IPIP_FEATURES (NETIF_F_SG | \
NETIF_F_FRAGLIST | \
NETIF_F_HIGHDMA | \
- NETIF_F_GSO_SOFTWARE | \
NETIF_F_HW_CSUM)
static void ipip_tunnel_setup(struct net_device *dev)
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 40d5607..1657e39b 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,27 +36,6 @@ config NF_CONNTRACK_PROC_COMPAT
If unsure, say Y.
-config NF_TABLES_IPV4
- depends on NF_TABLES
- tristate "IPv4 nf_tables support"
-
-config NFT_REJECT_IPV4
- depends on NF_TABLES_IPV4
- tristate "nf_tables IPv4 reject support"
-
-config NFT_CHAIN_ROUTE_IPV4
- depends on NF_TABLES_IPV4
- tristate "IPv4 nf_tables route chain support"
-
-config NFT_CHAIN_NAT_IPV4
- depends on NF_TABLES_IPV4
- depends on NF_NAT_IPV4 && NFT_NAT
- tristate "IPv4 nf_tables nat chain support"
-
-config NF_TABLES_ARP
- depends on NF_TABLES
- tristate "ARP nf_tables support"
-
config IP_NF_IPTABLES
tristate "IP tables support (required for filtering/masq/NAT)"
default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 19df72b..3622b24 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -27,12 +27,6 @@ obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
# NAT protocols (nf_nat)
obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
-obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
-obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
-obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
-obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
-obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
-
# generic IP tables
obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 59da7cd..85a4f21 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -271,11 +271,6 @@ unsigned int arpt_do_table(struct sk_buff *skb,
local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private;
- /*
- * Ensure we load private-> members after we've fetched the base
- * pointer.
- */
- smp_read_barrier_depends();
table_base = private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 802ddec..a865f6f 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -27,14 +27,13 @@ static const struct xt_table packet_filter = {
/* The work comes in here from netfilter.c */
static unsigned int
-arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+arptable_filter_hook(unsigned int hook, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct net *net = dev_net((in != NULL) ? in : out);
- return arpt_do_table(skb, ops->hooknum, in, out,
- net->ipv4.arptable_filter);
+ return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter);
}
static struct nf_hook_ops *arpfilter_ops __read_mostly;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 61dbcd5..cb91101 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -340,11 +340,6 @@ ipt_do_table(struct sk_buff *skb,
addend = xt_write_recseq_begin();
private = table->private;
cpu = smp_processor_id();
- /*
- * Ensure we load private-> members after we've fetched the base
- * pointer.
- */
- smp_read_barrier_depends();
table_base = private->entries[cpu];
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
stackptr = per_cpu_ptr(private->stackptr, cpu);
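
Both hunks above remove the same thing: an smp_read_barrier_depends() between loading table->private and dereferencing its members, which only matters on architectures (DEC Alpha, in practice) that can reorder even data-dependent loads. A userspace analogue of the publish/consume pattern at stake, using C11 release/acquire in place of the kernel primitive (illustrative sketch, not kernel code):

#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>

struct xt_private {
    int hook_entry;
};

static _Atomic(struct xt_private *) table_private;

static void writer_publish(int entry)
{
    struct xt_private *p = malloc(sizeof(*p));
    p->hook_entry = entry;
    /* release: the member store above is visible before the pointer */
    atomic_store_explicit(&table_private, p, memory_order_release);
}

static int reader_lookup(void)
{
    /* acquire pairs with the release publish; with a plain load an
     * Alpha-class machine could observe a stale hook_entry */
    struct xt_private *p =
        atomic_load_explicit(&table_private, memory_order_acquire);
    return p ? p->hook_entry : -1;
}

int main(void)
{
    writer_publish(42);
    printf("hook_entry=%d\n", reader_lookup());
    return 0;
}
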
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2510c02..0b732ef 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -28,7 +28,6 @@
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/net_namespace.h>
-#include <net/netns/generic.h>
#include <net/checksum.h>
#include <net/ip.h>
@@ -58,21 +57,15 @@ struct clusterip_config {
struct rcu_head rcu;
};
-#ifdef CONFIG_PROC_FS
-static const struct file_operations clusterip_proc_fops;
-#endif
+static LIST_HEAD(clusterip_configs);
-static int clusterip_net_id __read_mostly;
-
-struct clusterip_net {
- struct list_head configs;
- /* lock protects the configs list */
- spinlock_t lock;
+/* clusterip_lock protects the clusterip_configs list */
+static DEFINE_SPINLOCK(clusterip_lock);
#ifdef CONFIG_PROC_FS
- struct proc_dir_entry *procdir;
+static const struct file_operations clusterip_proc_fops;
+static struct proc_dir_entry *clusterip_procdir;
#endif
-};
static inline void
clusterip_config_get(struct clusterip_config *c)
@@ -99,13 +92,10 @@ clusterip_config_put(struct clusterip_config *c)
static inline void
clusterip_config_entry_put(struct clusterip_config *c)
{
- struct net *net = dev_net(c->dev);
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
-
local_bh_disable();
- if (atomic_dec_and_lock(&c->entries, &cn->lock)) {
+ if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
list_del_rcu(&c->list);
- spin_unlock(&cn->lock);
+ spin_unlock(&clusterip_lock);
local_bh_enable();
dev_mc_del(c->dev, c->clustermac);
@@ -123,12 +113,11 @@ clusterip_config_entry_put(struct clusterip_config *c)
}
static struct clusterip_config *
-__clusterip_config_find(struct net *net, __be32 clusterip)
+__clusterip_config_find(__be32 clusterip)
{
struct clusterip_config *c;
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
- list_for_each_entry_rcu(c, &cn->configs, list) {
+ list_for_each_entry_rcu(c, &clusterip_configs, list) {
if (c->clusterip == clusterip)
return c;
}
@@ -137,12 +126,12 @@ __clusterip_config_find(struct net *net, __be32 clusterip)
}
static inline struct clusterip_config *
-clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
+clusterip_config_find_get(__be32 clusterip, int entry)
{
struct clusterip_config *c;
rcu_read_lock_bh();
- c = __clusterip_config_find(net, clusterip);
+ c = __clusterip_config_find(clusterip);
if (c) {
if (unlikely(!atomic_inc_not_zero(&c->refcount)))
c = NULL;
@@ -169,7 +158,6 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
struct net_device *dev)
{
struct clusterip_config *c;
- struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
c = kzalloc(sizeof(*c), GFP_ATOMIC);
if (!c)
@@ -192,7 +180,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
/* create proc dir entry */
sprintf(buffer, "%pI4", &ip);
c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
- cn->procdir,
+ clusterip_procdir,
&clusterip_proc_fops, c);
if (!c->pde) {
kfree(c);
@@ -201,9 +189,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
}
#endif
- spin_lock_bh(&cn->lock);
- list_add_rcu(&c->list, &cn->configs);
- spin_unlock_bh(&cn->lock);
+ spin_lock_bh(&clusterip_lock);
+ list_add_rcu(&c->list, &clusterip_configs);
+ spin_unlock_bh(&clusterip_lock);
return c;
}
@@ -382,7 +370,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
/* FIXME: further sanity checks */
- config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
+ config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
if (!config) {
if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
pr_info("no config found for %pI4, need 'new'\n",
@@ -396,7 +384,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
- dev = dev_get_by_name(par->net, e->ip.iniface);
+ dev = dev_get_by_name(&init_net, e->ip.iniface);
if (!dev) {
pr_info("no such interface %s\n",
e->ip.iniface);
@@ -495,7 +483,7 @@ static void arp_print(struct arp_payload *payload)
#endif
static unsigned int
-arp_mangle(const struct nf_hook_ops *ops,
+arp_mangle(unsigned int hook,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -504,7 +492,6 @@ arp_mangle(const struct nf_hook_ops *ops,
struct arphdr *arp = arp_hdr(skb);
struct arp_payload *payload;
struct clusterip_config *c;
- struct net *net = dev_net(in ? in : out);
/* we don't care about non-ethernet and non-ipv4 ARP */
if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
@@ -521,7 +508,7 @@ arp_mangle(const struct nf_hook_ops *ops,
/* if there is no clusterip configuration for the arp reply's
* source ip, we don't want to mangle it */
- c = clusterip_config_find_get(net, payload->src_ip, 0);
+ c = clusterip_config_find_get(payload->src_ip, 0);
if (!c)
return NF_ACCEPT;
@@ -711,75 +698,48 @@ static const struct file_operations clusterip_proc_fops = {
#endif /* CONFIG_PROC_FS */
-static int clusterip_net_init(struct net *net)
-{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
-
- INIT_LIST_HEAD(&cn->configs);
-
- spin_lock_init(&cn->lock);
-
-#ifdef CONFIG_PROC_FS
- cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
- if (!cn->procdir) {
- pr_err("Unable to proc dir entry\n");
- return -ENOMEM;
- }
-#endif /* CONFIG_PROC_FS */
-
- return 0;
-}
-
-static void clusterip_net_exit(struct net *net)
-{
-#ifdef CONFIG_PROC_FS
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
- proc_remove(cn->procdir);
-#endif
-}
-
-static struct pernet_operations clusterip_net_ops = {
- .init = clusterip_net_init,
- .exit = clusterip_net_exit,
- .id = &clusterip_net_id,
- .size = sizeof(struct clusterip_net),
-};
-
static int __init clusterip_tg_init(void)
{
int ret;
- ret = register_pernet_subsys(&clusterip_net_ops);
- if (ret < 0)
- return ret;
-
ret = xt_register_target(&clusterip_tg_reg);
if (ret < 0)
- goto cleanup_subsys;
+ return ret;
ret = nf_register_hook(&cip_arp_ops);
if (ret < 0)
goto cleanup_target;
+#ifdef CONFIG_PROC_FS
+ clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
+ if (!clusterip_procdir) {
+ pr_err("Unable to proc dir entry\n");
+ ret = -ENOMEM;
+ goto cleanup_hook;
+ }
+#endif /* CONFIG_PROC_FS */
+
pr_info("ClusterIP Version %s loaded successfully\n",
CLUSTERIP_VERSION);
-
return 0;
+#ifdef CONFIG_PROC_FS
+cleanup_hook:
+ nf_unregister_hook(&cip_arp_ops);
+#endif /* CONFIG_PROC_FS */
cleanup_target:
xt_unregister_target(&clusterip_tg_reg);
-cleanup_subsys:
- unregister_pernet_subsys(&clusterip_net_ops);
return ret;
}
static void __exit clusterip_tg_exit(void)
{
pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
-
+#ifdef CONFIG_PROC_FS
+ proc_remove(clusterip_procdir);
+#endif
nf_unregister_hook(&cip_arp_ops);
xt_unregister_target(&clusterip_tg_reg);
- unregister_pernet_subsys(&clusterip_net_ops);
/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
rcu_barrier_bh();
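
Whichever side of the namespace conversion one is on, the lookup path above keeps the same shape: walk an RCU-protected list and take a reference only through atomic_inc_not_zero(), so a config whose refcount already reached zero can never be resurrected. A self-contained sketch of that try-get rule using C11 atomics (single-threaded here; the kernel pairs it with rcu_read_lock_bh(); names hypothetical):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct config {
    atomic_int refcount;
    unsigned int clusterip;
};

static bool config_try_get(struct config *c)
{
    int old = atomic_load(&c->refcount);

    /* inc_not_zero: a CAS loop that refuses to revive a dead object */
    while (old != 0) {
        if (atomic_compare_exchange_weak(&c->refcount, &old, old + 1))
            return true;
    }
    return false;
}

int main(void)
{
    struct config live  = { 1, 0x0a000001 };
    struct config dying = { 0, 0x0a000002 };

    printf("live:  %d\n", config_try_get(&live));   /* 1: ref taken */
    printf("dying: %d\n", config_try_get(&dying));  /* 0: refused */
    return 0;
}
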
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index f13bd91..b6346bf 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -244,7 +244,6 @@ synproxy_recv_client_ack(const struct synproxy_net *snet,
this_cpu_inc(snet->stats->cookie_valid);
opts->mss = mss;
- opts->options |= XT_SYNPROXY_OPT_MSS;
if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy_check_timestamp_cookie(opts);
@@ -298,7 +297,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
+static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 9cb993c..cbc2215 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -220,7 +220,6 @@ static void ipt_ulog_packet(struct net *net,
ub->qlen++;
pm = nlmsg_data(nlh);
- memset(pm, 0, sizeof(*pm));
/* We might not have a timestamp, get one */
if (skb->tstamp.tv64 == 0)
@@ -239,6 +238,8 @@ static void ipt_ulog_packet(struct net *net,
}
else if (loginfo->prefix[0] != '\0')
strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
+ else
+ *(pm->prefix) = '\0';
if (in && in->hard_header_len > 0 &&
skb->mac_header != skb->network_header &&
@@ -250,9 +251,13 @@ static void ipt_ulog_packet(struct net *net,
if (in)
strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
+ else
+ pm->indev_name[0] = '\0';
if (out)
strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
+ else
+ pm->outdev_name[0] = '\0';
/* copy_len <= skb->len, so can't fail. */
if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
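
The explicit '\0' branches re-added above exist because the revert also drops the single memset() that zeroed the whole message in v3.13; without that memset, any field not written on this path would hand stale kernel memory to userspace. A reduced illustration (field sizes hypothetical):

#include <stdio.h>
#include <string.h>

struct ulog_msg {
    char prefix[32];
    char indev_name[16];
};

static void fill_msg(struct ulog_msg *pm, const char *prefix,
                     const char *indev)
{
    if (prefix)
        strncpy(pm->prefix, prefix, sizeof(pm->prefix));
    else
        pm->prefix[0] = '\0';   /* the branch the revert restores */

    if (indev)
        strncpy(pm->indev_name, indev, sizeof(pm->indev_name));
    else
        pm->indev_name[0] = '\0';
}

int main(void)
{
    struct ulog_msg pm;

    memset(&pm, 0xAA, sizeof(pm));  /* simulate stale buffer contents */
    fill_msg(&pm, NULL, "eth0");
    printf("prefix='%s' indev='%s'\n", pm.prefix, pm.indev_name);
    return 0;   /* empty prefix instead of 0xAA garbage */
}
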
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index e08a74a..50af5b4 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -33,21 +33,20 @@ static const struct xt_table packet_filter = {
};
static unsigned int
-iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct net *net;
- if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ if (hook == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* root is playing with raw sockets. */
return NF_ACCEPT;
net = dev_net((in != NULL) ? in : out);
- return ipt_do_table(skb, ops->hooknum, in, out,
- net->ipv4.iptable_filter);
+ return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter);
}
static struct nf_hook_ops *filter_ops __read_mostly;
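
This hunk is the first of many identical prototype changes in the series: the v3.13 hooks received the registered struct nf_hook_ops and read ops->hooknum, while the v3.12 form restored here takes the hook number directly. The difference, reduced to a compilable toy (types are stand-ins, not the kernel's):

#include <stdio.h>

struct nf_hook_ops_toy {
    unsigned int hooknum;
};

/* v3.13 style: the hook's identity travels with the registration record */
static unsigned int hook_new(const struct nf_hook_ops_toy *ops)
{
    return ops->hooknum;
}

/* v3.12 style, restored by this revert: the caller passes the number */
static unsigned int hook_old(unsigned int hooknum)
{
    return hooknum;
}

int main(void)
{
    struct nf_hook_ops_toy ops = { .hooknum = 3 };
    printf("%u %u\n", hook_new(&ops), hook_old(3));  /* 3 3 */
    return 0;
}
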
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 6a5079c..0d8cd82 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -79,19 +79,19 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
/* The work comes in here from netfilter.c. */
static unsigned int
-iptable_mangle_hook(const struct nf_hook_ops *ops,
+iptable_mangle_hook(unsigned int hook,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- if (ops->hooknum == NF_INET_LOCAL_OUT)
+ if (hook == NF_INET_LOCAL_OUT)
return ipt_mangle_out(skb, out);
- if (ops->hooknum == NF_INET_POST_ROUTING)
- return ipt_do_table(skb, ops->hooknum, in, out,
+ if (hook == NF_INET_POST_ROUTING)
+ return ipt_do_table(skb, hook, in, out,
dev_net(out)->ipv4.iptable_mangle);
/* PREROUTING/INPUT/FORWARD: */
- return ipt_do_table(skb, ops->hooknum, in, out,
+ return ipt_do_table(skb, hook, in, out,
dev_net(in)->ipv4.iptable_mangle);
}
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index ee28861..683bfaf 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -61,7 +61,7 @@ static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
}
static unsigned int
-nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
+nf_nat_ipv4_fn(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -71,7 +71,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
enum ip_conntrack_info ctinfo;
struct nf_conn_nat *nat;
/* maniptype == SRC for postrouting. */
- enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
/* We never see fragments: conntrack defrags on pre-routing
* and local-out, and nf_nat_out protects post-routing.
@@ -108,7 +108,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
case IP_CT_RELATED_REPLY:
if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
- ops->hooknum))
+ hooknum))
return NF_DROP;
else
return NF_ACCEPT;
@@ -121,14 +121,14 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
if (!nf_nat_initialized(ct, maniptype)) {
unsigned int ret;
- ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct);
+ ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
if (ret != NF_ACCEPT)
return ret;
} else {
pr_debug("Already setup manip %s for ct %p\n",
maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
ct);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
+ if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
goto oif_changed;
}
break;
@@ -137,11 +137,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
/* ESTABLISHED */
NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
ctinfo == IP_CT_ESTABLISHED_REPLY);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
+ if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
goto oif_changed;
}
- return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+ return nf_nat_packet(ct, ctinfo, hooknum, skb);
oif_changed:
nf_ct_kill_acct(ct, ctinfo, skb);
@@ -149,7 +149,7 @@ oif_changed:
}
static unsigned int
-nf_nat_ipv4_in(const struct nf_hook_ops *ops,
+nf_nat_ipv4_in(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -158,7 +158,7 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops,
unsigned int ret;
__be32 daddr = ip_hdr(skb)->daddr;
- ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
+ ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
if (ret != NF_DROP && ret != NF_STOLEN &&
daddr != ip_hdr(skb)->daddr)
skb_dst_drop(skb);
@@ -167,7 +167,7 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops,
}
static unsigned int
-nf_nat_ipv4_out(const struct nf_hook_ops *ops,
+nf_nat_ipv4_out(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -185,7 +185,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
+ ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
#ifdef CONFIG_XFRM
if (ret != NF_DROP && ret != NF_STOLEN &&
!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -207,7 +207,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops,
}
static unsigned int
-nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+nf_nat_ipv4_local_fn(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -223,7 +223,7 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
+ ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
if (ret != NF_DROP && ret != NF_STOLEN &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
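
The comment "maniptype == SRC for postrouting" above encodes the NAT placement rule: source manipulation is bound to POST_ROUTING and LOCAL_IN, destination manipulation to PRE_ROUTING and LOCAL_OUT. A stand-alone sketch of the kernel's HOOK2MANIP idea (enum values here are illustrative, not the kernel's):

#include <stdio.h>

enum hook  { PRE_ROUTING, LOCAL_IN, FORWARD, LOCAL_OUT, POST_ROUTING };
enum manip { MANIP_SRC, MANIP_DST };

static enum manip hook2manip(enum hook h)
{
    /* SNAT where packets leave the stack's view, DNAT where they enter */
    return (h == POST_ROUTING || h == LOCAL_IN) ? MANIP_SRC : MANIP_DST;
}

int main(void)
{
    printf("PRE_ROUTING  -> %s\n",
           hook2manip(PRE_ROUTING) == MANIP_SRC ? "SRC" : "DST");
    printf("POST_ROUTING -> %s\n",
           hook2manip(POST_ROUTING) == MANIP_SRC ? "SRC" : "DST");
    return 0;
}
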
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index b2f7e8f..1f82aea 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -20,20 +20,20 @@ static const struct xt_table packet_raw = {
/* The work comes in here from netfilter.c. */
static unsigned int
-iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+iptable_raw_hook(unsigned int hook, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct net *net;
- if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ if (hook == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* root is playing with raw sockets. */
return NF_ACCEPT;
net = dev_net((in != NULL) ? in : out);
- return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw);
+ return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw);
}
static struct nf_hook_ops *rawtable_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index c86647e..f867a8d 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -37,22 +37,21 @@ static const struct xt_table security_table = {
};
static unsigned int
-iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+iptable_security_hook(unsigned int hook, struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct net *net;
- if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ if (hook == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* Somebody is playing with raw sockets. */
return NF_ACCEPT;
net = dev_net((in != NULL) ? in : out);
- return ipt_do_table(skb, ops->hooknum, in, out,
- net->ipv4.iptable_security);
+ return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security);
}
static struct nf_hook_ops *sectbl_ops __read_mostly;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index ecd8bec..86f5b34 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
return NF_ACCEPT;
}
-static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
+static unsigned int ipv4_helper(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -121,7 +121,7 @@ static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
ct, ctinfo);
}
-static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
+static unsigned int ipv4_confirm(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -147,16 +147,16 @@ out:
return nf_conntrack_confirm(skb);
}
-static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
+static unsigned int ipv4_conntrack_in(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb);
+ return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb);
}
-static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
+static unsigned int ipv4_conntrack_local(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -166,7 +166,7 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
if (skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb);
+ return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb);
}
/* Connection tracking may drop packets, but never alters them, so
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 12e13bd..7428155 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -60,7 +60,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
return IP_DEFRAG_CONNTRACK_OUT + zone;
}
-static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
+static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -83,9 +83,7 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
#endif
/* Gather fragments. */
if (ip_is_fragment(ip_hdr(skb))) {
- enum ip_defrag_users user =
- nf_ct_defrag_user(ops->hooknum, skb);
-
+ enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb);
if (nf_ct_ipv4_gather_frags(skb, user))
return NF_STOLEN;
}
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
deleted file mode 100644
index 3e67ef1..0000000
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/netfilter_arp.h>
-#include <net/netfilter/nf_tables.h>
-
-static struct nft_af_info nft_af_arp __read_mostly = {
- .family = NFPROTO_ARP,
- .nhooks = NF_ARP_NUMHOOKS,
- .owner = THIS_MODULE,
-};
-
-static int nf_tables_arp_init_net(struct net *net)
-{
- net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
- if (net->nft.arp== NULL)
- return -ENOMEM;
-
- memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp));
-
- if (nft_register_afinfo(net, net->nft.arp) < 0)
- goto err;
-
- return 0;
-err:
- kfree(net->nft.arp);
- return -ENOMEM;
-}
-
-static void nf_tables_arp_exit_net(struct net *net)
-{
- nft_unregister_afinfo(net->nft.arp);
- kfree(net->nft.arp);
-}
-
-static struct pernet_operations nf_tables_arp_net_ops = {
- .init = nf_tables_arp_init_net,
- .exit = nf_tables_arp_exit_net,
-};
-
-static unsigned int
-nft_do_chain_arp(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct nft_pktinfo pkt;
-
- nft_set_pktinfo(&pkt, ops, skb, in, out);
-
- return nft_do_chain_pktinfo(&pkt, ops);
-}
-
-static struct nf_chain_type filter_arp = {
- .family = NFPROTO_ARP,
- .name = "filter",
- .type = NFT_CHAIN_T_DEFAULT,
- .hook_mask = (1 << NF_ARP_IN) |
- (1 << NF_ARP_OUT) |
- (1 << NF_ARP_FORWARD),
- .fn = {
- [NF_ARP_IN] = nft_do_chain_arp,
- [NF_ARP_OUT] = nft_do_chain_arp,
- [NF_ARP_FORWARD] = nft_do_chain_arp,
- },
-};
-
-static int __init nf_tables_arp_init(void)
-{
- int ret;
-
- nft_register_chain_type(&filter_arp);
- ret = register_pernet_subsys(&nf_tables_arp_net_ops);
- if (ret < 0)
- nft_unregister_chain_type(&filter_arp);
-
- return ret;
-}
-
-static void __exit nf_tables_arp_exit(void)
-{
- unregister_pernet_subsys(&nf_tables_arp_net_ops);
- nft_unregister_chain_type(&filter_arp);
-}
-
-module_init(nf_tables_arp_init);
-module_exit(nf_tables_arp_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */
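
The init/exit shape seen throughout these deleted files, and in the restored clusterip_tg_init() earlier, is the same idiom: acquire resources in order, unwind in reverse with gotos on failure. A userspace skeleton of that pattern with stubbed steps (every name is a placeholder):

#include <stdio.h>

static int step_a(void)  { return 0; }
static int step_b(void)  { return -1; }   /* simulate a failure */
static void undo_a(void) { puts("undo a"); }

static int module_init_like(void)
{
    int ret;

    ret = step_a();
    if (ret < 0)
        return ret;

    ret = step_b();
    if (ret < 0)
        goto cleanup_a;       /* unwind only what already succeeded */

    return 0;

cleanup_a:
    undo_a();
    return ret;
}

int main(void)
{
    printf("init -> %d\n", module_init_like());
    return 0;
}
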
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
deleted file mode 100644
index 0f4cbfe..0000000
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/ip.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/net_namespace.h>
-#include <net/ip.h>
-#include <net/netfilter/nf_tables_ipv4.h>
-
-static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct nft_pktinfo pkt;
-
- if (unlikely(skb->len < sizeof(struct iphdr) ||
- ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
- if (net_ratelimit())
- pr_info("nf_tables_ipv4: ignoring short SOCK_RAW "
- "packet\n");
- return NF_ACCEPT;
- }
- nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
-
- return nft_do_chain_pktinfo(&pkt, ops);
-}
-
-static struct nft_af_info nft_af_ipv4 __read_mostly = {
- .family = NFPROTO_IPV4,
- .nhooks = NF_INET_NUMHOOKS,
- .owner = THIS_MODULE,
- .hooks = {
- [NF_INET_LOCAL_OUT] = nft_ipv4_output,
- },
-};
-
-static int nf_tables_ipv4_init_net(struct net *net)
-{
- net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
- if (net->nft.ipv4 == NULL)
- return -ENOMEM;
-
- memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4));
-
- if (nft_register_afinfo(net, net->nft.ipv4) < 0)
- goto err;
-
- return 0;
-err:
- kfree(net->nft.ipv4);
- return -ENOMEM;
-}
-
-static void nf_tables_ipv4_exit_net(struct net *net)
-{
- nft_unregister_afinfo(net->nft.ipv4);
- kfree(net->nft.ipv4);
-}
-
-static struct pernet_operations nf_tables_ipv4_net_ops = {
- .init = nf_tables_ipv4_init_net,
- .exit = nf_tables_ipv4_exit_net,
-};
-
-static unsigned int
-nft_do_chain_ipv4(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct nft_pktinfo pkt;
-
- nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
-
- return nft_do_chain_pktinfo(&pkt, ops);
-}
-
-static struct nf_chain_type filter_ipv4 = {
- .family = NFPROTO_IPV4,
- .name = "filter",
- .type = NFT_CHAIN_T_DEFAULT,
- .hook_mask = (1 << NF_INET_LOCAL_IN) |
- (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_FORWARD) |
- (1 << NF_INET_PRE_ROUTING) |
- (1 << NF_INET_POST_ROUTING),
- .fn = {
- [NF_INET_LOCAL_IN] = nft_do_chain_ipv4,
- [NF_INET_LOCAL_OUT] = nft_ipv4_output,
- [NF_INET_FORWARD] = nft_do_chain_ipv4,
- [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4,
- [NF_INET_POST_ROUTING] = nft_do_chain_ipv4,
- },
-};
-
-static int __init nf_tables_ipv4_init(void)
-{
- nft_register_chain_type(&filter_ipv4);
- return register_pernet_subsys(&nf_tables_ipv4_net_ops);
-}
-
-static void __exit nf_tables_ipv4_exit(void)
-{
- unregister_pernet_subsys(&nf_tables_ipv4_net_ops);
- nft_unregister_chain_type(&filter_ipv4);
-}
-
-module_init(nf_tables_ipv4_init);
-module_exit(nf_tables_ipv4_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_FAMILY(AF_INET);
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
deleted file mode 100644
index cf2c792..0000000
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
- * Copyright (c) 2012 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_ipv4.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/ip.h>
-
-/*
- * NAT chains
- */
-
-static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- struct nf_conn_nat *nat;
- enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
- struct nft_pktinfo pkt;
- unsigned int ret;
-
- if (ct == NULL || nf_ct_is_untracked(ct))
- return NF_ACCEPT;
-
- NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
-
- nat = nfct_nat(ct);
- if (nat == NULL) {
- /* Conntrack module was loaded late, can't add extension. */
- if (nf_ct_is_confirmed(ct))
- return NF_ACCEPT;
- nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
- if (nat == NULL)
- return NF_ACCEPT;
- }
-
- switch (ctinfo) {
- case IP_CT_RELATED:
- case IP_CT_RELATED + IP_CT_IS_REPLY:
- if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
- if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
- ops->hooknum))
- return NF_DROP;
- else
- return NF_ACCEPT;
- }
- /* Fall through */
- case IP_CT_NEW:
- if (nf_nat_initialized(ct, maniptype))
- break;
-
- nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
-
- ret = nft_do_chain_pktinfo(&pkt, ops);
- if (ret != NF_ACCEPT)
- return ret;
- if (!nf_nat_initialized(ct, maniptype)) {
- ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
- if (ret != NF_ACCEPT)
- return ret;
- }
- default:
- break;
- }
-
- return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
-}
-
-static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- __be32 daddr = ip_hdr(skb)->daddr;
- unsigned int ret;
-
- ret = nf_nat_fn(ops, skb, in, out, okfn);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- ip_hdr(skb)->daddr != daddr) {
- skb_dst_drop(skb);
- }
- return ret;
-}
-
-static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- enum ip_conntrack_info ctinfo __maybe_unused;
- const struct nf_conn *ct __maybe_unused;
- unsigned int ret;
-
- ret = nf_nat_fn(ops, skb, in, out, okfn);
-#ifdef CONFIG_XFRM
- if (ret != NF_DROP && ret != NF_STOLEN &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if (ct->tuplehash[dir].tuple.src.u3.ip !=
- ct->tuplehash[!dir].tuple.dst.u3.ip ||
- ct->tuplehash[dir].tuple.src.u.all !=
- ct->tuplehash[!dir].tuple.dst.u.all)
- return nf_xfrm_me_harder(skb, AF_INET) == 0 ?
- ret : NF_DROP;
- }
-#endif
- return ret;
-}
-
-static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- enum ip_conntrack_info ctinfo;
- const struct nf_conn *ct;
- unsigned int ret;
-
- ret = nf_nat_fn(ops, skb, in, out, okfn);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if (ct->tuplehash[dir].tuple.dst.u3.ip !=
- ct->tuplehash[!dir].tuple.src.u3.ip) {
- if (ip_route_me_harder(skb, RTN_UNSPEC))
- ret = NF_DROP;
- }
-#ifdef CONFIG_XFRM
- else if (ct->tuplehash[dir].tuple.dst.u.all !=
- ct->tuplehash[!dir].tuple.src.u.all)
- if (nf_xfrm_me_harder(skb, AF_INET))
- ret = NF_DROP;
-#endif
- }
- return ret;
-}
-
-static struct nf_chain_type nft_chain_nat_ipv4 = {
- .family = NFPROTO_IPV4,
- .name = "nat",
- .type = NFT_CHAIN_T_NAT,
- .hook_mask = (1 << NF_INET_PRE_ROUTING) |
- (1 << NF_INET_POST_ROUTING) |
- (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_LOCAL_IN),
- .fn = {
- [NF_INET_PRE_ROUTING] = nf_nat_prerouting,
- [NF_INET_POST_ROUTING] = nf_nat_postrouting,
- [NF_INET_LOCAL_OUT] = nf_nat_output,
- [NF_INET_LOCAL_IN] = nf_nat_fn,
- },
- .me = THIS_MODULE,
-};
-
-static int __init nft_chain_nat_init(void)
-{
- int err;
-
- err = nft_register_chain_type(&nft_chain_nat_ipv4);
- if (err < 0)
- return err;
-
- return 0;
-}
-
-static void __exit nft_chain_nat_exit(void)
-{
- nft_unregister_chain_type(&nft_chain_nat_ipv4);
-}
-
-module_init(nft_chain_nat_init);
-module_exit(nft_chain_nat_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
deleted file mode 100644
index 4e6bf9a..0000000
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_ipv4.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- unsigned int ret;
- struct nft_pktinfo pkt;
- u32 mark;
- __be32 saddr, daddr;
- u_int8_t tos;
- const struct iphdr *iph;
-
- /* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr))
- return NF_ACCEPT;
-
- nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
-
- mark = skb->mark;
- iph = ip_hdr(skb);
- saddr = iph->saddr;
- daddr = iph->daddr;
- tos = iph->tos;
-
- ret = nft_do_chain_pktinfo(&pkt, ops);
- if (ret != NF_DROP && ret != NF_QUEUE) {
- iph = ip_hdr(skb);
-
- if (iph->saddr != saddr ||
- iph->daddr != daddr ||
- skb->mark != mark ||
- iph->tos != tos)
- if (ip_route_me_harder(skb, RTN_UNSPEC))
- ret = NF_DROP;
- }
- return ret;
-}
-
-static struct nf_chain_type nft_chain_route_ipv4 = {
- .family = NFPROTO_IPV4,
- .name = "route",
- .type = NFT_CHAIN_T_ROUTE,
- .hook_mask = (1 << NF_INET_LOCAL_OUT),
- .fn = {
- [NF_INET_LOCAL_OUT] = nf_route_table_hook,
- },
- .me = THIS_MODULE,
-};
-
-static int __init nft_chain_route_init(void)
-{
- return nft_register_chain_type(&nft_chain_route_ipv4);
-}
-
-static void __exit nft_chain_route_exit(void)
-{
- nft_unregister_chain_type(&nft_chain_route_ipv4);
-}
-
-module_init(nft_chain_route_init);
-module_exit(nft_chain_route_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(AF_INET, "route");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
deleted file mode 100644
index fff5ba1..0000000
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/icmp.h>
-
-struct nft_reject {
- enum nft_reject_types type:8;
- u8 icmp_code;
-};
-
-static void nft_reject_eval(const struct nft_expr *expr,
- struct nft_data data[NFT_REG_MAX + 1],
- const struct nft_pktinfo *pkt)
-{
- struct nft_reject *priv = nft_expr_priv(expr);
-
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- icmp_send(pkt->skb, ICMP_DEST_UNREACH, priv->icmp_code, 0);
- break;
- case NFT_REJECT_TCP_RST:
- break;
- }
-
- data[NFT_REG_VERDICT].verdict = NF_DROP;
-}
-
-static const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
- [NFTA_REJECT_TYPE] = { .type = NLA_U32 },
- [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 },
-};
-
-static int nft_reject_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
-{
- struct nft_reject *priv = nft_expr_priv(expr);
-
- if (tb[NFTA_REJECT_TYPE] == NULL)
- return -EINVAL;
-
- priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
- return -EINVAL;
- priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
- case NFT_REJECT_TCP_RST:
- break;
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
-{
- const struct nft_reject *priv = nft_expr_priv(expr);
-
- if (nla_put_be32(skb, NFTA_REJECT_TYPE, priv->type))
- goto nla_put_failure;
-
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
- goto nla_put_failure;
- break;
- }
-
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
-static struct nft_expr_type nft_reject_type;
-static const struct nft_expr_ops nft_reject_ops = {
- .type = &nft_reject_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
- .eval = nft_reject_eval,
- .init = nft_reject_init,
- .dump = nft_reject_dump,
-};
-
-static struct nft_expr_type nft_reject_type __read_mostly = {
- .name = "reject",
- .ops = &nft_reject_ops,
- .policy = nft_reject_policy,
- .maxattr = NFTA_REJECT_MAX,
- .owner = THIS_MODULE,
-};
-
-static int __init nft_reject_module_init(void)
-{
- return nft_register_expr(&nft_reject_type);
-}
-
-static void __exit nft_reject_module_exit(void)
-{
- nft_unregister_expr(&nft_reject_type);
-}
-
-module_init(nft_reject_module_init);
-module_exit(nft_reject_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("reject");
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 242e7f4..d7d9882 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -202,14 +202,15 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
#if IS_ENABLED(CONFIG_IPV6)
} else if (skb->protocol == htons(ETH_P_IPV6) &&
sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk,
(int) isk->inet_num,
- &sk->sk_v6_rcv_saddr,
+ &inet6_sk(sk)->rcv_saddr,
sk->sk_bound_dev_if);
- if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
- !ipv6_addr_equal(&sk->sk_v6_rcv_saddr,
+ if (!ipv6_addr_any(&np->rcv_saddr) &&
+ !ipv6_addr_equal(&np->rcv_saddr,
&ipv6_hdr(skb)->daddr))
continue;
#endif
@@ -236,11 +237,11 @@ static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
unsigned int seq;
do {
- seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock);
+ seq = read_seqbegin(&sysctl_local_ports.lock);
*low = data[0];
*high = data[1];
- } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq));
+ } while (read_seqretry(&sysctl_local_ports.lock, seq));
}
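
The do/while loop above is the standard seqlock read side: snapshot the sequence counter, read the data, and retry if a writer was active (odd counter) or finished in between. A single-threaded userspace model of that protocol (names hypothetical, no real concurrency here):

#include <stdio.h>

struct seq_range {
    unsigned int seq;   /* even = stable, odd = write in progress */
    int range[2];
};

static unsigned int read_begin(const struct seq_range *s)
{
    return s->seq;
}

static int read_retry(const struct seq_range *s, unsigned int start)
{
    /* retry if a write was in flight (odd) or completed meanwhile */
    return (start & 1) || s->seq != start;
}

static void write_range(struct seq_range *s, int lo, int hi)
{
    s->seq++;   /* odd: concurrent readers will retry */
    s->range[0] = lo;
    s->range[1] = hi;
    s->seq++;   /* even again: the snapshot is stable */
}

int main(void)
{
    struct seq_range s = { 0, { 32768, 61000 } };
    unsigned int start;
    int lo, hi;

    do {
        start = read_begin(&s);
        lo = s.range[0];
        hi = s.range[1];
    } while (read_retry(&s, start));

    write_range(&s, 10000, 65535);
    printf("read %d-%d, now %d-%d\n", lo, hi, s.range[0], s.range[1]);
    return 0;
}
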
@@ -361,7 +362,7 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
} else if (saddr->sa_family == AF_INET6) {
struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr;
struct ipv6_pinfo *np = inet6_sk(sk);
- sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr;
+ np->rcv_saddr = np->saddr = addr->sin6_addr;
#endif
}
}
@@ -375,7 +376,7 @@ static void ping_clear_saddr(struct sock *sk, int dif)
#if IS_ENABLED(CONFIG_IPV6)
} else if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
+ memset(&np->rcv_saddr, 0, sizeof(np->rcv_saddr));
memset(&np->saddr, 0, sizeof(np->saddr));
#endif
}
@@ -415,12 +416,10 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
(int)sk->sk_bound_dev_if);
err = 0;
- if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
+ if ((sk->sk_family == AF_INET && isk->inet_rcv_saddr) ||
+ (sk->sk_family == AF_INET6 &&
+ !ipv6_addr_any(&inet6_sk(sk)->rcv_saddr)))
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
-#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr))
- sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
-#endif
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
@@ -430,7 +429,7 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
- memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr));
+ memset(&inet6_sk(sk)->daddr, 0, sizeof(inet6_sk(sk)->daddr));
#endif
sk_dst_reset(sk);
@@ -714,8 +713,6 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.opt = NULL;
ipc.oif = sk->sk_bound_dev_if;
ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
sock_tx_timestamp(sk, &ipc.tx_flags);
@@ -747,7 +744,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
return -EINVAL;
faddr = ipc.opt->opt.faddr;
}
- tos = get_rttos(&ipc, inet);
+ tos = RT_TOS(inet->tos);
if (sock_flag(sk, SOCK_LOCALROUTE) ||
(msg->msg_flags & MSG_DONTROUTE) ||
(ipc.opt && ipc.opt->opt.is_strictroute)) {
@@ -772,7 +769,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
err = PTR_ERR(rt);
rt = NULL;
if (err == -ENETUNREACH)
- IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
goto out;
}
@@ -830,6 +827,8 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
{
struct inet_sock *isk = inet_sk(sk);
int family = sk->sk_family;
+ struct sockaddr_in *sin;
+ struct sockaddr_in6 *sin6;
struct sk_buff *skb;
int copied, err;
@@ -839,13 +838,19 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (flags & MSG_OOB)
goto out;
+ if (addr_len) {
+ if (family == AF_INET)
+ *addr_len = sizeof(*sin);
+ else if (family == AF_INET6 && addr_len)
+ *addr_len = sizeof(*sin6);
+ }
+
if (flags & MSG_ERRQUEUE) {
if (family == AF_INET) {
- return ip_recv_error(sk, msg, len, addr_len);
+ return ip_recv_error(sk, msg, len);
#if IS_ENABLED(CONFIG_IPV6)
} else if (family == AF_INET6) {
- return pingv6_ops.ipv6_recv_error(sk, msg, len,
- addr_len);
+ return pingv6_ops.ipv6_recv_error(sk, msg, len);
#endif
}
}
@@ -869,15 +874,11 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* Copy the address and add cmsg data. */
if (family == AF_INET) {
- struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
-
- if (sin) {
- sin->sin_family = AF_INET;
- sin->sin_port = 0 /* skb->h.uh->source */;
- sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
- memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
- *addr_len = sizeof(*sin);
- }
+ sin = (struct sockaddr_in *) msg->msg_name;
+ sin->sin_family = AF_INET;
+ sin->sin_port = 0 /* skb->h.uh->source */;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
if (isk->cmsg_flags)
ip_cmsg_recv(msg, skb);
@@ -886,21 +887,17 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
} else if (family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
struct ipv6hdr *ip6 = ipv6_hdr(skb);
- struct sockaddr_in6 *sin6 =
- (struct sockaddr_in6 *)msg->msg_name;
-
- if (sin6) {
- sin6->sin6_family = AF_INET6;
- sin6->sin6_port = 0;
- sin6->sin6_addr = ip6->saddr;
- sin6->sin6_flowinfo = 0;
- if (np->sndflow)
- sin6->sin6_flowinfo = ip6_flowinfo(ip6);
- sin6->sin6_scope_id =
- ipv6_iface_scope_id(&sin6->sin6_addr,
- IP6CB(skb)->iif);
- *addr_len = sizeof(*sin6);
- }
+ sin6 = (struct sockaddr_in6 *) msg->msg_name;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = 0;
+ sin6->sin6_addr = ip6->saddr;
+
+ sin6->sin6_flowinfo = 0;
+ if (np->sndflow)
+ sin6->sin6_flowinfo = ip6_flowinfo(ip6);
+
+ sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
+ IP6CB(skb)->iif);
if (inet6_sk(sk)->rxopt.all)
pingv6_ops.ip6_datagram_recv_ctl(sk, msg, skb);
@@ -1076,7 +1073,7 @@ void ping_seq_stop(struct seq_file *seq, void *v)
EXPORT_SYMBOL_GPL(ping_seq_stop);
static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
- int bucket)
+ int bucket, int *len)
{
struct inet_sock *inet = inet_sk(sp);
__be32 dest = inet->inet_daddr;
@@ -1085,7 +1082,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
__u16 srcp = ntohs(inet->inet_sport);
seq_printf(f, "%5d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
bucket, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
@@ -1093,22 +1090,23 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
0, sock_i_ino(sp),
atomic_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ atomic_read(&sp->sk_drops), len);
}
static int ping_v4_seq_show(struct seq_file *seq, void *v)
{
- seq_setwidth(seq, 127);
if (v == SEQ_START_TOKEN)
- seq_puts(seq, " sl local_address rem_address st tx_queue "
+ seq_printf(seq, "%-127s\n",
+ " sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
"inode ref pointer drops");
else {
struct ping_iter_state *state = seq->private;
+ int len;
- ping_v4_format_sock(v, seq, state->bucket);
+ ping_v4_format_sock(v, seq, state->bucket, &len);
+ seq_printf(seq, "%*s\n", 127 - len, "");
}
- seq_pad(seq, '\n');
return 0;
}
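
The %n-based formatting restored above is how these /proc handlers kept fixed-width records before seq_setwidth()/seq_pad() existed: %n stores the number of characters written so far, and a second printf pads the line out to 127 columns. A userspace demonstration:

#include <stdio.h>

int main(void)
{
    int len = 0;

    printf("%5d: %08X:%04X%n", 1, 0x0100007F, 0x0202, &len);
    printf("%*s\n", 127 - len, "");   /* pad to a 127-column record */
    printf("record width was %d\n", len);
    return 0;
}
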
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 46d6a1c..ce84846 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -31,6 +31,10 @@
const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
+/*
+ * Add a protocol handler to the hash tables
+ */
+
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
if (!prot->netns_ok) {
@@ -51,6 +55,10 @@ int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
}
EXPORT_SYMBOL(inet_add_offload);
+/*
+ * Remove a protocol from the hash tables.
+ */
+
int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
{
int ret;
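
The comments added above describe inet_add_protocol()/inet_del_protocol(), which amount to atomically claiming or clearing a slot in a fixed protocol array so that two handlers can never own the same protocol number. A userspace sketch of the claim side with C11 atomics (the kernel uses cmpxchg() on an RCU-read array; names here are placeholders):

#include <stdatomic.h>
#include <stdio.h>

#define MAX_PROTOS 256

struct proto_handler { const char *name; };

static _Atomic(struct proto_handler *) protos[MAX_PROTOS];

static int add_protocol(struct proto_handler *h, unsigned char num)
{
    struct proto_handler *expected = NULL;

    /* succeeds only if the slot was still empty */
    return atomic_compare_exchange_strong(&protos[num], &expected, h)
           ? 0 : -1;
}

int main(void)
{
    struct proto_handler tcp = { "tcp" }, dup = { "tcp-dup" };

    printf("%d\n", add_protocol(&tcp, 6));   /*  0: slot claimed */
    printf("%d\n", add_protocol(&dup, 6));   /* -1: already taken */
    return 0;
}
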
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 23c3e5b..193db03 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -299,7 +299,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
/* Charge it to the socket. */
- ipv4_pktinfo_prepare(sk, skb);
+ ipv4_pktinfo_prepare(skb);
if (sock_queue_rcv_skb(sk, skb) < 0) {
kfree_skb(skb);
return NET_RX_DROP;
@@ -519,8 +519,6 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.addr = inet->inet_saddr;
ipc.opt = NULL;
ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
@@ -560,7 +558,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
daddr = ipc.opt->opt.faddr;
}
}
- tos = get_rtconn_flags(&ipc, sk);
+ tos = RT_CONN_FLAGS(sk);
if (msg->msg_flags & MSG_DONTROUTE)
tos |= RTO_ONLINK;
@@ -696,8 +694,11 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (flags & MSG_OOB)
goto out;
+ if (addr_len)
+ *addr_len = sizeof(*sin);
+
if (flags & MSG_ERRQUEUE) {
- err = ip_recv_error(sk, msg, len, addr_len);
+ err = ip_recv_error(sk, msg, len);
goto out;
}
@@ -723,7 +724,6 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
sin->sin_port = 0;
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
- *addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
ip_cmsg_recv(msg, skb);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1417d01..fcbd95f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -299,7 +299,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
" %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
dst_entries_get_slow(&ipv4_dst_ops),
- 0, /* st->in_hit */
+ st->in_hit,
st->in_slow_tot,
st->in_slow_mc,
st->in_no_route,
@@ -307,16 +307,16 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
st->in_martian_dst,
st->in_martian_src,
- 0, /* st->out_hit */
+ st->out_hit,
st->out_slow_tot,
st->out_slow_mc,
- 0, /* st->gc_total */
- 0, /* st->gc_ignored */
- 0, /* st->gc_goal_miss */
- 0, /* st->gc_dst_overflow */
- 0, /* st->in_hlist_search */
- 0 /* st->out_hlist_search */
+ st->gc_total,
+ st->gc_ignored,
+ st->gc_goal_miss,
+ st->gc_dst_overflow,
+ st->in_hlist_search,
+ st->out_hlist_search
);
return 0;
}
@@ -1044,10 +1044,6 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
bool new = false;
bh_lock_sock(sk);
-
- if (!ip_sk_accept_pmtu(sk))
- goto out;
-
rt = (struct rtable *) __sk_dst_get(sk);
if (sock_owned_by_user(sk) || !rt) {
@@ -1784,12 +1780,8 @@ local_input:
rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
- if (do_cache) {
- if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
- rth->dst.flags |= DST_NOCACHE;
- rt_add_uncached_list(rth);
- }
- }
+ if (do_cache)
+ rt_cache_route(&FIB_RES_NH(res), rth);
skb_dst_set(skb, &rth->dst);
err = 0;
goto out;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b95331e..14a15c4 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -25,7 +25,15 @@
extern int sysctl_tcp_syncookies;
-static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
+__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
+EXPORT_SYMBOL(syncookie_secret);
+
+static __init int init_syncookies(void)
+{
+ get_random_bytes(syncookie_secret, sizeof(syncookie_secret));
+ return 0;
+}
+__initcall(init_syncookies);
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -36,11 +44,8 @@ static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
{
- __u32 *tmp;
-
- net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
+ __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch);
- tmp = __get_cpu_var(ipv4_cookie_scratch);
memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
tmp[0] = (__force u32)saddr;
tmp[1] = (__force u32)daddr;
@@ -84,7 +89,8 @@ __u32 cookie_init_timestamp(struct request_sock *req)
static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
- __be16 dport, __u32 sseq, __u32 data)
+ __be16 dport, __u32 sseq, __u32 count,
+ __u32 data)
{
/*
* Compute the secure sequence number.
@@ -96,7 +102,7 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
* As an extra hack, we add a small "data" value that encodes the
* MSS into the second hash value.
*/
- u32 count = tcp_cookie_time();
+
return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
sseq + (count << COOKIEBITS) +
((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
@@ -108,21 +114,22 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
* If the syncookie is bad, the data returned will be out of
* range. This must be checked by the caller.
*
- * The count value used to generate the cookie must be less than
- * MAX_SYNCOOKIE_AGE minutes in the past.
- * The return value (__u32)-1 if this test fails.
+ * The count value used to generate the cookie must be within
+ * "maxdiff" if the current (passed-in) "count". The return value
+ * is (__u32)-1 if this test fails.
*/
static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport, __u32 sseq)
+ __be16 sport, __be16 dport, __u32 sseq,
+ __u32 count, __u32 maxdiff)
{
- u32 diff, count = tcp_cookie_time();
+ __u32 diff;
/* Strip away the layers from the cookie */
cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
/* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
- if (diff >= MAX_SYNCOOKIE_AGE)
+ if (diff >= maxdiff)
return (__u32)-1;
return (cookie -
@@ -131,22 +138,22 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
}
/*
- * MSS Values are chosen based on the 2011 paper
- * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson.
- * Values ..
- * .. lower than 536 are rare (< 0.2%)
- * .. between 537 and 1299 account for less than < 1.5% of observed values
- * .. in the 1300-1349 range account for about 15 to 20% of observed mss values
- * .. exceeding 1460 are very rare (< 0.04%)
+ * MSS Values are taken from the 2009 paper
+ * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson:
+ * - values 1440 to 1460 accounted for 80% of observed mss values
+ * - values outside the 536-1460 range are rare (<0.2%).
*
- * 1460 is the single most frequently announced mss value (30 to 46% depending
- * on monitor location). Table must be sorted.
+ * Table must be sorted.
*/
static __u16 const msstab[] = {
+ 64,
+ 512,
536,
- 1300,
- 1440, /* 1440, 1452: PPPoE */
+ 1024,
+ 1440,
1460,
+ 4312,
+ 8960,
};
/*
@@ -166,7 +173,7 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
th->source, th->dest, ntohl(th->seq),
- mssind);
+ jiffies / (HZ * 60), mssind);
}
EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
@@ -182,6 +189,13 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
}
/*
+ * This (misnamed) value is the age of syncookie which is permitted.
+ * Its ideal value should be dependent on TCP_TIMEOUT_INIT and
+ * sysctl_tcp_retries1. It's a rather complicated formula (exponential
+ * backoff) to compute at runtime so it's currently hardcoded here.
+ */
+#define COUNTER_TRIES 4
+/*
* Check if a ack sequence number is a valid syncookie.
* Return the decoded mss if it is, or 0 if not.
*/
@@ -190,7 +204,9 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
{
__u32 seq = ntohl(th->seq) - 1;
__u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
- th->source, th->dest, seq);
+ th->source, th->dest, seq,
+ jiffies / (HZ * 60),
+ COUNTER_TRIES);
return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
}
@@ -299,10 +315,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
req->mss = mss;
- ireq->ir_num = ntohs(th->dest);
- ireq->ir_rmt_port = th->source;
- ireq->ir_loc_addr = ip_hdr(skb)->daddr;
- ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+ ireq->loc_port = th->dest;
+ ireq->rmt_port = th->source;
+ ireq->loc_addr = ip_hdr(skb)->daddr;
+ ireq->rmt_addr = ip_hdr(skb)->saddr;
ireq->ecn_ok = ecn_ok;
ireq->snd_wscale = tcp_opt.snd_wscale;
ireq->sack_ok = tcp_opt.sack_ok;
@@ -342,8 +358,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
inet_sk_flowi_flags(sk),
- (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, th->source, th->dest);
+ (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
+ ireq->loc_addr, th->source, th->dest);
security_req_classify_flow(req, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(sock_net(sk), &fl4);
if (IS_ERR(rt)) {
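
The arithmetic being restored above packs two things into one 32-bit ISN: the top COOKIEBITS carry a minute counter (count = jiffies/(HZ*60)), the low 24 bits carry a second hash plus the msstab index, and validation rejects cookies whose counter is more than maxdiff (COUNTER_TRIES) minutes stale. A userspace model with toy hash functions standing in for the SHA-based cookie_hash() (illustrative only, not the kernel's crypto):

#include <stdio.h>
#include <stdint.h>

#define COOKIEBITS 24
#define COOKIEMASK (((uint32_t)1 << COOKIEBITS) - 1)

/* toy stand-ins for the two SHA-based cookie_hash() invocations */
static uint32_t hash1(uint32_t sseq)  { return 0x9e3779b9u ^ sseq; }
static uint32_t hash2(uint32_t count) { return 0x85ebca6bu * (count + 1); }

static uint32_t make_cookie(uint32_t sseq, uint32_t count, uint32_t data)
{
    /* top bits: minute counter; low 24 bits: hash plus msstab index */
    return hash1(sseq) + sseq + (count << COOKIEBITS) +
           ((hash2(count) + data) & COOKIEMASK);
}

static uint32_t check_cookie(uint32_t cookie, uint32_t sseq,
                             uint32_t count, uint32_t maxdiff)
{
    uint32_t diff;

    /* strip the outer layer; count<<24 plus the masked hash remains */
    cookie -= hash1(sseq) + sseq;
    diff = (count - (cookie >> COOKIEBITS)) & ((uint32_t)-1 >> COOKIEBITS);
    if (diff >= maxdiff)
        return (uint32_t)-1;    /* counter too far in the past */
    return (cookie - hash2(count - diff)) & COOKIEMASK;
}

int main(void)
{
    uint32_t sseq = 123456, data = 5;   /* data = index into msstab */
    uint32_t cookie = make_cookie(sseq, 1000, data);

    printf("2 min later:  %u\n", check_cookie(cookie, sseq, 1002, 4));
    printf("10 min later: %d\n", (int)check_cookie(cookie, sseq, 1010, 4));
    return 0;   /* prints 5, then -1 */
}
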
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3d69ec8..540279f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -43,12 +43,12 @@ static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
/* Update system visible IP port range */
-static void set_local_port_range(struct net *net, int range[2])
+static void set_local_port_range(int range[2])
{
- write_seqlock(&net->ipv4.sysctl_local_ports.lock);
- net->ipv4.sysctl_local_ports.range[0] = range[0];
- net->ipv4.sysctl_local_ports.range[1] = range[1];
- write_sequnlock(&net->ipv4.sysctl_local_ports.lock);
+ write_seqlock(&sysctl_local_ports.lock);
+ sysctl_local_ports.range[0] = range[0];
+ sysctl_local_ports.range[1] = range[1];
+ write_sequnlock(&sysctl_local_ports.lock);
}
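The seqlock pairing here matters: the writer brackets the two stores so that lockless readers (see inet_get_ping_group_range_table() below, which runs read_seqbegin()/read_seqretry() against the same lock) never observe a torn lo/hi pair. A simplified user-space model of that retry protocol, using C11 atomics and ignoring the memory barriers a real seqlock needs:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;
static int range[2] = { 32768, 61000 };

static void write_range(int lo, int hi)
{
	atomic_fetch_add(&seq, 1);	/* odd: update in progress */
	range[0] = lo;
	range[1] = hi;
	atomic_fetch_add(&seq, 1);	/* even again: stable */
}

static void read_range(int *lo, int *hi)
{
	unsigned int s;

	do {
		while ((s = atomic_load(&seq)) & 1)
			;		/* writer active, spin */
		*lo = range[0];
		*hi = range[1];
	} while (atomic_load(&seq) != s);	/* retry if it moved */
}

int main(void)
{
	int lo, hi;

	write_range(10000, 65000);
	read_range(&lo, &hi);
	printf("%d-%d\n", lo, hi);
	return 0;
}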
/* Validate changes from /proc interface. */
@@ -56,8 +56,6 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
- struct net *net =
- container_of(table->data, struct net, ipv4.sysctl_local_ports.range);
int ret;
int range[2];
struct ctl_table tmp = {
@@ -68,15 +66,14 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
.extra2 = &ip_local_port_range_max,
};
- inet_get_local_port_range(net, &range[0], &range[1]);
-
+ inet_get_local_port_range(range, range + 1);
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && ret == 0) {
if (range[1] < range[0])
ret = -EINVAL;
else
- set_local_port_range(net, range);
+ set_local_port_range(range);
}
return ret;
@@ -86,27 +83,23 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
{
kgid_t *data = table->data;
- struct net *net =
- container_of(table->data, struct net, ipv4.sysctl_ping_group_range);
unsigned int seq;
do {
- seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock);
+ seq = read_seqbegin(&sysctl_local_ports.lock);
*low = data[0];
*high = data[1];
- } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq));
+ } while (read_seqretry(&sysctl_local_ports.lock, seq));
}
/* Update the ping group range */
static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
{
kgid_t *data = table->data;
- struct net *net =
- container_of(table->data, struct net, ipv4.sysctl_ping_group_range);
- write_seqlock(&net->ipv4.sysctl_local_ports.lock);
+ write_seqlock(&sysctl_local_ports.lock);
data[0] = low;
data[1] = high;
- write_sequnlock(&net->ipv4.sysctl_local_ports.lock);
+ write_sequnlock(&sysctl_local_ports.lock);
}
/* Validate changes from /proc interface. */
@@ -200,6 +193,49 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl,
return ret;
}
+static int ipv4_tcp_mem(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ unsigned long vec[3];
+ struct net *net = current->nsproxy->net_ns;
+#ifdef CONFIG_MEMCG_KMEM
+ struct mem_cgroup *memcg;
+#endif
+
+ struct ctl_table tmp = {
+ .data = &vec,
+ .maxlen = sizeof(vec),
+ .mode = ctl->mode,
+ };
+
+ if (!write) {
+ ctl->data = &net->ipv4.sysctl_tcp_mem;
+ return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
+ }
+
+ ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_MEMCG_KMEM
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+
+ tcp_prot_mem(memcg, vec[0], 0);
+ tcp_prot_mem(memcg, vec[1], 1);
+ tcp_prot_mem(memcg, vec[2], 2);
+ rcu_read_unlock();
+#endif
+
+ net->ipv4.sysctl_tcp_mem[0] = vec[0];
+ net->ipv4.sysctl_tcp_mem[1] = vec[1];
+ net->ipv4.sysctl_tcp_mem[2] = vec[2];
+
+ return 0;
+}
+
static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -231,11 +267,6 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
ret = -EINVAL;
goto bad_key;
}
- /* Generate a dummy secret but don't publish it. This
- * is needed so we don't regenerate a new key on the
- * first invocation of tcp_fastopen_cookie_gen
- */
- tcp_fastopen_init_key_once(false);
tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
}
@@ -444,6 +475,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "ip_local_port_range",
+ .data = &sysctl_local_ports.range,
+ .maxlen = sizeof(sysctl_local_ports.range),
+ .mode = 0644,
+ .proc_handler = ipv4_local_port_range,
+ },
+ {
.procname = "ip_local_reserved_ports",
.data = NULL, /* initialized in sysctl_ipv4_init */
.maxlen = 65536,
@@ -514,13 +552,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_mem",
- .maxlen = sizeof(sysctl_tcp_mem),
- .data = &sysctl_tcp_mem,
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
- {
.procname = "tcp_wmem",
.data = &sysctl_tcp_wmem,
.maxlen = sizeof(sysctl_tcp_wmem),
@@ -701,6 +732,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_allowed_congestion_control,
},
{
+ .procname = "tcp_max_ssthresh",
+ .data = &sysctl_tcp_max_ssthresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "tcp_thin_linear_timeouts",
.data = &sysctl_tcp_thin_linear_timeouts,
.maxlen = sizeof(int),
@@ -816,11 +854,10 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "ip_local_port_range",
- .maxlen = sizeof(init_net.ipv4.sysctl_local_ports.range),
- .data = &init_net.ipv4.sysctl_local_ports.range,
+ .procname = "tcp_mem",
+ .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem),
.mode = 0644,
- .proc_handler = ipv4_local_port_range,
+ .proc_handler = ipv4_tcp_mem,
},
{ }
};
@@ -831,15 +868,30 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
table = ipv4_net_table;
if (!net_eq(net, &init_net)) {
- int i;
-
table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
if (table == NULL)
goto err_alloc;
- /* Update the variables to point into the current struct net */
- for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++)
- table[i].data += (void *)net - (void *)&init_net;
+ table[0].data =
+ &net->ipv4.sysctl_icmp_echo_ignore_all;
+ table[1].data =
+ &net->ipv4.sysctl_icmp_echo_ignore_broadcasts;
+ table[2].data =
+ &net->ipv4.sysctl_icmp_ignore_bogus_error_responses;
+ table[3].data =
+ &net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr;
+ table[4].data =
+ &net->ipv4.sysctl_icmp_ratelimit;
+ table[5].data =
+ &net->ipv4.sysctl_icmp_ratemask;
+ table[6].data =
+ &net->ipv4.sysctl_ping_group_range;
+ table[7].data =
+ &net->ipv4.sysctl_tcp_ecn;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
}
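The deleted loop relied on struct net instances being layout-identical: a table .data pointer aimed at a field of init_net can be rebased onto another instance by adding the byte distance between the two structs, which the revert replaces with explicit per-field assignments. A user-space sketch of just that pointer arithmetic (struct and field names hypothetical):

#include <stdio.h>

struct netns { int icmp_ratelimit; int tcp_ecn; };

int main(void)
{
	struct netns init_ns = { 1000, 2 };
	struct netns new_ns = { 0, 0 };
	/* Table entry initialized to point into init_ns ... */
	int *data = &init_ns.tcp_ecn;

	/* ... rebased onto new_ns by the offset between the instances. */
	data = (int *)((char *)data + ((char *)&new_ns - (char *)&init_ns));

	*data = 7;
	printf("new_ns.tcp_ecn = %d\n", new_ns.tcp_ecn);	/* 7 */
	return 0;
}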
/*
@@ -849,12 +901,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1);
net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0);
- /*
- * Set defaults for local port range
- */
- seqlock_init(&net->ipv4.sysctl_local_ports.lock);
- net->ipv4.sysctl_local_ports.range[0] = 32768;
- net->ipv4.sysctl_local_ports.range[1] = 61000;
+ tcp_init_mem(net);
net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
if (net->ipv4.ipv4_hdr == NULL)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c4638e6..6e5617b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -288,11 +288,9 @@ int sysctl_tcp_min_tso_segs __read_mostly = 2;
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
-long sysctl_tcp_mem[3] __read_mostly;
int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;
-EXPORT_SYMBOL(sysctl_tcp_mem);
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);
@@ -808,6 +806,12 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
xmit_size_goal = min_t(u32, gso_size,
sk->sk_gso_max_size - 1 - hlen);
+ /* TSQ : try to have at least two segments in flight
+ * (one in NIC TX ring, another in Qdisc)
+ */
+ xmit_size_goal = min_t(u32, xmit_size_goal,
+ sysctl_tcp_limit_output_bytes >> 1);
+
xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
/* We try hard to avoid divides here */
@@ -1425,7 +1429,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
do {
if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
last_issued, &done,
- &used) == DMA_COMPLETE) {
+ &used) == DMA_SUCCESS) {
/* Safe to free early-copied skbs now */
__skb_queue_purge(&sk->sk_async_wait_queue);
break;
@@ -1433,7 +1437,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
struct sk_buff *skb;
while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
(dma_async_is_complete(skb->dma_cookie, done,
- used) == DMA_COMPLETE)) {
+ used) == DMA_SUCCESS)) {
__skb_dequeue(&sk->sk_async_wait_queue);
kfree_skb(skb);
}
@@ -3093,13 +3097,13 @@ static int __init set_thash_entries(char *str)
}
__setup("thash_entries=", set_thash_entries);
-static void tcp_init_mem(void)
+void tcp_init_mem(struct net *net)
{
unsigned long limit = nr_free_buffer_pages() / 8;
limit = max(limit, 128UL);
- sysctl_tcp_mem[0] = limit / 4 * 3;
- sysctl_tcp_mem[1] = limit;
- sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+ net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
+ net->ipv4.sysctl_tcp_mem[1] = limit;
+ net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
}
void __init tcp_init(void)
@@ -3133,9 +3137,10 @@ void __init tcp_init(void)
&tcp_hashinfo.ehash_mask,
0,
thash_entries ? 0 : 512 * 1024);
- for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
+ for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
-
+ INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
+ }
if (inet_ehash_locks_alloc(&tcp_hashinfo))
panic("TCP: failed to alloc ehash_locks");
tcp_hashinfo.bhash =
@@ -3161,7 +3166,7 @@ void __init tcp_init(void)
sysctl_tcp_max_orphans = cnt / 2;
sysctl_max_syn_backlog = max(128, cnt / 256);
- tcp_init_mem();
+ tcp_init_mem(&init_net);
/* Set per-socket limits to no more than 1/128 the pressure threshold */
limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
max_wshare = min(4UL*1024*1024, limit);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 821846f..f45e1c2 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -140,8 +140,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -150,7 +149,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp, acked);
+ tcp_slow_start(tp);
else {
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index ad37bf1..019c238 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -15,6 +15,8 @@
#include <linux/gfp.h>
#include <net/tcp.h>
+int sysctl_tcp_max_ssthresh = 0;
+
static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);
@@ -297,24 +299,35 @@ bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
}
EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
-/* Slow start is used when congestion window is no greater than the slow start
- * threshold. We base on RFC2581 and also handle stretch ACKs properly.
- * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
- * something better;) a packet is only considered (s)acked in its entirety to
- * defend the ACK attacks described in the RFC. Slow start processes a stretch
- * ACK of degree N as if N acks of degree 1 are received back to back except
- * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
- * returns the leftover acks to adjust cwnd in congestion avoidance mode.
+/*
+ * Slow start is used when the congestion window is below the slow start
+ * threshold. This implements the basic RFC2581 algorithm
+ * and optionally supports:
+ * RFC3742 Limited Slow Start - growth limited to max_ssthresh
+ * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
*/
-int tcp_slow_start(struct tcp_sock *tp, u32 acked)
+void tcp_slow_start(struct tcp_sock *tp)
{
- u32 cwnd = tp->snd_cwnd + acked;
+ int cnt; /* increase in packets */
+ unsigned int delta = 0;
+ u32 snd_cwnd = tp->snd_cwnd;
+
+ if (unlikely(!snd_cwnd)) {
+ pr_err_once("snd_cwnd is zero, please report this bug.\n");
+ snd_cwnd = 1U;
+ }
- if (cwnd > tp->snd_ssthresh)
- cwnd = tp->snd_ssthresh + 1;
- acked -= cwnd - tp->snd_cwnd;
- tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
- return acked;
+ if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
+ cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */
+ else
+ cnt = snd_cwnd; /* exponential increase */
+
+ tp->snd_cwnd_cnt += cnt;
+ while (tp->snd_cwnd_cnt >= snd_cwnd) {
+ tp->snd_cwnd_cnt -= snd_cwnd;
+ delta++;
+ }
+ tp->snd_cwnd = min(snd_cwnd + delta, tp->snd_cwnd_clamp);
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
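The restored tcp_slow_start() grows the window via a counter rather than directly: every ACK adds cnt to snd_cwnd_cnt, and each time the counter reaches snd_cwnd the window gains one segment, which doubles cwnd per round trip in the plain RFC 2581 case. A user-space model of the per-ACK step (clamping to snd_cwnd_clamp omitted):

#include <stdio.h>

static unsigned int snd_cwnd = 10, snd_cwnd_cnt;
static int max_ssthresh;	/* 0: plain slow start (RFC 2581) */

/* Model of the restored tcp_slow_start(), applied once per ACK. */
static void slow_start_ack(void)
{
	unsigned int cnt, delta = 0;

	if (max_ssthresh > 0 && snd_cwnd > (unsigned int)max_ssthresh)
		cnt = max_ssthresh >> 1;	/* RFC 3742 limited slow start */
	else
		cnt = snd_cwnd;			/* exponential increase */

	snd_cwnd_cnt += cnt;
	while (snd_cwnd_cnt >= snd_cwnd) {
		snd_cwnd_cnt -= snd_cwnd;
		delta++;
	}
	snd_cwnd += delta;
}

int main(void)
{
	int i;

	for (i = 0; i < 10; i++)	/* one window's worth of ACKs */
		slow_start_ack();
	printf("cwnd after 10 acks: %u\n", snd_cwnd);	/* 20: doubled */
	return 0;
}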
@@ -338,7 +351,7 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -347,7 +360,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
/* In "safe" area, increase. */
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp, acked);
+ tcp_slow_start(tp);
/* In dangerous area, increase slowly. */
else
tcp_cong_avoid_ai(tp, tp->snd_cwnd);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 828e4c3..b6ae92a 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -304,8 +304,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -316,7 +315,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
if (tp->snd_cwnd <= tp->snd_ssthresh) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
- tcp_slow_start(tp, acked);
+ tcp_slow_start(tp);
} else {
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f195d93..ab7bd35 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -8,26 +8,12 @@
#include <net/inetpeer.h>
#include <net/tcp.h>
-int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
+int sysctl_tcp_fastopen __read_mostly;
struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
-void tcp_fastopen_init_key_once(bool publish)
-{
- static u8 key[TCP_FASTOPEN_KEY_LENGTH];
-
- /* tcp_fastopen_reset_cipher publishes the new context
- * atomically, so we allow this race happening here.
- *
- * All call sites of tcp_fastopen_cookie_gen also check
- * for a valid cookie, so this is an acceptable risk.
- */
- if (net_get_random_once(key, sizeof(key)) && publish)
- tcp_fastopen_reset_cipher(key, sizeof(key));
-}
-
static void tcp_fastopen_ctx_free(struct rcu_head *head)
{
struct tcp_fastopen_context *ctx =
@@ -84,8 +70,6 @@ void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
__be32 path[4] = { src, dst, 0, 0 };
struct tcp_fastopen_context *ctx;
- tcp_fastopen_init_key_once(true);
-
rcu_read_lock();
ctx = rcu_dereference(tcp_fastopen_ctx);
if (ctx) {
@@ -94,3 +78,14 @@ void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
}
rcu_read_unlock();
}
+
+static int __init tcp_fastopen_init(void)
+{
+ __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+ get_random_bytes(key, sizeof(key));
+ tcp_fastopen_reset_cipher(key, sizeof(key));
+ return 0;
+}
+
+late_initcall(tcp_fastopen_init);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8ed9305..30f27f6 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -109,7 +109,7 @@ static void hstcp_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
+static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
@@ -118,7 +118,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp, acked);
+ tcp_slow_start(tp);
else {
/* Update AIMD parameters.
*
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 4a194ac..c1a8175 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -227,7 +227,7 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)
return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
}
-static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
@@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp, acked);
+ tcp_slow_start(tp);
else {
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 478fe82..57bdd17 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -85,8 +85,7 @@ static inline u32 hybla_fraction(u32 odds)
* o Give cwnd a new value based on the model proposed
* o remember increments <1
*/
-static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hybla *ca = inet_csk_ca(sk);
@@ -103,7 +102,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
return;
if (!ca->hybla_en) {
- tcp_reno_cong_avoid(sk, ack, acked, in_flight);
+ tcp_reno_cong_avoid(sk, ack, in_flight);
return;
}
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 8a52099..834857f 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -256,8 +256,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state)
/*
* Increase window in response to successful acknowledgment.
*/
-static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
@@ -271,7 +270,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked,
/* In slow start */
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp, acked);
+ tcp_slow_start(tp);
else {
u32 delta;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c53b7f3..a16b01b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -267,31 +267,11 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
*/
-static void tcp_sndbuf_expand(struct sock *sk)
+static void tcp_fixup_sndbuf(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
- int sndmem, per_mss;
- u32 nr_segs;
-
- /* Worst case is non GSO/TSO : each frame consumes one skb
- * and skb->head is kmalloced using power of two area of memory
- */
- per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
- MAX_TCP_HEADER +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-
- per_mss = roundup_pow_of_two(per_mss) +
- SKB_DATA_ALIGN(sizeof(struct sk_buff));
-
- nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
- nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
-
- /* Fast Recovery (RFC 5681 3.2) :
- * Cubic needs 1.7 factor, rounded to 2 to include
- * extra cushion (application might react slowly to POLLOUT)
- */
- sndmem = 2 * nr_segs * per_mss;
+ int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
+ sndmem *= TCP_INIT_CWND;
if (sk->sk_sndbuf < sndmem)
sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
}
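The restored tcp_fixup_sndbuf() simply reserves TCP_INIT_CWND worst-case frames. A back-of-the-envelope version with illustrative constants (the real SKB_TRUESIZE() adds the aligned sk_buff and skb_shared_info sizes, which vary by arch and config):

#include <stdio.h>

/* Approximate constants, for illustration only. */
#define MAX_TCP_HEADER	320
#define SKB_OVERHEAD	576	/* sk_buff + skb_shared_info, aligned */
#define SKB_TRUESIZE(x)	((x) + SKB_OVERHEAD)
#define TCP_INIT_CWND	10

int main(void)
{
	int mss_clamp = 1460;
	int sndmem = SKB_TRUESIZE(mss_clamp + MAX_TCP_HEADER);

	sndmem *= TCP_INIT_CWND;
	printf("initial sndbuf: %d bytes\n", sndmem);	/* 23560 */
	return 0;
}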
@@ -375,12 +355,6 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
tcp_default_init_rwnd(mss);
- /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
- * Allow enough cushion so that sender is not limited by our window
- */
- if (sysctl_tcp_moderate_rcvbuf)
- rcvmem <<= 2;
-
if (sk->sk_rcvbuf < rcvmem)
sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
}
@@ -396,11 +370,9 @@ void tcp_init_buffer_space(struct sock *sk)
if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
tcp_fixup_rcvbuf(sk);
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
- tcp_sndbuf_expand(sk);
+ tcp_fixup_sndbuf(sk);
tp->rcvq_space.space = tp->rcv_wnd;
- tp->rcvq_space.time = tcp_time_stamp;
- tp->rcvq_space.seq = tp->copied_seq;
maxwin = tcp_full_space(sk);
@@ -540,62 +512,48 @@ void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
int time;
- int copied;
+ int space;
+
+ if (tp->rcvq_space.time == 0)
+ goto new_measure;
time = tcp_time_stamp - tp->rcvq_space.time;
if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
return;
- /* Number of bytes copied to user in last RTT */
- copied = tp->copied_seq - tp->rcvq_space.seq;
- if (copied <= tp->rcvq_space.space)
- goto new_measure;
-
- /* A bit of theory :
- * copied = bytes received in previous RTT, our base window
- * To cope with packet losses, we need a 2x factor
- * To cope with slow start, and sender growing its cwnd by 100%
- * every RTT, we need a 4x factor, because the ACK we are sending
- * now is for the next RTT, not the current one :
- * <prev RTT . ><current RTT .. ><next RTT .... >
- */
-
- if (sysctl_tcp_moderate_rcvbuf &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int rcvwin, rcvmem, rcvbuf;
+ space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
- /* minimal window to cope with packet losses, assuming
- * steady state. Add some cushion because of small variations.
- */
- rcvwin = (copied << 1) + 16 * tp->advmss;
+ space = max(tp->rcvq_space.space, space);
- /* If rate increased by 25%,
- * assume slow start, rcvwin = 3 * copied
- * If rate increased by 50%,
- * assume sender can use 2x growth, rcvwin = 4 * copied
- */
- if (copied >=
- tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
- if (copied >=
- tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
- rcvwin <<= 1;
- else
- rcvwin += (rcvwin >> 1);
- }
+ if (tp->rcvq_space.space != space) {
+ int rcvmem;
- rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
- while (tcp_win_from_space(rcvmem) < tp->advmss)
- rcvmem += 128;
+ tp->rcvq_space.space = space;
- rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
- if (rcvbuf > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = rcvbuf;
+ if (sysctl_tcp_moderate_rcvbuf &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ int new_clamp = space;
- /* Make the window clamp follow along. */
- tp->window_clamp = rcvwin;
+ /* Receive space grows, normalize in order to
+ * take into account packet headers and sk_buff
+ * structure overhead.
+ */
+ space /= tp->advmss;
+ if (!space)
+ space = 1;
+ rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+ while (tcp_win_from_space(rcvmem) < tp->advmss)
+ rcvmem += 128;
+ space *= rcvmem;
+ space = min(space, sysctl_tcp_rmem[2]);
+ if (space > sk->sk_rcvbuf) {
+ sk->sk_rcvbuf = space;
+
+ /* Make the window clamp follow along. */
+ tp->window_clamp = new_clamp;
+ }
}
}
- tp->rcvq_space.space = copied;
new_measure:
tp->rcvq_space.seq = tp->copied_seq;
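In this older tcp_rcv_space_adjust(), the receive-space target is twice the bytes copied to user space during the last RTT, converted into buffer bytes by charging one full skb truesize per advmss of window. A simplified model of that arithmetic, ignoring the tcp_win_from_space() refinement and the tcp_rmem[2] clamp:

#include <stdio.h>

#define SKB_OVERHEAD	576	/* illustrative, as above */

int main(void)
{
	int copied = 100000;	/* bytes copied to user last RTT */
	int prev_space = 120000;
	int advmss = 1460, max_tcp_header = 320;
	int space, rcvmem;

	space = 2 * copied;	/* headroom for one more doubling */
	if (space < prev_space)
		space = prev_space;

	/* Charge a full skb truesize for every advmss bytes of window. */
	rcvmem = advmss + max_tcp_header + SKB_OVERHEAD;
	space = space / advmss * rcvmem;
	printf("new rcvbuf target: %d bytes\n", space);
	return 0;
}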
@@ -755,12 +713,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
if (tp->srtt > 8 + 2)
do_div(rate, tp->srtt);
- /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
- * without any lock. We want to make sure compiler wont store
- * intermediate values in this location.
- */
- ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
- sk->sk_max_pacing_rate);
+ sk->sk_pacing_rate = min_t(u64, rate, ~0U);
}
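The pacing rate being clamped here is, roughly, one congestion window of payload per smoothed RTT with a 2x safety factor so that slow start is not throttled; the revert drops the sk_max_pacing_rate cap in favour of a plain ~0U clamp. A back-of-the-envelope estimate under that rough formula (the kernel keeps srtt in jiffies << 3 units, glossed over here):

#include <stdio.h>

int main(void)
{
	/* Send one cwnd of data per RTT, with a 2x factor so the rate
	 * estimate does not throttle slow start. Numbers illustrative.
	 */
	unsigned long long mss = 1460, cwnd = 10;
	double srtt_s = 0.050;	/* 50 ms */
	double rate = 2.0 * cwnd * mss / srtt_s;

	printf("pacing rate: %.0f bytes/s (~%.1f Mbit/s)\n",
	       rate, rate * 8 / 1e6);
	return 0;
}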
/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -2903,8 +2856,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* left edge of the send window.
* See draft-ietf-tcplw-high-performance-00, section 3.3.
*/
- if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
- flag & FLAG_ACKED)
+ if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
if (seq_rtt < 0)
@@ -2919,25 +2871,20 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
}
/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
-static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
+static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
{
struct tcp_sock *tp = tcp_sk(sk);
s32 seq_rtt = -1;
- if (synack_stamp && !tp->total_retrans)
- seq_rtt = tcp_time_stamp - synack_stamp;
-
- /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
- * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
- */
- if (!tp->srtt)
- tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
+ if (tp->lsndtime && !tp->total_retrans)
+ seq_rtt = tcp_time_stamp - tp->lsndtime;
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
}
-static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight);
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -3026,7 +2973,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
u32 now = tcp_time_stamp;
- bool fully_acked = true;
+ int fully_acked = true;
int flag = 0;
u32 pkts_acked = 0;
u32 reord = tp->packets_out;
@@ -3034,7 +2981,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
s32 seq_rtt = -1;
s32 ca_seq_rtt = -1;
ktime_t last_ackt = net_invalid_timestamp();
- bool rtt_update;
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
@@ -3111,13 +3057,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
- rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt);
+ if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
+ (flag & FLAG_ACKED))
+ tcp_rearm_rto(sk);
if (flag & FLAG_ACKED) {
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
- tcp_rearm_rto(sk);
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
tcp_mtup_probe_success(sk);
@@ -3156,13 +3103,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
}
- } else if (skb && rtt_update && sack_rtt >= 0 &&
- sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) {
- /* Do not re-arm RTO if the sack RTT is measured from data sent
- * after when the head was last (re)transmitted. Otherwise the
- * timeout may continue to extend in loss recovery.
- */
- tcp_rearm_rto(sk);
}
#if FASTRETRANS_DEBUG > 0
@@ -3454,7 +3394,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* Advance cwnd if state allows */
if (tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, acked, prior_in_flight);
+ tcp_cong_avoid(sk, ack, prior_in_flight);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -4764,7 +4704,15 @@ static void tcp_new_space(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_should_expand_sndbuf(sk)) {
- tcp_sndbuf_expand(sk);
+ int sndmem = SKB_TRUESIZE(max_t(u32,
+ tp->rx_opt.mss_clamp,
+ tp->mss_cache) +
+ MAX_TCP_HEADER);
+ int demanded = max_t(unsigned int, tp->snd_cwnd,
+ tp->reordering + 1);
+ sndmem *= 2 * demanded;
+ if (sndmem > sk->sk_sndbuf)
+ sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -5639,7 +5587,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct request_sock *req;
int queued = 0;
bool acceptable;
- u32 synack_stamp;
tp->rx_opt.saw_tstamp = 0;
@@ -5722,18 +5669,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* so release it.
*/
if (req) {
- synack_stamp = tcp_rsk(req)->snt_synack;
tp->total_retrans = req->num_retrans;
reqsk_fastopen_remove(sk, req, false);
} else {
- synack_stamp = tp->lsndtime;
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_congestion_control(sk);
tcp_mtup_init(sk);
- tp->copied_seq = tp->rcv_nxt;
tcp_init_buffer_space(sk);
+ tp->copied_seq = tp->rcv_nxt;
}
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
@@ -5749,7 +5694,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- tcp_synack_rtt_meas(sk, synack_stamp);
+ tcp_synack_rtt_meas(sk, req);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0672139..b14266b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -177,7 +177,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
- IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
return err;
}
@@ -288,7 +288,6 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
mtu = dst_mtu(dst);
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
- ip_sk_accept_pmtu(sk) &&
inet_csk(sk)->icsk_pmtu_cookie > mtu) {
tcp_sync_mss(sk, mtu);
@@ -836,11 +835,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
skb = tcp_make_synack(sk, dst, req, NULL);
if (skb) {
- __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
+ __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
skb_set_queue_mapping(skb, queue_mapping);
- err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
- ireq->ir_rmt_addr,
+ err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
+ ireq->rmt_addr,
ireq->opt);
err = net_xmit_eval(err);
if (!tcp_rsk(req)->snt_synack && !err)
@@ -973,7 +972,7 @@ static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
{
union tcp_md5_addr *addr;
- addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
+ addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
return tcp_md5_do_lookup(sk, addr, AF_INET);
}
@@ -1150,8 +1149,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
saddr = inet_sk(sk)->inet_saddr;
daddr = inet_sk(sk)->inet_daddr;
} else if (req) {
- saddr = inet_rsk(req)->ir_loc_addr;
- daddr = inet_rsk(req)->ir_rmt_addr;
+ saddr = inet_rsk(req)->loc_addr;
+ daddr = inet_rsk(req)->rmt_addr;
} else {
const struct iphdr *iph = ip_hdr(skb);
saddr = iph->saddr;
@@ -1367,8 +1366,8 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
kfree_skb(skb_synack);
return -1;
}
- err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr,
- ireq->ir_rmt_addr, ireq->opt);
+ err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
+ ireq->rmt_addr, ireq->opt);
err = net_xmit_eval(err);
if (!err)
tcp_rsk(req)->snt_synack = tcp_time_stamp;
@@ -1411,8 +1410,8 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
inet_csk(child)->icsk_af_ops->rebuild_header(child);
tcp_init_congestion_control(child);
tcp_mtup_init(child);
- tcp_init_metrics(child);
tcp_init_buffer_space(child);
+ tcp_init_metrics(child);
/* Queue the data carried in the SYN packet. We need to first
* bump skb's refcnt because the caller will attempt to free it.
@@ -1503,8 +1502,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_openreq_init(req, &tmp_opt, skb);
ireq = inet_rsk(req);
- ireq->ir_loc_addr = daddr;
- ireq->ir_rmt_addr = saddr;
+ ireq->loc_addr = daddr;
+ ireq->rmt_addr = saddr;
ireq->no_srccheck = inet_sk(sk)->transparent;
ireq->opt = tcp_v4_save_options(skb);
@@ -1579,15 +1578,15 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
if (skb_synack) {
- __tcp_v4_send_check(skb_synack, ireq->ir_loc_addr, ireq->ir_rmt_addr);
+ __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
} else
goto drop_and_free;
if (likely(!do_fastopen)) {
int err;
- err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr,
- ireq->ir_rmt_addr, ireq->opt);
+ err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
+ ireq->rmt_addr, ireq->opt);
err = net_xmit_eval(err);
if (err || want_cookie)
goto drop_and_free;
@@ -1645,9 +1644,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
- newinet->inet_daddr = ireq->ir_rmt_addr;
- newinet->inet_rcv_saddr = ireq->ir_loc_addr;
- newinet->inet_saddr = ireq->ir_loc_addr;
+ newinet->inet_daddr = ireq->rmt_addr;
+ newinet->inet_rcv_saddr = ireq->loc_addr;
+ newinet->inet_saddr = ireq->loc_addr;
inet_opt = ireq->opt;
rcu_assign_pointer(newinet->inet_opt, inet_opt);
ireq->opt = NULL;
@@ -2195,6 +2194,18 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
+static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
+{
+ return hlist_nulls_empty(head) ? NULL :
+ list_entry(head->first, struct inet_timewait_sock, tw_node);
+}
+
+static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
+{
+ return !is_a_nulls(tw->tw_node.next) ?
+ hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
+}
+
/*
* Get next listener socket follow cur. If cur is NULL, get first socket
* starting from bucket given in st->bucket; when st->bucket is zero the
@@ -2298,9 +2309,10 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
return rc;
}
-static inline bool empty_bucket(const struct tcp_iter_state *st)
+static inline bool empty_bucket(struct tcp_iter_state *st)
{
- return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
+ return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
+ hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}
/*
@@ -2317,6 +2329,7 @@ static void *established_get_first(struct seq_file *seq)
for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
struct sock *sk;
struct hlist_nulls_node *node;
+ struct inet_timewait_sock *tw;
spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
/* Lockless fast path for the common case of empty buckets */
@@ -2332,7 +2345,18 @@ static void *established_get_first(struct seq_file *seq)
rc = sk;
goto out;
}
+ st->state = TCP_SEQ_STATE_TIME_WAIT;
+ inet_twsk_for_each(tw, node,
+ &tcp_hashinfo.ehash[st->bucket].twchain) {
+ if (tw->tw_family != st->family ||
+ !net_eq(twsk_net(tw), net)) {
+ continue;
+ }
+ rc = tw;
+ goto out;
+ }
spin_unlock_bh(lock);
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
}
out:
return rc;
@@ -2341,6 +2365,7 @@ out:
static void *established_get_next(struct seq_file *seq, void *cur)
{
struct sock *sk = cur;
+ struct inet_timewait_sock *tw;
struct hlist_nulls_node *node;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
@@ -2348,16 +2373,45 @@ static void *established_get_next(struct seq_file *seq, void *cur)
++st->num;
++st->offset;
- sk = sk_nulls_next(sk);
+ if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
+ tw = cur;
+ tw = tw_next(tw);
+get_tw:
+ while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
+ tw = tw_next(tw);
+ }
+ if (tw) {
+ cur = tw;
+ goto out;
+ }
+ spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+
+ /* Look for next non empty bucket */
+ st->offset = 0;
+ while (++st->bucket <= tcp_hashinfo.ehash_mask &&
+ empty_bucket(st))
+ ;
+ if (st->bucket > tcp_hashinfo.ehash_mask)
+ return NULL;
+
+ spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
+ } else
+ sk = sk_nulls_next(sk);
sk_nulls_for_each_from(sk, node) {
if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
- return sk;
+ goto found;
}
- spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
- ++st->bucket;
- return established_get_first(seq);
+ st->state = TCP_SEQ_STATE_TIME_WAIT;
+ tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
+ goto get_tw;
+found:
+ cur = sk;
+out:
+ return cur;
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
@@ -2410,9 +2464,10 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
if (rc)
break;
st->bucket = 0;
- st->state = TCP_SEQ_STATE_ESTABLISHED;
/* Fallthrough */
case TCP_SEQ_STATE_ESTABLISHED:
+ case TCP_SEQ_STATE_TIME_WAIT:
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
if (st->bucket > tcp_hashinfo.ehash_mask)
break;
rc = established_get_first(seq);
@@ -2469,6 +2524,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
break;
case TCP_SEQ_STATE_ESTABLISHED:
+ case TCP_SEQ_STATE_TIME_WAIT:
rc = established_get_next(seq, v);
break;
}
@@ -2492,6 +2548,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
if (v != SEQ_START_TOKEN)
spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
break;
+ case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
@@ -2541,18 +2598,18 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct sock *sk, const struct request_sock *req,
- struct seq_file *f, int i, kuid_t uid)
+ struct seq_file *f, int i, kuid_t uid, int *len)
{
const struct inet_request_sock *ireq = inet_rsk(req);
long delta = req->expires - jiffies;
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n",
i,
- ireq->ir_loc_addr,
+ ireq->loc_addr,
ntohs(inet_sk(sk)->inet_sport),
- ireq->ir_rmt_addr,
- ntohs(ireq->ir_rmt_port),
+ ireq->rmt_addr,
+ ntohs(ireq->rmt_port),
TCP_SYN_RECV,
0, 0, /* could print option size, but that is af dependent. */
1, /* timers active (only the expire timer) */
@@ -2562,10 +2619,11 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
0, /* non standard timer */
0, /* open_requests have no inode */
atomic_read(&sk->sk_refcnt),
- req);
+ req,
+ len);
}
-static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
+static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
int timer_active;
unsigned long timer_expires;
@@ -2604,7 +2662,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
- "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
+ "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n",
i, src, srcp, dest, destp, sk->sk_state,
tp->write_seq - tp->snd_una,
rx_queue,
@@ -2621,11 +2679,12 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
tp->snd_cwnd,
sk->sk_state == TCP_LISTEN ?
(fastopenq ? fastopenq->max_qlen : 0) :
- (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
+ (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
+ len);
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
- struct seq_file *f, int i)
+ struct seq_file *f, int i, int *len)
{
__be32 dest, src;
__u16 destp, srcp;
@@ -2637,10 +2696,10 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
srcp = ntohs(tw->tw_sport);
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
- atomic_read(&tw->tw_refcnt), tw);
+ atomic_read(&tw->tw_refcnt), tw, len);
}
#define TMPSZ 150
@@ -2648,11 +2707,11 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
struct tcp_iter_state *st;
- struct sock *sk = v;
+ int len;
- seq_setwidth(seq, TMPSZ - 1);
if (v == SEQ_START_TOKEN) {
- seq_puts(seq, " sl local_address rem_address st tx_queue "
+ seq_printf(seq, "%-*s\n", TMPSZ - 1,
+ " sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
"inode");
goto out;
@@ -2662,17 +2721,17 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
case TCP_SEQ_STATE_ESTABLISHED:
- if (sk->sk_state == TCP_TIME_WAIT)
- get_timewait4_sock(v, seq, st->num);
- else
- get_tcp4_sock(v, seq, st->num);
+ get_tcp4_sock(v, seq, st->num, &len);
break;
case TCP_SEQ_STATE_OPENREQ:
- get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
+ get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
+ break;
+ case TCP_SEQ_STATE_TIME_WAIT:
+ get_timewait4_sock(v, seq, st->num, &len);
break;
}
+ seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
- seq_pad(seq, '\n');
return 0;
}
@@ -2747,7 +2806,6 @@ struct proto tcp_prot = {
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
- .sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 991d62a..72f7218 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -115,13 +115,12 @@ static void tcp_lp_init(struct sock *sk)
* Will only call newReno CA when away from inference.
 * From TCP-LP's paper, this will be handled in additive increase.
*/
-static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct lp *lp = inet_csk_ca(sk);
if (!(lp->flag & LP_WITHIN_INF))
- tcp_reno_cong_avoid(sk, ack, acked, in_flight);
+ tcp_reno_cong_avoid(sk, ack, in_flight);
}
/**
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 269a89e..559d4ae 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -6,10 +6,15 @@
#include <linux/memcontrol.h>
#include <linux/module.h>
+static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
+{
+ return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
+}
+
static void memcg_tcp_enter_memory_pressure(struct sock *sk)
{
if (sk->sk_cgrp->memory_pressure)
- sk->sk_cgrp->memory_pressure = 1;
+ *sk->sk_cgrp->memory_pressure = 1;
}
EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
@@ -22,24 +27,34 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
*/
struct res_counter *res_parent = NULL;
struct cg_proto *cg_proto, *parent_cg;
+ struct tcp_memcontrol *tcp;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct net *net = current->nsproxy->net_ns;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return 0;
- cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0];
- cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1];
- cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2];
- cg_proto->memory_pressure = 0;
- cg_proto->memcg = memcg;
+ tcp = tcp_from_cgproto(cg_proto);
+
+ tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
+ tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
+ tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
+ tcp->tcp_memory_pressure = 0;
parent_cg = tcp_prot.proto_cgroup(parent);
if (parent_cg)
- res_parent = &parent_cg->memory_allocated;
+ res_parent = parent_cg->memory_allocated;
+
+ res_counter_init(&tcp->tcp_memory_allocated, res_parent);
+ percpu_counter_init(&tcp->tcp_sockets_allocated, 0);
- res_counter_init(&cg_proto->memory_allocated, res_parent);
- percpu_counter_init(&cg_proto->sockets_allocated, 0);
+ cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure;
+ cg_proto->memory_pressure = &tcp->tcp_memory_pressure;
+ cg_proto->sysctl_mem = tcp->tcp_prot_mem;
+ cg_proto->memory_allocated = &tcp->tcp_memory_allocated;
+ cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
+ cg_proto->memcg = memcg;
return 0;
}
@@ -48,18 +63,23 @@ EXPORT_SYMBOL(tcp_init_cgroup);
void tcp_destroy_cgroup(struct mem_cgroup *memcg)
{
struct cg_proto *cg_proto;
+ struct tcp_memcontrol *tcp;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return;
- percpu_counter_destroy(&cg_proto->sockets_allocated);
+ tcp = tcp_from_cgproto(cg_proto);
+ percpu_counter_destroy(&tcp->tcp_sockets_allocated);
}
EXPORT_SYMBOL(tcp_destroy_cgroup);
static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
{
+ struct net *net = current->nsproxy->net_ns;
+ struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
+ u64 old_lim;
int i;
int ret;
@@ -70,13 +90,16 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
if (val > RES_COUNTER_MAX)
val = RES_COUNTER_MAX;
- ret = res_counter_set_limit(&cg_proto->memory_allocated, val);
+ tcp = tcp_from_cgproto(cg_proto);
+
+ old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
+ ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val);
if (ret)
return ret;
for (i = 0; i < 3; i++)
- cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT,
- sysctl_tcp_mem[i]);
+ tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
+ net->ipv4.sysctl_tcp_mem[i]);
if (val == RES_COUNTER_MAX)
clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
@@ -133,24 +156,28 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
{
+ struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return default_val;
- return res_counter_read_u64(&cg_proto->memory_allocated, type);
+ tcp = tcp_from_cgproto(cg_proto);
+ return res_counter_read_u64(&tcp->tcp_memory_allocated, type);
}
static u64 tcp_read_usage(struct mem_cgroup *memcg)
{
+ struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT;
- return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE);
+ tcp = tcp_from_cgproto(cg_proto);
+ return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
}
static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -178,25 +205,54 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
{
struct mem_cgroup *memcg;
+ struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
memcg = mem_cgroup_from_css(css);
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return 0;
+ tcp = tcp_from_cgproto(cg_proto);
switch (event) {
case RES_MAX_USAGE:
- res_counter_reset_max(&cg_proto->memory_allocated);
+ res_counter_reset_max(&tcp->tcp_memory_allocated);
break;
case RES_FAILCNT:
- res_counter_reset_failcnt(&cg_proto->memory_allocated);
+ res_counter_reset_failcnt(&tcp->tcp_memory_allocated);
break;
}
return 0;
}
+unsigned long long tcp_max_memory(const struct mem_cgroup *memcg)
+{
+ struct tcp_memcontrol *tcp;
+ struct cg_proto *cg_proto;
+
+ cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg);
+ if (!cg_proto)
+ return 0;
+
+ tcp = tcp_from_cgproto(cg_proto);
+ return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
+}
+
+void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
+{
+ struct tcp_memcontrol *tcp;
+ struct cg_proto *cg_proto;
+
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return;
+
+ tcp = tcp_from_cgproto(cg_proto);
+
+ tcp->tcp_prot_mem[idx] = val;
+}
+
static struct cftype tcp_files[] = {
{
.name = "kmem.tcp.limit_in_bytes",
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 0649373..52f3c6b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -215,15 +215,13 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
addr.family = req->rsk_ops->family;
switch (addr.family) {
case AF_INET:
- addr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
+ addr.addr.a4 = inet_rsk(req)->rmt_addr;
hash = (__force unsigned int) addr.addr.a4;
break;
-#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- *(struct in6_addr *)addr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr;
- hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
+ *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
+ hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
break;
-#endif
default:
return NULL;
}
@@ -242,6 +240,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
{
+ struct inet6_timewait_sock *tw6;
struct tcp_metrics_block *tm;
struct inetpeer_addr addr;
unsigned int hash;
@@ -253,12 +252,11 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
addr.addr.a4 = tw->tw_daddr;
hash = (__force unsigned int) addr.addr.a4;
break;
-#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- *(struct in6_addr *)addr.addr.a6 = tw->tw_v6_daddr;
- hash = ipv6_addr_hash(&tw->tw_v6_daddr);
+ tw6 = inet6_twsk((struct sock *)tw);
+ *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
+ hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
break;
-#endif
default:
return NULL;
}
@@ -290,12 +288,10 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
addr.addr.a4 = inet_sk(sk)->inet_daddr;
hash = (__force unsigned int) addr.addr.a4;
break;
-#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- *(struct in6_addr *)addr.addr.a6 = sk->sk_v6_daddr;
- hash = ipv6_addr_hash(&sk->sk_v6_daddr);
+ *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
+ hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
break;
-#endif
default:
return NULL;
}
@@ -663,20 +659,16 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
struct tcp_fastopen_cookie *cookie, bool syn_lost)
{
- struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_metrics_block *tm;
- if (!dst)
- return;
rcu_read_lock();
- tm = tcp_get_metrics(sk, dst, true);
+ tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
if (tm) {
struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
write_seqlock_bh(&fastopen_seqlock);
- if (mss)
- tfom->mss = mss;
- if (cookie && cookie->len > 0)
+ tfom->mss = mss;
+ if (cookie->len > 0)
tfom->cookie = *cookie;
if (syn_lost) {
++tfom->syn_loss;
@@ -991,7 +983,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
return 0;
}
-static const struct genl_ops tcp_metrics_nl_ops[] = {
+static struct genl_ops tcp_metrics_nl_ops[] = {
{
.cmd = TCP_METRICS_CMD_GET,
.doit = tcp_metrics_nl_cmd_get,
@@ -1082,7 +1074,8 @@ void __init tcp_metrics_init(void)
if (ret < 0)
goto cleanup;
ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
- tcp_metrics_nl_ops);
+ tcp_metrics_nl_ops,
+ ARRAY_SIZE(tcp_metrics_nl_ops));
if (ret < 0)
goto cleanup_subsys;
return;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 97b6841..58a3e69 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -293,9 +293,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet6_timewait_sock *tw6;
- tw->tw_v6_daddr = sk->sk_v6_daddr;
- tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+ tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
+ tw6 = inet6_twsk((struct sock *)tw);
+ tw6->tw_v6_daddr = np->daddr;
+ tw6->tw_v6_rcv_saddr = np->rcv_saddr;
tw->tw_tclass = np->tclass;
tw->tw_ipv6only = np->ipv6only;
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 0560635..3a7525e 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -14,11 +14,10 @@
#include <net/tcp.h>
#include <net/protocol.h>
-struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
+struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
- unsigned int sum_truesize = 0;
struct tcphdr *th;
unsigned int thlen;
unsigned int seq;
@@ -57,8 +56,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
SKB_GSO_TCP_ECN |
SKB_GSO_TCPV6 |
SKB_GSO_GRE |
- SKB_GSO_IPIP |
- SKB_GSO_SIT |
SKB_GSO_MPLS |
SKB_GSO_UDP_TUNNEL |
0) ||
@@ -105,7 +102,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (copy_destructor) {
skb->destructor = gso_skb->destructor;
skb->sk = gso_skb->sk;
- sum_truesize += skb->truesize;
+ /* {tcp|sock}_wfree() use exact truesize accounting :
+ * sum(skb->truesize) MUST be exactly gso_skb->truesize.
+ * So we account mss bytes of 'true size' for each segment.
+ * The last segment will contain the remainder.
+ skb->truesize = mss;
+ gso_skb->truesize -= mss;
}
skb = skb->next;
th = tcp_hdr(skb);
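The accounting above is exact by construction: every non-final segment is charged mss bytes of truesize and the running remainder stays on gso_skb until the final swap, so the per-socket wmem accounting that sock_wfree() unwinds still balances. A toy model of the invariant:

#include <stdio.h>

int main(void)
{
	int mss = 1000, nsegs = 5;
	int gso_truesize = 5800;	/* header plus 5 x mss payload, say */
	int sum = 0, i;

	for (i = 0; i < nsegs - 1; i++) {
		sum += mss;		/* each early segment charged mss */
		gso_truesize -= mss;
	}
	sum += gso_truesize;		/* last segment takes the rest */
	printf("sum of truesizes: %d\n", sum);	/* 5800: matches original */
	return 0;
}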
@@ -122,9 +125,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (copy_destructor) {
swap(gso_skb->sk, skb->sk);
swap(gso_skb->destructor, skb->destructor);
- sum_truesize += skb->truesize;
- atomic_add(sum_truesize - gso_skb->truesize,
- &skb->sk->sk_wmem_alloc);
+ swap(gso_skb->truesize, skb->truesize);
}
delta = htonl(oldlen + (skb_tail_pointer(skb) -
@@ -138,7 +139,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
out:
return segs;
}
-EXPORT_SYMBOL(tcp_gso_segment);
+EXPORT_SYMBOL(tcp_tso_segment);
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
@@ -274,32 +275,33 @@ static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *
{
const struct iphdr *iph = skb_gro_network_header(skb);
__wsum wsum;
-
- /* Don't bother verifying checksum if we're going to flush anyway. */
- if (NAPI_GRO_CB(skb)->flush)
- goto skip_csum;
-
- wsum = skb->csum;
+ __sum16 sum;
switch (skb->ip_summed) {
- case CHECKSUM_NONE:
- wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb),
- 0);
-
- /* fall through */
-
case CHECKSUM_COMPLETE:
if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
- wsum)) {
+ skb->csum)) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
}
-
+flush:
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
+
+ case CHECKSUM_NONE:
+ wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+ skb_gro_len(skb), IPPROTO_TCP, 0);
+ sum = csum_fold(skb_checksum(skb,
+ skb_gro_offset(skb),
+ skb_gro_len(skb),
+ wsum));
+ if (sum)
+ goto flush;
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ break;
}
-skip_csum:
return tcp_gro_receive(head, skb);
}
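The CHECKSUM_NONE branch builds the pseudo-header sum with csum_tcpudp_nofold(), accumulates the packet bytes, then folds the 32-bit accumulator down to 16 bits; only a fold result of zero lets the packet proceed. The fold itself is plain one's-complement arithmetic, sketched here:

#include <stdio.h>
#include <stdint.h>

/* Fold a 32-bit one's-complement accumulator to 16 bits, in the
 * manner of csum_fold(): a result of 0 means the checksum verified.
 */
static uint16_t csum_fold(uint32_t csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

int main(void)
{
	printf("0x%04x\n", csum_fold(0x1ffffu));	/* 0xfffe: bad */
	printf("0x%04x\n", csum_fold(0x1fffeu));	/* 0x0000: valid */
	return 0;
}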
@@ -318,7 +320,7 @@ static int tcp4_gro_complete(struct sk_buff *skb)
static const struct net_offload tcpv4_offload = {
.callbacks = {
.gso_send_check = tcp_v4_gso_send_check,
- .gso_segment = tcp_gso_segment,
+ .gso_segment = tcp_tso_segment,
.gro_receive = tcp4_gro_receive,
.gro_complete = tcp4_gro_complete,
},
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7820f3a..d46f214 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -850,14 +850,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
BUG_ON(!skb || !tcp_skb_pcount(skb));
- if (clone_it) {
- const struct sk_buff *fclone = skb + 1;
+ /* If congestion control is doing timestamping, we must
+ * take such a timestamp before we potentially clone/copy.
+ */
+ if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
+ __net_timestamp(skb);
- /* If congestion control is doing timestamping, we must
- * take such a timestamp before we potentially clone/copy.
- */
- if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
- __net_timestamp(skb);
+ if (likely(clone_it)) {
+ const struct sk_buff *fclone = skb + 1;
if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
fclone->fclone == SKB_FCLONE_CLONE))
@@ -1875,12 +1875,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
* - better RTT estimation and ACK scheduling
* - faster recovery
* - high rates
- * Alas, some drivers / subsystems require a fair amount
- * of queued bytes to ensure line rate.
- * One example is wifi aggregation (802.11 AMPDU)
*/
- limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
- sk->sk_pacing_rate >> 10);
+ limit = max(skb->truesize, sk->sk_pacing_rate >> 10);
if (atomic_read(&sk->sk_wmem_alloc) > limit) {
set_bit(TSQ_THROTTLED, &tp->tsq_flags);
@@ -2357,6 +2353,21 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
tcp_retrans_try_collapse(sk, skb, cur_mss);
+ /* Some Solaris stacks overoptimize and ignore the FIN on a
+ * retransmit when old data is attached. So strip it off
+ * since it is cheap to do so and saves bytes on the network.
+ */
+ if (skb->len > 0 &&
+ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
+ tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
+ if (!pskb_trim(skb, 0)) {
+ /* Reuse, even though it does some unnecessary work */
+ tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
+ TCP_SKB_CB(skb)->tcp_flags);
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ }
+
/* Make a copy, if the first transmission SKB clone we made
* is still in somebody's hands, else make a clone.
*/
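
The sequence-number test in that hunk reads: the skb still carries payload plus a FIN, yet everything up to the FIN's own sequence slot is already acked (snd_una == end_seq - 1, the FIN consuming one sequence number), so the payload can be trimmed before retransmitting. A tiny sketch with made-up numbers (names ours):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* end_seq = seq + payload_len + 1 for the FIN; once snd_una reaches
     * end_seq - 1, only the FIN itself remains unacknowledged. */
    static bool only_fin_unacked(uint32_t snd_una, uint32_t end_seq,
                                 unsigned int len, bool fin)
    {
        return len > 0 && fin && snd_una == end_seq - 1;
    }

    int main(void)
    {
        /* 100 payload bytes at seq 1000 plus FIN -> end_seq = 1101. */
        printf("%d\n", only_fin_unacked(1100, 1101, 100, true));  /* 1 */
        printf("%d\n", only_fin_unacked(1050, 1101, 100, true));  /* 0 */
        return 0;
    }
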
@@ -2725,8 +2736,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
th->syn = 1;
th->ack = 1;
TCP_ECN_make_synack(req, th);
- th->source = htons(ireq->ir_num);
- th->dest = ireq->ir_rmt_port;
+ th->source = ireq->loc_port;
+ th->dest = ireq->rmt_port;
/* Setting of flags are superfluous here for callers (and ECE is
* not even correctly set)
*/
@@ -3097,6 +3108,7 @@ void tcp_send_window_probe(struct sock *sk)
{
if (sk->sk_state == TCP_ESTABLISHED) {
tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+ tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
tcp_xmit_probe_skb(sk, 0);
}
}
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 8b97d71..611beab 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -101,6 +101,22 @@ static inline int tcp_probe_avail(void)
si4.sin_addr.s_addr = inet->inet_##mem##addr; \
} while (0) \
+#if IS_ENABLED(CONFIG_IPV6)
+#define tcp_probe_copy_fl_to_si6(inet, si6, mem) \
+ do { \
+ struct ipv6_pinfo *pi6 = inet->pinet6; \
+ si6.sin6_family = AF_INET6; \
+ si6.sin6_port = inet->inet_##mem##port; \
+ si6.sin6_addr = pi6->mem##addr; \
+ si6.sin6_flowinfo = 0; /* No need here. */ \
+ si6.sin6_scope_id = 0; /* No need here. */ \
+ } while (0)
+#else
+#define tcp_probe_copy_fl_to_si6(fl, si6, mem) \
+ do { \
+ memset(&si6, 0, sizeof(si6)); \
+ } while (0)
+#endif
/*
* Hook inserted to be called before each receive packet.
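
The si6 macro added above leans on ## token pasting so that one body serves both the source ("s") and destination ("d") call sites, mirroring the existing si4 helper. A standalone userspace sketch of the same trick (struct and names ours):

    #include <stdio.h>

    struct conn {
        unsigned short sport;   /* source port */
        unsigned short dport;   /* destination port */
    };

    /* mem is the single letter "s" or "d"; mem##port pastes into
     * sport/dport at preprocessing time, as tcp_probe_copy_fl_to_si6()
     * pastes into inet_sport/inet_dport. */
    #define COPY_PORT(c, out, mem)          \
        do {                                \
            (out) = (c)->mem##port;         \
        } while (0)

    int main(void)
    {
        struct conn c = { .sport = 33000, .dport = 80 };
        unsigned short p;

        COPY_PORT(&c, p, s);
        printf("src port %u\n", p);   /* 33000 */
        COPY_PORT(&c, p, d);
        printf("dst port %u\n", p);   /* 80 */
        return 0;
    }
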
@@ -131,17 +147,8 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
break;
case AF_INET6:
- memset(&p->src.v6, 0, sizeof(p->src.v6));
- memset(&p->dst.v6, 0, sizeof(p->dst.v6));
-#if IS_ENABLED(CONFIG_IPV6)
- p->src.v6.sin6_family = AF_INET6;
- p->src.v6.sin6_port = inet->inet_sport;
- p->src.v6.sin6_addr = inet6_sk(sk)->saddr;
-
- p->dst.v6.sin6_family = AF_INET6;
- p->dst.v6.sin6_port = inet->inet_dport;
- p->dst.v6.sin6_addr = sk->sk_v6_daddr;
-#endif
+ tcp_probe_copy_fl_to_si6(inet, p->src.v6, s);
+ tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d);
break;
default:
BUG();
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 19ea6c2..8ce55b8 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,8 +15,7 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
-static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -24,7 +23,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked,
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp, acked);
+ tcp_slow_start(tp);
else
tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
}
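
For reference, tcp_cong_avoid_ai(tp, w) grows cwnd by one segment per w acked segments; capping w at TCP_SCALABLE_AI_CNT = 50 is what lets Scalable TCP's window grow by roughly cwnd/50 per RTT instead of Reno's +1. A rough userspace model (clamp handling omitted, field names ours):

    #include <stdio.h>

    #define TCP_SCALABLE_AI_CNT 50U

    struct ca { unsigned int cwnd, cwnd_cnt; };

    /* Count acked segments; every w of them buys one more cwnd segment. */
    static void cong_avoid_ai(struct ca *tp, unsigned int w)
    {
        if (tp->cwnd_cnt >= w) {
            tp->cwnd++;
            tp->cwnd_cnt = 0;
        } else {
            tp->cwnd_cnt++;
        }
    }

    int main(void)
    {
        struct ca tp = { .cwnd = 200, .cwnd_cnt = 0 };
        unsigned int w = tp.cwnd < TCP_SCALABLE_AI_CNT ?
                         tp.cwnd : TCP_SCALABLE_AI_CNT;

        for (int ack = 0; ack < 200; ack++)   /* one RTT at cwnd = 200 */
            cong_avoid_ai(&tp, w);
        printf("cwnd after one RTT: %u\n", tp.cwnd);  /* 203, vs 201 for Reno */
        return 0;
    }
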
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 64f0354..4b85e6f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -156,16 +156,12 @@ static bool retransmits_timed_out(struct sock *sk,
static int tcp_write_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
int retry_until;
bool do_reset, syn_set = false;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
- if (icsk->icsk_retransmits) {
+ if (icsk->icsk_retransmits)
dst_negative_advice(sk);
- if (tp->syn_fastopen || tp->syn_data)
- tcp_fastopen_cache_set(sk, 0, NULL, true);
- }
retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
syn_set = true;
} else {
@@ -378,8 +374,9 @@ void tcp_retransmit_timer(struct sock *sk)
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
- &sk->sk_v6_daddr,
+ &np->daddr,
ntohs(inet->inet_dport), inet->inet_num,
tp->snd_una, tp->snd_nxt);
}
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 06cae62..80fa2bf 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -163,14 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
return min(tp->snd_ssthresh, tp->snd_cwnd-1);
}
-static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct vegas *vegas = inet_csk_ca(sk);
if (!vegas->doing_vegas_now) {
- tcp_reno_cong_avoid(sk, ack, acked, in_flight);
+ tcp_reno_cong_avoid(sk, ack, in_flight);
return;
}
@@ -195,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked,
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, acked, in_flight);
+ tcp_reno_cong_avoid(sk, ack, in_flight);
} else {
u32 rtt, diff;
u64 target_cwnd;
@@ -244,7 +243,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked,
} else if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
- tcp_slow_start(tp, acked);
+ tcp_slow_start(tp);
} else {
/* Congestion avoidance. */
@@ -284,7 +283,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked,
}
/* Use normal slow start */
else if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp, acked);
+ tcp_slow_start(tp);
}
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index 0531b99..6c0eea2 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -15,10 +15,10 @@ struct vegas {
u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
};
-void tcp_vegas_init(struct sock *sk);
-void tcp_vegas_state(struct sock *sk, u8 ca_state);
-void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
-void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
-void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
+extern void tcp_vegas_init(struct sock *sk);
+extern void tcp_vegas_state(struct sock *sk, u8 ca_state);
+extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
+extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
+extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
#endif /* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 326475a..ac43cd7 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -114,14 +114,13 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
tcp_veno_init(sk);
}
-static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
if (!veno->doing_veno_now) {
- tcp_reno_cong_avoid(sk, ack, acked, in_flight);
+ tcp_reno_cong_avoid(sk, ack, in_flight);
return;
}
@@ -134,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked,
/* We don't have enough rtt samples to do the Veno
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, acked, in_flight);
+ tcp_reno_cong_avoid(sk, ack, in_flight);
} else {
u64 target_cwnd;
u32 rtt;
@@ -153,7 +152,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked,
if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
- tcp_slow_start(tp, acked);
+ tcp_slow_start(tp);
} else {
/* Congestion avoidance. */
if (veno->diff < beta) {
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index a347a07..05c3b6f 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -69,8 +69,7 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
}
-static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct yeah *yeah = inet_csk_ca(sk);
@@ -79,7 +78,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked,
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp, acked);
+ tcp_slow_start(tp);
else if (!yeah->doing_reno_now) {
/* Scalable */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 44f6a20..0ca44df 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -103,7 +103,6 @@
#include <linux/seq_file.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
-#include <net/inet_hashtables.h>
#include <net/route.h>
#include <net/checksum.h>
#include <net/xfrm.h>
@@ -220,7 +219,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
unsigned short first, last;
DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
- inet_get_local_port_range(net, &low, &high);
+ inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;
rand = net_random();
@@ -407,18 +406,6 @@ static inline int compute_score2(struct sock *sk, struct net *net,
return score;
}
-static unsigned int udp_ehashfn(struct net *net, const __be32 laddr,
- const __u16 lport, const __be32 faddr,
- const __be16 fport)
-{
- static u32 udp_ehash_secret __read_mostly;
-
- net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
-
- return __inet_ehashfn(laddr, lport, faddr, fport,
- udp_ehash_secret + net_hash_mix(net));
-}
-
/* called with read_rcu_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
@@ -442,8 +429,8 @@ begin:
badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
- hash = udp_ehashfn(net, daddr, hnum,
- saddr, sport);
+ hash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
matches = 1;
}
} else if (score == badness && reuseport) {
@@ -523,8 +510,8 @@ begin:
badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
- hash = udp_ehashfn(net, daddr, hnum,
- saddr, sport);
+ hash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
matches = 1;
}
} else if (score == badness && reuseport) {
@@ -578,26 +565,6 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
-static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
- __be16 loc_port, __be32 loc_addr,
- __be16 rmt_port, __be32 rmt_addr,
- int dif, unsigned short hnum)
-{
- struct inet_sock *inet = inet_sk(sk);
-
- if (!net_eq(sock_net(sk), net) ||
- udp_sk(sk)->udp_port_hash != hnum ||
- (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
- (inet->inet_dport != rmt_port && inet->inet_dport) ||
- (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
- ipv6_only_sock(sk) ||
- (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
- return false;
- if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif))
- return false;
- return true;
-}
-
static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
@@ -608,11 +575,20 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
unsigned short hnum = ntohs(loc_port);
sk_nulls_for_each_from(s, node) {
- if (__udp_is_mcast_sock(net, s,
- loc_port, loc_addr,
- rmt_port, rmt_addr,
- dif, hnum))
- goto found;
+ struct inet_sock *inet = inet_sk(s);
+
+ if (!net_eq(sock_net(s), net) ||
+ udp_sk(s)->udp_port_hash != hnum ||
+ (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
+ (inet->inet_dport != rmt_port && inet->inet_dport) ||
+ (inet->inet_rcv_saddr &&
+ inet->inet_rcv_saddr != loc_addr) ||
+ ipv6_only_sock(s) ||
+ (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
+ continue;
+ if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
+ continue;
+ goto found;
}
s = NULL;
found:
@@ -879,8 +855,6 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.opt = NULL;
ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
@@ -964,7 +938,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
faddr = ipc.opt->opt.faddr;
connected = 0;
}
- tos = get_rttos(&ipc, inet);
+ tos = RT_TOS(inet->tos);
if (sock_flag(sk, SOCK_LOCALROUTE) ||
(msg->msg_flags & MSG_DONTROUTE) ||
(ipc.opt && ipc.opt->opt.is_strictroute)) {
@@ -999,7 +973,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
err = PTR_ERR(rt);
rt = NULL;
if (err == -ENETUNREACH)
- IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
goto out;
}
@@ -1098,9 +1072,6 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
struct udp_sock *up = udp_sk(sk);
int ret;
- if (flags & MSG_SENDPAGE_NOTLAST)
- flags |= MSG_MORE;
-
if (!up->pending) {
struct msghdr msg = { .msg_flags = flags|MSG_MORE };
@@ -1238,8 +1209,14 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int is_udplite = IS_UDPLITE(sk);
bool slow;
+ /*
+ * Check any passed addresses
+ */
+ if (addr_len)
+ *addr_len = sizeof(*sin);
+
if (flags & MSG_ERRQUEUE)
- return ip_recv_error(sk, msg, len, addr_len);
+ return ip_recv_error(sk, msg, len);
try_again:
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
@@ -1299,7 +1276,6 @@ try_again:
sin->sin_port = udp_hdr(skb)->source;
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
- *addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
ip_cmsg_recv(msg, skb);
@@ -1427,10 +1403,8 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
- if (inet_sk(sk)->inet_daddr) {
+ if (inet_sk(sk)->inet_daddr)
sock_rps_save_rxhash(sk, skb);
- sk_mark_napi_id(sk, skb);
- }
rc = sock_queue_rcv_skb(sk, skb);
if (rc < 0) {
@@ -1554,7 +1528,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
rc = 0;
- ipv4_pktinfo_prepare(sk, skb);
+ ipv4_pktinfo_prepare(skb);
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
rc = __udp_queue_rcv_skb(sk, skb);
@@ -1603,14 +1577,6 @@ static void flush_stack(struct sock **stack, unsigned int count,
kfree_skb(skb1);
}
-static void udp_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
-{
- struct dst_entry *dst = skb_dst(skb);
-
- dst_hold(dst);
- sk->sk_rx_dst = dst;
-}
-
/*
* Multicasts and broadcasts go to each listener.
*
@@ -1739,32 +1705,16 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
- if (skb->sk) {
- int ret;
- sk = skb->sk;
-
- if (unlikely(sk->sk_rx_dst == NULL))
- udp_sk_rx_dst_set(sk, skb);
+ if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+ return __udp4_lib_mcast_deliver(net, skb, uh,
+ saddr, daddr, udptable);
- ret = udp_queue_rcv_skb(sk, skb);
-
- /* a return value > 0 means to resubmit the input, but
- * it wants the return to be -protocol, or 0
- */
- if (ret > 0)
- return -ret;
- return 0;
- } else {
- if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
- return __udp4_lib_mcast_deliver(net, skb, uh,
- saddr, daddr, udptable);
-
- sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
- }
+ sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
if (sk != NULL) {
int ret;
+ sk_mark_napi_id(sk, skb);
ret = udp_queue_rcv_skb(sk, skb);
sock_put(sk);
@@ -1818,135 +1768,6 @@ drop:
return 0;
}
-/* We can only early demux multicast if there is a single matching socket.
- * If more than one socket found returns NULL
- */
-static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
- __be16 loc_port, __be32 loc_addr,
- __be16 rmt_port, __be32 rmt_addr,
- int dif)
-{
- struct sock *sk, *result;
- struct hlist_nulls_node *node;
- unsigned short hnum = ntohs(loc_port);
- unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask);
- struct udp_hslot *hslot = &udp_table.hash[slot];
-
- rcu_read_lock();
-begin:
- count = 0;
- result = NULL;
- sk_nulls_for_each_rcu(sk, node, &hslot->head) {
- if (__udp_is_mcast_sock(net, sk,
- loc_port, loc_addr,
- rmt_port, rmt_addr,
- dif, hnum)) {
- result = sk;
- ++count;
- }
- }
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begin;
-
- if (result) {
- if (count != 1 ||
- unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(!__udp_is_mcast_sock(net, result,
- loc_port, loc_addr,
- rmt_port, rmt_addr,
- dif, hnum))) {
- sock_put(result);
- result = NULL;
- }
- }
- rcu_read_unlock();
- return result;
-}
-
-/* For unicast we should only early demux connected sockets or we can
- * break forwarding setups. The chains here can be long so only check
- * if the first socket is an exact match and if not move on.
- */
-static struct sock *__udp4_lib_demux_lookup(struct net *net,
- __be16 loc_port, __be32 loc_addr,
- __be16 rmt_port, __be32 rmt_addr,
- int dif)
-{
- struct sock *sk, *result;
- struct hlist_nulls_node *node;
- unsigned short hnum = ntohs(loc_port);
- unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
- unsigned int slot2 = hash2 & udp_table.mask;
- struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
- INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr)
- const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
-
- rcu_read_lock();
- result = NULL;
- udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
- if (INET_MATCH(sk, net, acookie,
- rmt_addr, loc_addr, ports, dif))
- result = sk;
- /* Only check first socket in chain */
- break;
- }
-
- if (result) {
- if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(!INET_MATCH(sk, net, acookie,
- rmt_addr, loc_addr,
- ports, dif))) {
- sock_put(result);
- result = NULL;
- }
- }
- rcu_read_unlock();
- return result;
-}
-
-void udp_v4_early_demux(struct sk_buff *skb)
-{
- const struct iphdr *iph = ip_hdr(skb);
- const struct udphdr *uh = udp_hdr(skb);
- struct sock *sk;
- struct dst_entry *dst;
- struct net *net = dev_net(skb->dev);
- int dif = skb->dev->ifindex;
-
- /* validate the packet */
- if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
- return;
-
- if (skb->pkt_type == PACKET_BROADCAST ||
- skb->pkt_type == PACKET_MULTICAST)
- sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
- uh->source, iph->saddr, dif);
- else if (skb->pkt_type == PACKET_HOST)
- sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
- uh->source, iph->saddr, dif);
- else
- return;
-
- if (!sk)
- return;
-
- skb->sk = sk;
- skb->destructor = sock_edemux;
- dst = sk->sk_rx_dst;
-
- if (dst)
- dst = dst_check(dst, 0);
- if (dst)
- skb_dst_set_noref(skb, dst);
-}
-
int udp_rcv(struct sk_buff *skb)
{
return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
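
The removed early-demux lookups above also show the hlist_nulls retry idiom: an RCU chain ends in an encoded slot number rather than NULL, and finishing on the wrong slot's sentinel means an entry was recycled onto another chain mid-walk, so the walk restarts. A single-threaded toy encoding of that check (all names ours; the real kernel lists are concurrent):

    #include <stdio.h>

    struct node {
        int key;
        struct node *next;
    };

    /* Sentinel: low bit set, originating slot number in the upper bits. */
    #define NULLS(slot)   ((struct node *)((((unsigned long)(slot)) << 1) | 1UL))
    #define IS_NULLS(p)   (((unsigned long)(p)) & 1UL)
    #define NULLS_VAL(p)  (((unsigned long)(p)) >> 1)

    static struct node *lookup(struct node *head, unsigned long slot, int key)
    {
        struct node *n;
    restart:
        for (n = head; !IS_NULLS(n); n = n->next)
            if (n->key == key)
                return n;
        /* Ended on another slot's sentinel: an entry moved while we
         * walked (possible under RCU + SLAB_DESTROY_BY_RCU); retry. */
        if (NULLS_VAL(n) != slot)
            goto restart;
        return NULL;
    }

    int main(void)
    {
        struct node c = { .key = 3, .next = NULLS(7) };
        struct node b = { .key = 2, .next = &c };
        struct node a = { .key = 1, .next = &b };

        struct node *hit = lookup(&a, 7, 3);
        printf("found %d\n", hit ? hit->key : -1);   /* found 3 */
        return 0;
    }
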
@@ -2329,7 +2150,7 @@ EXPORT_SYMBOL(udp_proc_unregister);
/* ------------------------------------------------------------------------ */
static void udp4_format_sock(struct sock *sp, struct seq_file *f,
- int bucket)
+ int bucket, int *len)
{
struct inet_sock *inet = inet_sk(sp);
__be32 dest = inet->inet_daddr;
@@ -2338,7 +2159,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
__u16 srcp = ntohs(inet->inet_sport);
seq_printf(f, "%5d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
bucket, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
@@ -2346,22 +2167,23 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
0, sock_i_ino(sp),
atomic_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ atomic_read(&sp->sk_drops), len);
}
int udp4_seq_show(struct seq_file *seq, void *v)
{
- seq_setwidth(seq, 127);
if (v == SEQ_START_TOKEN)
- seq_puts(seq, " sl local_address rem_address st tx_queue "
+ seq_printf(seq, "%-127s\n",
+ " sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
"inode ref pointer drops");
else {
struct udp_iter_state *state = seq->private;
+ int len;
- udp4_format_sock(v, seq, state->bucket);
+ udp4_format_sock(v, seq, state->bucket, &len);
+ seq_printf(seq, "%*s\n", 127 - len, "");
}
- seq_pad(seq, '\n');
return 0;
}
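
The restored seq_show path above sizes each record with %n and then pads it out to a fixed 127 columns, so every /proc/net/udp line has the same length. A userspace equivalent of the two-step (the format is a string literal, which keeps glibc's %n hardening happy; values are made up):

    #include <stdio.h>

    int main(void)
    {
        int len = 0;

        /* %n stores the number of characters emitted so far into len. */
        printf("%5d: %08X:%04X %08X:%04X%n",
               3, 0x0100007F, 0x1F90, 0x00000000, 0x0000, &len);
        /* Pad with spaces out to column 127, then end the line
         * (the kernel caller guarantees len stays below 127). */
        printf("%*s\n", 127 - len, "");
        return 0;
    }
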
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index f3c2789..5a681e2 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -5,30 +5,30 @@
#include <net/protocol.h>
#include <net/inet_common.h>
-int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+extern int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int );
+extern void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
-int udp_v4_get_port(struct sock *sk, unsigned short snum);
+extern int udp_v4_get_port(struct sock *sk, unsigned short snum);
-int udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-int udp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
+extern int udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+extern int udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
#ifdef CONFIG_COMPAT
-int compat_udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-int compat_udp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
+extern int compat_udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+extern int compat_udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
#endif
-int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len);
-int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
- int flags);
-int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-void udp_destroy_sock(struct sock *sk);
+extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len, int noblock, int flags, int *addr_len);
+extern int udp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags);
+extern int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+extern void udp_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
-int udp4_seq_show(struct seq_file *seq, void *v);
+extern int udp4_seq_show(struct seq_file *seq, void *v);
#endif
#endif /* _UDP4_IMPL_H */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 83206de..f35ecca 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -52,7 +52,6 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
SKB_GSO_UDP_TUNNEL |
- SKB_GSO_IPIP |
SKB_GSO_GRE | SKB_GSO_MPLS) ||
!(type & (SKB_GSO_UDP))))
goto out;
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 31b1815..b5663c3 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -16,13 +16,13 @@
#include <net/xfrm.h>
/* Informational hook. The decap is still done here. */
-static struct xfrm_tunnel_notifier __rcu *rcv_notify_handlers __read_mostly;
+static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly;
static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex);
-int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler)
+int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler)
{
- struct xfrm_tunnel_notifier __rcu **pprev;
- struct xfrm_tunnel_notifier *t;
+ struct xfrm_tunnel __rcu **pprev;
+ struct xfrm_tunnel *t;
int ret = -EEXIST;
int priority = handler->priority;
@@ -50,10 +50,10 @@ err:
}
EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register);
-int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler)
+int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler)
{
- struct xfrm_tunnel_notifier __rcu **pprev;
- struct xfrm_tunnel_notifier *t;
+ struct xfrm_tunnel __rcu **pprev;
+ struct xfrm_tunnel *t;
int ret = -ENOENT;
mutex_lock(&xfrm4_mode_tunnel_input_mutex);
@@ -134,7 +134,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct xfrm_tunnel_notifier *handler;
+ struct xfrm_tunnel *handler;
int err = -EINVAL;
if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index e1a6393..ccde542 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -104,14 +104,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
const struct iphdr *iph = ip_hdr(skb);
u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
struct flowi4 *fl4 = &fl->u.ip4;
- int oif = 0;
-
- if (skb_dst(skb))
- oif = skb_dst(skb)->dev->ifindex;
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;
- fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
+ fl4->flowi4_oif = skb_dst(skb)->dev->ifindex;
if (!ip_is_fragment(iph)) {
switch (iph->protocol) {
@@ -240,7 +236,7 @@ static struct dst_ops xfrm4_dst_ops = {
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
- .gc_thresh = 32768,
+ .gc_thresh = 1024,
};
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {