diff options
author | Scott Wood <scottwood@freescale.com> | 2014-04-08 01:00:49 (GMT) |
---|---|---|
committer | Scott Wood <scottwood@freescale.com> | 2014-04-08 19:58:35 (GMT) |
commit | 47d2261a3fa71cde24263559a4219a25e50d8c89 (patch) | |
tree | 28774d5b330ccf1b777a3af222d8356918328013 /net/ipv4 | |
parent | fb7f27080adc65cd5f341bdf56a1d0c14f316c1b (diff) | |
parent | 5fb9d37f27351e42f002e372074249f92cbdf815 (diff) | |
download | linux-fsl-qoriq-47d2261a3fa71cde24263559a4219a25e50d8c89.tar.xz |
Merge branch 'merge' into sdk-v1.6.x
This reverts v3.13-rc3+ (78fd82238d0e5716) to v3.12, except for
commits which I noticed which appear relevant to the SDK.
Signed-off-by: Scott Wood <scottwood@freescale.com>
Conflicts:
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_interrupts.S
arch/powerpc/kvm/e500.c
arch/powerpc/kvm/e500mc.c
arch/powerpc/sysdev/fsl_soc.h
drivers/Kconfig
drivers/cpufreq/ppc-corenet-cpufreq.c
drivers/dma/fsldma.c
drivers/dma/s3c24xx-dma.c
drivers/misc/Makefile
drivers/mmc/host/sdhci-of-esdhc.c
drivers/mtd/devices/m25p80.c
drivers/net/ethernet/freescale/gianfar.h
drivers/platform/Kconfig
drivers/platform/Makefile
drivers/spi/spi-fsl-espi.c
include/crypto/algapi.h
include/linux/netdev_features.h
include/linux/skbuff.h
include/net/ip.h
net/core/ethtool.c
Diffstat (limited to 'net/ipv4')
77 files changed, 1220 insertions, 1910 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 70011e0..cfeb85c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -245,6 +245,29 @@ out: } EXPORT_SYMBOL(inet_listen); +u32 inet_ehash_secret __read_mostly; +EXPORT_SYMBOL(inet_ehash_secret); + +u32 ipv6_hash_secret __read_mostly; +EXPORT_SYMBOL(ipv6_hash_secret); + +/* + * inet_ehash_secret must be set exactly once, and to a non nul value + * ipv6_hash_secret must be set exactly once. + */ +void build_ehash_secret(void) +{ + u32 rnd; + + do { + get_random_bytes(&rnd, sizeof(rnd)); + } while (rnd == 0); + + if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) + get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); +} +EXPORT_SYMBOL(build_ehash_secret); + /* * Create an inet socket. */ @@ -261,6 +284,10 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, int try_loading_module = 0; int err; + if (unlikely(!inet_ehash_secret)) + if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) + build_ehash_secret(); + sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ @@ -1227,36 +1254,36 @@ static int inet_gso_send_check(struct sk_buff *skb) if (ihl < sizeof(*iph)) goto out; - proto = iph->protocol; - - /* Warning: after this point, iph might be no longer valid */ if (unlikely(!pskb_may_pull(skb, ihl))) goto out; - __skb_pull(skb, ihl); + __skb_pull(skb, ihl); skb_reset_transport_header(skb); + iph = ip_hdr(skb); + proto = iph->protocol; err = -EPROTONOSUPPORT; + rcu_read_lock(); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_send_check)) err = ops->callbacks.gso_send_check(skb); + rcu_read_unlock(); out: return err; } static struct sk_buff *inet_gso_segment(struct sk_buff *skb, - netdev_features_t features) + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; - unsigned int offset = 0; - bool udpfrag, encap; struct iphdr *iph; int proto; - int nhoff; int ihl; int id; + unsigned int offset = 0; + bool tunnel; if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_TCPV4 | @@ -1264,16 +1291,12 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | - SKB_GSO_IPIP | - SKB_GSO_SIT | SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | SKB_GSO_MPLS | 0))) goto out; - skb_reset_network_header(skb); - nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) goto out; @@ -1282,50 +1305,42 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (ihl < sizeof(*iph)) goto out; - id = ntohs(iph->id); - proto = iph->protocol; - - /* Warning: after this point, iph might be no longer valid */ if (unlikely(!pskb_may_pull(skb, ihl))) goto out; - __skb_pull(skb, ihl); - encap = SKB_GSO_CB(skb)->encap_level > 0; - if (encap) - features = skb->dev->hw_enc_features & netif_skb_features(skb); - SKB_GSO_CB(skb)->encap_level += ihl; + tunnel = !!skb->encapsulation; + __skb_pull(skb, ihl); skb_reset_transport_header(skb); - + iph = ip_hdr(skb); + id = ntohs(iph->id); + proto = iph->protocol; segs = ERR_PTR(-EPROTONOSUPPORT); - /* Note : following gso_segment() might change skb->encapsulation */ - udpfrag = !skb->encapsulation && proto == IPPROTO_UDP; - + rcu_read_lock(); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); + rcu_read_unlock(); if (IS_ERR_OR_NULL(segs)) goto out; skb = segs; do { - iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); - if (udpfrag) { + iph = ip_hdr(skb); + if (!tunnel && proto == IPPROTO_UDP) { iph->id = htons(id); iph->frag_off = htons(offset >> 3); if (skb->next != NULL) iph->frag_off |= htons(IP_MF); - offset += skb->len - nhoff - ihl; - } else { + offset += (skb->len - skb->mac_len - iph->ihl * 4); + } else { iph->id = htons(id++); } - iph->tot_len = htons(skb->len - nhoff); - ip_send_check(iph); - if (encap) - skb_reset_inner_headers(skb); - skb->network_header = (u8 *)iph - skb->head; + iph->tot_len = htons(skb->len - skb->mac_len); + iph->check = 0; + iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); } while ((skb = skb->next)); out: @@ -1503,7 +1518,6 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) ptr[0] = __alloc_percpu(mibsize, align); if (!ptr[0]) return -ENOMEM; - #if SNMP_ARRAY_SZ == 2 ptr[1] = __alloc_percpu(mibsize, align); if (!ptr[1]) { @@ -1532,7 +1546,6 @@ static const struct net_protocol tcp_protocol = { }; static const struct net_protocol udp_protocol = { - .early_demux = udp_v4_early_demux, .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, @@ -1548,8 +1561,6 @@ static const struct net_protocol icmp_protocol = { static __net_init int ipv4_mib_init_net(struct net *net) { - int i; - if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics, sizeof(struct tcp_mib), __alignof__(struct tcp_mib)) < 0) @@ -1558,17 +1569,6 @@ static __net_init int ipv4_mib_init_net(struct net *net) sizeof(struct ipstats_mib), __alignof__(struct ipstats_mib)) < 0) goto err_ip_mib; - - for_each_possible_cpu(i) { - struct ipstats_mib *af_inet_stats; - af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[0], i); - u64_stats_init(&af_inet_stats->syncp); -#if SNMP_ARRAY_SZ == 2 - af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[1], i); - u64_stats_init(&af_inet_stats->syncp); -#endif - } - if (snmp_mib_init((void __percpu **)net->mib.net_statistics, sizeof(struct linux_mib), __alignof__(struct linux_mib)) < 0) @@ -1646,13 +1646,6 @@ static struct packet_offload ip_packet_offload __read_mostly = { }, }; -static const struct net_offload ipip_offload = { - .callbacks = { - .gso_send_check = inet_gso_send_check, - .gso_segment = inet_gso_segment, - }, -}; - static int __init ipv4_offload_init(void) { /* @@ -1664,7 +1657,6 @@ static int __init ipv4_offload_init(void) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); dev_add_offload(&ip_packet_offload); - inet_add_offload(&ipip_offload, IPPROTO_IPIP); return 0; } @@ -1713,6 +1705,8 @@ static int __init inet_init(void) ip_static_sysctl_init(); #endif + tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem; + /* * Add all the base protocols. */ diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 19e3637..b28e863 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -57,7 +57,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) - IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); goto out; } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 7785b28..109ee89 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -121,6 +121,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct aead_givcrypt_request *req; struct scatterlist *sg; struct scatterlist *asg; + struct esp_data *esp; struct sk_buff *trailer; void *tmp; u8 *iv; @@ -138,7 +139,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) /* skb is pure payload to encrypt */ - aead = x->data; + esp = x->data; + aead = esp->aead; alen = crypto_aead_authsize(aead); tfclen = 0; @@ -152,6 +154,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) } blksize = ALIGN(crypto_aead_blocksize(aead), 4); clen = ALIGN(skb->len + 2 + tfclen, blksize); + if (esp->padlen) + clen = ALIGN(clen, esp->padlen); plen = clen - skb->len - tfclen; err = skb_cow_data(skb, tfclen + plen + alen, &trailer); @@ -276,7 +280,8 @@ static int esp_input_done2(struct sk_buff *skb, int err) { const struct iphdr *iph; struct xfrm_state *x = xfrm_input_state(skb); - struct crypto_aead *aead = x->data; + struct esp_data *esp = x->data; + struct crypto_aead *aead = esp->aead; int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); int elen = skb->len - hlen; @@ -371,7 +376,8 @@ static void esp_input_done(struct crypto_async_request *base, int err) static int esp_input(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; - struct crypto_aead *aead = x->data; + struct esp_data *esp = x->data; + struct crypto_aead *aead = esp->aead; struct aead_request *req; struct sk_buff *trailer; int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); @@ -453,8 +459,9 @@ out: static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) { - struct crypto_aead *aead = x->data; - u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4); + struct esp_data *esp = x->data; + u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); + u32 align = max_t(u32, blksize, esp->padlen); unsigned int net_adj; switch (x->props.mode) { @@ -469,8 +476,8 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) BUG(); } - return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - - net_adj) & ~(blksize - 1)) + net_adj - 2; + return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - + net_adj) & ~(align - 1)) + net_adj - 2; } static void esp4_err(struct sk_buff *skb, u32 info) @@ -504,16 +511,18 @@ static void esp4_err(struct sk_buff *skb, u32 info) static void esp_destroy(struct xfrm_state *x) { - struct crypto_aead *aead = x->data; + struct esp_data *esp = x->data; - if (!aead) + if (!esp) return; - crypto_free_aead(aead); + crypto_free_aead(esp->aead); + kfree(esp); } static int esp_init_aead(struct xfrm_state *x) { + struct esp_data *esp = x->data; struct crypto_aead *aead; int err; @@ -522,7 +531,7 @@ static int esp_init_aead(struct xfrm_state *x) if (IS_ERR(aead)) goto error; - x->data = aead; + esp->aead = aead; err = crypto_aead_setkey(aead, x->aead->alg_key, (x->aead->alg_key_len + 7) / 8); @@ -539,6 +548,7 @@ error: static int esp_init_authenc(struct xfrm_state *x) { + struct esp_data *esp = x->data; struct crypto_aead *aead; struct crypto_authenc_key_param *param; struct rtattr *rta; @@ -573,7 +583,7 @@ static int esp_init_authenc(struct xfrm_state *x) if (IS_ERR(aead)) goto error; - x->data = aead; + esp->aead = aead; keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) + (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param)); @@ -628,11 +638,16 @@ error: static int esp_init_state(struct xfrm_state *x) { + struct esp_data *esp; struct crypto_aead *aead; u32 align; int err; - x->data = NULL; + esp = kzalloc(sizeof(*esp), GFP_KERNEL); + if (esp == NULL) + return -ENOMEM; + + x->data = esp; if (x->aead) err = esp_init_aead(x); @@ -642,7 +657,9 @@ static int esp_init_state(struct xfrm_state *x) if (err) goto error; - aead = x->data; + aead = esp->aead; + + esp->padlen = 0; x->props.header_len = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); @@ -666,7 +683,9 @@ static int esp_init_state(struct xfrm_state *x) } align = ALIGN(crypto_aead_blocksize(aead), 4); - x->props.trailer_len = align + 1 + crypto_aead_authsize(aead); + if (esp->padlen) + align = max_t(u32, align, esp->padlen); + x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead); error: return err; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index d846304..b3f627a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -933,6 +933,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) local_bh_disable(); frn->tb_id = tb->tb_id; + rcu_read_lock(); frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { @@ -941,6 +942,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) frn->type = res.type; frn->scope = res.scope; } + rcu_read_unlock(); local_bh_enable(); } } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 388d113..af0f14a 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -24,17 +24,21 @@ static inline void fib_alias_accessed(struct fib_alias *fa) } /* Exported by fib_semantics.c */ -void fib_release_info(struct fib_info *); -struct fib_info *fib_create_info(struct fib_config *cfg); -int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); -int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, - u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, - unsigned int); -void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, - u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); -struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); -int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int dflt); +extern void fib_release_info(struct fib_info *); +extern struct fib_info *fib_create_info(struct fib_config *cfg); +extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); +extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + u32 tb_id, u8 type, __be32 dst, + int dst_len, u8 tos, struct fib_info *fi, + unsigned int); +extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, + int dst_len, u32 tb_id, struct nl_info *info, + unsigned int nlm_flags); +extern struct fib_alias *fib_find_alias(struct list_head *fah, + u8 tos, u32 prio); +extern int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, + int *last_idx, int dflt); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e63f47a..d5dbca5 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -380,7 +380,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) } void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, - int dst_len, u32 tb_id, const struct nl_info *info, + int dst_len, u32 tb_id, struct nl_info *info, unsigned int nlm_flags) { struct sk_buff *skb; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5afeb5a..3df6d3e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -762,9 +762,12 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) if (IS_LEAF(node) || ((struct tnode *) node)->pos > tn->pos + tn->bits - 1) { - put_child(tn, - tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1), - node); + if (tkey_extract_bits(node->key, + oldtnode->pos + oldtnode->bits, + 1) == 0) + put_child(tn, 2*i, node); + else + put_child(tn, 2*i+1, node); continue; } @@ -1117,8 +1120,12 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) * first tnode need some special handling */ + if (tp) + pos = tp->pos+tp->bits; + else + pos = 0; + if (n) { - pos = tp ? tp->pos+tp->bits : 0; newpos = tkey_mismatch(key, pos, n->key); tn = tnode_new(n->key, newpos, 1); } else { @@ -2523,17 +2530,16 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) list_for_each_entry_rcu(fa, &li->falh, fa_list) { const struct fib_info *fi = fa->fa_info; unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); + int len; if (fa->fa_type == RTN_BROADCAST || fa->fa_type == RTN_MULTICAST) continue; - seq_setwidth(seq, 127); - if (fi) seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", + "%d\t%08X\t%d\t%u\t%u%n", fi->fib_dev ? fi->fib_dev->name : "*", prefix, fi->fib_nh->nh_gw, flags, 0, 0, @@ -2542,15 +2548,15 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) (fi->fib_advmss ? fi->fib_advmss + 40 : 0), fi->fib_window, - fi->fib_rtt >> 3); + fi->fib_rtt >> 3, &len); else seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", + "%d\t%08X\t%d\t%u\t%u%n", prefix, 0, flags, 0, 0, 0, - mask, 0, 0, 0); + mask, 0, 0, 0, &len); - seq_pad(seq, '\n'); + seq_printf(seq, "%*s\n", 127 - len, ""); } } diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 5893e99..736c9fc3 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -93,6 +93,35 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, } EXPORT_SYMBOL_GPL(gre_build_header); +struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum) +{ + int err; + + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + + if (skb_is_gso(skb)) { + err = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(err)) + goto error; + skb_shinfo(skb)->gso_type |= SKB_GSO_GRE; + return skb; + } else if (skb->ip_summed == CHECKSUM_PARTIAL && gre_csum) { + err = skb_checksum_help(skb); + if (unlikely(err)) + goto error; + } else if (skb->ip_summed != CHECKSUM_PARTIAL) + skb->ip_summed = CHECKSUM_NONE; + + return skb; +error: + kfree_skb(skb); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(gre_handle_offloads); + static __sum16 check_checksum(struct sk_buff *skb) { __sum16 csum = 0; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index e5d4361..55e6bfb 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -39,8 +39,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCP_ECN | - SKB_GSO_GRE | - SKB_GSO_IPIP))) + SKB_GSO_GRE))) goto out; if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5c0e8bc..5f7d11a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -353,9 +353,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) saddr = fib_compute_spec_dst(skb); ipc.opt = NULL; ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; if (ipc.opt->opt.srr) @@ -611,8 +608,6 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ipc.addr = iph->saddr; ipc.opt = &icmp_param->replyopts.opt; ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, type, code, icmp_param); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fc0e649..6acb541 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -29,19 +29,27 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif +/* + * This struct holds the first and last local port number. + */ +struct local_ports sysctl_local_ports __read_mostly = { + .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock), + .range = { 32768, 61000 }, +}; + unsigned long *sysctl_local_reserved_ports; EXPORT_SYMBOL(sysctl_local_reserved_ports); -void inet_get_local_port_range(struct net *net, int *low, int *high) +void inet_get_local_port_range(int *low, int *high) { unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&sysctl_local_ports.lock); - *low = net->ipv4.sysctl_local_ports.range[0]; - *high = net->ipv4.sysctl_local_ports.range[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + *low = sysctl_local_ports.range[0]; + *high = sysctl_local_ports.range[1]; + } while (read_seqretry(&sysctl_local_ports.lock, seq)); } EXPORT_SYMBOL(inet_get_local_port_range); @@ -71,16 +79,17 @@ int inet_csk_bind_conflict(const struct sock *sk, (!reuseport || !sk2->sk_reuseport || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2))))) { - - if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || - sk2->sk_rcv_saddr == sk->sk_rcv_saddr) + const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); + if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || + sk2_rcv_saddr == sk_rcv_saddr(sk)) break; } if (!relax && reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) { + const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); - if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || - sk2->sk_rcv_saddr == sk->sk_rcv_saddr) + if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || + sk2_rcv_saddr == sk_rcv_saddr(sk)) break; } } @@ -107,7 +116,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) int remaining, rover, low, high; again: - inet_get_local_port_range(net, &low, &high); + inet_get_local_port_range(&low, &high); remaining = (high - low) + 1; smallest_rover = rover = net_random() % remaining + low; @@ -412,8 +421,8 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, flags, - (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); + (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, + ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -448,8 +457,8 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), - (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); + (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, + ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -495,9 +504,9 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, prev = &req->dl_next) { const struct inet_request_sock *ireq = inet_rsk(req); - if (ireq->ir_rmt_port == rport && - ireq->ir_rmt_addr == raddr && - ireq->ir_loc_addr == laddr && + if (ireq->rmt_port == rport && + ireq->rmt_addr == raddr && + ireq->loc_addr == laddr && AF_INET_FAMILY(req->rsk_ops->family)) { WARN_ON(req->sk); *prevp = prev; @@ -514,8 +523,7 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, { struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, - inet_rsk(req)->ir_rmt_port, + const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd, lopt->nr_table_entries); reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); @@ -675,9 +683,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, newsk->sk_state = TCP_SYN_RECV; newicsk->icsk_bind_hash = NULL; - inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; - inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; - inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); + inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port; + inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port); + inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port; newsk->sk_write_space = sk_stream_write_space; newicsk->icsk_retransmits = 0; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 56a964a..5f64875 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -121,13 +121,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, #if IS_ENABLED(CONFIG_IPV6) if (r->idiag_family == AF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); - *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr; - *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr; + *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr; + *(struct in6_addr *)r->id.idiag_dst = np->daddr; if (ext & (1 << (INET_DIAG_TCLASS - 1))) - if (nla_put_u8(skb, INET_DIAG_TCLASS, - inet6_sk(sk)->tclass) < 0) + if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0) goto errout; } #endif @@ -222,7 +222,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { - s32 tmo; + long tmo; struct inet_diag_msg *r; struct nlmsghdr *nlh; @@ -234,7 +234,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, r = nlmsg_data(nlh); BUG_ON(tw->tw_state != TCP_TIME_WAIT); - tmo = tw->tw_ttd - inet_tw_time_stamp(); + tmo = tw->tw_ttd - jiffies; if (tmo < 0) tmo = 0; @@ -248,15 +248,18 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, r->id.idiag_dst[0] = tw->tw_daddr; r->idiag_state = tw->tw_substate; r->idiag_timer = 3; - r->idiag_expires = jiffies_to_msecs(tmo); + r->idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; r->idiag_inode = 0; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == AF_INET6) { - *(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr; - *(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr; + const struct inet6_timewait_sock *tw6 = + inet6_twsk((struct sock *)tw); + + *(struct in6_addr *)r->id.idiag_src = tw6->tw_v6_rcv_saddr; + *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr; } #endif @@ -270,11 +273,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, const struct nlmsghdr *unlh) { if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq, - nlmsg_flags, unlh); - - return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, - nlmsg_flags, unlh); + return inet_twsk_diag_fill((struct inet_timewait_sock *)sk, + skb, r, portid, seq, nlmsg_flags, + unlh); + return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh); } int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb, @@ -336,9 +338,12 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s err = 0; out: - if (sk) - sock_gen_put(sk); - + if (sk) { + if (sk->sk_state == TCP_TIME_WAIT) + inet_twsk_put((struct inet_timewait_sock *)sk); + else + sock_put(sk); + } out_nosk: return err; } @@ -484,9 +489,10 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) if (entry.family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); - entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32; - entry.daddr = sk->sk_v6_daddr.s6_addr32; + entry.saddr = np->rcv_saddr.s6_addr32; + entry.daddr = np->daddr.s6_addr32; } else #endif { @@ -629,22 +635,22 @@ static int inet_csk_diag_dump(struct sock *sk, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } -static int inet_twsk_diag_dump(struct sock *sk, +static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, struct sk_buff *skb, struct netlink_callback *cb, struct inet_diag_req_v2 *r, const struct nlattr *bc) { - struct inet_timewait_sock *tw = inet_twsk(sk); - if (bc != NULL) { struct inet_diag_entry entry; entry.family = tw->tw_family; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == AF_INET6) { - entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32; - entry.daddr = tw->tw_v6_daddr.s6_addr32; + struct inet6_timewait_sock *tw6 = + inet6_twsk((struct sock *)tw); + entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32; + entry.daddr = tw6->tw_v6_daddr.s6_addr32; } else #endif { @@ -676,12 +682,12 @@ static inline void inet_diag_req_addrs(const struct sock *sk, #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { if (req->rsk_ops->family == AF_INET6) { - entry->saddr = ireq->ir_v6_loc_addr.s6_addr32; - entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32; + entry->saddr = inet6_rsk(req)->loc_addr.s6_addr32; + entry->daddr = inet6_rsk(req)->rmt_addr.s6_addr32; } else if (req->rsk_ops->family == AF_INET) { - ipv6_addr_set_v4mapped(ireq->ir_loc_addr, + ipv6_addr_set_v4mapped(ireq->loc_addr, &entry->saddr_storage); - ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, + ipv6_addr_set_v4mapped(ireq->rmt_addr, &entry->daddr_storage); entry->saddr = entry->saddr_storage.s6_addr32; entry->daddr = entry->daddr_storage.s6_addr32; @@ -689,8 +695,8 @@ static inline void inet_diag_req_addrs(const struct sock *sk, } else #endif { - entry->saddr = &ireq->ir_loc_addr; - entry->daddr = &ireq->ir_rmt_addr; + entry->saddr = &ireq->loc_addr; + entry->daddr = &ireq->rmt_addr; } } @@ -725,9 +731,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, tmo = 0; r->id.idiag_sport = inet->inet_sport; - r->id.idiag_dport = ireq->ir_rmt_port; - r->id.idiag_src[0] = ireq->ir_loc_addr; - r->id.idiag_dst[0] = ireq->ir_rmt_addr; + r->id.idiag_dport = ireq->rmt_port; + r->id.idiag_src[0] = ireq->loc_addr; + r->id.idiag_dst[0] = ireq->rmt_addr; r->idiag_expires = jiffies_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; @@ -786,13 +792,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, if (reqnum < s_reqnum) continue; - if (r->id.idiag_dport != ireq->ir_rmt_port && + if (r->id.idiag_dport != ireq->rmt_port && r->id.idiag_dport) continue; if (bc) { inet_diag_req_addrs(sk, req, &entry); - entry.dport = ntohs(ireq->ir_rmt_port); + entry.dport = ntohs(ireq->rmt_port); if (!inet_diag_bc_run(bc, &entry)) continue; @@ -905,7 +911,8 @@ skip_listen_ht: num = 0; - if (hlist_nulls_empty(&head->chain)) + if (hlist_nulls_empty(&head->chain) && + hlist_nulls_empty(&head->twchain)) continue; if (i > s_i) @@ -913,7 +920,7 @@ skip_listen_ht: spin_lock_bh(lock); sk_nulls_for_each(sk, node, &head->chain) { - int res; + struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; @@ -922,19 +929,15 @@ skip_listen_ht: if (!(r->idiag_states & (1 << sk->sk_state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) + sk->sk_family != r->sdiag_family) goto next_normal; - if (r->id.idiag_sport != htons(sk->sk_num) && + if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next_normal; - if (r->id.idiag_dport != sk->sk_dport && + if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next_normal; - if (sk->sk_state == TCP_TIME_WAIT) - res = inet_twsk_diag_dump(sk, skb, cb, r, bc); - else - res = inet_csk_diag_dump(sk, skb, cb, r, bc); - if (res < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { spin_unlock_bh(lock); goto done; } @@ -942,6 +945,33 @@ next_normal: ++num; } + if (r->idiag_states & TCPF_TIME_WAIT) { + struct inet_timewait_sock *tw; + + inet_twsk_for_each(tw, node, + &head->twchain) { + if (!net_eq(twsk_net(tw), net)) + continue; + + if (num < s_num) + goto next_dying; + if (r->sdiag_family != AF_UNSPEC && + tw->tw_family != r->sdiag_family) + goto next_dying; + if (r->id.idiag_sport != tw->tw_sport && + r->id.idiag_sport) + goto next_dying; + if (r->id.idiag_dport != tw->tw_dport && + r->id.idiag_dport) + goto next_dying; + if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) { + spin_unlock_bh(lock); + goto done; + } +next_dying: + ++num; + } + } spin_unlock_bh(lock); } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index bb075fc..c5313a9 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -93,6 +93,9 @@ void inet_frags_init(struct inet_frags *f) } rwlock_init(&f->lock); + f->rnd = (u32) ((totalram_pages ^ (totalram_pages >> 7)) ^ + (jiffies ^ (jiffies >> 6))); + setup_timer(&f->secret_timer, inet_frag_secret_rebuild, (unsigned long)f); f->secret_timer.expires = jiffies + f->secret_interval; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8b9cf27..96da9c7 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -24,31 +24,6 @@ #include <net/secure_seq.h> #include <net/ip.h> -static unsigned int inet_ehashfn(struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) -{ - static u32 inet_ehash_secret __read_mostly; - - net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); - - return __inet_ehashfn(laddr, lport, faddr, fport, - inet_ehash_secret + net_hash_mix(net)); -} - - -static unsigned int inet_sk_ehashfn(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const __be32 laddr = inet->inet_rcv_saddr; - const __u16 lport = inet->inet_num; - const __be32 faddr = inet->inet_daddr; - const __be16 fport = inet->inet_dport; - struct net *net = sock_net(sk); - - return inet_ehashfn(net, laddr, lport, faddr, fport); -} - /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. @@ -255,19 +230,6 @@ begin: } EXPORT_SYMBOL_GPL(__inet_lookup_listener); -/* All sockets share common refcount, but have different destructors */ -void sock_gen_put(struct sock *sk) -{ - if (!atomic_dec_and_test(&sk->sk_refcnt)) - return; - - if (sk->sk_state == TCP_TIME_WAIT) - inet_twsk_free(inet_twsk(sk)); - else - sk_free(sk); -} -EXPORT_SYMBOL_GPL(sock_gen_put); - struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, @@ -293,13 +255,13 @@ begin: if (likely(INET_MATCH(sk, net, acookie, saddr, daddr, ports, dif))) { if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) - goto out; + goto begintw; if (unlikely(!INET_MATCH(sk, net, acookie, saddr, daddr, ports, dif))) { - sock_gen_put(sk); + sock_put(sk); goto begin; } - goto found; + goto out; } } /* @@ -309,9 +271,37 @@ begin: */ if (get_nulls_value(node) != slot) goto begin; -out: + +begintw: + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_nulls_for_each_rcu(sk, node, &head->twchain) { + if (sk->sk_hash != hash) + continue; + if (likely(INET_TW_MATCH(sk, net, acookie, + saddr, daddr, ports, + dif))) { + if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { + sk = NULL; + goto out; + } + if (unlikely(!INET_TW_MATCH(sk, net, acookie, + saddr, daddr, ports, + dif))) { + inet_twsk_put(inet_twsk(sk)); + goto begintw; + } + goto out; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto begintw; sk = NULL; -found: +out: rcu_read_unlock(); return sk; } @@ -336,29 +326,39 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, spinlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_nulls_node *node; - struct inet_timewait_sock *tw = NULL; + struct inet_timewait_sock *tw; int twrefcnt = 0; spin_lock(lock); - sk_nulls_for_each(sk2, node, &head->chain) { + /* Check TIME-WAIT sockets first. */ + sk_nulls_for_each(sk2, node, &head->twchain) { if (sk2->sk_hash != hash) continue; - if (likely(INET_MATCH(sk2, net, acookie, + if (likely(INET_TW_MATCH(sk2, net, acookie, saddr, daddr, ports, dif))) { - if (sk2->sk_state == TCP_TIME_WAIT) { - tw = inet_twsk(sk2); - if (twsk_unique(sk, sk2, twp)) - break; - } - goto not_unique; + tw = inet_twsk(sk2); + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; } } + tw = NULL; + /* And established part... */ + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash) + continue; + if (likely(INET_MATCH(sk2, net, acookie, + saddr, daddr, ports, dif))) + goto not_unique; + } + +unique: /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. - */ + * in hash table socket with a funny identity. */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; @@ -494,7 +494,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, u32 offset = hint + port_offset; struct inet_timewait_sock *tw = NULL; - inet_get_local_port_range(net, &low, &high); + inet_get_local_port_range(&low, &high); remaining = (high - low) + 1; local_bh_disable(); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 6d592f8..1f27c9f 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -87,11 +87,19 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, refcnt += inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead->lock); - BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); - atomic_sub(refcnt, &tw->tw_refcnt); +#ifdef SOCK_REFCNT_DEBUG + if (atomic_read(&tw->tw_refcnt) != 1) { + pr_debug("%s timewait_sock %p refcnt=%d\n", + tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); + } +#endif + while (refcnt) { + inet_twsk_put(tw); + refcnt--; + } } -void inet_twsk_free(struct inet_timewait_sock *tw) +static noinline void inet_twsk_free(struct inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; twsk_destructor((struct sock *)tw); @@ -110,18 +118,6 @@ void inet_twsk_put(struct inet_timewait_sock *tw) } EXPORT_SYMBOL_GPL(inet_twsk_put); -static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, - struct hlist_nulls_head *list) -{ - hlist_nulls_add_head_rcu(&tw->tw_node, list); -} - -static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_bind_node, list); -} - /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it @@ -150,21 +146,26 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, spin_lock(lock); /* - * Step 2: Hash TW into tcp ehash chain. - * Notes : - * - tw_refcnt is set to 3 because : - * - We have one reference from bhash chain. - * - We have one reference from ehash chain. - * We can use atomic_set() because prior spin_lock()/spin_unlock() - * committed into memory all tw fields. + * Step 2: Hash TW into TIMEWAIT chain. + * Should be done before removing sk from established chain + * because readers are lockless and search established first. */ - atomic_set(&tw->tw_refcnt, 1 + 1 + 1); - inet_twsk_add_node_rcu(tw, &ehead->chain); + inet_twsk_add_node_rcu(tw, &ehead->twchain); - /* Step 3: Remove SK from hash chain */ + /* Step 3: Remove SK from established hash. */ if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + /* + * Notes : + * - We initially set tw_refcnt to 0 in inet_twsk_alloc() + * - We add one reference for the bhash link + * - We add one reference for the ehash link + * - We want this refcnt update done before allowing other + * threads to find this tw in ehash chain. + */ + atomic_add(1 + 1 + 1, &tw->tw_refcnt); + spin_unlock(lock); } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); @@ -386,11 +387,11 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, if (slot >= INET_TWDR_TWKILL_SLOTS) slot = INET_TWDR_TWKILL_SLOTS - 1; } - tw->tw_ttd = inet_tw_time_stamp() + timeo; + tw->tw_ttd = jiffies + timeo; slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); list = &twdr->cells[slot]; } else { - tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK); + tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); if (twdr->twcal_hand < 0) { twdr->twcal_hand = 0; @@ -489,9 +490,7 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo, restart_rcu: rcu_read_lock(); restart: - sk_nulls_for_each_rcu(sk, node, &head->chain) { - if (sk->sk_state != TCP_TIME_WAIT) - continue; + sk_nulls_for_each_rcu(sk, node, &head->twchain) { tw = inet_twsk(sk); if ((tw->tw_family != family) || atomic_read(&twsk_net(tw)->count)) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2481993..b66910a 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -106,7 +106,6 @@ struct ip4_create_arg { static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) { - net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); return jhash_3words((__force u32)id << 16 | prot, (__force u32)saddr, (__force u32)daddr, ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9124027..3982eab 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -810,7 +810,7 @@ static int __ip_append_data(struct sock *sk, int copy; int err; int offset = 0; - unsigned int maxfraglen, fragheaderlen, maxnonfragsize; + unsigned int maxfraglen, fragheaderlen; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; @@ -823,10 +823,8 @@ static int __ip_append_data(struct sock *sk, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ? - mtu : 0xFFFF; - if (cork->length + length > maxnonfragsize - fragheaderlen) { + if (cork->length + length > 0xFFFF - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu-exthdrlen); return -EMSGSIZE; @@ -1037,6 +1035,7 @@ error: static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, struct ipcm_cookie *ipc, struct rtable **rtp) { + struct inet_sock *inet = inet_sk(sk); struct ip_options_rcu *opt; struct rtable *rt; @@ -1062,13 +1061,10 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, * We steal reference to this route, caller should not release it */ *rtp = NULL; - cork->fragsize = ip_sk_use_pmtu(sk) ? - dst_mtu(&rt->dst) : rt->dst.dev->mtu; + cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(&rt->dst); cork->dst = &rt->dst; cork->length = 0; - cork->ttl = ipc->ttl; - cork->tos = ipc->tos; - cork->priority = ipc->priority; cork->tx_flags = ipc->tx_flags; return 0; @@ -1123,7 +1119,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, int mtu; int len; int err; - unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize; + unsigned int maxfraglen, fragheaderlen, fraggap; if (inet->hdrincl) return -EPERM; @@ -1147,10 +1143,8 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ? - mtu : 0xFFFF; - if (cork->length + size > maxnonfragsize - fragheaderlen) { + if (cork->length + size > 0xFFFF - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu); return -EMSGSIZE; } @@ -1314,8 +1308,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, /* DF bit is set when we want to see DF on outgoing frames. * If local_df is set too, we still allow to fragment this frame * locally. */ - if (inet->pmtudisc == IP_PMTUDISC_DO || - inet->pmtudisc == IP_PMTUDISC_PROBE || + if (inet->pmtudisc >= IP_PMTUDISC_DO || (skb->len <= dst_mtu(&rt->dst) && ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); @@ -1323,9 +1316,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, if (cork->flags & IPCORK_OPT) opt = cork->opt; - if (cork->ttl != 0) - ttl = cork->ttl; - else if (rt->rt_type == RTN_MULTICAST) + if (rt->rt_type == RTN_MULTICAST) ttl = inet->mc_ttl; else ttl = ip_select_ttl(inet, &rt->dst); @@ -1333,7 +1324,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; - iph->tos = (cork->tos != -1) ? cork->tos : inet->tos; + iph->tos = inet->tos; iph->frag_off = df; iph->ttl = ttl; iph->protocol = sk->sk_protocol; @@ -1345,7 +1336,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, ip_options_build(skb, opt, cork->addr, rt, 0); } - skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; + skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec @@ -1495,8 +1486,6 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, ipc.addr = daddr; ipc.opt = NULL; ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ddf32a6..d9c4f11 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -189,7 +189,7 @@ EXPORT_SYMBOL(ip_cmsg_recv); int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) { - int err, val; + int err; struct cmsghdr *cmsg; for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { @@ -215,24 +215,6 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) ipc->addr = info->ipi_spec_dst.s_addr; break; } - case IP_TTL: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) - return -EINVAL; - val = *(int *)CMSG_DATA(cmsg); - if (val < 1 || val > 255) - return -EINVAL; - ipc->ttl = val; - break; - case IP_TOS: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) - return -EINVAL; - val = *(int *)CMSG_DATA(cmsg); - if (val < 0 || val > 255) - return -EINVAL; - ipc->tos = val; - ipc->priority = rt_tos2priority(ipc->tos); - break; - default: return -EINVAL; } @@ -386,7 +368,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf /* * Handle MSG_ERRQUEUE */ -int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) +int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) { struct sock_exterr_skb *serr; struct sk_buff *skb, *skb2; @@ -423,7 +405,6 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr->addr_offset); sin->sin_port = serr->port; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); - *addr_len = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); @@ -628,7 +609,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->nodefrag = val ? 1 : 0; break; case IP_MTU_DISCOVER: - if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE) + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) goto e_inval; inet->pmtudisc = val; break; @@ -1053,12 +1034,11 @@ e_inval: * destination in skb->cb[] before dst drop. * This way, receiver doesnt make cache line misses to read rtable. */ -void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) +void ipv4_pktinfo_prepare(struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); - if ((inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) && - skb_rtable(skb)) { + if (skb_rtable(skb)) { pktinfo->ipi_ifindex = inet_iif(skb); pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 90ff957..63a6d6d 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -454,8 +454,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, tstats->rx_bytes += skb->len; u64_stats_update_end(&tstats->syncp); - skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); - if (tunnel->dev->type == ARPHRD_ETHER) { skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); @@ -463,6 +461,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, skb->dev = tunnel->dev; } + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); + gro_cells_receive(&tunnel->gro_cells, skb); return 0; @@ -976,19 +976,13 @@ int ip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; - int i, err; + int err; dev->destructor = ip_tunnel_dev_free; dev->tstats = alloc_percpu(struct pcpu_tstats); if (!dev->tstats) return -ENOMEM; - for_each_possible_cpu(i) { - struct pcpu_tstats *ipt_stats; - ipt_stats = per_cpu_ptr(dev->tstats, i); - u64_stats_init(&ipt_stats->syncp); - } - err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { free_percpu(dev->tstats); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 42ffbc8..c31e3ad 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -116,36 +116,3 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) return 0; } EXPORT_SYMBOL_GPL(iptunnel_pull_header); - -struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, - bool csum_help, - int gso_type_mask) -{ - int err; - - if (likely(!skb->encapsulation)) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } - - if (skb_is_gso(skb)) { - err = skb_unclone(skb, GFP_ATOMIC); - if (unlikely(err)) - goto error; - skb_shinfo(skb)->gso_type |= gso_type_mask; - return skb; - } - - if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { - err = skb_checksum_help(skb); - if (unlikely(err)) - goto error; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->ip_summed = CHECKSUM_NONE; - - return skb; -error: - kfree_skb(skb); - return ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 52b802a..6e87f85 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -49,6 +49,70 @@ static struct rtnl_link_ops vti_link_ops __read_mostly; static int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); +static int vti_err(struct sk_buff *skb, u32 info) +{ + + /* All the routers (except for Linux) return only + * 8 bytes of packet payload. It means, that precise relaying of + * ICMP in the real Internet is absolutely infeasible. + */ + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + struct iphdr *iph = (struct iphdr *)skb->data; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct ip_tunnel *t; + int err; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return 0; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return 0; + default: + /* All others are translated to HOST_UNREACH. */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return 0; + break; + } + + err = -ENOENT; + + t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->daddr, iph->saddr, 0); + if (t == NULL) + goto out; + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + t->parms.link, 0, IPPROTO_IPIP, 0); + err = 0; + goto out; + } + + err = 0; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + goto out; + + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +out: + return err; +} + /* We dont digest the packet therefore let the packet pass */ static int vti_rcv(struct sk_buff *skb) { @@ -126,7 +190,6 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (!rt->dst.xfrm || rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) { dev->stats.tx_carrier_errors++; - ip_rt_put(rt); goto tx_error_icmp; } tdev = rt->dst.dev; @@ -241,8 +304,9 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; } -static struct xfrm_tunnel_notifier vti_handler __read_mostly = { +static struct xfrm_tunnel vti_handler __read_mostly = { .handler = vti_rcv, + .err_handler = vti_err, .priority = 1, }; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index fe3e9f7..7f80fb4 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -220,17 +220,17 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb->protocol != htons(ETH_P_IP))) goto tx_error; - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); - if (IS_ERR(skb)) - goto out; + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); return NETDEV_TX_OK; tx_error: - dev_kfree_skb(skb); -out: dev->stats.tx_errors++; + dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -275,7 +275,6 @@ static const struct net_device_ops ipip_netdev_ops = { #define IPIP_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ - NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static void ipip_tunnel_setup(struct net_device *dev) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 40d5607..1657e39b 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,27 +36,6 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. -config NF_TABLES_IPV4 - depends on NF_TABLES - tristate "IPv4 nf_tables support" - -config NFT_REJECT_IPV4 - depends on NF_TABLES_IPV4 - tristate "nf_tables IPv4 reject support" - -config NFT_CHAIN_ROUTE_IPV4 - depends on NF_TABLES_IPV4 - tristate "IPv4 nf_tables route chain support" - -config NFT_CHAIN_NAT_IPV4 - depends on NF_TABLES_IPV4 - depends on NF_NAT_IPV4 && NFT_NAT - tristate "IPv4 nf_tables nat chain support" - -config NF_TABLES_ARP - depends on NF_TABLES - tristate "ARP nf_tables support" - config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" default m if NETFILTER_ADVANCED=n diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 19df72b..3622b24 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -27,12 +27,6 @@ obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o -obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o -obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o -obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o -obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o -obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o - # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 59da7cd..85a4f21 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -271,11 +271,6 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); private = table->private; - /* - * Ensure we load private-> members after we've fetched the base - * pointer. - */ - smp_read_barrier_depends(); table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 802ddec..a865f6f 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -27,14 +27,13 @@ static const struct xt_table packet_filter = { /* The work comes in here from netfilter.c */ static unsigned int -arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +arptable_filter_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net = dev_net((in != NULL) ? in : out); - return arpt_do_table(skb, ops->hooknum, in, out, - net->ipv4.arptable_filter); + return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter); } static struct nf_hook_ops *arpfilter_ops __read_mostly; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 61dbcd5..cb91101 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -340,11 +340,6 @@ ipt_do_table(struct sk_buff *skb, addend = xt_write_recseq_begin(); private = table->private; cpu = smp_processor_id(); - /* - * Ensure we load private-> members after we've fetched the base - * pointer. - */ - smp_read_barrier_depends(); table_base = private->entries[cpu]; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2510c02..0b732ef 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -28,7 +28,6 @@ #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> #include <net/netfilter/nf_conntrack.h> #include <net/net_namespace.h> -#include <net/netns/generic.h> #include <net/checksum.h> #include <net/ip.h> @@ -58,21 +57,15 @@ struct clusterip_config { struct rcu_head rcu; }; -#ifdef CONFIG_PROC_FS -static const struct file_operations clusterip_proc_fops; -#endif +static LIST_HEAD(clusterip_configs); -static int clusterip_net_id __read_mostly; - -struct clusterip_net { - struct list_head configs; - /* lock protects the configs list */ - spinlock_t lock; +/* clusterip_lock protects the clusterip_configs list */ +static DEFINE_SPINLOCK(clusterip_lock); #ifdef CONFIG_PROC_FS - struct proc_dir_entry *procdir; +static const struct file_operations clusterip_proc_fops; +static struct proc_dir_entry *clusterip_procdir; #endif -}; static inline void clusterip_config_get(struct clusterip_config *c) @@ -99,13 +92,10 @@ clusterip_config_put(struct clusterip_config *c) static inline void clusterip_config_entry_put(struct clusterip_config *c) { - struct net *net = dev_net(c->dev); - struct clusterip_net *cn = net_generic(net, clusterip_net_id); - local_bh_disable(); - if (atomic_dec_and_lock(&c->entries, &cn->lock)) { + if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) { list_del_rcu(&c->list); - spin_unlock(&cn->lock); + spin_unlock(&clusterip_lock); local_bh_enable(); dev_mc_del(c->dev, c->clustermac); @@ -123,12 +113,11 @@ clusterip_config_entry_put(struct clusterip_config *c) } static struct clusterip_config * -__clusterip_config_find(struct net *net, __be32 clusterip) +__clusterip_config_find(__be32 clusterip) { struct clusterip_config *c; - struct clusterip_net *cn = net_generic(net, clusterip_net_id); - list_for_each_entry_rcu(c, &cn->configs, list) { + list_for_each_entry_rcu(c, &clusterip_configs, list) { if (c->clusterip == clusterip) return c; } @@ -137,12 +126,12 @@ __clusterip_config_find(struct net *net, __be32 clusterip) } static inline struct clusterip_config * -clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) +clusterip_config_find_get(__be32 clusterip, int entry) { struct clusterip_config *c; rcu_read_lock_bh(); - c = __clusterip_config_find(net, clusterip); + c = __clusterip_config_find(clusterip); if (c) { if (unlikely(!atomic_inc_not_zero(&c->refcount))) c = NULL; @@ -169,7 +158,6 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, struct net_device *dev) { struct clusterip_config *c; - struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id); c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) @@ -192,7 +180,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, /* create proc dir entry */ sprintf(buffer, "%pI4", &ip); c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR, - cn->procdir, + clusterip_procdir, &clusterip_proc_fops, c); if (!c->pde) { kfree(c); @@ -201,9 +189,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, } #endif - spin_lock_bh(&cn->lock); - list_add_rcu(&c->list, &cn->configs); - spin_unlock_bh(&cn->lock); + spin_lock_bh(&clusterip_lock); + list_add_rcu(&c->list, &clusterip_configs); + spin_unlock_bh(&clusterip_lock); return c; } @@ -382,7 +370,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) /* FIXME: further sanity checks */ - config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1); + config = clusterip_config_find_get(e->ip.dst.s_addr, 1); if (!config) { if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { pr_info("no config found for %pI4, need 'new'\n", @@ -396,7 +384,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) return -EINVAL; } - dev = dev_get_by_name(par->net, e->ip.iniface); + dev = dev_get_by_name(&init_net, e->ip.iniface); if (!dev) { pr_info("no such interface %s\n", e->ip.iniface); @@ -495,7 +483,7 @@ static void arp_print(struct arp_payload *payload) #endif static unsigned int -arp_mangle(const struct nf_hook_ops *ops, +arp_mangle(unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -504,7 +492,6 @@ arp_mangle(const struct nf_hook_ops *ops, struct arphdr *arp = arp_hdr(skb); struct arp_payload *payload; struct clusterip_config *c; - struct net *net = dev_net(in ? in : out); /* we don't care about non-ethernet and non-ipv4 ARP */ if (arp->ar_hrd != htons(ARPHRD_ETHER) || @@ -521,7 +508,7 @@ arp_mangle(const struct nf_hook_ops *ops, /* if there is no clusterip configuration for the arp reply's * source ip, we don't want to mangle it */ - c = clusterip_config_find_get(net, payload->src_ip, 0); + c = clusterip_config_find_get(payload->src_ip, 0); if (!c) return NF_ACCEPT; @@ -711,75 +698,48 @@ static const struct file_operations clusterip_proc_fops = { #endif /* CONFIG_PROC_FS */ -static int clusterip_net_init(struct net *net) -{ - struct clusterip_net *cn = net_generic(net, clusterip_net_id); - - INIT_LIST_HEAD(&cn->configs); - - spin_lock_init(&cn->lock); - -#ifdef CONFIG_PROC_FS - cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); - if (!cn->procdir) { - pr_err("Unable to proc dir entry\n"); - return -ENOMEM; - } -#endif /* CONFIG_PROC_FS */ - - return 0; -} - -static void clusterip_net_exit(struct net *net) -{ -#ifdef CONFIG_PROC_FS - struct clusterip_net *cn = net_generic(net, clusterip_net_id); - proc_remove(cn->procdir); -#endif -} - -static struct pernet_operations clusterip_net_ops = { - .init = clusterip_net_init, - .exit = clusterip_net_exit, - .id = &clusterip_net_id, - .size = sizeof(struct clusterip_net), -}; - static int __init clusterip_tg_init(void) { int ret; - ret = register_pernet_subsys(&clusterip_net_ops); - if (ret < 0) - return ret; - ret = xt_register_target(&clusterip_tg_reg); if (ret < 0) - goto cleanup_subsys; + return ret; ret = nf_register_hook(&cip_arp_ops); if (ret < 0) goto cleanup_target; +#ifdef CONFIG_PROC_FS + clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net); + if (!clusterip_procdir) { + pr_err("Unable to proc dir entry\n"); + ret = -ENOMEM; + goto cleanup_hook; + } +#endif /* CONFIG_PROC_FS */ + pr_info("ClusterIP Version %s loaded successfully\n", CLUSTERIP_VERSION); - return 0; +#ifdef CONFIG_PROC_FS +cleanup_hook: + nf_unregister_hook(&cip_arp_ops); +#endif /* CONFIG_PROC_FS */ cleanup_target: xt_unregister_target(&clusterip_tg_reg); -cleanup_subsys: - unregister_pernet_subsys(&clusterip_net_ops); return ret; } static void __exit clusterip_tg_exit(void) { pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); - +#ifdef CONFIG_PROC_FS + proc_remove(clusterip_procdir); +#endif nf_unregister_hook(&cip_arp_ops); xt_unregister_target(&clusterip_tg_reg); - unregister_pernet_subsys(&clusterip_net_ops); /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */ rcu_barrier_bh(); diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index f13bd91..b6346bf 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -244,7 +244,6 @@ synproxy_recv_client_ack(const struct synproxy_net *snet, this_cpu_inc(snet->stats->cookie_valid); opts->mss = mss; - opts->options |= XT_SYNPROXY_OPT_MSS; if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_check_timestamp_cookie(opts); @@ -298,7 +297,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops, +static unsigned int ipv4_synproxy_hook(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 9cb993c..cbc2215 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -220,7 +220,6 @@ static void ipt_ulog_packet(struct net *net, ub->qlen++; pm = nlmsg_data(nlh); - memset(pm, 0, sizeof(*pm)); /* We might not have a timestamp, get one */ if (skb->tstamp.tv64 == 0) @@ -239,6 +238,8 @@ static void ipt_ulog_packet(struct net *net, } else if (loginfo->prefix[0] != '\0') strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); + else + *(pm->prefix) = '\0'; if (in && in->hard_header_len > 0 && skb->mac_header != skb->network_header && @@ -250,9 +251,13 @@ static void ipt_ulog_packet(struct net *net, if (in) strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); + else + pm->indev_name[0] = '\0'; if (out) strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); + else + pm->outdev_name[0] = '\0'; /* copy_len <= skb->len, so can't fail. */ if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index e08a74a..50af5b4 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -33,21 +33,20 @@ static const struct xt_table packet_filter = { }; static unsigned int -iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_filter_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net; - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; net = dev_net((in != NULL) ? in : out); - return ipt_do_table(skb, ops->hooknum, in, out, - net->ipv4.iptable_filter); + return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter); } static struct nf_hook_ops *filter_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 6a5079c..0d8cd82 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -79,19 +79,19 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) /* The work comes in here from netfilter.c. */ static unsigned int -iptable_mangle_hook(const struct nf_hook_ops *ops, +iptable_mangle_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - if (ops->hooknum == NF_INET_LOCAL_OUT) + if (hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(skb, out); - if (ops->hooknum == NF_INET_POST_ROUTING) - return ipt_do_table(skb, ops->hooknum, in, out, + if (hook == NF_INET_POST_ROUTING) + return ipt_do_table(skb, hook, in, out, dev_net(out)->ipv4.iptable_mangle); /* PREROUTING/INPUT/FORWARD: */ - return ipt_do_table(skb, ops->hooknum, in, out, + return ipt_do_table(skb, hook, in, out, dev_net(in)->ipv4.iptable_mangle); } diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index ee28861..683bfaf 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -61,7 +61,7 @@ static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, } static unsigned int -nf_nat_ipv4_fn(const struct nf_hook_ops *ops, +nf_nat_ipv4_fn(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -71,7 +71,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum); /* We never see fragments: conntrack defrags on pre-routing * and local-out, and nf_nat_out protects post-routing. @@ -108,7 +108,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, case IP_CT_RELATED_REPLY: if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - ops->hooknum)) + hooknum)) return NF_DROP; else return NF_ACCEPT; @@ -121,14 +121,14 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); + ret = nf_nat_rule_find(skb, hooknum, in, out, ct); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) goto oif_changed; } break; @@ -137,11 +137,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) goto oif_changed; } - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + return nf_nat_packet(ct, ctinfo, hooknum, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); @@ -149,7 +149,7 @@ oif_changed: } static unsigned int -nf_nat_ipv4_in(const struct nf_hook_ops *ops, +nf_nat_ipv4_in(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -158,7 +158,7 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; - ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); + ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); @@ -167,7 +167,7 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, } static unsigned int -nf_nat_ipv4_out(const struct nf_hook_ops *ops, +nf_nat_ipv4_out(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -185,7 +185,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); + ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && @@ -207,7 +207,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, } static unsigned int -nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, +nf_nat_ipv4_local_fn(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -223,7 +223,7 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); + ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index b2f7e8f..1f82aea 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -20,20 +20,20 @@ static const struct xt_table packet_raw = { /* The work comes in here from netfilter.c. */ static unsigned int -iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_raw_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net; - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; net = dev_net((in != NULL) ? in : out); - return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw); + return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw); } static struct nf_hook_ops *rawtable_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index c86647e..f867a8d 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -37,22 +37,21 @@ static const struct xt_table security_table = { }; static unsigned int -iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_security_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net; - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* Somebody is playing with raw sockets. */ return NF_ACCEPT; net = dev_net((in != NULL) ? in : out); - return ipt_do_table(skb, ops->hooknum, in, out, - net->ipv4.iptable_security); + return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security); } static struct nf_hook_ops *sectbl_ops __read_mostly; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index ecd8bec..86f5b34 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, return NF_ACCEPT; } -static unsigned int ipv4_helper(const struct nf_hook_ops *ops, +static unsigned int ipv4_helper(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -121,7 +121,7 @@ static unsigned int ipv4_helper(const struct nf_hook_ops *ops, ct, ctinfo); } -static unsigned int ipv4_confirm(const struct nf_hook_ops *ops, +static unsigned int ipv4_confirm(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -147,16 +147,16 @@ out: return nf_conntrack_confirm(skb); } -static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_in(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb); + return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb); } -static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_local(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -166,7 +166,7 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, if (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb); + return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb); } /* Connection tracking may drop packets, but never alters them, so diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 12e13bd..7428155 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -60,7 +60,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, return IP_DEFRAG_CONNTRACK_OUT + zone; } -static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -83,9 +83,7 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, #endif /* Gather fragments. */ if (ip_is_fragment(ip_hdr(skb))) { - enum ip_defrag_users user = - nf_ct_defrag_user(ops->hooknum, skb); - + enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb); if (nf_ct_ipv4_gather_frags(skb, user)) return NF_STOLEN; } diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c deleted file mode 100644 index 3e67ef1..0000000 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/netfilter_arp.h> -#include <net/netfilter/nf_tables.h> - -static struct nft_af_info nft_af_arp __read_mostly = { - .family = NFPROTO_ARP, - .nhooks = NF_ARP_NUMHOOKS, - .owner = THIS_MODULE, -}; - -static int nf_tables_arp_init_net(struct net *net) -{ - net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.arp== NULL) - return -ENOMEM; - - memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp)); - - if (nft_register_afinfo(net, net->nft.arp) < 0) - goto err; - - return 0; -err: - kfree(net->nft.arp); - return -ENOMEM; -} - -static void nf_tables_arp_exit_net(struct net *net) -{ - nft_unregister_afinfo(net->nft.arp); - kfree(net->nft.arp); -} - -static struct pernet_operations nf_tables_arp_net_ops = { - .init = nf_tables_arp_init_net, - .exit = nf_tables_arp_exit_net, -}; - -static unsigned int -nft_do_chain_arp(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, ops, skb, in, out); - - return nft_do_chain_pktinfo(&pkt, ops); -} - -static struct nf_chain_type filter_arp = { - .family = NFPROTO_ARP, - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .hook_mask = (1 << NF_ARP_IN) | - (1 << NF_ARP_OUT) | - (1 << NF_ARP_FORWARD), - .fn = { - [NF_ARP_IN] = nft_do_chain_arp, - [NF_ARP_OUT] = nft_do_chain_arp, - [NF_ARP_FORWARD] = nft_do_chain_arp, - }, -}; - -static int __init nf_tables_arp_init(void) -{ - int ret; - - nft_register_chain_type(&filter_arp); - ret = register_pernet_subsys(&nf_tables_arp_net_ops); - if (ret < 0) - nft_unregister_chain_type(&filter_arp); - - return ret; -} - -static void __exit nf_tables_arp_exit(void) -{ - unregister_pernet_subsys(&nf_tables_arp_net_ops); - nft_unregister_chain_type(&filter_arp); -} - -module_init(nf_tables_arp_init); -module_exit(nf_tables_arp_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */ diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c deleted file mode 100644 index 0f4cbfe..0000000 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/ip.h> -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_tables.h> -#include <net/net_namespace.h> -#include <net/ip.h> -#include <net/netfilter/nf_tables_ipv4.h> - -static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nft_pktinfo pkt; - - if (unlikely(skb->len < sizeof(struct iphdr) || - ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { - if (net_ratelimit()) - pr_info("nf_tables_ipv4: ignoring short SOCK_RAW " - "packet\n"); - return NF_ACCEPT; - } - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - - return nft_do_chain_pktinfo(&pkt, ops); -} - -static struct nft_af_info nft_af_ipv4 __read_mostly = { - .family = NFPROTO_IPV4, - .nhooks = NF_INET_NUMHOOKS, - .owner = THIS_MODULE, - .hooks = { - [NF_INET_LOCAL_OUT] = nft_ipv4_output, - }, -}; - -static int nf_tables_ipv4_init_net(struct net *net) -{ - net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.ipv4 == NULL) - return -ENOMEM; - - memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4)); - - if (nft_register_afinfo(net, net->nft.ipv4) < 0) - goto err; - - return 0; -err: - kfree(net->nft.ipv4); - return -ENOMEM; -} - -static void nf_tables_ipv4_exit_net(struct net *net) -{ - nft_unregister_afinfo(net->nft.ipv4); - kfree(net->nft.ipv4); -} - -static struct pernet_operations nf_tables_ipv4_net_ops = { - .init = nf_tables_ipv4_init_net, - .exit = nf_tables_ipv4_exit_net, -}; - -static unsigned int -nft_do_chain_ipv4(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - - return nft_do_chain_pktinfo(&pkt, ops); -} - -static struct nf_chain_type filter_ipv4 = { - .family = NFPROTO_IPV4, - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .hook_mask = (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_FORWARD) | - (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_POST_ROUTING), - .fn = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, - [NF_INET_LOCAL_OUT] = nft_ipv4_output, - [NF_INET_FORWARD] = nft_do_chain_ipv4, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, - }, -}; - -static int __init nf_tables_ipv4_init(void) -{ - nft_register_chain_type(&filter_ipv4); - return register_pernet_subsys(&nf_tables_ipv4_net_ops); -} - -static void __exit nf_tables_ipv4_exit(void) -{ - unregister_pernet_subsys(&nf_tables_ipv4_net_ops); - nft_unregister_chain_type(&filter_ipv4); -} - -module_init(nf_tables_ipv4_init); -module_exit(nf_tables_ipv4_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(AF_INET); diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c deleted file mode 100644 index cf2c792..0000000 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> - * Copyright (c) 2012 Intel Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables_ipv4.h> -#include <net/netfilter/nf_nat_l3proto.h> -#include <net/ip.h> - -/* - * NAT chains - */ - -static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - struct nf_conn_nat *nat; - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); - struct nft_pktinfo pkt; - unsigned int ret; - - if (ct == NULL || nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); - - nat = nfct_nat(ct); - if (nat == NULL) { - /* Conntrack module was loaded late, can't add extension. */ - if (nf_ct_is_confirmed(ct)) - return NF_ACCEPT; - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) - return NF_ACCEPT; - } - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED + IP_CT_IS_REPLY: - if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - ops->hooknum)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall through */ - case IP_CT_NEW: - if (nf_nat_initialized(ct, maniptype)) - break; - - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - - ret = nft_do_chain_pktinfo(&pkt, ops); - if (ret != NF_ACCEPT) - return ret; - if (!nf_nat_initialized(ct, maniptype)) { - ret = nf_nat_alloc_null_binding(ct, ops->hooknum); - if (ret != NF_ACCEPT) - return ret; - } - default: - break; - } - - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); -} - -static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - __be32 daddr = ip_hdr(skb)->daddr; - unsigned int ret; - - ret = nf_nat_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - ip_hdr(skb)->daddr != daddr) { - skb_dst_drop(skb); - } - return ret; -} - -static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - enum ip_conntrack_info ctinfo __maybe_unused; - const struct nf_conn *ct __maybe_unused; - unsigned int ret; - - ret = nf_nat_fn(ops, skb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.src.u3.ip != - ct->tuplehash[!dir].tuple.dst.u3.ip || - ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all) - return nf_xfrm_me_harder(skb, AF_INET) == 0 ? - ret : NF_DROP; - } -#endif - return ret; -} - -static unsigned int nf_nat_output(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - enum ip_conntrack_info ctinfo; - const struct nf_conn *ct; - unsigned int ret; - - ret = nf_nat_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.dst.u3.ip != - ct->tuplehash[!dir].tuple.src.u3.ip) { - if (ip_route_me_harder(skb, RTN_UNSPEC)) - ret = NF_DROP; - } -#ifdef CONFIG_XFRM - else if (ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) - if (nf_xfrm_me_harder(skb, AF_INET)) - ret = NF_DROP; -#endif - } - return ret; -} - -static struct nf_chain_type nft_chain_nat_ipv4 = { - .family = NFPROTO_IPV4, - .name = "nat", - .type = NFT_CHAIN_T_NAT, - .hook_mask = (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_POST_ROUTING) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_LOCAL_IN), - .fn = { - [NF_INET_PRE_ROUTING] = nf_nat_prerouting, - [NF_INET_POST_ROUTING] = nf_nat_postrouting, - [NF_INET_LOCAL_OUT] = nf_nat_output, - [NF_INET_LOCAL_IN] = nf_nat_fn, - }, - .me = THIS_MODULE, -}; - -static int __init nft_chain_nat_init(void) -{ - int err; - - err = nft_register_chain_type(&nft_chain_nat_ipv4); - if (err < 0) - return err; - - return 0; -} - -static void __exit nft_chain_nat_exit(void) -{ - nft_unregister_chain_type(&nft_chain_nat_ipv4); -} - -module_init(nft_chain_nat_init); -module_exit(nft_chain_nat_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat"); diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c deleted file mode 100644 index 4e6bf9a..0000000 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/skbuff.h> -#include <linux/netlink.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables_ipv4.h> -#include <net/route.h> -#include <net/ip.h> - -static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - unsigned int ret; - struct nft_pktinfo pkt; - u32 mark; - __be32 saddr, daddr; - u_int8_t tos; - const struct iphdr *iph; - - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - - mark = skb->mark; - iph = ip_hdr(skb); - saddr = iph->saddr; - daddr = iph->daddr; - tos = iph->tos; - - ret = nft_do_chain_pktinfo(&pkt, ops); - if (ret != NF_DROP && ret != NF_QUEUE) { - iph = ip_hdr(skb); - - if (iph->saddr != saddr || - iph->daddr != daddr || - skb->mark != mark || - iph->tos != tos) - if (ip_route_me_harder(skb, RTN_UNSPEC)) - ret = NF_DROP; - } - return ret; -} - -static struct nf_chain_type nft_chain_route_ipv4 = { - .family = NFPROTO_IPV4, - .name = "route", - .type = NFT_CHAIN_T_ROUTE, - .hook_mask = (1 << NF_INET_LOCAL_OUT), - .fn = { - [NF_INET_LOCAL_OUT] = nf_route_table_hook, - }, - .me = THIS_MODULE, -}; - -static int __init nft_chain_route_init(void) -{ - return nft_register_chain_type(&nft_chain_route_ipv4); -} - -static void __exit nft_chain_route_exit(void) -{ - nft_unregister_chain_type(&nft_chain_route_ipv4); -} - -module_init(nft_chain_route_init); -module_exit(nft_chain_route_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(AF_INET, "route"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c deleted file mode 100644 index fff5ba1..0000000 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/netlink.h> -#include <linux/netfilter.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> -#include <net/icmp.h> - -struct nft_reject { - enum nft_reject_types type:8; - u8 icmp_code; -}; - -static void nft_reject_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) -{ - struct nft_reject *priv = nft_expr_priv(expr); - - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - icmp_send(pkt->skb, ICMP_DEST_UNREACH, priv->icmp_code, 0); - break; - case NFT_REJECT_TCP_RST: - break; - } - - data[NFT_REG_VERDICT].verdict = NF_DROP; -} - -static const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { - [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, - [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, -}; - -static int nft_reject_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_reject *priv = nft_expr_priv(expr); - - if (tb[NFTA_REJECT_TYPE] == NULL) - return -EINVAL; - - priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - if (tb[NFTA_REJECT_ICMP_CODE] == NULL) - return -EINVAL; - priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); - case NFT_REJECT_TCP_RST: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) -{ - const struct nft_reject *priv = nft_expr_priv(expr); - - if (nla_put_be32(skb, NFTA_REJECT_TYPE, priv->type)) - goto nla_put_failure; - - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) - goto nla_put_failure; - break; - } - - return 0; - -nla_put_failure: - return -1; -} - -static struct nft_expr_type nft_reject_type; -static const struct nft_expr_ops nft_reject_ops = { - .type = &nft_reject_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), - .eval = nft_reject_eval, - .init = nft_reject_init, - .dump = nft_reject_dump, -}; - -static struct nft_expr_type nft_reject_type __read_mostly = { - .name = "reject", - .ops = &nft_reject_ops, - .policy = nft_reject_policy, - .maxattr = NFTA_REJECT_MAX, - .owner = THIS_MODULE, -}; - -static int __init nft_reject_module_init(void) -{ - return nft_register_expr(&nft_reject_type); -} - -static void __exit nft_reject_module_exit(void) -{ - nft_unregister_expr(&nft_reject_type); -} - -module_init(nft_reject_module_init); -module_exit(nft_reject_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_EXPR("reject"); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 242e7f4..d7d9882 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -202,14 +202,15 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6) && sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk, (int) isk->inet_num, - &sk->sk_v6_rcv_saddr, + &inet6_sk(sk)->rcv_saddr, sk->sk_bound_dev_if); - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && - !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, + if (!ipv6_addr_any(&np->rcv_saddr) && + !ipv6_addr_equal(&np->rcv_saddr, &ipv6_hdr(skb)->daddr)) continue; #endif @@ -236,11 +237,11 @@ static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&sysctl_local_ports.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + } while (read_seqretry(&sysctl_local_ports.lock, seq)); } @@ -361,7 +362,7 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr) } else if (saddr->sa_family == AF_INET6) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr; struct ipv6_pinfo *np = inet6_sk(sk); - sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr; + np->rcv_saddr = np->saddr = addr->sin6_addr; #endif } } @@ -375,7 +376,7 @@ static void ping_clear_saddr(struct sock *sk, int dif) #if IS_ENABLED(CONFIG_IPV6) } else if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr)); + memset(&np->rcv_saddr, 0, sizeof(np->rcv_saddr)); memset(&np->saddr, 0, sizeof(np->saddr)); #endif } @@ -415,12 +416,10 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) (int)sk->sk_bound_dev_if); err = 0; - if (sk->sk_family == AF_INET && isk->inet_rcv_saddr) + if ((sk->sk_family == AF_INET && isk->inet_rcv_saddr) || + (sk->sk_family == AF_INET6 && + !ipv6_addr_any(&inet6_sk(sk)->rcv_saddr))) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr)) - sk->sk_userlocks |= SOCK_BINDADDR_LOCK; -#endif if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; @@ -430,7 +429,7 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) - memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr)); + memset(&inet6_sk(sk)->daddr, 0, sizeof(inet6_sk(sk)->daddr)); #endif sk_dst_reset(sk); @@ -714,8 +713,6 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.opt = NULL; ipc.oif = sk->sk_bound_dev_if; ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; sock_tx_timestamp(sk, &ipc.tx_flags); @@ -747,7 +744,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return -EINVAL; faddr = ipc.opt->opt.faddr; } - tos = get_rttos(&ipc, inet); + tos = RT_TOS(inet->tos); if (sock_flag(sk, SOCK_LOCALROUTE) || (msg->msg_flags & MSG_DONTROUTE) || (ipc.opt && ipc.opt->opt.is_strictroute)) { @@ -772,7 +769,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) - IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); goto out; } @@ -830,6 +827,8 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, { struct inet_sock *isk = inet_sk(sk); int family = sk->sk_family; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; struct sk_buff *skb; int copied, err; @@ -839,13 +838,19 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (flags & MSG_OOB) goto out; + if (addr_len) { + if (family == AF_INET) + *addr_len = sizeof(*sin); + else if (family == AF_INET6 && addr_len) + *addr_len = sizeof(*sin6); + } + if (flags & MSG_ERRQUEUE) { if (family == AF_INET) { - return ip_recv_error(sk, msg, len, addr_len); + return ip_recv_error(sk, msg, len); #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { - return pingv6_ops.ipv6_recv_error(sk, msg, len, - addr_len); + return pingv6_ops.ipv6_recv_error(sk, msg, len); #endif } } @@ -869,15 +874,11 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Copy the address and add cmsg data. */ if (family == AF_INET) { - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; - - if (sin) { - sin->sin_family = AF_INET; - sin->sin_port = 0 /* skb->h.uh->source */; - sin->sin_addr.s_addr = ip_hdr(skb)->saddr; - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - *addr_len = sizeof(*sin); - } + sin = (struct sockaddr_in *) msg->msg_name; + sin->sin_family = AF_INET; + sin->sin_port = 0 /* skb->h.uh->source */; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); if (isk->cmsg_flags) ip_cmsg_recv(msg, skb); @@ -886,21 +887,17 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else if (family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6hdr *ip6 = ipv6_hdr(skb); - struct sockaddr_in6 *sin6 = - (struct sockaddr_in6 *)msg->msg_name; - - if (sin6) { - sin6->sin6_family = AF_INET6; - sin6->sin6_port = 0; - sin6->sin6_addr = ip6->saddr; - sin6->sin6_flowinfo = 0; - if (np->sndflow) - sin6->sin6_flowinfo = ip6_flowinfo(ip6); - sin6->sin6_scope_id = - ipv6_iface_scope_id(&sin6->sin6_addr, - IP6CB(skb)->iif); - *addr_len = sizeof(*sin6); - } + sin6 = (struct sockaddr_in6 *) msg->msg_name; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_addr = ip6->saddr; + + sin6->sin6_flowinfo = 0; + if (np->sndflow) + sin6->sin6_flowinfo = ip6_flowinfo(ip6); + + sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, + IP6CB(skb)->iif); if (inet6_sk(sk)->rxopt.all) pingv6_ops.ip6_datagram_recv_ctl(sk, msg, skb); @@ -1076,7 +1073,7 @@ void ping_seq_stop(struct seq_file *seq, void *v) EXPORT_SYMBOL_GPL(ping_seq_stop); static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, - int bucket) + int bucket, int *len) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; @@ -1085,7 +1082,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), @@ -1093,22 +1090,23 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + atomic_read(&sp->sk_drops), len); } static int ping_v4_seq_show(struct seq_file *seq, void *v) { - seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) - seq_puts(seq, " sl local_address rem_address st tx_queue " + seq_printf(seq, "%-127s\n", + " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct ping_iter_state *state = seq->private; + int len; - ping_v4_format_sock(v, seq, state->bucket); + ping_v4_format_sock(v, seq, state->bucket, &len); + seq_printf(seq, "%*s\n", 127 - len, ""); } - seq_pad(seq, '\n'); return 0; } diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 46d6a1c..ce84846 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -31,6 +31,10 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; +/* + * Add a protocol handler to the hash tables + */ + int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { if (!prot->netns_ok) { @@ -51,6 +55,10 @@ int inet_add_offload(const struct net_offload *prot, unsigned char protocol) } EXPORT_SYMBOL(inet_add_offload); +/* + * Remove a protocol from the hash tables. + */ + int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) { int ret; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 23c3e5b..193db03 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -299,7 +299,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { /* Charge it to the socket. */ - ipv4_pktinfo_prepare(sk, skb); + ipv4_pktinfo_prepare(skb); if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; @@ -519,8 +519,6 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { @@ -560,7 +558,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, daddr = ipc.opt->opt.faddr; } } - tos = get_rtconn_flags(&ipc, sk); + tos = RT_CONN_FLAGS(sk); if (msg->msg_flags & MSG_DONTROUTE) tos |= RTO_ONLINK; @@ -696,8 +694,11 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (flags & MSG_OOB) goto out; + if (addr_len) + *addr_len = sizeof(*sin); + if (flags & MSG_ERRQUEUE) { - err = ip_recv_error(sk, msg, len, addr_len); + err = ip_recv_error(sk, msg, len); goto out; } @@ -723,7 +724,6 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); - *addr_len = sizeof(*sin); } if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1417d01..fcbd95f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -299,7 +299,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", dst_entries_get_slow(&ipv4_dst_ops), - 0, /* st->in_hit */ + st->in_hit, st->in_slow_tot, st->in_slow_mc, st->in_no_route, @@ -307,16 +307,16 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) st->in_martian_dst, st->in_martian_src, - 0, /* st->out_hit */ + st->out_hit, st->out_slow_tot, st->out_slow_mc, - 0, /* st->gc_total */ - 0, /* st->gc_ignored */ - 0, /* st->gc_goal_miss */ - 0, /* st->gc_dst_overflow */ - 0, /* st->in_hlist_search */ - 0 /* st->out_hlist_search */ + st->gc_total, + st->gc_ignored, + st->gc_goal_miss, + st->gc_dst_overflow, + st->in_hlist_search, + st->out_hlist_search ); return 0; } @@ -1044,10 +1044,6 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) bool new = false; bh_lock_sock(sk); - - if (!ip_sk_accept_pmtu(sk)) - goto out; - rt = (struct rtable *) __sk_dst_get(sk); if (sock_owned_by_user(sk) || !rt) { @@ -1784,12 +1780,8 @@ local_input: rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } - if (do_cache) { - if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) { - rth->dst.flags |= DST_NOCACHE; - rt_add_uncached_list(rth); - } - } + if (do_cache) + rt_cache_route(&FIB_RES_NH(res), rth); skb_dst_set(skb, &rth->dst); err = 0; goto out; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b95331e..14a15c4 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -25,7 +25,15 @@ extern int sysctl_tcp_syncookies; -static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; +__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; +EXPORT_SYMBOL(syncookie_secret); + +static __init int init_syncookies(void) +{ + get_random_bytes(syncookie_secret, sizeof(syncookie_secret)); + return 0; +} +__initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -36,11 +44,8 @@ static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp; - - net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); + __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); - tmp = __get_cpu_var(ipv4_cookie_scratch); memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; tmp[1] = (__force u32)daddr; @@ -84,7 +89,8 @@ __u32 cookie_init_timestamp(struct request_sock *req) static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, - __be16 dport, __u32 sseq, __u32 data) + __be16 dport, __u32 sseq, __u32 count, + __u32 data) { /* * Compute the secure sequence number. @@ -96,7 +102,7 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, * As an extra hack, we add a small "data" value that encodes the * MSS into the second hash value. */ - u32 count = tcp_cookie_time(); + return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) @@ -108,21 +114,22 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, * If the syncookie is bad, the data returned will be out of * range. This must be checked by the caller. * - * The count value used to generate the cookie must be less than - * MAX_SYNCOOKIE_AGE minutes in the past. - * The return value (__u32)-1 if this test fails. + * The count value used to generate the cookie must be within + * "maxdiff" if the current (passed-in) "count". The return value + * is (__u32)-1 if this test fails. */ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, - __be16 sport, __be16 dport, __u32 sseq) + __be16 sport, __be16 dport, __u32 sseq, + __u32 count, __u32 maxdiff) { - u32 diff, count = tcp_cookie_time(); + __u32 diff; /* Strip away the layers from the cookie */ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); - if (diff >= MAX_SYNCOOKIE_AGE) + if (diff >= maxdiff) return (__u32)-1; return (cookie - @@ -131,22 +138,22 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, } /* - * MSS Values are chosen based on the 2011 paper - * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson. - * Values .. - * .. lower than 536 are rare (< 0.2%) - * .. between 537 and 1299 account for less than < 1.5% of observed values - * .. in the 1300-1349 range account for about 15 to 20% of observed mss values - * .. exceeding 1460 are very rare (< 0.04%) + * MSS Values are taken from the 2009 paper + * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson: + * - values 1440 to 1460 accounted for 80% of observed mss values + * - values outside the 536-1460 range are rare (<0.2%). * - * 1460 is the single most frequently announced mss value (30 to 46% depending - * on monitor location). Table must be sorted. + * Table must be sorted. */ static __u16 const msstab[] = { + 64, + 512, 536, - 1300, - 1440, /* 1440, 1452: PPPoE */ + 1024, + 1440, 1460, + 4312, + 8960, }; /* @@ -166,7 +173,7 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), - mssind); + jiffies / (HZ * 60), mssind); } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); @@ -182,6 +189,13 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) } /* + * This (misnamed) value is the age of syncookie which is permitted. + * Its ideal value should be dependent on TCP_TIMEOUT_INIT and + * sysctl_tcp_retries1. It's a rather complicated formula (exponential + * backoff) to compute at runtime so it's currently hardcoded here. + */ +#define COUNTER_TRIES 4 +/* * Check if a ack sequence number is a valid syncookie. * Return the decoded mss if it is, or 0 if not. */ @@ -190,7 +204,9 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, { __u32 seq = ntohl(th->seq) - 1; __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, - th->source, th->dest, seq); + th->source, th->dest, seq, + jiffies / (HZ * 60), + COUNTER_TRIES); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } @@ -299,10 +315,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; req->mss = mss; - ireq->ir_num = ntohs(th->dest); - ireq->ir_rmt_port = th->source; - ireq->ir_loc_addr = ip_hdr(skb)->daddr; - ireq->ir_rmt_addr = ip_hdr(skb)->saddr; + ireq->loc_port = th->dest; + ireq->rmt_port = th->source; + ireq->loc_addr = ip_hdr(skb)->daddr; + ireq->rmt_addr = ip_hdr(skb)->saddr; ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; @@ -342,8 +358,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), - (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, th->source, th->dest); + (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, + ireq->loc_addr, th->source, th->dest); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3d69ec8..540279f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -43,12 +43,12 @@ static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; /* Update system visible IP port range */ -static void set_local_port_range(struct net *net, int range[2]) +static void set_local_port_range(int range[2]) { - write_seqlock(&net->ipv4.sysctl_local_ports.lock); - net->ipv4.sysctl_local_ports.range[0] = range[0]; - net->ipv4.sysctl_local_ports.range[1] = range[1]; - write_sequnlock(&net->ipv4.sysctl_local_ports.lock); + write_seqlock(&sysctl_local_ports.lock); + sysctl_local_ports.range[0] = range[0]; + sysctl_local_ports.range[1] = range[1]; + write_sequnlock(&sysctl_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -56,8 +56,6 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = - container_of(table->data, struct net, ipv4.sysctl_local_ports.range); int ret; int range[2]; struct ctl_table tmp = { @@ -68,15 +66,14 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, .extra2 = &ip_local_port_range_max, }; - inet_get_local_port_range(net, &range[0], &range[1]); - + inet_get_local_port_range(range, range + 1); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { if (range[1] < range[0]) ret = -EINVAL; else - set_local_port_range(net, range); + set_local_port_range(range); } return ret; @@ -86,27 +83,23 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) { kgid_t *data = table->data; - struct net *net = - container_of(table->data, struct net, ipv4.sysctl_ping_group_range); unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&sysctl_local_ports.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + } while (read_seqretry(&sysctl_local_ports.lock, seq)); } /* Update system visible IP port range */ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high) { kgid_t *data = table->data; - struct net *net = - container_of(table->data, struct net, ipv4.sysctl_ping_group_range); - write_seqlock(&net->ipv4.sysctl_local_ports.lock); + write_seqlock(&sysctl_local_ports.lock); data[0] = low; data[1] = high; - write_sequnlock(&net->ipv4.sysctl_local_ports.lock); + write_sequnlock(&sysctl_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -200,6 +193,49 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } +static int ipv4_tcp_mem(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned long vec[3]; + struct net *net = current->nsproxy->net_ns; +#ifdef CONFIG_MEMCG_KMEM + struct mem_cgroup *memcg; +#endif + + struct ctl_table tmp = { + .data = &vec, + .maxlen = sizeof(vec), + .mode = ctl->mode, + }; + + if (!write) { + ctl->data = &net->ipv4.sysctl_tcp_mem; + return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos); + } + + ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos); + if (ret) + return ret; + +#ifdef CONFIG_MEMCG_KMEM + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + + tcp_prot_mem(memcg, vec[0], 0); + tcp_prot_mem(memcg, vec[1], 1); + tcp_prot_mem(memcg, vec[2], 2); + rcu_read_unlock(); +#endif + + net->ipv4.sysctl_tcp_mem[0] = vec[0]; + net->ipv4.sysctl_tcp_mem[1] = vec[1]; + net->ipv4.sysctl_tcp_mem[2] = vec[2]; + + return 0; +} + static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -231,11 +267,6 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - /* Generate a dummy secret but don't publish it. This - * is needed so we don't regenerate a new key on the - * first invocation of tcp_fastopen_cookie_gen - */ - tcp_fastopen_init_key_once(false); tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); } @@ -444,6 +475,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "ip_local_port_range", + .data = &sysctl_local_ports.range, + .maxlen = sizeof(sysctl_local_ports.range), + .mode = 0644, + .proc_handler = ipv4_local_port_range, + }, + { .procname = "ip_local_reserved_ports", .data = NULL, /* initialized in sysctl_ipv4_init */ .maxlen = 65536, @@ -514,13 +552,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_mem", - .maxlen = sizeof(sysctl_tcp_mem), - .data = &sysctl_tcp_mem, - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { .procname = "tcp_wmem", .data = &sysctl_tcp_wmem, .maxlen = sizeof(sysctl_tcp_wmem), @@ -701,6 +732,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_allowed_congestion_control, }, { + .procname = "tcp_max_ssthresh", + .data = &sysctl_tcp_max_ssthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "tcp_thin_linear_timeouts", .data = &sysctl_tcp_thin_linear_timeouts, .maxlen = sizeof(int), @@ -816,11 +854,10 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_local_port_range", - .maxlen = sizeof(init_net.ipv4.sysctl_local_ports.range), - .data = &init_net.ipv4.sysctl_local_ports.range, + .procname = "tcp_mem", + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem), .mode = 0644, - .proc_handler = ipv4_local_port_range, + .proc_handler = ipv4_tcp_mem, }, { } }; @@ -831,15 +868,30 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) table = ipv4_net_table; if (!net_eq(net, &init_net)) { - int i; - table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); if (table == NULL) goto err_alloc; - /* Update the variables to point into the current struct net */ - for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) - table[i].data += (void *)net - (void *)&init_net; + table[0].data = + &net->ipv4.sysctl_icmp_echo_ignore_all; + table[1].data = + &net->ipv4.sysctl_icmp_echo_ignore_broadcasts; + table[2].data = + &net->ipv4.sysctl_icmp_ignore_bogus_error_responses; + table[3].data = + &net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr; + table[4].data = + &net->ipv4.sysctl_icmp_ratelimit; + table[5].data = + &net->ipv4.sysctl_icmp_ratemask; + table[6].data = + &net->ipv4.sysctl_ping_group_range; + table[7].data = + &net->ipv4.sysctl_tcp_ecn; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; } /* @@ -849,12 +901,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1); net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0); - /* - * Set defaults for local port range - */ - seqlock_init(&net->ipv4.sysctl_local_ports.lock); - net->ipv4.sysctl_local_ports.range[0] = 32768; - net->ipv4.sysctl_local_ports.range[1] = 61000; + tcp_init_mem(net); net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); if (net->ipv4.ipv4_hdr == NULL) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c4638e6..6e5617b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -288,11 +288,9 @@ int sysctl_tcp_min_tso_segs __read_mostly = 2; struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); -long sysctl_tcp_mem[3] __read_mostly; int sysctl_tcp_wmem[3] __read_mostly; int sysctl_tcp_rmem[3] __read_mostly; -EXPORT_SYMBOL(sysctl_tcp_mem); EXPORT_SYMBOL(sysctl_tcp_rmem); EXPORT_SYMBOL(sysctl_tcp_wmem); @@ -808,6 +806,12 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, xmit_size_goal = min_t(u32, gso_size, sk->sk_gso_max_size - 1 - hlen); + /* TSQ : try to have at least two segments in flight + * (one in NIC TX ring, another in Qdisc) + */ + xmit_size_goal = min_t(u32, xmit_size_goal, + sysctl_tcp_limit_output_bytes >> 1); + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); /* We try hard to avoid divides here */ @@ -1425,7 +1429,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait) do { if (dma_async_is_tx_complete(tp->ucopy.dma_chan, last_issued, &done, - &used) == DMA_COMPLETE) { + &used) == DMA_SUCCESS) { /* Safe to free early-copied skbs now */ __skb_queue_purge(&sk->sk_async_wait_queue); break; @@ -1433,7 +1437,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait) struct sk_buff *skb; while ((skb = skb_peek(&sk->sk_async_wait_queue)) && (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_COMPLETE)) { + used) == DMA_SUCCESS)) { __skb_dequeue(&sk->sk_async_wait_queue); kfree_skb(skb); } @@ -3093,13 +3097,13 @@ static int __init set_thash_entries(char *str) } __setup("thash_entries=", set_thash_entries); -static void tcp_init_mem(void) +void tcp_init_mem(struct net *net) { unsigned long limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); - sysctl_tcp_mem[0] = limit / 4 * 3; - sysctl_tcp_mem[1] = limit; - sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; + net->ipv4.sysctl_tcp_mem[1] = limit; + net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; } void __init tcp_init(void) @@ -3133,9 +3137,10 @@ void __init tcp_init(void) &tcp_hashinfo.ehash_mask, 0, thash_entries ? 0 : 512 * 1024); - for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) + for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); - + INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); + } if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = @@ -3161,7 +3166,7 @@ void __init tcp_init(void) sysctl_tcp_max_orphans = cnt / 2; sysctl_max_syn_backlog = max(128, cnt / 256); - tcp_init_mem(); + tcp_init_mem(&init_net); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); max_wshare = min(4UL*1024*1024, limit); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 821846f..f45e1c2 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -140,8 +140,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -150,7 +149,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, return; if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp, acked); + tcp_slow_start(tp); else { bictcp_update(ca, tp->snd_cwnd); tcp_cong_avoid_ai(tp, ca->cnt); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index ad37bf1..019c238 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -15,6 +15,8 @@ #include <linux/gfp.h> #include <net/tcp.h> +int sysctl_tcp_max_ssthresh = 0; + static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); @@ -297,24 +299,35 @@ bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) } EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); -/* Slow start is used when congestion window is no greater than the slow start - * threshold. We base on RFC2581 and also handle stretch ACKs properly. - * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but - * something better;) a packet is only considered (s)acked in its entirety to - * defend the ACK attacks described in the RFC. Slow start processes a stretch - * ACK of degree N as if N acks of degree 1 are received back to back except - * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and - * returns the leftover acks to adjust cwnd in congestion avoidance mode. +/* + * Slow start is used when congestion window is less than slow start + * threshold. This version implements the basic RFC2581 version + * and optionally supports: + * RFC3742 Limited Slow Start - growth limited to max_ssthresh + * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged */ -int tcp_slow_start(struct tcp_sock *tp, u32 acked) +void tcp_slow_start(struct tcp_sock *tp) { - u32 cwnd = tp->snd_cwnd + acked; + int cnt; /* increase in packets */ + unsigned int delta = 0; + u32 snd_cwnd = tp->snd_cwnd; + + if (unlikely(!snd_cwnd)) { + pr_err_once("snd_cwnd is nul, please report this bug.\n"); + snd_cwnd = 1U; + } - if (cwnd > tp->snd_ssthresh) - cwnd = tp->snd_ssthresh + 1; - acked -= cwnd - tp->snd_cwnd; - tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); - return acked; + if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh) + cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */ + else + cnt = snd_cwnd; /* exponential increase */ + + tp->snd_cwnd_cnt += cnt; + while (tp->snd_cwnd_cnt >= snd_cwnd) { + tp->snd_cwnd_cnt -= snd_cwnd; + delta++; + } + tp->snd_cwnd = min(snd_cwnd + delta, tp->snd_cwnd_clamp); } EXPORT_SYMBOL_GPL(tcp_slow_start); @@ -338,7 +351,7 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); @@ -347,7 +360,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) /* In "safe" area, increase. */ if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp, acked); + tcp_slow_start(tp); /* In dangerous area, increase slowly. */ else tcp_cong_avoid_ai(tp, tp->snd_cwnd); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 828e4c3..b6ae92a 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -304,8 +304,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -316,7 +315,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, if (tp->snd_cwnd <= tp->snd_ssthresh) { if (hystart && after(ack, ca->end_seq)) bictcp_hystart_reset(sk); - tcp_slow_start(tp, acked); + tcp_slow_start(tp); } else { bictcp_update(ca, tp->snd_cwnd); tcp_cong_avoid_ai(tp, ca->cnt); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f195d93..ab7bd35 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -8,26 +8,12 @@ #include <net/inetpeer.h> #include <net/tcp.h> -int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE; +int sysctl_tcp_fastopen __read_mostly; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); -void tcp_fastopen_init_key_once(bool publish) -{ - static u8 key[TCP_FASTOPEN_KEY_LENGTH]; - - /* tcp_fastopen_reset_cipher publishes the new context - * atomically, so we allow this race happening here. - * - * All call sites of tcp_fastopen_cookie_gen also check - * for a valid cookie, so this is an acceptable risk. - */ - if (net_get_random_once(key, sizeof(key)) && publish) - tcp_fastopen_reset_cipher(key, sizeof(key)); -} - static void tcp_fastopen_ctx_free(struct rcu_head *head) { struct tcp_fastopen_context *ctx = @@ -84,8 +70,6 @@ void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, __be32 path[4] = { src, dst, 0, 0 }; struct tcp_fastopen_context *ctx; - tcp_fastopen_init_key_once(true); - rcu_read_lock(); ctx = rcu_dereference(tcp_fastopen_ctx); if (ctx) { @@ -94,3 +78,14 @@ void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, } rcu_read_unlock(); } + +static int __init tcp_fastopen_init(void) +{ + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + + get_random_bytes(key, sizeof(key)); + tcp_fastopen_reset_cipher(key, sizeof(key)); + return 0; +} + +late_initcall(tcp_fastopen_init); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 8ed9305..30f27f6 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -109,7 +109,7 @@ static void hstcp_init(struct sock *sk) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); @@ -118,7 +118,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) return; if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp, acked); + tcp_slow_start(tp); else { /* Update AIMD parameters. * diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 4a194ac..c1a8175 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -227,7 +227,7 @@ static u32 htcp_recalc_ssthresh(struct sock *sk) return max((tp->snd_cwnd * ca->beta) >> 7, 2U); } -static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); @@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) return; if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp, acked); + tcp_slow_start(tp); else { /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 478fe82..57bdd17 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -85,8 +85,7 @@ static inline u32 hybla_fraction(u32 odds) * o Give cwnd a new value based on the model proposed * o remember increments <1 */ -static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct hybla *ca = inet_csk_ca(sk); @@ -103,7 +102,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, return; if (!ca->hybla_en) { - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, in_flight); return; } diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 8a52099..834857f 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -256,8 +256,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state) /* * Increase window in response to successful acknowledgment. */ -static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); @@ -271,7 +270,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked, /* In slow start */ if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp, acked); + tcp_slow_start(tp); else { u32 delta; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c53b7f3..a16b01b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -267,31 +267,11 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr * 1. Tuning sk->sk_sndbuf, when connection enters established state. */ -static void tcp_sndbuf_expand(struct sock *sk) +static void tcp_fixup_sndbuf(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); - int sndmem, per_mss; - u32 nr_segs; - - /* Worst case is non GSO/TSO : each frame consumes one skb - * and skb->head is kmalloced using power of two area of memory - */ - per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + - MAX_TCP_HEADER + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - - per_mss = roundup_pow_of_two(per_mss) + - SKB_DATA_ALIGN(sizeof(struct sk_buff)); - - nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); - nr_segs = max_t(u32, nr_segs, tp->reordering + 1); - - /* Fast Recovery (RFC 5681 3.2) : - * Cubic needs 1.7 factor, rounded to 2 to include - * extra cushion (application might react slowly to POLLOUT) - */ - sndmem = 2 * nr_segs * per_mss; + int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER); + sndmem *= TCP_INIT_CWND; if (sk->sk_sndbuf < sndmem) sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); } @@ -375,12 +355,6 @@ static void tcp_fixup_rcvbuf(struct sock *sk) rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * tcp_default_init_rwnd(mss); - /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency - * Allow enough cushion so that sender is not limited by our window - */ - if (sysctl_tcp_moderate_rcvbuf) - rcvmem <<= 2; - if (sk->sk_rcvbuf < rcvmem) sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); } @@ -396,11 +370,9 @@ void tcp_init_buffer_space(struct sock *sk) if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) tcp_fixup_rcvbuf(sk); if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) - tcp_sndbuf_expand(sk); + tcp_fixup_sndbuf(sk); tp->rcvq_space.space = tp->rcv_wnd; - tp->rcvq_space.time = tcp_time_stamp; - tp->rcvq_space.seq = tp->copied_seq; maxwin = tcp_full_space(sk); @@ -540,62 +512,48 @@ void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int time; - int copied; + int space; + + if (tp->rcvq_space.time == 0) + goto new_measure; time = tcp_time_stamp - tp->rcvq_space.time; if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) return; - /* Number of bytes copied to user in last RTT */ - copied = tp->copied_seq - tp->rcvq_space.seq; - if (copied <= tp->rcvq_space.space) - goto new_measure; - - /* A bit of theory : - * copied = bytes received in previous RTT, our base window - * To cope with packet losses, we need a 2x factor - * To cope with slow start, and sender growing its cwin by 100 % - * every RTT, we need a 4x factor, because the ACK we are sending - * now is for the next RTT, not the current one : - * <prev RTT . ><current RTT .. ><next RTT .... > - */ - - if (sysctl_tcp_moderate_rcvbuf && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvwin, rcvmem, rcvbuf; + space = 2 * (tp->copied_seq - tp->rcvq_space.seq); - /* minimal window to cope with packet losses, assuming - * steady state. Add some cushion because of small variations. - */ - rcvwin = (copied << 1) + 16 * tp->advmss; + space = max(tp->rcvq_space.space, space); - /* If rate increased by 25%, - * assume slow start, rcvwin = 3 * copied - * If rate increased by 50%, - * assume sender can use 2x growth, rcvwin = 4 * copied - */ - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) - rcvwin <<= 1; - else - rcvwin += (rcvwin >> 1); - } + if (tp->rcvq_space.space != space) { + int rcvmem; - rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < tp->advmss) - rcvmem += 128; + tp->rcvq_space.space = space; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); - if (rcvbuf > sk->sk_rcvbuf) { - sk->sk_rcvbuf = rcvbuf; + if (sysctl_tcp_moderate_rcvbuf && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int new_clamp = space; - /* Make the window clamp follow along. */ - tp->window_clamp = rcvwin; + /* Receive space grows, normalize in order to + * take into account packet headers and sk_buff + * structure overhead. + */ + space /= tp->advmss; + if (!space) + space = 1; + rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); + while (tcp_win_from_space(rcvmem) < tp->advmss) + rcvmem += 128; + space *= rcvmem; + space = min(space, sysctl_tcp_rmem[2]); + if (space > sk->sk_rcvbuf) { + sk->sk_rcvbuf = space; + + /* Make the window clamp follow along. */ + tp->window_clamp = new_clamp; + } } } - tp->rcvq_space.space = copied; new_measure: tp->rcvq_space.seq = tp->copied_seq; @@ -755,12 +713,7 @@ static void tcp_update_pacing_rate(struct sock *sk) if (tp->srtt > 8 + 2) do_div(rate, tp->srtt); - /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate - * without any lock. We want to make sure compiler wont store - * intermediate values in this location. - */ - ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate, - sk->sk_max_pacing_rate); + sk->sk_pacing_rate = min_t(u64, rate, ~0U); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -2903,8 +2856,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. */ - if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && - flag & FLAG_ACKED) + if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; if (seq_rtt < 0) @@ -2919,25 +2871,20 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, } /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ -static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) +static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { struct tcp_sock *tp = tcp_sk(sk); s32 seq_rtt = -1; - if (synack_stamp && !tp->total_retrans) - seq_rtt = tcp_time_stamp - synack_stamp; - - /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets - * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() - */ - if (!tp->srtt) - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); + if (tp->lsndtime && !tp->total_retrans) + seq_rtt = tcp_time_stamp - tp->lsndtime; + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); } -static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { const struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight); + icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } @@ -3026,7 +2973,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; u32 now = tcp_time_stamp; - bool fully_acked = true; + int fully_acked = true; int flag = 0; u32 pkts_acked = 0; u32 reord = tp->packets_out; @@ -3034,7 +2981,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, s32 seq_rtt = -1; s32 ca_seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); - bool rtt_update; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); @@ -3111,13 +3057,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt); + if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) || + (flag & FLAG_ACKED)) + tcp_rearm_rto(sk); if (flag & FLAG_ACKED) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - tcp_rearm_rto(sk); if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); @@ -3156,13 +3103,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ca_ops->pkts_acked(sk, pkts_acked, rtt_us); } - } else if (skb && rtt_update && sack_rtt >= 0 && - sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) { - /* Do not re-arm RTO if the sack RTT is measured from data sent - * after when the head was last (re)transmitted. Otherwise the - * timeout may continue to extend in loss recovery. - */ - tcp_rearm_rto(sk); } #if FASTRETRANS_DEBUG > 0 @@ -3454,7 +3394,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* Advance cwnd if state allows */ if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked, prior_in_flight); + tcp_cong_avoid(sk, ack, prior_in_flight); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); @@ -4764,7 +4704,15 @@ static void tcp_new_space(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tcp_should_expand_sndbuf(sk)) { - tcp_sndbuf_expand(sk); + int sndmem = SKB_TRUESIZE(max_t(u32, + tp->rx_opt.mss_clamp, + tp->mss_cache) + + MAX_TCP_HEADER); + int demanded = max_t(unsigned int, tp->snd_cwnd, + tp->reordering + 1); + sndmem *= 2 * demanded; + if (sndmem > sk->sk_sndbuf) + sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -5639,7 +5587,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct request_sock *req; int queued = 0; bool acceptable; - u32 synack_stamp; tp->rx_opt.saw_tstamp = 0; @@ -5722,18 +5669,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * so release it. */ if (req) { - synack_stamp = tcp_rsk(req)->snt_synack; tp->total_retrans = req->num_retrans; reqsk_fastopen_remove(sk, req, false); } else { - synack_stamp = tp->lsndtime; /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); tcp_init_congestion_control(sk); tcp_mtup_init(sk); - tp->copied_seq = tp->rcv_nxt; tcp_init_buffer_space(sk); + tp->copied_seq = tp->rcv_nxt; } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); @@ -5749,7 +5694,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_synack_rtt_meas(sk, synack_stamp); + tcp_synack_rtt_meas(sk, req); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0672139..b14266b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -177,7 +177,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) - IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return err; } @@ -288,7 +288,6 @@ static void tcp_v4_mtu_reduced(struct sock *sk) mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && - ip_sk_accept_pmtu(sk) && inet_csk(sk)->icsk_pmtu_cookie > mtu) { tcp_sync_mss(sk, mtu); @@ -836,11 +835,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, NULL); if (skb) { - __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); skb_set_queue_mapping(skb, queue_mapping); - err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, + err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, + ireq->rmt_addr, ireq->opt); err = net_xmit_eval(err); if (!tcp_rsk(req)->snt_synack && !err) @@ -973,7 +972,7 @@ static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, { union tcp_md5_addr *addr; - addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr; + addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr; return tcp_md5_do_lookup(sk, addr, AF_INET); } @@ -1150,8 +1149,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, saddr = inet_sk(sk)->inet_saddr; daddr = inet_sk(sk)->inet_daddr; } else if (req) { - saddr = inet_rsk(req)->ir_loc_addr; - daddr = inet_rsk(req)->ir_rmt_addr; + saddr = inet_rsk(req)->loc_addr; + daddr = inet_rsk(req)->rmt_addr; } else { const struct iphdr *iph = ip_hdr(skb); saddr = iph->saddr; @@ -1367,8 +1366,8 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, kfree_skb(skb_synack); return -1; } - err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, ireq->opt); + err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, + ireq->rmt_addr, ireq->opt); err = net_xmit_eval(err); if (!err) tcp_rsk(req)->snt_synack = tcp_time_stamp; @@ -1411,8 +1410,8 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, inet_csk(child)->icsk_af_ops->rebuild_header(child); tcp_init_congestion_control(child); tcp_mtup_init(child); - tcp_init_metrics(child); tcp_init_buffer_space(child); + tcp_init_metrics(child); /* Queue the data carried in the SYN packet. We need to first * bump skb's refcnt because the caller will attempt to free it. @@ -1503,8 +1502,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_openreq_init(req, &tmp_opt, skb); ireq = inet_rsk(req); - ireq->ir_loc_addr = daddr; - ireq->ir_rmt_addr = saddr; + ireq->loc_addr = daddr; + ireq->rmt_addr = saddr; ireq->no_srccheck = inet_sk(sk)->transparent; ireq->opt = tcp_v4_save_options(skb); @@ -1579,15 +1578,15 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); if (skb_synack) { - __tcp_v4_send_check(skb_synack, ireq->ir_loc_addr, ireq->ir_rmt_addr); + __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr); skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb)); } else goto drop_and_free; if (likely(!do_fastopen)) { int err; - err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, ireq->opt); + err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, + ireq->rmt_addr, ireq->opt); err = net_xmit_eval(err); if (err || want_cookie) goto drop_and_free; @@ -1645,9 +1644,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - newinet->inet_daddr = ireq->ir_rmt_addr; - newinet->inet_rcv_saddr = ireq->ir_loc_addr; - newinet->inet_saddr = ireq->ir_loc_addr; + newinet->inet_daddr = ireq->rmt_addr; + newinet->inet_rcv_saddr = ireq->loc_addr; + newinet->inet_saddr = ireq->loc_addr; inet_opt = ireq->opt; rcu_assign_pointer(newinet->inet_opt, inet_opt); ireq->opt = NULL; @@ -2195,6 +2194,18 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ +static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head) +{ + return hlist_nulls_empty(head) ? NULL : + list_entry(head->first, struct inet_timewait_sock, tw_node); +} + +static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) +{ + return !is_a_nulls(tw->tw_node.next) ? + hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; +} + /* * Get next listener socket follow cur. If cur is NULL, get first socket * starting from bucket given in st->bucket; when st->bucket is zero the @@ -2298,9 +2309,10 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) return rc; } -static inline bool empty_bucket(const struct tcp_iter_state *st) +static inline bool empty_bucket(struct tcp_iter_state *st) { - return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); + return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && + hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); } /* @@ -2317,6 +2329,7 @@ static void *established_get_first(struct seq_file *seq) for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; + struct inet_timewait_sock *tw; spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); /* Lockless fast path for the common case of empty buckets */ @@ -2332,7 +2345,18 @@ static void *established_get_first(struct seq_file *seq) rc = sk; goto out; } + st->state = TCP_SEQ_STATE_TIME_WAIT; + inet_twsk_for_each(tw, node, + &tcp_hashinfo.ehash[st->bucket].twchain) { + if (tw->tw_family != st->family || + !net_eq(twsk_net(tw), net)) { + continue; + } + rc = tw; + goto out; + } spin_unlock_bh(lock); + st->state = TCP_SEQ_STATE_ESTABLISHED; } out: return rc; @@ -2341,6 +2365,7 @@ out: static void *established_get_next(struct seq_file *seq, void *cur) { struct sock *sk = cur; + struct inet_timewait_sock *tw; struct hlist_nulls_node *node; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); @@ -2348,16 +2373,45 @@ static void *established_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - sk = sk_nulls_next(sk); + if (st->state == TCP_SEQ_STATE_TIME_WAIT) { + tw = cur; + tw = tw_next(tw); +get_tw: + while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { + tw = tw_next(tw); + } + if (tw) { + cur = tw; + goto out; + } + spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + st->state = TCP_SEQ_STATE_ESTABLISHED; + + /* Look for next non empty bucket */ + st->offset = 0; + while (++st->bucket <= tcp_hashinfo.ehash_mask && + empty_bucket(st)) + ; + if (st->bucket > tcp_hashinfo.ehash_mask) + return NULL; + + spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain); + } else + sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) - return sk; + goto found; } - spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); - ++st->bucket; - return established_get_first(seq); + st->state = TCP_SEQ_STATE_TIME_WAIT; + tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain); + goto get_tw; +found: + cur = sk; +out: + return cur; } static void *established_get_idx(struct seq_file *seq, loff_t pos) @@ -2410,9 +2464,10 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (rc) break; st->bucket = 0; - st->state = TCP_SEQ_STATE_ESTABLISHED; /* Fallthrough */ case TCP_SEQ_STATE_ESTABLISHED: + case TCP_SEQ_STATE_TIME_WAIT: + st->state = TCP_SEQ_STATE_ESTABLISHED; if (st->bucket > tcp_hashinfo.ehash_mask) break; rc = established_get_first(seq); @@ -2469,6 +2524,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } break; case TCP_SEQ_STATE_ESTABLISHED: + case TCP_SEQ_STATE_TIME_WAIT: rc = established_get_next(seq, v); break; } @@ -2492,6 +2548,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) if (v != SEQ_START_TOKEN) spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); break; + case TCP_SEQ_STATE_TIME_WAIT: case TCP_SEQ_STATE_ESTABLISHED: if (v) spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); @@ -2541,18 +2598,18 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(const struct sock *sk, const struct request_sock *req, - struct seq_file *f, int i, kuid_t uid) + struct seq_file *f, int i, kuid_t uid, int *len) { const struct inet_request_sock *ireq = inet_rsk(req); long delta = req->expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n", i, - ireq->ir_loc_addr, + ireq->loc_addr, ntohs(inet_sk(sk)->inet_sport), - ireq->ir_rmt_addr, - ntohs(ireq->ir_rmt_port), + ireq->rmt_addr, + ntohs(ireq->rmt_port), TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ @@ -2562,10 +2619,11 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, 0, /* non standard timer */ 0, /* open_requests have no inode */ atomic_read(&sk->sk_refcnt), - req); + req, + len); } -static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) +static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) { int timer_active; unsigned long timer_expires; @@ -2604,7 +2662,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " - "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", + "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n", i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, rx_queue, @@ -2621,11 +2679,12 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) tp->snd_cwnd, sk->sk_state == TCP_LISTEN ? (fastopenq ? fastopenq->max_qlen : 0) : - (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); + (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh), + len); } static void get_timewait4_sock(const struct inet_timewait_sock *tw, - struct seq_file *f, int i) + struct seq_file *f, int i, int *len) { __be32 dest, src; __u16 destp, srcp; @@ -2637,10 +2696,10 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, srcp = ntohs(tw->tw_sport); seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, - atomic_read(&tw->tw_refcnt), tw); + atomic_read(&tw->tw_refcnt), tw, len); } #define TMPSZ 150 @@ -2648,11 +2707,11 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, static int tcp4_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; - struct sock *sk = v; + int len; - seq_setwidth(seq, TMPSZ - 1); if (v == SEQ_START_TOKEN) { - seq_puts(seq, " sl local_address rem_address st tx_queue " + seq_printf(seq, "%-*s\n", TMPSZ - 1, + " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode"); goto out; @@ -2662,17 +2721,17 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_LISTENING: case TCP_SEQ_STATE_ESTABLISHED: - if (sk->sk_state == TCP_TIME_WAIT) - get_timewait4_sock(v, seq, st->num); - else - get_tcp4_sock(v, seq, st->num); + get_tcp4_sock(v, seq, st->num, &len); break; case TCP_SEQ_STATE_OPENREQ: - get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid); + get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len); + break; + case TCP_SEQ_STATE_TIME_WAIT: + get_timewait4_sock(v, seq, st->num, &len); break; } + seq_printf(seq, "%*s\n", TMPSZ - 1 - len, ""); out: - seq_pad(seq, '\n'); return 0; } @@ -2747,7 +2806,6 @@ struct proto tcp_prot = { .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, - .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem = sysctl_tcp_wmem, .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 991d62a..72f7218 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -115,13 +115,12 @@ static void tcp_lp_init(struct sock *sk) * Will only call newReno CA when away from inference. * From TCP-LP's paper, this will be handled in additive increasement. */ -static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct lp *lp = inet_csk_ca(sk); if (!(lp->flag & LP_WITHIN_INF)) - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, in_flight); } /** diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 269a89e..559d4ae 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -6,10 +6,15 @@ #include <linux/memcontrol.h> #include <linux/module.h> +static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) +{ + return container_of(cg_proto, struct tcp_memcontrol, cg_proto); +} + static void memcg_tcp_enter_memory_pressure(struct sock *sk) { if (sk->sk_cgrp->memory_pressure) - sk->sk_cgrp->memory_pressure = 1; + *sk->sk_cgrp->memory_pressure = 1; } EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure); @@ -22,24 +27,34 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) */ struct res_counter *res_parent = NULL; struct cg_proto *cg_proto, *parent_cg; + struct tcp_memcontrol *tcp; struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct net *net = current->nsproxy->net_ns; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return 0; - cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0]; - cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1]; - cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2]; - cg_proto->memory_pressure = 0; - cg_proto->memcg = memcg; + tcp = tcp_from_cgproto(cg_proto); + + tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0]; + tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1]; + tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2]; + tcp->tcp_memory_pressure = 0; parent_cg = tcp_prot.proto_cgroup(parent); if (parent_cg) - res_parent = &parent_cg->memory_allocated; + res_parent = parent_cg->memory_allocated; + + res_counter_init(&tcp->tcp_memory_allocated, res_parent); + percpu_counter_init(&tcp->tcp_sockets_allocated, 0); - res_counter_init(&cg_proto->memory_allocated, res_parent); - percpu_counter_init(&cg_proto->sockets_allocated, 0); + cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure; + cg_proto->memory_pressure = &tcp->tcp_memory_pressure; + cg_proto->sysctl_mem = tcp->tcp_prot_mem; + cg_proto->memory_allocated = &tcp->tcp_memory_allocated; + cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated; + cg_proto->memcg = memcg; return 0; } @@ -48,18 +63,23 @@ EXPORT_SYMBOL(tcp_init_cgroup); void tcp_destroy_cgroup(struct mem_cgroup *memcg) { struct cg_proto *cg_proto; + struct tcp_memcontrol *tcp; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return; - percpu_counter_destroy(&cg_proto->sockets_allocated); + tcp = tcp_from_cgproto(cg_proto); + percpu_counter_destroy(&tcp->tcp_sockets_allocated); } EXPORT_SYMBOL(tcp_destroy_cgroup); static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) { + struct net *net = current->nsproxy->net_ns; + struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; + u64 old_lim; int i; int ret; @@ -70,13 +90,16 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) if (val > RES_COUNTER_MAX) val = RES_COUNTER_MAX; - ret = res_counter_set_limit(&cg_proto->memory_allocated, val); + tcp = tcp_from_cgproto(cg_proto); + + old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); + ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val); if (ret) return ret; for (i = 0; i < 3; i++) - cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT, - sysctl_tcp_mem[i]); + tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, + net->ipv4.sysctl_tcp_mem[i]); if (val == RES_COUNTER_MAX) clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); @@ -133,24 +156,28 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) { + struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return default_val; - return res_counter_read_u64(&cg_proto->memory_allocated, type); + tcp = tcp_from_cgproto(cg_proto); + return res_counter_read_u64(&tcp->tcp_memory_allocated, type); } static u64 tcp_read_usage(struct mem_cgroup *memcg) { + struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT; - return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE); + tcp = tcp_from_cgproto(cg_proto); + return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); } static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) @@ -178,25 +205,54 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) { struct mem_cgroup *memcg; + struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; memcg = mem_cgroup_from_css(css); cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return 0; + tcp = tcp_from_cgproto(cg_proto); switch (event) { case RES_MAX_USAGE: - res_counter_reset_max(&cg_proto->memory_allocated); + res_counter_reset_max(&tcp->tcp_memory_allocated); break; case RES_FAILCNT: - res_counter_reset_failcnt(&cg_proto->memory_allocated); + res_counter_reset_failcnt(&tcp->tcp_memory_allocated); break; } return 0; } +unsigned long long tcp_max_memory(const struct mem_cgroup *memcg) +{ + struct tcp_memcontrol *tcp; + struct cg_proto *cg_proto; + + cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg); + if (!cg_proto) + return 0; + + tcp = tcp_from_cgproto(cg_proto); + return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); +} + +void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx) +{ + struct tcp_memcontrol *tcp; + struct cg_proto *cg_proto; + + cg_proto = tcp_prot.proto_cgroup(memcg); + if (!cg_proto) + return; + + tcp = tcp_from_cgproto(cg_proto); + + tcp->tcp_prot_mem[idx] = val; +} + static struct cftype tcp_files[] = { { .name = "kmem.tcp.limit_in_bytes", diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 0649373..52f3c6b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -215,15 +215,13 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, addr.family = req->rsk_ops->family; switch (addr.family) { case AF_INET: - addr.addr.a4 = inet_rsk(req)->ir_rmt_addr; + addr.addr.a4 = inet_rsk(req)->rmt_addr; hash = (__force unsigned int) addr.addr.a4; break; -#if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; - hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); + *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr; + hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr); break; -#endif default: return NULL; } @@ -242,6 +240,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) { + struct inet6_timewait_sock *tw6; struct tcp_metrics_block *tm; struct inetpeer_addr addr; unsigned int hash; @@ -253,12 +252,11 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock addr.addr.a4 = tw->tw_daddr; hash = (__force unsigned int) addr.addr.a4; break; -#if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = tw->tw_v6_daddr; - hash = ipv6_addr_hash(&tw->tw_v6_daddr); + tw6 = inet6_twsk((struct sock *)tw); + *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr; + hash = ipv6_addr_hash(&tw6->tw_v6_daddr); break; -#endif default: return NULL; } @@ -290,12 +288,10 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, addr.addr.a4 = inet_sk(sk)->inet_daddr; hash = (__force unsigned int) addr.addr.a4; break; -#if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = sk->sk_v6_daddr; - hash = ipv6_addr_hash(&sk->sk_v6_daddr); + *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr; + hash = ipv6_addr_hash(&inet6_sk(sk)->daddr); break; -#endif default: return NULL; } @@ -663,20 +659,16 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost) { - struct dst_entry *dst = __sk_dst_get(sk); struct tcp_metrics_block *tm; - if (!dst) - return; rcu_read_lock(); - tm = tcp_get_metrics(sk, dst, true); + tm = tcp_get_metrics(sk, __sk_dst_get(sk), true); if (tm) { struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; write_seqlock_bh(&fastopen_seqlock); - if (mss) - tfom->mss = mss; - if (cookie && cookie->len > 0) + tfom->mss = mss; + if (cookie->len > 0) tfom->cookie = *cookie; if (syn_lost) { ++tfom->syn_loss; @@ -991,7 +983,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) return 0; } -static const struct genl_ops tcp_metrics_nl_ops[] = { +static struct genl_ops tcp_metrics_nl_ops[] = { { .cmd = TCP_METRICS_CMD_GET, .doit = tcp_metrics_nl_cmd_get, @@ -1082,7 +1074,8 @@ void __init tcp_metrics_init(void) if (ret < 0) goto cleanup; ret = genl_register_family_with_ops(&tcp_metrics_nl_family, - tcp_metrics_nl_ops); + tcp_metrics_nl_ops, + ARRAY_SIZE(tcp_metrics_nl_ops)); if (ret < 0) goto cleanup_subsys; return; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 97b6841..58a3e69 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -293,9 +293,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_timewait_sock *tw6; - tw->tw_v6_daddr = sk->sk_v6_daddr; - tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; + tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); + tw6 = inet6_twsk((struct sock *)tw); + tw6->tw_v6_daddr = np->daddr; + tw6->tw_v6_rcv_saddr = np->rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_ipv6only = np->ipv6only; } diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 0560635..3a7525e 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -14,11 +14,10 @@ #include <net/tcp.h> #include <net/protocol.h> -struct sk_buff *tcp_gso_segment(struct sk_buff *skb, +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); - unsigned int sum_truesize = 0; struct tcphdr *th; unsigned int thlen; unsigned int seq; @@ -57,8 +56,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | SKB_GSO_GRE | - SKB_GSO_IPIP | - SKB_GSO_SIT | SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | 0) || @@ -105,7 +102,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, if (copy_destructor) { skb->destructor = gso_skb->destructor; skb->sk = gso_skb->sk; - sum_truesize += skb->truesize; + /* {tcp|sock}_wfree() use exact truesize accounting : + * sum(skb->truesize) MUST be exactly be gso_skb->truesize + * So we account mss bytes of 'true size' for each segment. + * The last segment will contain the remaining. + */ + skb->truesize = mss; + gso_skb->truesize -= mss; } skb = skb->next; th = tcp_hdr(skb); @@ -122,9 +125,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, if (copy_destructor) { swap(gso_skb->sk, skb->sk); swap(gso_skb->destructor, skb->destructor); - sum_truesize += skb->truesize; - atomic_add(sum_truesize - gso_skb->truesize, - &skb->sk->sk_wmem_alloc); + swap(gso_skb->truesize, skb->truesize); } delta = htonl(oldlen + (skb_tail_pointer(skb) - @@ -138,7 +139,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, out: return segs; } -EXPORT_SYMBOL(tcp_gso_segment); +EXPORT_SYMBOL(tcp_tso_segment); struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -274,32 +275,33 @@ static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff * { const struct iphdr *iph = skb_gro_network_header(skb); __wsum wsum; - - /* Don't bother verifying checksum if we're going to flush anyway. */ - if (NAPI_GRO_CB(skb)->flush) - goto skip_csum; - - wsum = skb->csum; + __sum16 sum; switch (skb->ip_summed) { - case CHECKSUM_NONE: - wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), - 0); - - /* fall through */ - case CHECKSUM_COMPLETE: if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, - wsum)) { + skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; } - +flush: NAPI_GRO_CB(skb)->flush = 1; return NULL; + + case CHECKSUM_NONE: + wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb_gro_len(skb), IPPROTO_TCP, 0); + sum = csum_fold(skb_checksum(skb, + skb_gro_offset(skb), + skb_gro_len(skb), + wsum)); + if (sum) + goto flush; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; } -skip_csum: return tcp_gro_receive(head, skb); } @@ -318,7 +320,7 @@ static int tcp4_gro_complete(struct sk_buff *skb) static const struct net_offload tcpv4_offload = { .callbacks = { .gso_send_check = tcp_v4_gso_send_check, - .gso_segment = tcp_gso_segment, + .gso_segment = tcp_tso_segment, .gro_receive = tcp4_gro_receive, .gro_complete = tcp4_gro_complete, }, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7820f3a..d46f214 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -850,14 +850,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, BUG_ON(!skb || !tcp_skb_pcount(skb)); - if (clone_it) { - const struct sk_buff *fclone = skb + 1; + /* If congestion control is doing timestamping, we must + * take such a timestamp before we potentially clone/copy. + */ + if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) + __net_timestamp(skb); - /* If congestion control is doing timestamping, we must - * take such a timestamp before we potentially clone/copy. - */ - if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) - __net_timestamp(skb); + if (likely(clone_it)) { + const struct sk_buff *fclone = skb + 1; if (unlikely(skb->fclone == SKB_FCLONE_ORIG && fclone->fclone == SKB_FCLONE_CLONE)) @@ -1875,12 +1875,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, * - better RTT estimation and ACK scheduling * - faster recovery * - high rates - * Alas, some drivers / subsystems require a fair amount - * of queued bytes to ensure line rate. - * One example is wifi aggregation (802.11 AMPDU) */ - limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes, - sk->sk_pacing_rate >> 10); + limit = max(skb->truesize, sk->sk_pacing_rate >> 10); if (atomic_read(&sk->sk_wmem_alloc) > limit) { set_bit(TSQ_THROTTLED, &tp->tsq_flags); @@ -2357,6 +2353,21 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tcp_retrans_try_collapse(sk, skb, cur_mss); + /* Some Solaris stacks overoptimize and ignore the FIN on a + * retransmit when old data is attached. So strip it off + * since it is cheap to do so and saves bytes on the network. + */ + if (skb->len > 0 && + (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && + tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { + if (!pskb_trim(skb, 0)) { + /* Reuse, even though it does some unnecessary work */ + tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, + TCP_SKB_CB(skb)->tcp_flags); + skb->ip_summed = CHECKSUM_NONE; + } + } + /* Make a copy, if the first transmission SKB clone we made * is still in somebody's hands, else make a clone. */ @@ -2725,8 +2736,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->syn = 1; th->ack = 1; TCP_ECN_make_synack(req, th); - th->source = htons(ireq->ir_num); - th->dest = ireq->ir_rmt_port; + th->source = ireq->loc_port; + th->dest = ireq->rmt_port; /* Setting of flags are superfluous here for callers (and ECE is * not even correctly set) */ @@ -3097,6 +3108,7 @@ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; + tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq; tcp_xmit_probe_skb(sk, 0); } } diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 8b97d71..611beab 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -101,6 +101,22 @@ static inline int tcp_probe_avail(void) si4.sin_addr.s_addr = inet->inet_##mem##addr; \ } while (0) \ +#if IS_ENABLED(CONFIG_IPV6) +#define tcp_probe_copy_fl_to_si6(inet, si6, mem) \ + do { \ + struct ipv6_pinfo *pi6 = inet->pinet6; \ + si6.sin6_family = AF_INET6; \ + si6.sin6_port = inet->inet_##mem##port; \ + si6.sin6_addr = pi6->mem##addr; \ + si6.sin6_flowinfo = 0; /* No need here. */ \ + si6.sin6_scope_id = 0; /* No need here. */ \ + } while (0) +#else +#define tcp_probe_copy_fl_to_si6(fl, si6, mem) \ + do { \ + memset(&si6, 0, sizeof(si6)); \ + } while (0) +#endif /* * Hook inserted to be called before each receive packet. @@ -131,17 +147,8 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d); break; case AF_INET6: - memset(&p->src.v6, 0, sizeof(p->src.v6)); - memset(&p->dst.v6, 0, sizeof(p->dst.v6)); -#if IS_ENABLED(CONFIG_IPV6) - p->src.v6.sin6_family = AF_INET6; - p->src.v6.sin6_port = inet->inet_sport; - p->src.v6.sin6_addr = inet6_sk(sk)->saddr; - - p->dst.v6.sin6_family = AF_INET6; - p->dst.v6.sin6_port = inet->inet_dport; - p->dst.v6.sin6_addr = sk->sk_v6_daddr; -#endif + tcp_probe_copy_fl_to_si6(inet, p->src.v6, s); + tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d); break; default: BUG(); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 19ea6c2..8ce55b8 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,8 +15,7 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); @@ -24,7 +23,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked, return; if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp, acked); + tcp_slow_start(tp); else tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 64f0354..4b85e6f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -156,16 +156,12 @@ static bool retransmits_timed_out(struct sock *sk, static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); int retry_until; bool do_reset, syn_set = false; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (icsk->icsk_retransmits) { + if (icsk->icsk_retransmits) dst_negative_advice(sk); - if (tp->syn_fastopen || tp->syn_data) - tcp_fastopen_cache_set(sk, 0, NULL, true); - } retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; syn_set = true; } else { @@ -378,8 +374,9 @@ void tcp_retransmit_timer(struct sock *sk) } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), - &sk->sk_v6_daddr, + &np->daddr, ntohs(inet->inet_dport), inet->inet_num, tp->snd_una, tp->snd_nxt); } diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 06cae62..80fa2bf 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -163,14 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) return min(tp->snd_ssthresh, tp->snd_cwnd-1); } -static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) { - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, in_flight); return; } @@ -195,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, in_flight); } else { u32 rtt, diff; u64 target_cwnd; @@ -244,7 +243,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, } else if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ - tcp_slow_start(tp, acked); + tcp_slow_start(tp); } else { /* Congestion avoidance. */ @@ -284,7 +283,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, } /* Use normal slow start */ else if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp, acked); + tcp_slow_start(tp); } diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index 0531b99..6c0eea2 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -15,10 +15,10 @@ struct vegas { u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ }; -void tcp_vegas_init(struct sock *sk); -void tcp_vegas_state(struct sock *sk, u8 ca_state); -void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); -void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); -void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); +extern void tcp_vegas_init(struct sock *sk); +extern void tcp_vegas_state(struct sock *sk, u8 ca_state); +extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); +extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); +extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); #endif /* __TCP_VEGAS_H */ diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 326475a..ac43cd7 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -114,14 +114,13 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) tcp_veno_init(sk); } -static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (!veno->doing_veno_now) { - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, in_flight); return; } @@ -134,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked, /* We don't have enough rtt samples to do the Veno * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, in_flight); } else { u64 target_cwnd; u32 rtt; @@ -153,7 +152,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked, if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ - tcp_slow_start(tp, acked); + tcp_slow_start(tp); } else { /* Congestion avoidance. */ if (veno->diff < beta) { diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index a347a07..05c3b6f 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -69,8 +69,7 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); } -static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); @@ -79,7 +78,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked, return; if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp, acked); + tcp_slow_start(tp); else if (!yeah->doing_reno_now) { /* Scalable */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 44f6a20..0ca44df 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -103,7 +103,6 @@ #include <linux/seq_file.h> #include <net/net_namespace.h> #include <net/icmp.h> -#include <net/inet_hashtables.h> #include <net/route.h> #include <net/checksum.h> #include <net/xfrm.h> @@ -220,7 +219,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned short first, last; DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); - inet_get_local_port_range(net, &low, &high); + inet_get_local_port_range(&low, &high); remaining = (high - low) + 1; rand = net_random(); @@ -407,18 +406,6 @@ static inline int compute_score2(struct sock *sk, struct net *net, return score; } -static unsigned int udp_ehashfn(struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) -{ - static u32 udp_ehash_secret __read_mostly; - - net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret)); - - return __inet_ehashfn(laddr, lport, faddr, fport, - udp_ehash_secret + net_hash_mix(net)); -} - /* called with read_rcu_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, @@ -442,8 +429,8 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - hash = udp_ehashfn(net, daddr, hnum, - saddr, sport); + hash = inet_ehashfn(net, daddr, hnum, + saddr, sport); matches = 1; } } else if (score == badness && reuseport) { @@ -523,8 +510,8 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - hash = udp_ehashfn(net, daddr, hnum, - saddr, sport); + hash = inet_ehashfn(net, daddr, hnum, + saddr, sport); matches = 1; } } else if (score == badness && reuseport) { @@ -578,26 +565,6 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, } EXPORT_SYMBOL_GPL(udp4_lib_lookup); -static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, - __be16 loc_port, __be32 loc_addr, - __be16 rmt_port, __be32 rmt_addr, - int dif, unsigned short hnum) -{ - struct inet_sock *inet = inet_sk(sk); - - if (!net_eq(sock_net(sk), net) || - udp_sk(sk)->udp_port_hash != hnum || - (inet->inet_daddr && inet->inet_daddr != rmt_addr) || - (inet->inet_dport != rmt_port && inet->inet_dport) || - (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || - ipv6_only_sock(sk) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) - return false; - if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif)) - return false; - return true; -} - static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, @@ -608,11 +575,20 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, unsigned short hnum = ntohs(loc_port); sk_nulls_for_each_from(s, node) { - if (__udp_is_mcast_sock(net, s, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum)) - goto found; + struct inet_sock *inet = inet_sk(s); + + if (!net_eq(sock_net(s), net) || + udp_sk(s)->udp_port_hash != hnum || + (inet->inet_daddr && inet->inet_daddr != rmt_addr) || + (inet->inet_dport != rmt_port && inet->inet_dport) || + (inet->inet_rcv_saddr && + inet->inet_rcv_saddr != loc_addr) || + ipv6_only_sock(s) || + (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) + continue; + if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) + continue; + goto found; } s = NULL; found: @@ -879,8 +855,6 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.opt = NULL; ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; @@ -964,7 +938,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, faddr = ipc.opt->opt.faddr; connected = 0; } - tos = get_rttos(&ipc, inet); + tos = RT_TOS(inet->tos); if (sock_flag(sk, SOCK_LOCALROUTE) || (msg->msg_flags & MSG_DONTROUTE) || (ipc.opt && ipc.opt->opt.is_strictroute)) { @@ -999,7 +973,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) - IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); goto out; } @@ -1098,9 +1072,6 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, struct udp_sock *up = udp_sk(sk); int ret; - if (flags & MSG_SENDPAGE_NOTLAST) - flags |= MSG_MORE; - if (!up->pending) { struct msghdr msg = { .msg_flags = flags|MSG_MORE }; @@ -1238,8 +1209,14 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int is_udplite = IS_UDPLITE(sk); bool slow; + /* + * Check any passed addresses + */ + if (addr_len) + *addr_len = sizeof(*sin); + if (flags & MSG_ERRQUEUE) - return ip_recv_error(sk, msg, len, addr_len); + return ip_recv_error(sk, msg, len); try_again: skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), @@ -1299,7 +1276,6 @@ try_again: sin->sin_port = udp_hdr(skb)->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - *addr_len = sizeof(*sin); } if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); @@ -1427,10 +1403,8 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; - if (inet_sk(sk)->inet_daddr) { + if (inet_sk(sk)->inet_daddr) sock_rps_save_rxhash(sk, skb); - sk_mark_napi_id(sk, skb); - } rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) { @@ -1554,7 +1528,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) rc = 0; - ipv4_pktinfo_prepare(sk, skb); + ipv4_pktinfo_prepare(skb); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) rc = __udp_queue_rcv_skb(sk, skb); @@ -1603,14 +1577,6 @@ static void flush_stack(struct sock **stack, unsigned int count, kfree_skb(skb1); } -static void udp_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - - dst_hold(dst); - sk->sk_rx_dst = dst; -} - /* * Multicasts and broadcasts go to each listener. * @@ -1739,32 +1705,16 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp4_csum_init(skb, uh, proto)) goto csum_error; - if (skb->sk) { - int ret; - sk = skb->sk; - - if (unlikely(sk->sk_rx_dst == NULL)) - udp_sk_rx_dst_set(sk, skb); + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + return __udp4_lib_mcast_deliver(net, skb, uh, + saddr, daddr, udptable); - ret = udp_queue_rcv_skb(sk, skb); - - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ - if (ret > 0) - return -ret; - return 0; - } else { - if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) - return __udp4_lib_mcast_deliver(net, skb, uh, - saddr, daddr, udptable); - - sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); - } + sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { int ret; + sk_mark_napi_id(sk, skb); ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); @@ -1818,135 +1768,6 @@ drop: return 0; } -/* We can only early demux multicast if there is a single matching socket. - * If more than one socket found returns NULL - */ -static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, - __be16 loc_port, __be32 loc_addr, - __be16 rmt_port, __be32 rmt_addr, - int dif) -{ - struct sock *sk, *result; - struct hlist_nulls_node *node; - unsigned short hnum = ntohs(loc_port); - unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); - struct udp_hslot *hslot = &udp_table.hash[slot]; - - rcu_read_lock(); -begin: - count = 0; - result = NULL; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { - if (__udp_is_mcast_sock(net, sk, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum)) { - result = sk; - ++count; - } - } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { - if (count != 1 || - unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(!__udp_is_mcast_sock(net, result, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum))) { - sock_put(result); - result = NULL; - } - } - rcu_read_unlock(); - return result; -} - -/* For unicast we should only early demux connected sockets or we can - * break forwarding setups. The chains here can be long so only check - * if the first socket is an exact match and if not move on. - */ -static struct sock *__udp4_lib_demux_lookup(struct net *net, - __be16 loc_port, __be32 loc_addr, - __be16 rmt_port, __be32 rmt_addr, - int dif) -{ - struct sock *sk, *result; - struct hlist_nulls_node *node; - unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); - unsigned int slot2 = hash2 & udp_table.mask; - struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; - INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr) - const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); - - rcu_read_lock(); - result = NULL; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { - if (INET_MATCH(sk, net, acookie, - rmt_addr, loc_addr, ports, dif)) - result = sk; - /* Only check first socket in chain */ - break; - } - - if (result) { - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(!INET_MATCH(sk, net, acookie, - rmt_addr, loc_addr, - ports, dif))) { - sock_put(result); - result = NULL; - } - } - rcu_read_unlock(); - return result; -} - -void udp_v4_early_demux(struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - const struct udphdr *uh = udp_hdr(skb); - struct sock *sk; - struct dst_entry *dst; - struct net *net = dev_net(skb->dev); - int dif = skb->dev->ifindex; - - /* validate the packet */ - if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return; - - if (skb->pkt_type == PACKET_BROADCAST || - skb->pkt_type == PACKET_MULTICAST) - sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, - uh->source, iph->saddr, dif); - else if (skb->pkt_type == PACKET_HOST) - sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, - uh->source, iph->saddr, dif); - else - return; - - if (!sk) - return; - - skb->sk = sk; - skb->destructor = sock_edemux; - dst = sk->sk_rx_dst; - - if (dst) - dst = dst_check(dst, 0); - if (dst) - skb_dst_set_noref(skb, dst); -} - int udp_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP); @@ -2329,7 +2150,7 @@ EXPORT_SYMBOL(udp_proc_unregister); /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, - int bucket) + int bucket, int *len) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; @@ -2338,7 +2159,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), @@ -2346,22 +2167,23 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + atomic_read(&sp->sk_drops), len); } int udp4_seq_show(struct seq_file *seq, void *v) { - seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) - seq_puts(seq, " sl local_address rem_address st tx_queue " + seq_printf(seq, "%-127s\n", + " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct udp_iter_state *state = seq->private; + int len; - udp4_format_sock(v, seq, state->bucket); + udp4_format_sock(v, seq, state->bucket, &len); + seq_printf(seq, "%*s\n", 127 - len, ""); } - seq_pad(seq, '\n'); return 0; } diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index f3c2789..5a681e2 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -5,30 +5,30 @@ #include <net/protocol.h> #include <net/inet_common.h> -int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); -void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); +extern int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int ); +extern void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); -int udp_v4_get_port(struct sock *sk, unsigned short snum); +extern int udp_v4_get_port(struct sock *sk, unsigned short snum); -int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -int udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); +extern int udp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +extern int udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); #ifdef CONFIG_COMPAT -int compat_udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -int compat_udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); +extern int compat_udp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +extern int compat_udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); #endif -int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len); -int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, - int flags); -int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -void udp_destroy_sock(struct sock *sk); +extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len); +extern int udp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags); +extern int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); +extern void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS -int udp4_seq_show(struct seq_file *seq, void *v); +extern int udp4_seq_show(struct seq_file *seq, void *v); #endif #endif /* _UDP4_IMPL_H */ diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 83206de..f35ecca 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -52,7 +52,6 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | - SKB_GSO_IPIP | SKB_GSO_GRE | SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 31b1815..b5663c3 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -16,13 +16,13 @@ #include <net/xfrm.h> /* Informational hook. The decap is still done here. */ -static struct xfrm_tunnel_notifier __rcu *rcv_notify_handlers __read_mostly; +static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly; static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex); -int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler) +int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler) { - struct xfrm_tunnel_notifier __rcu **pprev; - struct xfrm_tunnel_notifier *t; + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; int ret = -EEXIST; int priority = handler->priority; @@ -50,10 +50,10 @@ err: } EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register); -int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler) +int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler) { - struct xfrm_tunnel_notifier __rcu **pprev; - struct xfrm_tunnel_notifier *t; + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; int ret = -ENOENT; mutex_lock(&xfrm4_mode_tunnel_input_mutex); @@ -134,7 +134,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - struct xfrm_tunnel_notifier *handler; + struct xfrm_tunnel *handler; int err = -EINVAL; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index e1a6393..ccde542 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -104,14 +104,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) const struct iphdr *iph = ip_hdr(skb); u8 *xprth = skb_network_header(skb) + iph->ihl * 4; struct flowi4 *fl4 = &fl->u.ip4; - int oif = 0; - - if (skb_dst(skb)) - oif = skb_dst(skb)->dev->ifindex; memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; - fl4->flowi4_oif = reverse ? skb->skb_iif : oif; + fl4->flowi4_oif = skb_dst(skb)->dev->ifindex; if (!ip_is_fragment(iph)) { switch (iph->protocol) { @@ -240,7 +236,7 @@ static struct dst_ops xfrm4_dst_ops = { .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, - .gc_thresh = 32768, + .gc_thresh = 1024, }; static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { |