From 39b9722315364121c6e2524515a6e95d52287549 Mon Sep 17 00:00:00 2001 From: Marco Angaroni Date: Tue, 5 Apr 2016 18:26:29 +0200 Subject: ipvs: handle connections started by real-servers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using LVS-NAT and SIP persistence-engine over UDP, the following limitations are present with current implementation: 1) To actually have load-balancing based on Call-ID header, you need to use one-packet-scheduling mode. But with one-packet-scheduling the connection is deleted just after packet is forwarded, so SIP responses coming from real-servers do not match any connection and SNAT is not applied. 2) If you do not use "-o" option, IPVS behaves as normal UDP load balancer, so different SIP calls (each one identified by a different Call-ID) coming from the same ip-address/port go to the same real-server. So basically you don’t have load-balancing based on Call-ID as intended. 3) Call-ID is not learned when a new SIP call is started by a real-server (inside-to-outside direction), but only in the outside-to-inside direction. This would be a general problem for all SIP servers acting as Back2BackUserAgent. This patch aims to solve problems 1) and 3) while keeping OPS mode mandatory for SIP-UDP, so that 2) is not a problem anymore. The basic mechanism implemented is to make packets, that do not match any existent connection but come from real-servers, create new connections instead of letting them pass without any effect. When such packets pass through ip_vs_out(), if their source ip address and source port match a configured real-server, a new connection is automatically created in the same way as it would have happened if the packet had come from outside-to-inside direction. A new connection template is created too if the virtual-service is persistent and there is no matching connection template found. 
The new connection automatically created, if the service had "-o" option, is an OPS connection that lasts only the time to forward the packet, just like it happens on the ingress side. The main part of this mechanism is implemented inside a persistent-engine specific callback (at the moment only SIP persistent engine exists) and is triggered only for UDP packets, since connection oriented protocols, by using different set of ports (typically ephemeral ports) to open new outgoing connections, should not need this feature. The following requisites are needed for automatic connection creation; if any is missing the packet simply goes the same way as before. a) virtual-service is not fwmark based (this is because fwmark services do not store address and port of the virtual-service, required to build the connection data). b) virtual-service and real-servers must not have been configured with omitted port (this is again to have all data to create the connection). Signed-off-by: Marco Angaroni Acked-by: Julian Anastasov Signed-off-by: Simon Horman diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a6cc576..af4c10e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -731,6 +731,12 @@ struct ip_vs_pe { u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, bool inverse); int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); + /* create connections for real-server outgoing packets */ + struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, __be16 cport); }; /* The application module object (a.k.a. 
app incarnation) */ @@ -874,6 +880,7 @@ struct netns_ipvs { /* Service counters */ atomic_t ftpsvc_counter; atomic_t nullsvc_counter; + atomic_t conn_out_counter; #ifdef CONFIG_SYSCTL /* 1/rate drop and drop-entry variables */ @@ -1147,6 +1154,12 @@ static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs) */ const char *ip_vs_proto_name(unsigned int proto); void ip_vs_init_hash_table(struct list_head *table, int rows); +struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport); #define IP_VS_INIT_HASH_TABLE(t) ip_vs_init_hash_table((t), ARRAY_SIZE((t))) #define IP_VS_APP_TYPE_FTP 1 @@ -1378,6 +1391,10 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport); +struct ip_vs_dest * +ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, + const union nf_inet_addr *daddr, __be16 dport); + int ip_vs_use_count_inc(void); void ip_vs_use_count_dec(void); int ip_vs_register_nl_ioctl(void); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index b9a4082..f3bac2e 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -68,6 +68,7 @@ EXPORT_SYMBOL(ip_vs_conn_put); #ifdef CONFIG_IP_VS_DEBUG EXPORT_SYMBOL(ip_vs_get_debug_level); #endif +EXPORT_SYMBOL(ip_vs_new_conn_out); static int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ @@ -1100,6 +1101,143 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, } } +/* Generic function to create new connections for outgoing RS packets + * + * Pre-requisites for successful connection creation: + * 1) Virtual Service is NOT fwmark based: + * In fwmark-VS actual vaddr and vport are unknown to IPVS + * 2) Real Server and Virtual Service were NOT configured 
without port: + * This is to allow match of different VS to the same RS ip-addr + */ +struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport) +{ + struct ip_vs_conn_param param; + struct ip_vs_conn *ct = NULL, *cp = NULL; + const union nf_inet_addr *vaddr, *daddr, *caddr; + union nf_inet_addr snet; + __be16 vport; + unsigned int flags; + + EnterFunction(12); + vaddr = &svc->addr; + vport = svc->port; + daddr = &iph->saddr; + caddr = &iph->daddr; + + /* check pre-requisites are satisfied */ + if (svc->fwmark) + return NULL; + if (!vport || !dport) + return NULL; + + /* for persistent service first create connection template */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) { + /* apply netmask the same way ingress-side does */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &caddr->in6, + (__force __u32)svc->netmask); + else +#endif + snet.ip = caddr->ip & svc->netmask; + /* fill params and create template if not existent */ + if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, + &snet, 0, vaddr, + vport, &param) < 0) + return NULL; + ct = ip_vs_ct_in_get(&param); + if (!ct) { + ct = ip_vs_conn_new(&param, dest->af, daddr, dport, + IP_VS_CONN_F_TEMPLATE, dest, 0); + if (!ct) { + kfree(param.pe_data); + return NULL; + } + ct->timeout = svc->timeout; + } else { + kfree(param.pe_data); + } + } + + /* connection flags */ + flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && + iph->protocol == IPPROTO_UDP) ? 
IP_VS_CONN_F_ONE_PACKET : 0; + /* create connection */ + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, + caddr, cport, vaddr, vport, &param); + cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, 0); + if (!cp) { + if (ct) + ip_vs_conn_put(ct); + return NULL; + } + if (ct) { + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + } + ip_vs_conn_stats(cp, svc); + + /* return connection (will be used to handle outgoing packet) */ + IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + LeaveFunction(12); + return cp; +} + +/* Handle outgoing packets which are considered requests initiated by + * real servers, so that subsequent responses from external client can be + * routed to the right real server. + * Used also for outgoing responses in OPS mode. + * + * Connection management is handled by persistent-engine specific callback. 
+ */ +static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, + struct netns_ipvs *ipvs, + int af, struct sk_buff *skb, + const struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_conn *cp = NULL; + __be16 _ports[2], *pptr; + + if (hooknum == NF_INET_LOCAL_IN) + return NULL; + + pptr = frag_safe_skb_hp(skb, iph->len, + sizeof(_ports), _ports, iph); + if (!pptr) + return NULL; + + rcu_read_lock(); + dest = ip_vs_find_real_service(ipvs, af, iph->protocol, + &iph->saddr, pptr[0]); + if (dest) { + struct ip_vs_service *svc; + struct ip_vs_pe *pe; + + svc = rcu_dereference(dest->svc); + if (svc) { + pe = rcu_dereference(svc->pe); + if (pe && pe->conn_out) + cp = pe->conn_out(svc, dest, skb, iph, + pptr[0], pptr[1]); + } + } + rcu_read_unlock(); + + return cp; +} + /* Handle response packets: rewrite addresses and send away... */ static unsigned int @@ -1245,6 +1383,22 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); + + /* Check for real-server-started requests */ + if (atomic_read(&ipvs->conn_out_counter)) { + /* Currently only for UDP: + * connection oriented protocols typically use + * ephemeral ports for outgoing connections, so + * related incoming responses would not match any VS + */ + if (pp->protocol == IPPROTO_UDP) { + cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph); + if (likely(cp)) + return handle_response(af, skb, pd, cp, &iph, + hooknum); + } + } + if (sysctl_nat_icmp_send(ipvs) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 404b2a4..6794391 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -567,6 +567,36 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, return false; } +/* Find real service record by <proto,addr,port>. 
+ * In case of multiple records with the same <proto,addr,port>, only + * the first found record is returned. + * + * To be called under RCU lock. + */ +struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, + __u16 protocol, + const union nf_inet_addr *daddr, + __be16 dport) +{ + unsigned int hash; + struct ip_vs_dest *dest; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, dport); + + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + /* Lookup destination by {addr,port} in the given service * Called under RCU lock. */ @@ -1253,6 +1283,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); + if (svc->pe && svc->pe->conn_out) + atomic_inc(&ipvs->conn_out_counter); ip_vs_start_estimator(ipvs, &svc->stats); @@ -1293,6 +1325,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; + bool new_pe_conn_out, old_pe_conn_out; /* * Lookup the scheduler, by 'u->sched_name' @@ -1355,8 +1388,16 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) svc->netmask = u->netmask; old_pe = rcu_dereference_protected(svc->pe, 1); - if (pe != old_pe) + if (pe != old_pe) { rcu_assign_pointer(svc->pe, pe); + /* check for optional methods in new pe */ + new_pe_conn_out = (pe && pe->conn_out) ? true : false; + old_pe_conn_out = (old_pe && old_pe->conn_out) ? 
true : false; + if (new_pe_conn_out && !old_pe_conn_out) + atomic_inc(&svc->ipvs->conn_out_counter); + if (old_pe_conn_out && !new_pe_conn_out) + atomic_dec(&svc->ipvs->conn_out_counter); + } out: ip_vs_scheduler_put(old_sched); @@ -1389,6 +1430,8 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* Unbind persistence engine, keep svc->pe */ old_pe = rcu_dereference_protected(svc->pe, 1); + if (old_pe && old_pe->conn_out) + atomic_dec(&ipvs->conn_out_counter); ip_vs_pe_put(old_pe); /* @@ -3957,6 +4000,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) (unsigned long) ipvs); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); + atomic_set(&ipvs->conn_out_counter, 0); /* procfs stats */ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 0a6eb5c..d07ef9e 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -143,6 +143,20 @@ static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) return cp->pe_data_len; } +static struct ip_vs_conn * +ip_vs_sip_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport) +{ + if (likely(iph->protocol == IPPROTO_UDP)) + return ip_vs_new_conn_out(svc, dest, skb, iph, dport, cport); + /* currently no need to handle other than UDP */ + return NULL; +} + static struct ip_vs_pe ip_vs_sip_pe = { .name = "sip", @@ -153,6 +167,7 @@ static struct ip_vs_pe ip_vs_sip_pe = .ct_match = ip_vs_sip_ct_match, .hashkey_raw = ip_vs_sip_hashkey_raw, .show_pe_data = ip_vs_sip_show_pe_data, + .conn_out = ip_vs_sip_conn_out, }; static int __init ip_vs_sip_init(void) -- cgit v0.10.2 From 013b042465d3fefef84b4b87947747eda08277e2 Mon Sep 17 00:00:00 2001 From: Marco Angaroni Date: Tue, 5 Apr 2016 18:26:52 +0200 Subject: ipvs: optimize release of 
connections in OPS mode One-packet-scheduling is the most expensive mode in IPVS from performance point of view: for each packet to be processed a new connection data structure is created and, after packet is sent, deleted by starting a new timer set to expire immediately. SIP persistent-engine needs OPS mode to have Call-ID based load balancing, so OPS mode performance has negative impact in SIP protocol load balancing. This patch aims to improve performance of OPS mode by means of the following changes in the release mechanism of OPS connections: a) call expire callback ip_vs_conn_expire() directly instead of starting a timer programmed to fire immediately. b) avoid call_rcu() overhead inside expire callback, since OPS connection are not inserted in the hash-table and last just the time to process the packet, hence there is no concurrent access to such data structures. Signed-off-by: Marco Angaroni Acked-by: Julian Anastasov Signed-off-by: Simon Horman diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 85ca189..dd75d41 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -104,6 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key) spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } +static void ip_vs_conn_expire(unsigned long data); /* * Returns hash value for IPVS connection entry @@ -453,10 +454,16 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, } EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); +static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp) +{ + __ip_vs_conn_put(cp); + ip_vs_conn_expire((unsigned long)cp); +} + /* * Put back the conn and restart its timer with its timeout */ -void ip_vs_conn_put(struct ip_vs_conn *cp) +static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp) { unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? 
0 : cp->timeout; @@ -465,6 +472,16 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) __ip_vs_conn_put(cp); } +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && + (atomic_read(&cp->refcnt) == 1) && + !timer_pending(&cp->timer)) + /* expire connection immediately */ + __ip_vs_conn_put_notimer(cp); + else + __ip_vs_conn_put_timer(cp); +} /* * Fill a no_client_port connection with a client port number @@ -834,7 +851,10 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_unbind_dest(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); - call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + ip_vs_conn_rcu_free(&cp->rcu_head); + else + call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); atomic_dec(&ipvs->conn_count); return; } @@ -850,7 +870,7 @@ static void ip_vs_conn_expire(unsigned long data) if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs)); - ip_vs_conn_put(cp); + __ip_vs_conn_put_timer(cp); } /* Modify timer, so that it expires as soon as possible. -- cgit v0.10.2 From 8fb04d9fc70a67ccabf71dbabf92d7f6fca64a16 Mon Sep 17 00:00:00 2001 From: Marco Angaroni Date: Sat, 9 Apr 2016 14:14:23 +0200 Subject: ipvs: don't alter conntrack in OPS mode When using OPS mode in conjunction with SIP persistent-engine, packets originating from the same ip-address/port could be balanced to different real servers, and (to properly handle SIP responses) OPS connections are created in the in-out direction too, where ip_vs_update_conntrack() is called to modify the reply tuple. As a result, there can be collision of conntrack tuples, causing random packet drops, as explained below: conntrack1: orig=CIP->VIP, reply=RIP1->CIP conntrack2: orig=RIP2->CIP, reply=CIP->VIP Tuple CIP->VIP is both in orig of conntrack1 and reply of conntrack2. The collision triggers packet drop inside nf_conntrack processing. 
In addition, the current implementation deletes the conntrack object at every expire of an OPS connection (once every forwarded packet), to have it recreated from scratch at next packet traversing IPVS. Since in OPS mode, by definition, we don't expect any associated response, the choices implemented in this patch are: a) don't call nf_conntrack_alter_reply() for OPS connections inside ip_vs_update_conntrack(). b) don't delete the conntrack object at OPS connection expire. The result is that created conntrack objects for each tuple CIP->VIP, RIP-N->CIP, etc. are left in UNREPLIED state and not modified by IPVS OPS connection management. This eliminates packet drops and leaves a single conntrack object for each tuple packets are sent from. Signed-off-by: Marco Angaroni Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index dd75d41..292365f 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -836,7 +836,8 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->control) ip_vs_control_del(cp); - if (cp->flags & IP_VS_CONN_F_NFCT) { + if ((cp->flags & IP_VS_CONN_F_NFCT) && + !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) { /* Do not access conntracks during subsys cleanup * because nf_conntrack_find_get can not be used after * conntrack cleanup for the net. 
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 30434fb..f04fd8d 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -93,6 +93,10 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return; + /* Never alter conntrack for OPS conns (no reply is expected) */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return; + /* Alter reply only in original direction */ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return; -- cgit v0.10.2 From a3efd81205b128a802025abb689925177a4607ed Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 18 Apr 2016 16:16:59 +0200 Subject: netfilter: conntrack: move generation seqcnt out of netns_ct We only allow rehash in init namespace, so we only use init_ns.generation. And even if we would allow it, it makes no sense as the conntrack locks are global; any ongoing rehash prevents insert/ delete. So make this private to nf_conntrack_core instead. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 723b61c..b052785 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -94,7 +94,6 @@ struct netns_ct { int sysctl_checksum; unsigned int htable_size; - seqcount_t generation; struct kmem_cache *nf_conntrack_cachep; struct hlist_nulls_head *hash; struct hlist_head *expect_hash; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2fd6074..a53c009 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -69,6 +69,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); static __read_mostly spinlock_t nf_conntrack_locks_all_lock; +static __read_mostly seqcount_t nf_conntrack_generation; static __read_mostly bool nf_conntrack_locks_all; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) @@ -107,7 +108,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, spin_lock_nested(&nf_conntrack_locks[h1], SINGLE_DEPTH_NESTING); } - if (read_seqcount_retry(&net->ct.generation, sequence)) { + if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { nf_conntrack_double_unlock(h1, h2); return true; } @@ -393,7 +394,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); reply_hash = hash_conntrack(net, @@ -560,7 +561,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); reply_hash = hash_conntrack(net, @@ -628,7 +629,7 @@ 
__nf_conntrack_confirm(struct sk_buff *skb) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); /* reuse the hash saved before */ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; hash = hash_bucket(hash, net); @@ -771,12 +772,12 @@ static noinline int early_drop(struct net *net, unsigned int _hash) local_bh_disable(); restart: - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_bucket(_hash, net); for (; i < net->ct.htable_size; i++) { lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); - if (read_seqcount_retry(&net->ct.generation, sequence)) { + if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { spin_unlock(lockp); goto restart; } @@ -1607,7 +1608,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) local_bh_disable(); nf_conntrack_all_lock(); - write_seqcount_begin(&init_net.ct.generation); + write_seqcount_begin(&nf_conntrack_generation); /* Lookups in the old hash might happen in parallel, which means we * might get false negatives during connection lookup. 
New connections @@ -1631,7 +1632,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; init_net.ct.hash = hash; - write_seqcount_end(&init_net.ct.generation); + write_seqcount_end(&nf_conntrack_generation); nf_conntrack_all_unlock(); local_bh_enable(); @@ -1657,6 +1658,8 @@ int nf_conntrack_init_start(void) int max_factor = 8; int i, ret, cpu; + seqcount_init(&nf_conntrack_generation); + for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_conntrack_locks[i]); @@ -1783,7 +1786,6 @@ int nf_conntrack_init_net(struct net *net) int cpu; atomic_set(&net->ct.count, 0); - seqcount_init(&net->ct.generation); net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); if (!net->ct.pcpu_lists) -- cgit v0.10.2 From 7001c6d109ea41a88e7156f467cf9fb5f37f5036 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 18 Apr 2016 16:17:00 +0200 Subject: netfilter: conntrack: use get_random_once for nat and expectations Use a private seed and init it using get_random_once. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 278927a..c2f7c4f 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -38,6 +38,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; +static unsigned int nf_ct_expect_hashrnd __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, @@ -76,13 +77,11 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple { unsigned int hash; - if (unlikely(!nf_conntrack_hash_rnd)) { - init_nf_conntrack_hash_rnd(); - } + get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | - (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd); + (__force __u16)tuple->dst.u.all) ^ nf_ct_expect_hashrnd); return reciprocal_scale(hash, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 06a9f45..3d52271 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -37,7 +37,7 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; - +static unsigned int nf_nat_hash_rnd __read_mostly; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -122,9 +122,11 @@ hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) { unsigned int hash; + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + /* Original src, to ensure we map it consistently if poss. 
*/ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_conntrack_hash_rnd); + tuple->dst.protonum ^ nf_nat_hash_rnd); return reciprocal_scale(hash, net->ct.nat_htable_size); } -- cgit v0.10.2 From 141658fb02c248e6243d619cb7d48a76158a66ac Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 18 Apr 2016 16:17:01 +0200 Subject: netfilter: conntrack: use get_random_once for conntrack hash seed As an earlier commit removed access to the hash from other files, we can also make it static. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index fde4068..dd78bea 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -289,8 +289,6 @@ struct kernel_param; int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); extern unsigned int nf_conntrack_htable_size; extern unsigned int nf_conntrack_max; -extern unsigned int nf_conntrack_hash_rnd; -void init_nf_conntrack_hash_rnd(void); struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a53c009..1fd0ff1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -142,13 +142,14 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max); DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); -unsigned int nf_conntrack_hash_rnd __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd); +static unsigned int nf_conntrack_hash_rnd __read_mostly; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple) { unsigned int n; + get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); + /* The direction must be ignored, so we hash everything up to the * destination ports (which is a multiple of 4) and treat the last * three bytes 
manually. @@ -815,21 +816,6 @@ restart: return dropped; } -void init_nf_conntrack_hash_rnd(void) -{ - unsigned int rand; - - /* - * Why not initialize nf_conntrack_rnd in a "init()" function ? - * Because there isn't enough entropy when system initializing, - * and we initialize it as late as possible. - */ - do { - get_random_bytes(&rand, sizeof(rand)); - } while (!rand); - cmpxchg(&nf_conntrack_hash_rnd, 0, rand); -} - static struct nf_conn * __nf_conntrack_alloc(struct net *net, const struct nf_conntrack_zone *zone, @@ -839,12 +825,6 @@ __nf_conntrack_alloc(struct net *net, { struct nf_conn *ct; - if (unlikely(!nf_conntrack_hash_rnd)) { - init_nf_conntrack_hash_rnd(); - /* recompute the hash as nf_conntrack_hash_rnd is initialized */ - hash = hash_conntrack_raw(orig); - } - /* We don't want any race condition at early drop stage */ atomic_inc(&net->ct.count); -- cgit v0.10.2 From 0e9091d6862f60499fa3faec7c2060c1929d0763 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Apr 2016 23:50:34 +0200 Subject: netfilter: nf_tables: introduce nft_setelem_parse_flags() helper This function parses the set element flags, thus, we can reuse the same handling when deleting elements. 
Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7a85a9d..1b3210b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3375,6 +3375,22 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem) } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); +static int nft_setelem_parse_flags(const struct nft_set *set, + const struct nlattr *attr, u32 *flags) +{ + if (attr == NULL) + return 0; + + *flags = ntohl(nla_get_be32(attr)); + if (*flags & ~NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + if (!(set->flags & NFT_SET_INTERVAL) && + *flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + + return 0; +} + static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -3388,8 +3404,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; + u32 flags = 0; u64 timeout; - u32 flags; u8 ulen; int err; @@ -3403,17 +3419,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_prepare(&tmpl); - flags = 0; - if (nla[NFTA_SET_ELEM_FLAGS] != NULL) { - flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS])); - if (flags & ~NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (!(set->flags & NFT_SET_INTERVAL) && - flags & NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (flags != 0) - nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); - } + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + if (flags != 0) + nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); if (set->flags & NFT_SET_MAP) { if (nla[NFTA_SET_ELEM_DATA] == NULL && -- cgit v0.10.2 From 3971ca14350062fc30b2dd3ca182234f17d268c2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Apr 2016 23:50:35 +0200 Subject: netfilter: nf_tables: parse element flags from nft_del_setelem() Parse flags and pass them to the set via ->deactivate() to 
check if we remove the right element from the intervals. Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1b3210b..73c8fad 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3592,9 +3592,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + struct nft_set_ext_tmpl tmpl; struct nft_data_desc desc; struct nft_set_elem elem; + struct nft_set_ext *ext; struct nft_trans *trans; + u32 flags = 0; + void *priv; int err; err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, @@ -3606,6 +3610,14 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_KEY] == NULL) goto err1; + nft_set_ext_prepare(&tmpl); + + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + if (flags != 0) + nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, nla[NFTA_SET_ELEM_KEY]); if (err < 0) @@ -3615,24 +3627,40 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) goto err2; + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len); + + err = -ENOMEM; + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, + GFP_KERNEL); + if (elem.priv == NULL) + goto err2; + + ext = nft_set_elem_ext(set, elem.priv); + if (flags) + *nft_set_ext_flags(ext) = flags; + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (trans == NULL) { err = -ENOMEM; - goto err2; + goto err3; } - elem.priv = set->ops->deactivate(set, &elem); - if (elem.priv == NULL) { + priv = set->ops->deactivate(set, &elem); + if (priv == NULL) { err = -ENOENT; - goto err3; + goto err4; } + kfree(elem.priv); + elem.priv = priv; nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); 
return 0; -err3: +err4: kfree(trans); +err3: + kfree(elem.priv); err2: nft_data_uninit(&elem.key.val, desc.type); err1: -- cgit v0.10.2 From ef1d20e0f8a80ba2942a59331d472322794d6748 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Apr 2016 23:50:36 +0200 Subject: netfilter: nft_rbtree: introduce nft_rbtree_interval_end() helper Add this new nft_rbtree_interval_end() helper function to check if the end interval flag is set. Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index 1c30f41..29f2ab8 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -29,6 +29,11 @@ struct nft_rbtree_elem { struct nft_set_ext ext; }; +static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) +{ + return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && + (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); +} static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) @@ -56,9 +61,7 @@ found: parent = parent->rb_left; continue; } - if (nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(&rbe->ext) & - NFT_SET_ELEM_INTERVAL_END) + if (nft_rbtree_interval_end(rbe)) goto out; spin_unlock_bh(&nft_rbtree_lock); -- cgit v0.10.2 From e701001e7cbe88cdc937037f6f398669eef7e7ff Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Apr 2016 23:50:37 +0200 Subject: netfilter: nft_rbtree: allow adjacent intervals with dynamic updates This patch fixes dynamic element updates for adjacent intervals in the rb-tree representation. Since elements are sorted in the rb-tree, in case of adjacent nodes with the same key, the assumption is that an interval end node must be placed before an interval opening. In tree lookup operations, the idea is to search for the closest element that is smaller than the one we're searching for.
Given that we'll have two possible matchings, we have to take the opening interval in case of adjacent nodes. Range merges are not trivial with the current representation, specifically we have to check if node extensions are equal and make sure we keep the existing internal states around. Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index 29f2ab8..f762094 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -35,6 +35,12 @@ static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); } +static bool nft_rbtree_equal(const struct nft_set *set, const void *this, + const struct nft_rbtree_elem *interval) +{ + return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; +} + static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { @@ -42,6 +48,7 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, const struct nft_rbtree_elem *rbe, *interval = NULL; const struct rb_node *parent; u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); + const void *this; int d; spin_lock_bh(&nft_rbtree_lock); @@ -49,9 +56,16 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); + this = nft_set_ext_key(&rbe->ext); + d = memcmp(this, key, set->klen); if (d < 0) { parent = parent->rb_left; + /* In case of adjacent ranges, we always see the high + * part of the range in first place, before the low one. + * So don't update interval if the keys are equal. 
+ */ + if (interval && nft_rbtree_equal(set, this, interval)) + continue; interval = rbe; } else if (d > 0) parent = parent->rb_right; @@ -101,9 +115,16 @@ static int __nft_rbtree_insert(const struct nft_set *set, else if (d > 0) p = &parent->rb_right; else { - if (nft_set_elem_active(&rbe->ext, genmask)) - return -EEXIST; - p = &parent->rb_left; + if (nft_set_elem_active(&rbe->ext, genmask)) { + if (nft_rbtree_interval_end(rbe) && + !nft_rbtree_interval_end(new)) + p = &parent->rb_left; + else if (!nft_rbtree_interval_end(rbe) && + nft_rbtree_interval_end(new)) + p = &parent->rb_right; + else + return -EEXIST; + } } } rb_link_node(&new->node, parent, p); @@ -148,7 +169,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, { const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; - struct nft_rbtree_elem *rbe; + struct nft_rbtree_elem *rbe, *this = elem->priv; u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); int d; @@ -166,6 +187,15 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, parent = parent->rb_left; continue; } + if (nft_rbtree_interval_end(rbe) && + !nft_rbtree_interval_end(this)) { + parent = parent->rb_left; + continue; + } else if (!nft_rbtree_interval_end(rbe) && + nft_rbtree_interval_end(this)) { + parent = parent->rb_right; + continue; + } nft_set_elem_change_active(set, &rbe->ext); return rbe; } -- cgit v0.10.2 From 3bb398d925ec73e42b778cf823c8f4aecae359ea Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 14 Apr 2016 17:13:41 +0200 Subject: netfilter: nf_ct_helper: disable automatic helper assignment Four years ago we introduced a new sysctl knob to disable automatic helper assignment in 72110dfaa907 ("netfilter: nf_ct_helper: disable automatic helper assignment"). This knob kept this behaviour enabled by default to remain conservative. 
This measure was introduced to provide a secure way to configure iptables and connection tracking helpers through explicit rules. Given the time we have waited for this, let's turn this off by default now; in the worst case, users still have a chance to recover the former behaviour by explicitly re-enabling it through sysctl. Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 3b40ec5..498bf74 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -38,10 +38,10 @@ unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; -static bool nf_ct_auto_assign_helper __read_mostly = true; +static bool nf_ct_auto_assign_helper __read_mostly = false; module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); MODULE_PARM_DESC(nf_conntrack_helper, - "Enable automatic conntrack helper assignment (default 1)"); + "Enable automatic conntrack helper assignment (default 0)"); #ifdef CONFIG_SYSCTL static struct ctl_table helper_sysctl_table[] = { -- cgit v0.10.2 From d2b484b577776f3c6f4d52505b27bad27ea1fe00 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Fri, 22 Apr 2016 02:56:57 -0700 Subject: netfilter: ip6t_SYNPROXY: unnecessary to check whether ip6_route_output returns NULL ip6_route_output() will never return a NULL pointer, so there's no need to check it.
Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 5d778dd..06bed74 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -60,7 +60,7 @@ synproxy_send_tcp(struct net *net, fl6.fl6_dport = nth->dest; security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); dst = ip6_route_output(net, NULL, &fl6); - if (dst == NULL || dst->error) { + if (dst->error) { dst_release(dst); goto free_nskb; } -- cgit v0.10.2 From 92b4423e3a0bc5d43ecde4bcad871f8b5ba04efd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 Apr 2016 10:39:34 +0200 Subject: netfilter: fix IS_ERR_VALUE usage This is a forward-port of the original patch from Andrzej Hajda, he said: "IS_ERR_VALUE should be used only with unsigned long type. Otherwise it can work incorrectly. To achieve this function xt_percpu_counter_alloc is modified to return unsigned long, and its result is assigned to temporary variable to perform error checking, before assigning to .pcnt field. The patch follows conclusion from discussion on LKML [1][2]. [1]: http://permalink.gmane.org/gmane.linux.kernel/2120927 [2]: http://permalink.gmane.org/gmane.linux.kernel/2150581" Original patch from Andrzej is here: http://patchwork.ozlabs.org/patch/582970/ This patch has clashed with input validation fixes for x_tables. Signed-off-by: Pablo Neira Ayuso diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 4dd9306..dc4f58a 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -380,16 +380,16 @@ static inline unsigned long ifname_compare_aligned(const char *_a, * allows us to return 0 for single core systems without forcing * callers to deal with SMP vs. NONSMP issues. 
*/ -static inline u64 xt_percpu_counter_alloc(void) +static inline unsigned long xt_percpu_counter_alloc(void) { if (nr_cpu_ids > 1) { void __percpu *res = __alloc_percpu(sizeof(struct xt_counters), sizeof(struct xt_counters)); if (res == NULL) - return (u64) -ENOMEM; + return -ENOMEM; - return (u64) (__force unsigned long) res; + return (__force unsigned long) res; } return 0; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 60f5161..3355ed7 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -513,11 +513,13 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) { struct xt_entry_target *t; struct xt_target *target; + unsigned long pcnt; int ret; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 735d1ee..21ccc19 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -656,10 +656,12 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + unsigned long pcnt; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; j = 0; mtpar.net = net; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 73e606c..17874e8 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -669,10 +669,12 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + unsigned long pcnt; - 
e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; j = 0; mtpar.net = net; -- cgit v0.10.2 From 1ad8f48df6f683f186b03b51381419ac4aec73d3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Apr 2016 11:59:53 +0200 Subject: netfilter: nftables: add connlabel set support Conntrack labels are currently sized depending on the iptables ruleset, i.e. if we're asked to test or set bits 1, 2, and 65 then we would allocate enough room to store at least bit 65. However, with nft, the input is just a register with arbitrary runtime content. We therefore ask for the upper ceiling we currently have, which is enough room to store 128 bits. Alternatively, we could alter nf_connlabel_replace to increase net->ct.label_words at run time, but since 128 bits is not that big we'd only save sizeof(long) so it doesn't seem worth it for now. This follows a similar approach that xtables 'connlabel' match uses, so when user inputs ct label set bar then we will set the bit used by the 'bar' label and leave the rest alone. This is done by passing the sreg content to nf_connlabels_replace as both value and mask argument. Labels (bits) already set thus cannot be re-set to zero, but this is not supported by xtables connlabel match either. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 25998fa..137e308 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -198,6 +198,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr, } break; #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + nf_connlabels_replace(ct, + &regs->data[priv->sreg], + &regs->data[priv->sreg], + NF_CT_LABELS_MAX_SIZE / sizeof(u32)); + break; +#endif default: break; } @@ -365,6 +373,16 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, len = FIELD_SIZEOF(struct nf_conn, mark); break; #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = NF_CT_LABELS_MAX_SIZE; + err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); + if (err) + return err; + break; +#endif default: return -EOPNOTSUPP; } @@ -384,6 +402,18 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, static void nft_ct_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { + struct nft_ct *priv = nft_expr_priv(expr); + + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + nf_connlabels_put(ctx->net); + break; +#endif + default: + break; + } + nft_ct_l3proto_module_put(ctx->afi->family); } -- cgit v0.10.2 From 2cf1234807bdd4ae5d3096a63c8fd5d4d5cad0ef Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 28 Apr 2016 19:13:40 +0200 Subject: netfilter: conntrack: keep BH enabled during lookup No need to disable BH here anymore: stats are switched to _ATOMIC variant (== this_cpu_inc()), which nowadays generates the same code as the non _ATOMIC NF_STAT, at least on x86.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1fd0ff1..1b63359 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -472,18 +472,13 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, struct hlist_nulls_node *n; unsigned int bucket = hash_bucket(hash, net); - /* Disable BHs the entire time since we normally need to disable them - * at least once for the stats anyway. - */ - local_bh_disable(); begin: hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { if (nf_ct_key_equal(h, tuple, zone)) { - NF_CT_STAT_INC(net, found); - local_bh_enable(); + NF_CT_STAT_INC_ATOMIC(net, found); return h; } - NF_CT_STAT_INC(net, searched); + NF_CT_STAT_INC_ATOMIC(net, searched); } /* * if the nulls value we got at the end of this lookup is @@ -491,10 +486,9 @@ begin: * We probably met an item that was moved to another chain. */ if (get_nulls_value(n) != bucket) { - NF_CT_STAT_INC(net, search_restart); + NF_CT_STAT_INC_ATOMIC(net, search_restart); goto begin; } - local_bh_enable(); return NULL; } @@ -735,22 +729,19 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, zone = nf_ct_zone(ignored_conntrack); hash = hash_conntrack(net, tuple); - /* Disable BHs the entire time since we need to disable them at - * least once for the stats anyway. 
- */ - rcu_read_lock_bh(); + rcu_read_lock(); hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple) && nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) { - NF_CT_STAT_INC(net, found); - rcu_read_unlock_bh(); + NF_CT_STAT_INC_ATOMIC(net, found); + rcu_read_unlock(); return 1; } - NF_CT_STAT_INC(net, searched); + NF_CT_STAT_INC_ATOMIC(net, searched); } - rcu_read_unlock_bh(); + rcu_read_unlock(); return 0; } -- cgit v0.10.2 From 5e3c61f981756361e7dc74e2c673121028449e35 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 28 Apr 2016 19:13:41 +0200 Subject: netfilter: conntrack: fix lookup race during hash resize When resizing the conntrack hash table at runtime via echo 42 > /sys/module/nf_conntrack/parameters/hashsize, we are racing with the conntrack lookup path -- reads can happen in parallel and nothing prevents readers from observing the newly allocated hash but the old size (or vice versa). So access to hash[bucket] can trigger OOB read access in case the table got expanded and we saw the new size but the old hash pointer (or it got shrunk and we got new hash ptr but the size of the old and larger table): kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN CPU: 0 PID: 3 Comm: ksoftirqd/0 Not tainted 4.6.0-rc2+ #107 [..] Call Trace: [] ? nf_conntrack_tuple_taken+0x12a/0xe90 [] ? nf_ct_invert_tuplepr+0x221/0x3a0 [] get_unique_tuple+0xfb3/0x2760 Use generation counter to obtain the address/length of the same table. Also add a synchronize_net before freeing the old hash.
AFAICS, without it we might access ct_hash[bucket] after ct_hash has been freed, provided that lockless reader got delayed by another event: CPU1 CPU2 seq_begin seq_retry resize occurs free oldhash for_each(oldhash[size]) Note that resize is only supported in init_netns, it took over 2 minutes of constant resizing+flooding to produce the warning, so this isn't a big problem in practice. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1b63359..29fa08b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -469,11 +469,18 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; - unsigned int bucket = hash_bucket(hash, net); + unsigned int bucket, sequence; begin: - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + bucket = hash_bucket(hash, net); + ct_hash = net->ct.hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { if (nf_ct_key_equal(h, tuple, zone)) { NF_CT_STAT_INC_ATOMIC(net, found); return h; @@ -722,15 +729,21 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, struct net *net = nf_ct_net(ignored_conntrack); const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_head *ct_hash; + unsigned int hash, sequence; struct hlist_nulls_node *n; struct nf_conn *ct; - unsigned int hash; zone = nf_ct_zone(ignored_conntrack); - hash = hash_conntrack(net, tuple); rcu_read_lock(); - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hash = 
hash_conntrack(net, tuple); + ct_hash = net->ct.hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple) && @@ -1607,6 +1620,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) nf_conntrack_all_unlock(); local_bh_enable(); + synchronize_net(); nf_ct_free_hashtable(old_hash, old_size); return 0; } -- cgit v0.10.2 From 88b68bc5237c84c6ff6f78568653780869a94a95 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 28 Apr 2016 19:13:42 +0200 Subject: netfilter: conntrack: don't attempt to iterate over empty table Once we place all conntracks into same table iteration becomes more costly because the table contains conntracks that we are not interested in (belonging to other netns). So don't bother scanning if the current namespace has no entries. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 29fa08b..f2e75a5 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1428,6 +1428,9 @@ void nf_ct_iterate_cleanup(struct net *net, might_sleep(); + if (atomic_read(&net->ct.count) == 0) + return; + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) -- cgit v0.10.2 From 868043485ecb7cda503af0dfb9e2804e0260196a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 28 Apr 2016 19:13:43 +0200 Subject: netfilter: conntrack: use nf_ct_key_equal() in more places This prepares for upcoming change that places all conntracks into a single, global table. For this to work we will need to also compare net pointer during lookup. To avoid open-coding such check use the nf_ct_key_equal helper and then later extend it to also consider net_eq. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f2e75a5..3b9c302 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -572,16 +572,13 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) /* See if there's one in the list already, including reverse */ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + zone)) goto out; + hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone)) goto out; add_timer(&ct->timeout); @@ -665,16 +662,13 @@ __nf_conntrack_confirm(struct sk_buff *skb) NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. 
*/ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + zone)) goto out; + hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone)) goto out; /* Timer relative to confirmation time, not original @@ -746,8 +740,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && - nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) { + nf_ct_key_equal(h, tuple, zone)) { NF_CT_STAT_INC_ATOMIC(net, found); rcu_read_unlock(); return 1; -- cgit v0.10.2 From 245cfdcaba2e7e4ee16b12af547ead37f9c501cd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 28 Apr 2016 19:13:44 +0200 Subject: netfilter: conntrack: small refactoring of conntrack seq_printf The iteration process is lockless, so we test if the conntrack object is eligible for printing (e.g. is AF_INET) after obtaining the reference count. Once we put all conntracks into same hash table we might see more entries that need to be skipped. So add a helper and first perform the test in a lockless fashion for fast skip. Once we obtain the reference count, just repeat the check. Note that this refactoring also includes a missing check for unconfirmed conntrack entries due to slab rcu object re-usage, so they need to be skipped since they are not part of the listing. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index f0dfe92..483cf79 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -114,6 +114,19 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) } #endif +static bool ct_seq_should_skip(const struct nf_conn *ct, + const struct nf_conntrack_tuple_hash *hash) +{ + /* we only want to print DIR_ORIGINAL */ + if (NF_CT_DIRECTION(hash)) + return true; + + if (nf_ct_l3num(ct) != AF_INET) + return true; + + return false; +} + static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; @@ -123,14 +136,15 @@ static int ct_seq_show(struct seq_file *s, void *v) int ret = 0; NF_CT_ASSERT(ct); - if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + if (ct_seq_should_skip(ct, hash)) return 0; + if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + return 0; - /* we only want to print DIR_ORIGINAL */ - if (NF_CT_DIRECTION(hash)) - goto release; - if (nf_ct_l3num(ct) != AF_INET) + /* check if we raced w. object reuse */ + if (!nf_ct_is_confirmed(ct) || + ct_seq_should_skip(ct, hash)) goto release; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); -- cgit v0.10.2 From e0c7d47221883966d930fa7335b3ca295bc316b2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 28 Apr 2016 19:13:45 +0200 Subject: netfilter: conntrack: check netns when comparing conntrack objects Once we place all conntracks in the same hash table we must also compare the netns pointer to skip conntracks that belong to a different namespace. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 483cf79..171aba1 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -115,6 +115,7 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) #endif static bool ct_seq_should_skip(const struct nf_conn *ct, + const struct net *net, const struct nf_conntrack_tuple_hash *hash) { /* we only want to print DIR_ORIGINAL */ @@ -124,6 +125,9 @@ static bool ct_seq_should_skip(const struct nf_conn *ct, if (nf_ct_l3num(ct) != AF_INET) return true; + if (!net_eq(nf_ct_net(ct), net)) + return true; + return false; } @@ -136,7 +140,7 @@ static int ct_seq_show(struct seq_file *s, void *v) int ret = 0; NF_CT_ASSERT(ct); - if (ct_seq_should_skip(ct, hash)) + if (ct_seq_should_skip(ct, seq_file_net(s), hash)) return 0; if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) @@ -144,7 +148,7 @@ static int ct_seq_show(struct seq_file *s, void *v) /* check if we raced w. 
object reuse */ if (!nf_ct_is_confirmed(ct) || - ct_seq_should_skip(ct, hash)) + ct_seq_should_skip(ct, seq_file_net(s), hash)) goto release; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3b9c302..10ae2ee 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -447,7 +447,8 @@ static void death_by_timeout(unsigned long ul_conntrack) static inline bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const struct nf_conntrack_zone *zone, + const struct net *net) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -456,7 +457,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, */ return nf_ct_tuple_equal(tuple, &h->tuple) && nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && - nf_ct_is_confirmed(ct); + nf_ct_is_confirmed(ct) && + net_eq(net, nf_ct_net(ct)); } /* @@ -481,7 +483,7 @@ begin: } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { - if (nf_ct_key_equal(h, tuple, zone)) { + if (nf_ct_key_equal(h, tuple, zone, net)) { NF_CT_STAT_INC_ATOMIC(net, found); return h; } @@ -517,7 +519,7 @@ begin: !atomic_inc_not_zero(&ct->ct_general.use))) h = NULL; else { - if (unlikely(!nf_ct_key_equal(h, tuple, zone))) { + if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) { nf_ct_put(ct); goto begin; } @@ -573,12 +575,12 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) /* See if there's one in the list already, including reverse */ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - zone)) + zone, net)) goto out; hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, - zone)) + zone, net)) goto out; add_timer(&ct->timeout); @@ 
-663,12 +665,12 @@ __nf_conntrack_confirm(struct sk_buff *skb) not in the hash. If there is, we lost race. */ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - zone)) + zone, net)) goto out; hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, - zone)) + zone, net)) goto out; /* Timer relative to confirmation time, not original @@ -740,7 +742,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && - nf_ct_key_equal(h, tuple, zone)) { + nf_ct_key_equal(h, tuple, zone, net)) { NF_CT_STAT_INC_ATOMIC(net, found); rcu_read_unlock(); return 1; @@ -1383,7 +1385,8 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) + if (net_eq(nf_ct_net(ct), net) && + iter(ct, data)) goto found; } } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 294a8e2..f6bbcb2 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -837,6 +837,9 @@ restart: if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); + if (!net_eq(net, nf_ct_net(ct))) + continue; + /* Dump entries of a given L3 protocol number. * If it is not specified, ie. l3proto == 0, * then dump everything. */ -- cgit v0.10.2 From 1b8c8a9f648c809c01a44114d7535ac8ca4c5ba3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 3 May 2016 00:25:58 +0200 Subject: netfilter: conntrack: make netns address part of hash Once we place all conntracks into a global hash table we want them to be spread across entire hash table, even if namespaces have overlapping ip addresses. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 10ae2ee..ebafa77 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -54,6 +54,7 @@ #include #include #include +#include #define NF_CONNTRACK_VERSION "0.5.0" @@ -144,9 +145,11 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); static unsigned int nf_conntrack_hash_rnd __read_mostly; -static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple) +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, + const struct net *net) { unsigned int n; + u32 seed; get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); @@ -154,32 +157,29 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple) * destination ports (which is a multiple of 4) and treat the last * three bytes manually. */ + seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^ + return jhash2((u32 *)tuple, n, seed ^ (((__force __u16)tuple->dst.u.all << 16) | tuple->dst.protonum)); } -static u32 __hash_bucket(u32 hash, unsigned int size) -{ - return reciprocal_scale(hash, size); -} - static u32 hash_bucket(u32 hash, const struct net *net) { - return __hash_bucket(hash, net->ct.htable_size); + return reciprocal_scale(hash, net->ct.htable_size); } -static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, - unsigned int size) +static u32 __hash_conntrack(const struct net *net, + const struct nf_conntrack_tuple *tuple, + unsigned int size) { - return __hash_bucket(hash_conntrack_raw(tuple), size); + return reciprocal_scale(hash_conntrack_raw(tuple, net), size); } -static inline u_int32_t hash_conntrack(const struct net *net, - const struct nf_conntrack_tuple *tuple) +static u32 hash_conntrack(const struct net *net, + const struct 
nf_conntrack_tuple *tuple) { - return __hash_conntrack(tuple, net->ct.htable_size); + return __hash_conntrack(net, tuple, net->ct.htable_size); } bool @@ -535,7 +535,7 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple)); + hash_conntrack_raw(tuple, net)); } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -1041,7 +1041,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - hash = hash_conntrack_raw(&tuple); + hash = hash_conntrack_raw(&tuple, net); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, @@ -1605,7 +1605,8 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); - bucket = __hash_conntrack(&h->tuple, hashsize); + bucket = __hash_conntrack(nf_ct_net(ct), + &h->tuple, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } -- cgit v0.10.2 From 56d52d4892d0e478a005b99ed10d0a7f488ea8c1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 2 May 2016 18:39:55 +0200 Subject: netfilter: conntrack: use a single hashtable for all namespaces We already include netns address in the hash and compare the netns pointers during lookup, so even if namespaces have overlapping addresses entries will be spread across the table. Assuming 64k bucket size, this change saves 0.5 mbyte per namespace on a 64bit system. The NAT bysrc and expectation hashes are still per namespace; those will be changed soon, too. A future patch will also make conntrack object slab cache global again.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 62e17d1..3e2f332 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -81,6 +81,7 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, #define CONNTRACK_LOCKS 1024 +extern struct hlist_nulls_head *nf_conntrack_hash; extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; void nf_conntrack_lock(spinlock_t *lock); diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index b052785..251c435 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -93,9 +93,7 @@ struct netns_ct { int sysctl_tstamp; int sysctl_checksum; - unsigned int htable_size; struct kmem_cache *nf_conntrack_cachep; - struct hlist_nulls_head *hash; struct hlist_head *expect_hash; struct ct_pcpu __percpu *pcpu_lists; struct ip_conntrack_stat __percpu *stat; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index e3c46e8..ae1a71a 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -360,7 +360,7 @@ static int ipv4_init_net(struct net *net) in->ctl_table[0].data = &nf_conntrack_max; in->ctl_table[1].data = &net->ct.count; - in->ctl_table[2].data = &net->ct.htable_size; + in->ctl_table[2].data = &nf_conntrack_htable_size; in->ctl_table[3].data = &net->ct.sysctl_checksum; in->ctl_table[4].data = &net->ct.sysctl_log_invalid; #endif diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 171aba1..f8fc7ab 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -31,15 +31,14 @@ struct ct_iter_state { static struct hlist_nulls_node *ct_get_first(struct seq_file 
*seq) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < net->ct.htable_size; + st->bucket < nf_conntrack_htable_size; st->bucket++) { n = rcu_dereference( - hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -49,17 +48,16 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct hlist_nulls_node *head) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= net->ct.htable_size) + if (++st->bucket >= nf_conntrack_htable_size) return NULL; } head = rcu_dereference( - hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); } return head; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ebafa77..4c906e7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -69,6 +69,9 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks); __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); +struct hlist_nulls_head *nf_conntrack_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_hash); + static __read_mostly spinlock_t nf_conntrack_locks_all_lock; static __read_mostly seqcount_t nf_conntrack_generation; static __read_mostly bool nf_conntrack_locks_all; @@ -164,9 +167,9 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, tuple->dst.protonum)); } -static u32 hash_bucket(u32 hash, const struct net *net) +static u32 scale_hash(u32 hash) { - return reciprocal_scale(hash, net->ct.htable_size); + return reciprocal_scale(hash, 
nf_conntrack_htable_size); } static u32 __hash_conntrack(const struct net *net, @@ -179,7 +182,7 @@ static u32 __hash_conntrack(const struct net *net, static u32 hash_conntrack(const struct net *net, const struct nf_conntrack_tuple *tuple) { - return __hash_conntrack(net, tuple, net->ct.htable_size); + return scale_hash(hash_conntrack_raw(tuple, net)); } bool @@ -478,8 +481,8 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, begin: do { sequence = read_seqcount_begin(&nf_conntrack_generation); - bucket = hash_bucket(hash, net); - ct_hash = net->ct.hash; + bucket = scale_hash(hash); + ct_hash = nf_conntrack_hash; } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { @@ -543,12 +546,10 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, unsigned int reply_hash) { - struct net *net = nf_ct_net(ct); - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &net->ct.hash[hash]); + &nf_conntrack_hash[hash]); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, - &net->ct.hash[reply_hash]); + &nf_conntrack_hash[reply_hash]); } int @@ -573,12 +574,12 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* See if there's one in the list already, including reverse */ - hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; @@ -633,7 +634,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) sequence = read_seqcount_begin(&nf_conntrack_generation); /* reuse the hash 
saved before */ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; - hash = hash_bucket(hash, net); + hash = scale_hash(hash); reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -663,12 +664,12 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; @@ -736,7 +737,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, tuple); - ct_hash = net->ct.hash; + ct_hash = nf_conntrack_hash; } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { @@ -773,16 +774,16 @@ static noinline int early_drop(struct net *net, unsigned int _hash) local_bh_disable(); restart: sequence = read_seqcount_begin(&nf_conntrack_generation); - hash = hash_bucket(_hash, net); - for (; i < net->ct.htable_size; i++) { + hash = scale_hash(_hash); + for (; i < nf_conntrack_htable_size; i++) { lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { spin_unlock(lockp); goto restart; } - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], - hnnode) { + hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], + hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); if (!test_bit(IPS_ASSURED_BIT, 
&tmp->status) && !nf_ct_is_dying(tmp) && @@ -793,7 +794,7 @@ restart: cnt++; } - hash = (hash + 1) % net->ct.htable_size; + hash = (hash + 1) % nf_conntrack_htable_size; spin_unlock(lockp); if (ct || cnt >= NF_CT_EVICTION_RANGE) @@ -1376,12 +1377,12 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), int cpu; spinlock_t *lockp; - for (; *bucket < net->ct.htable_size; (*bucket)++) { + for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); nf_conntrack_lock(lockp); - if (*bucket < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { + if (*bucket < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); @@ -1478,6 +1479,8 @@ void nf_conntrack_cleanup_end(void) while (untrack_refs() > 0) schedule(); + nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); + #ifdef CONFIG_NF_CONNTRACK_ZONES nf_ct_extend_unregister(&nf_ct_zone_extend); #endif @@ -1528,7 +1531,6 @@ i_see_dead_people: } list_for_each_entry(net, net_exit_list, exit_list) { - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); nf_conntrack_proto_pernet_fini(net); nf_conntrack_helper_pernet_fini(net); nf_conntrack_ecache_pernet_fini(net); @@ -1599,10 +1601,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) * though since that required taking the locks. 
*/ - for (i = 0; i < init_net.ct.htable_size; i++) { - while (!hlist_nulls_empty(&init_net.ct.hash[i])) { - h = hlist_nulls_entry(init_net.ct.hash[i].first, - struct nf_conntrack_tuple_hash, hnnode); + for (i = 0; i < nf_conntrack_htable_size; i++) { + while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { + h = hlist_nulls_entry(nf_conntrack_hash[i].first, + struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); bucket = __hash_conntrack(nf_ct_net(ct), @@ -1610,11 +1612,11 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } - old_size = init_net.ct.htable_size; - old_hash = init_net.ct.hash; + old_size = nf_conntrack_htable_size; + old_hash = nf_conntrack_hash; - init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; - init_net.ct.hash = hash; + nf_conntrack_hash = hash; + nf_conntrack_htable_size = hashsize; write_seqcount_end(&nf_conntrack_generation); nf_conntrack_all_unlock(); @@ -1670,6 +1672,11 @@ int nf_conntrack_init_start(void) * entries. 
*/ max_factor = 4; } + + nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); + if (!nf_conntrack_hash) + return -ENOMEM; + nf_conntrack_max = max_factor * nf_conntrack_htable_size; printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n", @@ -1748,6 +1755,7 @@ err_tstamp: err_acct: nf_conntrack_expect_fini(); err_expect: + nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); return ret; } @@ -1800,12 +1808,6 @@ int nf_conntrack_init_net(struct net *net) goto err_cache; } - net->ct.htable_size = nf_conntrack_htable_size; - net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); - if (!net->ct.hash) { - printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); - goto err_hash; - } ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) goto err_expect; @@ -1837,8 +1839,6 @@ err_tstamp: err_acct: nf_conntrack_expect_pernet_fini(net); err_expect: - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); -err_hash: kmem_cache_destroy(net->ct.nf_conntrack_cachep); err_cache: kfree(net->ct.slabname); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 498bf74..cb48e6a 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -424,10 +424,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, spin_unlock_bh(&pcpu->lock); } local_bh_disable(); - for (i = 0; i < net->ct.htable_size; i++) { + for (i = 0; i < nf_conntrack_htable_size; i++) { nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + if (i < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) unhelp(h, me); } spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index f6bbcb2..e00f178 100644 --- 
a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -824,16 +824,16 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conn *)cb->args[1]; local_bh_disable(); - for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); - if (cb->args[0] >= net->ct.htable_size) { + if (cb->args[0] >= nf_conntrack_htable_size) { spin_unlock(lockp); goto out; } - hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], - hnnode) { + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]], + hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0f1a45b..f87e84e 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -54,14 +54,13 @@ struct ct_iter_state { static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < net->ct.htable_size; + st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -71,18 +70,17 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct hlist_nulls_node *head) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= net->ct.htable_size) 
+ if (++st->bucket >= nf_conntrack_htable_size) return NULL; } head = rcu_dereference( hlist_nulls_first_rcu( - &net->ct.hash[st->bucket])); + &nf_conntrack_hash[st->bucket])); } return head; } @@ -458,7 +456,7 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, { .procname = "nf_conntrack_buckets", - .data = &init_net.ct.htable_size, + .data = &nf_conntrack_htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, .proc_handler = proc_dointvec, @@ -512,7 +510,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) goto out_kmemdup; table[1].data = &net->ct.count; - table[2].data = &net->ct.htable_size; table[3].data = &net->ct.sysctl_checksum; table[4].data = &net->ct.sysctl_log_invalid; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 3d52271..d74e716 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -824,7 +824,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, static int __net_init nf_nat_net_init(struct net *net) { /* Leave them the same for the moment. 
*/ - net->ct.nat_htable_size = net->ct.htable_size; + net->ct.nat_htable_size = nf_conntrack_htable_size; net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0); if (!net->ct.nat_bysource) return -ENOMEM; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 2671b9d..3c84f14 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -306,10 +306,10 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) int i; local_bh_disable(); - for (i = 0; i < net->ct.htable_size; i++) { + for (i = 0; i < nf_conntrack_htable_size; i++) { nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + if (i < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) untimeout(h, timeout); } spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); -- cgit v0.10.2 From 3e86638e9a0be8bcf7db007909d8307b8b9f8e3b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 2 May 2016 18:40:14 +0200 Subject: netfilter: conntrack: consider ct netns in early_drop logic When iterating, skip conntrack entries living in a different netns. We could ignore netns and kill some other non-assured one, but it has two problems: - a netns can kill non-assured conntracks in other namespace - we would start to 'over-subscribe' the affected/overlimit netns. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 4c906e7..e3787cf 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -764,18 +764,20 @@ static noinline int early_drop(struct net *net, unsigned int _hash) { /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; - struct nf_conn *ct = NULL, *tmp; + struct nf_conn *tmp; struct hlist_nulls_node *n; - unsigned int i = 0, cnt = 0; - int dropped = 0; - unsigned int hash, sequence; + unsigned int i, hash, sequence; + struct nf_conn *ct = NULL; spinlock_t *lockp; + bool ret = false; + + i = 0; local_bh_disable(); restart: sequence = read_seqcount_begin(&nf_conntrack_generation); - hash = scale_hash(_hash); - for (; i < nf_conntrack_htable_size; i++) { + for (; i < NF_CT_EVICTION_RANGE; i++) { + hash = scale_hash(_hash++); lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { @@ -785,35 +787,40 @@ restart: hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); - if (!test_bit(IPS_ASSURED_BIT, &tmp->status) && - !nf_ct_is_dying(tmp) && - atomic_inc_not_zero(&tmp->ct_general.use)) { + + if (test_bit(IPS_ASSURED_BIT, &tmp->status) || + !net_eq(nf_ct_net(tmp), net) || + nf_ct_is_dying(tmp)) + continue; + + if (atomic_inc_not_zero(&tmp->ct_general.use)) { ct = tmp; break; } - cnt++; } - hash = (hash + 1) % nf_conntrack_htable_size; spin_unlock(lockp); - - if (ct || cnt >= NF_CT_EVICTION_RANGE) + if (ct) break; - } + local_bh_enable(); if (!ct) - return dropped; + return false; - if (del_timer(&ct->timeout)) { + /* kill only if in same netns -- might have moved due to + * SLAB_DESTROY_BY_RCU rules + */ + if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) { if (nf_ct_delete(ct, 0, 0)) { - dropped = 1; 
NF_CT_STAT_INC_ATOMIC(net, early_drop); + ret = true; } } + nf_ct_put(ct); - return dropped; + return ret; } static struct nf_conn * -- cgit v0.10.2 From 4b4ceb9dbf6a549682edff9fc5f04c204da50ab9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 1 May 2016 00:34:37 +0200 Subject: netfilter: conntrack: __nf_ct_l4proto_find() always returns valid pointer Remove the unnecessary check for a non-NULL pointer in destroy_conntrack() given that __nf_ct_l4proto_find() returns the generic protocol tracker if the protocol is not supported. Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e3787cf..f72ede1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -363,7 +363,7 @@ destroy_conntrack(struct nf_conntrack *nfct) } rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); - if (l4proto && l4proto->destroy) + if (l4proto->destroy) l4proto->destroy(ct); rcu_read_unlock(); -- cgit v0.10.2 From ba76738c032ec0af3acbecd85c429c6a5c9e5e5e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 2 May 2016 21:28:57 +0200 Subject: netfilter: conntrack: introduce nf_ct_acct_update() Introduce a helper function to update conntrack counters. __nf_ct_kill_acct() was unnecessarily subtracting skb_network_offset(), which is expected to be zero from the ipv4/ipv6 hooks.
Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f72ede1..25e0c26 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -602,6 +602,21 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); +static inline void nf_ct_acct_update(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int len) +{ + struct nf_conn_acct *acct; + + acct = nf_conn_acct_find(ct); + if (acct) { + struct nf_conn_counter *counter = acct->counter; + + atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); + atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes); + } +} + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) @@ -1258,17 +1273,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, } acct: - if (do_acct) { - struct nf_conn_acct *acct; - - acct = nf_conn_acct_find(ct); - if (acct) { - struct nf_conn_counter *counter = acct->counter; - - atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); - atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes); - } - } + if (do_acct) + nf_ct_acct_update(ct, ctinfo, skb->len); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -1277,18 +1283,8 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, const struct sk_buff *skb, int do_acct) { - if (do_acct) { - struct nf_conn_acct *acct; - - acct = nf_conn_acct_find(ct); - if (acct) { - struct nf_conn_counter *counter = acct->counter; - - atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); - atomic64_add(skb->len - skb_network_offset(skb), - &counter[CTINFO2DIR(ctinfo)].bytes); - } - } + if (do_acct) + nf_ct_acct_update(ct, ctinfo, skb->len); if (del_timer(&ct->timeout)) { ct->timeout.function((unsigned long)ct); -- cgit v0.10.2 From 71d8c47fc653711c41bc3282e5b0e605b3727956 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 1 May 2016 00:28:40 +0200 Subject: netfilter: conntrack: introduce clash resolution on insertion race This patch 
introduces nf_ct_resolve_clash() to resolve race conditions on conntrack insertions. This is particularly a problem for connection-less protocols such as UDP, with no initial handshake. Two or more packets may race to insert the entry resulting in packet drops. Another problematic scenario is packets enqueued to userspace via NFQUEUE after the raw table, which makes it easier to trigger this race. To resolve this, the idea is to reset the conntrack entry to the one that won the race. Packet and byte counters are also merged. The 'insert_failed' stat still accounts for this situation; after this patch, the drop counter is bumped whenever we drop packets, so we can watch for unresolved clashes. Signed-off-by: Pablo Neira Ayuso diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 956d8a6..1a5fb36 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -23,6 +23,9 @@ struct nf_conntrack_l4proto { /* L4 Protocol number. */ u_int8_t l4proto; + /* Resolve clashes on insertion races. */ + bool allow_clash; + /* Try to fill in the third arg: dataoff is offset past network protocol hdr. Return true if possible. */ bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int dataoff, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 25e0c26..f58a704 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -617,6 +617,48 @@ static inline void nf_ct_acct_update(struct nf_conn *ct, } } +static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + const struct nf_conn *loser_ct) +{ + struct nf_conn_acct *acct; + + acct = nf_conn_acct_find(loser_ct); + if (acct) { + struct nf_conn_counter *counter = acct->counter; + enum ip_conntrack_info ctinfo; + unsigned int bytes; + + /* u32 should be fine since we must have seen one packet.
*/ + bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); + nf_ct_acct_update(ct, ctinfo, bytes); + } +} + +/* Resolve race on insertion if this protocol allows this. */ +static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_tuple_hash *h) +{ + /* This is the conntrack entry already in hashes that won race. */ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct nf_conntrack_l4proto *l4proto; + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto->allow_clash && + !nf_ct_is_dying(ct) && + atomic_inc_not_zero(&ct->ct_general.use)) { + nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct); + nf_conntrack_put(skb->nfct); + /* Assign conntrack already in hashes to this skbuff. Don't + * modify skb->nfctinfo to ensure consistent stateful filtering. + */ + skb->nfct = &ct->ct_general; + return NF_ACCEPT; + } + NF_CT_STAT_INC(net, drop); + return NF_DROP; +} + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) @@ -631,6 +673,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) enum ip_conntrack_info ctinfo; struct net *net; unsigned int sequence; + int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); net = nf_ct_net(ct); @@ -673,8 +716,10 @@ __nf_conntrack_confirm(struct sk_buff *skb) */ nf_ct_del_from_dying_or_unconfirmed_list(ct); - if (unlikely(nf_ct_is_dying(ct))) - goto out; + if (unlikely(nf_ct_is_dying(ct))) { + nf_ct_add_to_dying_list(ct); + goto dying; + } /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're @@ -725,10 +770,12 @@ __nf_conntrack_confirm(struct sk_buff *skb) out: nf_ct_add_to_dying_list(ct); + ret = nf_ct_resolve_clash(net, skb, ctinfo, h); +dying: nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); local_bh_enable(); - return NF_DROP; + return ret; } 
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 478f92f..4fd0405 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -309,6 +309,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = .l3proto = PF_INET, .l4proto = IPPROTO_UDP, .name = "udp", + .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, @@ -341,6 +342,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = .l3proto = PF_INET6, .l4proto = IPPROTO_UDP, .name = "udp", + .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 1ac8ee1..9d692f5 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -274,6 +274,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = .l3proto = PF_INET, .l4proto = IPPROTO_UDPLITE, .name = "udplite", + .allow_clash = true, .pkt_to_tuple = udplite_pkt_to_tuple, .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, @@ -306,6 +307,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .l3proto = PF_INET6, .l4proto = IPPROTO_UDPLITE, .name = "udplite", + .allow_clash = true, .pkt_to_tuple = udplite_pkt_to_tuple, .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, -- cgit v0.10.2 From 3b78155b1b3688dbe910fecdc3e003f431b46630 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 3 May 2016 11:13:29 +0200 Subject: openvswitch: __nf_ct_l{3,4}proto_find() always return a valid pointer If the protocol is not natively supported, this assigns generic protocol tracker so we can always assume a valid pointer after these 
calls. Signed-off-by: Pablo Neira Ayuso Acked-by: Jarno Rajahalme Acked-by: Joe Stringer diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 9741a76..9f0bc49 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -439,20 +439,12 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, u8 protonum; l3proto = __nf_ct_l3proto_find(l3num); - if (!l3proto) { - pr_debug("ovs_ct_find_existing: Can't get l3proto\n"); - return NULL; - } if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, &protonum) <= 0) { pr_debug("ovs_ct_find_existing: Can't get protonum\n"); return NULL; } l4proto = __nf_ct_l4proto_find(l3num, protonum); - if (!l4proto) { - pr_debug("ovs_ct_find_existing: Can't get l4proto\n"); - return NULL; - } if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, protonum, net, &tuple, l3proto, l4proto)) { pr_debug("ovs_ct_find_existing: Can't get tuple\n"); -- cgit v0.10.2 From d7cdf81657776ca1aa8377fd84d02fd8774db483 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 3 May 2016 13:54:23 +0200 Subject: netfilter: x_tables: get rid of old and inconsistent debugging The dprintf() and duprintf() functions are enabled at compile time, these days we have better runtime debugging through pr_debug() and static keys. On top of this, this debugging is so old that I don't expect anyone using this anymore, so let's get rid of this. IP_NF_ASSERT() is still left in place, although this needs that NETFILTER_DEBUG is enabled, I think these assertions provide useful context information when reading the code. Note that ARP_NF_ASSERT() has been removed as there is no user of this. Kill also DEBUG_ALLOW_ALL and a couple of pr_error() and pr_debug() spots that are inconsistently placed in the code. 
Signed-off-by: Pablo Neira Ayuso diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3355ed7..2033f92 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -34,27 +34,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("David S. Miller "); MODULE_DESCRIPTION("arptables core"); -/*#define DEBUG_ARP_TABLES*/ -/*#define DEBUG_ARP_TABLES_USER*/ - -#ifdef DEBUG_ARP_TABLES -#define dprintf(format, args...) pr_debug(format, ## args) -#else -#define dprintf(format, args...) -#endif - -#ifdef DEBUG_ARP_TABLES_USER -#define duprintf(format, args...) pr_debug(format, ## args) -#else -#define duprintf(format, args...) -#endif - -#ifdef CONFIG_NETFILTER_DEBUG -#define ARP_NF_ASSERT(x) WARN_ON(!(x)) -#else -#define ARP_NF_ASSERT(x) -#endif - void *arpt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(arpt, ARPT); @@ -113,36 +92,20 @@ static inline int arp_packet_match(const struct arphdr *arphdr, #define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, - ARPT_INV_ARPOP)) { - dprintf("ARP operation field mismatch.\n"); - dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n", - arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask); + ARPT_INV_ARPOP)) return 0; - } if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd, - ARPT_INV_ARPHRD)) { - dprintf("ARP hardware address format mismatch.\n"); - dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n", - arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask); + ARPT_INV_ARPHRD)) return 0; - } if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro, - ARPT_INV_ARPPRO)) { - dprintf("ARP protocol address format mismatch.\n"); - dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n", - arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask); + ARPT_INV_ARPPRO)) return 0; - } if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) 
!= arpinfo->arhln, - ARPT_INV_ARPHLN)) { - dprintf("ARP hardware address length mismatch.\n"); - dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n", - arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask); + ARPT_INV_ARPHLN)) return 0; - } src_devaddr = arpptr; arpptr += dev->addr_len; @@ -155,49 +118,25 @@ static inline int arp_packet_match(const struct arphdr *arphdr, if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len), ARPT_INV_SRCDEVADDR) || FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len), - ARPT_INV_TGTDEVADDR)) { - dprintf("Source or target device address mismatch.\n"); - + ARPT_INV_TGTDEVADDR)) return 0; - } if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr, ARPT_INV_SRCIP) || FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr), - ARPT_INV_TGTIP)) { - dprintf("Source or target IP address mismatch.\n"); - - dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", - &src_ipaddr, - &arpinfo->smsk.s_addr, - &arpinfo->src.s_addr, - arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : ""); - dprintf("TGT: %pI4 Mask: %pI4 Target: %pI4.%s\n", - &tgt_ipaddr, - &arpinfo->tmsk.s_addr, - &arpinfo->tgt.s_addr, - arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : ""); + ARPT_INV_TGTIP)) return 0; - } /* Look for ifname matches. */ ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { - dprintf("VIA in mismatch (%s vs %s).%s\n", - indev, arpinfo->iniface, - arpinfo->invflags & ARPT_INV_VIA_IN ? " (INV)" : ""); + if (FWINV(ret != 0, ARPT_INV_VIA_IN)) return 0; - } ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { - dprintf("VIA out mismatch (%s vs %s).%s\n", - outdev, arpinfo->outiface, - arpinfo->invflags & ARPT_INV_VIA_OUT ? 
" (INV)" : ""); + if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) return 0; - } return 1; #undef FWINV @@ -205,16 +144,10 @@ static inline int arp_packet_match(const struct arphdr *arphdr, static inline int arp_checkentry(const struct arpt_arp *arp) { - if (arp->flags & ~ARPT_F_MASK) { - duprintf("Unknown flag bits set: %08X\n", - arp->flags & ~ARPT_F_MASK); + if (arp->flags & ~ARPT_F_MASK) return 0; - } - if (arp->invflags & ~ARPT_INV_MASK) { - duprintf("Unknown invflag bits set: %08X\n", - arp->invflags & ~ARPT_INV_MASK); + if (arp->invflags & ~ARPT_INV_MASK) return 0; - } return 1; } @@ -406,11 +339,9 @@ static int mark_source_chains(const struct xt_table_info *newinfo, = (void *)arpt_get_target_c(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) { - pr_notice("arptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) return 0; - } + e->comefrom |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); @@ -423,12 +354,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if ((strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) { - duprintf("mark_source_chains: bad " - "negative verdict (%i)\n", - t->verdict); + t->verdict < -NF_MAX_VERDICT - 1) return 0; - } /* Return: backtrack through the last * big jump. @@ -462,8 +389,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. 
*/ - duprintf("Jump rule %u -> %u\n", - pos, newpos); e = (struct arpt_entry *) (entry0 + newpos); if (!find_jump_target(newinfo, e)) @@ -480,8 +405,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } -next: - duprintf("Finished chain %u\n", hook); +next: ; } return 1; } @@ -489,7 +413,6 @@ next: static inline int check_target(struct arpt_entry *e, const char *name) { struct xt_entry_target *t = arpt_get_target(e); - int ret; struct xt_tgchk_param par = { .table = name, .entryinfo = e, @@ -499,13 +422,7 @@ static inline int check_target(struct arpt_entry *e, const char *name) .family = NFPROTO_ARP, }; - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); - if (ret < 0) { - duprintf("arp_tables: check failed for `%s'.\n", - t->u.kernel.target->name); - return ret; - } - return 0; + return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); } static inline int @@ -525,7 +442,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = PTR_ERR(target); goto out; } @@ -571,17 +487,12 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p\n", e); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset - < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) return -EINVAL; - } if (!arp_checkentry(&e->arp)) return -EINVAL; @@ -598,12 +509,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, if ((unsigned char *)e - base 
== hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { - if (!check_underflow(e)) { - pr_debug("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + if (!check_underflow(e)) return -EINVAL; - } + newinfo->underflow[h] = underflows[h]; } } @@ -648,7 +556,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } - duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ @@ -665,31 +572,21 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } - duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) return ret; - if (i != repl->num_entries) { - duprintf("translate_table: %u not %u entries\n", - i, repl->num_entries); + if (i != repl->num_entries) return -EINVAL; - } /* Check hooks all assigned */ for (i = 0; i < NF_ARP_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(repl->valid_hooks & (1 << i))) continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) { - duprintf("Invalid hook entry %u %u\n", - i, repl->hook_entry[i]); + if (newinfo->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; - } - if (newinfo->underflow[i] == 0xFFFFFFFF) { - duprintf("Invalid underflow %u %u\n", - i, repl->underflow[i]); + if (newinfo->underflow[i] == 0xFFFFFFFF) return -EINVAL; - } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) @@ -897,11 +794,8 @@ static int get_info(struct net *net, void __user *user, struct xt_table *t; int ret; - if (*len != sizeof(struct arpt_getinfo)) { - duprintf("length %u != %Zu\n", *len, - sizeof(struct arpt_getinfo)); + if (*len != sizeof(struct arpt_getinfo)) return -EINVAL; - } if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; @@ -957,33 +851,25 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, struct 
arpt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct arpt_get_entries) + get.size) { - duprintf("get_entries: %u != %Zu\n", *len, - sizeof(struct arpt_get_entries) + get.size); + if (*len != sizeof(struct arpt_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", - private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, get.size); + else ret = -EAGAIN; - } + module_put(t->me); xt_table_unlock(t); } else @@ -1021,8 +907,6 @@ static int __do_replace(struct net *net, const char *name, /* You lied! 
*/ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", - valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } @@ -1032,8 +916,6 @@ static int __do_replace(struct net *net, const char *name, goto put_module; /* Update module usage count based on number of rules */ - duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", - oldinfo->number, oldinfo->initial_entries, newinfo->number); if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); @@ -1103,8 +985,6 @@ static int do_replace(struct net *net, const void __user *user, if (ret != 0) goto free_newinfo; - duprintf("arp_tables: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) @@ -1202,20 +1082,14 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, unsigned int entry_offset; int ret, off; - duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p, limit = %p\n", e, limit); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset < sizeof(struct compat_arpt_entry) + - sizeof(struct compat_xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + sizeof(struct compat_xt_entry_target)) return -EINVAL; - } if (!arp_checkentry(&e->arp)) return -EINVAL; @@ -1232,8 +1106,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); ret = PTR_ERR(target); goto out; } @@ -1303,7 +1175,6 @@ static int translate_compat_table(struct xt_table_info **pinfo, size = compatr->size; 
info->number = compatr->num_entries; - duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(NFPROTO_ARP); xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries); @@ -1318,11 +1189,8 @@ static int translate_compat_table(struct xt_table_info **pinfo, } ret = -EINVAL; - if (j != compatr->num_entries) { - duprintf("translate_compat_table: %u not %u entries\n", - j, compatr->num_entries); + if (j != compatr->num_entries) goto out_unlock; - } ret = -ENOMEM; newinfo = xt_alloc_table_info(size); @@ -1413,8 +1281,6 @@ static int compat_do_replace(struct net *net, void __user *user, if (ret != 0) goto free_newinfo; - duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) @@ -1447,7 +1313,6 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1530,17 +1395,13 @@ static int compat_get_entries(struct net *net, struct compat_arpt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct compat_arpt_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct compat_arpt_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(NFPROTO_ARP); @@ -1549,16 +1410,13 @@ static int compat_get_entries(struct net *net, const struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); if (!ret && get.size == info.size) { ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); - } 
else if (!ret) { - duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, get.size); + } else if (!ret) ret = -EAGAIN; - } + xt_compat_flush_offsets(NFPROTO_ARP); module_put(t->me); xt_table_unlock(t); @@ -1610,7 +1468,6 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned break; default: - duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1653,7 +1510,6 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len } default: - duprintf("do_arpt_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1698,7 +1554,6 @@ int arpt_register_table(struct net *net, memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(newinfo, loc_cpu_entry, repl); - duprintf("arpt_register_table: translate table gives %d\n", ret); if (ret != 0) goto out_free; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 21ccc19..54906e0 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -35,34 +35,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("IPv4 packet filter"); -/*#define DEBUG_IP_FIREWALL*/ -/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ -/*#define DEBUG_IP_FIREWALL_USER*/ - -#ifdef DEBUG_IP_FIREWALL -#define dprintf(format, args...) pr_info(format , ## args) -#else -#define dprintf(format, args...) -#endif - -#ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) pr_info(format , ## args) -#else -#define duprintf(format, args...) -#endif - #ifdef CONFIG_NETFILTER_DEBUG #define IP_NF_ASSERT(x) WARN_ON(!(x)) #else #define IP_NF_ASSERT(x) #endif -#if 0 -/* All the better to debug you with... 
*/ -#define static -#define inline -#endif - void *ipt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ipt, IPT); @@ -85,52 +63,28 @@ ip_packet_match(const struct iphdr *ip, if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, IPT_INV_SRCIP) || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, - IPT_INV_DSTIP)) { - dprintf("Source or dest mismatch.\n"); - - dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", - &ip->saddr, &ipinfo->smsk.s_addr, &ipinfo->src.s_addr, - ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : ""); - dprintf("DST: %pI4 Mask: %pI4 Target: %pI4.%s\n", - &ip->daddr, &ipinfo->dmsk.s_addr, &ipinfo->dst.s_addr, - ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : ""); + IPT_INV_DSTIP)) return false; - } ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_IN)) { - dprintf("VIA in mismatch (%s vs %s).%s\n", - indev, ipinfo->iniface, - ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : ""); + if (FWINV(ret != 0, IPT_INV_VIA_IN)) return false; - } ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { - dprintf("VIA out mismatch (%s vs %s).%s\n", - outdev, ipinfo->outiface, - ipinfo->invflags & IPT_INV_VIA_OUT ? " (INV)" : ""); + if (FWINV(ret != 0, IPT_INV_VIA_OUT)) return false; - } /* Check specific protocol */ if (ipinfo->proto && - FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { - dprintf("Packet protocol %hi does not match %hi.%s\n", - ip->protocol, ipinfo->proto, - ipinfo->invflags & IPT_INV_PROTO ? " (INV)" : ""); + FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) return false; - } /* If we have a fragment rule but the packet is not a fragment * then we return zero */ - if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) { - dprintf("Fragment rule but not fragment.%s\n", - ipinfo->invflags & IPT_INV_FRAG ? 
" (INV)" : ""); + if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) return false; - } return true; } @@ -138,16 +92,10 @@ ip_packet_match(const struct iphdr *ip, static bool ip_checkentry(const struct ipt_ip *ip) { - if (ip->flags & ~IPT_F_MASK) { - duprintf("Unknown flag bits set: %08X\n", - ip->flags & ~IPT_F_MASK); + if (ip->flags & ~IPT_F_MASK) return false; - } - if (ip->invflags & ~IPT_INV_MASK) { - duprintf("Unknown invflag bits set: %08X\n", - ip->invflags & ~IPT_INV_MASK); + if (ip->invflags & ~IPT_INV_MASK) return false; - } return true; } @@ -346,10 +294,6 @@ ipt_do_table(struct sk_buff *skb, e = get_entry(table_base, private->hook_entry[hook]); - pr_debug("Entering %s(hook %u), UF %p\n", - table->name, hook, - get_entry(table_base, private->underflow[hook])); - do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; @@ -396,22 +340,15 @@ ipt_do_table(struct sk_buff *skb, if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); - pr_debug("Underflow (this is normal) " - "to %p\n", e); } else { e = jumpstack[--stackidx]; - pr_debug("Pulled %p out from pos %u\n", - e, stackidx); e = ipt_next_entry(e); } continue; } if (table_base + v != ipt_next_entry(e) && - !(e->ip.flags & IPT_F_GOTO)) { + !(e->ip.flags & IPT_F_GOTO)) jumpstack[stackidx++] = e; - pr_debug("Pushed %p into pos %u\n", - e, stackidx - 1); - } e = get_entry(table_base, v); continue; @@ -429,18 +366,13 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - pr_debug("Exiting %s; sp at %u\n", __func__, stackidx); xt_write_recseq_end(addend); local_bh_enable(); -#ifdef DEBUG_ALLOW_ALL - return NF_ACCEPT; -#else if (acpar.hotdrop) return NF_DROP; else return verdict; -#endif } static bool find_jump_target(const struct xt_table_info *t, @@ -480,11 +412,9 @@ mark_source_chains(const struct xt_table_info *newinfo, = (void *)ipt_get_target_c(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << 
NF_INET_NUMHOOKS)) { - pr_err("iptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; - } + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ @@ -496,26 +426,13 @@ mark_source_chains(const struct xt_table_info *newinfo, if ((strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) { - duprintf("mark_source_chains: bad " - "negative verdict (%i)\n", - t->verdict); + t->verdict < -NF_MAX_VERDICT - 1) return 0; - } /* Return: backtrack through the last big jump. */ do { e->comefrom ^= (1<comefrom - & (1 << NF_INET_NUMHOOKS)) { - duprintf("Back unset " - "on hook %u " - "rule %u\n", - hook, pos); - } -#endif oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; @@ -543,8 +460,6 @@ mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u\n", - pos, newpos); e = (struct ipt_entry *) (entry0 + newpos); if (!find_jump_target(newinfo, e)) @@ -561,8 +476,7 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } -next: - duprintf("Finished chain %u\n", hook); +next: ; } return 1; } @@ -584,18 +498,12 @@ static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ipt_ip *ip = par->entryinfo; - int ret; par->match = m->u.kernel.match; par->matchinfo = m->data; - ret = xt_check_match(par, m->u.match_size - sizeof(*m), - ip->proto, ip->invflags & IPT_INV_PROTO); - if (ret < 0) { - duprintf("check failed for `%s'.\n", par->match->name); - return ret; - } - return 0; + return xt_check_match(par, m->u.match_size - sizeof(*m), + ip->proto, ip->invflags & IPT_INV_PROTO); } static int @@ -606,10 +514,8 @@ find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - 
duprintf("find_check_match: `%s' not found\n", m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } m->u.kernel.match = match; ret = check_match(m, par); @@ -634,16 +540,9 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name) .hook_mask = e->comefrom, .family = NFPROTO_IPV4, }; - int ret; - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), - e->ip.proto, e->ip.invflags & IPT_INV_PROTO); - if (ret < 0) { - duprintf("check failed for `%s'.\n", - t->u.kernel.target->name); - return ret; - } - return 0; + return xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ip.proto, e->ip.invflags & IPT_INV_PROTO); } static int @@ -680,7 +579,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = PTR_ERR(target); goto cleanup_matches; } @@ -734,17 +632,12 @@ check_entry_size_and_hooks(struct ipt_entry *e, if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p\n", e); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset - < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) return -EINVAL; - } if (!ip_checkentry(&e->ip)) return -EINVAL; @@ -761,12 +654,9 @@ check_entry_size_and_hooks(struct ipt_entry *e, if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { - if (!check_underflow(e)) { - pr_debug("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + if (!check_underflow(e)) return -EINVAL; - } + 
newinfo->underflow[h] = underflows[h]; } } @@ -818,7 +708,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } - duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -835,27 +724,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, ++newinfo->stacksize; } - if (i != repl->num_entries) { - duprintf("translate_table: %u not %u entries\n", - i, repl->num_entries); + if (i != repl->num_entries) return -EINVAL; - } /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(repl->valid_hooks & (1 << i))) continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) { - duprintf("Invalid hook entry %u %u\n", - i, repl->hook_entry[i]); + if (newinfo->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; - } - if (newinfo->underflow[i] == 0xFFFFFFFF) { - duprintf("Invalid underflow %u %u\n", - i, repl->underflow[i]); + if (newinfo->underflow[i] == 0xFFFFFFFF) return -EINVAL; - } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) @@ -1083,11 +963,8 @@ static int get_info(struct net *net, void __user *user, struct xt_table *t; int ret; - if (*len != sizeof(struct ipt_getinfo)) { - duprintf("length %u != %zu\n", *len, - sizeof(struct ipt_getinfo)); + if (*len != sizeof(struct ipt_getinfo)) return -EINVAL; - } if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; @@ -1145,31 +1022,23 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, struct ipt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct ipt_get_entries) + get.size) { - duprintf("get_entries: %u != %zu\n", - *len, sizeof(get) + 
get.size); + if (*len != sizeof(struct ipt_get_entries) + get.size) return -EINVAL; - } get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, get.size); + else ret = -EAGAIN; - } + module_put(t->me); xt_table_unlock(t); } else @@ -1205,8 +1074,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* You lied! */ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", - valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } @@ -1216,8 +1083,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto put_module; /* Update module usage count based on number of rules */ - duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", - oldinfo->number, oldinfo->initial_entries, newinfo->number); if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); @@ -1286,8 +1151,6 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) @@ -1413,11 +1276,9 @@ compat_find_calc_match(struct xt_entry_match *m, match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("compat_check_calc_match: `%s' not found\n", - m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } + m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; @@ -1449,20 +1310,14 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, unsigned int j; int ret, off; - 
duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p, limit = %p\n", e, limit); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset < sizeof(struct compat_ipt_entry) + - sizeof(struct compat_xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + sizeof(struct compat_xt_entry_target)) return -EINVAL; - } if (!ip_checkentry(&e->ip)) return -EINVAL; @@ -1486,8 +1341,6 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); ret = PTR_ERR(target); goto release_matches; } @@ -1569,7 +1422,6 @@ translate_compat_table(struct net *net, size = compatr->size; info->number = compatr->num_entries; - duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET); xt_compat_init_offsets(AF_INET, compatr->num_entries); @@ -1584,11 +1436,8 @@ translate_compat_table(struct net *net, } ret = -EINVAL; - if (j != compatr->num_entries) { - duprintf("translate_compat_table: %u not %u entries\n", - j, compatr->num_entries); + if (j != compatr->num_entries) goto out_unlock; - } ret = -ENOMEM; newinfo = xt_alloc_table_info(size); @@ -1685,8 +1534,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) @@ -1720,7 +1567,6 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } 
@@ -1770,19 +1616,15 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, struct compat_ipt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct compat_ipt_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct compat_ipt_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET); @@ -1790,16 +1632,13 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); - if (!ret && get.size == info.size) { + if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); - } else if (!ret) { - duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, get.size); + else if (!ret) ret = -EAGAIN; - } + xt_compat_flush_offsets(AF_INET); module_put(t->me); xt_table_unlock(t); @@ -1852,7 +1691,6 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) break; default: - duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1904,7 +1742,6 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } default: - duprintf("do_ipt_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -2006,7 +1843,6 @@ icmp_match(const struct sk_buff *skb, struct xt_action_param *par) /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. 
*/ - duprintf("Dropping evil ICMP tinygram.\n"); par->hotdrop = true; return false; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 17874e8..63e06c3 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -39,34 +39,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("IPv6 packet filter"); -/*#define DEBUG_IP_FIREWALL*/ -/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ -/*#define DEBUG_IP_FIREWALL_USER*/ - -#ifdef DEBUG_IP_FIREWALL -#define dprintf(format, args...) pr_info(format , ## args) -#else -#define dprintf(format, args...) -#endif - -#ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) pr_info(format , ## args) -#else -#define duprintf(format, args...) -#endif - #ifdef CONFIG_NETFILTER_DEBUG #define IP_NF_ASSERT(x) WARN_ON(!(x)) #else #define IP_NF_ASSERT(x) #endif -#if 0 -/* All the better to debug you with... */ -#define static -#define inline -#endif - void *ip6t_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ip6t, IP6T); @@ -100,35 +78,18 @@ ip6_packet_match(const struct sk_buff *skb, if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, &ip6info->src), IP6T_INV_SRCIP) || FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, - &ip6info->dst), IP6T_INV_DSTIP)) { - dprintf("Source or dest mismatch.\n"); -/* - dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr, - ipinfo->smsk.s_addr, ipinfo->src.s_addr, - ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : ""); - dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr, - ipinfo->dmsk.s_addr, ipinfo->dst.s_addr, - ipinfo->invflags & IP6T_INV_DSTIP ? 
" (INV)" : "");*/ + &ip6info->dst), IP6T_INV_DSTIP)) return false; - } ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); - if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { - dprintf("VIA in mismatch (%s vs %s).%s\n", - indev, ip6info->iniface, - ip6info->invflags & IP6T_INV_VIA_IN ? " (INV)" : ""); + if (FWINV(ret != 0, IP6T_INV_VIA_IN)) return false; - } ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); - if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { - dprintf("VIA out mismatch (%s vs %s).%s\n", - outdev, ip6info->outiface, - ip6info->invflags & IP6T_INV_VIA_OUT ? " (INV)" : ""); + if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) return false; - } /* ... might want to do something with class and flowlabel here ... */ @@ -145,11 +106,6 @@ ip6_packet_match(const struct sk_buff *skb, } *fragoff = _frag_off; - dprintf("Packet protocol %hi ?= %s%hi.\n", - protohdr, - ip6info->invflags & IP6T_INV_PROTO ? "!":"", - ip6info->proto); - if (ip6info->proto == protohdr) { if (ip6info->invflags & IP6T_INV_PROTO) return false; @@ -169,16 +125,11 @@ ip6_packet_match(const struct sk_buff *skb, static bool ip6_checkentry(const struct ip6t_ip6 *ipv6) { - if (ipv6->flags & ~IP6T_F_MASK) { - duprintf("Unknown flag bits set: %08X\n", - ipv6->flags & ~IP6T_F_MASK); + if (ipv6->flags & ~IP6T_F_MASK) return false; - } - if (ipv6->invflags & ~IP6T_INV_MASK) { - duprintf("Unknown invflag bits set: %08X\n", - ipv6->invflags & ~IP6T_INV_MASK); + if (ipv6->invflags & ~IP6T_INV_MASK) return false; - } + return true; } @@ -446,13 +397,9 @@ ip6t_do_table(struct sk_buff *skb, xt_write_recseq_end(addend); local_bh_enable(); -#ifdef DEBUG_ALLOW_ALL - return NF_ACCEPT; -#else if (acpar.hotdrop) return NF_DROP; else return verdict; -#endif } static bool find_jump_target(const struct xt_table_info *t, @@ -492,11 +439,9 @@ mark_source_chains(const struct xt_table_info *newinfo, = (void *)ip6t_get_target_c(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom 
& (1 << NF_INET_NUMHOOKS)) { - pr_err("iptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; - } + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ @@ -508,26 +453,13 @@ mark_source_chains(const struct xt_table_info *newinfo, if ((strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) { - duprintf("mark_source_chains: bad " - "negative verdict (%i)\n", - t->verdict); + t->verdict < -NF_MAX_VERDICT - 1) return 0; - } /* Return: backtrack through the last big jump. */ do { e->comefrom ^= (1<comefrom - & (1 << NF_INET_NUMHOOKS)) { - duprintf("Back unset " - "on hook %u " - "rule %u\n", - hook, pos); - } -#endif oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; @@ -555,8 +487,6 @@ mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u\n", - pos, newpos); e = (struct ip6t_entry *) (entry0 + newpos); if (!find_jump_target(newinfo, e)) @@ -573,8 +503,7 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } -next: - duprintf("Finished chain %u\n", hook); +next: ; } return 1; } @@ -595,19 +524,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ip6t_ip6 *ipv6 = par->entryinfo; - int ret; par->match = m->u.kernel.match; par->matchinfo = m->data; - ret = xt_check_match(par, m->u.match_size - sizeof(*m), - ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); - if (ret < 0) { - duprintf("ip_tables: check failed for `%s'.\n", - par.match->name); - return ret; - } - return 0; + return xt_check_match(par, m->u.match_size - sizeof(*m), + ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); } static int @@ -618,10 +540,9 @@ find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) match = 
xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("find_check_match: `%s' not found\n", m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } + m->u.kernel.match = match; ret = check_match(m, par); @@ -646,17 +567,11 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name) .hook_mask = e->comefrom, .family = NFPROTO_IPV6, }; - int ret; t = ip6t_get_target(e); - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), - e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); - if (ret < 0) { - duprintf("ip_tables: check failed for `%s'.\n", - t->u.kernel.target->name); - return ret; - } - return 0; + return xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ipv6.proto, + e->ipv6.invflags & IP6T_INV_PROTO); } static int @@ -693,7 +608,6 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = PTR_ERR(target); goto cleanup_matches; } @@ -746,17 +660,12 @@ check_entry_size_and_hooks(struct ip6t_entry *e, if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p\n", e); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset - < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) return -EINVAL; - } if (!ip6_checkentry(&e->ipv6)) return -EINVAL; @@ -773,12 +682,9 @@ check_entry_size_and_hooks(struct ip6t_entry *e, if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { - if 
(!check_underflow(e)) { - pr_debug("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + if (!check_underflow(e)) return -EINVAL; - } + newinfo->underflow[h] = underflows[h]; } } @@ -830,7 +736,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } - duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -847,27 +752,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, ++newinfo->stacksize; } - if (i != repl->num_entries) { - duprintf("translate_table: %u not %u entries\n", - i, repl->num_entries); + if (i != repl->num_entries) return -EINVAL; - } /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(repl->valid_hooks & (1 << i))) continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) { - duprintf("Invalid hook entry %u %u\n", - i, repl->hook_entry[i]); + if (newinfo->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; - } - if (newinfo->underflow[i] == 0xFFFFFFFF) { - duprintf("Invalid underflow %u %u\n", - i, repl->underflow[i]); + if (newinfo->underflow[i] == 0xFFFFFFFF) return -EINVAL; - } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) @@ -1095,11 +991,8 @@ static int get_info(struct net *net, void __user *user, struct xt_table *t; int ret; - if (*len != sizeof(struct ip6t_getinfo)) { - duprintf("length %u != %zu\n", *len, - sizeof(struct ip6t_getinfo)); + if (*len != sizeof(struct ip6t_getinfo)) return -EINVAL; - } if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; @@ -1157,31 +1050,24 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, struct ip6t_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if 
(copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct ip6t_get_entries) + get.size) { - duprintf("get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct ip6t_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR_OR_NULL(t)) { struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, get.size); + else ret = -EAGAIN; - } + module_put(t->me); xt_table_unlock(t); } else @@ -1217,8 +1103,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* You lied! */ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", - valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } @@ -1228,8 +1112,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto put_module; /* Update module usage count based on number of rules */ - duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", - oldinfo->number, oldinfo->initial_entries, newinfo->number); if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); @@ -1298,8 +1180,6 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("ip_tables: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) @@ -1424,11 +1304,9 @@ compat_find_calc_match(struct xt_entry_match *m, match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("compat_check_calc_match: `%s' not found\n", - m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } + 
m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; @@ -1460,20 +1338,14 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, unsigned int j; int ret, off; - duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p, limit = %p\n", e, limit); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset < sizeof(struct compat_ip6t_entry) + - sizeof(struct compat_xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + sizeof(struct compat_xt_entry_target)) return -EINVAL; - } if (!ip6_checkentry(&e->ipv6)) return -EINVAL; @@ -1497,8 +1369,6 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); ret = PTR_ERR(target); goto release_matches; } @@ -1577,7 +1447,6 @@ translate_compat_table(struct net *net, size = compatr->size; info->number = compatr->num_entries; - duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET6); xt_compat_init_offsets(AF_INET6, compatr->num_entries); @@ -1592,11 +1461,8 @@ translate_compat_table(struct net *net, } ret = -EINVAL; - if (j != compatr->num_entries) { - duprintf("translate_compat_table: %u not %u entries\n", - j, compatr->num_entries); + if (j != compatr->num_entries) goto out_unlock; - } ret = -ENOMEM; newinfo = xt_alloc_table_info(size); @@ -1687,8 +1553,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, 
compat_ptr(tmp.counters)); if (ret) @@ -1722,7 +1586,6 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1772,19 +1635,15 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, struct compat_ip6t_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET6); @@ -1792,16 +1651,13 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); - if (!ret && get.size == info.size) { + if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); - } else if (!ret) { - duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, get.size); + else if (!ret) ret = -EAGAIN; - } + xt_compat_flush_offsets(AF_INET6); module_put(t->me); xt_table_unlock(t); @@ -1854,7 +1710,6 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) break; default: - duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1906,7 +1761,6 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } default: - duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -2008,7 +1862,6 @@ icmp6_match(const struct sk_buff *skb, struct 
xt_action_param *par) /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ - duprintf("Dropping evil ICMP tinygram.\n"); par->hotdrop = true; return false; } -- cgit v0.10.2 From cb39ad8b8ef224c544074962780bf763077d6141 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 4 May 2016 17:49:53 +0200 Subject: netfilter: nf_tables: allow set names up to 32 bytes Currently, we support set names of up to 16 bytes. Align this with the maximum length we can use in ipset, to make migration to nf_tables easier. Signed-off-by: Pablo Neira Ayuso diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f6b1daf..0922354 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -303,7 +303,7 @@ void nft_unregister_set(struct nft_set_ops *ops); struct nft_set { struct list_head list; struct list_head bindings; - char name[IFNAMSIZ]; + char name[NFT_SET_MAXNAMELEN]; u32 ktype; u32 dtype; u32 size; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 6602313..6a4dbe0 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -3,6 +3,7 @@ #define NFT_TABLE_MAXNAMELEN 32 #define NFT_CHAIN_MAXNAMELEN 32 +#define NFT_SET_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 73c8fad..4d292b9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2317,7 +2317,7 @@ nft_select_set_ops(const struct nlattr * const nla[], static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_TABLE] = { .type = NLA_STRING }, [NFTA_SET_NAME] = { .type = NLA_STRING, - .len = IFNAMSIZ - 1 }, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, @@ -2401,7
+2401,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, unsigned long *inuse; unsigned int n = 0, min = 0; - p = strnchr(name, IFNAMSIZ, '%'); + p = strnchr(name, NFT_SET_MAXNAMELEN, '%'); if (p != NULL) { if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; @@ -2696,7 +2696,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; - char name[IFNAMSIZ]; + char name[NFT_SET_MAXNAMELEN]; unsigned int size; bool create; u64 timeout; -- cgit v0.10.2 From 698e2a8dca98e4de32f3f630e6d9cd93753c52e1 Mon Sep 17 00:00:00 2001 From: Marco Angaroni Date: Tue, 26 Apr 2016 21:20:22 +0200 Subject: ipvs: make drop_entry protection effective for SIP-pe The DoS protection policy that deletes connections to avoid running out of memory is currently not effective for SIP-pe plus OPS mode, for two reasons: 1) connection templates (holding the SIP Call-ID) are always skipped in ip_vs_random_dropentry() 2) the in_pkts counter (used by the drop_entry algorithm) is not incremented for connection templates This patch addresses these problems with the following changes: a) connection templates associated (via their dest) with virtual-services configured in OPS mode are included in ip_vs_random_dropentry() monitoring. This applies to SIP-pe over UDP (which requires OPS mode), but is a more general principle: when OPS is controlled by templates, memory can be used only by the templates themselves, since OPS conns are deleted after the packet is forwarded. b) OPS connections, if controlled by a template, increment the in_pkts counter of their template. This already happens, but only when the director is in master-slave mode (see ip_vs_sync_conn()).
Signed-off-by: Marco Angaroni Acked-by: Julian Anastasov Signed-off-by: Simon Horman diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 292365f..2cb3c62 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1261,6 +1261,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp) return 1; } +static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) +{ + struct ip_vs_service *svc; + + if (!cp->dest) + return false; + svc = rcu_dereference(cp->dest->svc); + return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); +} + /* Called from keventd and must protect itself from softirqs */ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { @@ -1275,11 +1285,16 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->flags & IP_VS_CONN_F_TEMPLATE) - /* connection template */ - continue; if (cp->ipvs != ipvs) continue; + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + if (atomic_read(&cp->n_control) || + !ip_vs_conn_ops_mode(cp)) + continue; + else + /* connection template of OPS */ + goto try_drop; + } if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { case IP_VS_TCP_S_SYN_RECV: @@ -1307,6 +1322,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) continue; } } else { +try_drop: if (!todrop_entry(cp)) continue; } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index f3bac2e..1207f20 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -612,7 +612,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ret = cp->packet_xmit(skb, cp, pd->pp, iph); /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + atomic_inc(&cp->control->in_pkts); + else + atomic_inc(&cp->in_pkts); ip_vs_conn_put(cp); return ret; } @@ -1991,6 +1994,9 @@ 
ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, pkts); + else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + /* increment is done inside ip_vs_sync_conn too */ + atomic_inc(&cp->control->in_pkts); ip_vs_conn_put(cp); return ret; -- cgit v0.10.2 From 03d7dc5cdfe6fd4e5bd04cfc2be7ae259f956428 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 6 May 2016 00:51:47 +0200 Subject: netfilter: conntrack: check netns when walking expect hash Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index f8fc7ab..2b4c729 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -301,6 +301,9 @@ static int exp_seq_show(struct seq_file *s, void *v) exp = hlist_entry(n, struct nf_conntrack_expect, hnode); + if (!net_eq(nf_ct_net(exp->master), seq_file_net(s))) + return 0; + if (exp->tuple.src.l3num != AF_INET) return 0; diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index c2f7c4f..da95d74 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -86,6 +86,17 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple return reciprocal_scale(hash, nf_ct_expect_hsize); } +static bool +nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_expect *i, + const struct nf_conntrack_zone *zone, + const struct net *net) +{ + return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && + net_eq(net, nf_ct_net(i->master)) && + nf_ct_zone_equal_any(i->master, zone); +} + struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, @@ -99,8 +110,7 @@ __nf_ct_expect_find(struct net *net, h = 
nf_ct_expect_dst_hash(tuple); hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { - if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal_any(i->master, zone)) + if (nf_ct_exp_equal(tuple, i, zone, net)) return i; } return NULL; @@ -141,8 +151,7 @@ nf_ct_find_expectation(struct net *net, h = nf_ct_expect_dst_hash(tuple); hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && - nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal_any(i->master, zone)) { + nf_ct_exp_equal(tuple, i, zone, net)) { exp = i; break; } @@ -222,6 +231,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && + net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } @@ -231,6 +241,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a, return a->master == b->master && a->class == b->class && nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && + net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e00f178..5dfb84d 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2636,6 +2636,10 @@ restart: hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; + + if (!net_eq(nf_ct_net(exp->master), net)) + continue; + if (cb->args[1]) { if (exp != last) continue; @@ -2888,6 +2892,10 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], hnode) { + + if (!net_eq(nf_ct_exp_net(exp), net)) + continue; + m_help = nfct_help(exp->master); if (!strcmp(m_help->helper->name, name) && del_timer(&exp->timeout)) { @@ -2906,6 +2914,10 @@ static 
int ctnetlink_del_expect(struct net *net, struct sock *ctnl, hlist_for_each_entry_safe(exp, next, &net->ct.expect_hash[i], hnode) { + + if (!net_eq(nf_ct_exp_net(exp), net)) + continue; + if (del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, -- cgit v0.10.2 From a9a083c3878f28e9d368f6dfb1a79a6f04ad8123 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 6 May 2016 00:51:48 +0200 Subject: netfilter: conntrack: make netns address part of expect hash Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index da95d74..130f1be 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -73,15 +74,17 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect) nf_ct_expect_put(exp); } -static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple) +static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { - unsigned int hash; + unsigned int hash, seed; get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); + seed = nf_ct_expect_hashrnd ^ net_hash_mix(n); + hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | - (__force __u16)tuple->dst.u.all) ^ nf_ct_expect_hashrnd); + (__force __u16)tuple->dst.u.all) ^ seed); return reciprocal_scale(hash, nf_ct_expect_hsize); } @@ -108,7 +111,7 @@ __nf_ct_expect_find(struct net *net, if (!net->ct.expect_count) return NULL; - h = nf_ct_expect_dst_hash(tuple); + h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { if (nf_ct_exp_equal(tuple, i, zone, net)) return i; @@ -148,7 +151,7 @@ nf_ct_find_expectation(struct net *net, if (!net->ct.expect_count) return NULL; - h = nf_ct_expect_dst_hash(tuple); + 
h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_exp_equal(tuple, i, zone, net)) { @@ -352,7 +355,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); - unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); + unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple); /* two references : one for hash insert, one for the timer */ atomic_add(2, &exp->use); @@ -411,7 +414,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) ret = -ESHUTDOWN; goto out; } - h = nf_ct_expect_dst_hash(&expect->tuple); + h = nf_ct_expect_dst_hash(net, &expect->tuple); hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) { if (expect_matches(i, expect)) { if (del_timer(&i->timeout)) { -- cgit v0.10.2 From 0a93aaedc46af2c5feecfb1066d98bfb491ec0b8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 6 May 2016 00:51:49 +0200 Subject: netfilter: conntrack: use a single expectation table for all namespaces We already include the netns address in the hash and compare the netns pointers during lookup, so even if namespaces have overlapping addresses, entries will be spread across the expectation table.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index dce56f0..5ed33ea 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -10,6 +10,7 @@ extern unsigned int nf_ct_expect_hsize; extern unsigned int nf_ct_expect_max; +extern struct hlist_head *nf_ct_expect_hash; struct nf_conntrack_expect { /* Conntrack expectation list member */ diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 251c435..2811ddc 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -94,7 +94,6 @@ struct netns_ct { int sysctl_checksum; struct kmem_cache *nf_conntrack_cachep; - struct hlist_head *expect_hash; struct ct_pcpu __percpu *pcpu_lists; struct ip_conntrack_stat __percpu *stat; struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 2b4c729..c6f3c40 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -236,13 +236,12 @@ struct ct_expect_iter_state { static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { n = rcu_dereference( - hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } @@ -252,7 +251,6 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); @@ -260,7 
+258,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, if (++st->bucket >= nf_ct_expect_hsize) return NULL; head = rcu_dereference( - hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 130f1be..9e36931 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -36,6 +36,9 @@ unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); +struct hlist_head *nf_ct_expect_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_expect_hash); + unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; @@ -112,7 +115,7 @@ __nf_ct_expect_find(struct net *net, return NULL; h = nf_ct_expect_dst_hash(net, tuple); - hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { + hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) { if (nf_ct_exp_equal(tuple, i, zone, net)) return i; } @@ -152,7 +155,7 @@ nf_ct_find_expectation(struct net *net, return NULL; h = nf_ct_expect_dst_hash(net, tuple); - hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { + hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_exp_equal(tuple, i, zone, net)) { exp = i; @@ -363,7 +366,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) hlist_add_head(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; - hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); + hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); net->ct.expect_count++; setup_timer(&exp->timeout, nf_ct_expectation_timed_out, @@ -415,7 +418,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) goto out; } h = nf_ct_expect_dst_hash(net, &expect->tuple); - hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) { + 
hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (expect_matches(i, expect)) { if (del_timer(&i->timeout)) { nf_ct_unlink_expect(i); @@ -481,12 +484,11 @@ struct ct_expect_iter_state { static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } @@ -496,14 +498,13 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } @@ -636,28 +637,13 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { - int err = -ENOMEM; - net->ct.expect_count = 0; - net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); - if (net->ct.expect_hash == NULL) - goto err1; - - err = exp_proc_init(net); - if (err < 0) - goto err2; - - return 0; -err2: - nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); -err1: - return err; + return exp_proc_init(net); } void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); - nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); } int nf_conntrack_expect_init(void) @@ -673,6 +659,13 @@ int nf_conntrack_expect_init(void) 0, 0, NULL); if (!nf_ct_expect_cachep) return -ENOMEM; + + 
nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); + if (!nf_ct_expect_hash) { + kmem_cache_destroy(nf_ct_expect_cachep); + return -ENOMEM; + } + return 0; } @@ -680,4 +673,5 @@ void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); + nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index cb48e6a..f703adb 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -400,7 +400,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], hnode) { + &nf_ct_expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((rcu_dereference_protected( help->helper, diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 5dfb84d..a18d1ce 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2632,7 +2632,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: - hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]], + hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]], hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; @@ -2890,7 +2890,7 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], + &nf_ct_expect_hash[i], hnode) { if (!net_eq(nf_ct_exp_net(exp), net)) @@ -2912,7 +2912,7 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, 
spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], + &nf_ct_expect_hash[i], hnode) { if (!net_eq(nf_ct_exp_net(exp), net)) -- cgit v0.10.2 From 464c38556e06723b4c77d36fecff140b8527bc59 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 9 May 2016 16:24:30 +0200 Subject: netfilter: conntrack: make netns address part of nat bysrc hash This will be needed soon, when we place all entries in the same hash table. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index d74e716..069912c 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -118,7 +118,7 @@ EXPORT_SYMBOL(nf_xfrm_me_harder); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { unsigned int hash; @@ -126,9 +126,9 @@ hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) /* Original src, to ensure we map it consistently if poss. */ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_nat_hash_rnd); + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - return reciprocal_scale(hash, net->ct.nat_htable_size); + return reciprocal_scale(hash, n->ct.nat_htable_size); } /* Is this tuple already taken? (not by us) */ -- cgit v0.10.2 From a76ae1c85576b4b833a506925417d746bc839302 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 9 May 2016 16:24:31 +0200 Subject: netfilter: conntrack: use a single nat bysource table for all namespaces We already include the netns address in the hash, so we only need to use net_eq in find_appropriate_src and can then put all entries into the same table.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 2811ddc..1e751bf 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -103,9 +103,5 @@ struct netns_ct { unsigned int labels_used; u8 label_words; #endif -#ifdef CONFIG_NF_NAT_NEEDED - struct hlist_head *nat_bysource; - unsigned int nat_htable_size; -#endif }; #endif diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 069912c..6877a39 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -37,6 +37,9 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; + +static struct hlist_head *nf_nat_bysource __read_mostly; +static unsigned int nf_nat_htable_size __read_mostly; static unsigned int nf_nat_hash_rnd __read_mostly; inline const struct nf_nat_l3proto * @@ -128,7 +131,7 @@ hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - return reciprocal_scale(hash, n->ct.nat_htable_size); + return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? (not by us) */ @@ -198,9 +201,10 @@ find_appropriate_src(struct net *net, const struct nf_conn_nat *nat; const struct nf_conn *ct; - hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { + hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) { ct = nat->ct; if (same_src(ct, tuple) && + net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. 
*/ nf_ct_invert_tuplepr(result, @@ -433,7 +437,7 @@ nf_nat_setup_info(struct nf_conn *ct, nat = nfct_nat(ct); nat->ct = ct; hlist_add_head_rcu(&nat->bysource, - &net->ct.nat_bysource[srchash]); + &nf_nat_bysource[srchash]); spin_unlock_bh(&nf_nat_lock); } @@ -821,27 +825,14 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, } #endif -static int __net_init nf_nat_net_init(struct net *net) -{ - /* Leave them the same for the moment. */ - net->ct.nat_htable_size = nf_conntrack_htable_size; - net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0); - if (!net->ct.nat_bysource) - return -ENOMEM; - return 0; -} - static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); - synchronize_rcu(); - nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { - .init = nf_nat_net_init, .exit = nf_nat_net_exit, }; @@ -854,8 +845,16 @@ static int __init nf_nat_init(void) { int ret; + /* Leave them the same for the moment. 
*/ + nf_nat_htable_size = nf_conntrack_htable_size; + + nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); + if (!nf_nat_bysource) + return -ENOMEM; + ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -879,6 +878,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend: + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); nf_ct_extend_unregister(&nat_extend); return ret; } @@ -897,6 +897,7 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); synchronize_net(); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); } MODULE_LICENSE("GPL"); -- cgit v0.10.2 From 0c5366b3a8c77fd6d67b763c5a76dfdc314e7726 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 9 May 2016 16:24:32 +0200 Subject: netfilter: conntrack: use single slab cache An earlier patch changed the lookup side to also compare namespaces with net_eq() after obtaining a reference on the conntrack, so a single kmem cache can be used.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 1e751bf..38b1a80 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -84,7 +84,6 @@ struct netns_ct { struct ctl_table_header *event_sysctl_header; struct ctl_table_header *helper_sysctl_header; #endif - char *slabname; unsigned int sysctl_log_invalid; /* Log invalid packets */ int sysctl_events; int sysctl_acct; @@ -93,7 +92,6 @@ struct netns_ct { int sysctl_tstamp; int sysctl_checksum; - struct kmem_cache *nf_conntrack_cachep; struct ct_pcpu __percpu *pcpu_lists; struct ip_conntrack_stat __percpu *stat; struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f58a704..0cd2936 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -72,6 +72,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); struct hlist_nulls_head *nf_conntrack_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_hash); +static __read_mostly struct kmem_cache *nf_conntrack_cachep; static __read_mostly spinlock_t nf_conntrack_locks_all_lock; static __read_mostly seqcount_t nf_conntrack_generation; static __read_mostly bool nf_conntrack_locks_all; @@ -910,7 +911,7 @@ __nf_conntrack_alloc(struct net *net, * Do not use kmem_cache_zalloc(), as this cache uses * SLAB_DESTROY_BY_RCU. 
*/ - ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); + ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) goto out; @@ -937,7 +938,7 @@ __nf_conntrack_alloc(struct net *net, atomic_set(&ct->ct_general.use, 0); return ct; out_free: - kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + kmem_cache_free(nf_conntrack_cachep, ct); out: atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); @@ -964,7 +965,7 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); - kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + kmem_cache_free(nf_conntrack_cachep, ct); smp_mb__before_atomic(); atomic_dec(&net->ct.count); } @@ -1587,8 +1588,6 @@ i_see_dead_people: nf_conntrack_tstamp_pernet_fini(net); nf_conntrack_acct_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); - kmem_cache_destroy(net->ct.nf_conntrack_cachep); - kfree(net->ct.slabname); free_percpu(net->ct.stat); free_percpu(net->ct.pcpu_lists); } @@ -1693,7 +1692,8 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); int nf_conntrack_init_start(void) { int max_factor = 8; - int i, ret, cpu; + int ret = -ENOMEM; + int i, cpu; seqcount_init(&nf_conntrack_generation); @@ -1729,6 +1729,12 @@ int nf_conntrack_init_start(void) nf_conntrack_max = max_factor * nf_conntrack_htable_size; + nf_conntrack_cachep = kmem_cache_create("nf_conntrack", + sizeof(struct nf_conn), 0, + SLAB_DESTROY_BY_RCU, NULL); + if (!nf_conntrack_cachep) + goto err_cachep; + printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n", NF_CONNTRACK_VERSION, nf_conntrack_htable_size, nf_conntrack_max); @@ -1805,6 +1811,8 @@ err_tstamp: err_acct: nf_conntrack_expect_fini(); err_expect: + kmem_cache_destroy(nf_conntrack_cachep); +err_cachep: nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); return ret; } @@ -1846,18 +1854,6 @@ int nf_conntrack_init_net(struct net *net) if (!net->ct.stat) goto err_pcpu_lists; - net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); - if 
(!net->ct.slabname) - goto err_slabname; - - net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname, - sizeof(struct nf_conn), 0, - SLAB_DESTROY_BY_RCU, NULL); - if (!net->ct.nf_conntrack_cachep) { - printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - goto err_cache; - } - ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) goto err_expect; @@ -1889,10 +1885,6 @@ err_tstamp: err_acct: nf_conntrack_expect_pernet_fini(net); err_expect: - kmem_cache_destroy(net->ct.nf_conntrack_cachep); -err_cache: - kfree(net->ct.slabname); -err_slabname: free_percpu(net->ct.stat); err_pcpu_lists: free_percpu(net->ct.pcpu_lists); -- cgit v0.10.2