From 0d6ef0688d8744454646298b85336407be05e309 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 10 Jul 2015 15:42:49 +0900 Subject: ipvs: Delete an unnecessary check before the function call "module_put" The module_put() function tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 199760c..e50221b 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -137,7 +137,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) { - if (scheduler && scheduler->module) + if (scheduler) module_put(scheduler->module); } -- cgit v0.10.2 From 70aa996601335ca3069190ebcdae8870828086a8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 10 Jul 2015 18:13:20 -0500 Subject: netfilter: kill nf_hooks_active The function obscures what is going on in nf_hook_thresh and its existence requires computing the hook list twice. Signed-off-by: "Eric W. 
Biederman" Signed-off-by: Pablo Neira Ayuso diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 00050df..60e89348 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -150,11 +150,6 @@ static inline bool nf_hook_list_active(struct list_head *nf_hook_list, } #endif -static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) -{ - return nf_hook_list_active(&nf_hooks[pf][hook], pf, hook); -} - int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state); /** @@ -172,10 +167,12 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, int (*okfn)(struct sock *, struct sk_buff *), int thresh) { - if (nf_hooks_active(pf, hook)) { + struct list_head *nf_hook_list = &nf_hooks[pf][hook]; + + if (nf_hook_list_active(nf_hook_list, pf, hook)) { struct nf_hook_state state; - nf_hook_state_init(&state, &nf_hooks[pf][hook], hook, thresh, + nf_hook_state_init(&state, nf_hook_list, hook, thresh, pf, indev, outdev, sk, okfn); return nf_hook_slow(skb, &state); } -- cgit v0.10.2 From 4c0911566dec7755d15cb89239fb2db4447f7a62 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 10 Jul 2015 18:13:58 -0500 Subject: netfilter: Simply the tests for enabling and disabling the ingress queue hook Replace an overcomplicated switch statement with a simple if statement. This also removes the ingress queue enable outside of nf_hook_mutex as the protection provided by the mutex is not necessary and the code is clearer having both of the static key increments together. Signed-off-by: "Eric W. 
Biederman" Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/core.c b/net/netfilter/core.c index a0e5497..c4c3b85 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -74,7 +74,6 @@ int nf_register_hook(struct nf_hook_ops *reg) if (reg->hooknum == NF_NETDEV_INGRESS) { BUG_ON(reg->dev == NULL); nf_hook_list = &reg->dev->nf_hooks_ingress; - net_inc_ingress_queue(); break; } #endif @@ -90,6 +89,10 @@ int nf_register_hook(struct nf_hook_ops *reg) } list_add_rcu(&reg->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) + net_inc_ingress_queue(); +#endif #ifdef HAVE_JUMP_LABEL static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif @@ -102,18 +105,10 @@ void nf_unregister_hook(struct nf_hook_ops *reg) mutex_lock(&nf_hook_mutex); list_del_rcu(&reg->list); mutex_unlock(&nf_hook_mutex); - switch (reg->pf) { - case NFPROTO_NETDEV: #ifdef CONFIG_NETFILTER_INGRESS - if (reg->hooknum == NF_NETDEV_INGRESS) { - net_dec_ingress_queue(); - break; - } - break; + if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) + net_dec_ingress_queue(); #endif - default: - break; - } #ifdef HAVE_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif -- cgit v0.10.2 From 0edcf282b0a6f38168294264837cf7d52a2f5255 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 10 Jul 2015 18:14:30 -0500 Subject: netfilter: Factor out the hook list selection from nf_register_hook - Add a new function find_nf_hook_list to select the nf_hook_list - Fail nf_register_hook when asked for a per netdevice hook list when support for per netdevice hook lists is not built into the kernel. - Move the hook list head selection outside of nf_hook_mutex as nothing in the selection requires the hook list, and error handling is simpler if a mutex is not held. Signed-off-by: "Eric W. 
Biederman" Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/core.c b/net/netfilter/core.c index c4c3b85..fa4d3c1 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -62,27 +62,31 @@ EXPORT_SYMBOL(nf_hooks_needed); static DEFINE_MUTEX(nf_hook_mutex); -int nf_register_hook(struct nf_hook_ops *reg) +static struct list_head *find_nf_hook_list(const struct nf_hook_ops *reg) { - struct list_head *nf_hook_list; - struct nf_hook_ops *elem; + struct list_head *nf_hook_list = NULL; - mutex_lock(&nf_hook_mutex); - switch (reg->pf) { - case NFPROTO_NETDEV: + if (reg->pf != NFPROTO_NETDEV) + nf_hook_list = &nf_hooks[reg->pf][reg->hooknum]; + else if (reg->hooknum == NF_NETDEV_INGRESS) { #ifdef CONFIG_NETFILTER_INGRESS - if (reg->hooknum == NF_NETDEV_INGRESS) { - BUG_ON(reg->dev == NULL); + if (reg->dev) nf_hook_list = &reg->dev->nf_hooks_ingress; - break; - } #endif - /* Fall through. */ - default: - nf_hook_list = &nf_hooks[reg->pf][reg->hooknum]; - break; } + return nf_hook_list; +} + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct list_head *nf_hook_list; + struct nf_hook_ops *elem; + nf_hook_list = find_nf_hook_list(reg); + if (!nf_hook_list) + return -ENOENT; + + mutex_lock(&nf_hook_mutex); list_for_each_entry(elem, nf_hook_list, list) { if (reg->priority < elem->priority) break; -- cgit v0.10.2 From 085db2c04557d31db61541f361bd8b4de92c9939 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 10 Jul 2015 18:15:06 -0500 Subject: netfilter: Per network namespace netfilter hooks. - Add a new set of functions for registering and unregistering per network namespace hooks. - Modify the old global namespace hook functions to use the per network namespace hooks in their implementation, so there remains a single list that needs to be walked for any hook (this is important for keeping the hook priority working and for keeping the code walking the hooks simple). 
- Only allow registering the per netdevice hooks in the network namespace where the network device lives. - Dynamically allocate the structures in the per network namespace hook list in nf_register_net_hook, and unregister them in nf_unregister_net_hook. Dynamic allocation is required somewhere as the number of network namespaces is not fixed so we might as well allocate them in the registration function. The chain of registered hooks on any list is expected to be small so the cost of walking that list to find the entry we are unregistering should also be small. Performing the management of the dynamically allocated list entries in the registration and unregistration functions keeps the complexity from spreading. Signed-off-by: "Eric W. Biederman" diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 60e89348..9bbd110 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include #ifdef CONFIG_NETFILTER static inline int NF_DROP_GETERR(int verdict) @@ -118,6 +120,13 @@ struct nf_sockopt_ops { }; /* Function to register/unregister hook points. 
*/ +int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops); +void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops); +int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n); +void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n); + int nf_register_hook(struct nf_hook_ops *reg); void nf_unregister_hook(struct nf_hook_ops *reg); int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); @@ -128,8 +137,6 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); int nf_register_sockopt(struct nf_sockopt_ops *reg); void nf_unregister_sockopt(struct nf_sockopt_ops *reg); -extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; - #ifdef HAVE_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; @@ -167,7 +174,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, int (*okfn)(struct sock *, struct sk_buff *), int thresh) { - struct list_head *nf_hook_list = &nf_hooks[pf][hook]; + struct net *net = dev_net(indev ? 
indev : outdev); + struct list_head *nf_hook_list = &net->nf.hooks[pf][hook]; if (nf_hook_list_active(nf_hook_list, pf, hook)) { struct nf_hook_state state; diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 532e4ba..38aa498 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -14,5 +14,6 @@ struct netns_nf { #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; #endif + struct list_head hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; }; #endif diff --git a/net/netfilter/core.c b/net/netfilter/core.c index fa4d3c1..56ead1a 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -52,9 +52,6 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo) } EXPORT_SYMBOL_GPL(nf_unregister_afinfo); -struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; -EXPORT_SYMBOL(nf_hooks); - #ifdef HAVE_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); @@ -62,27 +59,40 @@ EXPORT_SYMBOL(nf_hooks_needed); static DEFINE_MUTEX(nf_hook_mutex); -static struct list_head *find_nf_hook_list(const struct nf_hook_ops *reg) +static struct list_head *find_nf_hook_list(struct net *net, + const struct nf_hook_ops *reg) { struct list_head *nf_hook_list = NULL; if (reg->pf != NFPROTO_NETDEV) - nf_hook_list = &nf_hooks[reg->pf][reg->hooknum]; + nf_hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; else if (reg->hooknum == NF_NETDEV_INGRESS) { #ifdef CONFIG_NETFILTER_INGRESS - if (reg->dev) + if (reg->dev && dev_net(reg->dev) == net) nf_hook_list = &reg->dev->nf_hooks_ingress; #endif } return nf_hook_list; } -int nf_register_hook(struct nf_hook_ops *reg) +int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { struct list_head *nf_hook_list; - struct nf_hook_ops *elem; + struct nf_hook_ops *elem, *new; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; - nf_hook_list = find_nf_hook_list(reg); + new->hook = reg->hook; 
+ new->dev = reg->dev; + new->owner = reg->owner; + new->priv = reg->priv; + new->pf = reg->pf; + new->hooknum = reg->hooknum; + new->priority = reg->priority; + + nf_hook_list = find_nf_hook_list(net, reg); if (!nf_hook_list) return -ENOENT; @@ -91,7 +101,7 @@ int nf_register_hook(struct nf_hook_ops *reg) if (reg->priority < elem->priority) break; } - list_add_rcu(&reg->list, elem->list.prev); + list_add_rcu(&new->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) @@ -102,13 +112,35 @@ int nf_register_hook(struct nf_hook_ops *reg) #endif return 0; } -EXPORT_SYMBOL(nf_register_hook); +EXPORT_SYMBOL(nf_register_net_hook); -void nf_unregister_hook(struct nf_hook_ops *reg) +void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { + struct list_head *nf_hook_list; + struct nf_hook_ops *elem; + + nf_hook_list = find_nf_hook_list(net, reg); + if (!nf_hook_list) + return; + mutex_lock(&nf_hook_mutex); - list_del_rcu(&reg->list); + list_for_each_entry(elem, nf_hook_list, list) { + if ((reg->hook == elem->hook) && + (reg->dev == elem->dev) && + (reg->owner == elem->owner) && + (reg->priv == elem->priv) && + (reg->pf == elem->pf) && + (reg->hooknum == elem->hooknum) && + (reg->priority == elem->priority)) { + list_del_rcu(&elem->list); + break; + } + } mutex_unlock(&nf_hook_mutex); + if (&elem->list == nf_hook_list) { + WARN(1, "nf_unregister_net_hook: hook not found!\n"); + return; + } #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_dec_ingress_queue(); @@ -117,7 +149,77 @@ void nf_unregister_hook(struct nf_hook_ops *reg) static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); - nf_queue_nf_hook_drop(reg); + nf_queue_nf_hook_drop(elem); + kfree(elem); +} +EXPORT_SYMBOL(nf_unregister_net_hook); + +int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, + 
unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = nf_register_net_hook(net, &reg[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + nf_unregister_net_hooks(net, reg, i); + return err; +} +EXPORT_SYMBOL(nf_register_net_hooks); + +void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n) +{ + while (n-- > 0) + nf_unregister_net_hook(net, &reg[n]); +} +EXPORT_SYMBOL(nf_unregister_net_hooks); + +static LIST_HEAD(nf_hook_list); + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct net *net, *last; + int ret; + + rtnl_lock(); + for_each_net(net) { + ret = nf_register_net_hook(net, reg); + if (ret && ret != -ENOENT) + goto rollback; + } + list_add_tail(&reg->list, &nf_hook_list); + rtnl_unlock(); + + return 0; +rollback: + last = net; + for_each_net(net) { + if (net == last) + break; + nf_unregister_net_hook(net, reg); + } + rtnl_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_register_hook); + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + struct net *net; + + rtnl_lock(); + list_del(&reg->list); + for_each_net(net) + nf_unregister_net_hook(net, reg); + rtnl_unlock(); } EXPORT_SYMBOL(nf_unregister_hook); @@ -294,8 +396,46 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(nf_nat_decode_session_hook); #endif +static int nf_register_hook_list(struct net *net) +{ + struct nf_hook_ops *elem; + int ret; + + rtnl_lock(); + list_for_each_entry(elem, &nf_hook_list, list) { + ret = nf_register_net_hook(net, elem); + if (ret && ret != -ENOENT) + goto out_undo; + } + rtnl_unlock(); + return 0; + +out_undo: + list_for_each_entry_continue_reverse(elem, &nf_hook_list, list) + nf_unregister_net_hook(net, elem); + rtnl_unlock(); + return ret; +} + +static void nf_unregister_hook_list(struct net *net) +{ + struct nf_hook_ops *elem; + + rtnl_lock(); + list_for_each_entry(elem, &nf_hook_list, list) + nf_unregister_net_hook(net, elem); + rtnl_unlock(); +} + 
static int __net_init netfilter_net_init(struct net *net) { + int i, h, ret; + + for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) + INIT_LIST_HEAD(&net->nf.hooks[i][h]); + } + #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", net->proc_net); @@ -306,11 +446,16 @@ static int __net_init netfilter_net_init(struct net *net) return -ENOMEM; } #endif - return 0; + ret = nf_register_hook_list(net); + if (ret) + remove_proc_entry("netfilter", net->proc_net); + + return ret; } static void __net_exit netfilter_net_exit(struct net *net) { + nf_unregister_hook_list(net); remove_proc_entry("netfilter", net->proc_net); } @@ -321,12 +466,7 @@ static struct pernet_operations netfilter_net_ops = { int __init netfilter_init(void) { - int i, h, ret; - - for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) { - for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&nf_hooks[i][h]); - } + int ret; ret = register_pernet_subsys(&netfilter_net_ops); if (ret < 0) -- cgit v0.10.2 From fd2ecda0341960d0ce361d648cf4dd98187afb06 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 10 Jul 2015 18:15:44 -0500 Subject: netfilter: nftables: Only run the nftables chains in the proper netns - Register the nftables chains in the network namespace that they need to run in. - Remove the hacks that stopped chains running in the wrong network namespace. Signed-off-by: "Eric W. 
Biederman" Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cfe6368..4a41eb9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -130,20 +130,24 @@ static void nft_trans_destroy(struct nft_trans *trans) int nft_register_basechain(struct nft_base_chain *basechain, unsigned int hook_nops) { + struct net *net = read_pnet(&basechain->pnet); + if (basechain->flags & NFT_BASECHAIN_DISABLED) return 0; - return nf_register_hooks(basechain->ops, hook_nops); + return nf_register_net_hooks(net, basechain->ops, hook_nops); } EXPORT_SYMBOL_GPL(nft_register_basechain); void nft_unregister_basechain(struct nft_base_chain *basechain, unsigned int hook_nops) { + struct net *net = read_pnet(&basechain->pnet); + if (basechain->flags & NFT_BASECHAIN_DISABLED) return; - nf_unregister_hooks(basechain->ops, hook_nops); + nf_unregister_net_hooks(net, basechain->ops, hook_nops); } EXPORT_SYMBOL_GPL(nft_unregister_basechain); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index f77bad4..05d0b03 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -114,7 +114,6 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) { const struct nft_chain *chain = ops->priv, *basechain = chain; - const struct net *chain_net = read_pnet(&nft_base_chain(basechain)->pnet); const struct net *net = dev_net(pkt->in ? 
pkt->in : pkt->out); const struct nft_rule *rule; const struct nft_expr *expr, *last; @@ -125,10 +124,6 @@ nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) int rulenum; unsigned int gencursor = nft_genmask_cur(net); - /* Ignore chains that are not for the current network namespace */ - if (!net_eq(net, chain_net)) - return NF_ACCEPT; - do_chain: rulenum = 0; rule = list_entry(&chain->rules, struct nft_rule, list); -- cgit v0.10.2 From 98d1bd802cdbc8f56868fae51edec13e86b59515 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:06 +0200 Subject: netfilter: xtables: compute exact size needed for jumpstack The {arp,ip,ip6tables} jump stack is currently sized based on the number of user chains. However, it's rather unlikely that every user defined chain jumps to the next, so let's use the existing loop detection logic to also track the chain depths. The stacksize is then set to the largest chain depth seen. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 92305a1..ae6d0a1 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -372,10 +372,13 @@ static inline bool unconditional(const struct arpt_arp *arp) /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. + * + * Keeps track of largest call depth seen and stores it in newinfo->stacksize. */ -static int mark_source_chains(const struct xt_table_info *newinfo, +static int mark_source_chains(struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { + unsigned int calldepth, max_calldepth = 0; unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset @@ -391,6 +394,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, /* Set initial back pointer. 
*/ e->counters.pcnt = pos; + calldepth = 0; for (;;) { const struct xt_standard_target *t @@ -445,6 +449,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo, (entry0 + pos + size); e->counters.pcnt = pos; pos += size; + if (calldepth > 0) + --calldepth; } else { int newpos = t->verdict; @@ -459,6 +465,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo, return 0; } + if (entry0 + newpos != arpt_next_entry(e) && + ++calldepth > max_calldepth) + max_calldepth = calldepth; + /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); @@ -475,6 +485,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, next: duprintf("Finished chain %u\n", hook); } + newinfo->stacksize = max_calldepth; return 1; } @@ -664,9 +675,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, if (ret != 0) break; ++i; - if (strcmp(arpt_get_target(iter)->u.user.name, - XT_ERROR_TARGET) == 0) - ++newinfo->stacksize; } duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) @@ -1439,9 +1447,6 @@ static int translate_compat_table(const char *name, break; } ++i; - if (strcmp(arpt_get_target(iter1)->u.user.name, - XT_ERROR_TARGET) == 0) - ++newinfo->stacksize; } if (ret) { /* diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 6c72fbb..5e44b35 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -439,11 +439,15 @@ ipt_do_table(struct sk_buff *skb, } /* Figures out from what hook each rule can be called: returns 0 if - there are loops. Puts hook bitmask in comefrom. */ + * there are loops. Puts hook bitmask in comefrom. + * + * Keeps track of largest call depth seen and stores it in newinfo->stacksize. 
+ */ static int -mark_source_chains(const struct xt_table_info *newinfo, +mark_source_chains(struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { + unsigned int calldepth, max_calldepth = 0; unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset @@ -457,6 +461,7 @@ mark_source_chains(const struct xt_table_info *newinfo, /* Set initial back pointer. */ e->counters.pcnt = pos; + calldepth = 0; for (;;) { const struct xt_standard_target *t @@ -518,6 +523,9 @@ mark_source_chains(const struct xt_table_info *newinfo, (entry0 + pos + size); e->counters.pcnt = pos; pos += size; + WARN_ON_ONCE(calldepth == 0); + if (calldepth > 0) + --calldepth; } else { int newpos = t->verdict; @@ -531,9 +539,14 @@ mark_source_chains(const struct xt_table_info *newinfo, newpos); return 0; } + if (entry0 + newpos != ipt_next_entry(e) && + !(e->ip.flags & IPT_F_GOTO) && + ++calldepth > max_calldepth) + max_calldepth = calldepth; + /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u\n", - pos, newpos); + duprintf("Jump rule %u -> %u, calldepth %d\n", + pos, newpos, calldepth); } else { /* ... 
this is a fallthru */ newpos = pos + e->next_offset; @@ -547,6 +560,7 @@ mark_source_chains(const struct xt_table_info *newinfo, next: duprintf("Finished chain %u\n", hook); } + newinfo->stacksize = max_calldepth; return 1; } @@ -826,9 +840,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (ret != 0) return ret; ++i; - if (strcmp(ipt_get_target(iter)->u.user.name, - XT_ERROR_TARGET) == 0) - ++newinfo->stacksize; } if (i != repl->num_entries) { @@ -1744,9 +1755,6 @@ translate_compat_table(struct net *net, if (ret != 0) break; ++i; - if (strcmp(ipt_get_target(iter1)->u.user.name, - XT_ERROR_TARGET) == 0) - ++newinfo->stacksize; } if (ret) { /* diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 3c35ced..baf0321 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -452,11 +452,15 @@ ip6t_do_table(struct sk_buff *skb, } /* Figures out from what hook each rule can be called: returns 0 if - there are loops. Puts hook bitmask in comefrom. */ + * there are loops. Puts hook bitmask in comefrom. + * + * Keeps track of largest call depth seen and stores it in newinfo->stacksize. + */ static int -mark_source_chains(const struct xt_table_info *newinfo, +mark_source_chains(struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { + unsigned int calldepth, max_calldepth = 0; unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset @@ -470,6 +474,7 @@ mark_source_chains(const struct xt_table_info *newinfo, /* Set initial back pointer. 
*/ e->counters.pcnt = pos; + calldepth = 0; for (;;) { const struct xt_standard_target *t @@ -531,6 +536,8 @@ mark_source_chains(const struct xt_table_info *newinfo, (entry0 + pos + size); e->counters.pcnt = pos; pos += size; + if (calldepth > 0) + --calldepth; } else { int newpos = t->verdict; @@ -544,6 +551,11 @@ mark_source_chains(const struct xt_table_info *newinfo, newpos); return 0; } + if (entry0 + newpos != ip6t_next_entry(e) && + !(e->ipv6.flags & IP6T_F_GOTO) && + ++calldepth > max_calldepth) + max_calldepth = calldepth; + /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); @@ -560,6 +572,7 @@ mark_source_chains(const struct xt_table_info *newinfo, next: duprintf("Finished chain %u\n", hook); } + newinfo->stacksize = max_calldepth; return 1; } @@ -839,9 +852,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (ret != 0) return ret; ++i; - if (strcmp(ip6t_get_target(iter)->u.user.name, - XT_ERROR_TARGET) == 0) - ++newinfo->stacksize; } if (i != repl->num_entries) { @@ -1754,9 +1764,6 @@ translate_compat_table(struct net *net, if (ret != 0) break; ++i; - if (strcmp(ip6t_get_target(iter1)->u.user.name, - XT_ERROR_TARGET) == 0) - ++newinfo->stacksize; } if (ret) { /* diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index d324fe7..4db7d60 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -749,6 +749,10 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) if (i->jumpstack == NULL) return -ENOMEM; + /* ruleset without jumps -- no stack needed */ + if (i->stacksize == 0) + return 0; + i->stacksize *= xt_jumpstack_multiplier; size = sizeof(void *) * i->stacksize; for_each_possible_cpu(cpu) { -- cgit v0.10.2 From e7c8899f3e6f2830136cf6e115c4a55ce7a3920a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:07 +0200 Subject: netfilter: move tee_active to core This prepares for a TEE like expression in nftables. 
We want to ensure only one duplicate is sent, so both will use the same percpu variable to detect duplication. The other use case is detection of recursive call to xtables, but since we don't want dependency from nft to xtables core its put into core.c instead of the x_tables core. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 9bbd110..e01da73 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -390,4 +390,15 @@ extern struct nfq_ct_hook __rcu *nfq_ct_hook; static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} #endif +/** + * nf_skb_duplicated - TEE target has sent a packet + * + * When a xtables target sends a packet, the OUTPUT and POSTROUTING + * hooks are traversed again, i.e. nft and xtables are invoked recursively. + * + * This is used by xtables TEE target to prevent the duplicated skb from + * being duplicated again. + */ +DECLARE_PER_CPU(bool, nf_skb_duplicated); + #endif /*__LINUX_NETFILTER_H*/ diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 56ead1a..6896cee 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -34,6 +34,9 @@ EXPORT_SYMBOL(nf_afinfo); const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); +DEFINE_PER_CPU(bool, nf_skb_duplicated); +EXPORT_SYMBOL_GPL(nf_skb_duplicated); + int nf_register_afinfo(const struct nf_afinfo *afinfo) { mutex_lock(&afinfo_mutex); diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index a747eb4..8950e79 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -37,7 +37,6 @@ struct xt_tee_priv { }; static const union nf_inet_addr tee_zero_address; -static DEFINE_PER_CPU(bool, tee_active); static struct net *pick_net(struct sk_buff *skb) { @@ -88,7 +87,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) const struct xt_tee_tginfo *info = par->targinfo; struct iphdr *iph; - if 
(__this_cpu_read(tee_active)) + if (__this_cpu_read(nf_skb_duplicated)) return XT_CONTINUE; /* * Copy the skb, and route the copy. Will later return %XT_CONTINUE for @@ -125,9 +124,9 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) ip_send_check(iph); if (tee_tg_route4(skb, info)) { - __this_cpu_write(tee_active, true); + __this_cpu_write(nf_skb_duplicated, true); ip_local_out(skb); - __this_cpu_write(tee_active, false); + __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); } @@ -170,7 +169,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; - if (__this_cpu_read(tee_active)) + if (__this_cpu_read(nf_skb_duplicated)) return XT_CONTINUE; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) @@ -188,9 +187,9 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) --iph->hop_limit; } if (tee_tg_route6(skb, info)) { - __this_cpu_write(tee_active, true); + __this_cpu_write(nf_skb_duplicated, true); ip6_local_out(skb); - __this_cpu_write(tee_active, false); + __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); } -- cgit v0.10.2 From 7814b6ec6d0d63444abdb49554166c8cfcbd063e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:08 +0200 Subject: netfilter: xtables: don't save/restore jumpstack offset In most cases there is no reentrancy into ip/ip6tables. For skbs sent by REJECT or SYNPROXY targets, there is one level of reentrancy, but it's not relevant as those targets issue an absolute verdict, i.e. the jumpstack can be clobbered since it's not used after the target issues absolute verdict (ACCEPT, DROP, STOLEN, etc). So the only special case where it is relevant is the TEE target, which returns XT_CONTINUE. This patch changes ip(6)_do_table to always use the jump stack starting from 0. 
When we detect we're operating on an skb sent via TEE (percpu nf_skb_duplicated is 1) we switch to an alternate stack to leave the original one alone. Since there is no TEE support for arptables, it doesn't need to test if tee is active. The jump stack overflow tests are no longer needed as well -- since ->stacksize is the largest call depth we cannot exceed it. A much better alternative to the external jumpstack would be to just declare a jumps[32] stack on the local stack frame, but that would mean we'd have to reject iptables rulesets that used to work before. Another alternative would be to start rejecting rulesets with a larger call depth, e.g. 1000 -- in this case it would be feasible to allocate the entire stack in the percpu area which would avoid one dereference. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 286098a..1492845 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -222,7 +222,6 @@ struct xt_table_info { * @stacksize jumps (number of user chains) can possibly be made. */ unsigned int stacksize; - unsigned int __percpu *stackptr; void ***jumpstack; unsigned char entries[0] __aligned(8); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index ae6d0a1..969fdbe 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -280,6 +280,9 @@ unsigned int arpt_do_table(struct sk_buff *skb, table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; + /* No TEE support for arptables, so no need to switch to alternate + * stack. All targets that reenter must return absolute verdicts. 
+ */ e = get_entry(table_base, private->hook_entry[hook]); acpar.in = state->in; @@ -325,11 +328,6 @@ unsigned int arpt_do_table(struct sk_buff *skb, } if (table_base + v != arpt_next_entry(e)) { - - if (stackidx >= private->stacksize) { - verdict = NF_DROP; - break; - } jumpstack[stackidx++] = e; } @@ -337,9 +335,6 @@ unsigned int arpt_do_table(struct sk_buff *skb, continue; } - /* Targets which reenter must return - * abs. verdicts - */ acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 5e44b35..a2e4b01 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -296,12 +296,13 @@ ipt_do_table(struct sk_buff *skb, const char *indev, *outdev; const void *table_base; struct ipt_entry *e, **jumpstack; - unsigned int *stackptr, origptr, cpu; + unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ + stackidx = 0; ip = ip_hdr(skb); indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; @@ -331,13 +332,20 @@ ipt_do_table(struct sk_buff *skb, smp_read_barrier_depends(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; - stackptr = per_cpu_ptr(private->stackptr, cpu); - origptr = *stackptr; + + /* Switch to alternate jumpstack if we're being invoked via TEE. + * TEE issues XT_CONTINUE verdict on original skb so we must not + * clobber the jumpstack. + * + * For recursion via REJECT or SYNPROXY the stack will be clobbered + * but it is no problem since absolute verdict is issued by these. 
+ */ + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); - pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n", - table->name, hook, origptr, + pr_debug("Entering %s(hook %u), UF %p\n", + table->name, hook, get_entry(table_base, private->underflow[hook])); do { @@ -383,28 +391,24 @@ ipt_do_table(struct sk_buff *skb, verdict = (unsigned int)(-v) - 1; break; } - if (*stackptr <= origptr) { + if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); pr_debug("Underflow (this is normal) " "to %p\n", e); } else { - e = jumpstack[--*stackptr]; + e = jumpstack[--stackidx]; pr_debug("Pulled %p out from pos %u\n", - e, *stackptr); + e, stackidx); e = ipt_next_entry(e); } continue; } if (table_base + v != ipt_next_entry(e) && !(e->ip.flags & IPT_F_GOTO)) { - if (*stackptr >= private->stacksize) { - verdict = NF_DROP; - break; - } - jumpstack[(*stackptr)++] = e; + jumpstack[stackidx++] = e; pr_debug("Pushed %p into pos %u\n", - e, *stackptr - 1); + e, stackidx - 1); } e = get_entry(table_base, v); @@ -423,9 +427,8 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - pr_debug("Exiting %s; resetting sp from %u to %u\n", - __func__, *stackptr, origptr); - *stackptr = origptr; + pr_debug("Exiting %s; sp at %u\n", __func__, stackidx); + xt_write_recseq_end(addend); local_bh_enable(); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index baf0321..531281f 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -324,12 +324,13 @@ ip6t_do_table(struct sk_buff *skb, const char *indev, *outdev; const void *table_base; struct ip6t_entry *e, **jumpstack; - unsigned int *stackptr, origptr, cpu; + unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ + stackidx = 0; indev = state->in ? 
state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; /* We handle fragments by dealing with the first fragment as @@ -357,8 +358,15 @@ ip6t_do_table(struct sk_buff *skb, cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; - stackptr = per_cpu_ptr(private->stackptr, cpu); - origptr = *stackptr; + + /* Switch to alternate jumpstack if we're being invoked via TEE. + * TEE issues XT_CONTINUE verdict on original skb so we must not + * clobber the jumpstack. + * + * For recursion via REJECT or SYNPROXY the stack will be clobbered + * but it is no problem since absolute verdict is issued by these. + */ + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); @@ -406,20 +414,16 @@ ip6t_do_table(struct sk_buff *skb, verdict = (unsigned int)(-v) - 1; break; } - if (*stackptr <= origptr) + if (stackidx == 0) e = get_entry(table_base, private->underflow[hook]); else - e = ip6t_next_entry(jumpstack[--*stackptr]); + e = ip6t_next_entry(jumpstack[--stackidx]); continue; } if (table_base + v != ip6t_next_entry(e) && !(e->ipv6.flags & IP6T_F_GOTO)) { - if (*stackptr >= private->stacksize) { - verdict = NF_DROP; - break; - } - jumpstack[(*stackptr)++] = e; + jumpstack[stackidx++] = e; } e = get_entry(table_base, v); @@ -437,8 +441,6 @@ ip6t_do_table(struct sk_buff *skb, break; } while (!acpar.hotdrop); - *stackptr = origptr; - xt_write_recseq_end(addend); local_bh_enable(); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 4db7d60..154447e 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -67,9 +67,6 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = { [NFPROTO_IPV6] = "ip6", }; -/* Allow this many total (re)entries. */ -static const unsigned int xt_jumpstack_multiplier = 2; - /* Registration hooks for targets. 
*/ int xt_register_target(struct xt_target *target) { @@ -688,8 +685,6 @@ void xt_free_table_info(struct xt_table_info *info) kvfree(info->jumpstack); } - free_percpu(info->stackptr); - kvfree(info); } EXPORT_SYMBOL(xt_free_table_info); @@ -737,10 +732,6 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) unsigned int size; int cpu; - i->stackptr = alloc_percpu(unsigned int); - if (i->stackptr == NULL) - return -ENOMEM; - size = sizeof(void **) * nr_cpu_ids; if (size > PAGE_SIZE) i->jumpstack = vzalloc(size); @@ -753,8 +744,17 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) if (i->stacksize == 0) return 0; - i->stacksize *= xt_jumpstack_multiplier; - size = sizeof(void *) * i->stacksize; + /* Jumpstack needs to be able to record two full callchains, one + * from the first rule set traversal, plus one table reentrancy + * via -j TEE without clobbering the callchain that brought us to + * TEE target. + * + * This is done by allocating two jumpstacks per cpu, on reentry + * the upper half of the stack is used. + * + * see the jumpstack setup in ipt_do_table() for more details. + */ + size = sizeof(void *) * i->stacksize * 2u; for_each_possible_cpu(cpu) { if (size > PAGE_SIZE) i->jumpstack[cpu] = vmalloc_node(size, -- cgit v0.10.2 From dcebd3153e0a7749bb054ab73fa4e1ca33e9d3f9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:09 +0200 Subject: netfilter: add and use jump label for xt_tee Don't bother testing if we need to switch to alternate stack unless TEE target is used. 
Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 1492845..b006b71 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -3,6 +3,7 @@ #include <linux/netdevice.h> +#include <linux/static_key.h> #include <uapi/linux/netfilter/x_tables.h> /** @@ -280,6 +281,12 @@ void xt_free_table_info(struct xt_table_info *info); */ DECLARE_PER_CPU(seqcount_t, xt_recseq); +/* xt_tee_enabled - true if x_tables needs to handle reentrancy + * + * Enabled if current ip(6)tables ruleset has at least one -j TEE rule. + */ +extern struct static_key xt_tee_enabled; + /** * xt_write_recseq_begin - start of a write section * diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index a2e4b01..ff585bd 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -340,7 +340,8 @@ ipt_do_table(struct sk_buff *skb, * For recursion via REJECT or SYNPROXY the stack will be clobbered * but it is no problem since absolute verdict is issued by these. */ - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + if (static_key_false(&xt_tee_enabled)) + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 531281f..ea6d105 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -366,7 +366,8 @@ ip6t_do_table(struct sk_buff *skb, * For recursion via REJECT or SYNPROXY the stack will be clobbered * but it is no problem since absolute verdict is issued by these. 
*/ - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + if (static_key_false(&xt_tee_enabled)) + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 154447e..9b42b5e 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -727,6 +727,9 @@ EXPORT_SYMBOL_GPL(xt_compat_unlock); DEFINE_PER_CPU(seqcount_t, xt_recseq); EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); +struct static_key xt_tee_enabled __read_mostly; +EXPORT_SYMBOL_GPL(xt_tee_enabled); + static int xt_jumpstack_alloc(struct xt_table_info *i) { unsigned int size; diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 8950e79..c5d6556 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -251,6 +251,7 @@ static int tee_tg_check(const struct xt_tgchk_param *par) } else info->priv = NULL; + static_key_slow_inc(&xt_tee_enabled); return 0; } @@ -262,6 +263,7 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par) unregister_netdevice_notifier(&info->priv->notifier); kfree(info->priv); } + static_key_slow_dec(&xt_tee_enabled); } static struct xt_target tee_tg_reg[] __read_mostly = { -- cgit v0.10.2 From 6c7941dee9c41d6ab5a8be06ec44aa579a6123e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:10 +0200 Subject: netfilter: xtables: remove __pure annotation sparse complains: ip_tables.c:361:27: warning: incorrect type in assignment (different modifiers) ip_tables.c:361:27: expected struct ipt_entry *[assigned] e ip_tables.c:361:27: got struct ipt_entry [pure] * doesn't change generated code. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 969fdbe..c416cb3 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -240,7 +240,7 @@ get_entry(const void *base, unsigned int offset) return (struct arpt_entry *)(base + offset); } -static inline __pure +static inline struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) { return (void *)entry + entry->next_offset; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index ff585bd..787f99e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -276,7 +276,7 @@ static void trace_packet(const struct sk_buff *skb, } #endif -static inline __pure +static inline struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) { return (void *)entry + entry->next_offset; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index ea6d105..4e21f80 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -305,7 +305,7 @@ static void trace_packet(const struct sk_buff *skb, } #endif -static inline __pure struct ip6t_entry * +static inline struct ip6t_entry * ip6t_next_entry(const struct ip6t_entry *entry) { return (void *)entry + entry->next_offset; -- cgit v0.10.2 From e317fa505dcdfa25f0e4c888f991eb7fd1562e1e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 18 Jul 2015 10:21:14 -0500 Subject: netfilter: Fix memory leak in nf_register_net_hook In the rare case when an attempt is made to use a per network device netfilter hook and the network device does not exist, the newly allocated structure can leak. Be a good citizen and free the newly allocated structure in the error handling code. Fixes: 085db2c04557 ("netfilter: Per network namespace netfilter hooks.") Reported-by: kbuild@01.org Reported-by: Dan Carpenter Signed-off-by: "Eric W. 
Biederman" Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 6896cee..87d237d 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -96,8 +96,10 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) new->priority = reg->priority; nf_hook_list = find_nf_hook_list(net, reg); - if (!nf_hook_list) + if (!nf_hook_list) { + kfree(new); return -ENOENT; + } mutex_lock(&nf_hook_mutex); list_for_each_entry(elem, nf_hook_list, list) { -- cgit v0.10.2 From 2385eb0c5fbcb4316d3490b3affba8e15efc7eb8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 20 Jul 2015 12:55:02 +0200 Subject: netfilter: nf_queue: fix nf_queue_nf_hook_drop() This function reacquires the rtnl_lock() which is already held by nf_unregister_hook(). This can be triggered via: modprobe nf_conntrack_ipv4 && rmmod nf_conntrack_ipv4 [ 720.628746] INFO: task rmmod:3578 blocked for more than 120 seconds. [ 720.628749] Not tainted 4.2.0-rc2+ #113 [ 720.628752] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 720.628754] rmmod D ffff8800ca46fd58 0 3578 3571 0x00000080 [...] [ 720.628783] Call Trace: [ 720.628790] [] schedule+0x6b/0x90 [ 720.628795] [] schedule_preempt_disabled+0x13/0x20 [ 720.628799] [] mutex_lock_nested+0x1f5/0x380 [ 720.628803] [] ? rtnl_lock+0x12/0x20 [ 720.628807] [] ? rtnl_lock+0x12/0x20 [ 720.628812] [] rtnl_lock+0x12/0x20 [ 720.628817] [] nf_queue_nf_hook_drop+0x15/0x160 [ 720.628825] [] nf_unregister_net_hook+0x168/0x190 [ 720.628831] [] nf_unregister_hook+0x64/0x80 [ 720.628837] [] nf_unregister_hooks+0x20/0x30 [...] Moreover, nf_unregister_net_hook() should only destroy the queue for this netns, not for every netns. Reported-by: Fengguang Wu Fixes: 085db2c04557 ("netfilter: Per network namespace netfilter hooks.") Signed-off-by: Pablo Neira Ayuso Acked-by: "Eric W. 
Biederman" diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 87d237d..12504fbb 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -154,7 +154,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); - nf_queue_nf_hook_drop(elem); + nf_queue_nf_hook_drop(net, elem); kfree(elem); } EXPORT_SYMBOL(nf_unregister_net_hook); diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 3992106..0655225 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -19,7 +19,7 @@ unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, /* nf_queue.c */ int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, struct nf_hook_state *state, unsigned int queuenum); -void nf_queue_nf_hook_drop(struct nf_hook_ops *ops); +void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops); int __init netfilter_queue_init(void); /* nf_log.c */ diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 8a8b2ab..96777f9 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -105,21 +105,15 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); -void nf_queue_nf_hook_drop(struct nf_hook_ops *ops) +void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) { const struct nf_queue_handler *qh; - struct net *net; - rtnl_lock(); rcu_read_lock(); qh = rcu_dereference(queue_handler); - if (qh) { - for_each_net(net) { - qh->nf_hook_drop(net, ops); - } - } + if (qh) + qh->nf_hook_drop(net, ops); rcu_read_unlock(); - rtnl_unlock(); } /* -- cgit v0.10.2 From 7181ebafd4306c9328fa1cd0ead69afa397ffe75 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 20 Jul 2015 09:31:25 +0200 Subject: netfilter: fix possible removal of wrong hook nf_unregister_net_hook() uses the nf_hook_ops fields as tuple to look up for the 
corresponding hook in the list. However, we may have two hooks with exactly the same configuration. This shouldn't be a problem for nftables since every new chain has an unique priv field set, but this may still cause us problems in the future, so better address this problem now by keeping a reference to the original nf_hook_ops structure to make sure we delete the right hook from nf_unregister_net_hook(). Fixes: 085db2c04557 ("netfilter: Per network namespace netfilter hooks.") Signed-off-by: Pablo Neira Ayuso Acked-by: "Eric W. Biederman" diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 12504fbb..0ecb2b5 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -78,26 +78,27 @@ static struct list_head *find_nf_hook_list(struct net *net, return nf_hook_list; } +struct nf_hook_entry { + const struct nf_hook_ops *orig_ops; + struct nf_hook_ops ops; +}; + int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { struct list_head *nf_hook_list; - struct nf_hook_ops *elem, *new; + struct nf_hook_entry *entry; + struct nf_hook_ops *elem; - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (!new) + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) return -ENOMEM; - new->hook = reg->hook; - new->dev = reg->dev; - new->owner = reg->owner; - new->priv = reg->priv; - new->pf = reg->pf; - new->hooknum = reg->hooknum; - new->priority = reg->priority; + entry->orig_ops = reg; + entry->ops = *reg; nf_hook_list = find_nf_hook_list(net, reg); if (!nf_hook_list) { - kfree(new); + kfree(entry); return -ENOENT; } @@ -106,7 +107,7 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) if (reg->priority < elem->priority) break; } - list_add_rcu(&new->list, elem->list.prev); + list_add_rcu(&entry->ops.list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) @@ -122,6 +123,7 @@ EXPORT_SYMBOL(nf_register_net_hook); void 
nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { struct list_head *nf_hook_list; + struct nf_hook_entry *entry; struct nf_hook_ops *elem; nf_hook_list = find_nf_hook_list(net, reg); @@ -130,14 +132,9 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) mutex_lock(&nf_hook_mutex); list_for_each_entry(elem, nf_hook_list, list) { - if ((reg->hook == elem->hook) && - (reg->dev == elem->dev) && - (reg->owner == elem->owner) && - (reg->priv == elem->priv) && - (reg->pf == elem->pf) && - (reg->hooknum == elem->hooknum) && - (reg->priority == elem->priority)) { - list_del_rcu(&elem->list); + entry = container_of(elem, struct nf_hook_entry, ops); + if (entry->orig_ops == reg) { + list_del_rcu(&entry->ops.list); break; } } @@ -154,8 +151,8 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); - nf_queue_nf_hook_drop(net, elem); - kfree(elem); + nf_queue_nf_hook_drop(net, &entry->ops); + kfree(entry); } EXPORT_SYMBOL(nf_unregister_net_hook); -- cgit v0.10.2 From 3bbd14e0a2e3a988b1b5fe702a2539bd8d0ec622 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 20 Jul 2015 13:32:52 +0200 Subject: netfilter: rename local nf_hook_list to hook_list 085db2c04557 ("netfilter: Per network namespace netfilter hooks.") introduced a new nf_hook_list that is global, so let's avoid this overlap. Signed-off-by: Pablo Neira Ayuso Acked-by: "Eric W. 
Biederman" diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index e01da73..d788ce6 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -140,20 +140,20 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg); #ifdef HAVE_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; -static inline bool nf_hook_list_active(struct list_head *nf_hook_list, +static inline bool nf_hook_list_active(struct list_head *hook_list, u_int8_t pf, unsigned int hook) { if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) return static_key_false(&nf_hooks_needed[pf][hook]); - return !list_empty(nf_hook_list); + return !list_empty(hook_list); } #else -static inline bool nf_hook_list_active(struct list_head *nf_hook_list, +static inline bool nf_hook_list_active(struct list_head *hook_list, u_int8_t pf, unsigned int hook) { - return !list_empty(nf_hook_list); + return !list_empty(hook_list); } #endif @@ -175,12 +175,12 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, int thresh) { struct net *net = dev_net(indev ? 
indev : outdev); - struct list_head *nf_hook_list = &net->nf.hooks[pf][hook]; + struct list_head *hook_list = &net->nf.hooks[pf][hook]; - if (nf_hook_list_active(nf_hook_list, pf, hook)) { + if (nf_hook_list_active(hook_list, pf, hook)) { struct nf_hook_state state; - nf_hook_state_init(&state, nf_hook_list, hook, thresh, + nf_hook_state_init(&state, hook_list, hook, thresh, pf, indev, outdev, sk, okfn); return nf_hook_slow(skb, &state); } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 0ecb2b5..2a5a070 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -62,20 +62,20 @@ EXPORT_SYMBOL(nf_hooks_needed); static DEFINE_MUTEX(nf_hook_mutex); -static struct list_head *find_nf_hook_list(struct net *net, +static struct list_head *nf_find_hook_list(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *nf_hook_list = NULL; + struct list_head *hook_list = NULL; if (reg->pf != NFPROTO_NETDEV) - nf_hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; + hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; else if (reg->hooknum == NF_NETDEV_INGRESS) { #ifdef CONFIG_NETFILTER_INGRESS if (reg->dev && dev_net(reg->dev) == net) - nf_hook_list = ®->dev->nf_hooks_ingress; + hook_list = ®->dev->nf_hooks_ingress; #endif } - return nf_hook_list; + return hook_list; } struct nf_hook_entry { @@ -85,7 +85,7 @@ struct nf_hook_entry { int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *nf_hook_list; + struct list_head *hook_list; struct nf_hook_entry *entry; struct nf_hook_ops *elem; @@ -96,14 +96,14 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) entry->orig_ops = reg; entry->ops = *reg; - nf_hook_list = find_nf_hook_list(net, reg); - if (!nf_hook_list) { + hook_list = nf_find_hook_list(net, reg); + if (!hook_list) { kfree(entry); return -ENOENT; } mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, nf_hook_list, list) { + list_for_each_entry(elem, hook_list, list) { if 
(reg->priority < elem->priority) break; } @@ -122,16 +122,16 @@ EXPORT_SYMBOL(nf_register_net_hook); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *nf_hook_list; + struct list_head *hook_list; struct nf_hook_entry *entry; struct nf_hook_ops *elem; - nf_hook_list = find_nf_hook_list(net, reg); - if (!nf_hook_list) + hook_list = nf_find_hook_list(net, reg); + if (!hook_list) return; mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, nf_hook_list, list) { + list_for_each_entry(elem, hook_list, list) { entry = container_of(elem, struct nf_hook_entry, ops); if (entry->orig_ops == reg) { list_del_rcu(&entry->ops.list); @@ -139,7 +139,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) } } mutex_unlock(&nf_hook_mutex); - if (&elem->list == nf_hook_list) { + if (&elem->list == hook_list) { WARN(1, "nf_unregister_net_hook: hook not found!\n"); return; } -- cgit v0.10.2 From d7ee3519042798be6224e97f259ed47a63da4620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Fri, 17 Jul 2015 16:17:56 +0200 Subject: netfilter: nf_ct_sctp: minimal multihoming support Currently nf_conntrack_proto_sctp module handles only packets between primary addresses used to establish the connection. Any packets between secondary addresses are classified as invalid so that usual firewall configurations drop them. Allowing HEARTBEAT and HEARTBEAT-ACK chunks to establish a new conntrack would allow traffic between secondary addresses to pass through. A more sophisticated solution based on the addresses advertised in the initial handshake (and possibly also later dynamic address addition and removal) would be much harder to implement. Moreover, in general we cannot assume to always see the initial handshake as it can be routed through a different path. 
The patch adds two new conntrack states: SCTP_CONNTRACK_HEARTBEAT_SENT - a HEARTBEAT chunk seen but not acked SCTP_CONNTRACK_HEARTBEAT_ACKED - a HEARTBEAT acked by HEARTBEAT-ACK State transition rules: - HEARTBEAT_SENT responds to usual chunks the same way as NONE (so that the behaviour changes as little as possible) - HEARTBEAT_ACKED responds to usual chunks the same way as ESTABLISHED does, except the resulting state is HEARTBEAT_ACKED rather than ESTABLISHED - previously existing states except NONE are preserved when HEARTBEAT or HEARTBEAT-ACK is seen - NONE (in the initial direction) changes to HEARTBEAT_SENT on HEARTBEAT and to CLOSED on HEARTBEAT-ACK - HEARTBEAT_SENT changes to HEARTBEAT_ACKED on HEARTBEAT-ACK in the reply direction - HEARTBEAT_SENT and HEARTBEAT_ACKED are preserved on HEARTBEAT and HEARTBEAT-ACK otherwise Normally, vtag is set from the INIT chunk for the reply direction and from the INIT-ACK chunk for the originating direction (i.e. each of these defines vtag value for the opposite direction). For secondary conntracks, we can't rely on seeing INIT/INIT-ACK and even if we have seen them, we would need to connect two different conntracks. Therefore simplified logic is applied: vtag of first packet in each direction (HEARTBEAT in the originating and HEARTBEAT-ACK in reply direction) is saved and all following packets in that direction are compared with this saved value. While INIT and INIT-ACK define vtag for the opposite direction, vtags extracted from HEARTBEAT and HEARTBEAT-ACK are always for their direction. Default timeout values for new states are HEARTBEAT_SENT: 30 seconds (default hb_interval) HEARTBEAT_ACKED: 210 seconds (hb_interval * path_max_retry + max_rto) (We cannot expect to see the shutdown sequence so that, unlike ESTABLISHED, the HEARTBEAT_ACKED timeout shouldn't be too long.) 
Signed-off-by: Michal Kubecek Signed-off-by: Pablo Neira Ayuso diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h index ceeefe6..ed4e776 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h +++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h @@ -13,6 +13,8 @@ enum sctp_conntrack { SCTP_CONNTRACK_SHUTDOWN_SENT, SCTP_CONNTRACK_SHUTDOWN_RECD, SCTP_CONNTRACK_SHUTDOWN_ACK_SENT, + SCTP_CONNTRACK_HEARTBEAT_SENT, + SCTP_CONNTRACK_HEARTBEAT_ACKED, SCTP_CONNTRACK_MAX }; diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h index 1ab0b97..f2c10dc 100644 --- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h +++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h @@ -92,6 +92,8 @@ enum ctattr_timeout_sctp { CTA_TIMEOUT_SCTP_SHUTDOWN_SENT, CTA_TIMEOUT_SCTP_SHUTDOWN_RECD, CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, + CTA_TIMEOUT_SCTP_HEARTBEAT_SENT, + CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED, __CTA_TIMEOUT_SCTP_MAX }; #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index b45da90..6719773 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -42,6 +42,8 @@ static const char *const sctp_conntrack_names[] = { "SHUTDOWN_SENT", "SHUTDOWN_RECD", "SHUTDOWN_ACK_SENT", + "HEARTBEAT_SENT", + "HEARTBEAT_ACKED", }; #define SECS * HZ @@ -57,6 +59,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, + [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, + [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, }; #define sNO SCTP_CONNTRACK_NONE @@ -67,6 +71,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT 
#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sHS SCTP_CONNTRACK_HEARTBEAT_SENT +#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED #define sIV SCTP_CONNTRACK_MAX /* @@ -88,6 +94,10 @@ SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. +HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. +HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to + that of the HEARTBEAT chunk. Secondary connection is + established. */ /* TODO @@ -97,36 +107,40 @@ CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of - Check the error type in the reply dir before transitioning from cookie echoed to closed. - Sec 5.2.4 of RFC 2960 - - Multi Homing support. + - Full Multi Homing support. */ /* SCTP conntrack state transitions */ -static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { +static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, 
sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA}, +/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, +/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA} }, { /* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, 
sCL, sIV, sHA}, +/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, +/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA} } }; @@ -278,9 +292,16 @@ static int sctp_new_state(enum ip_conntrack_dir dir, pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); i = 8; break; + case SCTP_CID_HEARTBEAT: + pr_debug("SCTP_CID_HEARTBEAT"); + i = 9; + break; + case SCTP_CID_HEARTBEAT_ACK: + pr_debug("SCTP_CID_HEARTBEAT_ACK"); + i = 10; + break; default: - /* Other chunks like DATA, SACK, HEARTBEAT and - its ACK do not cause a change in state */ + /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type, Will stay in %s\n", sctp_conntrack_names[cur_state]); return cur_state; @@ -329,6 +350,8 @@ static int sctp_packet(struct nf_conn *ct, !test_bit(SCTP_CID_COOKIE_ECHO, map) && !test_bit(SCTP_CID_ABORT, map) && !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + !test_bit(SCTP_CID_HEARTBEAT, map) && + !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { pr_debug("Verification tag check failed\n"); goto out; @@ -357,6 +380,16 @@ static int sctp_packet(struct nf_conn *ct, /* Sec 8.5.1 (D) */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; + } else if (sch->type == SCTP_CID_HEARTBEAT || + sch->type == SCTP_CID_HEARTBEAT_ACK) { + if (ct->proto.sctp.vtag[dir] == 0) { + pr_debug("Setting vtag %x for dir %d\n", + sh->vtag, dir); + ct->proto.sctp.vtag[dir] = sh->vtag; + } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { + pr_debug("Verification tag check failed\n"); + goto out_unlock; + } } old_state = ct->proto.sctp.state; @@ -466,6 +499,10 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, /* Sec 8.5.1 (A) */ return false; } + } else if (sch->type == SCTP_CID_HEARTBEAT) { + pr_debug("Setting vtag %x for secondary conntrack\n", + sh->vtag); + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; } /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise 
an ABORT Sec 8.4 (5) and (8) */ @@ -610,6 +647,8 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ @@ -658,6 +697,18 @@ static struct ctl_table sctp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "nf_conntrack_sctp_timeout_heartbeat_sent", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_heartbeat_acked", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, { } }; @@ -730,6 +781,8 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; + pn->ctl_table[7].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_SENT]; + pn->ctl_table[8].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_ACKED]; #endif return 0; } -- cgit v0.10.2 From 72b1e5e4cac72efa6b739b47e41f53e4520b4194 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 23 Jul 2015 16:21:30 +0200 Subject: netfilter: bridge: reduce nf_bridge_info to 32 bytes again We can use union for most of the temporary cruft (original ipv4/ipv6 address, source mac, physoutdev) since they're used during different stages of br netfilter traversal. Also get rid of the last two ->mask users. Shrinks struct from 48 to 32 on 64bit arch. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 6d80fc6..2437b8a 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -17,9 +17,6 @@ enum nf_br_hook_priorities { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -#define BRNF_BRIDGED_DNAT 0x02 -#define BRNF_NF_BRIDGE_PREROUTING 0x08 - int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb); static inline void br_drop_fake_rtable(struct sk_buff *skb) @@ -63,8 +60,17 @@ nf_bridge_get_physoutdev(const struct sk_buff *skb) { return skb->nf_bridge ? skb->nf_bridge->physoutdev : NULL; } + +static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) +{ + return skb->nf_bridge && skb->nf_bridge->in_prerouting; +} #else #define br_drop_fake_rtable(skb) do { } while (0) +static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) +{ + return false; +} #endif /* CONFIG_BRIDGE_NETFILTER */ #endif diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d6cdd6e..ac732e6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -173,17 +173,24 @@ struct nf_bridge_info { BRNF_PROTO_8021Q, BRNF_PROTO_PPPOE } orig_proto:8; - bool pkt_otherhost; + u8 pkt_otherhost:1; + u8 in_prerouting:1; + u8 bridged_dnat:1; __u16 frag_max_size; - unsigned int mask; struct net_device *physindev; union { - struct net_device *physoutdev; - char neigh_header[8]; - }; - union { + /* prerouting: detect dnat in orig/reply direction */ __be32 ipv4_daddr; struct in6_addr ipv6_daddr; + + /* after prerouting + nat detected: store original source + * mac since neigh resolution overwrites it, only used while + * skb is out in neigh layer. 
+ */ + char neigh_header[8]; + + /* always valid & non-NULL from FORWARD on, for physdev match */ + struct net_device *physoutdev; }; }; #endif diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index c8b9bcf..ec51c2b 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -284,7 +284,7 @@ int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb) nf_bridge->neigh_header, ETH_HLEN-ETH_ALEN); /* tell br_dev_xmit to continue with forwarding */ - nf_bridge->mask |= BRNF_BRIDGED_DNAT; + nf_bridge->bridged_dnat = 1; /* FIXME Need to refragment */ ret = neigh->output(neigh, skb); } @@ -356,7 +356,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } - nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->in_prerouting = 0; if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -444,7 +444,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb) nf_bridge->pkt_otherhost = true; } - nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->in_prerouting = 1; nf_bridge->physindev = skb->dev; skb->dev = brnf_get_logical_dev(skb, skb->dev); @@ -850,10 +850,8 @@ static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && - !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) { + if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) return NF_STOP; - } return NF_ACCEPT; } @@ -872,7 +870,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); skb_pull(skb, ETH_HLEN); - nf_bridge->mask &= ~BRNF_BRIDGED_DNAT; + nf_bridge->bridged_dnat = 0; BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN)); @@ 
-887,7 +885,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) static int br_nf_dev_xmit(struct sk_buff *skb) { - if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { + if (skb->nf_bridge && skb->nf_bridge->bridged_dnat) { br_nf_pre_routing_finish_bridge_slow(skb); return 1; } diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 13b7d1e..77383bf 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -174,7 +174,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } - nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->in_prerouting = 0; if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) { skb_dst_drop(skb); v6ops->route_input(skb); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index c88b7d4..b69e82b 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -49,12 +49,9 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, if (skb->nfct) zone = nf_ct_zone((struct nf_conn *)skb->nfct); #endif - -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge && - skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + if (nf_bridge_in_prerouting(skb)) return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; -#endif + if (hooknum == NF_INET_PRE_ROUTING) return IP_DEFRAG_CONNTRACK_IN + zone; else diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index a45db0b..267fb8d 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -39,12 +39,9 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, if (skb->nfct) zone = nf_ct_zone((struct nf_conn *)skb->nfct); #endif - -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge && - skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + if 
(nf_bridge_in_prerouting(skb)) return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; -#endif + if (hooknum == NF_INET_PRE_ROUTING) return IP6_DEFRAG_CONNTRACK_IN + zone; else -- cgit v0.10.2 From f4b3eee727e876d625cfe3585af48f4983c435d7 Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Thu, 30 Jul 2015 06:06:12 +0200 Subject: netfilter: bridge: do not initialize statics to 0 or NULL Fix checkpatch.pl "ERROR: do not initialise statics to 0 or NULL" for all statics explicitly initialized to 0. Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index ec51c2b..0a6f095 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -49,9 +49,9 @@ static struct ctl_table_header *brnf_sysctl_header; static int brnf_call_iptables __read_mostly = 1; static int brnf_call_ip6tables __read_mostly = 1; static int brnf_call_arptables __read_mostly = 1; -static int brnf_filter_vlan_tagged __read_mostly = 0; -static int brnf_filter_pppoe_tagged __read_mostly = 0; -static int brnf_pass_vlan_indev __read_mostly = 0; +static int brnf_filter_vlan_tagged __read_mostly; +static int brnf_filter_pppoe_tagged __read_mostly; +static int brnf_pass_vlan_indev __read_mostly; #else #define brnf_call_iptables 1 #define brnf_call_ip6tables 1 -- cgit v0.10.2 From a6cd379b4d68867295ea35a719008e86d7a2ee9f Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 30 Jul 2015 16:53:45 +0000 Subject: netfilter: ip6t_REJECT: Remove debug messages from reject_tg6() Make it similar to reject_tg() in ipt_REJECT. 
Suggested-by: Pablo Neira Ayuso Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: Pablo Neira Ayuso diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 12331ef..567367a 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -35,14 +35,12 @@ MODULE_AUTHOR("Yasuyuki KOZAKAI "); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6"); MODULE_LICENSE("GPL"); - static unsigned int reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_reject_info *reject = par->targinfo; struct net *net = dev_net((par->in != NULL) ? par->in : par->out); - pr_debug("%s: medium point\n", __func__); switch (reject->with) { case IP6T_ICMP6_NO_ROUTE: nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum); @@ -65,9 +63,6 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) case IP6T_TCP_RESET: nf_send_reset6(net, skb, par->hooknum); break; - default: - net_info_ratelimited("case %u not handled yet\n", reject->with); - break; } return NF_DROP; -- cgit v0.10.2