From 9076aea76538556224e7d73ab718f8841330818a Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 27 Nov 2012 09:58:09 +0000 Subject: netfilter: ipset: Increase the number of maximal sets automatically The max number of sets was hardcoded at kernel configuration time and could only be modified via a module parameter. The patch adds support for increasing the max number of sets automatically, as needed. The array of sets is incremented by 64 new slots if we run out of empty slots. The maximal number of sets is limited to 65534. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index fed899f..6d6d8f2 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -28,9 +28,10 @@ static LIST_HEAD(ip_set_type_list); /* all registered set types */ static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */ static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ -static struct ip_set **ip_set_list; /* all individual sets */ +static struct ip_set * __rcu *ip_set_list; /* all individual sets */ static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */ +#define IP_SET_INC 64 #define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) static unsigned int max_sets; @@ -42,6 +43,12 @@ MODULE_AUTHOR("Jozsef Kadlecsik "); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); +/* When the nfnl mutex is held: */ +#define nfnl_dereference(p) \ + rcu_dereference_protected(p, 1) +#define nfnl_set(id) \ + nfnl_dereference(ip_set_list)[id] + /* * The set types are implemented in modules and registered set types * can be found in ip_set_type_list. 
Adding/deleting types is @@ -321,19 +328,19 @@ EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); */ static inline void -__ip_set_get(ip_set_id_t index) +__ip_set_get(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); - ip_set_list[index]->ref++; + set->ref++; write_unlock_bh(&ip_set_ref_lock); } static inline void -__ip_set_put(ip_set_id_t index) +__ip_set_put(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); - BUG_ON(ip_set_list[index]->ref == 0); - ip_set_list[index]->ref--; + BUG_ON(set->ref == 0); + set->ref--; write_unlock_bh(&ip_set_ref_lock); } @@ -344,12 +351,25 @@ __ip_set_put(ip_set_id_t index) * so it can't be destroyed (or changed) under our foot. */ +static inline struct ip_set * +ip_set_rcu_get(ip_set_id_t index) +{ + struct ip_set *set; + + rcu_read_lock(); + /* ip_set_list itself needs to be protected */ + set = rcu_dereference(ip_set_list)[index]; + rcu_read_unlock(); + + return set; +} + int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, const struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_list[index]; + struct ip_set *set = ip_set_rcu_get(index); int ret = 0; BUG_ON(set == NULL); @@ -388,7 +408,7 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, const struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_list[index]; + struct ip_set *set = ip_set_rcu_get(index); int ret; BUG_ON(set == NULL); @@ -411,7 +431,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, const struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_list[index]; + struct ip_set *set = ip_set_rcu_get(index); int ret = 0; BUG_ON(set == NULL); @@ -440,14 +460,17 @@ ip_set_get_byname(const char *name, struct ip_set **set) ip_set_id_t i, index = IPSET_INVALID_ID; struct ip_set *s; + rcu_read_lock(); for (i = 0; i < ip_set_max; i++) { - s = ip_set_list[i]; + s = rcu_dereference(ip_set_list)[i]; if (s != NULL && STREQ(s->name, 
name)) { - __ip_set_get(i); + __ip_set_get(s); index = i; *set = s; + break; } } + rcu_read_unlock(); return index; } @@ -462,8 +485,13 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname); void ip_set_put_byindex(ip_set_id_t index) { - if (ip_set_list[index] != NULL) - __ip_set_put(index); + struct ip_set *set; + + rcu_read_lock(); + set = rcu_dereference(ip_set_list)[index]; + if (set != NULL) + __ip_set_put(set); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ip_set_put_byindex); @@ -477,7 +505,7 @@ EXPORT_SYMBOL_GPL(ip_set_put_byindex); const char * ip_set_name_byindex(ip_set_id_t index) { - const struct ip_set *set = ip_set_list[index]; + const struct ip_set *set = ip_set_rcu_get(index); BUG_ON(set == NULL); BUG_ON(set->ref == 0); @@ -501,11 +529,18 @@ EXPORT_SYMBOL_GPL(ip_set_name_byindex); ip_set_id_t ip_set_nfnl_get(const char *name) { + ip_set_id_t i, index = IPSET_INVALID_ID; struct ip_set *s; - ip_set_id_t index; nfnl_lock(); - index = ip_set_get_byname(name, &s); + for (i = 0; i < ip_set_max; i++) { + s = nfnl_set(i); + if (s != NULL && STREQ(s->name, name)) { + __ip_set_get(s); + index = i; + break; + } + } nfnl_unlock(); return index; @@ -521,12 +556,15 @@ EXPORT_SYMBOL_GPL(ip_set_nfnl_get); ip_set_id_t ip_set_nfnl_get_byindex(ip_set_id_t index) { + struct ip_set *set; + if (index > ip_set_max) return IPSET_INVALID_ID; nfnl_lock(); - if (ip_set_list[index]) - __ip_set_get(index); + set = nfnl_set(index); + if (set) + __ip_set_get(set); else index = IPSET_INVALID_ID; nfnl_unlock(); @@ -545,8 +583,11 @@ EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex); void ip_set_nfnl_put(ip_set_id_t index) { + struct ip_set *set; nfnl_lock(); - ip_set_put_byindex(index); + set = nfnl_set(index); + if (set != NULL) + __ip_set_put(set); nfnl_unlock(); } EXPORT_SYMBOL_GPL(ip_set_nfnl_put); @@ -603,41 +644,46 @@ static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, }; -static ip_set_id_t -find_set_id(const char *name) +static 
struct ip_set * +find_set_and_id(const char *name, ip_set_id_t *id) { - ip_set_id_t i, index = IPSET_INVALID_ID; - const struct ip_set *set; + struct ip_set *set = NULL; + ip_set_id_t i; - for (i = 0; index == IPSET_INVALID_ID && i < ip_set_max; i++) { - set = ip_set_list[i]; - if (set != NULL && STREQ(set->name, name)) - index = i; + *id = IPSET_INVALID_ID; + for (i = 0; i < ip_set_max; i++) { + set = nfnl_set(i); + if (set != NULL && STREQ(set->name, name)) { + *id = i; + break; + } } - return index; + return (*id == IPSET_INVALID_ID ? NULL : set); } static inline struct ip_set * find_set(const char *name) { - ip_set_id_t index = find_set_id(name); + ip_set_id_t id; - return index == IPSET_INVALID_ID ? NULL : ip_set_list[index]; + return find_set_and_id(name, &id); } static int find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set) { + struct ip_set *s; ip_set_id_t i; *index = IPSET_INVALID_ID; for (i = 0; i < ip_set_max; i++) { - if (ip_set_list[i] == NULL) { + s = nfnl_set(i); + if (s == NULL) { if (*index == IPSET_INVALID_ID) *index = i; - } else if (STREQ(name, ip_set_list[i]->name)) { + } else if (STREQ(name, s->name)) { /* Name clash */ - *set = ip_set_list[i]; + *set = s; return -EEXIST; } } @@ -730,10 +776,9 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, * and check clashing. 
*/ ret = find_free_id(set->name, &index, &clash); - if (ret != 0) { + if (ret == -EEXIST) { /* If this is the same set and requested, ignore error */ - if (ret == -EEXIST && - (flags & IPSET_FLAG_EXIST) && + if ((flags & IPSET_FLAG_EXIST) && STREQ(set->type->name, clash->type->name) && set->type->family == clash->type->family && set->type->revision_min == clash->type->revision_min && @@ -741,13 +786,36 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, set->variant->same_set(set, clash)) ret = 0; goto cleanup; - } + } else if (ret == -IPSET_ERR_MAX_SETS) { + struct ip_set **list, **tmp; + ip_set_id_t i = ip_set_max + IP_SET_INC; + + if (i < ip_set_max || i == IPSET_INVALID_ID) + /* Wraparound */ + goto cleanup; + + list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL); + if (!list) + goto cleanup; + /* nfnl mutex is held, both lists are valid */ + tmp = nfnl_dereference(ip_set_list); + memcpy(list, tmp, sizeof(struct ip_set *) * ip_set_max); + rcu_assign_pointer(ip_set_list, list); + /* Make sure all current packets have passed through */ + synchronize_net(); + /* Use new list */ + index = ip_set_max; + ip_set_max = i; + kfree(tmp); + ret = 0; + } else if (ret) + goto cleanup; /* * Finally! Add our shiny new set to the list, and be done. 
*/ pr_debug("create: '%s' created with index %u!\n", set->name, index); - ip_set_list[index] = set; + nfnl_set(index) = set; return ret; @@ -772,10 +840,10 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { static void ip_set_destroy_set(ip_set_id_t index) { - struct ip_set *set = ip_set_list[index]; + struct ip_set *set = nfnl_set(index); pr_debug("set: %s\n", set->name); - ip_set_list[index] = NULL; + nfnl_set(index) = NULL; /* Must call it without holding any lock */ set->variant->destroy(set); @@ -788,6 +856,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { + struct ip_set *s; ip_set_id_t i; int ret = 0; @@ -807,22 +876,24 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, read_lock_bh(&ip_set_ref_lock); if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < ip_set_max; i++) { - if (ip_set_list[i] != NULL && ip_set_list[i]->ref) { + s = nfnl_set(i); + if (s != NULL && s->ref) { ret = -IPSET_ERR_BUSY; goto out; } } read_unlock_bh(&ip_set_ref_lock); for (i = 0; i < ip_set_max; i++) { - if (ip_set_list[i] != NULL) + s = nfnl_set(i); + if (s != NULL) ip_set_destroy_set(i); } } else { - i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); - if (i == IPSET_INVALID_ID) { + s = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME]), &i); + if (s == NULL) { ret = -ENOENT; goto out; - } else if (ip_set_list[i]->ref) { + } else if (s->ref) { ret = -IPSET_ERR_BUSY; goto out; } @@ -853,21 +924,24 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { + struct ip_set *s; ip_set_id_t i; if (unlikely(protocol_failed(attr))) return -IPSET_ERR_PROTOCOL; if (!attr[IPSET_ATTR_SETNAME]) { - for (i = 0; i < ip_set_max; i++) - if (ip_set_list[i] != NULL) - ip_set_flush_set(ip_set_list[i]); + for (i = 0; i < ip_set_max; i++) { + s = nfnl_set(i); + if (s != NULL) + ip_set_flush_set(s); + } } else { - i = 
find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); - if (i == IPSET_INVALID_ID) + s = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); + if (s == NULL) return -ENOENT; - ip_set_flush_set(ip_set_list[i]); + ip_set_flush_set(s); } return 0; @@ -889,7 +963,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[]) { - struct ip_set *set; + struct ip_set *set, *s; const char *name2; ip_set_id_t i; int ret = 0; @@ -911,8 +985,8 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); for (i = 0; i < ip_set_max; i++) { - if (ip_set_list[i] != NULL && - STREQ(ip_set_list[i]->name, name2)) { + s = nfnl_set(i); + if (s != NULL && STREQ(s->name, name2)) { ret = -IPSET_ERR_EXIST_SETNAME2; goto out; } @@ -947,17 +1021,14 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, attr[IPSET_ATTR_SETNAME2] == NULL)) return -IPSET_ERR_PROTOCOL; - from_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); - if (from_id == IPSET_INVALID_ID) + from = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME]), &from_id); + if (from == NULL) return -ENOENT; - to_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME2])); - if (to_id == IPSET_INVALID_ID) + to = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id); + if (to == NULL) return -IPSET_ERR_EXIST_SETNAME2; - from = ip_set_list[from_id]; - to = ip_set_list[to_id]; - /* Features must not change. * Not an artificial restriction anymore, as we must prevent * possible loops created by swapping in setlist type of sets. 
*/ @@ -971,8 +1042,8 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, write_lock_bh(&ip_set_ref_lock); swap(from->ref, to->ref); - ip_set_list[from_id] = to; - ip_set_list[to_id] = from; + nfnl_set(from_id) = to; + nfnl_set(to_id) = from; write_unlock_bh(&ip_set_ref_lock); return 0; @@ -992,7 +1063,7 @@ static int ip_set_dump_done(struct netlink_callback *cb) { if (cb->args[2]) { - pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name); + pr_debug("release set %s\n", nfnl_set(cb->args[1])->name); ip_set_put_byindex((ip_set_id_t) cb->args[1]); } return 0; @@ -1030,8 +1101,11 @@ dump_init(struct netlink_callback *cb) */ if (cda[IPSET_ATTR_SETNAME]) { - index = find_set_id(nla_data(cda[IPSET_ATTR_SETNAME])); - if (index == IPSET_INVALID_ID) + struct ip_set *set; + + set = find_set_and_id(nla_data(cda[IPSET_ATTR_SETNAME]), + &index); + if (set == NULL) return -ENOENT; dump_type = DUMP_ONE; @@ -1081,7 +1155,7 @@ dump_last: dump_type, dump_flags, cb->args[1]); for (; cb->args[1] < max; cb->args[1]++) { index = (ip_set_id_t) cb->args[1]; - set = ip_set_list[index]; + set = nfnl_set(index); if (set == NULL) { if (dump_type == DUMP_ONE) { ret = -ENOENT; @@ -1100,7 +1174,7 @@ dump_last: if (!cb->args[2]) { /* Start listing: make sure set won't be destroyed */ pr_debug("reference set\n"); - __ip_set_get(index); + __ip_set_get(set); } nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, @@ -1159,7 +1233,7 @@ next_set: release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[2]) { - pr_debug("release set %s\n", ip_set_list[index]->name); + pr_debug("release set %s\n", nfnl_set(index)->name); ip_set_put_byindex(index); cb->args[2] = 0; } @@ -1409,17 +1483,15 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb, const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; - ip_set_id_t index; int ret = 0; if (unlikely(protocol_failed(attr) || attr[IPSET_ATTR_SETNAME] == NULL)) return 
-IPSET_ERR_PROTOCOL; - index = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); - if (index == IPSET_INVALID_ID) + set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); + if (set == NULL) return -ENOENT; - set = ip_set_list[index]; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) @@ -1684,6 +1756,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } case IP_SET_OP_GET_BYNAME: { struct ip_set_req_get_set *req_get = data; + ip_set_id_t id; if (*len != sizeof(struct ip_set_req_get_set)) { ret = -EINVAL; @@ -1691,12 +1764,14 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; nfnl_lock(); - req_get->set.index = find_set_id(req_get->set.name); + find_set_and_id(req_get->set.name, &id); + req_get->set.index = id; nfnl_unlock(); goto copy; } case IP_SET_OP_GET_BYINDEX: { struct ip_set_req_get_set *req_get = data; + struct ip_set *set; if (*len != sizeof(struct ip_set_req_get_set) || req_get->set.index >= ip_set_max) { @@ -1704,9 +1779,8 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) goto done; } nfnl_lock(); - strncpy(req_get->set.name, - ip_set_list[req_get->set.index] - ? ip_set_list[req_get->set.index]->name : "", + set = nfnl_set(req_get->set.index); + strncpy(req_get->set.name, set ? 
set->name : "", IPSET_MAXNAMELEN); nfnl_unlock(); goto copy; @@ -1737,6 +1811,7 @@ static struct nf_sockopt_ops so_set __read_mostly = { static int __init ip_set_init(void) { + struct ip_set **list; int ret; if (max_sets) @@ -1744,22 +1819,22 @@ ip_set_init(void) if (ip_set_max >= IPSET_INVALID_ID) ip_set_max = IPSET_INVALID_ID - 1; - ip_set_list = kzalloc(sizeof(struct ip_set *) * ip_set_max, - GFP_KERNEL); - if (!ip_set_list) + list = kzalloc(sizeof(struct ip_set *) * ip_set_max, GFP_KERNEL); + if (!list) return -ENOMEM; + rcu_assign_pointer(ip_set_list, list); ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); if (ret != 0) { pr_err("ip_set: cannot register with nfnetlink.\n"); - kfree(ip_set_list); + kfree(list); return ret; } ret = nf_register_sockopt(&so_set); if (ret != 0) { pr_err("SO_SET registry failed: %d\n", ret); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - kfree(ip_set_list); + kfree(list); return ret; } @@ -1770,10 +1845,12 @@ ip_set_init(void) static void __exit ip_set_fini(void) { + struct ip_set **list = rcu_dereference_protected(ip_set_list, 1); + /* There can't be any existing set */ nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - kfree(ip_set_list); + kfree(list); pr_debug("these are the famous last words\n"); } -- cgit v0.10.2 From 04dac0111da7e1d284952cd415162451ffaa094d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 27 Nov 2012 21:30:52 +0100 Subject: netfilter: nf_conntrack: improve nf_conn object traceability This patch modifies the conntrack subsystem so that all existing allocated conntrack objects can be found in any of the following places: * the hash table, this is the typical place for alive conntrack objects. * the unconfirmed list, this is the place for newly created conntrack objects that are still traversing the stack. * the dying list, this is where you can find conntrack objects that are dying or that should die anytime soon (eg. 
once the destroy event is delivered to the conntrackd daemon). Thus, we make sure that we can keep track of all existing conntrack objects. This patch, together with some extension of the ctnetlink interface to dump the content of the dying and unconfirmed lists, will help to debug suspected nf_conn object leaks. Signed-off-by: Pablo Neira Ayuso diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index f1494fe..caca0c4 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -182,7 +182,7 @@ __nf_conntrack_find(struct net *net, u16 zone, extern int nf_conntrack_hash_check_insert(struct nf_conn *ct); extern void nf_ct_delete_from_lists(struct nf_conn *ct); -extern void nf_ct_insert_dying_list(struct nf_conn *ct); +extern void nf_ct_dying_timeout(struct nf_conn *ct); extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0f241be..af17516 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -221,11 +221,9 @@ destroy_conntrack(struct nf_conntrack *nfct) * too. */ nf_ct_remove_expectations(ct); - /* We overload first tuple to link into unconfirmed list. */ - if (!nf_ct_is_confirmed(ct)) { - BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); - hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); - } + /* We overload first tuple to link into unconfirmed or dying list.*/ + BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); NF_CT_STAT_INC(net, delete); spin_unlock_bh(&nf_conntrack_lock); @@ -247,6 +245,9 @@ void nf_ct_delete_from_lists(struct nf_conn *ct) * Otherwise we can get spurious warnings. 
*/ NF_CT_STAT_INC(net, delete_list); clean_from_lists(ct); + /* add this conntrack to the dying list */ + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &net->ct.dying); spin_unlock_bh(&nf_conntrack_lock); } EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists); @@ -268,31 +269,23 @@ static void death_by_event(unsigned long ul_conntrack) } /* we've got the event delivered, now it's dying */ set_bit(IPS_DYING_BIT, &ct->status); - spin_lock(&nf_conntrack_lock); - hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); - spin_unlock(&nf_conntrack_lock); nf_ct_put(ct); } -void nf_ct_insert_dying_list(struct nf_conn *ct) +void nf_ct_dying_timeout(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); BUG_ON(ecache == NULL); - /* add this conntrack to the dying list */ - spin_lock_bh(&nf_conntrack_lock); - hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &net->ct.dying); - spin_unlock_bh(&nf_conntrack_lock); /* set a new timer to retry event delivery */ setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); ecache->timeout.expires = jiffies + (random32() % net->ct.sysctl_events_retry_timeout); add_timer(&ecache->timeout); } -EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); +EXPORT_SYMBOL_GPL(nf_ct_dying_timeout); static void death_by_timeout(unsigned long ul_conntrack) { @@ -307,7 +300,7 @@ static void death_by_timeout(unsigned long ul_conntrack) unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { /* destroy event was not delivered */ nf_ct_delete_from_lists(ct); - nf_ct_insert_dying_list(ct); + nf_ct_dying_timeout(ct); return; } set_bit(IPS_DYING_BIT, &ct->status); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 7bbfb3d..34370a9 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -989,7 +989,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, nlmsg_report(nlh)) < 0) { 
nf_ct_delete_from_lists(ct); /* we failed to report the event, try later */ - nf_ct_insert_dying_list(ct); + nf_ct_dying_timeout(ct); nf_ct_put(ct); return 0; } -- cgit v0.10.2 From d871befe357ccc262edbb0a4f9aeea650012edf5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 27 Nov 2012 14:49:42 +0100 Subject: netfilter: ctnetlink: dump entries from the dying and unconfirmed lists This patch adds a new operation to dump the content of the dying and unconfirmed lists. Under some situations, the global conntrack counter can be inconsistent with the number of entries that we can dump from the conntrack table. The way to resolve this is to allow dumping the content of the unconfirmed and dying lists, so far it was not possible to look at its content. This provides some extra instrumentation to resolve problematic situations in which anyone suspects memory leaks. Signed-off-by: Pablo Neira Ayuso diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 43bfe3e..86e930c 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -9,6 +9,8 @@ enum cntl_msg_types { IPCTNL_MSG_CT_GET_CTRZERO, IPCTNL_MSG_CT_GET_STATS_CPU, IPCTNL_MSG_CT_GET_STATS, + IPCTNL_MSG_CT_GET_DYING, + IPCTNL_MSG_CT_GET_UNCONFIRMED, IPCTNL_MSG_MAX }; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 34370a9..c24a00a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1089,6 +1089,112 @@ out: return err == -EAGAIN ? 
-ENOBUFS : err; } +static int ctnetlink_done_list(struct netlink_callback *cb) +{ + if (cb->args[1]) + nf_ct_put((struct nf_conn *)cb->args[1]); + return 0; +} + +static int +ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, + struct hlist_nulls_head *list) +{ + struct nf_conn *ct, *last; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + u_int8_t l3proto = nfmsg->nfgen_family; + int res; + + if (cb->args[2]) + return 0; + + spin_lock_bh(&nf_conntrack_lock); + last = (struct nf_conn *)cb->args[1]; +restart: + hlist_nulls_for_each_entry(h, n, list, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (l3proto && nf_ct_l3num(ct) != l3proto) + continue; + if (cb->args[1]) { + if (ct != last) + continue; + cb->args[1] = 0; + } + rcu_read_lock(); + res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct); + rcu_read_unlock(); + if (res < 0) { + nf_conntrack_get(&ct->ct_general); + cb->args[1] = (unsigned long)ct; + goto out; + } + } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } else + cb->args[2] = 1; +out: + spin_unlock_bh(&nf_conntrack_lock); + if (last) + nf_ct_put(last); + + return skb->len; +} + +static int +ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + + return ctnetlink_dump_list(skb, cb, &net->ct.dying); +} + +static int +ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_dump_dying, + .done = ctnetlink_done_list, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + return -EOPNOTSUPP; +} + +static int +ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + + return 
ctnetlink_dump_list(skb, cb, &net->ct.unconfirmed); +} + +static int +ctnetlink_get_ct_unconfirmed(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_dump_unconfirmed, + .done = ctnetlink_done_list, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + return -EOPNOTSUPP; +} + #ifdef CONFIG_NF_NAT_NEEDED static int ctnetlink_parse_nat_setup(struct nf_conn *ct, @@ -2712,6 +2818,8 @@ static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { .policy = ct_nla_policy }, [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu }, [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct }, + [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying }, + [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed }, }; static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { -- cgit v0.10.2 From 0360ae412d09bc6f4864c801effcb20bfd84520e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 23 Nov 2012 06:22:21 +0000 Subject: netfilter: kill support for per-af queue backends We used to have several queueing backends, but nowadays only nfnetlink_queue remains. In light of this there doesn't seem to be a good reason to support per-af registering -- just hook up nfnetlink_queue on module load and remove it on unload. This means that the userspace BIND/UNBIND_PF commands are now obsolete; the kernel will ignore them. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 252fd10..fb1c0be 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -21,14 +21,10 @@ struct nf_queue_entry { struct nf_queue_handler { int (*outfn)(struct nf_queue_entry *entry, unsigned int queuenum); - char *name; }; -extern int nf_register_queue_handler(u_int8_t pf, - const struct nf_queue_handler *qh); -extern int nf_unregister_queue_handler(u_int8_t pf, - const struct nf_queue_handler *qh); -extern void nf_unregister_queue_handlers(const struct nf_queue_handler *qh); +void nf_register_queue_handler(const struct nf_queue_handler *qh); +void nf_unregister_queue_handler(void); extern void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); #endif /* _NF_QUEUE_H */ diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 68912dad..a9c488b6 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -295,8 +295,6 @@ void __init netfilter_init(void) panic("cannot create netfilter proc entry"); #endif - if (netfilter_queue_init() < 0) - panic("cannot initialize nf_queue"); if (netfilter_log_init() < 0) panic("cannot initialize nf_log"); } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 8d2cf9e..d812c12 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -14,84 +14,32 @@ #include "nf_internals.h" /* - * A queue handler may be registered for each protocol. Each is protected by - * long term mutex. The handler must provide an an outfn() to accept packets - * for queueing and must reinject all packets it receives, no matter what. + * Hook for nfnetlink_queue to register its queue handler. + * We do this so that most of the NFQUEUE code can be modular. + * + * Once the queue is registered it must reinject all packets it + * receives, no matter what. 
*/ -static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly; - -static DEFINE_MUTEX(queue_handler_mutex); +static const struct nf_queue_handler __rcu *queue_handler __read_mostly; /* return EBUSY when somebody else is registered, return EEXIST if the * same handler is registered, return 0 in case of success. */ -int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) +void nf_register_queue_handler(const struct nf_queue_handler *qh) { - int ret; - const struct nf_queue_handler *old; - - if (pf >= ARRAY_SIZE(queue_handler)) - return -EINVAL; - - mutex_lock(&queue_handler_mutex); - old = rcu_dereference_protected(queue_handler[pf], - lockdep_is_held(&queue_handler_mutex)); - if (old == qh) - ret = -EEXIST; - else if (old) - ret = -EBUSY; - else { - rcu_assign_pointer(queue_handler[pf], qh); - ret = 0; - } - mutex_unlock(&queue_handler_mutex); - - return ret; + /* should never happen, we only have one queueing backend in kernel */ + WARN_ON(rcu_access_pointer(queue_handler)); + rcu_assign_pointer(queue_handler, qh); } EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ -int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) +void nf_unregister_queue_handler(void) { - const struct nf_queue_handler *old; - - if (pf >= ARRAY_SIZE(queue_handler)) - return -EINVAL; - - mutex_lock(&queue_handler_mutex); - old = rcu_dereference_protected(queue_handler[pf], - lockdep_is_held(&queue_handler_mutex)); - if (old && old != qh) { - mutex_unlock(&queue_handler_mutex); - return -EINVAL; - } - - RCU_INIT_POINTER(queue_handler[pf], NULL); - mutex_unlock(&queue_handler_mutex); - + RCU_INIT_POINTER(queue_handler, NULL); synchronize_rcu(); - - return 0; } EXPORT_SYMBOL(nf_unregister_queue_handler); -void nf_unregister_queue_handlers(const struct nf_queue_handler *qh) -{ - u_int8_t pf; - - mutex_lock(&queue_handler_mutex); - for (pf = 0; pf < ARRAY_SIZE(queue_handler); 
pf++) { - if (rcu_dereference_protected( - queue_handler[pf], - lockdep_is_held(&queue_handler_mutex) - ) == qh) - RCU_INIT_POINTER(queue_handler[pf], NULL); - } - mutex_unlock(&queue_handler_mutex); - - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); - static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { /* Release those devices we held, or Alexey will kill me. */ @@ -137,7 +85,7 @@ static int __nf_queue(struct sk_buff *skb, /* QUEUE == DROP if no one is waiting, to be safe. */ rcu_read_lock(); - qh = rcu_dereference(queue_handler[pf]); + qh = rcu_dereference(queue_handler); if (!qh) { status = -ESRCH; goto err_unlock; @@ -344,77 +292,3 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) kfree(entry); } EXPORT_SYMBOL(nf_reinject); - -#ifdef CONFIG_PROC_FS -static void *seq_start(struct seq_file *seq, loff_t *pos) -{ - if (*pos >= ARRAY_SIZE(queue_handler)) - return NULL; - - return pos; -} - -static void *seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - (*pos)++; - - if (*pos >= ARRAY_SIZE(queue_handler)) - return NULL; - - return pos; -} - -static void seq_stop(struct seq_file *s, void *v) -{ - -} - -static int seq_show(struct seq_file *s, void *v) -{ - int ret; - loff_t *pos = v; - const struct nf_queue_handler *qh; - - rcu_read_lock(); - qh = rcu_dereference(queue_handler[*pos]); - if (!qh) - ret = seq_printf(s, "%2lld NONE\n", *pos); - else - ret = seq_printf(s, "%2lld %s\n", *pos, qh->name); - rcu_read_unlock(); - - return ret; -} - -static const struct seq_operations nfqueue_seq_ops = { - .start = seq_start, - .next = seq_next, - .stop = seq_stop, - .show = seq_show, -}; - -static int nfqueue_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &nfqueue_seq_ops); -} - -static const struct file_operations nfqueue_file_ops = { - .owner = THIS_MODULE, - .open = nfqueue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif /* PROC_FS */ - - 
-int __init netfilter_queue_init(void) -{ -#ifdef CONFIG_PROC_FS - if (!proc_create("nf_queue", S_IRUGO, - proc_net_netfilter, &nfqueue_file_ops)) - return -1; -#endif - return 0; -} - diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index e12d44e..3158d87 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -809,7 +809,6 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { }; static const struct nf_queue_handler nfqh = { - .name = "nf_queue", .outfn = &nfqnl_enqueue_packet, }; @@ -827,14 +826,10 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, if (nfqa[NFQA_CFG_CMD]) { cmd = nla_data(nfqa[NFQA_CFG_CMD]); - /* Commands without queue context - might sleep */ + /* Obsolete commands without queue context */ switch (cmd->command) { - case NFQNL_CFG_CMD_PF_BIND: - return nf_register_queue_handler(ntohs(cmd->pf), - &nfqh); - case NFQNL_CFG_CMD_PF_UNBIND: - return nf_unregister_queue_handler(ntohs(cmd->pf), - &nfqh); + case NFQNL_CFG_CMD_PF_BIND: return 0; + case NFQNL_CFG_CMD_PF_UNBIND: return 0; } } @@ -1074,6 +1069,7 @@ static int __init nfnetlink_queue_init(void) #endif register_netdevice_notifier(&nfqnl_dev_notifier); + nf_register_queue_handler(&nfqh); return status; #ifdef CONFIG_PROC_FS @@ -1087,7 +1083,7 @@ cleanup_netlink_notifier: static void __exit nfnetlink_queue_fini(void) { - nf_unregister_queue_handlers(&nfqh); + nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", proc_net_netfilter); -- cgit v0.10.2 From 6d1fafcaecaa2e66eb9861a39d22fc7380ce6f78 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 Nov 2012 01:32:46 +0000 Subject: netfilter: ctnetlink: nla_policy updates Add stricter checking for a few attributes. Note that these changes don't fix any bug in the current code base. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c24a00a..4e078cd 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -898,7 +898,8 @@ ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone) } static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { - [CTA_HELP_NAME] = { .type = NLA_NUL_STRING }, + [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN - 1 }, }; static inline int @@ -932,6 +933,8 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_ID] = { .type = NLA_U32 }, [CTA_NAT_DST] = { .type = NLA_NESTED }, [CTA_TUPLE_MASTER] = { .type = NLA_NESTED }, + [CTA_NAT_SEQ_ADJ_ORIG] = { .type = NLA_NESTED }, + [CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED }, [CTA_ZONE] = { .type = NLA_U16 }, [CTA_MARK_MASK] = { .type = NLA_U32 }, }; @@ -2322,7 +2325,8 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, [CTA_EXPECT_ID] = { .type = NLA_U32 }, - [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING }, + [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN - 1 }, [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 61f9285..83876e9 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1353,6 +1353,8 @@ static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = { [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 }, 
}; #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ -- cgit v0.10.2 From a0ecb85a2c3af73c63b6d44ce82aea52347ccf55 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 30 Nov 2012 12:37:26 +0000 Subject: netfilter: nf_nat: Handle routing changes in MASQUERADE target When a route change (backup default route, VPNs) affects a masqueraded target, the packets were sent out with the outdated source address. The patch addresses the issue by comparing the outgoing interface directly with the masqueraded interface in the nat table. Events are inefficient in this case, because they would require adding route events to the network core and then scanning the whole conntrack table and re-checking the route for every entry. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index bd8eea7..ad14a79 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -68,4 +68,19 @@ static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct) #endif } +static inline bool nf_nat_oif_changed(unsigned int hooknum, + enum ip_conntrack_info ctinfo, + struct nf_conn_nat *nat, + const struct net_device *out) +{ +#if IS_ENABLED(CONFIG_IP_NF_TARGET_MASQUERADE) || \ + IS_ENABLED(CONFIG_IP6_NF_TARGET_MASQUERADE) + return nat->masq_index && hooknum == NF_INET_POST_ROUTING && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL && + nat->masq_index != out->ifindex; +#else + return false; +#endif +} + #endif diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index ac635a7..da2c8a3 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -134,6 +134,10 @@ nf_nat_ipv4_fn(unsigned int hooknum, /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); + if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) { + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; + } } return nf_nat_packet(ct, ctinfo, 
hooknum, skb); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index fa84cf8..6c8ae24 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -137,6 +137,10 @@ nf_nat_ipv6_fn(unsigned int hooknum, /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); + if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) { + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; + } } return nf_nat_packet(ct, ctinfo, hooknum, skb); -- cgit v0.10.2