diff options
Diffstat (limited to 'net')
95 files changed, 967 insertions, 655 deletions
diff --git a/net/802/garp.c b/net/802/garp.c index 941f2a3..c1df2da 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -346,8 +346,8 @@ int garp_request_join(const struct net_device *dev, const struct garp_application *appl, const void *data, u8 len, u8 type) { - struct garp_port *port = dev->garp_port; - struct garp_applicant *app = port->applicants[appl->type]; + struct garp_port *port = rtnl_dereference(dev->garp_port); + struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]); struct garp_attr *attr; spin_lock_bh(&app->lock); @@ -366,8 +366,8 @@ void garp_request_leave(const struct net_device *dev, const struct garp_application *appl, const void *data, u8 len, u8 type) { - struct garp_port *port = dev->garp_port; - struct garp_applicant *app = port->applicants[appl->type]; + struct garp_port *port = rtnl_dereference(dev->garp_port); + struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]); struct garp_attr *attr; spin_lock_bh(&app->lock); @@ -546,11 +546,11 @@ static int garp_init_port(struct net_device *dev) static void garp_release_port(struct net_device *dev) { - struct garp_port *port = dev->garp_port; + struct garp_port *port = rtnl_dereference(dev->garp_port); unsigned int i; for (i = 0; i <= GARP_APPLICATION_MAX; i++) { - if (port->applicants[i]) + if (rtnl_dereference(port->applicants[i])) return; } rcu_assign_pointer(dev->garp_port, NULL); @@ -565,7 +565,7 @@ int garp_init_applicant(struct net_device *dev, struct garp_application *appl) ASSERT_RTNL(); - if (!dev->garp_port) { + if (!rtnl_dereference(dev->garp_port)) { err = garp_init_port(dev); if (err < 0) goto err1; @@ -601,8 +601,8 @@ EXPORT_SYMBOL_GPL(garp_init_applicant); void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl) { - struct garp_port *port = dev->garp_port; - struct garp_applicant *app = port->applicants[appl->type]; + struct garp_port *port = rtnl_dereference(dev->garp_port); + struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]); ASSERT_RTNL(); diff --git a/net/802/stp.c b/net/802/stp.c index 53c8f77..978c30b 100644 --- a/net/802/stp.c +++ b/net/802/stp.c @@ -21,8 +21,8 @@ #define GARP_ADDR_MAX 0x2F #define GARP_ADDR_RANGE (GARP_ADDR_MAX - GARP_ADDR_MIN) -static const struct stp_proto *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly; -static const struct stp_proto *stp_proto __read_mostly; +static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly; +static const struct stp_proto __rcu *stp_proto __read_mostly; static struct llc_sap *sap __read_mostly; static unsigned int sap_registered; diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 05b867e..52077ca 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -112,7 +112,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) ASSERT_RTNL(); - grp = real_dev->vlgrp; + grp = rtnl_dereference(real_dev->vlgrp); BUG_ON(!grp); /* Take it out of our own structures, but be sure to interlock with @@ -177,7 +177,7 @@ int register_vlan_dev(struct net_device *dev) struct vlan_group *grp, *ngrp = NULL; int err; - grp = real_dev->vlgrp; + grp = rtnl_dereference(real_dev->vlgrp); if (!grp) { ngrp = grp = vlan_group_alloc(real_dev); if (!grp) @@ -385,7 +385,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, dev->netdev_ops->ndo_vlan_rx_add_vid(dev, 0); } - grp = dev->vlgrp; + grp = rtnl_dereference(dev->vlgrp); if (!grp) goto out; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 26eaebf..bb86d29 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1392,6 +1392,7 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, ax25_cb *ax25; int err = 0; + memset(fsa, 0, sizeof(fsa)); lock_sock(sk); ax25 = ax25_sk(sk); @@ -1403,7 +1404,6 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, fsa->fsa_ax25.sax25_family = AF_AX25; fsa->fsa_ax25.sax25_call = ax25->dest_addr; - fsa->fsa_ax25.sax25_ndigis = 0; if (ax25->digipeat != NULL) { ndigi = ax25->digipeat->ndigi; diff --git a/net/caif/caif_config_util.c b/net/caif/caif_config_util.c index 76ae683..d522d8c 100644 --- a/net/caif/caif_config_util.c +++ b/net/caif/caif_config_util.c @@ -16,11 +16,18 @@ int connect_req_to_link_param(struct cfcnfg *cnfg, { struct dev_info *dev_info; enum cfcnfg_phy_preference pref; + int res; + memset(l, 0, sizeof(*l)); - l->priority = s->priority; + /* In caif protocol low value is high priority */ + l->priority = CAIF_PRIO_MAX - s->priority + 1; - if (s->link_name[0] != '\0') - l->phyid = cfcnfg_get_named(cnfg, s->link_name); + if (s->ifindex != 0){ + res = cfcnfg_get_id_from_ifi(cnfg, s->ifindex); + if (res < 0) + return res; + l->phyid = res; + } else { switch (s->link_selector) { case CAIF_LINK_HIGH_BANDW: diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index b99369a..a42a408 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -307,6 +307,8 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, case NETDEV_UNREGISTER: caifd = caif_get(dev); + if (caifd == NULL) + break; netdev_info(dev, "unregister\n"); atomic_set(&caifd->state, what); caif_device_destroy(dev); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 2eca2dd..1bf0cf5 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -716,8 +716,7 @@ static int setsockopt(struct socket *sock, { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); - int prio, linksel; - struct ifreq ifreq; + int linksel; if (cf_sk->sk.sk_socket->state != SS_UNCONNECTED) return -ENOPROTOOPT; @@ -735,33 +734,6 @@ static int setsockopt(struct socket *sock, release_sock(&cf_sk->sk); return 0; - case SO_PRIORITY: - if (lvl != SOL_SOCKET) - goto bad_sol; - if (ol < sizeof(int)) - return -EINVAL; - if (copy_from_user(&prio, ov, sizeof(int))) - return -EINVAL; - lock_sock(&(cf_sk->sk)); - cf_sk->conn_req.priority = prio; - release_sock(&cf_sk->sk); - return 0; - - case SO_BINDTODEVICE: - if (lvl != SOL_SOCKET) - goto bad_sol; - if (ol < sizeof(struct ifreq)) - return -EINVAL; - if (copy_from_user(&ifreq, ov, sizeof(ifreq))) - return -EFAULT; - lock_sock(&(cf_sk->sk)); - strncpy(cf_sk->conn_req.link_name, ifreq.ifr_name, - sizeof(cf_sk->conn_req.link_name)); - cf_sk->conn_req.link_name - [sizeof(cf_sk->conn_req.link_name)-1] = 0; - release_sock(&cf_sk->sk); - return 0; - case CAIFSO_REQ_PARAM: if (lvl != SOL_CAIF) goto bad_sol; @@ -880,6 +852,18 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTING; sk->sk_state = CAIF_CONNECTING; + /* Check priority value comming from socket */ + /* if priority value is out of range it will be ajusted */ + if (cf_sk->sk.sk_priority > CAIF_PRIO_MAX) + cf_sk->conn_req.priority = CAIF_PRIO_MAX; + else if (cf_sk->sk.sk_priority < CAIF_PRIO_MIN) + cf_sk->conn_req.priority = CAIF_PRIO_MIN; + else + cf_sk->conn_req.priority = cf_sk->sk.sk_priority; + + /*ifindex = id of the interface.*/ + cf_sk->conn_req.ifindex = cf_sk->sk.sk_bound_dev_if; + dbfs_atomic_inc(&cnt.num_connect_req); cf_sk->layer.receive = caif_sktrecv_cb; err = caif_connect_client(&cf_sk->conn_req, @@ -905,6 +889,7 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr, cf_sk->maxframe = mtu - (headroom + tailroom); if (cf_sk->maxframe < 1) { pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu); + err = -ENODEV; goto out; } @@ -1142,7 +1127,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol, set_rx_flow_on(cf_sk); /* Set default options on configuration */ - cf_sk->conn_req.priority = CAIF_PRIO_NORMAL; + cf_sk->sk.sk_priority= CAIF_PRIO_NORMAL; cf_sk->conn_req.link_selector = CAIF_LINK_LOW_LATENCY; cf_sk->conn_req.protocol = protocol; /* Increase the number of sockets created. */ diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 41adafd1..21ede14 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -173,18 +173,15 @@ static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo(struct cfcnfg *cnfg, return NULL; } -int cfcnfg_get_named(struct cfcnfg *cnfg, char *name) + +int cfcnfg_get_id_from_ifi(struct cfcnfg *cnfg, int ifi) { int i; - - /* Try to match with specified name */ - for (i = 0; i < MAX_PHY_LAYERS; i++) { - if (cnfg->phy_layers[i].frm_layer != NULL - && strcmp(cnfg->phy_layers[i].phy_layer->name, - name) == 0) - return cnfg->phy_layers[i].frm_layer->id; - } - return 0; + for (i = 0; i < MAX_PHY_LAYERS; i++) + if (cnfg->phy_layers[i].frm_layer != NULL && + cnfg->phy_layers[i].ifindex == ifi) + return i; + return -ENODEV; } int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer) diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 08f267a..3cd8f97 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -361,11 +361,10 @@ void cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer) struct cfctrl_request_info *p, *tmp; struct cfctrl *ctrl = container_obj(layr); spin_lock(&ctrl->info_list_lock); - pr_warn("enter\n"); list_for_each_entry_safe(p, tmp, &ctrl->list, list) { if (p->client_layer == adap_layer) { - pr_warn("cancel req :%d\n", p->sequence_no); + pr_debug("cancel req :%d\n", p->sequence_no); list_del(&p->list); kfree(p); } diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c index 496fda9..11a2af4 100644 --- a/net/caif/cfdbgl.c +++ b/net/caif/cfdbgl.c @@ -12,6 +12,8 @@ #include <net/caif/cfsrvl.h> #include <net/caif/cfpkt.h> +#define container_obj(layr) ((struct cfsrvl *) layr) + static int cfdbgl_receive(struct cflayer *layr, struct cfpkt *pkt); static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt); @@ -38,5 +40,17 @@ static int cfdbgl_receive(struct cflayer *layr, struct cfpkt *pkt) static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt) { + struct cfsrvl *service = container_obj(layr); + struct caif_payload_info *info; + int ret; + + if (!cfsrvl_ready(service, &ret)) + return ret; + + /* Add info for MUX-layer to route the packet out */ + info = cfpkt_info(pkt); + info->channel_id = service->layer.id; + info->dev_info = &service->dev_info; + return layr->dn->transmit(layr->dn, pkt); } diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index bde8481..e2fb5fa 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -193,7 +193,7 @@ out: static int cfrfml_transmit_segment(struct cfrfml *rfml, struct cfpkt *pkt) { - caif_assert(cfpkt_getlen(pkt) >= rfml->fragment_size); + caif_assert(cfpkt_getlen(pkt) < rfml->fragment_size); /* Add info for MUX-layer to route the packet out. */ cfpkt_info(pkt)->channel_id = rfml->serv.layer.id; diff --git a/net/compat.c b/net/compat.c index 63d260e..3649d58 100644 --- a/net/compat.c +++ b/net/compat.c @@ -41,10 +41,12 @@ static inline int iov_from_user_compat_to_kern(struct iovec *kiov, compat_size_t len; if (get_user(len, &uiov32->iov_len) || - get_user(buf, &uiov32->iov_base)) { - tot_len = -EFAULT; - break; - } + get_user(buf, &uiov32->iov_base)) + return -EFAULT; + + if (len > INT_MAX - tot_len) + len = INT_MAX - tot_len; + tot_len += len; kiov->iov_base = compat_ptr(buf); kiov->iov_len = (__kernel_size_t) len; diff --git a/net/core/dev.c b/net/core/dev.c index 78b5a89..0dd54a6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1685,10 +1685,10 @@ EXPORT_SYMBOL(netif_device_attach); static bool can_checksum_protocol(unsigned long features, __be16 protocol) { - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_IP_CSUM) && + return ((features & NETIF_F_NO_CSUM) || + ((features & NETIF_F_V4_CSUM) && protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_IPV6_CSUM) && + ((features & NETIF_F_V6_CSUM) && protocol == htons(ETH_P_IPV6)) || ((features & NETIF_F_FCOE_CRC) && protocol == htons(ETH_P_FCOE))); @@ -1696,22 +1696,18 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol) static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) { + __be16 protocol = skb->protocol; int features = dev->features; - if (vlan_tx_tag_present(skb)) + if (vlan_tx_tag_present(skb)) { features &= dev->vlan_features; - - if (can_checksum_protocol(features, skb->protocol)) - return true; - - if (skb->protocol == htons(ETH_P_8021Q)) { + } else if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - if (can_checksum_protocol(dev->features & dev->vlan_features, - veh->h_vlan_encapsulated_proto)) - return true; + protocol = veh->h_vlan_encapsulated_proto; + features &= dev->vlan_features; } - return false; + return can_checksum_protocol(features, protocol); } /** @@ -2135,7 +2131,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, } else { struct sock *sk = skb->sk; queue_index = sk_tx_queue_get(sk); - if (queue_index < 0) { + if (queue_index < 0 || queue_index >= dev->real_num_tx_queues) { queue_index = 0; if (dev->real_num_tx_queues > 1) @@ -2213,7 +2209,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, } static DEFINE_PER_CPU(int, xmit_recursion); -#define RECURSION_LIMIT 3 +#define RECURSION_LIMIT 10 /** * dev_queue_xmit - transmit a buffer @@ -2413,7 +2409,7 @@ EXPORT_SYMBOL(__skb_get_rxhash); #ifdef CONFIG_RPS /* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; +struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); /* @@ -2425,7 +2421,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { struct netdev_rx_queue *rxqueue; - struct rps_map *map = NULL; + struct rps_map *map; struct rps_dev_flow_table *flow_table; struct rps_sock_flow_table *sock_flow_table; int cpu = -1; @@ -2444,15 +2440,15 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } else rxqueue = dev->_rx; - if (rxqueue->rps_map) { - map = rcu_dereference(rxqueue->rps_map); - if (map && map->len == 1) { + map = rcu_dereference(rxqueue->rps_map); + if (map) { + if (map->len == 1) { tcpu = map->cpus[0]; if (cpu_online(tcpu)) cpu = tcpu; goto done; } - } else if (!rxqueue->rps_flow_table) { + } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) { goto done; } @@ -5416,7 +5412,7 @@ void netdev_run_todo(void) /* paranoia */ BUG_ON(netdev_refcnt_read(dev)); WARN_ON(rcu_dereference_raw(dev->ip_ptr)); - WARN_ON(dev->ip6_ptr); + WARN_ON(rcu_dereference_raw(dev->ip6_ptr)); WARN_ON(dev->dn_ptr); if (dev->destructor) diff --git a/net/core/dst.c b/net/core/dst.c index 8abe628..b99c7c7 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -370,6 +370,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, static struct notifier_block dst_dev_notifier = { .notifier_call = dst_dev_event, + .priority = -10, /* must be called after other network notifiers */ }; void __init dst_init(void) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 1bc3f25..82a4369 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -351,12 +351,12 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { - rule->ctarget = r; + RCU_INIT_POINTER(rule->ctarget, r); break; } } - if (rule->ctarget == NULL) + if (rcu_dereference_protected(rule->ctarget, 1) == NULL) unresolved = 1; } else if (rule->action == FR_ACT_GOTO) goto errout_free; @@ -373,6 +373,11 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) fib_rule_get(rule); + if (last) + list_add_rcu(&rule->list, &last->list); + else + list_add_rcu(&rule->list, &ops->rules_list); + if (ops->unresolved_rules) { /* * There are unresolved goto rules in the list, check if @@ -381,7 +386,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref) { - BUG_ON(r->ctarget != NULL); + BUG_ON(rtnl_dereference(r->ctarget) != NULL); rcu_assign_pointer(r->ctarget, rule); if (--ops->unresolved_rules == 0) break; @@ -395,11 +400,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (unresolved) ops->unresolved_rules++; - if (last) - list_add_rcu(&rule->list, &last->list); - else - list_add_rcu(&rule->list, &ops->rules_list); - notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid); flush_route_cache(ops); rules_ops_put(ops); @@ -487,7 +487,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) */ if (ops->nr_goto_rules > 0) { list_for_each_entry(tmp, &ops->rules_list, list) { - if (tmp->ctarget == rule) { + if (rtnl_dereference(tmp->ctarget) == rule) { rcu_assign_pointer(tmp->ctarget, NULL); ops->unresolved_rules++; } @@ -545,7 +545,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->action = rule->action; frh->flags = rule->flags; - if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL) + if (rule->action == FR_ACT_GOTO && + rcu_dereference_raw(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; if (rule->iifname[0]) { diff --git a/net/core/filter.c b/net/core/filter.c index 7adf503..23e9b2a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -89,8 +89,8 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) rcu_read_lock_bh(); filter = rcu_dereference_bh(sk->sk_filter); if (filter) { - unsigned int pkt_len = sk_run_filter(skb, filter->insns, - filter->len); + unsigned int pkt_len = sk_run_filter(skb, filter->insns, filter->len); + err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; } rcu_read_unlock_bh(); @@ -112,39 +112,41 @@ EXPORT_SYMBOL(sk_filter); */ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) { - struct sock_filter *fentry; /* We walk down these */ void *ptr; u32 A = 0; /* Accumulator */ u32 X = 0; /* Index Register */ u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + unsigned long memvalid = 0; u32 tmp; int k; int pc; + BUILD_BUG_ON(BPF_MEMWORDS > BITS_PER_LONG); /* * Process array of filter instructions. */ for (pc = 0; pc < flen; pc++) { - fentry = &filter[pc]; + const struct sock_filter *fentry = &filter[pc]; + u32 f_k = fentry->k; switch (fentry->code) { case BPF_S_ALU_ADD_X: A += X; continue; case BPF_S_ALU_ADD_K: - A += fentry->k; + A += f_k; continue; case BPF_S_ALU_SUB_X: A -= X; continue; case BPF_S_ALU_SUB_K: - A -= fentry->k; + A -= f_k; continue; case BPF_S_ALU_MUL_X: A *= X; continue; case BPF_S_ALU_MUL_K: - A *= fentry->k; + A *= f_k; continue; case BPF_S_ALU_DIV_X: if (X == 0) @@ -152,49 +154,49 @@ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int A /= X; continue; case BPF_S_ALU_DIV_K: - A /= fentry->k; + A /= f_k; continue; case BPF_S_ALU_AND_X: A &= X; continue; case BPF_S_ALU_AND_K: - A &= fentry->k; + A &= f_k; continue; case BPF_S_ALU_OR_X: A |= X; continue; case BPF_S_ALU_OR_K: - A |= fentry->k; + A |= f_k; continue; case BPF_S_ALU_LSH_X: A <<= X; continue; case BPF_S_ALU_LSH_K: - A <<= fentry->k; + A <<= f_k; continue; case BPF_S_ALU_RSH_X: A >>= X; continue; case BPF_S_ALU_RSH_K: - A >>= fentry->k; + A >>= f_k; continue; case BPF_S_ALU_NEG: A = -A; continue; case BPF_S_JMP_JA: - pc += fentry->k; + pc += f_k; continue; case BPF_S_JMP_JGT_K: - pc += (A > fentry->k) ? fentry->jt : fentry->jf; + pc += (A > f_k) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JGE_K: - pc += (A >= fentry->k) ? fentry->jt : fentry->jf; + pc += (A >= f_k) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JEQ_K: - pc += (A == fentry->k) ? fentry->jt : fentry->jf; + pc += (A == f_k) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JSET_K: - pc += (A & fentry->k) ? fentry->jt : fentry->jf; + pc += (A & f_k) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JGT_X: pc += (A > X) ? fentry->jt : fentry->jf; @@ -209,7 +211,7 @@ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int pc += (A & X) ? fentry->jt : fentry->jf; continue; case BPF_S_LD_W_ABS: - k = fentry->k; + k = f_k; load_w: ptr = load_pointer(skb, k, 4, &tmp); if (ptr != NULL) { @@ -218,7 +220,7 @@ load_w: } break; case BPF_S_LD_H_ABS: - k = fentry->k; + k = f_k; load_h: ptr = load_pointer(skb, k, 2, &tmp); if (ptr != NULL) { @@ -227,7 +229,7 @@ load_h: } break; case BPF_S_LD_B_ABS: - k = fentry->k; + k = f_k; load_b: ptr = load_pointer(skb, k, 1, &tmp); if (ptr != NULL) { @@ -242,32 +244,34 @@ load_b: X = skb->len; continue; case BPF_S_LD_W_IND: - k = X + fentry->k; + k = X + f_k; goto load_w; case BPF_S_LD_H_IND: - k = X + fentry->k; + k = X + f_k; goto load_h; case BPF_S_LD_B_IND: - k = X + fentry->k; + k = X + f_k; goto load_b; case BPF_S_LDX_B_MSH: - ptr = load_pointer(skb, fentry->k, 1, &tmp); + ptr = load_pointer(skb, f_k, 1, &tmp); if (ptr != NULL) { X = (*(u8 *)ptr & 0xf) << 2; continue; } return 0; case BPF_S_LD_IMM: - A = fentry->k; + A = f_k; continue; case BPF_S_LDX_IMM: - X = fentry->k; + X = f_k; continue; case BPF_S_LD_MEM: - A = mem[fentry->k]; + A = (memvalid & (1UL << f_k)) ? + mem[f_k] : 0; continue; case BPF_S_LDX_MEM: - X = mem[fentry->k]; + X = (memvalid & (1UL << f_k)) ? + mem[f_k] : 0; continue; case BPF_S_MISC_TAX: X = A; @@ -276,14 +280,16 @@ load_b: A = X; continue; case BPF_S_RET_K: - return fentry->k; + return f_k; case BPF_S_RET_A: return A; case BPF_S_ST: - mem[fentry->k] = A; + memvalid |= 1UL << f_k; + mem[f_k] = A; continue; case BPF_S_STX: - mem[fentry->k] = X; + memvalid |= 1UL << f_k; + mem[f_k] = X; continue; default: WARN_ON(1); diff --git a/net/core/iovec.c b/net/core/iovec.c index 72aceb1..c40f27e 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -35,10 +35,9 @@ * in any case. */ -long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) +int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) { - int size, ct; - long err; + int size, ct, err; if (m->msg_namelen) { if (mode == VERIFY_READ) { @@ -62,14 +61,13 @@ long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, err = 0; for (ct = 0; ct < m->msg_iovlen; ct++) { - err += iov[ct].iov_len; - /* - * Goal is not to verify user data, but to prevent returning - * negative value, which is interpreted as errno. - * Overflow is still possible, but it is harmless. - */ - if (err < 0) - return -EMSGSIZE; + size_t len = iov[ct].iov_len; + + if (len > INT_MAX - err) { + len = INT_MAX - err; + iov[ct].iov_len = len; + } + err += len; } return err; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b143173..a5ff5a8 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -598,7 +598,8 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, } spin_lock(&rps_map_lock); - old_map = queue->rps_map; + old_map = rcu_dereference_protected(queue->rps_map, + lockdep_is_held(&rps_map_lock)); rcu_assign_pointer(queue->rps_map, map); spin_unlock(&rps_map_lock); @@ -677,7 +678,8 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, table = NULL; spin_lock(&rps_dev_flow_lock); - old_table = queue->rps_flow_table; + old_table = rcu_dereference_protected(queue->rps_flow_table, + lockdep_is_held(&rps_dev_flow_lock)); rcu_assign_pointer(queue->rps_flow_table, table); spin_unlock(&rps_dev_flow_lock); @@ -705,13 +707,17 @@ static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); struct netdev_rx_queue *first = queue->first; + struct rps_map *map; + struct rps_dev_flow_table *flow_table; - if (queue->rps_map) - call_rcu(&queue->rps_map->rcu, rps_map_release); - if (queue->rps_flow_table) - call_rcu(&queue->rps_flow_table->rcu, - rps_dev_flow_table_release); + map = rcu_dereference_raw(queue->rps_map); + if (map) + call_rcu(&map->rcu, rps_map_release); + + flow_table = rcu_dereference_raw(queue->rps_flow_table); + if (flow_table) + call_rcu(&flow_table->rcu, rps_dev_flow_table_release); if (atomic_dec_and_test(&first->count)) kfree(first); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c988e68..3f86026 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -42,7 +42,9 @@ static int net_assign_generic(struct net *net, int id, void *data) BUG_ON(!mutex_is_locked(&net_mutex)); BUG_ON(id == 0); - ng = old_ng = net->gen; + old_ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&net_mutex)); + ng = old_ng; if (old_ng->len >= id) goto assign; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 2c0df0f..33bc382 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -771,10 +771,10 @@ done: static unsigned long num_arg(const char __user * user_buffer, unsigned long maxlen, unsigned long *num) { - int i = 0; + int i; *num = 0; - for (; i < maxlen; i++) { + for (i = 0; i < maxlen; i++) { char c; if (get_user(c, &user_buffer[i])) return -EFAULT; @@ -789,9 +789,9 @@ static unsigned long num_arg(const char __user * user_buffer, static int strn_len(const char __user * user_buffer, unsigned int maxlen) { - int i = 0; + int i; - for (; i < maxlen; i++) { + for (i = 0; i < maxlen; i++) { char c; if (get_user(c, &user_buffer[i])) return -EFAULT; @@ -846,7 +846,7 @@ static ssize_t pktgen_if_write(struct file *file, { struct seq_file *seq = file->private_data; struct pktgen_dev *pkt_dev = seq->private; - int i = 0, max, len; + int i, max, len; char name[16], valstr[32]; unsigned long value = 0; char *pg_result = NULL; @@ -860,13 +860,13 @@ static ssize_t pktgen_if_write(struct file *file, return -EINVAL; } - max = count - i; - tmp = count_trail_chars(&user_buffer[i], max); + max = count; + tmp = count_trail_chars(user_buffer, max); if (tmp < 0) { pr_warning("illegal format\n"); return tmp; } - i += tmp; + i = tmp; /* Read variable name */ @@ -887,10 +887,11 @@ static ssize_t pktgen_if_write(struct file *file, i += len; if (debug) { - char tb[count + 1]; - if (copy_from_user(tb, user_buffer, count)) + size_t copy = min_t(size_t, count, 1023); + char tb[copy + 1]; + if (copy_from_user(tb, user_buffer, copy)) return -EFAULT; - tb[count] = 0; + tb[copy] = 0; printk(KERN_DEBUG "pktgen: %s,%lu buffer -:%s:-\n", name, (unsigned long)count, tb); } @@ -1764,7 +1765,7 @@ static ssize_t pktgen_thread_write(struct file *file, { struct seq_file *seq = file->private_data; struct pktgen_thread *t = seq->private; - int i = 0, max, len, ret; + int i, max, len, ret; char name[40]; char *pg_result; @@ -1773,12 +1774,12 @@ static ssize_t pktgen_thread_write(struct file *file, return -EINVAL; } - max = count - i; - len = count_trail_chars(&user_buffer[i], max); + max = count; + len = count_trail_chars(user_buffer, max); if (len < 0) return len; - i += len; + i = len; /* Read variable name */ @@ -1975,7 +1976,7 @@ static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, const char *ifname) { char b[IFNAMSIZ+5]; - int i = 0; + int i; for (i = 0; ifname[i] != '@'; i++) { if (i == IFNAMSIZ) @@ -2519,8 +2520,8 @@ static void free_SAs(struct pktgen_dev *pkt_dev) { if (pkt_dev->cflows) { /* let go of the SAs if we have them */ - int i = 0; - for (; i < pkt_dev->cflows; i++) { + int i; + for (i = 0; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; if (x) { xfrm_state_put(x); @@ -2611,8 +2612,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, /* Update any of the values, used when we're incrementing various * fields. */ - queue_map = pkt_dev->cur_queue_map; mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; datalen = (odev->hard_header_len + 16) & ~0xf; @@ -2975,8 +2976,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, /* Update any of the values, used when we're incrementing various * fields. */ - queue_map = pkt_dev->cur_queue_map; mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; skb = __netdev_alloc_skb(odev, pkt_dev->cur_pkt_size + 64 diff --git a/net/core/sock.c b/net/core/sock.c index 11db436..fb60801 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1225,7 +1225,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); skb_queue_head_init(&newsk->sk_error_queue); - filter = newsk->sk_filter; + filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) sk_filter_charge(newsk, filter); @@ -1653,10 +1653,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) { struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); - int allocated; + long allocated; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = atomic_add_return(amt, prot->memory_allocated); + allocated = atomic_long_add_return(amt, prot->memory_allocated); /* Under limit. */ if (allocated <= prot->sysctl_mem[0]) { @@ -1714,7 +1714,7 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - atomic_sub(amt, prot->memory_allocated); + atomic_long_sub(amt, prot->memory_allocated); return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -1727,12 +1727,12 @@ void __sk_mem_reclaim(struct sock *sk) { struct proto *prot = sk->sk_prot; - atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, + atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, prot->memory_allocated); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; if (prot->memory_pressure && *prot->memory_pressure && - (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0])) + (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) *prot->memory_pressure = 0; } EXPORT_SYMBOL(__sk_mem_reclaim); @@ -2452,12 +2452,12 @@ static char proto_method_implemented(const void *method) static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { - seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s " + seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), - proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1, + proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", proto->max_header, proto->slab == NULL ? "no" : "yes", diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 01eee5d..385b609 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -34,7 +34,8 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, mutex_lock(&sock_flow_mutex); - orig_sock_table = rps_sock_flow_table; + orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 117fb09..75c3582 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -134,13 +134,41 @@ static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); +/* + * Congestion control of queued data packets via CCID decision. + * + * The TX CCID performs its congestion-control by indicating whether and when a + * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). + * The following modes are supported via the symbolic constants below: + * - timer-based pacing (CCID returns a delay value in milliseconds); + * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). + */ + +enum ccid_dequeueing_decision { + CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ + CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ + CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ + CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ + CCID_PACKET_ERR = 0xF0000, /* error condition */ +}; + +static inline int ccid_packet_dequeue_eval(const int return_code) +{ + if (return_code < 0) + return CCID_PACKET_ERR; + if (return_code == 0) + return CCID_PACKET_SEND_AT_ONCE; + if (return_code <= CCID_PACKET_DELAY_MAX) + return CCID_PACKET_DELAY; + return return_code; +} + static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { - int rc = 0; if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) - rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); - return rc; + return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); + return CCID_PACKET_SEND_AT_ONCE; } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index d850e29..6576eae 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -78,12 +78,9 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - if (hc->tx_pipe < hc->tx_cwnd) - return 0; - - return 1; /* XXX CCID should dequeue when ready instead of polling */ + if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) + return CCID_PACKET_WILL_DEQUEUE_LATER; + return CCID_PACKET_SEND_AT_ONCE; } static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) @@ -115,6 +112,7 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -129,8 +127,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) if (hc->tx_rto > DCCP_RTO_MAX) hc->tx_rto = DCCP_RTO_MAX; - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); - /* adjust pipe, cwnd etc */ hc->tx_ssthresh = hc->tx_cwnd / 2; if (hc->tx_ssthresh < 2) @@ -146,6 +142,12 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hc->tx_rpseq = 0; hc->tx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); + + /* if we were blocked before, we may now send cwnd=1 packet */ + if (sender_was_blocked) + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); + /* restart backed-off timer */ + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); out: bh_unlock_sock(sk); sock_put(sk); @@ -434,6 +436,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); u64 ackno, seqno; struct ccid2_seq *seqp; unsigned char *vector; @@ -631,6 +634,10 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) sk_stop_timer(sk, &hc->tx_rtotimer); else sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); + + /* check if incoming Acks allow pending packets to be sent */ + if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 9731c2d..25cb6b2 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -81,6 +81,11 @@ struct ccid2_hc_tx_sock { u64 tx_high_ack; }; +static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc) +{ + return hc->tx_pipe >= hc->tx_cwnd; +} + struct ccid2_hc_rx_sock { int rx_data; }; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 3060a60..3d604e1 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -268,11 +268,11 @@ out: sock_put(sk); } -/* - * returns - * > 0: delay (in msecs) that should pass before actually sending - * = 0: can send immediately - * < 0: error condition; do not send packet +/** + * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets + * @skb: next packet candidate to send on @sk + * This function uses the convention of ccid_packet_dequeue_eval() and + * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. */ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { @@ -348,7 +348,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) /* set the nominal send time for the next following packet */ hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi); - return 0; + return CCID_PACKET_SEND_AT_ONCE; } static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 3eb264b..a8ed459 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -243,8 +243,9 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); -extern void dccp_write_xmit(struct sock *sk, int block); -extern void dccp_write_space(struct sock *sk); +extern void dccp_write_xmit(struct sock *sk); +extern void dccp_write_space(struct sock *sk); +extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); extern void dccp_init_xmit_timers(struct sock *sk); static inline void dccp_clear_xmit_timers(struct sock *sk) diff --git a/net/dccp/output.c b/net/dccp/output.c index a988fe9..45b9185 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -209,108 +209,150 @@ void dccp_write_space(struct sock *sk) } /** - * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet + * dccp_wait_for_ccid - Await CCID send permission * @sk: socket to wait for - * @skb: current skb to pass on for waiting - * @delay: sleep timeout in milliseconds (> 0) - * This function is called by default when the socket is closed, and - * when a non-zero linger time is set on the socket. For consistency + * @delay: timeout in jiffies + * This is used by CCIDs which need to delay the send time in process context. */ -static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) +static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) { - struct dccp_sock *dp = dccp_sk(sk); DEFINE_WAIT(wait); - unsigned long jiffdelay; - int rc; + long remaining; + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + sk->sk_write_pending++; + release_sock(sk); + + remaining = schedule_timeout(delay); + + lock_sock(sk); + sk->sk_write_pending--; + finish_wait(sk_sleep(sk), &wait); + + if (signal_pending(current) || sk->sk_err) + return -1; + return remaining; +} + +/** + * dccp_xmit_packet - Send data packet under control of CCID + * Transmits next-queued payload and informs CCID to account for the packet. + */ +static void dccp_xmit_packet(struct sock *sk) +{ + int err, len; + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue); - do { - dccp_pr_debug("delayed send by %d msec\n", delay); - jiffdelay = msecs_to_jiffies(delay); + if (unlikely(skb == NULL)) + return; + len = skb->len; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (sk->sk_state == DCCP_PARTOPEN) { + const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; + /* + * See 8.1.5 - Handshake Completion. + * + * For robustness we resend Confirm options until the client has + * entered OPEN. During the initial feature negotiation, the MPS + * is smaller than usual, reduced by the Change/Confirm options. + */ + if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { + DCCP_WARN("Payload too large (%d) for featneg.\n", len); + dccp_send_ack(sk); + dccp_feat_list_purge(&dp->dccps_featneg); + } - sk->sk_write_pending++; - release_sock(sk); - schedule_timeout(jiffdelay); - lock_sock(sk); - sk->sk_write_pending--; + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; + } else if (dccp_ack_pending(sk)) { + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; + } else { + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; + } + + err = dccp_transmit_skb(sk, skb); + if (err) + dccp_pr_debug("transmit_skb() returned err=%d\n", err); + /* + * Register this one as sent even if an error occurred. To the remote + * end a local packet drop is indistinguishable from network loss, i.e. + * any local drop will eventually be reported via receiver feedback. + */ + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); +} - if (sk->sk_err) - goto do_error; - if (signal_pending(current)) - goto do_interrupted; +/** + * dccp_flush_write_queue - Drain queue at end of connection + * Since dccp_sendmsg queues packets without waiting for them to be sent, it may + * happen that the TX queue is not empty at the end of a connection. We give the + * HC-sender CCID a grace period of up to @time_budget jiffies. If this function + * returns with a non-empty write queue, it will be purged later. + */ +void dccp_flush_write_queue(struct sock *sk, long *time_budget) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb; + long delay, rc; + while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - } while ((delay = rc) > 0); -out: - finish_wait(sk_sleep(sk), &wait); - return rc; - -do_error: - rc = -EPIPE; - goto out; -do_interrupted: - rc = -EINTR; - goto out; + + switch (ccid_packet_dequeue_eval(rc)) { + case CCID_PACKET_WILL_DEQUEUE_LATER: + /* + * If the CCID determines when to send, the next sending + * time is unknown or the CCID may not even send again + * (e.g. remote host crashes or lost Ack packets). + */ + DCCP_WARN("CCID did not manage to send all packets\n"); + return; + case CCID_PACKET_DELAY: + delay = msecs_to_jiffies(rc); + if (delay > *time_budget) + return; + rc = dccp_wait_for_ccid(sk, delay); + if (rc < 0) + return; + *time_budget -= (delay - rc); + /* check again if we can send now */ + break; + case CCID_PACKET_SEND_AT_ONCE: + dccp_xmit_packet(sk); + break; + case CCID_PACKET_ERR: + skb_dequeue(&sk->sk_write_queue); + kfree_skb(skb); + dccp_pr_debug("packet discarded due to err=%ld\n", rc); + } + } } -void dccp_write_xmit(struct sock *sk, int block) +void dccp_write_xmit(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; while ((skb = skb_peek(&sk->sk_write_queue))) { - int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - if (err > 0) { - if (!block) { - sk_reset_timer(sk, &dp->dccps_xmit_timer, - msecs_to_jiffies(err)+jiffies); - break; - } else - err = dccp_wait_for_ccid(sk, skb, err); - if (err && err != -EINTR) - DCCP_BUG("err=%d after dccp_wait_for_ccid", err); - } + int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - skb_dequeue(&sk->sk_write_queue); - if (err == 0) { - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const int len = skb->len; - - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } - - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - dcb->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) - dcb->dccpd_type = DCCP_PKT_DATAACK; - else - dcb->dccpd_type = DCCP_PKT_DATA; - - err = dccp_transmit_skb(sk, skb); - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - if (err) - DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", - err); - } else { - dccp_pr_debug("packet discarded due to err=%d\n", err); + switch (ccid_packet_dequeue_eval(rc)) { + case CCID_PACKET_WILL_DEQUEUE_LATER: + return; + case CCID_PACKET_DELAY: + sk_reset_timer(sk, &dp->dccps_xmit_timer, + jiffies + msecs_to_jiffies(rc)); + return; + case CCID_PACKET_SEND_AT_ONCE: + dccp_xmit_packet(sk); + break; + case CCID_PACKET_ERR: + skb_dequeue(&sk->sk_write_queue); kfree_skb(skb); + dccp_pr_debug("packet discarded due to err=%d\n", rc); } } } @@ -622,7 +664,6 @@ void dccp_send_close(struct sock *sk, const int active) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; if (active) { - dccp_write_xmit(sk, 1); dccp_skb_entail(sk, skb); dccp_transmit_skb(sk, skb_clone(skb, prio)); /* diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 7e5fc04..ef343d5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -726,7 +726,13 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto out_discard; skb_queue_tail(&sk->sk_write_queue, skb); - dccp_write_xmit(sk,0); + /* + * The xmit_timer is set if the TX CCID is rate-based and will expire + * when congestion control permits to release further packets into the + * network. Window-based CCIDs do not use this timer. + */ + if (!timer_pending(&dp->dccps_xmit_timer)) + dccp_write_xmit(sk); out_release: release_sock(sk); return rc ? : len; @@ -951,9 +957,22 @@ void dccp_close(struct sock *sk, long timeout) /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (sk->sk_state != DCCP_CLOSED) { + /* + * Normal connection termination. May need to wait if there are + * still packets in the TX queue that are delayed by the CCID. + */ + dccp_flush_write_queue(sk, &timeout); dccp_terminate_connection(sk); } + /* + * Flush write queue. This may be necessary in several cases: + * - we have been closed by the peer but still have application data; + * - abortive termination (unread data or zero linger time), + * - normal termination but queue could not be flushed within time limit + */ + __skb_queue_purge(&sk->sk_write_queue); + sk_stream_wait_close(sk, timeout); adjudge_to_death: diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 1a9aa05d..7587870 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -237,32 +237,35 @@ out: sock_put(sk); } -/* Transmit-delay timer: used by the CCIDs to delay actual send time */ -static void dccp_write_xmit_timer(unsigned long data) +/** + * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface + * See the comments above %ccid_dequeueing_decision for supported modes. + */ +static void dccp_write_xmitlet(unsigned long data) { struct sock *sk = (struct sock *)data; - struct dccp_sock *dp = dccp_sk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); + sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); else - dccp_write_xmit(sk, 0); + dccp_write_xmit(sk); bh_unlock_sock(sk); - sock_put(sk); } -static void dccp_init_write_xmit_timer(struct sock *sk) +static void dccp_write_xmit_timer(unsigned long data) { - struct dccp_sock *dp = dccp_sk(sk); - - setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, - (unsigned long)sk); + dccp_write_xmitlet(data); + sock_put((struct sock *)data); } void dccp_init_xmit_timers(struct sock *sk) { - dccp_init_write_xmit_timer(sk); + struct dccp_sock *dp = dccp_sk(sk); + + tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); + setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, + (unsigned long)sk); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); } diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index d6b93d1..a76b78d 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -155,7 +155,7 @@ static const struct proto_ops dn_proto_ops; static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; -static atomic_t decnet_memory_allocated; +static atomic_long_t decnet_memory_allocated; static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags); static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index be3eb8e..28f8b5e 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -38,7 +38,7 @@ int decnet_log_martians = 1; int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW; /* Reasonable defaults, I hope, based on tcp's defaults */ -int sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; +long sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; @@ -324,7 +324,7 @@ static ctl_table dn_table[] = { .data = &sysctl_decnet_mem, .maxlen = sizeof(sysctl_decnet_mem), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax }, { .procname = "decnet_rmem", diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 36e27c2..eb6f69a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1052,7 +1052,7 @@ static void ip_fib_net_exit(struct net *net) hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { hlist_del(node); fib_table_flush(tb); - kfree(tb); + fib_free_table(tb); } } kfree(net->ipv4.fib_table_hash); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 43e1c59..b3acb04 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -120,11 +120,12 @@ static inline void fn_rebuild_zone(struct fn_zone *fz, struct fib_node *f; hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { - struct hlist_head __rcu *new_head; + struct hlist_head *new_head; hlist_del_rcu(&f->fn_hash); - new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; + new_head = rcu_dereference_protected(fz->fz_hash, 1) + + fn_hash(f->fn_key, fz); hlist_add_head_rcu(&f->fn_hash, new_head); } } @@ -179,8 +180,8 @@ static void fn_rehash_zone(struct fn_zone *fz) memcpy(&nfz, fz, sizeof(nfz)); write_seqlock_bh(&fz->fz_lock); - old_ht = fz->fz_hash; - nfz.fz_hash = ht; + old_ht = rcu_dereference_protected(fz->fz_hash, 1); + RCU_INIT_POINTER(nfz.fz_hash, ht); nfz.fz_hashmask = new_hashmask; nfz.fz_divisor = new_divisor; fn_rebuild_zone(&nfz, old_ht, old_divisor); @@ -236,7 +237,7 @@ fn_new_zone(struct fn_hash *table, int z) seqlock_init(&fz->fz_lock); fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; fz->fz_hashmask = fz->fz_divisor - 1; - fz->fz_hash = fz->fz_embedded_hash; + RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash); fz->fz_order = z; fz->fz_revorder = 32 - z; fz->fz_mask = inet_make_mask(z); @@ -272,7 +273,7 @@ int fib_table_lookup(struct fib_table *tb, for (fz = rcu_dereference(t->fn_zone_list); fz != NULL; fz = rcu_dereference(fz->fz_next)) { - struct hlist_head __rcu *head; + struct hlist_head *head; struct hlist_node *node; struct fib_node *f; __be32 k; @@ -282,7 +283,7 @@ int fib_table_lookup(struct fib_table *tb, seq = read_seqbegin(&fz->fz_lock); k = fz_key(flp->fl4_dst, fz); - head = &fz->fz_hash[fn_hash(k, fz)]; + head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz); hlist_for_each_entry_rcu(f, node, head, fn_hash) { if (f->fn_key != k) continue; @@ -311,6 +312,7 @@ void fib_table_select_default(struct fib_table *tb, struct fib_info *last_resort; struct fn_hash *t = (struct fn_hash *)tb->tb_data; struct fn_zone *fz = t->fn_zones[0]; + struct hlist_head *head; if (fz == NULL) return; @@ -320,7 +322,8 @@ void fib_table_select_default(struct fib_table *tb, order = -1; rcu_read_lock(); - hlist_for_each_entry_rcu(f, node, &fz->fz_hash[0], fn_hash) { + head = rcu_dereference(fz->fz_hash); + hlist_for_each_entry_rcu(f, node, head, fn_hash) { struct fib_alias *fa; list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { @@ -374,7 +377,7 @@ out: /* Insert node F to FZ. */ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) { - struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; + struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz); hlist_add_head_rcu(&f->fn_hash, head); } @@ -382,7 +385,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) /* Return the node in FZ matching KEY. */ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) { - struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)]; + struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz); struct hlist_node *node; struct fib_node *f; @@ -662,7 +665,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) static int fn_flush_list(struct fn_zone *fz, int idx) { - struct hlist_head *head = &fz->fz_hash[idx]; + struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx; struct hlist_node *node, *n; struct fib_node *f; int found = 0; @@ -713,6 +716,24 @@ int fib_table_flush(struct fib_table *tb) return found; } +void fib_free_table(struct fib_table *tb) +{ + struct fn_hash *table = (struct fn_hash *) tb->tb_data; + struct fn_zone *fz, *next; + + next = table->fn_zone_list; + while (next != NULL) { + fz = next; + next = fz->fz_next; + + if (fz->fz_hash != fz->fz_embedded_hash) + fz_hash_free(fz->fz_hash, fz->fz_divisor); + + kfree(fz); + } + + kfree(tb); +} static inline int fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, @@ -761,14 +782,15 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, struct fn_zone *fz) { int h, s_h; + struct hlist_head *head = rcu_dereference(fz->fz_hash); - if (fz->fz_hash == NULL) + if (head == NULL) return skb->len; s_h = cb->args[3]; for (h = s_h; h < fz->fz_divisor; h++) { - if (hlist_empty(&fz->fz_hash[h])) + if (hlist_empty(head + h)) continue; - if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h]) < 0) { + if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) { cb->args[3] = h; return -1; } @@ -872,7 +894,7 @@ static struct fib_alias *fib_get_first(struct seq_file *seq) if (!iter->zone->fz_nent) continue; - iter->hash_head = iter->zone->fz_hash; + iter->hash_head = rcu_dereference(iter->zone->fz_hash); maxslot = iter->zone->fz_divisor; for (iter->bucket = 0; iter->bucket < maxslot; @@ -957,7 +979,7 @@ static struct fib_alias *fib_get_next(struct seq_file *seq) goto out; iter->bucket = 0; - iter->hash_head = iter->zone->fz_hash; + iter->hash_head = rcu_dereference(iter->zone->fz_hash); hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { list_for_each_entry(fa, &fn->fn_alias, fa_list) { diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index a29edf2..c079cc0 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -47,11 +47,8 @@ extern int fib_detect_death(struct fib_info *fi, int order, static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) { - if (res->fi != NULL) - fib_info_put(res->fi); + /* we used to play games with refcounts, but we now use RCU */ res->fi = fi; - if (fi != NULL) - atomic_inc(&fi->fib_clntref); } #endif /* _FIB_LOOKUP_H */ diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b144508..200eb53 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1797,6 +1797,11 @@ int fib_table_flush(struct fib_table *tb) return found; } +void fib_free_table(struct fib_table *tb) +{ + kfree(tb); +} + void fib_table_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index caea688..c6933f2 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -22,7 +22,7 @@ #include <net/gre.h> -static const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly; +static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; static DEFINE_SPINLOCK(gre_proto_lock); int gre_add_protocol(const struct gre_protocol *proto, u8 version) @@ -51,7 +51,8 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) goto err_out; spin_lock(&gre_proto_lock); - if (gre_proto[version] != proto) + if (rcu_dereference_protected(gre_proto[version], + lockdep_is_held(&gre_proto_lock)) != proto) goto err_out_unlock; rcu_assign_pointer(gre_proto[version], NULL); spin_unlock(&gre_proto_lock); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c8877c6..3c53c2d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2306,10 +2306,8 @@ void ip_mc_drop_socket(struct sock *sk) in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); - if (in_dev != NULL) { + if (in_dev != NULL) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); - in_dev_put(in_dev); - } /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); call_rcu(&iml->rcu, ip_mc_socklist_reclaim); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index ba80426..2ada171 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -490,9 +490,11 @@ static int inet_csk_diag_dump(struct sock *sk, { struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { struct inet_diag_entry entry; - struct rtattr *bc = (struct rtattr *)(r + 1); + const struct nlattr *bc = nlmsg_find_attr(cb->nlh, + sizeof(*r), + INET_DIAG_REQ_BYTECODE); struct inet_sock *inet = inet_sk(sk); entry.family = sk->sk_family; @@ -512,7 +514,7 @@ static int inet_csk_diag_dump(struct sock *sk, entry.dport = ntohs(inet->inet_dport); entry.userlocks = sk->sk_userlocks; - if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) return 0; } @@ -527,9 +529,11 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, { struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { struct inet_diag_entry entry; - struct rtattr *bc = (struct rtattr *)(r + 1); + const struct nlattr *bc = nlmsg_find_attr(cb->nlh, + sizeof(*r), + INET_DIAG_REQ_BYTECODE); entry.family = tw->tw_family; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) @@ -548,7 +552,7 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, entry.dport = ntohs(tw->tw_dport); entry.userlocks = 0; - if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) return 0; } @@ -618,7 +622,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, struct inet_diag_req *r = NLMSG_DATA(cb->nlh); struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt; - struct rtattr *bc = NULL; + const struct nlattr *bc = NULL; struct inet_sock *inet = inet_sk(sk); int j, s_j; int reqnum, s_reqnum; @@ -638,8 +642,9 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, if (!lopt || !lopt->qlen) goto out; - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { - bc = (struct rtattr *)(r + 1); + if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { + bc = nlmsg_find_attr(cb->nlh, sizeof(*r), + INET_DIAG_REQ_BYTECODE); entry.sport = inet->inet_num; entry.userlocks = sk->sk_userlocks; } @@ -672,8 +677,8 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, &ireq->rmt_addr; entry.dport = ntohs(ireq->rmt_port); - if (!inet_diag_bc_run(RTA_DATA(bc), - RTA_PAYLOAD(bc), &entry)) + if (!inet_diag_bc_run(nla_data(bc), + nla_len(bc), &entry)) continue; } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 9ffa24b..9e94d7c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -72,18 +72,19 @@ static struct kmem_cache *peer_cachep __read_mostly; #define node_height(x) x->avl_height #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) +#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node) static const struct inet_peer peer_fake_node = { - .avl_left = peer_avl_empty, - .avl_right = peer_avl_empty, + .avl_left = peer_avl_empty_rcu, + .avl_right = peer_avl_empty_rcu, .avl_height = 0 }; static struct { - struct inet_peer *root; + struct inet_peer __rcu *root; spinlock_t lock; int total; } peers = { - .root = peer_avl_empty, + .root = peer_avl_empty_rcu, .lock = __SPIN_LOCK_UNLOCKED(peers.lock), .total = 0, }; @@ -156,11 +157,14 @@ static void unlink_from_unused(struct inet_peer *p) */ #define lookup(_daddr, _stack) \ ({ \ - struct inet_peer *u, **v; \ + struct inet_peer *u; \ + struct inet_peer __rcu **v; \ \ stackptr = _stack; \ *stackptr++ = &peers.root; \ - for (u = peers.root; u != peer_avl_empty; ) { \ + for (u = rcu_dereference_protected(peers.root, \ + lockdep_is_held(&peers.lock)); \ + u != peer_avl_empty; ) { \ if (_daddr == u->v4daddr) \ break; \ if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ @@ -168,7 +172,8 @@ static void unlink_from_unused(struct inet_peer *p) else \ v = &u->avl_right; \ *stackptr++ = v; \ - u = *v; \ + u = rcu_dereference_protected(*v, \ + lockdep_is_held(&peers.lock)); \ } \ u; \ }) @@ -209,13 +214,17 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr) /* Called with local BH disabled and the pool lock held. */ #define lookup_rightempty(start) \ ({ \ - struct inet_peer *u, **v; \ + struct inet_peer *u; \ + struct inet_peer __rcu **v; \ *stackptr++ = &start->avl_left; \ v = &start->avl_left; \ - for (u = *v; u->avl_right != peer_avl_empty; ) { \ + for (u = rcu_dereference_protected(*v, \ + lockdep_is_held(&peers.lock)); \ + u->avl_right != peer_avl_empty_rcu; ) { \ v = &u->avl_right; \ *stackptr++ = v; \ - u = *v; \ + u = rcu_dereference_protected(*v, \ + lockdep_is_held(&peers.lock)); \ } \ u; \ }) @@ -224,74 +233,86 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr) * Variable names are the proof of operation correctness. * Look into mm/map_avl.c for more detail description of the ideas. */ -static void peer_avl_rebalance(struct inet_peer **stack[], - struct inet_peer ***stackend) +static void peer_avl_rebalance(struct inet_peer __rcu **stack[], + struct inet_peer __rcu ***stackend) { - struct inet_peer **nodep, *node, *l, *r; + struct inet_peer __rcu **nodep; + struct inet_peer *node, *l, *r; int lh, rh; while (stackend > stack) { nodep = *--stackend; - node = *nodep; - l = node->avl_left; - r = node->avl_right; + node = rcu_dereference_protected(*nodep, + lockdep_is_held(&peers.lock)); + l = rcu_dereference_protected(node->avl_left, + lockdep_is_held(&peers.lock)); + r = rcu_dereference_protected(node->avl_right, + lockdep_is_held(&peers.lock)); lh = node_height(l); rh = node_height(r); if (lh > rh + 1) { /* l: RH+2 */ struct inet_peer *ll, *lr, *lrl, *lrr; int lrh; - ll = l->avl_left; - lr = l->avl_right; + ll = rcu_dereference_protected(l->avl_left, + lockdep_is_held(&peers.lock)); + lr = rcu_dereference_protected(l->avl_right, + lockdep_is_held(&peers.lock)); lrh = node_height(lr); if (lrh <= node_height(ll)) { /* ll: RH+1 */ - node->avl_left = lr; /* lr: RH or RH+1 */ - node->avl_right = r; /* r: RH */ + RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ + RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ node->avl_height = lrh + 1; /* RH+1 or RH+2 */ - l->avl_left = ll; /* ll: RH+1 */ - l->avl_right = node; /* node: RH+1 or RH+2 */ + RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */ + RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */ l->avl_height = node->avl_height + 1; - *nodep = l; + RCU_INIT_POINTER(*nodep, l); } else { /* ll: RH, lr: RH+1 */ - lrl = lr->avl_left; /* lrl: RH or RH-1 */ - lrr = lr->avl_right; /* lrr: RH or RH-1 */ - node->avl_left = lrr; /* lrr: RH or RH-1 */ - node->avl_right = r; /* r: RH */ + lrl = rcu_dereference_protected(lr->avl_left, + lockdep_is_held(&peers.lock)); /* lrl: RH or RH-1 */ + lrr = rcu_dereference_protected(lr->avl_right, + lockdep_is_held(&peers.lock)); /* lrr: RH or RH-1 */ + RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ + RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ node->avl_height = rh + 1; /* node: RH+1 */ - l->avl_left = ll; /* ll: RH */ - l->avl_right = lrl; /* lrl: RH or RH-1 */ + RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */ + RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */ l->avl_height = rh + 1; /* l: RH+1 */ - lr->avl_left = l; /* l: RH+1 */ - lr->avl_right = node; /* node: RH+1 */ + RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */ + RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */ lr->avl_height = rh + 2; - *nodep = lr; + RCU_INIT_POINTER(*nodep, lr); } } else if (rh > lh + 1) { /* r: LH+2 */ struct inet_peer *rr, *rl, *rlr, *rll; int rlh; - rr = r->avl_right; - rl = r->avl_left; + rr = rcu_dereference_protected(r->avl_right, + lockdep_is_held(&peers.lock)); + rl = rcu_dereference_protected(r->avl_left, + lockdep_is_held(&peers.lock)); rlh = node_height(rl); if (rlh <= node_height(rr)) { /* rr: LH+1 */ - node->avl_right = rl; /* rl: LH or LH+1 */ - node->avl_left = l; /* l: LH */ + RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ + RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ node->avl_height = rlh + 1; /* LH+1 or LH+2 */ - r->avl_right = rr; /* rr: LH+1 */ - r->avl_left = node; /* node: LH+1 or LH+2 */ + RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */ + RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */ r->avl_height = node->avl_height + 1; - *nodep = r; + RCU_INIT_POINTER(*nodep, r); } else { /* rr: RH, rl: RH+1 */ - rlr = rl->avl_right; /* rlr: LH or LH-1 */ - rll = rl->avl_left; /* rll: LH or LH-1 */ - node->avl_right = rll; /* rll: LH or LH-1 */ - node->avl_left = l; /* l: LH */ + rlr = rcu_dereference_protected(rl->avl_right, + lockdep_is_held(&peers.lock)); /* rlr: LH or LH-1 */ + rll = rcu_dereference_protected(rl->avl_left, + lockdep_is_held(&peers.lock)); /* rll: LH or LH-1 */ + RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ + RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ node->avl_height = lh + 1; /* node: LH+1 */ - r->avl_right = rr; /* rr: LH */ - r->avl_left = rlr; /* rlr: LH or LH-1 */ + RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */ + RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */ r->avl_height = lh + 1; /* r: LH+1 */ - rl->avl_right = r; /* r: LH+1 */ - rl->avl_left = node; /* node: LH+1 */ + RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */ + RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */ rl->avl_height = lh + 2; - *nodep = rl; + RCU_INIT_POINTER(*nodep, rl); } } else { node->avl_height = (lh > rh ? lh : rh) + 1; @@ -303,10 +324,10 @@ static void peer_avl_rebalance(struct inet_peer **stack[], #define link_to_pool(n) \ do { \ n->avl_height = 1; \ - n->avl_left = peer_avl_empty; \ - n->avl_right = peer_avl_empty; \ - smp_wmb(); /* lockless readers can catch us now */ \ - **--stackptr = n; \ + n->avl_left = peer_avl_empty_rcu; \ + n->avl_right = peer_avl_empty_rcu; \ + /* lockless readers can catch us now */ \ + rcu_assign_pointer(**--stackptr, n); \ peer_avl_rebalance(stack, stackptr); \ } while (0) @@ -330,24 +351,25 @@ static void unlink_from_pool(struct inet_peer *p) * We use refcnt=-1 to alert lockless readers this entry is deleted. */ if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { - struct inet_peer **stack[PEER_MAXDEPTH]; - struct inet_peer ***stackptr, ***delp; + struct inet_peer __rcu **stack[PEER_MAXDEPTH]; + struct inet_peer __rcu ***stackptr, ***delp; if (lookup(p->v4daddr, stack) != p) BUG(); delp = stackptr - 1; /* *delp[0] == p */ - if (p->avl_left == peer_avl_empty) { + if (p->avl_left == peer_avl_empty_rcu) { *delp[0] = p->avl_right; --stackptr; } else { /* look for a node to insert instead of p */ struct inet_peer *t; t = lookup_rightempty(p); - BUG_ON(*stackptr[-1] != t); + BUG_ON(rcu_dereference_protected(*stackptr[-1], + lockdep_is_held(&peers.lock)) != t); **--stackptr = t->avl_left; /* t is removed, t->v4daddr > x->v4daddr for any * x in p->avl_left subtree. * Put t in the old place of p. */ - *delp[0] = t; + RCU_INIT_POINTER(*delp[0], t); t->avl_left = p->avl_left; t->avl_right = p->avl_right; t->avl_height = p->avl_height; @@ -414,7 +436,7 @@ static int cleanup_once(unsigned long ttl) struct inet_peer *inet_getpeer(__be32 daddr, int create) { struct inet_peer *p; - struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; + struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; /* Look up for the address quickly, lockless. * Because of a concurrent writer, we might not find an existing entry. diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d0ffcbe..70ff77f 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1072,6 +1072,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) break; } ipgre_tunnel_unlink(ign, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; t->parms.i_key = p.i_key; @@ -1324,7 +1325,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; - struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id); tunnel->dev = dev; strcpy(tunnel->parms.name, dev->name); @@ -1335,7 +1335,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) tunnel->hlen = sizeof(struct iphdr) + 4; dev_hold(dev); - rcu_assign_pointer(ign->tunnels_wc[0], tunnel); } @@ -1382,10 +1381,12 @@ static int __net_init ipgre_init_net(struct net *net) if ((err = register_netdev(ign->fb_tunnel_dev))) goto err_reg_dev; + rcu_assign_pointer(ign->tunnels_wc[0], + netdev_priv(ign->fb_tunnel_dev)); return 0; err_reg_dev: - free_netdev(ign->fb_tunnel_dev); + ipgre_dev_free(ign->fb_tunnel_dev); err_alloc_dev: return err; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 64b70ad..3948c86 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -238,7 +238,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) but receiver should be enough clever f.e. to forward mtrace requests, sent to multicast group to reach destination designated router. */ -struct ip_ra_chain *ip_ra_chain; +struct ip_ra_chain __rcu *ip_ra_chain; static DEFINE_SPINLOCK(ip_ra_lock); @@ -253,7 +253,8 @@ static void ip_ra_destroy_rcu(struct rcu_head *head) int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) { - struct ip_ra_chain *ra, *new_ra, **rap; + struct ip_ra_chain *ra, *new_ra; + struct ip_ra_chain __rcu **rap; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; @@ -261,7 +262,10 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; spin_lock_bh(&ip_ra_lock); - for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { + for (rap = &ip_ra_chain; + (ra = rcu_dereference_protected(*rap, + lockdep_is_held(&ip_ra_lock))) != NULL; + rap = &ra->next) { if (ra->sk == sk) { if (on) { spin_unlock_bh(&ip_ra_lock); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e9b816e..cd300aa 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -676,6 +676,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) } t = netdev_priv(dev); ipip_tunnel_unlink(ipn, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; memcpy(dev->dev_addr, &p.iph.saddr, 4); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3cad259..3fac340 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -927,6 +927,7 @@ static int get_info(struct net *net, void __user *user, private = &tmp; } #endif + memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d31b007..a846d63 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1124,6 +1124,7 @@ static int get_info(struct net *net, void __user *user, private = &tmp; } #endif + memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 295c974..c04787c 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -47,26 +47,6 @@ __nf_nat_proto_find(u_int8_t protonum) return rcu_dereference(nf_nat_protos[protonum]); } -static const struct nf_nat_protocol * -nf_nat_proto_find_get(u_int8_t protonum) -{ - const struct nf_nat_protocol *p; - - rcu_read_lock(); - p = __nf_nat_proto_find(protonum); - if (!try_module_get(p->me)) - p = &nf_nat_unknown_protocol; - rcu_read_unlock(); - - return p; -} - -static void -nf_nat_proto_put(const struct nf_nat_protocol *p) -{ - module_put(p->me); -} - /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int hash_by_src(const struct net *net, u16 zone, @@ -588,6 +568,26 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> +static const struct nf_nat_protocol * +nf_nat_proto_find_get(u_int8_t protonum) +{ + const struct nf_nat_protocol *p; + + rcu_read_lock(); + p = __nf_nat_proto_find(protonum); + if (!try_module_get(p->me)) + p = &nf_nat_unknown_protocol; + rcu_read_unlock(); + + return p; +} + +static void +nf_nat_proto_put(const struct nf_nat_protocol *p) +{ + module_put(p->me); +} + static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4ae1f20..1b48eb1 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -59,13 +59,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) local_bh_enable(); socket_seq_show(seq); - seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", + seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, tcp_death_row.tw_count, sockets, - atomic_read(&tcp_memory_allocated)); - seq_printf(seq, "UDP: inuse %d mem %d\n", + atomic_long_read(&tcp_memory_allocated)); + seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), - atomic_read(&udp_memory_allocated)); + atomic_long_read(&udp_memory_allocated)); seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 65699c2..9ae5c01 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -28,7 +28,7 @@ #include <linux/spinlock.h> #include <net/protocol.h> -const struct net_protocol *inet_protos[MAX_INET_PROTOS] __read_mostly; +const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; /* * Add a protocol handler to the hash tables @@ -38,7 +38,8 @@ int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { int hash = protocol & (MAX_INET_PROTOS - 1); - return !cmpxchg(&inet_protos[hash], NULL, prot) ? 0 : -1; + return !cmpxchg((const struct net_protocol **)&inet_protos[hash], + NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet_add_protocol); @@ -50,7 +51,8 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) { int ret, hash = protocol & (MAX_INET_PROTOS - 1); - ret = (cmpxchg(&inet_protos[hash], prot, NULL) == prot) ? 0 : -1; + ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], + prot, NULL) == prot) ? 0 : -1; synchronize_net(); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d6cb2bf..987bf9a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -198,7 +198,7 @@ const __u8 ip_tos2prio[16] = { */ struct rt_hash_bucket { - struct rtable *chain; + struct rtable __rcu *chain; }; #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ @@ -280,7 +280,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) struct rtable *r = NULL; for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { - if (!rt_hash_table[st->bucket].chain) + if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)) continue; rcu_read_lock_bh(); r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); @@ -300,17 +300,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, { struct rt_cache_iter_state *st = seq->private; - r = r->dst.rt_next; + r = rcu_dereference_bh(r->dst.rt_next); while (!r) { rcu_read_unlock_bh(); do { if (--st->bucket < 0) return NULL; - } while (!rt_hash_table[st->bucket].chain); + } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)); rcu_read_lock_bh(); - r = rt_hash_table[st->bucket].chain; + r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); } - return rcu_dereference_bh(r); + return r; } static struct rtable *rt_cache_get_next(struct seq_file *seq, @@ -721,19 +721,23 @@ static void rt_do_flush(int process_context) for (i = 0; i <= rt_hash_mask; i++) { if (process_context && need_resched()) cond_resched(); - rth = rt_hash_table[i].chain; + rth = rcu_dereference_raw(rt_hash_table[i].chain); if (!rth) continue; spin_lock_bh(rt_hash_lock_addr(i)); #ifdef CONFIG_NET_NS { - struct rtable ** prev, * p; + struct rtable __rcu **prev; + struct rtable *p; - rth = rt_hash_table[i].chain; + rth = rcu_dereference_protected(rt_hash_table[i].chain, + lockdep_is_held(rt_hash_lock_addr(i))); /* defer releasing the head of the list after spin_unlock */ - for (tail = rth; tail; tail = tail->dst.rt_next) + for (tail = rth; tail; + tail = rcu_dereference_protected(tail->dst.rt_next, + lockdep_is_held(rt_hash_lock_addr(i)))) if (!rt_is_expired(tail)) break; if (rth != tail) @@ -741,8 +745,12 @@ static void rt_do_flush(int process_context) /* call rt_free on entries after the tail requiring flush */ prev = &rt_hash_table[i].chain; - for (p = *prev; p; p = next) { - next = p->dst.rt_next; + for (p = rcu_dereference_protected(*prev, + lockdep_is_held(rt_hash_lock_addr(i))); + p != NULL; + p = next) { + next = rcu_dereference_protected(p->dst.rt_next, + lockdep_is_held(rt_hash_lock_addr(i))); if (!rt_is_expired(p)) { prev = &p->dst.rt_next; } else { @@ -752,14 +760,15 @@ static void rt_do_flush(int process_context) } } #else - rth = rt_hash_table[i].chain; - rt_hash_table[i].chain = NULL; + rth = rcu_dereference_protected(rt_hash_table[i].chain, + lockdep_is_held(rt_hash_lock_addr(i))); + rcu_assign_pointer(rt_hash_table[i].chain, NULL); tail = NULL; #endif spin_unlock_bh(rt_hash_lock_addr(i)); for (; rth != tail; rth = next) { - next = rth->dst.rt_next; + next = rcu_dereference_protected(rth->dst.rt_next, 1); rt_free(rth); } } @@ -790,7 +799,7 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) while (aux != rth) { if (compare_hash_inputs(&aux->fl, &rth->fl)) return 0; - aux = aux->dst.rt_next; + aux = rcu_dereference_protected(aux->dst.rt_next, 1); } return ONE; } @@ -799,7 +808,8 @@ static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; - struct rtable *rth, **rthp; + struct rtable *rth; + struct rtable __rcu **rthp; unsigned long samples = 0; unsigned long sum = 0, sum2 = 0; unsigned long delta; @@ -825,11 +835,12 @@ static void rt_check_expire(void) samples++; - if (*rthp == NULL) + if (rcu_dereference_raw(*rthp) == NULL) continue; length = 0; spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = *rthp) != NULL) { + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { prefetch(rth->dst.rt_next); if (rt_is_expired(rth)) { *rthp = rth->dst.rt_next; @@ -941,7 +952,8 @@ static int rt_garbage_collect(struct dst_ops *ops) static unsigned long last_gc; static int rover; static int equilibrium; - struct rtable *rth, **rthp; + struct rtable *rth; + struct rtable __rcu **rthp; unsigned long now = jiffies; int goal; int entries = dst_entries_get_fast(&ipv4_dst_ops); @@ -995,7 +1007,8 @@ static int rt_garbage_collect(struct dst_ops *ops) k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; spin_lock_bh(rt_hash_lock_addr(k)); - while ((rth = *rthp) != NULL) { + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { if (!rt_is_expired(rth) && !rt_may_expire(rth, tmo, expire)) { tmo >>= 1; @@ -1071,7 +1084,7 @@ static int slow_chain_length(const struct rtable *head) while (rth) { length += has_noalias(head, rth); - rth = rth->dst.rt_next; + rth = rcu_dereference_protected(rth->dst.rt_next, 1); } return length >> FRACT_BITS; } @@ -1079,9 +1092,9 @@ static int slow_chain_length(const struct rtable *head) static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp, struct sk_buff *skb, int ifindex) { - struct rtable *rth, **rthp; + struct rtable *rth, *cand; + struct rtable __rcu **rthp, **candp; unsigned long now; - struct rtable *cand, **candp; u32 min_score; int chain_length; int attempts = !in_softirq(); @@ -1128,7 +1141,8 @@ restart: rthp = &rt_hash_table[hash].chain; spin_lock_bh(rt_hash_lock_addr(hash)); - while ((rth = *rthp) != NULL) { + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { if (rt_is_expired(rth)) { *rthp = rth->dst.rt_next; rt_free(rth); @@ -1324,12 +1338,14 @@ EXPORT_SYMBOL(__ip_select_ident); static void rt_del(unsigned hash, struct rtable *rt) { - struct rtable **rthp, *aux; + struct rtable __rcu **rthp; + struct rtable *aux; rthp = &rt_hash_table[hash].chain; spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); - while ((aux = *rthp) != NULL) { + while ((aux = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { if (aux == rt || rt_is_expired(aux)) { *rthp = aux->dst.rt_next; rt_free(aux); @@ -1346,7 +1362,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, { int i, k; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct rtable *rth, **rthp; + struct rtable *rth; + struct rtable __rcu **rthp; __be32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; struct netevent_redirect netevent; @@ -1379,7 +1396,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], rt_genid(net)); - rthp=&rt_hash_table[hash].chain; + rthp = &rt_hash_table[hash].chain; while ((rth = rcu_dereference(*rthp)) != NULL) { struct rtable *rt; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d96c1da..e91911d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -398,7 +398,7 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_tcp_mem, .maxlen = sizeof(sysctl_tcp_mem), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_doulongvec_minmax }, { .procname = "tcp_wmem", @@ -602,8 +602,7 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero + .proc_handler = proc_doulongvec_minmax, }, { .procname = "udp_rmem_min", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1664a05..0814199 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -282,7 +282,7 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); -int sysctl_tcp_mem[3] __read_mostly; +long sysctl_tcp_mem[3] __read_mostly; int sysctl_tcp_wmem[3] __read_mostly; int sysctl_tcp_rmem[3] __read_mostly; @@ -290,7 +290,7 @@ EXPORT_SYMBOL(sysctl_tcp_mem); EXPORT_SYMBOL(sysctl_tcp_rmem); EXPORT_SYMBOL(sysctl_tcp_wmem); -atomic_t tcp_memory_allocated; /* Current allocated memory. */ +atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); /* @@ -2246,7 +2246,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ - if (val < 8 || val > MAX_TCP_WINDOW) { + if (val < 64 || val > MAX_TCP_WINDOW) { err = -EINVAL; break; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3357f69..6d8ab1c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -259,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk) int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); - if (sk->sk_sndbuf < 3 * sndmem) - sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); + if (sk->sk_sndbuf < 3 * sndmem) { + sk->sk_sndbuf = 3 * sndmem; + if (sk->sk_sndbuf > sysctl_tcp_wmem[2]) + sk->sk_sndbuf = sysctl_tcp_wmem[2]; + } } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -396,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); } @@ -4861,7 +4864,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk) return 0; /* If we are under soft global TCP memory pressure, do not expand. */ - if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) + if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) return 0; /* If we filled the congestion window, do not expand. */ diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 9a17bd2..ac3b3ee 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -14,27 +14,32 @@ #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm_tunnel *tunnel4_handlers __read_mostly; -static struct xfrm_tunnel *tunnel64_handlers __read_mostly; +static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; +static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); -static inline struct xfrm_tunnel **fam_handlers(unsigned short family) +static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) { return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; } int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) { - struct xfrm_tunnel **pprev; + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; + int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel4_mutex); - for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { - if ((*pprev)->priority > priority) + for (pprev = fam_handlers(family); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel4_mutex))) != NULL; + pprev = &t->next) { + if (t->priority > priority) break; - if ((*pprev)->priority == priority) + if (t->priority == priority) goto err; } @@ -52,13 +57,17 @@ EXPORT_SYMBOL(xfrm4_tunnel_register); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) { - struct xfrm_tunnel **pprev; + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel4_mutex); - for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { - if (*pprev == handler) { + for (pprev = fam_handlers(family); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel4_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { *pprev = handler->next; ret = 0; break; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b3f7e8c..5e0a3a5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -110,7 +110,7 @@ struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); -int sysctl_udp_mem[3] __read_mostly; +long sysctl_udp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_udp_mem); int sysctl_udp_rmem_min __read_mostly; @@ -119,7 +119,7 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min); int sysctl_udp_wmem_min __read_mostly; EXPORT_SYMBOL(sysctl_udp_wmem_min); -atomic_t udp_memory_allocated; +atomic_long_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); #define MAX_UDP_PORTS 65536 @@ -1413,7 +1413,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } } - if (sk->sk_filter) { + if (rcu_dereference_raw(sk->sk_filter)) { if (udp_lib_checksum_complete(skb)) goto drop; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ec7a91d..e048ec6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -836,7 +836,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i { struct inet6_dev *idev = ifp->idev; struct in6_addr addr, *tmpaddr; - unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp; + unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp, age; unsigned long regen_advance; int tmp_plen; int ret = 0; @@ -886,12 +886,13 @@ retry: goto out; } memcpy(&addr.s6_addr[8], idev->rndid, 8); + age = (jiffies - ifp->tstamp) / HZ; tmp_valid_lft = min_t(__u32, ifp->valid_lft, - idev->cnf.temp_valid_lft); + idev->cnf.temp_valid_lft + age); tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, - idev->cnf.temp_prefered_lft - + idev->cnf.temp_prefered_lft + age - idev->cnf.max_desync_factor); tmp_plen = ifp->prefix_len; max_addresses = idev->cnf.max_addresses; @@ -1426,8 +1427,10 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; - if (addrconf_dad_end(ifp)) + if (addrconf_dad_end(ifp)) { + in6_ifa_put(ifp); return; + } if (net_ratelimit()) printk(KERN_INFO "%s: IPv6 duplicate address %pI6c detected!\n", @@ -2021,10 +2024,11 @@ ok: ipv6_ifa_notify(0, ift); } - if (create && in6_dev->cnf.use_tempaddr > 0) { + if ((create || list_empty(&in6_dev->tempaddr_list)) && in6_dev->cnf.use_tempaddr > 0) { /* * When a new public address is created as described in [ADDRCONF], - * also create a new temporary address. + * also create a new temporary address. Also create a temporary + * address if it's enabled but no temporary address currently exists. */ read_unlock_bh(&in6_dev->lock); ipv6_create_tempaddr(ifp, NULL); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index c2c0f89..2a59610 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1284,6 +1284,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) t = netdev_priv(dev); ip6_tnl_unlink(ip6n, t); + synchronize_net(); err = ip6_tnl_change(t, &p); ip6_tnl_link(ip6n, t); netdev_state_change(dev); @@ -1371,6 +1372,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 0553867..d1770e0 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -343,6 +343,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, break; case IPV6_TRANSPARENT: + if (!capable(CAP_NET_ADMIN)) { + retv = -EPERM; + break; + } if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 44d2eea..4484648 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -5,10 +5,15 @@ menu "IPv6: Netfilter Configuration" depends on INET && IPV6 && NETFILTER +config NF_DEFRAG_IPV6 + tristate + default n + config NF_CONNTRACK_IPV6 tristate "IPv6 connection tracking support" depends on INET && IPV6 && NF_CONNTRACK default m if NETFILTER_ADVANCED=n + select NF_DEFRAG_IPV6 ---help--- Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 3f8e4a3..0a432c9 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -12,11 +12,14 @@ obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o # objects for l3 independent conntrack nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o -nf_defrag_ipv6-objs := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o # l3 independent conntrack obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o +# defrag +nf_defrag_ipv6-objs := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o +obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o + # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 51df035..4555823 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1137,6 +1137,7 @@ static int get_info(struct net *net, void __user *user, private = &tmp; } #endif + memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 489d71b..3a3f129 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -625,21 +625,24 @@ int nf_ct_frag6_init(void) inet_frags_init_net(&nf_init_frags); inet_frags_init(&nf_frags); +#ifdef CONFIG_SYSCTL nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path, nf_ct_frag6_sysctl_table); if (!nf_ct_frag6_sysctl_header) { inet_frags_fini(&nf_frags); return -ENOMEM; } +#endif return 0; } void nf_ct_frag6_cleanup(void) { +#ifdef CONFIG_SYSCTL unregister_sysctl_table(nf_ct_frag6_sysctl_header); nf_ct_frag6_sysctl_header = NULL; - +#endif inet_frags_fini(&nf_frags); nf_init_frags.low_thresh = 0; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index d082eae..24b3558 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -126,6 +126,8 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS), + SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_SENTINEL }; @@ -134,6 +136,8 @@ static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS), + SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index 9bb936a..9a7978f 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c @@ -25,13 +25,14 @@ #include <linux/spinlock.h> #include <net/protocol.h> -const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS] __read_mostly; +const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) { int hash = protocol & (MAX_INET_PROTOS - 1); - return !cmpxchg(&inet6_protos[hash], NULL, prot) ? 0 : -1; + return !cmpxchg((const struct inet6_protocol **)&inet6_protos[hash], + NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet6_add_protocol); @@ -43,7 +44,8 @@ int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol { int ret, hash = protocol & (MAX_INET_PROTOS - 1); - ret = (cmpxchg(&inet6_protos[hash], prot, NULL) == prot) ? 0 : -1; + ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[hash], + prot, NULL) == prot) ? 0 : -1; synchronize_net(); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 45e6efb7..86c3952 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -373,7 +373,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) { - if ((raw6_sk(sk)->checksum || sk->sk_filter) && + if ((raw6_sk(sk)->checksum || rcu_dereference_raw(sk->sk_filter)) && skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); kfree_skb(skb); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index c7ba314..0f27664 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -349,7 +349,7 @@ found: /* Check for overlap with preceding fragment. */ if (prev && - (FRAG6_CB(prev)->offset + prev->len) - offset > 0) + (FRAG6_CB(prev)->offset + prev->len) > offset) goto discard_fq; /* Look for overlap with succeeding segment. */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 25661f9..fc32833 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2741,6 +2741,7 @@ static void __net_exit ip6_route_net_exit(struct net *net) kfree(net->ipv6.ip6_prohibit_entry); kfree(net->ipv6.ip6_blk_hole_entry); #endif + dst_entries_destroy(&net->ipv6.ip6_dst_ops); } static struct pernet_operations ip6_route_net_ops = { @@ -2832,5 +2833,6 @@ void ip6_route_cleanup(void) xfrm6_fini(); fib6_gc_cleanup(); unregister_pernet_subsys(&ip6_route_net_ops); + dst_entries_destroy(&ip6_dst_blackhole_ops); kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 367a6cc..d6bfaec 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -963,6 +963,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) } t = netdev_priv(dev); ipip6_tunnel_unlink(sitn, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; memcpy(dev->dev_addr, &p.iph.saddr, 4); diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index d986472..4f3cec1 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -30,23 +30,26 @@ #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm6_tunnel *tunnel6_handlers __read_mostly; -static struct xfrm6_tunnel *tunnel46_handlers __read_mostly; +static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; +static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) { - struct xfrm6_tunnel **pprev; + struct xfrm6_tunnel __rcu **pprev; + struct xfrm6_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel6_mutex); for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - *pprev; pprev = &(*pprev)->next) { - if ((*pprev)->priority > priority) + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel6_mutex))) != NULL; + pprev = &t->next) { + if (t->priority > priority) break; - if ((*pprev)->priority == priority) + if (t->priority == priority) goto err; } @@ -65,14 +68,17 @@ EXPORT_SYMBOL(xfrm6_tunnel_register); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) { - struct xfrm6_tunnel **pprev; + struct xfrm6_tunnel __rcu **pprev; + struct xfrm6_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel6_mutex); for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - *pprev; pprev = &(*pprev)->next) { - if (*pprev == handler) { + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel6_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { *pprev = handler->next; ret = 0; break; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c84dad4..91def93 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -527,7 +527,7 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) } } - if (sk->sk_filter) { + if (rcu_dereference_raw(sk->sk_filter)) { if (udp_lib_checksum_complete(skb)) goto drop; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 1712af1..c64ce0a 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -111,6 +111,10 @@ struct l2tp_net { spinlock_t l2tp_session_hlist_lock; }; +static void l2tp_session_set_header_len(struct l2tp_session *session, int version); +static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); +static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); + static inline struct l2tp_net *l2tp_pernet(struct net *net) { BUG_ON(!net); @@ -118,6 +122,34 @@ static inline struct l2tp_net *l2tp_pernet(struct net *net) return net_generic(net, l2tp_net_id); } + +/* Tunnel reference counts. Incremented per session that is added to + * the tunnel. + */ +static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel) +{ + atomic_inc(&tunnel->ref_count); +} + +static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel) +{ + if (atomic_dec_and_test(&tunnel->ref_count)) + l2tp_tunnel_free(tunnel); +} +#ifdef L2TP_REFCNT_DEBUG +#define l2tp_tunnel_inc_refcount(_t) do { \ + printk(KERN_DEBUG "l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \ + l2tp_tunnel_inc_refcount_1(_t); \ + } while (0) +#define l2tp_tunnel_dec_refcount(_t) do { \ + printk(KERN_DEBUG "l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \ + l2tp_tunnel_dec_refcount_1(_t); \ + } while (0) +#else +#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t) +#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t) +#endif + /* Session hash global list for L2TPv3. * The session_id SHOULD be random according to RFC3931, but several * L2TP implementations use incrementing session_ids. So we do a real @@ -699,8 +731,8 @@ EXPORT_SYMBOL(l2tp_recv_common); * Returns 1 if the packet was not a good data packet and could not be * forwarded. All such packets are passed up to userspace to deal with. */ -int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, - int (*payload_hook)(struct sk_buff *skb)) +static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, + int (*payload_hook)(struct sk_buff *skb)) { struct l2tp_session *session = NULL; unsigned char *ptr, *optr; @@ -812,7 +844,6 @@ error: return 1; } -EXPORT_SYMBOL_GPL(l2tp_udp_recv_core); /* UDP encapsulation receive handler. See net/ipv4/udp.c. * Return codes: @@ -922,7 +953,8 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) return bufp - optr; } -int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, size_t data_len) +static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, + size_t data_len) { struct l2tp_tunnel *tunnel = session->tunnel; unsigned int len = skb->len; @@ -970,7 +1002,6 @@ int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, size_t dat return 0; } -EXPORT_SYMBOL_GPL(l2tp_xmit_core); /* Automatically called when the skb is freed. */ @@ -1089,7 +1120,7 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb); * The tunnel context is deleted only when all session sockets have been * closed. */ -void l2tp_tunnel_destruct(struct sock *sk) +static void l2tp_tunnel_destruct(struct sock *sk) { struct l2tp_tunnel *tunnel; @@ -1128,11 +1159,10 @@ void l2tp_tunnel_destruct(struct sock *sk) end: return; } -EXPORT_SYMBOL(l2tp_tunnel_destruct); /* When the tunnel is closed, all the attached sessions need to go too. */ -void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) +static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) { int hash; struct hlist_node *walk; @@ -1193,12 +1223,11 @@ again: } write_unlock_bh(&tunnel->hlist_lock); } -EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall); /* Really kill the tunnel. * Come here only when all sessions have been cleared from the tunnel. */ -void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) +static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) { struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); @@ -1217,7 +1246,6 @@ void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) atomic_dec(&l2tp_tunnel_count); kfree(tunnel); } -EXPORT_SYMBOL_GPL(l2tp_tunnel_free); /* Create a socket for the tunnel, if one isn't set up by * userspace. This is used for static tunnels where there is no @@ -1512,7 +1540,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_delete); /* We come here whenever a session's send_seq, cookie_len or * l2specific_len parameters are set. */ -void l2tp_session_set_header_len(struct l2tp_session *session, int version) +static void l2tp_session_set_header_len(struct l2tp_session *session, int version) { if (version == L2TP_HDR_VER_2) { session->hdr_len = 6; @@ -1525,7 +1553,6 @@ void l2tp_session_set_header_len(struct l2tp_session *session, int version) } } -EXPORT_SYMBOL_GPL(l2tp_session_set_header_len); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) { diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index f0f318e..a16a48e 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -231,48 +231,15 @@ extern int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_i extern int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); extern struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); extern int l2tp_session_delete(struct l2tp_session *session); -extern void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); extern void l2tp_session_free(struct l2tp_session *session); extern void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb)); -extern int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, int (*payload_hook)(struct sk_buff *skb)); extern int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); -extern int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, size_t data_len); extern int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len); -extern void l2tp_tunnel_destruct(struct sock *sk); -extern void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); -extern void l2tp_session_set_header_len(struct l2tp_session *session, int version); extern int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops); extern void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); -/* Tunnel reference counts. Incremented per session that is added to - * the tunnel. - */ -static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel) -{ - atomic_inc(&tunnel->ref_count); -} - -static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel) -{ - if (atomic_dec_and_test(&tunnel->ref_count)) - l2tp_tunnel_free(tunnel); -} -#ifdef L2TP_REFCNT_DEBUG -#define l2tp_tunnel_inc_refcount(_t) do { \ - printk(KERN_DEBUG "l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \ - l2tp_tunnel_inc_refcount_1(_t); \ - } while (0) -#define l2tp_tunnel_dec_refcount(_t) do { \ - printk(KERN_DEBUG "l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \ - l2tp_tunnel_dec_refcount_1(_t); \ - } while (0) -#else -#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t) -#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t) -#endif - /* Session reference counts. Incremented when code obtains a reference * to a session. */ diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 104ec3b..b8dbae8 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -249,7 +249,7 @@ static int l2tp_dfs_seq_open(struct inode *inode, struct file *file) struct seq_file *seq; int rc = -ENOMEM; - pd = kzalloc(GFP_KERNEL, sizeof(*pd)); + pd = kzalloc(sizeof(*pd), GFP_KERNEL); if (pd == NULL) goto out; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 1c770c0..0bf6a59 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -576,7 +576,7 @@ out: return copied; } -struct proto l2tp_ip_prot = { +static struct proto l2tp_ip_prot = { .name = "L2TP/IP", .owner = THIS_MODULE, .init = l2tp_ip_open, diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 4328825..1534f2b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -525,6 +525,7 @@ config NETFILTER_XT_TARGET_TPROXY depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help This option adds a `TPROXY' target, which is somewhat similar to REDIRECT. It can only be used in the mangle table and is useful @@ -927,6 +928,7 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help This option adds a `socket' match, which can be used to match packets for which a TCP or UDP socket lookup finds a valid socket. diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1eacf8d..27a5ea6 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1312,7 +1312,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls) if (!hash) { *vmalloced = 1; printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); - hash = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); } if (hash && nulls) diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index ed6d929..dc7bb74 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -292,6 +292,12 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) for (i = 0; i < MAX_NF_CT_PROTO; i++) proto_array[i] = &nf_conntrack_l4proto_generic; + + /* Before making proto_array visible to lockless readers, + * we must make sure its content is committed to memory. + */ + smp_wmb(); + nf_ct_protos[l4proto->l3proto] = proto_array; } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != &nf_conntrack_l4proto_generic) { diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 19c482c..640678f 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -21,7 +21,9 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#define XT_TPROXY_HAVE_IPV6 1 #include <net/if_inet6.h> #include <net/addrconf.h> #include <linux/netfilter_ipv6/ip6_tables.h> @@ -172,7 +174,7 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef XT_TPROXY_HAVE_IPV6 static inline const struct in6_addr * tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, @@ -372,7 +374,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef XT_TPROXY_HAVE_IPV6 { .name = "TPROXY", .family = NFPROTO_IPV6, @@ -391,7 +393,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { static int __init tproxy_tg_init(void) { nf_defrag_ipv4_enable(); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef XT_TPROXY_HAVE_IPV6 nf_defrag_ipv6_enable(); #endif diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 2dbd4c8..00d6ae83 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -14,7 +14,6 @@ #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv6/ip6_tables.h> #include <net/tcp.h> #include <net/udp.h> #include <net/icmp.h> @@ -22,7 +21,12 @@ #include <net/inet_sock.h> #include <net/netfilter/nf_tproxy_core.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#define XT_SOCKET_HAVE_IPV6 1 +#include <linux/netfilter_ipv6/ip6_tables.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#endif #include <linux/netfilter/xt_socket.h> @@ -186,12 +190,12 @@ socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par) return socket_match(skb, par, par->matchinfo); } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef XT_SOCKET_HAVE_IPV6 static int extract_icmp6_fields(const struct sk_buff *skb, unsigned int outside_hdrlen, - u8 *protocol, + int *protocol, struct in6_addr **raddr, struct in6_addr **laddr, __be16 *rport, @@ -248,8 +252,7 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) struct sock *sk; struct in6_addr *daddr, *saddr; __be16 dport, sport; - int thoff; - u8 tproto; + int thoff, tproto; const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); @@ -301,7 +304,7 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) sk = NULL; } - pr_debug("proto %hhu %pI6:%hu -> %pI6:%hu " + pr_debug("proto %hhd %pI6:%hu -> %pI6:%hu " "(orig %pI6:%hu) sock %p\n", tproto, saddr, ntohs(sport), daddr, ntohs(dport), @@ -331,7 +334,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef XT_SOCKET_HAVE_IPV6 { .name = "socket", .revision = 1, @@ -348,7 +351,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { static int __init socket_mt_init(void) { nf_defrag_ipv4_enable(); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef XT_SOCKET_HAVE_IPV6 nf_defrag_ipv6_enable(); #endif diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index cd96ed3..478181d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -83,9 +83,9 @@ struct netlink_sock { struct module *module; }; -struct listeners_rcu_head { - struct rcu_head rcu_head; - void *ptr; +struct listeners { + struct rcu_head rcu; + unsigned long masks[0]; }; #define NETLINK_KERNEL_SOCKET 0x1 @@ -119,7 +119,7 @@ struct nl_pid_hash { struct netlink_table { struct nl_pid_hash hash; struct hlist_head mc_list; - unsigned long *listeners; + struct listeners __rcu *listeners; unsigned int nl_nonroot; unsigned int groups; struct mutex *cb_mutex; @@ -338,7 +338,7 @@ netlink_update_listeners(struct sock *sk) if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } - tbl->listeners[i] = mask; + tbl->listeners->masks[i] = mask; } /* this function is only called with the netlink table "grabbed", which * makes sure updates are visible before bind or setsockopt return. */ @@ -936,7 +936,7 @@ EXPORT_SYMBOL(netlink_unicast); int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; - unsigned long *listeners; + struct listeners *listeners; BUG_ON(!netlink_is_kernel(sk)); @@ -944,7 +944,7 @@ int netlink_has_listeners(struct sock *sk, unsigned int group) listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); if (group - 1 < nl_table[sk->sk_protocol].groups) - res = test_bit(group - 1, listeners); + res = test_bit(group - 1, listeners->masks); rcu_read_unlock(); @@ -1498,7 +1498,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, struct socket *sock; struct sock *sk; struct netlink_sock *nlk; - unsigned long *listeners = NULL; + struct listeners *listeners = NULL; BUG_ON(!nl_table); @@ -1523,8 +1523,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, if (groups < 32) groups = 32; - listeners = kzalloc(NLGRPSZ(groups) + sizeof(struct listeners_rcu_head), - GFP_KERNEL); + listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; @@ -1541,7 +1540,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, netlink_table_grab(); if (!nl_table[unit].registered) { nl_table[unit].groups = groups; - nl_table[unit].listeners = listeners; + rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; nl_table[unit].registered = 1; @@ -1572,43 +1571,28 @@ netlink_kernel_release(struct sock *sk) EXPORT_SYMBOL(netlink_kernel_release); -static void netlink_free_old_listeners(struct rcu_head *rcu_head) +static void listeners_free_rcu(struct rcu_head *head) { - struct listeners_rcu_head *lrh; - - lrh = container_of(rcu_head, struct listeners_rcu_head, rcu_head); - kfree(lrh->ptr); + kfree(container_of(head, struct listeners, rcu)); } int __netlink_change_ngroups(struct sock *sk, unsigned int groups) { - unsigned long *listeners, *old = NULL; - struct listeners_rcu_head *old_rcu_head; + struct listeners *new, *old; struct netlink_table *tbl = &nl_table[sk->sk_protocol]; if (groups < 32) groups = 32; if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { - listeners = kzalloc(NLGRPSZ(groups) + - sizeof(struct listeners_rcu_head), - GFP_ATOMIC); - if (!listeners) + new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); + if (!new) return -ENOMEM; - old = tbl->listeners; - memcpy(listeners, old, NLGRPSZ(tbl->groups)); - rcu_assign_pointer(tbl->listeners, listeners); - /* - * Free the old memory after an RCU grace period so we - * don't leak it. We use call_rcu() here in order to be - * able to call this function from atomic contexts. The - * allocation of this memory will have reserved enough - * space for struct listeners_rcu_head at the end. - */ - old_rcu_head = (void *)(tbl->listeners + - NLGRPLONGS(tbl->groups)); - old_rcu_head->ptr = old; - call_rcu(&old_rcu_head->rcu_head, netlink_free_old_listeners); + old = rcu_dereference_raw(tbl->listeners); + memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); + rcu_assign_pointer(tbl->listeners, new); + + call_rcu(&old->rcu, listeners_free_rcu); } tbl->groups = groups; @@ -2104,18 +2088,17 @@ static void __net_exit netlink_net_exit(struct net *net) static void __init netlink_add_usersock_entry(void) { - unsigned long *listeners; + struct listeners *listeners; int groups = 32; - listeners = kzalloc(NLGRPSZ(groups) + sizeof(struct listeners_rcu_head), - GFP_KERNEL); + listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) - panic("netlink_add_usersock_entry: Cannot allocate listneres\n"); + panic("netlink_add_usersock_entry: Cannot allocate listeners\n"); netlink_table_grab(); nl_table[NETLINK_USERSOCK].groups = groups; - nl_table[NETLINK_USERSOCK].listeners = listeners; + rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3616f27..0856a13 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1719,7 +1719,7 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex); if (dev) - strlcpy(uaddr->sa_data, dev->name, 15); + strncpy(uaddr->sa_data, dev->name, 14); else memset(uaddr->sa_data, 0, 14); rcu_read_unlock(); @@ -1742,6 +1742,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, sll->sll_family = AF_PACKET; sll->sll_ifindex = po->ifindex; sll->sll_protocol = po->num; + sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); if (dev) { diff --git a/net/rds/loop.c b/net/rds/loop.c index c390156..aeec1d4 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -134,8 +134,12 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) static void rds_loop_conn_free(void *arg) { struct rds_loop_connection *lc = arg; + unsigned long flags; + rdsdebug("lc %p\n", lc); + spin_lock_irqsave(&loop_conns_lock, flags); list_del(&lc->loop_node); + spin_unlock_irqrestore(&loop_conns_lock, flags); kfree(lc); } diff --git a/net/rds/message.c b/net/rds/message.c index a84545d..1fd3d29 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -224,6 +224,9 @@ struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); WARN_ON(!nents); + if (rm->m_used_sgs + nents > rm->m_total_sgs) + return NULL; + sg_ret = &sg_first[rm->m_used_sgs]; sg_init_table(sg_ret, nents); rm->m_used_sgs += nents; @@ -246,6 +249,10 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); rm->data.op_nents = ceil(total_len, PAGE_SIZE); rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); + if (!rm->data.op_sg) { + rds_message_put(rm); + return ERR_PTR(-ENOMEM); + } for (i = 0; i < rm->data.op_nents; ++i) { sg_set_page(&rm->data.op_sg[i], diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 1a41deb..8920f2a 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -479,13 +479,38 @@ void rds_atomic_free_op(struct rm_atomic_op *ao) /* - * Count the number of pages needed to describe an incoming iovec. + * Count the number of pages needed to describe an incoming iovec array. */ -static int rds_rdma_pages(struct rds_rdma_args *args) +static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) +{ + int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + /* figure out the number of pages in the vector */ + for (i = 0; i < nr_iovecs; i++) { + nr_pages = rds_pages_in_vec(&iov[i]); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. + */ + if (tot_pages < 0) + return -EINVAL; + } + + return tot_pages; +} + +int rds_rdma_extra_size(struct rds_rdma_args *args) { struct rds_iovec vec; struct rds_iovec __user *local_vec; - unsigned int tot_pages = 0; + int tot_pages = 0; unsigned int nr_pages; unsigned int i; @@ -502,14 +527,16 @@ static int rds_rdma_pages(struct rds_rdma_args *args) return -EINVAL; tot_pages += nr_pages; - } - return tot_pages; -} + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. + */ + if (tot_pages < 0) + return -EINVAL; + } -int rds_rdma_extra_size(struct rds_rdma_args *args) -{ - return rds_rdma_pages(args) * sizeof(struct scatterlist); + return tot_pages * sizeof(struct scatterlist); } /* @@ -520,13 +547,12 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { struct rds_rdma_args *args; - struct rds_iovec vec; struct rm_rdma_op *op = &rm->rdma; int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; - struct rds_iovec __user *local_vec; - unsigned int nr; + struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack; + int iov_size; unsigned int i, j; int ret = 0; @@ -546,9 +572,26 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, goto out; } - nr_pages = rds_rdma_pages(args); - if (nr_pages < 0) + /* Check whether to allocate the iovec area */ + iov_size = args->nr_local * sizeof(struct rds_iovec); + if (args->nr_local > UIO_FASTIOV) { + iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL); + if (!iovs) { + ret = -ENOMEM; + goto out; + } + } + + if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) { + ret = -EFAULT; + goto out; + } + + nr_pages = rds_rdma_pages(iovs, args->nr_local); + if (nr_pages < 0) { + ret = -EINVAL; goto out; + } pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { @@ -564,6 +607,10 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, op->op_recverr = rs->rs_recverr; WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages); + if (!op->op_sg) { + ret = -ENOMEM; + goto out; + } if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because @@ -597,50 +644,40 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, (unsigned long long)args->remote_vec.addr, op->op_rkey); - local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; - for (i = 0; i < args->nr_local; i++) { - if (copy_from_user(&vec, &local_vec[i], - sizeof(struct rds_iovec))) { - ret = -EFAULT; - goto out; - } - - nr = rds_pages_in_vec(&vec); - if (nr == 0) { - ret = -EINVAL; - goto out; - } + struct rds_iovec *iov = &iovs[i]; + /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ + unsigned int nr = rds_pages_in_vec(iov); - rs->rs_user_addr = vec.addr; - rs->rs_user_bytes = vec.bytes; + rs->rs_user_addr = iov->addr; + rs->rs_user_bytes = iov->bytes; /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. */ - ret = rds_pin_pages(vec.addr, nr, pages, !op->op_write); + ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if (ret < 0) goto out; - rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n", - nr_bytes, nr, vec.bytes, vec.addr); + rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", + nr_bytes, nr, iov->bytes, iov->addr); - nr_bytes += vec.bytes; + nr_bytes += iov->bytes; for (j = 0; j < nr; j++) { - unsigned int offset = vec.addr & ~PAGE_MASK; + unsigned int offset = iov->addr & ~PAGE_MASK; struct scatterlist *sg; sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], - min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), + min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); - rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n", - sg->offset, sg->length, vec.addr, vec.bytes); + rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", + sg->offset, sg->length, iov->addr, iov->bytes); - vec.addr += sg->length; - vec.bytes -= sg->length; + iov->addr += sg->length; + iov->bytes -= sg->length; } op->op_nents += nr; @@ -655,13 +692,14 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, } op->op_bytes = nr_bytes; - ret = 0; out: + if (iovs != iovstack) + sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size); kfree(pages); if (ret) rds_rdma_free_op(op); - - rds_stats_inc(s_send_rdma); + else + rds_stats_inc(s_send_rdma); return ret; } @@ -773,6 +811,10 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, rm->atomic.op_active = 1; rm->atomic.op_recverr = rs->rs_recverr; rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); + if (!rm->atomic.op_sg) { + ret = -ENOMEM; + goto err; + } /* verify 8 byte-aligned */ if (args->local_addr & 0x7) { diff --git a/net/rds/send.c b/net/rds/send.c index 0bc9db1..35b9c2e 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -973,6 +973,10 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, /* Attach data to the rm */ if (payload_len) { rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + if (!rm->data.op_sg) { + ret = -ENOMEM; + goto out; + } ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); if (ret) goto out; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 08a8c6c..8e0a320 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -221,7 +221,13 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) static void rds_tcp_conn_free(void *arg) { struct rds_tcp_connection *tc = arg; + unsigned long flags; rdsdebug("freeing tc %p\n", tc); + + spin_lock_irqsave(&rds_tcp_conn_lock, flags); + list_del(&tc->t_tcp_node); + spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); + kmem_cache_free(rds_tcp_conn_slab, tc); } diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index efd4f95..f23d915 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -268,6 +268,10 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh, goto nla_put_failure; nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts, &basic_ext_map) < 0) + goto nla_put_failure; + return skb->len; nla_put_failure: diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 37dff78..d49c40f 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -34,8 +34,6 @@ struct cgroup_subsys net_cls_subsys = { .populate = cgrp_populate, #ifdef CONFIG_NET_CLS_CGROUP .subsys_id = net_cls_subsys_id, -#else -#define net_cls_subsys_id net_cls_subsys.subsys_id #endif .module = THIS_MODULE, }; diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 7632532..ea8f566 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -103,7 +103,8 @@ retry: static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m) { - textsearch_destroy(EM_TEXT_PRIV(m)->config); + if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) + textsearch_destroy(EM_TEXT_PRIV(m)->config); } static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 1ef29c7..e58f947 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -92,7 +92,7 @@ static struct sctp_af *sctp_af_v6_specific; struct kmem_cache *sctp_chunk_cachep __read_mostly; struct kmem_cache *sctp_bucket_cachep __read_mostly; -int sysctl_sctp_mem[3]; +long sysctl_sctp_mem[3]; int sysctl_sctp_rmem[3]; int sysctl_sctp_wmem[3]; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e34ca9c..6bd5543 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -111,12 +111,12 @@ static void sctp_sock_migrate(struct sock *, struct sock *, static char *sctp_hmac_alg = SCTP_COOKIE_HMAC_ALG; extern struct kmem_cache *sctp_bucket_cachep; -extern int sysctl_sctp_mem[3]; +extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; extern int sysctl_sctp_wmem[3]; static int sctp_memory_pressure; -static atomic_t sctp_memory_allocated; +static atomic_long_t sctp_memory_allocated; struct percpu_counter sctp_sockets_allocated; static void sctp_enter_memory_pressure(struct sock *sk) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 832590b..50cb57f 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -54,7 +54,7 @@ static int sack_timer_max = 500; static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */ static int rwnd_scale_max = 16; -extern int sysctl_sctp_mem[3]; +extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; extern int sysctl_sctp_wmem[3]; @@ -203,7 +203,7 @@ static ctl_table sctp_table[] = { .data = &sysctl_sctp_mem, .maxlen = sizeof(sysctl_sctp_mem), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax }, { .procname = "sctp_rmem", diff --git a/net/socket.c b/net/socket.c index abf3e25..2808b4d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1652,6 +1652,8 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, struct iovec iov; int fput_needed; + if (len > INT_MAX) + len = INT_MAX; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; @@ -1709,6 +1711,8 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, int err, err2; int fput_needed; + if (size > INT_MAX) + size = INT_MAX; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 33217fc..e9f0d50 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -396,6 +396,7 @@ static int get_name(struct socket *sock, struct sockaddr *uaddr, struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; struct tipc_sock *tsock = tipc_sk(sock->sk); + memset(addr, 0, sizeof(*addr)); if (peer) { if ((sock->state != SS_CONNECTED) && ((peer != 2) || (sock->state != SS_DISCONNECTING))) diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c index 771bab0..3a8c4c4 100644 --- a/net/x25/x25_facilities.c +++ b/net/x25/x25_facilities.c @@ -134,15 +134,15 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities, case X25_FAC_CLASS_D: switch (*p) { case X25_FAC_CALLING_AE: - if (p[1] > X25_MAX_DTE_FACIL_LEN) - break; + if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1) + return 0; dte_facs->calling_len = p[2]; memcpy(dte_facs->calling_ae, &p[3], p[1] - 1); *vc_fac_mask |= X25_MASK_CALLING_AE; break; case X25_FAC_CALLED_AE: - if (p[1] > X25_MAX_DTE_FACIL_LEN) - break; + if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1) + return 0; dte_facs->called_len = p[2]; memcpy(dte_facs->called_ae, &p[3], p[1] - 1); *vc_fac_mask |= X25_MASK_CALLED_AE; diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 6317896..f729f02 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -119,6 +119,8 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp &x25->vc_facil_mask); if (len > 0) skb_pull(skb, len); + else + return -1; /* * Copy any Call User Data. */ |