From d83769a580f1132ac26439f50068a29b02be535e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Apr 2015 18:32:24 -0700 Subject: tcp: fix possible deadlock in tcp_send_fin() Using sk_stream_alloc_skb() in tcp_send_fin() is dangerous in case a huge process is killed by OOM, and tcp_mem[2] is hit. To be able to free memory we need to make progress, so this patch allows FIN packets to not care about tcp_mem[2], if skb allocation succeeded. In a follow-up patch, we might abort tcp_send_fin() infinite loop in case TIF_MEMDIE is set on this thread, as memory allocator did its best getting extra memory already. This patch reverts d22e15371811 ("tcp: fix tcp fin memory accounting") Fixes: d22e15371811 ("tcp: fix tcp fin memory accounting") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8c8d7e0..2ade67b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2812,6 +2812,21 @@ begin_fwd: } } +/* We allow to exceed memory limits for FIN packets to expedite + * connection tear down and (memory) recovery. + * Otherwise tcp_send_fin() could loop forever. + */ +static void sk_forced_wmem_schedule(struct sock *sk, int size) +{ + int amt, status; + + if (size <= sk->sk_forward_alloc) + return; + amt = sk_mem_pages(size); + sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + sk_memory_allocated_add(sk, amt, &status); +} + /* Send a fin. The caller locks the socket for us. This cannot be * allowed to fail queueing a FIN frame under any circumstances. */ @@ -2834,11 +2849,14 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + skb = alloc_skb_fclone(MAX_TCP_HEADER, + sk->sk_allocation); if (skb) break; yield(); } + skb_reserve(skb, MAX_TCP_HEADER); + sk_forced_wmem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); -- cgit v0.10.2 From e2307ed6cbe71c74e291681aaa7e92ab98bc3177 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 22 Apr 2015 09:41:45 +0200 Subject: rhashtable: Schedule async resize when sync realloc fails When rhashtable_insert_rehash() fails with ENOMEM, this indicates that we can't allocate the necessary memory in the current context but the limits as set by the user would still allow to grow. Thus attempt an async resize in the background where we can allocate using GFP_KERNEL which is more likely to succeed. The insertion itself will still fail to indicate pressure. This fixes a bug where the table would never continue growing once the utilization is above 100%. Fixes: ccd57b1bd324 ("rhashtable: Add immediate rehash during insertion") Signed-off-by: Thomas Graf Acked-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 4898442..f648cfd 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -410,8 +410,13 @@ int rhashtable_insert_rehash(struct rhashtable *ht) return -EBUSY; new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); - if (new_tbl == NULL) + if (new_tbl == NULL) { + /* Schedule async resize/rehash to try allocation + * non-atomic context. + */ + schedule_work(&ht->run_work); return -ENOMEM; + } err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { -- cgit v0.10.2 From a87b9ebf1709687ff213091d0fdb4254b1564803 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 22 Apr 2015 09:41:46 +0200 Subject: rhashtable: Do not schedule more than one rehash if we can't grow further The current code currently only stops inserting rehashes into the chain when no resizes are currently scheduled. As long as resizes are scheduled and while inserting above the utilization watermark, more and more rehashes will be scheduled. This lead to a perfect DoS storm with thousands of rehashes scheduled which lead to thousands of spinlocks to be taken sequentially. Instead, only allow either a series of resizes or a single rehash. Drop any further rehashes and return -EBUSY. Fixes: ccd57b1bd324 ("rhashtable: Add immediate rehash during insertion") Signed-off-by: Thomas Graf Acked-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/lib/rhashtable.c b/lib/rhashtable.c index f648cfd..b28df40 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -405,8 +405,8 @@ int rhashtable_insert_rehash(struct rhashtable *ht) if (rht_grow_above_75(ht, tbl)) size *= 2; - /* More than two rehashes (not resizes) detected. */ - else if (WARN_ON(old_tbl != tbl && old_tbl->size == size)) + /* Do not schedule more than one rehash */ + else if (old_tbl != tbl) return -EBUSY; new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); -- cgit v0.10.2 From 909d9faae2a447110aa061070145297fffe129cb Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 22 Apr 2015 12:47:32 +0300 Subject: bnx2x: Prevent inner-reload while VFs exist On some feature changes, driver employes an inner-reload flow where it resets the function and re-configures it with the new required set of parameters. Such a flow proves fatal to any VF since those were not intended to be used while HW is being reset underneath, causing them [at best] to lose all connectivity. This changes driver behavior to fail all configuration changes [e.g., mtu change] requested of the driver in case VFs are active. Signed-off-by: Yuval Mintz Signed-off-by: Ariel Elior Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 2f63467..6f7dc81 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -4809,6 +4809,23 @@ netdev_features_t bnx2x_fix_features(struct net_device *dev, { struct bnx2x *bp = netdev_priv(dev); + if (pci_num_vf(bp->pdev)) { + netdev_features_t changed = dev->features ^ features; + + /* Revert the requested changes in features if they + * would require internal reload of PF in bnx2x_set_features(). + */ + if (!(features & NETIF_F_RXCSUM) && !bp->disable_tpa) { + features &= ~NETIF_F_RXCSUM; + features |= dev->features & NETIF_F_RXCSUM; + } + + if (changed & NETIF_F_LOOPBACK) { + features &= ~NETIF_F_LOOPBACK; + features |= dev->features & NETIF_F_LOOPBACK; + } + } + /* TPA requires Rx CSUM offloading */ if (!(features & NETIF_F_RXCSUM)) { features &= ~NETIF_F_LRO; @@ -4839,15 +4856,18 @@ int bnx2x_set_features(struct net_device *dev, netdev_features_t features) else flags &= ~GRO_ENABLE_FLAG; - if (features & NETIF_F_LOOPBACK) { - if (bp->link_params.loopback_mode != LOOPBACK_BMAC) { - bp->link_params.loopback_mode = LOOPBACK_BMAC; - bnx2x_reload = true; - } - } else { - if (bp->link_params.loopback_mode != LOOPBACK_NONE) { - bp->link_params.loopback_mode = LOOPBACK_NONE; - bnx2x_reload = true; + /* VFs or non SRIOV PFs should be able to change loopback feature */ + if (!pci_num_vf(bp->pdev)) { + if (features & NETIF_F_LOOPBACK) { + if (bp->link_params.loopback_mode != LOOPBACK_BMAC) { + bp->link_params.loopback_mode = LOOPBACK_BMAC; + bnx2x_reload = true; + } + } else { + if (bp->link_params.loopback_mode != LOOPBACK_NONE) { + bp->link_params.loopback_mode = LOOPBACK_NONE; + bnx2x_reload = true; + } } } @@ -4931,6 +4951,11 @@ int bnx2x_resume(struct pci_dev *pdev) } bp = netdev_priv(dev); + if (pci_num_vf(bp->pdev)) { + DP(BNX2X_MSG_IOV, "VFs are enabled, can not change MTU\n"); + return -EPERM; + } + if (bp->recovery_state != BNX2X_RECOVERY_DONE) { BNX2X_ERR("Handling parity error recovery. Try again later\n"); return -EAGAIN; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c index e3d853c..48ed005 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c @@ -1843,6 +1843,12 @@ static int bnx2x_set_ringparam(struct net_device *dev, "set ring params command parameters: rx_pending = %d, tx_pending = %d\n", ering->rx_pending, ering->tx_pending); + if (pci_num_vf(bp->pdev)) { + DP(BNX2X_MSG_IOV, + "VFs are enabled, can not change ring parameters\n"); + return -EPERM; + } + if (bp->recovery_state != BNX2X_RECOVERY_DONE) { DP(BNX2X_MSG_ETHTOOL, "Handling parity error recovery. Try again later\n"); @@ -2899,6 +2905,12 @@ static void bnx2x_self_test(struct net_device *dev, u8 is_serdes, link_up; int rc, cnt = 0; + if (pci_num_vf(bp->pdev)) { + DP(BNX2X_MSG_IOV, + "VFs are enabled, can not perform self test\n"); + return; + } + if (bp->recovery_state != BNX2X_RECOVERY_DONE) { netdev_err(bp->dev, "Handling parity error recovery. Try again later\n"); @@ -3468,6 +3480,11 @@ static int bnx2x_set_channels(struct net_device *dev, channels->rx_count, channels->tx_count, channels->other_count, channels->combined_count); + if (pci_num_vf(bp->pdev)) { + DP(BNX2X_MSG_IOV, "VFs are enabled, can not set channels\n"); + return -EPERM; + } + /* We don't support separate rx / tx channels. * We don't allow setting 'other' channels. */ -- cgit v0.10.2 From 03c57747a7020a28a200e7e920fb48ecdc9b0fb8 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Wed, 22 Apr 2015 11:14:37 +0100 Subject: mpls: Per-device MPLS state Add per-device MPLS state to supported interfaces. Use the presence of this state in mpls_route_add to determine that this is a supported interface. Use the presence of mpls_dev to drop packets that arrived on an unsupported interface - previously they were allowed through. Cc: "Eric W. Biederman" Signed-off-by: Robert Shearman Reviewed-by: "Eric W. Biederman" Signed-off-by: David S. Miller diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bcbde79..dae106a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -60,6 +60,7 @@ struct phy_device; struct wireless_dev; /* 802.15.4 specific */ struct wpan_dev; +struct mpls_dev; void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); @@ -1627,6 +1628,9 @@ struct net_device { void *ax25_ptr; struct wireless_dev *ieee80211_ptr; struct wpan_dev *ieee802154_ptr; +#if IS_ENABLED(CONFIG_MPLS_ROUTING) + struct mpls_dev __rcu *mpls_ptr; +#endif /* * Cache lines mostly used on receive path (including eth_type_trans()) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index db8a2ea..ad45017 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -53,6 +53,11 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) return rt; } +static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) +{ + return rcu_dereference_rtnl(dev->mpls_ptr); +} + static bool mpls_output_possible(const struct net_device *dev) { return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); @@ -136,6 +141,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, struct mpls_route *rt; struct mpls_entry_decoded dec; struct net_device *out_dev; + struct mpls_dev *mdev; unsigned int hh_len; unsigned int new_header_size; unsigned int mtu; @@ -143,6 +149,10 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Careful this entire function runs inside of an rcu critical section */ + mdev = mpls_dev_get(dev); + if (!mdev) + goto drop; + if (skb->pkt_type != PACKET_HOST) goto drop; @@ -352,9 +362,9 @@ static int mpls_route_add(struct mpls_route_config *cfg) if (!dev) goto errout; - /* For now just support ethernet devices */ + /* Ensure this is a supported device */ err = -EINVAL; - if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK)) + if (!mpls_dev_get(dev)) goto errout; err = -EINVAL; @@ -428,10 +438,27 @@ errout: return err; } +static struct mpls_dev *mpls_add_dev(struct net_device *dev) +{ + struct mpls_dev *mdev; + int err = -ENOMEM; + + ASSERT_RTNL(); + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) + return ERR_PTR(err); + + rcu_assign_pointer(dev->mpls_ptr, mdev); + + return mdev; +} + static void mpls_ifdown(struct net_device *dev) { struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); + struct mpls_dev *mdev; unsigned index; platform_label = rtnl_dereference(net->mpls.platform_label); @@ -443,14 +470,33 @@ static void mpls_ifdown(struct net_device *dev) continue; rt->rt_dev = NULL; } + + mdev = mpls_dev_get(dev); + if (!mdev) + return; + + RCU_INIT_POINTER(dev->mpls_ptr, NULL); + + kfree(mdev); } static int mpls_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct mpls_dev *mdev; switch(event) { + case NETDEV_REGISTER: + /* For now just support ethernet devices */ + if ((dev->type == ARPHRD_ETHER) || + (dev->type == ARPHRD_LOOPBACK)) { + mdev = mpls_add_dev(dev); + if (IS_ERR(mdev)) + return notifier_from_errno(PTR_ERR(mdev)); + } + break; + case NETDEV_UNREGISTER: mpls_ifdown(dev); break; diff --git a/net/mpls/internal.h b/net/mpls/internal.h index fb6de92..8090cb3 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -22,6 +22,9 @@ struct mpls_entry_decoded { u8 bos; }; +struct mpls_dev { +}; + struct sk_buff; static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) -- cgit v0.10.2 From 37bde79979c3862c79294c62ddcef7efc477e4bf Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Wed, 22 Apr 2015 11:14:38 +0100 Subject: mpls: Per-device enabling of packet input An MPLS network is a single trust domain where the edges must be in control of what labels make their way into the core. The simplest way of ensuring this is for the edge device to always impose the labels, and not allow forward labeled traffic from untrusted neighbours. This is achieved by allowing a per-device configuration of whether MPLS traffic input from that interface should be processed or not. To be secure by default, the default state is changed to MPLS being disabled on all interfaces unless explicitly enabled and no global option is provided to change the default. Whilst this differs from other protocols (e.g. IPv6), network operators are used to explicitly enabling MPLS forwarding on interfaces, and with the number of links to the MPLS core typically fairly low this doesn't present too much of a burden on operators. Cc: "Eric W. Biederman" Signed-off-by: Robert Shearman Reviewed-by: "Eric W. Biederman" Signed-off-by: David S. Miller diff --git a/Documentation/networking/mpls-sysctl.txt b/Documentation/networking/mpls-sysctl.txt index 639ddf0..9ed15f8 100644 --- a/Documentation/networking/mpls-sysctl.txt +++ b/Documentation/networking/mpls-sysctl.txt @@ -18,3 +18,12 @@ platform_labels - INTEGER Possible values: 0 - 1048575 Default: 0 + +conf//input - BOOL + Control whether packets can be input on this interface. + + If disabled, packets will be discarded without further + processing. + + 0 - disabled (default) + not 0 - enabled diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index ad45017..9fdd94c 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -150,7 +150,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Careful this entire function runs inside of an rcu critical section */ mdev = mpls_dev_get(dev); - if (!mdev) + if (!mdev || !mdev->input_enabled) goto drop; if (skb->pkt_type != PACKET_HOST) @@ -438,6 +438,60 @@ errout: return err; } +#define MPLS_PERDEV_SYSCTL_OFFSET(field) \ + (&((struct mpls_dev *)0)->field) + +static const struct ctl_table mpls_dev_table[] = { + { + .procname = "input", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), + }, + { } +}; + +static int mpls_dev_sysctl_register(struct net_device *dev, + struct mpls_dev *mdev) +{ + char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; + struct ctl_table *table; + int i; + + table = kmemdup(&mpls_dev_table, sizeof(mpls_dev_table), GFP_KERNEL); + if (!table) + goto out; + + /* Table data contains only offsets relative to the base of + * the mdev at this point, so make them absolute. + */ + for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) + table[i].data = (char *)mdev + (uintptr_t)table[i].data; + + snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); + + mdev->sysctl = register_net_sysctl(dev_net(dev), path, table); + if (!mdev->sysctl) + goto free; + + return 0; + +free: + kfree(table); +out: + return -ENOBUFS; +} + +static void mpls_dev_sysctl_unregister(struct mpls_dev *mdev) +{ + struct ctl_table *table; + + table = mdev->sysctl->ctl_table_arg; + unregister_net_sysctl_table(mdev->sysctl); + kfree(table); +} + static struct mpls_dev *mpls_add_dev(struct net_device *dev) { struct mpls_dev *mdev; @@ -449,9 +503,17 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) if (!mdev) return ERR_PTR(err); + err = mpls_dev_sysctl_register(dev, mdev); + if (err) + goto free; + rcu_assign_pointer(dev->mpls_ptr, mdev); return mdev; + +free: + kfree(mdev); + return ERR_PTR(err); } static void mpls_ifdown(struct net_device *dev) @@ -475,6 +537,8 @@ static void mpls_ifdown(struct net_device *dev) if (!mdev) return; + mpls_dev_sysctl_unregister(mdev); + RCU_INIT_POINTER(dev->mpls_ptr, NULL); kfree(mdev); @@ -958,7 +1022,7 @@ static int mpls_platform_labels(struct ctl_table *table, int write, return ret; } -static struct ctl_table mpls_table[] = { +static const struct ctl_table mpls_table[] = { { .procname = "platform_labels", .data = NULL, diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 8090cb3..693877d 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -23,6 +23,9 @@ struct mpls_entry_decoded { }; struct mpls_dev { + int input_enabled; + + struct ctl_table_header *sysctl; }; struct sk_buff; -- cgit v0.10.2 From 5a9ab0176198d91dfc153f5e6c5fdc5afa613607 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Wed, 22 Apr 2015 11:14:39 +0100 Subject: mpls: Prevent use of implicit NULL label as outgoing label The reserved implicit-NULL label isn't allowed to appear in the label stack for packets, so make it an error for the control plane to specify it as an outgoing label. Suggested-by: "Eric W. Biederman" Signed-off-by: Robert Shearman Reviewed-by: "Eric W. Biederman" Signed-off-by: David S. Miller diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 9fdd94c..954810c 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -646,6 +646,15 @@ int nla_get_labels(const struct nlattr *nla, if ((dec.bos != bos) || dec.ttl || dec.tc) return -EINVAL; + switch (dec.label) { + case LABEL_IMPLICIT_NULL: + /* RFC3032: This is a label that an LSR may + * assign and distribute, but which never + * actually appears in the encapsulation. + */ + return -EINVAL; + } + label[i] = dec.label; } *labels = nla_labels; -- cgit v0.10.2 From 26349c71b4323e62f8de0fe45f2f06c4df535b9b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Apr 2015 16:56:16 +0200 Subject: ip6_gre: use netdev_alloc_pcpu_stats() The code there just open-codes the same, so use the provided macro instead. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b5e6cc1..a38d3ac 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1246,7 +1246,6 @@ static void ip6gre_tunnel_setup(struct net_device *dev) static int ip6gre_tunnel_init(struct net_device *dev) { struct ip6_tnl *tunnel; - int i; tunnel = netdev_priv(dev); @@ -1260,16 +1259,10 @@ static int ip6gre_tunnel_init(struct net_device *dev) if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; - dev->tstats = alloc_percpu(struct pcpu_sw_netstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - for_each_possible_cpu(i) { - struct pcpu_sw_netstats *ip6gre_tunnel_stats; - ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i); - u64_stats_init(&ip6gre_tunnel_stats->syncp); - } - return 0; } -- cgit v0.10.2 From 79930f5892e134c6da1254389577fffb8bd72c66 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2015 07:33:36 -0700 Subject: net: do not deplete pfmemalloc reserve build_skb() should look at the page pfmemalloc status. If set, this means page allocator allocated this page in the expectation it would help to free other pages. Networking stack can do that only if skb->pfmemalloc is also set. Also, we must refrain using high order pages from the pfmemalloc reserve, so __page_frag_refill() must also use __GFP_NOMEMALLOC for them. Under memory pressure, using order-0 pages is probably the best strategy. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d1967da..456ead5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -311,7 +311,11 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) memset(skb, 0, offsetof(struct sk_buff, tail)); skb->truesize = SKB_TRUESIZE(size); - skb->head_frag = frag_size != 0; + if (frag_size) { + skb->head_frag = 1; + if (virt_to_head_page(data)->pfmemalloc) + skb->pfmemalloc = 1; + } atomic_set(&skb->users, 1); skb->head = data; skb->data = data; @@ -348,7 +352,8 @@ static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, gfp_t gfp = gfp_mask; if (order) { - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC; page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); nc->frag.size = PAGE_SIZE << (page ? order : 0); } -- cgit v0.10.2 From 21d3515ce7179523a2941cc015960dd788290e33 Mon Sep 17 00:00:00 2001 From: Ben Shelton Date: Wed, 22 Apr 2015 17:28:54 -0500 Subject: net/macb: Factor out one-time assignment from loop In 02c958dd3 (net/macb: add TX multiqueue support for gem), the initialization of tx_head and tx_tail in macb_init_rings() was moved inside the loop that iterates over each element in the ring. Since tx_head and tx_tail only need to be assigned once, move them back out of the loop. Signed-off-by: Ben Shelton Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c index 9f53872..665c290 100644 --- a/drivers/net/ethernet/cadence/macb.c +++ b/drivers/net/ethernet/cadence/macb.c @@ -1473,9 +1473,9 @@ static void macb_init_rings(struct macb *bp) for (i = 0; i < TX_RING_SIZE; i++) { bp->queues[0].tx_ring[i].addr = 0; bp->queues[0].tx_ring[i].ctrl = MACB_BIT(TX_USED); - bp->queues[0].tx_head = 0; - bp->queues[0].tx_tail = 0; } + bp->queues[0].tx_head = 0; + bp->queues[0].tx_tail = 0; bp->queues[0].tx_ring[TX_RING_SIZE - 1].ctrl |= MACB_BIT(TX_WRAP); bp->rx_tail = 0; -- cgit v0.10.2 From 608404290e2d9d1756db4013c4ee12fa7617dad9 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 22 Apr 2015 15:49:10 +0800 Subject: vxlan: remove the unnecessary codes The return value of vxlan_fdb_replace always is greater than or equal to 0 Signed-off-by: Li RongQing Signed-off-by: David S. Miller diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 154116a..27a5f95 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -730,12 +730,8 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, /* Only change unicasts */ if (!(is_multicast_ether_addr(f->eth_addr) || is_zero_ether_addr(f->eth_addr))) { - int rc = vxlan_fdb_replace(f, ip, port, vni, + notify |= vxlan_fdb_replace(f, ip, port, vni, ifindex); - - if (rc < 0) - return rc; - notify |= rc; } else return -EOPNOTSUPP; } -- cgit v0.10.2 From ec65aafb9e3f316ff9167289e288856a7d528773 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Apr 2015 12:06:30 +0200 Subject: netdev_alloc_pcpu_stats: use less common iterator variable With the CPU iteration variable called 'i', it's relatively easy to have variable shadowing which sparse will warn about. Avoid that by renaming the variable to __cpu which is less likely to be used in the surrounding context. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dae106a..dbad4d7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2025,10 +2025,10 @@ struct pcpu_sw_netstats { ({ \ typeof(type) __percpu *pcpu_stats = alloc_percpu(type); \ if (pcpu_stats) { \ - int i; \ - for_each_possible_cpu(i) { \ + int __cpu; \ + for_each_possible_cpu(__cpu) { \ typeof(type) *stat; \ - stat = per_cpu_ptr(pcpu_stats, i); \ + stat = per_cpu_ptr(pcpu_stats, __cpu); \ u64_stats_init(&stat->syncp); \ } \ } \ -- cgit v0.10.2 From 4fce14820c1b0a3fd399719f970e0c3ae40dd270 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 23 Apr 2015 14:43:05 +1000 Subject: ibmveth: Fix off-by-one error in ibmveth_change_mtu() AFAIK the PAPR document which defines the virtual device interface used by the ibmveth driver doesn't specify a specific maximum MTU. So, in the ibmveth driver, the maximum allowed MTU is determined by the maximum allocated buffer size of 64k (corresponding to one page in the common case) minus the per-buffer overhead IBMVETH_BUFF_OH (which has value 22 for 14 bytes of ethernet header, plus 8 bytes for an opaque handle). This suggests a maximum allowable MTU of 65514 bytes, but in fact the driver only permits a maximum MTU of 65513. This is because there is a < instead of an <= in ibmveth_change_mtu(), which only permits an MTU which is strictly smaller than the buffer size, rather than allowing the buffer to be completely filled. This patch fixes the buglet. Signed-off-by: David Gibson Acked-by: Thomas Falcon Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index cd7675a..1813476 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1238,7 +1238,7 @@ static int ibmveth_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) - if (new_mtu_oh < adapter->rx_buff_pool[i].buff_size) + if (new_mtu_oh <= adapter->rx_buff_pool[i].buff_size) break; if (i == IBMVETH_NUM_BUFF_POOLS) @@ -1257,7 +1257,7 @@ static int ibmveth_change_mtu(struct net_device *dev, int new_mtu) for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) { adapter->rx_buff_pool[i].active = 1; - if (new_mtu_oh < adapter->rx_buff_pool[i].buff_size) { + if (new_mtu_oh <= adapter->rx_buff_pool[i].buff_size) { dev->mtu = new_mtu; vio_cmo_set_dev_desired(viodev, ibmveth_get_desired_dma -- cgit v0.10.2 From def81f69bfbd70a3278a7592a4ab8717300cbac1 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Thu, 23 Apr 2015 09:37:38 -0400 Subject: tipc: fix topology server broken issue When a new topology server is launched in a new namespace, its listening socket is inserted into the "init ns" namespace's socket hash table rather than the one owned by the new namespace. Although the socket's namespace is forcedly changed to the new namespace later, the socket is still stored in the socket hash table of "init ns" namespace. When a client created in the new namespace connects its own topology server, the connection is failed as its server's socket could not be found from its own namespace's socket table. If __sock_create() instead of original sock_create_kern() is used to create the server's socket through specifying an expected namesapce, the socket will be inserted into the specified namespace's socket table, thereby avoiding to the topology server broken issue. Fixes: 76100a8a64bc ("tipc: fix netns refcnt leak") Reported-by: Erik Hugne Signed-off-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller diff --git a/net/tipc/server.c b/net/tipc/server.c index ab6183c..77ff03e 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -102,7 +102,7 @@ static void tipc_conn_kref_release(struct kref *kref) } saddr->scope = -TIPC_NODE_SCOPE; kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); - sk_release_kernel(sk); + sock_release(sock); con->sock = NULL; } @@ -321,12 +321,9 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) struct socket *sock = NULL; int ret; - ret = sock_create_kern(AF_TIPC, SOCK_SEQPACKET, 0, &sock); + ret = __sock_create(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock, 1); if (ret < 0) return NULL; - - sk_change_net(sock->sk, s->net); - ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE, (char *)&s->imp, sizeof(s->imp)); if (ret < 0) @@ -376,7 +373,7 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) create_err: kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); return NULL; } -- cgit v0.10.2 From 9871b27f6705fc6e0ba633b136369a289b2bfb99 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 23 Apr 2015 09:37:39 -0400 Subject: tipc: fix random link reset problem In the function tipc_sk_rcv(), the stack variable 'err' is only initialized to TIPC_ERR_NO_PORT for the first iteration over the link input queue. If a chain of messages are received from a link, failure to lookup the socket for any but the first message will cause the message to bounce back out on a random link. We fix this by properly initializing err. Signed-off-by: Erik Hugne Signed-off-by: Jon Maloy Signed-off-by: David S. Miller diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ee90d74..9074b5c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1764,13 +1764,14 @@ static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) { u32 dnode, dport = 0; - int err = -TIPC_ERR_NO_PORT; + int err; struct sk_buff *skb; struct tipc_sock *tsk; struct tipc_net *tn; struct sock *sk; while (skb_queue_len(inputq)) { + err = -TIPC_ERR_NO_PORT; skb = NULL; dport = tipc_skb_peek_port(inputq, dport); tsk = tipc_sk_lookup(net, dport); -- cgit v0.10.2 From 73a317377303b5ec14d4703d73ba87efffbb779d Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 23 Apr 2015 09:37:40 -0400 Subject: tipc: fix node refcount issue When link statistics is dumped over netlink, we iterate over the list of peer nodes and append each links statistics to the netlink msg. In the case where the dump is resumed after filling up a nlmsg, the node refcnt is decremented without having been incremented previously which may cause the node reference to be freed. When this happens, the following info/stacktrace will be generated, followed by a crash or undefined behavior. We fix this by removing the erroneous call to tipc_node_put inside the loop that iterates over nodes. [ 384.312303] INFO: trying to register non-static key. [ 384.313110] the code is fine but needs lockdep annotation. [ 384.313290] turning off the locking correctness validator. [ 384.313290] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.0.0+ #13 [ 384.313290] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 384.313290] ffff88003c6d0290 ffff88003cc03ca8 ffffffff8170adf1 0000000000000007 [ 384.313290] ffffffff82728730 ffff88003cc03d38 ffffffff810a6a6d 00000000001d7200 [ 384.313290] ffff88003c6d0ab0 ffff88003cc03ce8 0000000000000285 0000000000000001 [ 384.313290] Call Trace: [ 384.313290] [] dump_stack+0x4c/0x65 [ 384.313290] [] __lock_acquire+0xf3d/0xf50 [ 384.313290] [] lock_acquire+0xd5/0x290 [ 384.313290] [] ? link_timeout+0x1c/0x170 [tipc] [ 384.313290] [] ? link_state_event+0x4e0/0x4e0 [tipc] [ 384.313290] [] _raw_spin_lock_bh+0x40/0x80 [ 384.313290] [] ? link_timeout+0x1c/0x170 [tipc] [ 384.313290] [] link_timeout+0x1c/0x170 [tipc] [ 384.313290] [] call_timer_fn+0xb8/0x490 [ 384.313290] [] ? process_timeout+0x10/0x10 [ 384.313290] [] run_timer_softirq+0x21c/0x420 [ 384.313290] [] ? link_state_event+0x4e0/0x4e0 [tipc] [ 384.313290] [] __do_softirq+0xf4/0x630 [ 384.313290] [] irq_exit+0x5d/0x60 [ 384.313290] [] smp_apic_timer_interrupt+0x41/0x50 [ 384.313290] [] apic_timer_interrupt+0x70/0x80 [ 384.313290] [] ? default_idle+0x20/0x210 [ 384.313290] [] ? default_idle+0x1e/0x210 [ 384.313290] [] arch_cpu_idle+0xa/0x10 [ 384.313290] [] cpu_startup_entry+0x2c3/0x530 [ 384.313290] [] ? clockevents_register_device+0x113/0x200 [ 384.313290] [] start_secondary+0x13f/0x170 Fixes: 8a0f6ebe8494 ("tipc: involve reference counter for node structure") Signed-off-by: Erik Hugne Signed-off-by: Jon Maloy Signed-off-by: David S. Miller diff --git a/net/tipc/link.c b/net/tipc/link.c index a6b30df..57be6e6 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2143,7 +2143,6 @@ int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb) err = __tipc_nl_add_node_links(net, &msg, node, &prev_link); tipc_node_unlock(node); - tipc_node_put(node); if (err) goto out; -- cgit v0.10.2 From d1ab39f17f8653021620d0355ee1cd24d7442a4f Mon Sep 17 00:00:00 2001 From: Jason Eastman Date: Wed, 22 Apr 2015 00:56:42 -0600 Subject: net: unix: garbage: fixed several comment and whitespace style issues fixed several comment and whitespace style issues Signed-off-by: Jason Eastman Signed-off-by: David S. Miller diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 99f7012..a73a226 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -95,39 +95,36 @@ static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); unsigned int unix_tot_inflight; - struct sock *unix_get_socket(struct file *filp) { struct sock *u_sock = NULL; struct inode *inode = file_inode(filp); - /* - * Socket ? - */ + /* Socket ? */ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); struct sock *s = sock->sk; - /* - * PF_UNIX ? - */ + /* PF_UNIX ? */ if (s && sock->ops && sock->ops->family == PF_UNIX) u_sock = s; } return u_sock; } -/* - * Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. +/* Keep the number of times in flight count for the file + * descriptor if it is for an AF_UNIX socket. */ void unix_inflight(struct file *fp) { struct sock *s = unix_get_socket(fp); + if (s) { struct unix_sock *u = unix_sk(s); + spin_lock(&unix_gc_lock); + if (atomic_long_inc_return(&u->inflight) == 1) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); @@ -142,10 +139,13 @@ void unix_inflight(struct file *fp) void unix_notinflight(struct file *fp) { struct sock *s = unix_get_socket(fp); + if (s) { struct unix_sock *u = unix_sk(s); + spin_lock(&unix_gc_lock); BUG_ON(list_empty(&u->link)); + if (atomic_long_dec_and_test(&u->inflight)) list_del_init(&u->link); unix_tot_inflight--; @@ -161,32 +161,27 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - /* - * Do we have file descriptors ? - */ + /* Do we have file descriptors ? */ if (UNIXCB(skb).fp) { bool hit = false; - /* - * Process the descriptors of this socket - */ + /* Process the descriptors of this socket */ int nfd = UNIXCB(skb).fp->count; struct file **fp = UNIXCB(skb).fp->fp; + while (nfd--) { - /* - * Get the socket the fd matches - * if it indeed does so - */ + /* Get the socket the fd matches if it indeed does so */ struct sock *sk = unix_get_socket(*fp++); + if (sk) { struct unix_sock *u = unix_sk(sk); - /* - * Ignore non-candidates, they could + /* Ignore non-candidates, they could * have been added to the queues after * starting the garbage collection */ if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { hit = true; + func(u); } } @@ -203,24 +198,22 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), static void scan_children(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { - if (x->sk_state != TCP_LISTEN) + if (x->sk_state != TCP_LISTEN) { scan_inflight(x, func, hitlist); - else { + } else { struct sk_buff *skb; struct sk_buff *next; struct unix_sock *u; LIST_HEAD(embryos); - /* - * For a listening socket collect the queued embryos + /* For a listening socket collect the queued embryos * and perform a scan on them as well. */ spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { u = unix_sk(skb->sk); - /* - * An embryo cannot be in-flight, so it's safe + /* An embryo cannot be in-flight, so it's safe * to use the list link. */ BUG_ON(!list_empty(&u->link)); @@ -249,8 +242,7 @@ static void inc_inflight(struct unix_sock *usk) static void inc_inflight_move_tail(struct unix_sock *u) { atomic_long_inc(&u->inflight); - /* - * If this still might be part of a cycle, move it to the end + /* If this still might be part of a cycle, move it to the end * of the list, so that it's checked even if it was already * passed over */ @@ -263,8 +255,7 @@ static bool gc_in_progress; void wait_for_unix_gc(void) { - /* - * If number of inflight sockets is insane, + /* If number of inflight sockets is insane, * force a garbage collect right now. */ if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress) @@ -288,8 +279,7 @@ void unix_gc(void) goto out; gc_in_progress = true; - /* - * First, select candidates for garbage collection. Only + /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. * @@ -320,15 +310,13 @@ void unix_gc(void) } } - /* - * Now remove all internal in-flight reference to children of + /* Now remove all internal in-flight reference to children of * the candidates. */ list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, dec_inflight, NULL); - /* - * Restore the references for children of all candidates, + /* Restore the references for children of all candidates, * which have remaining references. Do this recursively, so * only those remain, which form cyclic references. * @@ -350,8 +338,7 @@ void unix_gc(void) } list_del(&cursor); - /* - * not_cycle_list contains those sockets which do not make up a + /* not_cycle_list contains those sockets which do not make up a * cycle. Restore these to the inflight list. */ while (!list_empty(¬_cycle_list)) { @@ -360,8 +347,7 @@ void unix_gc(void) list_move_tail(&u->link, &gc_inflight_list); } - /* - * Now gc_candidates contains only garbage. Restore original + /* Now gc_candidates contains only garbage. Restore original * inflight counters for these as well, and remove the skbuffs * which are creating the cycle(s). */ -- cgit v0.10.2 From 61e77d297dd1fc739828027a2790ee5365de50d2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 23 Apr 2015 19:59:33 +0200 Subject: ethernet: amd: AMD_XGBE should depend on HAS_DMA If NO_DMA=y: drivers/built-in.o: In function `xgbe_probe': xgbe-main.c:(.text+0x2def0a): undefined reference to `dma_set_mask' xgbe-main.c:(.text+0x2def20): undefined reference to `dma_supported' drivers/built-in.o: In function `xgbe_rx_poll': xgbe-drv.c:(.text+0x2e0320): undefined reference to `dma_sync_single_for_cpu' xgbe-drv.c:(.text+0x2e035e): undefined reference to `dma_sync_single_for_cpu' drivers/built-in.o: In function `xgbe_unmap_rdata': xgbe-desc.c:(.text+0x2e5fe4): undefined reference to `dma_unmap_page' xgbe-desc.c:(.text+0x2e5ffa): undefined reference to `dma_unmap_single' xgbe-desc.c:(.text+0x2e604a): undefined reference to `dma_unmap_page' xgbe-desc.c:(.text+0x2e6084): undefined reference to `dma_unmap_page' drivers/built-in.o: In function `xgbe_alloc_pages': xgbe-desc.c:(.text+0x2e6156): undefined reference to `dma_map_page' xgbe-desc.c:(.text+0x2e6164): undefined reference to `dma_mapping_error' drivers/built-in.o: In function `xgbe_free_ring': xgbe-desc.c:(.text+0x2e63d4): undefined reference to `dma_unmap_page' xgbe-desc.c:(.text+0x2e640e): undefined reference to `dma_unmap_page' xgbe-desc.c:(.text+0x2e644a): undefined reference to `dma_free_coherent' drivers/built-in.o: In function `xgbe_init_ring': xgbe-desc.c:(.text+0x2e64d4): undefined reference to `dma_alloc_coherent' drivers/built-in.o: In function `xgbe_map_tx_skb': xgbe-desc.c:(.text+0x2e6628): undefined reference to `dma_map_single' xgbe-desc.c:(.text+0x2e6638): undefined reference to `dma_mapping_error' xgbe-desc.c:(.text+0x2e66b2): undefined reference to `dma_map_single' xgbe-desc.c:(.text+0x2e66c2): undefined reference to `dma_mapping_error' xgbe-desc.c:(.text+0x2e6762): undefined reference to `dma_map_page' xgbe-desc.c:(.text+0x2e6772): undefined reference to `dma_mapping_error' Signed-off-by: Geert Uytterhoeven Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/amd/Kconfig b/drivers/net/ethernet/amd/Kconfig index c638c85..089c269 100644 --- a/drivers/net/ethernet/amd/Kconfig +++ b/drivers/net/ethernet/amd/Kconfig @@ -179,7 +179,7 @@ config SUNLANCE config AMD_XGBE tristate "AMD 10GbE Ethernet driver" - depends on (OF_NET || ACPI) && HAS_IOMEM + depends on (OF_NET || ACPI) && HAS_IOMEM && HAS_DMA select PHYLIB select AMD_XGBE_PHY select BITREVERSE -- cgit v0.10.2 From 0357cc1def050c4d6107835ed6b79122243bd240 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 23 Apr 2015 19:59:34 +0200 Subject: ethernet: arc: ARC_EMAC and EMAC_ROCKCHIP should depend on HAS_DMA If NO_DMA=y: drivers/built-in.o: In function `arc_emac_tx_clean': emac_main.c:(.text+0x2decde): undefined reference to `dma_unmap_single' drivers/built-in.o: In function `arc_emac_rx': emac_main.c:(.text+0x2dee1c): undefined reference to `dma_unmap_single' emac_main.c:(.text+0x2dee72): undefined reference to `dma_map_single' emac_main.c:(.text+0x2dee7e): undefined reference to `dma_mapping_error' drivers/built-in.o: In function `arc_emac_probe': (.text+0x2df2ee): undefined reference to `dmam_alloc_coherent' drivers/built-in.o: In function `arc_emac_open': emac_main.c:(.text+0x2df6d8): undefined reference to `dma_map_single' emac_main.c:(.text+0x2df6e4): undefined reference to `dma_mapping_error' drivers/built-in.o: In function `arc_emac_tx': emac_main.c:(.text+0x2df9e4): undefined reference to `dma_map_single' emac_main.c:(.text+0x2df9f0): undefined reference to `dma_mapping_error' Signed-off-by: Geert Uytterhoeven Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/arc/Kconfig b/drivers/net/ethernet/arc/Kconfig index 8e262e2..dea29ee 100644 --- a/drivers/net/ethernet/arc/Kconfig +++ b/drivers/net/ethernet/arc/Kconfig @@ -25,8 +25,7 @@ config ARC_EMAC_CORE config ARC_EMAC tristate "ARC EMAC support" select ARC_EMAC_CORE - depends on OF_IRQ - depends on OF_NET + depends on OF_IRQ && OF_NET && HAS_DMA ---help--- On some legacy ARC (Synopsys) FPGA boards such as ARCAngel4/ML50x non-standard on-chip ethernet device ARC EMAC 10/100 is used. @@ -35,7 +34,7 @@ config ARC_EMAC config EMAC_ROCKCHIP tristate "Rockchip EMAC support" select ARC_EMAC_CORE - depends on OF_IRQ && OF_NET && REGULATOR + depends on OF_IRQ && OF_NET && REGULATOR && HAS_DMA ---help--- Support for Rockchip RK3066/RK3188 EMAC ethernet controllers. This selects Rockchip SoC glue layer support for the -- cgit v0.10.2 From 2fb42aab48762fb1d5e21fbb40acad0f2650b9b4 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 23 Apr 2015 20:04:51 +0200 Subject: can: CAN_GRCAN should depend on HAS_DMA If NO_DMA=y: drivers/built-in.o: In function `grcan_free_dma_buffers': grcan.c:(.text+0x2d7716): undefined reference to `dma_free_coherent' drivers/built-in.o: In function `grcan_allocate_dma_buffers': grcan.c:(.text+0x2d779c): undefined reference to `dma_alloc_coherent' Signed-off-by: Geert Uytterhoeven Signed-off-by: David S. Miller diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig index 58808f6..e8c96b8 100644 --- a/drivers/net/can/Kconfig +++ b/drivers/net/can/Kconfig @@ -112,7 +112,7 @@ config PCH_CAN config CAN_GRCAN tristate "Aeroflex Gaisler GRCAN and GRHCAN CAN devices" - depends on OF + depends on OF && HAS_DMA ---help--- Say Y here if you want to use Aeroflex Gaisler GRCAN or GRHCAN. Note that the driver supports little endian, even though little -- cgit v0.10.2 From e4b6c30375e83b92d9c3e9b9d853417e8cc74006 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 21 Apr 2015 13:09:45 -0700 Subject: ethernet: myri10ge: use arch_phys_wc_add() This driver already uses ioremap_wc() on the same range so when write-combining is available that will be used instead. Cc: Hyong-Youb Kim Cc: Andy Lutomirski Cc: Suresh Siddha Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Juergen Gross Cc: Daniel Vetter Cc: netdev@vger.kernel.org Cc: Juergen Gross Cc: Daniel Vetter Cc: Andy Lutomirski Cc: Dave Airlie Cc: Antonino Daplas Cc: Jean-Christophe Plagniol-Villard Cc: Tomi Valkeinen Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Luis R. Rodriguez Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index 1412f5a..2bae502 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -69,11 +69,7 @@ #include #include #include -#include #include -#ifdef CONFIG_MTRR -#include -#endif #include #include "myri10ge_mcp.h" @@ -242,8 +238,7 @@ struct myri10ge_priv { unsigned int rdma_tags_available; int intr_coal_delay; __be32 __iomem *intr_coal_delay_ptr; - int mtrr; - int wc_enabled; + int wc_cookie; int down_cnt; wait_queue_head_t down_wq; struct work_struct watchdog_work; @@ -1905,7 +1900,7 @@ static const char myri10ge_gstrings_main_stats[][ETH_GSTRING_LEN] = { "tx_aborted_errors", "tx_carrier_errors", "tx_fifo_errors", "tx_heartbeat_errors", "tx_window_errors", /* device-specific stats */ - "tx_boundary", "WC", "irq", "MSI", "MSIX", + "tx_boundary", "irq", "MSI", "MSIX", "read_dma_bw_MBs", "write_dma_bw_MBs", "read_write_dma_bw_MBs", "serial_number", "watchdog_resets", #ifdef CONFIG_MYRI10GE_DCA @@ -1984,7 +1979,6 @@ myri10ge_get_ethtool_stats(struct net_device *netdev, data[i] = ((u64 *)&link_stats)[i]; data[i++] = (unsigned int)mgp->tx_boundary; - data[i++] = (unsigned int)mgp->wc_enabled; data[i++] = (unsigned int)mgp->pdev->irq; data[i++] = (unsigned int)mgp->msi_enabled; data[i++] = (unsigned int)mgp->msix_enabled; @@ -4040,14 +4034,7 @@ static int myri10ge_probe(struct pci_dev *pdev, const struct pci_device_id *ent) mgp->board_span = pci_resource_len(pdev, 0); mgp->iomem_base = pci_resource_start(pdev, 0); - mgp->mtrr = -1; - mgp->wc_enabled = 0; -#ifdef CONFIG_MTRR - mgp->mtrr = mtrr_add(mgp->iomem_base, mgp->board_span, - MTRR_TYPE_WRCOMB, 1); - if (mgp->mtrr >= 0) - mgp->wc_enabled = 1; -#endif + mgp->wc_cookie = arch_phys_wc_add(mgp->iomem_base, mgp->board_span); mgp->sram = ioremap_wc(mgp->iomem_base, mgp->board_span); if (mgp->sram == NULL) { dev_err(&pdev->dev, "ioremap failed for %ld bytes at 0x%lx\n", @@ -4146,14 +4133,14 @@ static int myri10ge_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto abort_with_state; } if (mgp->msix_enabled) - dev_info(dev, "%d MSI-X IRQs, tx bndry %d, fw %s, WC %s\n", + dev_info(dev, "%d MSI-X IRQs, tx bndry %d, fw %s, MTRR %s, WC Enabled\n", mgp->num_slices, mgp->tx_boundary, mgp->fw_name, - (mgp->wc_enabled ? "Enabled" : "Disabled")); + (mgp->wc_cookie > 0 ? "Enabled" : "Disabled")); else - dev_info(dev, "%s IRQ %d, tx bndry %d, fw %s, WC %s\n", + dev_info(dev, "%s IRQ %d, tx bndry %d, fw %s, MTRR %s, WC Enabled\n", mgp->msi_enabled ? "MSI" : "xPIC", pdev->irq, mgp->tx_boundary, mgp->fw_name, - (mgp->wc_enabled ? "Enabled" : "Disabled")); + (mgp->wc_cookie > 0 ? "Enabled" : "Disabled")); board_number++; return 0; @@ -4175,10 +4162,7 @@ abort_with_ioremap: iounmap(mgp->sram); abort_with_mtrr: -#ifdef CONFIG_MTRR - if (mgp->mtrr >= 0) - mtrr_del(mgp->mtrr, mgp->iomem_base, mgp->board_span); -#endif + arch_phys_wc_del(mgp->wc_cookie); dma_free_coherent(&pdev->dev, sizeof(*mgp->cmd), mgp->cmd, mgp->cmd_bus); @@ -4220,11 +4204,7 @@ static void myri10ge_remove(struct pci_dev *pdev) pci_restore_state(pdev); iounmap(mgp->sram); - -#ifdef CONFIG_MTRR - if (mgp->mtrr >= 0) - mtrr_del(mgp->mtrr, mgp->iomem_base, mgp->board_span); -#endif + arch_phys_wc_del(mgp->wc_cookie); myri10ge_free_slices(mgp); kfree(mgp->msix_vectors); dma_free_coherent(&pdev->dev, sizeof(*mgp->cmd), -- cgit v0.10.2 From 845704a535e9b3c76448f52af1b70e4422ea03fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Apr 2015 10:42:39 -0700 Subject: tcp: avoid looping in tcp_send_fin() Presence of an unbound loop in tcp_send_fin() had always been hard to explain when analyzing crash dumps involving gigantic dying processes with millions of sockets. Lets try a different strategy : In case of memory pressure, try to add the FIN flag to last packet in write queue, even if packet was already sent. TCP stack will be able to deliver this FIN after a timeout event. Note that this FIN being delivered by a retransmit, it also carries a Push flag given our current implementation. By checking sk_under_memory_pressure(), we anticipate that cooking many FIN packets might deplete tcp memory. In the case we could not allocate a packet, even with __GFP_WAIT allocation, then not sending a FIN seems quite reasonable if it allows to get rid of this socket, free memory, and not block the process from eventually doing other useful work. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2ade67b..a369e8a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2814,7 +2814,8 @@ begin_fwd: /* We allow to exceed memory limits for FIN packets to expedite * connection tear down and (memory) recovery. - * Otherwise tcp_send_fin() could loop forever. + * Otherwise tcp_send_fin() could be tempted to either delay FIN + * or even be forced to close flow without any FIN. */ static void sk_forced_wmem_schedule(struct sock *sk, int size) { @@ -2827,33 +2828,40 @@ static void sk_forced_wmem_schedule(struct sock *sk, int size) sk_memory_allocated_add(sk, amt, &status); } -/* Send a fin. The caller locks the socket for us. This cannot be - * allowed to fail queueing a FIN frame under any circumstances. +/* Send a FIN. The caller locks the socket for us. + * We should try to send a FIN packet really hard, but eventually give up. */ void tcp_send_fin(struct sock *sk) { + struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = tcp_write_queue_tail(sk); - int mss_now; - /* Optimization, tack on the FIN if we have a queue of - * unsent frames. But be careful about outgoing SACKS - * and IP options. + /* Optimization, tack on the FIN if we have one skb in write queue and + * this skb was not yet sent, or we are under memory pressure. + * Note: in the latter case, FIN packet will be sent after a timeout, + * as TCP stack thinks it has already been transmitted. */ - mss_now = tcp_current_mss(sk); - - if (tcp_send_head(sk)) { - TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; - TCP_SKB_CB(skb)->end_seq++; + if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) { +coalesce: + TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; + TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; + if (!tcp_send_head(sk)) { + /* This means tskb was already sent. + * Pretend we included the FIN on previous transmit. + * We need to set tp->snd_nxt to the value it would have + * if FIN had been sent. This is because retransmit path + * does not change tp->snd_nxt. + */ + tp->snd_nxt++; + return; + } } else { - /* Socket is locked, keep trying until memory is available. */ - for (;;) { - skb = alloc_skb_fclone(MAX_TCP_HEADER, - sk->sk_allocation); - if (skb) - break; - yield(); + skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); + if (unlikely(!skb)) { + if (tskb) + goto coalesce; + return; } skb_reserve(skb, MAX_TCP_HEADER); sk_forced_wmem_schedule(sk, skb->truesize); @@ -2862,7 +2870,7 @@ void tcp_send_fin(struct sock *sk) TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } - __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); + __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } /* We get here when a process closes a file descriptor (either due to -- cgit v0.10.2 From e580267df97eda407c525dbaee5430e0d51a0edb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Thu, 23 Apr 2015 20:56:29 +0200 Subject: bgmac: fix requests for extra polling calls from NAPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After d75b1ade567f ("net: less interrupt masking in NAPI") polling function has to return whole budget when it wants NAPI to call it again. Signed-off-by: Rafał Miłecki Cc: Felix Fietkau Fixes: eb64e2923a886 ("bgmac: leave interrupts disabled as long as there is work to do") Acked-by: Felix Fietkau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index de77d3a..21e3c38 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -1260,7 +1260,7 @@ static int bgmac_poll(struct napi_struct *napi, int weight) /* Poll again if more events arrived in the meantime */ if (bgmac_read(bgmac, BGMAC_INT_STATUS) & (BGMAC_IS_TX0 | BGMAC_IS_RX)) - return handled; + return weight; if (handled < weight) { napi_complete(napi); -- cgit v0.10.2 From 1d8dc3d3c8f1d8ee1da9d54c5d7c8694419ade42 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Apr 2015 16:38:43 +0200 Subject: rhashtable: don't attempt to grow when at max_size The conversion of mac80211's station table to rhashtable had a bug that I found by accident in code review, that hadn't been found as rhashtable apparently managed to have a maximum hash chain length of one (!) in all our testing. In order to test the bug and verify the fix I set my rhashtable's max_size very low (4) in order to force getting hash collisions. At that point, rhashtable WARNed in rhashtable_insert_rehash() but didn't actually reject the hash table insertion. This caused it to lose insertions - my master list of stations would have 9 entries, but the rhashtable only had 5. This may warrant a deeper look, but that WARN_ON() just shouldn't happen. Fix this by not returning true from rht_grow_above_100() when the rhashtable's max_size has been reached - in this case the user is explicitly configuring it to be at most that big, so even if it's now above 100% it shouldn't attempt to resize. This fixes the "lost insertion" issue and consequently allows my code to display its error (and verify my fix for it.) Signed-off-by: Johannes Berg Acked-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index e23d242..dbcbcc5 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -282,7 +282,8 @@ static inline bool rht_shrink_below_30(const struct rhashtable *ht, static inline bool rht_grow_above_100(const struct rhashtable *ht, const struct bucket_table *tbl) { - return atomic_read(&ht->nelems) > tbl->size; + return atomic_read(&ht->nelems) > tbl->size && + (!ht->p.max_size || tbl->size < ht->p.max_size); } /* The bucket lock is selected based on the hash and protects mutations -- cgit v0.10.2 From b357a364c57c940ddb932224542494363df37378 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Apr 2015 18:03:44 -0700 Subject: inet: fix possible panic in reqsk_queue_unlink() [ 3897.923145] BUG: unable to handle kernel NULL pointer dereference at 0000000000000080 [ 3897.931025] IP: [] reqsk_timer_handler+0x1a6/0x243 There is a race when reqsk_timer_handler() and tcp_check_req() call inet_csk_reqsk_queue_unlink() on the same req at the same time. Before commit fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer"), listener spinlock was held and race could not happen. To solve this bug, we change reqsk_queue_unlink() to not assume req must be found, and we return a status, to conditionally release a refcount on the request sock. This also means tcp_check_req() in non fastopen case might or not consume req refcount, so tcp_v6_hnd_req() & tcp_v4_hnd_req() have to properly handle this. (Same remark for dccp_check_req() and its callers) inet_csk_reqsk_queue_drop() is now too big to be inlined, as it is called 4 times in tcp and 3 times in dccp. Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer") Signed-off-by: Eric Dumazet Reported-by: Yuchung Cheng Signed-off-by: David S. Miller diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 7b5887c..48a81582 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -279,12 +279,6 @@ static inline void inet_csk_reqsk_queue_add(struct sock *sk, void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout); -static inline void inet_csk_reqsk_queue_removed(struct sock *sk, - struct request_sock *req) -{ - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); -} - static inline void inet_csk_reqsk_queue_added(struct sock *sk, const unsigned long timeout) { @@ -306,19 +300,7 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue); } -static inline void inet_csk_reqsk_queue_unlink(struct sock *sk, - struct request_sock *req) -{ - reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req); -} - -static inline void inet_csk_reqsk_queue_drop(struct sock *sk, - struct request_sock *req) -{ - inet_csk_reqsk_queue_unlink(sk, req); - inet_csk_reqsk_queue_removed(sk, req); - reqsk_put(req); -} +void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_destroy_sock(struct sock *sk); void inet_csk_prepare_forced_close(struct sock *sk); diff --git a/include/net/request_sock.h b/include/net/request_sock.h index fe41f3c..9f4265c 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -212,24 +212,6 @@ static inline int reqsk_queue_empty(struct request_sock_queue *queue) return queue->rskq_accept_head == NULL; } -static inline void reqsk_queue_unlink(struct request_sock_queue *queue, - struct request_sock *req) -{ - struct listen_sock *lopt = queue->listen_opt; - struct request_sock **prev; - - spin_lock(&queue->syn_wait_lock); - - prev = &lopt->syn_table[req->rsk_hash]; - while (*prev != req) - prev = &(*prev)->dl_next; - *prev = req->dl_next; - - spin_unlock(&queue->syn_wait_lock); - if (del_timer(&req->rsk_timer)) - reqsk_put(req); -} - static inline void reqsk_queue_add(struct request_sock_queue *queue, struct request_sock *req, struct sock *parent, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 2b4f21d..ccf4c56 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -453,7 +453,8 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) iph->saddr, iph->daddr); if (req) { nsk = dccp_check_req(sk, skb, req); - reqsk_put(req); + if (!nsk) + reqsk_put(req); return nsk; } nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 9d05510..5165571 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -301,7 +301,8 @@ static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) &iph->daddr, inet6_iif(skb)); if (req) { nsk = dccp_check_req(sk, skb, req); - reqsk_put(req); + if (!nsk) + reqsk_put(req); return nsk; } nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo, diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 5f56666..30addee 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -186,8 +186,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, if (child == NULL) goto listen_overflow; - inet_csk_reqsk_queue_unlink(sk, req); - inet_csk_reqsk_queue_removed(sk, req); + inet_csk_reqsk_queue_drop(sk, req); inet_csk_reqsk_queue_add(sk, req, child); out: return child; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5c3dd62..8976ca4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -564,6 +564,40 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); +/* return true if req was found in the syn_table[] */ +static bool reqsk_queue_unlink(struct request_sock_queue *queue, + struct request_sock *req) +{ + struct listen_sock *lopt = queue->listen_opt; + struct request_sock **prev; + bool found = false; + + spin_lock(&queue->syn_wait_lock); + + for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; + prev = &(*prev)->dl_next) { + if (*prev == req) { + *prev = req->dl_next; + found = true; + break; + } + } + + spin_unlock(&queue->syn_wait_lock); + if (del_timer(&req->rsk_timer)) + reqsk_put(req); + return found; +} + +void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) +{ + if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) { + reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); + reqsk_put(req); + } +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); + static void reqsk_timer_handler(unsigned long data) { struct request_sock *req = (struct request_sock *)data; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3571f2b..fc1c658 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1348,7 +1348,8 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); if (req) { nsk = tcp_check_req(sk, skb, req, false); - reqsk_put(req); + if (!nsk) + reqsk_put(req); return nsk; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 63d6311..e5d7649 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -755,10 +755,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; - inet_csk_reqsk_queue_unlink(sk, req); - inet_csk_reqsk_queue_removed(sk, req); - + inet_csk_reqsk_queue_drop(sk, req); inet_csk_reqsk_queue_add(sk, req, child); + /* Warning: caller must not call reqsk_put(req); + * child stole last reference on it. + */ return child; listen_overflow: diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ad51df8..b6575d6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -946,7 +946,8 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); if (req) { nsk = tcp_check_req(sk, skb, req, false); - reqsk_put(req); + if (!nsk) + reqsk_put(req); return nsk; } nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, -- cgit v0.10.2 From 2d6c9091ab7630dfcf34417c6683ce4764d7d40a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 22 Apr 2015 13:06:54 -0400 Subject: net: mdio-gpio: support access that may sleep Some systems using mdio-gpio may use gpio on message based busses, which require sleeping (e.g. gpio from an I2C I/O expander). Since this driver does not use IRQ handler, it is safe to use the _cansleep suffixed gpio accessors. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 49ce7ec..c9cb486c 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -80,7 +80,8 @@ static void mdio_dir(struct mdiobb_ctrl *ctrl, int dir) * assume the pin serves as pull-up. If direction is * output, the default value is high. */ - gpio_set_value(bitbang->mdo, 1 ^ bitbang->mdo_active_low); + gpio_set_value_cansleep(bitbang->mdo, + 1 ^ bitbang->mdo_active_low); return; } @@ -96,7 +97,8 @@ static int mdio_get(struct mdiobb_ctrl *ctrl) struct mdio_gpio_info *bitbang = container_of(ctrl, struct mdio_gpio_info, ctrl); - return gpio_get_value(bitbang->mdio) ^ bitbang->mdio_active_low; + return gpio_get_value_cansleep(bitbang->mdio) ^ + bitbang->mdio_active_low; } static void mdio_set(struct mdiobb_ctrl *ctrl, int what) @@ -105,9 +107,11 @@ static void mdio_set(struct mdiobb_ctrl *ctrl, int what) container_of(ctrl, struct mdio_gpio_info, ctrl); if (bitbang->mdo) - gpio_set_value(bitbang->mdo, what ^ bitbang->mdo_active_low); + gpio_set_value_cansleep(bitbang->mdo, + what ^ bitbang->mdo_active_low); else - gpio_set_value(bitbang->mdio, what ^ bitbang->mdio_active_low); + gpio_set_value_cansleep(bitbang->mdio, + what ^ bitbang->mdio_active_low); } static void mdc_set(struct mdiobb_ctrl *ctrl, int what) @@ -115,7 +119,7 @@ static void mdc_set(struct mdiobb_ctrl *ctrl, int what) struct mdio_gpio_info *bitbang = container_of(ctrl, struct mdio_gpio_info, ctrl); - gpio_set_value(bitbang->mdc, what ^ bitbang->mdc_active_low); + gpio_set_value_cansleep(bitbang->mdc, what ^ bitbang->mdc_active_low); } static struct mdiobb_ops mdio_gpio_ops = { -- cgit v0.10.2 From dfc8f370316b31b9ca9ce96e50f1c438d9410b4f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 24 Apr 2015 15:22:23 +1000 Subject: net/tg3: Release IRQs on permanent error When having permanent EEH error, the PCI device will be removed from the system. For this case, we shouldn't set pcierr_recovery to true wrongly, which blocks the driver to release the allocated interrupts and their handlers. Eventually, we can't disable MSI or MSIx successfully because of the MSI or MSIx interrupts still have associated interrupt actions, which is turned into following stack dump. Oops: Exception in kernel mode, sig: 5 [#1] : [c0000000003b76a8] .free_msi_irqs+0x80/0x1a0 (unreliable) [c00000000039f388] .pci_remove_bus_device+0x98/0x110 [c0000000000790f4] .pcibios_remove_pci_devices+0x9c/0x128 [c000000000077b98] .handle_eeh_events+0x2d8/0x4b0 [c0000000000782d0] .eeh_event_handler+0x130/0x1c0 [c000000000022bd4] .kernel_thread+0x54/0x70 Signed-off-by: Gavin Shan Acked-by: Prashant Sreedharan Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 1270b18..069952f 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -18129,7 +18129,9 @@ static pci_ers_result_t tg3_io_error_detected(struct pci_dev *pdev, rtnl_lock(); - tp->pcierr_recovery = true; + /* We needn't recover from permanent error */ + if (state == pci_channel_io_frozen) + tp->pcierr_recovery = true; /* We probably don't have netdev yet */ if (!netdev || !netif_running(netdev)) -- cgit v0.10.2 From 3051f39253680624504021fd3b29fc621a4fd6b6 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 24 Apr 2015 15:52:32 +1000 Subject: ehea: Fix memory hook reference counting crashes The recent commit to only register the EHEA memory hotplug hooks on adapter probe has a few problems. Firstly the reference counting is wrong for multiple adapters, in that the hooks are registered multiple times. Secondly the check in the tear down path is backward. Finally the error path doesn't decrement the count. The multiple registration of the hooks is the biggest problem, as it leads to oopses when the system is rebooted, and/or errors during memory hotplug, eg: $ ./mem-on-off-test.sh -r 2 ... ehea: memory is going offline ehea: LPAR memory changed - re-initializing driver ehea: re-initializing driver complete ehea: memory is going offline ehea: LPAR memory changed - re-initializing driver ehea: opcode=26c ret=fffffffffffffffc arg1=8000000003000003 arg2=0 arg3=700000060000d600 arg4=3fded0000 arg5=200 arg6=0 arg7=0 ehea: register_rpage_mr failed ehea: registering mr failed ehea: register MR failed - driver inoperable! ehea: memory is going offline Fixes: aa183323312d ("ehea: Register memory hotplug, reboot and crash hooks on adapter probe") Signed-off-by: Michael Ellerman Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c index 291c870..2a0dc12 100644 --- a/drivers/net/ethernet/ibm/ehea/ehea_main.c +++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c @@ -3347,7 +3347,7 @@ static int ehea_register_memory_hooks(void) { int ret = 0; - if (atomic_inc_and_test(&ehea_memory_hooks_registered)) + if (atomic_inc_return(&ehea_memory_hooks_registered) > 1) return 0; ret = ehea_create_busmap(); @@ -3381,12 +3381,14 @@ out3: out2: unregister_reboot_notifier(&ehea_reboot_nb); out: + atomic_dec(&ehea_memory_hooks_registered); return ret; } static void ehea_unregister_memory_hooks(void) { - if (atomic_read(&ehea_memory_hooks_registered)) + /* Only remove the hooks if we've registered them */ + if (atomic_read(&ehea_memory_hooks_registered) == 0) return; unregister_reboot_notifier(&ehea_reboot_nb); -- cgit v0.10.2 From 20d96964b51844d4f3bea0249f62db7073c13c30 Mon Sep 17 00:00:00 2001 From: Chee Nouk Phoon Date: Fri, 24 Apr 2015 02:32:07 -0500 Subject: net: eth: altera: Resolve false errors from MSGDMA to TSE This patch resolves false errors from MSGDMA in TX mSGDMA MM to ST mode, and is a continuation of the patch recently submitted by Andrea Oetken. The MSGDMA had a logic bug that masked detection of this issue prior to Quartus 14.1/Build 164. When the MSGDMA logic bug was addressed in Quartus 14.1/Build 164, the driver problem was exposed. The problem is corrected by making sure MSGDMA_DESC_CTL_TR_ERR_IRQ is not set for any of the transmit DMA descriptors, and only used for receive descriptors. Fixes: 71cd26e altera tse: Error-Bit on tx-avalon-stream always set. Signed-off-by: Chee Nouk Phoon Signed-off-by: Vince Bridgers a Cc: Andreas Oetken Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/altera/altera_msgdmahw.h b/drivers/net/ethernet/altera/altera_msgdmahw.h index eba070f..89cd11d 100644 --- a/drivers/net/ethernet/altera/altera_msgdmahw.h +++ b/drivers/net/ethernet/altera/altera_msgdmahw.h @@ -58,15 +58,12 @@ struct msgdma_extended_desc { /* Tx buffer control flags */ #define MSGDMA_DESC_CTL_TX_FIRST (MSGDMA_DESC_CTL_GEN_SOP | \ - MSGDMA_DESC_CTL_TR_ERR_IRQ | \ MSGDMA_DESC_CTL_GO) -#define MSGDMA_DESC_CTL_TX_MIDDLE (MSGDMA_DESC_CTL_TR_ERR_IRQ | \ - MSGDMA_DESC_CTL_GO) +#define MSGDMA_DESC_CTL_TX_MIDDLE (MSGDMA_DESC_CTL_GO) #define MSGDMA_DESC_CTL_TX_LAST (MSGDMA_DESC_CTL_GEN_EOP | \ MSGDMA_DESC_CTL_TR_COMP_IRQ | \ - MSGDMA_DESC_CTL_TR_ERR_IRQ | \ MSGDMA_DESC_CTL_GO) #define MSGDMA_DESC_CTL_TX_SINGLE (MSGDMA_DESC_CTL_GEN_SOP | \ -- cgit v0.10.2 From 2ea2f62c8bda242433809c7f4e9eae1c52c40bbe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 Apr 2015 16:05:01 -0700 Subject: net: fix crash in build_skb() When I added pfmemalloc support in build_skb(), I forgot netlink was using build_skb() with a vmalloc() area. In this patch I introduce __build_skb() for netlink use, and build_skb() is a wrapper handling both skb->head_frag and skb->pfmemalloc This means netlink no longer has to hack skb->head_frag [ 1567.700067] kernel BUG at arch/x86/mm/physaddr.c:26! [ 1567.700067] invalid opcode: 0000 [#1] PREEMPT SMP KASAN [ 1567.700067] Dumping ftrace buffer: [ 1567.700067] (ftrace buffer empty) [ 1567.700067] Modules linked in: [ 1567.700067] CPU: 9 PID: 16186 Comm: trinity-c182 Not tainted 4.0.0-next-20150424-sasha-00037-g4796e21 #2167 [ 1567.700067] task: ffff880127efb000 ti: ffff880246770000 task.ti: ffff880246770000 [ 1567.700067] RIP: __phys_addr (arch/x86/mm/physaddr.c:26 (discriminator 3)) [ 1567.700067] RSP: 0018:ffff8802467779d8 EFLAGS: 00010202 [ 1567.700067] RAX: 000041000ed8e000 RBX: ffffc9008ed8e000 RCX: 000000000000002c [ 1567.700067] RDX: 0000000000000004 RSI: 0000000000000000 RDI: ffffffffb3fd6049 [ 1567.700067] RBP: ffff8802467779f8 R08: 0000000000000019 R09: ffff8801d0168000 [ 1567.700067] R10: ffff8801d01680c7 R11: ffffed003a02d019 R12: ffffc9000ed8e000 [ 1567.700067] R13: 0000000000000f40 R14: 0000000000001180 R15: ffffc9000ed8e000 [ 1567.700067] FS: 00007f2a7da3f700(0000) GS:ffff8801d1000000(0000) knlGS:0000000000000000 [ 1567.700067] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1567.700067] CR2: 0000000000738308 CR3: 000000022e329000 CR4: 00000000000007e0 [ 1567.700067] Stack: [ 1567.700067] ffffc9000ed8e000 ffff8801d0168000 ffffc9000ed8e000 ffff8801d0168000 [ 1567.700067] ffff880246777a28 ffffffffad7c0a21 0000000000001080 ffff880246777c08 [ 1567.700067] ffff88060d302e68 ffff880246777b58 ffff880246777b88 ffffffffad9a6821 [ 1567.700067] Call Trace: [ 1567.700067] build_skb (include/linux/mm.h:508 net/core/skbuff.c:316) [ 1567.700067] netlink_sendmsg (net/netlink/af_netlink.c:1633 net/netlink/af_netlink.c:2329) [ 1567.774369] ? sched_clock_cpu (kernel/sched/clock.c:311) [ 1567.774369] ? netlink_unicast (net/netlink/af_netlink.c:2273) [ 1567.774369] ? netlink_unicast (net/netlink/af_netlink.c:2273) [ 1567.774369] sock_sendmsg (net/socket.c:614 net/socket.c:623) [ 1567.774369] sock_write_iter (net/socket.c:823) [ 1567.774369] ? sock_sendmsg (net/socket.c:806) [ 1567.774369] __vfs_write (fs/read_write.c:479 fs/read_write.c:491) [ 1567.774369] ? get_lock_stats (kernel/locking/lockdep.c:249) [ 1567.774369] ? default_llseek (fs/read_write.c:487) [ 1567.774369] ? vtime_account_user (kernel/sched/cputime.c:701) [ 1567.774369] ? rw_verify_area (fs/read_write.c:406 (discriminator 4)) [ 1567.774369] vfs_write (fs/read_write.c:539) [ 1567.774369] SyS_write (fs/read_write.c:586 fs/read_write.c:577) [ 1567.774369] ? SyS_read (fs/read_write.c:577) [ 1567.774369] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63) [ 1567.774369] ? trace_hardirqs_on_caller (kernel/locking/lockdep.c:2594 kernel/locking/lockdep.c:2636) [ 1567.774369] ? trace_hardirqs_on_thunk (arch/x86/lib/thunk_64.S:42) [ 1567.774369] system_call_fastpath (arch/x86/kernel/entry_64.S:261) Fixes: 79930f5892e ("net: do not deplete pfmemalloc reserve") Signed-off-by: Eric Dumazet Reported-by: Sasha Levin Signed-off-by: David S. Miller diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 06793b5..66e374d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -773,6 +773,7 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); +struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 456ead5..3cfff2a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -280,13 +280,14 @@ nodata: EXPORT_SYMBOL(__alloc_skb); /** - * build_skb - build a network buffer + * __build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of fragment, or 0 if head was kmalloced + * @frag_size: size of data, or 0 if head was kmalloced * * Allocate a new &sk_buff. Caller provides space holding head and * skb_shared_info. @data must have been allocated by kmalloc() only if - * @frag_size is 0, otherwise data should come from the page allocator. + * @frag_size is 0, otherwise data should come from the page allocator + * or vmalloc() * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : @@ -297,7 +298,7 @@ EXPORT_SYMBOL(__alloc_skb); * before giving packet to stack. * RX rings only contains data buffers, not full skbs. */ -struct sk_buff *build_skb(void *data, unsigned int frag_size) +struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct skb_shared_info *shinfo; struct sk_buff *skb; @@ -311,11 +312,6 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) memset(skb, 0, offsetof(struct sk_buff, tail)); skb->truesize = SKB_TRUESIZE(size); - if (frag_size) { - skb->head_frag = 1; - if (virt_to_head_page(data)->pfmemalloc) - skb->pfmemalloc = 1; - } atomic_set(&skb->users, 1); skb->head = data; skb->data = data; @@ -332,6 +328,23 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) return skb; } + +/* build_skb() is wrapper over __build_skb(), that specifically + * takes care of skb->head and skb->pfmemalloc + * This means that if @frag_size is not zero, then @data must be backed + * by a page fragment, not kmalloc() or vmalloc() + */ +struct sk_buff *build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb = __build_skb(data, frag_size); + + if (skb && frag_size) { + skb->head_frag = 1; + if (virt_to_head_page(data)->pfmemalloc) + skb->pfmemalloc = 1; + } + return skb; +} EXPORT_SYMBOL(build_skb); struct netdev_alloc_cache { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 19909d0..ec4adbd 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1629,13 +1629,11 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size, if (data == NULL) return NULL; - skb = build_skb(data, size); + skb = __build_skb(data, size); if (skb == NULL) vfree(data); - else { - skb->head_frag = 0; + else skb->destructor = netlink_skb_destructor; - } return skb; } -- cgit v0.10.2 From 0e03fd3e335d272bee88fe733d5fd13f5c5b7140 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 25 Apr 2015 04:07:03 +0300 Subject: pxa168: fix double deallocation of managed resources Commit 43d3ddf87a57 ("net: pxa168_eth: add device tree support") starts to use managed resources by adding devm_clk_get() and devm_ioremap_resource(), but it leaves explicit iounmap() and clock_put() in pxa168_eth_remove() and in failure handling code of pxa168_eth_probe(). As a result double free can happen. The patch removes explicit resource deallocation. Also it converts clk_disable() to clk_disable_unprepare() to make it symmetrical with clk_prepare_enable(). Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c index af829c5..7ace07d 100644 --- a/drivers/net/ethernet/marvell/pxa168_eth.c +++ b/drivers/net/ethernet/marvell/pxa168_eth.c @@ -1508,7 +1508,8 @@ static int pxa168_eth_probe(struct platform_device *pdev) np = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); if (!np) { dev_err(&pdev->dev, "missing phy-handle\n"); - return -EINVAL; + err = -EINVAL; + goto err_netdev; } of_property_read_u32(np, "reg", &pep->phy_addr); pep->phy_intf = of_get_phy_mode(pdev->dev.of_node); @@ -1526,7 +1527,7 @@ static int pxa168_eth_probe(struct platform_device *pdev) pep->smi_bus = mdiobus_alloc(); if (pep->smi_bus == NULL) { err = -ENOMEM; - goto err_base; + goto err_netdev; } pep->smi_bus->priv = pep; pep->smi_bus->name = "pxa168_eth smi"; @@ -1551,13 +1552,10 @@ err_mdiobus: mdiobus_unregister(pep->smi_bus); err_free_mdio: mdiobus_free(pep->smi_bus); -err_base: - iounmap(pep->base); err_netdev: free_netdev(dev); err_clk: - clk_disable(clk); - clk_put(clk); + clk_disable_unprepare(clk); return err; } @@ -1574,13 +1572,9 @@ static int pxa168_eth_remove(struct platform_device *pdev) if (pep->phy) phy_disconnect(pep->phy); if (pep->clk) { - clk_disable(pep->clk); - clk_put(pep->clk); - pep->clk = NULL; + clk_disable_unprepare(pep->clk); } - iounmap(pep->base); - pep->base = NULL; mdiobus_unregister(pep->smi_bus); mdiobus_free(pep->smi_bus); unregister_netdev(dev); -- cgit v0.10.2 From 7cdbc6f74f8e6c06304b69b4e944fbd669581c7c Mon Sep 17 00:00:00 2001 From: Andreas Oetken Date: Sat, 25 Apr 2015 18:07:52 +0200 Subject: altera tse: add support for fixed-links. Add support for fixed-links in configurations without PHY. (e.g. connection to a switch, SGMII point to point, SFPs) Check: Documentation/devicetree/bindings/net/fixed-link.txt. Signed-off-by: Andreas Oetken Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c index 90a7630..0533c05 100644 --- a/drivers/net/ethernet/altera/altera_tse_main.c +++ b/drivers/net/ethernet/altera/altera_tse_main.c @@ -777,6 +777,8 @@ static int init_phy(struct net_device *dev) struct altera_tse_private *priv = netdev_priv(dev); struct phy_device *phydev; struct device_node *phynode; + bool fixed_link = false; + int rc = 0; /* Avoid init phy in case of no phy present */ if (!priv->phy_iface) @@ -789,13 +791,32 @@ static int init_phy(struct net_device *dev) phynode = of_parse_phandle(priv->device->of_node, "phy-handle", 0); if (!phynode) { - netdev_dbg(dev, "no phy-handle found\n"); - if (!priv->mdio) { - netdev_err(dev, - "No phy-handle nor local mdio specified\n"); - return -ENODEV; + /* check if a fixed-link is defined in device-tree */ + if (of_phy_is_fixed_link(priv->device->of_node)) { + rc = of_phy_register_fixed_link(priv->device->of_node); + if (rc < 0) { + netdev_err(dev, "cannot register fixed PHY\n"); + return rc; + } + + /* In the case of a fixed PHY, the DT node associated + * to the PHY is the Ethernet MAC DT node. + */ + phynode = of_node_get(priv->device->of_node); + fixed_link = true; + + netdev_dbg(dev, "fixed-link detected\n"); + phydev = of_phy_connect(dev, phynode, + &altera_tse_adjust_link, + 0, priv->phy_iface); + } else { + netdev_dbg(dev, "no phy-handle found\n"); + if (!priv->mdio) { + netdev_err(dev, "No phy-handle nor local mdio specified\n"); + return -ENODEV; + } + phydev = connect_local_phy(dev); } - phydev = connect_local_phy(dev); } else { netdev_dbg(dev, "phy-handle found\n"); phydev = of_phy_connect(dev, phynode, @@ -819,10 +840,10 @@ static int init_phy(struct net_device *dev) /* Broken HW is sometimes missing the pull-up resistor on the * MDIO line, which results in reads to non-existent devices returning * 0 rather than 0xffff. Catch this here and treat 0 as a non-existent - * device as well. + * device as well. If a fixed-link is used the phy_id is always 0. * Note: phydev->phy_id is the result of reading the UID PHY registers. */ - if (phydev->phy_id == 0) { + if ((phydev->phy_id == 0) && !fixed_link) { netdev_err(dev, "Bad PHY UID 0x%08x\n", phydev->phy_id); phy_disconnect(phydev); return -ENODEV; -- cgit v0.10.2 From a31196b07f8034eba6a3487a1ad1bb5ec5cd58a5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 25 Apr 2015 09:35:24 -0700 Subject: net: rfs: fix crash in get_rps_cpus() Commit 567e4b79731c ("net: rfs: add hash collision detection") had one mistake : RPS_NO_CPU is no longer the marker for invalid cpu in set_rps_cpu() and get_rps_cpu(), as @next_cpu was the result of an AND with rps_cpu_mask This bug showed up on a host with 72 cpus : next_cpu was 0x7f, and the code was trying to access percpu data of an non existent cpu. In a follow up patch, we might get rid of compares against nr_cpu_ids, if we init the tables with 0. This is silly to test for a very unlikely condition that exists only shortly after table initialization, as we got rid of rps_reset_sock_flow() and similar functions that were writing this RPS_NO_CPU magic value at flow dismantle : When table is old enough, it never contains this value anymore. Fixes: 567e4b79731c ("net: rfs: add hash collision detection") Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Ben Hutchings Signed-off-by: David S. Miller diff --git a/Documentation/networking/scaling.txt b/Documentation/networking/scaling.txt index cbfac09..59f4db2 100644 --- a/Documentation/networking/scaling.txt +++ b/Documentation/networking/scaling.txt @@ -282,7 +282,7 @@ following is true: - The current CPU's queue head counter >= the recorded tail counter value in rps_dev_flow[i] -- The current CPU is unset (equal to RPS_NO_CPU) +- The current CPU is unset (>= nr_cpu_ids) - The current CPU is offline After this check, the packet is sent to the (possibly updated) current diff --git a/net/core/dev.c b/net/core/dev.c index 1796cef5..c7ba038 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3079,7 +3079,7 @@ static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { - if (next_cpu != RPS_NO_CPU) { + if (next_cpu < nr_cpu_ids) { #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; @@ -3184,7 +3184,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: - * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. @@ -3192,14 +3192,14 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && - (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || + (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } - if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; @@ -3240,14 +3240,14 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - int cpu; + unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = ACCESS_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu != RPS_NO_CPU && + if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - rflow->last_qtail) < (int)(10 * flow_table->mask))) -- cgit v0.10.2 From 73b5a6f2a7a1cb78ccdec3900afc8657e11bc6bf Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Sun, 26 Apr 2015 15:55:57 +0300 Subject: net/bonding: Make DRV macros private The bonding modules currently defines four macros with general names that pollute the global namespace: DRV_VERSION DRV_RELDATE DRV_NAME DRV_DESCRIPTION Fixing that by defining a private bonding_priv.h header files which includes those defines. Signed-off-by: Matan Barak Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 78dde56..3a10551 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -82,6 +82,8 @@ #include #include +#include "bonding_priv.h" + /*---------------------------- Module parameters ----------------------------*/ /* monitor all links that often (in milliseconds). <=0 disables monitoring */ diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index 62694cf..b20b35ac 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -4,6 +4,7 @@ #include #include +#include "bonding_priv.h" static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) diff --git a/drivers/net/bonding/bonding_priv.h b/drivers/net/bonding/bonding_priv.h new file mode 100644 index 0000000..5a4d81a --- /dev/null +++ b/drivers/net/bonding/bonding_priv.h @@ -0,0 +1,25 @@ +/* + * Bond several ethernet interfaces into a Cisco, running 'Etherchannel'. + * + * Portions are (c) Copyright 1995 Simon "Guru Aleph-Null" Janes + * NCM: Network and Communications Management, Inc. + * + * BUT, I'm the one who modified it for ethernet, so: + * (c) Copyright 1999, Thomas Davis, tadavis@lbl.gov + * + * This software may be used and distributed according to the terms + * of the GNU Public License, incorporated herein by reference. + * + */ + +#ifndef _BONDING_PRIV_H +#define _BONDING_PRIV_H + +#define DRV_VERSION "3.7.1" +#define DRV_RELDATE "April 27, 2011" +#define DRV_NAME "bonding" +#define DRV_DESCRIPTION "Ethernet Channel Bonding Driver" + +#define bond_version DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n" + +#endif diff --git a/include/net/bonding.h b/include/net/bonding.h index fda6fee..78ed135 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -30,13 +30,6 @@ #include #include -#define DRV_VERSION "3.7.1" -#define DRV_RELDATE "April 27, 2011" -#define DRV_NAME "bonding" -#define DRV_DESCRIPTION "Ethernet Channel Bonding Driver" - -#define bond_version DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n" - #define BOND_MAX_ARP_TARGETS 16 #define BOND_DEFAULT_MIIMON 100 -- cgit v0.10.2 From 325301892a2d348323e09598ae108ba26889f7f9 Mon Sep 17 00:00:00 2001 From: Sylvain Rochet Date: Sun, 26 Apr 2015 20:40:52 +0200 Subject: ppp: mppe: sanity error path rework We are going to need sanity error path a little further, rework to be able to use the sanity error path anywhere in decompressor. Signed-off-by: Sylvain Rochet Signed-off-by: David S. Miller diff --git a/drivers/net/ppp/ppp_mppe.c b/drivers/net/ppp/ppp_mppe.c index 911b216..692ee0f 100644 --- a/drivers/net/ppp/ppp_mppe.c +++ b/drivers/net/ppp/ppp_mppe.c @@ -478,7 +478,6 @@ mppe_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, struct blkcipher_desc desc = { .tfm = state->arc4 }; unsigned ccount; int flushed = MPPE_BITS(ibuf) & MPPE_BIT_FLUSHED; - int sanity = 0; struct scatterlist sg_in[1], sg_out[1]; if (isize <= PPP_HDRLEN + MPPE_OVHD) { @@ -514,31 +513,19 @@ mppe_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, "mppe_decompress[%d]: ENCRYPTED bit not set!\n", state->unit); state->sanity_errors += 100; - sanity = 1; + goto sanity_error; } if (!state->stateful && !flushed) { printk(KERN_DEBUG "mppe_decompress[%d]: FLUSHED bit not set in " "stateless mode!\n", state->unit); state->sanity_errors += 100; - sanity = 1; + goto sanity_error; } if (state->stateful && ((ccount & 0xff) == 0xff) && !flushed) { printk(KERN_DEBUG "mppe_decompress[%d]: FLUSHED bit not set on " "flag packet!\n", state->unit); state->sanity_errors += 100; - sanity = 1; - } - - if (sanity) { - if (state->sanity_errors < SANITY_MAX) - return DECOMP_ERROR; - else - /* - * Take LCP down if the peer is sending too many bogons. - * We don't want to do this for a single or just a few - * instances since it could just be due to packet corruption. - */ - return DECOMP_FATALERROR; + goto sanity_error; } /* @@ -649,6 +636,16 @@ mppe_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, state->sanity_errors >>= 1; return osize; + +sanity_error: + if (state->sanity_errors < SANITY_MAX) + return DECOMP_ERROR; + else + /* Take LCP down if the peer is sending too many bogons. + * We don't want to do this for a single or just a few + * instances since it could just be due to packet corruption. + */ + return DECOMP_FATALERROR; } /* -- cgit v0.10.2 From 03654763148f9a3878b8b70c30d1ffce2fca3dff Mon Sep 17 00:00:00 2001 From: Sylvain Rochet Date: Sun, 26 Apr 2015 20:40:53 +0200 Subject: ppp: mppe: discard late packet in stateless mode When PPP is used over a link which does not guarantee packet ordering, we might get late MPPE packets. This is a problem because MPPE must be kept synchronized and the current implementation does not drop them and rekey 4095 times instead of 0, which is wrong. In order to prevent rekeying about a whole count space times (~ 4095 times), drop packets which are not within the forward 4096/2 window and increase sanity error counter. Signed-off-by: Sylvain Rochet Signed-off-by: David S. Miller diff --git a/drivers/net/ppp/ppp_mppe.c b/drivers/net/ppp/ppp_mppe.c index 692ee0f..05005c6 100644 --- a/drivers/net/ppp/ppp_mppe.c +++ b/drivers/net/ppp/ppp_mppe.c @@ -533,6 +533,13 @@ mppe_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, */ if (!state->stateful) { + /* Discard late packet */ + if ((ccount - state->ccount) % MPPE_CCOUNT_SPACE + > MPPE_CCOUNT_SPACE / 2) { + state->sanity_errors++; + goto sanity_error; + } + /* RFC 3078, sec 8.1. Rekey for every packet. */ while (state->ccount != ccount) { mppe_rekey(state, 0); -- cgit v0.10.2 From 129d23a56623eea0947a05288158d76dc7f2f0ac Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 27 Apr 2015 13:20:34 -0400 Subject: netfilter; Add some missing default cases to switch statements in nft_reject. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes: ==================== net/netfilter/nft_reject.c: In function ‘nft_reject_dump’: net/netfilter/nft_reject.c:61:2: warning: enumeration value ‘NFT_REJECT_TCP_RST’ not handled in switch [-Wswitch] switch (priv->type) { ^ net/netfilter/nft_reject.c:61:2: warning: enumeration value ‘NFT_REJECT_ICMPX_UNREACH’ not handled in switch [-Wswi\ tch] net/netfilter/nft_reject_inet.c: In function ‘nft_reject_inet_dump’: net/netfilter/nft_reject_inet.c:105:2: warning: enumeration value ‘NFT_REJECT_TCP_RST’ not handled in switch [-Wswi\ tch] switch (priv->type) { ^ ==================== Signed-off-by: David S. Miller diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 57d3e1a..0522fc9 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -63,6 +63,8 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; + default: + break; } return 0; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 62cabee..635dbba 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -108,6 +108,8 @@ static int nft_reject_inet_dump(struct sk_buff *skb, if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; + default: + break; } return 0; -- cgit v0.10.2 From 33df10e2ee414183953cc89453bf9c85027a188b Mon Sep 17 00:00:00 2001 From: Rojhalat Ibrahim Date: Mon, 27 Apr 2015 10:37:31 +0200 Subject: mdio-mux-gpio: use new gpiod_get_array and gpiod_put_array functions Use the new gpiod_get_array and gpiod_put_array functions (added to mainline in the v4.1 merge window) for obtaining and disposing of GPIO descriptors. Cc: David Miller Cc: Linus Walleij Signed-off-by: Rojhalat Ibrahim Signed-off-by: David S. Miller diff --git a/drivers/net/phy/mdio-mux-gpio.c b/drivers/net/phy/mdio-mux-gpio.c index 1a87a58..66edd99 100644 --- a/drivers/net/phy/mdio-mux-gpio.c +++ b/drivers/net/phy/mdio-mux-gpio.c @@ -12,33 +12,30 @@ #include #include #include -#include +#include #define DRV_VERSION "1.1" #define DRV_DESCRIPTION "GPIO controlled MDIO bus multiplexer driver" -#define MDIO_MUX_GPIO_MAX_BITS 8 - struct mdio_mux_gpio_state { - struct gpio_desc *gpio[MDIO_MUX_GPIO_MAX_BITS]; - unsigned int num_gpios; + struct gpio_descs *gpios; void *mux_handle; }; static int mdio_mux_gpio_switch_fn(int current_child, int desired_child, void *data) { - int values[MDIO_MUX_GPIO_MAX_BITS]; - unsigned int n; struct mdio_mux_gpio_state *s = data; + int values[s->gpios->ndescs]; + unsigned int n; if (current_child == desired_child) return 0; - for (n = 0; n < s->num_gpios; n++) { + for (n = 0; n < s->gpios->ndescs; n++) values[n] = (desired_child >> n) & 1; - } - gpiod_set_array_cansleep(s->num_gpios, s->gpio, values); + + gpiod_set_array_cansleep(s->gpios->ndescs, s->gpios->desc, values); return 0; } @@ -46,56 +43,33 @@ static int mdio_mux_gpio_switch_fn(int current_child, int desired_child, static int mdio_mux_gpio_probe(struct platform_device *pdev) { struct mdio_mux_gpio_state *s; - int num_gpios; - unsigned int n; int r; - if (!pdev->dev.of_node) - return -ENODEV; - - num_gpios = of_gpio_count(pdev->dev.of_node); - if (num_gpios <= 0 || num_gpios > MDIO_MUX_GPIO_MAX_BITS) - return -ENODEV; - s = devm_kzalloc(&pdev->dev, sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; - s->num_gpios = num_gpios; - - for (n = 0; n < num_gpios; ) { - struct gpio_desc *gpio = gpiod_get_index(&pdev->dev, NULL, n, - GPIOD_OUT_LOW); - if (IS_ERR(gpio)) { - r = PTR_ERR(gpio); - goto err; - } - s->gpio[n] = gpio; - n++; - } + s->gpios = gpiod_get_array(&pdev->dev, NULL, GPIOD_OUT_LOW); + if (IS_ERR(s->gpios)) + return PTR_ERR(s->gpios); r = mdio_mux_init(&pdev->dev, mdio_mux_gpio_switch_fn, &s->mux_handle, s); - if (r == 0) { - pdev->dev.platform_data = s; - return 0; - } -err: - while (n) { - n--; - gpiod_put(s->gpio[n]); + if (r != 0) { + gpiod_put_array(s->gpios); + return r; } - return r; + + pdev->dev.platform_data = s; + return 0; } static int mdio_mux_gpio_remove(struct platform_device *pdev) { - unsigned int n; struct mdio_mux_gpio_state *s = dev_get_platdata(&pdev->dev); mdio_mux_uninit(s->mux_handle); - for (n = 0; n < s->num_gpios; n++) - gpiod_put(s->gpio[n]); + gpiod_put_array(s->gpios); return 0; } -- cgit v0.10.2 From b37069090b7c5615610a8aa6b36533d67b364d38 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Mon, 27 Apr 2015 13:40:56 +0300 Subject: net/mlx4_en: Prevent setting invalid RSS hash function mlx4_en_check_rxfh_func() was checking for hardware support before setting a known RSS hash function, but didn't do any check before setting unknown RSS hash function. Need to make it fail on such values. In this occasion, moved the actual setting of the new value from the check function into mlx4_en_set_rxfh(). Fixes: 947cbb0 ("net/mlx4_en: Support for configurable RSS hash function") Signed-off-by: Amir Vadai Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index 3f44e2b..a2ddf3d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -1102,20 +1102,21 @@ static int mlx4_en_check_rxfh_func(struct net_device *dev, u8 hfunc) struct mlx4_en_priv *priv = netdev_priv(dev); /* check if requested function is supported by the device */ - if ((hfunc == ETH_RSS_HASH_TOP && - !(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) || - (hfunc == ETH_RSS_HASH_XOR && - !(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_XOR))) - return -EINVAL; + if (hfunc == ETH_RSS_HASH_TOP) { + if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) + return -EINVAL; + if (!(dev->features & NETIF_F_RXHASH)) + en_warn(priv, "Toeplitz hash function should be used in conjunction with RX hashing for optimal performance\n"); + return 0; + } else if (hfunc == ETH_RSS_HASH_XOR) { + if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_XOR)) + return -EINVAL; + if (dev->features & NETIF_F_RXHASH) + en_warn(priv, "Enabling both XOR Hash function and RX Hashing can limit RPS functionality\n"); + return 0; + } - priv->rss_hash_fn = hfunc; - if (hfunc == ETH_RSS_HASH_TOP && !(dev->features & NETIF_F_RXHASH)) - en_warn(priv, - "Toeplitz hash function should be used in conjunction with RX hashing for optimal performance\n"); - if (hfunc == ETH_RSS_HASH_XOR && (dev->features & NETIF_F_RXHASH)) - en_warn(priv, - "Enabling both XOR Hash function and RX Hashing can limit RPS functionality\n"); - return 0; + return -EINVAL; } static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key, @@ -1189,6 +1190,8 @@ static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index, priv->prof->rss_rings = rss_rings; if (key) memcpy(priv->rss_key, key, MLX4_EN_RSS_KEY_SIZE); + if (hfunc != ETH_RSS_HASH_NO_CHANGE) + priv->rss_hash_fn = hfunc; if (port_up) { err = mlx4_en_start_port(dev); -- cgit v0.10.2 From 94435f764cc5838a7e94008f17628ad63384bf06 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Mon, 27 Apr 2015 23:14:57 +0900 Subject: net:treewide: Fix typo in drivers/net This patch fix spelling typo in printk. Signed-off-by: Masanari Iida Signed-off-by: David S. Miller diff --git a/drivers/net/can/usb/kvaser_usb.c b/drivers/net/can/usb/kvaser_usb.c index 4643914..8b17a90 100644 --- a/drivers/net/can/usb/kvaser_usb.c +++ b/drivers/net/can/usb/kvaser_usb.c @@ -1102,7 +1102,7 @@ static void kvaser_usb_rx_can_err(const struct kvaser_usb_net_priv *priv, if (msg->u.rx_can_header.flag & (MSG_FLAG_ERROR_FRAME | MSG_FLAG_NERR)) { - netdev_err(priv->netdev, "Unknow error (flags: 0x%02x)\n", + netdev_err(priv->netdev, "Unknown error (flags: 0x%02x)\n", msg->u.rx_can_header.flag); stats->rx_errors++; diff --git a/drivers/net/ethernet/8390/etherh.c b/drivers/net/ethernet/8390/etherh.c index b36ee9e..d686b9c 100644 --- a/drivers/net/ethernet/8390/etherh.c +++ b/drivers/net/ethernet/8390/etherh.c @@ -523,7 +523,7 @@ static int etherh_addr(char *addr, struct expansion_card *ec) char *s; if (!ecard_readchunk(&cd, ec, 0xf5, 0)) { - printk(KERN_ERR "%s: unable to read podule description string\n", + printk(KERN_ERR "%s: unable to read module description string\n", dev_name(&ec->dev)); goto no_addr; } -- cgit v0.10.2 From 22a8f237c0551bae95ffcd2a7ff17d6f5fcce7e7 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Mon, 27 Apr 2015 17:20:38 +0200 Subject: bnx2x: really disable TPA if 'disable_tpa' option is set bnx2x's 'disable_tpa=1' module option is not respected properly and TPA (transparent packet aggregation) remains enabled. Even though the module option causes LRO to be disabled, TPA is enabled in GRO mode. Additionally, disabling GRO via ethtool then has no effect. One can still observe tpa_* statistics increase and large packets being received in tcpdump. The bug was an unintended consequence of commit aebf6244cd39 "bnx2x: Be more forgiving toward SW GRO". Fix it by following the bp->disable_tpa flag when initializing fp's. Signed-off-by: Michal Schmidt Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 6f7dc81..3558a36 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -2485,8 +2485,10 @@ static void bnx2x_bz_fp(struct bnx2x *bp, int index) else if (bp->flags & GRO_ENABLE_FLAG) fp->mode = TPA_MODE_GRO; - /* We don't want TPA on an FCoE L2 ring */ - if (IS_FCOE_FP(fp)) + /* We don't want TPA if it's disabled in bp + * or if this is an FCoE L2 ring. + */ + if (bp->disable_tpa || IS_FCOE_FP(fp)) fp->disable_tpa = 1; } -- cgit v0.10.2