From 1a0150a93c496986297fc08304ac564213c08942 Mon Sep 17 00:00:00 2001 From: "brenohl@br.ibm.com" Date: Fri, 27 Jul 2012 08:54:52 +0000 Subject: qlge: Add offload features to vlan interfaces This patch fills the net_device vlan_features with the proper hardware features, thus, improving the vlan interface performance. With the patch applied, I can see around 148% improvement on a TCP_STREAM test, from 3.5 Gb/s to 8.7 Gb/s. On TCP_RR, I see a 11% improvement, from 18k to 20. The CPU utilization is almost the same on both cases, from the comparison above. Signed-off-by: Breno Leitao Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index 3769f57..b53a3b6 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -4682,6 +4682,7 @@ static int __devinit qlge_probe(struct pci_dev *pdev, NETIF_F_HW_VLAN_TX | NETIF_F_RXCSUM; ndev->features = ndev->hw_features | NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER; + ndev->vlan_features = ndev->hw_features; if (test_bit(QL_DMA64, &qdev->flags)) ndev->features |= NETIF_F_HIGHDMA; -- cgit v0.10.2 From 4ea4bf7ebcbacee2f4736d261efb0693e87a34c9 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sun, 29 Jul 2012 01:19:55 +0000 Subject: ipv4: fix debug info in tnode_new It should print size of struct rt_trie_node * allocated instead of size of struct rt_trie_node. Signed-off-by: Lin Ming Signed-off-by: David S. Miller diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 18cbc15..2a6fdc2 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -473,7 +473,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) } pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), - sizeof(struct rt_trie_node) << bits); + sizeof(struct rt_trie_node *) << bits); return tn; } -- cgit v0.10.2 From 61648d91fc278fd1d500da8061d17e6920cd3500 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sun, 29 Jul 2012 02:00:03 +0000 Subject: ipv4: clean up put_child The first parameter struct trie *t is not used anymore. Remove it. Signed-off-by: Lin Ming Signed-off-by: David S. Miller diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 2a6fdc2..f0cdb30 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -159,7 +159,6 @@ struct trie { #endif }; -static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull); static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); @@ -490,7 +489,7 @@ static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node * return ((struct tnode *) n)->pos == tn->pos + tn->bits; } -static inline void put_child(struct trie *t, struct tnode *tn, int i, +static inline void put_child(struct tnode *tn, int i, struct rt_trie_node *n) { tnode_put_child_reorg(tn, i, n, -1); @@ -754,8 +753,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) goto nomem; } - put_child(t, tn, 2*i, (struct rt_trie_node *) left); - put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); + put_child(tn, 2*i, (struct rt_trie_node *) left); + put_child(tn, 2*i+1, (struct rt_trie_node *) right); } } @@ -776,9 +775,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 1) == 0) - put_child(t, tn, 2*i, node); + put_child(tn, 2*i, node); else - put_child(t, tn, 2*i+1, node); + put_child(tn, 2*i+1, node); continue; } @@ -786,8 +785,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) inode = (struct tnode *) node; if (inode->bits == 1) { - put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); - put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); + put_child(tn, 2*i, rtnl_dereference(inode->child[0])); + put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); tnode_free_safe(inode); continue; @@ -817,22 +816,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) */ left = (struct tnode *) tnode_get_child(tn, 2*i); - put_child(t, tn, 2*i, NULL); + put_child(tn, 2*i, NULL); BUG_ON(!left); right = (struct tnode *) tnode_get_child(tn, 2*i+1); - put_child(t, tn, 2*i+1, NULL); + put_child(tn, 2*i+1, NULL); BUG_ON(!right); size = tnode_child_length(left); for (j = 0; j < size; j++) { - put_child(t, left, j, rtnl_dereference(inode->child[j])); - put_child(t, right, j, rtnl_dereference(inode->child[j + size])); + put_child(left, j, rtnl_dereference(inode->child[j])); + put_child(right, j, rtnl_dereference(inode->child[j + size])); } - put_child(t, tn, 2*i, resize(t, left)); - put_child(t, tn, 2*i+1, resize(t, right)); + put_child(tn, 2*i, resize(t, left)); + put_child(tn, 2*i+1, resize(t, right)); tnode_free_safe(inode); } @@ -877,7 +876,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (!newn) goto nomem; - put_child(t, tn, i/2, (struct rt_trie_node *)newn); + put_child(tn, i/2, (struct rt_trie_node *)newn); } } @@ -892,21 +891,21 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (left == NULL) { if (right == NULL) /* Both are empty */ continue; - put_child(t, tn, i/2, right); + put_child(tn, i/2, right); continue; } if (right == NULL) { - put_child(t, tn, i/2, left); + put_child(tn, i/2, left); continue; } /* Two nonempty children */ newBinNode = (struct tnode *) tnode_get_child(tn, i/2); - put_child(t, tn, i/2, NULL); - put_child(t, newBinNode, 0, left); - put_child(t, newBinNode, 1, right); - put_child(t, tn, i/2, resize(t, newBinNode)); + put_child(tn, i/2, NULL); + put_child(newBinNode, 0, left); + put_child(newBinNode, 1, right); + put_child(tn, i/2, resize(t, newBinNode)); } tnode_free_safe(oldtnode); return tn; @@ -1125,7 +1124,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) node_set_parent((struct rt_trie_node *)l, tp); cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, tp, cindex, (struct rt_trie_node *)l); + put_child(tp, cindex, (struct rt_trie_node *)l); } else { /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ /* @@ -1155,12 +1154,12 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) node_set_parent((struct rt_trie_node *)tn, tp); missbit = tkey_extract_bits(key, newpos, 1); - put_child(t, tn, missbit, (struct rt_trie_node *)l); - put_child(t, tn, 1-missbit, n); + put_child(tn, missbit, (struct rt_trie_node *)l); + put_child(tn, 1-missbit, n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, tp, cindex, (struct rt_trie_node *)tn); + put_child(tp, cindex, (struct rt_trie_node *)tn); } else { rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tp = tn; @@ -1619,7 +1618,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l) if (tp) { t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); - put_child(t, tp, cindex, NULL); + put_child(tp, cindex, NULL); trie_rebalance(t, tp); } else RCU_INIT_POINTER(t->trie, NULL); -- cgit v0.10.2 From ea4b3857861fe147137517da200d04cbbc91ad49 Mon Sep 17 00:00:00 2001 From: Devendra Naga Date: Sun, 29 Jul 2012 03:19:23 +0000 Subject: bnx2x: remove cast around the kmalloc in bnx2x_prev_mark_path casting the void pointer is redundant (Documentation/CodingStyle) Signed-off-by: Devendra Naga Acked-by: Eilon Greenstein Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 9aaf863..dd451c3 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -9360,8 +9360,7 @@ static int __devinit bnx2x_prev_mark_path(struct bnx2x *bp) struct bnx2x_prev_path_list *tmp_list; int rc; - tmp_list = (struct bnx2x_prev_path_list *) - kmalloc(sizeof(struct bnx2x_prev_path_list), GFP_KERNEL); + tmp_list = kmalloc(sizeof(struct bnx2x_prev_path_list), GFP_KERNEL); if (!tmp_list) { BNX2X_ERR("Failed to allocate 'bnx2x_prev_path_list'\n"); return -ENOMEM; -- cgit v0.10.2 From 17a2bf798621ce8579f7563deaf4640f15931d0e Mon Sep 17 00:00:00 2001 From: Devendra Naga Date: Sun, 29 Jul 2012 03:28:47 +0000 Subject: seeq: use PTR_RET at init_module of driver the driver sees wether the dev_seeq pointer is having a error that can be read by using the PTR_ERR, and returns it at error case, other wise 0 at success case. the PTR_RET does the same thing, and use PTR_RET instead of redoing the code of PTR_RET Signed-off-by: Devendra Naga Acked-by: David Howells Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/seeq/seeq8005.c b/drivers/net/ethernet/seeq/seeq8005.c index 698edbb..d6e50de 100644 --- a/drivers/net/ethernet/seeq/seeq8005.c +++ b/drivers/net/ethernet/seeq/seeq8005.c @@ -736,9 +736,7 @@ MODULE_PARM_DESC(irq, "SEEQ 8005 IRQ number"); int __init init_module(void) { dev_seeq = seeq8005_probe(-1); - if (IS_ERR(dev_seeq)) - return PTR_ERR(dev_seeq); - return 0; + return PTR_RET(dev_seeq); } void __exit cleanup_module(void) -- cgit v0.10.2 From b41a9a66f67817f8acd85bd650e012a14da39faa Mon Sep 17 00:00:00 2001 From: Karsten Keil Date: Sun, 29 Jul 2012 07:15:13 +0000 Subject: mISDN: Bugfix only few bytes are transfered on a connection The test for the fillempty condition was wrong in one place. Changed the variable to the right boolean type. Signed-off-by: Karsten Keil Signed-off-by: David S. Miller diff --git a/drivers/isdn/hardware/mISDN/avmfritz.c b/drivers/isdn/hardware/mISDN/avmfritz.c index c08fc60..fa6ca47 100644 --- a/drivers/isdn/hardware/mISDN/avmfritz.c +++ b/drivers/isdn/hardware/mISDN/avmfritz.c @@ -449,7 +449,8 @@ hdlc_fill_fifo(struct bchannel *bch) { struct fritzcard *fc = bch->hw; struct hdlc_hw *hdlc; - int count, fs, cnt = 0, idx, fillempty = 0; + int count, fs, cnt = 0, idx; + bool fillempty = false; u8 *p; u32 *ptr, val, addr; @@ -462,7 +463,7 @@ hdlc_fill_fifo(struct bchannel *bch) return; count = fs; p = bch->fill; - fillempty = 1; + fillempty = true; } else { count = bch->tx_skb->len - bch->tx_idx; if (count <= 0) @@ -477,7 +478,7 @@ hdlc_fill_fifo(struct bchannel *bch) hdlc->ctrl.sr.cmd |= HDLC_CMD_XME; } ptr = (u32 *)p; - if (fillempty) { + if (!fillempty) { pr_debug("%s.B%d: %d/%d/%d", fc->name, bch->nr, count, bch->tx_idx, bch->tx_skb->len); bch->tx_idx += count; -- cgit v0.10.2 From 8253947e2cdfb14717c9212b751b7aec9ea9ef5e Mon Sep 17 00:00:00 2001 From: Li Wei Date: Sun, 29 Jul 2012 16:01:30 +0000 Subject: ipv6: fix incorrect route 'expires' value passed to userspace When userspace use RTM_GETROUTE to dump route table, with an already expired route entry, we always got an 'expires' value(2147157) calculated base on INT_MAX. The reason of this problem is in the following satement: rt->dst.expires - jiffies < INT_MAX gcc promoted the type of both sides of '<' to unsigned long, thus a small negative value would be considered greater than INT_MAX. With the help of Eric Dumazet, do the out of bound checks in rtnl_put_cacheinfo(), _after_ conversion to clock_t. Signed-off-by: Li Wei Signed-off-by: David S. Miller diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bc9e380..5ff949d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -625,9 +625,13 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, .rta_id = id, }; - if (expires) - ci.rta_expires = jiffies_to_clock_t(expires); + if (expires) { + unsigned long clock; + clock = jiffies_to_clock_t(abs(expires)); + clock = min_t(unsigned long, clock, INT_MAX); + ci.rta_expires = (expires > 0) ? clock : -clock; + } return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index cf02cb9..8e80fd2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2480,12 +2480,8 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) goto nla_put_failure; - if (!(rt->rt6i_flags & RTF_EXPIRES)) - expires = 0; - else if (rt->dst.expires - jiffies < INT_MAX) - expires = rt->dst.expires - jiffies; - else - expires = INT_MAX; + + expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto nla_put_failure; -- cgit v0.10.2 From 8151ad576d34a5ec1f1068edf25f3b7c47f6edab Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 29 Jul 2012 19:15:41 +0000 Subject: tg3: Request APE_LOCK_PHY before PHY access to prevent PHY access conflict with APE firmware. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 9a009fd..a528f9a 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -672,6 +672,12 @@ static int tg3_ape_lock(struct tg3 *tp, int locknum) else bit = 1 << tp->pci_fn; break; + case TG3_APE_LOCK_PHY0: + case TG3_APE_LOCK_PHY1: + case TG3_APE_LOCK_PHY2: + case TG3_APE_LOCK_PHY3: + bit = APE_LOCK_REQ_DRIVER; + break; default: return -EINVAL; } @@ -723,6 +729,12 @@ static void tg3_ape_unlock(struct tg3 *tp, int locknum) else bit = 1 << tp->pci_fn; break; + case TG3_APE_LOCK_PHY0: + case TG3_APE_LOCK_PHY1: + case TG3_APE_LOCK_PHY2: + case TG3_APE_LOCK_PHY3: + bit = APE_LOCK_GRANT_DRIVER; + break; default: return; } @@ -1052,6 +1064,8 @@ static int tg3_readphy(struct tg3 *tp, int reg, u32 *val) udelay(80); } + tg3_ape_lock(tp, tp->phy_ape_lock); + *val = 0x0; frame_val = ((tp->phy_addr << MI_COM_PHY_ADDR_SHIFT) & @@ -1086,6 +1100,8 @@ static int tg3_readphy(struct tg3 *tp, int reg, u32 *val) udelay(80); } + tg3_ape_unlock(tp, tp->phy_ape_lock); + return ret; } @@ -1105,6 +1121,8 @@ static int tg3_writephy(struct tg3 *tp, int reg, u32 val) udelay(80); } + tg3_ape_lock(tp, tp->phy_ape_lock); + frame_val = ((tp->phy_addr << MI_COM_PHY_ADDR_SHIFT) & MI_COM_PHY_ADDR_MASK); frame_val |= ((reg << MI_COM_REG_ADDR_SHIFT) & @@ -1135,6 +1153,8 @@ static int tg3_writephy(struct tg3 *tp, int reg, u32 val) udelay(80); } + tg3_ape_unlock(tp, tp->phy_ape_lock); + return ret; } @@ -13648,6 +13668,23 @@ static int __devinit tg3_phy_probe(struct tg3 *tp) tg3_flag_set(tp, PAUSE_AUTONEG); tp->link_config.flowctrl = FLOW_CTRL_TX | FLOW_CTRL_RX; + if (tg3_flag(tp, ENABLE_APE)) { + switch (tp->pci_fn) { + case 0: + tp->phy_ape_lock = TG3_APE_LOCK_PHY0; + break; + case 1: + tp->phy_ape_lock = TG3_APE_LOCK_PHY1; + break; + case 2: + tp->phy_ape_lock = TG3_APE_LOCK_PHY2; + break; + case 3: + tp->phy_ape_lock = TG3_APE_LOCK_PHY3; + break; + } + } + if (tg3_flag(tp, USE_PHYLIB)) return tg3_phy_init(tp); diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h index a1b75cd..6fb45a5 100644 --- a/drivers/net/ethernet/broadcom/tg3.h +++ b/drivers/net/ethernet/broadcom/tg3.h @@ -3107,6 +3107,7 @@ struct tg3 { int old_link; u8 phy_addr; + u8 phy_ape_lock; /* PHY info */ u32 phy_id; -- cgit v0.10.2 From 10ce95d6ef36c65df7dcd3b8fcf86913f8b298bd Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 29 Jul 2012 19:15:42 +0000 Subject: tg3: Fix Read DMA workaround for 5719 A0. The workaround was mis-applied to all 5719 and 5720 chips. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index a528f9a..f0434df 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -9086,8 +9086,7 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy) GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_57780 || tg3_flag(tp, 57765_PLUS)) { val = tr32(TG3_RDMA_RSRVCTRL_REG); - if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5719 || - GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5720) { + if (tp->pci_chip_rev_id == CHIPREV_ID_5719_A0) { val &= ~(TG3_RDMA_RSRVCTRL_TXMRGN_MASK | TG3_RDMA_RSRVCTRL_FIFO_LWM_MASK | TG3_RDMA_RSRVCTRL_FIFO_HWM_MASK); -- cgit v0.10.2 From 091f0ea30074bc43f9250961b3247af713024bc6 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 29 Jul 2012 19:15:43 +0000 Subject: tg3: Add New 5719 Read DMA workaround After Power-on-reset, the 5719's TX DMA length registers may contain uninitialized values and cause TX DMA to stall. Check for invalid values and set a register bit to flush the TX channels. The bit needs to be turned off after the DMA channels have been flushed. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index f0434df..50045ed 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -9276,6 +9276,19 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy) tw32_f(RDMAC_MODE, rdmac_mode); udelay(40); + if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5719) { + for (i = 0; i < TG3_NUM_RDMA_CHANNELS; i++) { + if (tr32(TG3_RDMA_LENGTH + (i << 2)) > TG3_MAX_MTU(tp)) + break; + } + if (i < TG3_NUM_RDMA_CHANNELS) { + val = tr32(TG3_LSO_RD_DMA_CRPTEN_CTRL); + val |= TG3_LSO_RD_DMA_TX_LENGTH_WA; + tw32(TG3_LSO_RD_DMA_CRPTEN_CTRL, val); + tg3_flag_set(tp, 5719_RDMA_BUG); + } + } + tw32(RCVDCC_MODE, RCVDCC_MODE_ENABLE | RCVDCC_MODE_ATTN_ENABLE); if (!tg3_flag(tp, 5705_PLUS)) tw32(MBFREE_MODE, MBFREE_MODE_ENABLE); @@ -9635,6 +9648,16 @@ static void tg3_periodic_fetch_stats(struct tg3 *tp) TG3_STAT_ADD32(&sp->tx_ucast_packets, MAC_TX_STATS_UCAST); TG3_STAT_ADD32(&sp->tx_mcast_packets, MAC_TX_STATS_MCAST); TG3_STAT_ADD32(&sp->tx_bcast_packets, MAC_TX_STATS_BCAST); + if (unlikely(tg3_flag(tp, 5719_RDMA_BUG) && + (sp->tx_ucast_packets.low + sp->tx_mcast_packets.low + + sp->tx_bcast_packets.low) > TG3_NUM_RDMA_CHANNELS)) { + u32 val; + + val = tr32(TG3_LSO_RD_DMA_CRPTEN_CTRL); + val &= ~TG3_LSO_RD_DMA_TX_LENGTH_WA; + tw32(TG3_LSO_RD_DMA_CRPTEN_CTRL, val); + tg3_flag_clear(tp, 5719_RDMA_BUG); + } TG3_STAT_ADD32(&sp->rx_octets, MAC_RX_STATS_OCTETS); TG3_STAT_ADD32(&sp->rx_fragments, MAC_RX_STATS_FRAGMENTS); diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h index 6fb45a5..6d52cb2 100644 --- a/drivers/net/ethernet/broadcom/tg3.h +++ b/drivers/net/ethernet/broadcom/tg3.h @@ -1376,7 +1376,11 @@ #define TG3_LSO_RD_DMA_CRPTEN_CTRL 0x00004910 #define TG3_LSO_RD_DMA_CRPTEN_CTRL_BLEN_BD_4K 0x00030000 #define TG3_LSO_RD_DMA_CRPTEN_CTRL_BLEN_LSO_4K 0x000c0000 -/* 0x4914 --> 0x4c00 unused */ +#define TG3_LSO_RD_DMA_TX_LENGTH_WA 0x02000000 +/* 0x4914 --> 0x4be0 unused */ + +#define TG3_NUM_RDMA_CHANNELS 4 +#define TG3_RDMA_LENGTH 0x00004be0 /* Write DMA control registers */ #define WDMAC_MODE 0x00004c00 @@ -2959,6 +2963,7 @@ enum TG3_FLAGS { TG3_FLAG_L1PLLPD_EN, TG3_FLAG_APE_HAS_NCSI, TG3_FLAG_4K_FIFO_LIMIT, + TG3_FLAG_5719_RDMA_BUG, TG3_FLAG_RESET_TASK_PENDING, TG3_FLAG_5705_PLUS, TG3_FLAG_IS_5788, -- cgit v0.10.2 From 0f566b208b41918053b2e67399673aaec02dde5d Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 29 Jul 2012 19:15:44 +0000 Subject: tg3: Fix race condition in tg3_get_stats64() Spinlock should be taken before checking for tp->hw_stats. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 50045ed..f03614b 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -12524,10 +12524,12 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev, { struct tg3 *tp = netdev_priv(dev); - if (!tp->hw_stats) + spin_lock_bh(&tp->lock); + if (!tp->hw_stats) { + spin_unlock_bh(&tp->lock); return &tp->net_stats_prev; + } - spin_lock_bh(&tp->lock); tg3_get_nstats(tp, stats); spin_unlock_bh(&tp->lock); -- cgit v0.10.2 From cac83e53917ebc058066eb98023c11fdaa2262dc Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 29 Jul 2012 19:15:45 +0000 Subject: tg3: Update version to 3.124 Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index f03614b..bf906c5 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -92,7 +92,7 @@ static inline void _tg3_flag_clear(enum TG3_FLAGS flag, unsigned long *bits) #define DRV_MODULE_NAME "tg3" #define TG3_MAJ_NUM 3 -#define TG3_MIN_NUM 123 +#define TG3_MIN_NUM 124 #define DRV_MODULE_VERSION \ __stringify(TG3_MAJ_NUM) "." __stringify(TG3_MIN_NUM) #define DRV_MODULE_RELDATE "March 21, 2012" -- cgit v0.10.2 From a117dacde0288f3ec60b6e5bcedae8fa37ee0dfc Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 29 Jul 2012 19:45:14 +0000 Subject: net/tun: fix ioctl() based info leaks The tun module leaks up to 36 bytes of memory by not fully initializing a structure located on the stack that gets copied to user memory by the TUNGETIFF and SIOCGIFHWADDR ioctl()s. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller diff --git a/drivers/net/tun.c b/drivers/net/tun.c index c62163e..f55c462 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1379,9 +1379,11 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, int vnet_hdr_sz; int ret; - if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) + if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; + } else + memset(&ifr, 0, sizeof(ifr)); if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". -- cgit v0.10.2 From 8bbb181308bc348e02bfdbebdedd4e4ec9d452ce Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 30 Jul 2012 14:52:48 -0700 Subject: tun: Fix formatting. Signed-off-by: David S. Miller diff --git a/drivers/net/tun.c b/drivers/net/tun.c index f55c462..926d4db 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1382,9 +1382,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; - } else + } else { memset(&ifr, 0, sizeof(ifr)); - + } if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on -- cgit v0.10.2 From cca32e4bf999a34ac08d959f351f2b30bcd02460 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Jul 2012 21:06:13 +0000 Subject: net: TCP early demux cleanup early_demux() handlers should be called in RCU context, and as we use skb_dst_set_noref(skb, dst), caller must not exit from RCU context before dst use (skb_dst(skb)) or release (skb_drop(dst)) Therefore, rcu_read_lock()/rcu_read_unlock() pairs around ->early_demux() are confusing and not needed : Protocol handlers are already in an RCU read lock section. (__netif_receive_skb() does the rcu_read_lock() ) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 981ff1e..f1395a6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -325,14 +325,12 @@ static int ip_rcv_finish(struct sk_buff *skb) const struct net_protocol *ipprot; int protocol = iph->protocol; - rcu_read_lock(); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->early_demux) { ipprot->early_demux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } - rcu_read_unlock(); } /* diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 47975e3..a52d864 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -52,11 +52,9 @@ int ip6_rcv_finish(struct sk_buff *skb) if (sysctl_ip_early_demux && !skb_dst(skb)) { const struct inet6_protocol *ipprot; - rcu_read_lock(); ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); if (ipprot && ipprot->early_demux) ipprot->early_demux(skb); - rcu_read_unlock(); } if (!skb_dst(skb)) ip6_route_input(skb); -- cgit v0.10.2 From 404e0a8b6a55d5e1cd138c6deb1bca9abdf75d8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Jul 2012 23:20:37 +0000 Subject: net: ipv4: fix RCU races on dst refcounts commit c6cffba4ffa2 (ipv4: Fix input route performance regression.) added various fatal races with dst refcounts. crashes happen on tcp workloads if routes are added/deleted at the same time. The dst_free() calls from free_fib_info_rcu() are clearly racy. We need instead regular dst refcounting (dst_release()) and make sure dst_release() is aware of RCU grace periods : Add DST_RCU_FREE flag so that dst_release() respects an RCU grace period before dst destruction for cached dst Introduce a new inet_sk_rx_dst_set() helper, using atomic_inc_not_zero() to make sure we dont increase a zero refcount (On a dst currently waiting an rcu grace period before destruction) rt_cache_route() must take a reference on the new cached route, and release it if was not able to install it. With this patch, my machines survive various benchmarks. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/include/net/dst.h b/include/net/dst.h index baf5978..31a9fd3 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -61,6 +61,7 @@ struct dst_entry { #define DST_NOPEER 0x0040 #define DST_FAKE_RTABLE 0x0080 #define DST_XFRM_TUNNEL 0x0100 +#define DST_RCU_FREE 0x0200 unsigned short pending_confirm; @@ -382,12 +383,6 @@ static inline void dst_free(struct dst_entry *dst) __dst_free(dst); } -static inline void dst_rcu_free(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst_free(dst); -} - static inline void dst_confirm(struct dst_entry *dst) { dst->pending_confirm = 1; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 613cfa4..e3fd34c 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -249,4 +249,17 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) return flags; } +static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + if (atomic_inc_not_zero(&dst->__refcnt)) { + if (!(dst->flags & DST_RCU_FREE)) + dst->flags |= DST_RCU_FREE; + + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + } +} + #endif /* _INET_SOCK_H */ diff --git a/net/core/dst.c b/net/core/dst.c index 069d51d..d9e33eb 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -258,6 +258,15 @@ again: } EXPORT_SYMBOL(dst_destroy); +static void dst_rcu_destroy(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); +} + void dst_release(struct dst_entry *dst) { if (dst) { @@ -265,10 +274,14 @@ void dst_release(struct dst_entry *dst) newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); + if (unlikely(dst->flags & (DST_NOCACHE | DST_RCU_FREE)) && !newrefcnt) { + if (dst->flags & DST_RCU_FREE) { + call_rcu_bh(&dst->rcu_head, dst_rcu_destroy); + } else { + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); + } } } } @@ -320,11 +333,14 @@ EXPORT_SYMBOL(__dst_destroy_metrics_generic); */ void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { + bool hold; + WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); /* If dst not in cache, we must take a reference, because * dst_release() will destroy dst as soon as its refcount becomes zero */ - if (unlikely(dst->flags & DST_NOCACHE)) { + hold = (dst->flags & (DST_NOCACHE | DST_RCU_FREE)) == DST_NOCACHE; + if (unlikely(hold)) { dst_hold(dst); skb_dst_set(skb, dst); } else { diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 85a3604..2671977 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -184,6 +184,12 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) return dn_rt_hash_mask & (unsigned int)tmp; } +static inline void dst_rcu_free(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + dst_free(dst); +} + static inline void dnrt_free(struct dn_route *rt) { call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index da0cc2e..e55171f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -172,9 +172,9 @@ static void free_fib_info_rcu(struct rcu_head *head) if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); if (nexthop_nh->nh_rth_output) - dst_free(&nexthop_nh->nh_rth_output->dst); + dst_release(&nexthop_nh->nh_rth_output->dst); if (nexthop_nh->nh_rth_input) - dst_free(&nexthop_nh->nh_rth_input->dst); + dst_release(&nexthop_nh->nh_rth_input->dst); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fc1a81c..d6eabcf 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1199,11 +1199,6 @@ restart: fnhe->fnhe_stamp = jiffies; } -static inline void rt_free(struct rtable *rt) -{ - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); -} - static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) { struct rtable *orig, *prev, **p = &nh->nh_rth_output; @@ -1213,17 +1208,14 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) orig = *p; + rt->dst.flags |= DST_RCU_FREE; + dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) - rt_free(orig); + dst_release(&orig->dst); } else { - /* Routes we intend to cache in the FIB nexthop have - * the DST_NOCACHE bit clear. However, if we are - * unsuccessful at storing this route into the cache - * we really need to set it. - */ - rt->dst.flags |= DST_NOCACHE; + dst_release(&rt->dst); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a356e1f..9be30b0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5604,8 +5604,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); if (skb != NULL) { - sk->sk_rx_dst = dst_clone(skb_dst(skb)); - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + inet_sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2fbd992..7f91e5a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1617,19 +1617,19 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) #endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + struct dst_entry *dst = sk->sk_rx_dst; + sock_rps_save_rxhash(sk, skb); - if (sk->sk_rx_dst) { - struct dst_entry *dst = sk->sk_rx_dst; + if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || dst->ops->check(dst, 0) == NULL) { dst_release(dst); sk->sk_rx_dst = NULL; } } - if (unlikely(sk->sk_rx_dst == NULL)) { - sk->sk_rx_dst = dst_clone(skb_dst(skb)); - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - } + if (unlikely(sk->sk_rx_dst == NULL)) + inet_sk_rx_dst_set(sk, skb); + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3f1cc20..232a90c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -387,8 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - newsk->sk_rx_dst = dst_clone(skb_dst(skb)); - inet_sk(newsk)->rx_dst_ifindex = skb->skb_iif; + inet_sk_rx_dst_set(newsk, skb); /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to -- cgit v0.10.2 From 0c7462a2351b4cc502f326aad7fedd04909928be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jul 2012 07:14:29 +0000 Subject: ipv4: remove rt_cache_rebuild_count After IP route cache removal, rt_cache_rebuild_count is no longer used. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 406a522..ca447b3 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -48,12 +48,6 @@ min_adv_mss - INTEGER The advertised MSS depends on the first hop route MTU, but will never be lower than this setting. -rt_cache_rebuild_count - INTEGER - The per net-namespace route cache emergency rebuild threshold. - Any net-namespace having its route cache rebuilt due to - a hash bucket chain being too long more than this many times - will have its route caching disabled - IP Fragmentation: ipfrag_high_thresh - INTEGER diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 0ffb8e3..1474dd6 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -61,8 +61,6 @@ struct netns_ipv4 { int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; int sysctl_icmp_errors_use_inbound_ifaddr; - int sysctl_rt_cache_rebuild_count; - int current_rt_cache_rebuild_count; unsigned int sysctl_ping_group_range[2]; long sysctl_tcp_mem[3]; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5840c32..4b6487a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -784,13 +784,6 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .procname = "rt_cache_rebuild_count", - .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "ping_group_range", .data = &init_net.ipv4.sysctl_ping_group_range, .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range), @@ -829,8 +822,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) table[5].data = &net->ipv4.sysctl_icmp_ratemask; table[6].data = - &net->ipv4.sysctl_rt_cache_rebuild_count; - table[7].data = &net->ipv4.sysctl_ping_group_range; } @@ -842,8 +833,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_ping_group_range[0] = 1; net->ipv4.sysctl_ping_group_range[1] = 0; - net->ipv4.sysctl_rt_cache_rebuild_count = 4; - tcp_init_mem(net); net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); -- cgit v0.10.2 From 5a0d513b622ee41e117fc37e26e27e8ef42e8dae Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 30 Jul 2012 08:55:49 +0000 Subject: bridge: make port attributes const Simple table that can be marked const. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 6229b62..13b36bd 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -27,7 +27,7 @@ struct brport_attribute { }; #define BRPORT_ATTR(_name,_mode,_show,_store) \ -struct brport_attribute brport_attr_##_name = { \ +const struct brport_attribute brport_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = _mode }, \ .show = _show, \ @@ -164,7 +164,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router, store_multicast_router); #endif -static struct brport_attribute *brport_attrs[] = { +static const struct brport_attribute *brport_attrs[] = { &brport_attr_path_cost, &brport_attr_priority, &brport_attr_port_id, @@ -241,7 +241,7 @@ const struct sysfs_ops brport_sysfs_ops = { int br_sysfs_addif(struct net_bridge_port *p) { struct net_bridge *br = p->br; - struct brport_attribute **a; + const struct brport_attribute **a; int err; err = sysfs_create_link(&p->kobj, &br->dev->dev.kobj, -- cgit v0.10.2 From 54764bb647b2e847c512acf8d443df965da35000 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Jul 2012 01:08:23 +0000 Subject: ipv4: Restore old dst_free() behavior. commit 404e0a8b6a55 (net: ipv4: fix RCU races on dst refcounts) tried to solve a race but added a problem at device/fib dismantle time : We really want to call dst_free() as soon as possible, even if sockets still have dst in their cache. dst_release() calls in free_fib_info_rcu() are not welcomed. Root of the problem was that now we also cache output routes (in nh_rth_output), we must use call_rcu() instead of call_rcu_bh() in rt_free(), because output route lookups are done in process context. Based on feedback and initial patch from David Miller (adding another call_rcu_bh() call in fib, but it appears it was not the right fix) I left the inet_sk_rx_dst_set() helper and added __rcu attributes to nh_rth_output and nh_rth_input to better document what is going on in this code. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/include/net/dst.h b/include/net/dst.h index 31a9fd3..baf5978 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -61,7 +61,6 @@ struct dst_entry { #define DST_NOPEER 0x0040 #define DST_FAKE_RTABLE 0x0080 #define DST_XFRM_TUNNEL 0x0100 -#define DST_RCU_FREE 0x0200 unsigned short pending_confirm; @@ -383,6 +382,12 @@ static inline void dst_free(struct dst_entry *dst) __dst_free(dst); } +static inline void dst_rcu_free(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + dst_free(dst); +} + static inline void dst_confirm(struct dst_entry *dst) { dst->pending_confirm = 1; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index e3fd34c..83b567f 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -253,13 +253,9 @@ static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb { struct dst_entry *dst = skb_dst(skb); - if (atomic_inc_not_zero(&dst->__refcnt)) { - if (!(dst->flags & DST_RCU_FREE)) - dst->flags |= DST_RCU_FREE; - - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - } + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } #endif /* _INET_SOCK_H */ diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e69c3a4..e521a03 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -81,8 +81,8 @@ struct fib_nh { __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; - struct rtable *nh_rth_output; - struct rtable *nh_rth_input; + struct rtable __rcu *nh_rth_output; + struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket *nh_exceptions; }; diff --git a/net/core/dst.c b/net/core/dst.c index d9e33eb..069d51d 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -258,15 +258,6 @@ again: } EXPORT_SYMBOL(dst_destroy); -static void dst_rcu_destroy(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); -} - void dst_release(struct dst_entry *dst) { if (dst) { @@ -274,14 +265,10 @@ void dst_release(struct dst_entry *dst) newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & (DST_NOCACHE | DST_RCU_FREE)) && !newrefcnt) { - if (dst->flags & DST_RCU_FREE) { - call_rcu_bh(&dst->rcu_head, dst_rcu_destroy); - } else { - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); - } + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); } } } @@ -333,14 +320,11 @@ EXPORT_SYMBOL(__dst_destroy_metrics_generic); */ void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { - bool hold; - WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); /* If dst not in cache, we must take a reference, because * dst_release() will destroy dst as soon as its refcount becomes zero */ - hold = (dst->flags & (DST_NOCACHE | DST_RCU_FREE)) == DST_NOCACHE; - if (unlikely(hold)) { + if (unlikely(dst->flags & DST_NOCACHE)) { dst_hold(dst); skb_dst_set(skb, dst); } else { diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 2671977..85a3604 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -184,12 +184,6 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) return dn_rt_hash_mask & (unsigned int)tmp; } -static inline void dst_rcu_free(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst_free(dst); -} - static inline void dnrt_free(struct dn_route *rt) { call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e55171f..625cf18 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -161,6 +161,21 @@ static void free_nh_exceptions(struct fib_nh *nh) kfree(hash); } +static void rt_nexthop_free(struct rtable __rcu **rtp) +{ + struct rtable *rt = rcu_dereference_protected(*rtp, 1); + + if (!rt) + return; + + /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); + * because we waited an RCU grace period before calling + * free_fib_info_rcu() + */ + + dst_free(&rt->dst); +} + /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { @@ -171,10 +186,8 @@ static void free_fib_info_rcu(struct rcu_head *head) dev_put(nexthop_nh->nh_dev); if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); - if (nexthop_nh->nh_rth_output) - dst_release(&nexthop_nh->nh_rth_output->dst); - if (nexthop_nh->nh_rth_input) - dst_release(&nexthop_nh->nh_rth_input->dst); + rt_nexthop_free(&nexthop_nh->nh_rth_output); + rt_nexthop_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d6eabcf..2bd1074 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1199,23 +1199,31 @@ restart: fnhe->fnhe_stamp = jiffies; } +static inline void rt_free(struct rtable *rt) +{ + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} + static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) { - struct rtable *orig, *prev, **p = &nh->nh_rth_output; + struct rtable *orig, *prev, **p = (struct rtable **)&nh->nh_rth_output; if (rt_is_input_route(rt)) - p = &nh->nh_rth_input; + p = (struct rtable **)&nh->nh_rth_input; orig = *p; - rt->dst.flags |= DST_RCU_FREE; - dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) - dst_release(&orig->dst); + rt_free(orig); } else { - dst_release(&rt->dst); + /* Routes we intend to cache in the FIB nexthop have + * the DST_NOCACHE bit clear. However, if we are + * unsuccessful at storing this route into the cache + * we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; } } @@ -1412,7 +1420,7 @@ static int __mkroute_input(struct sk_buff *skb, do_cache = false; if (res->fi) { if (!itag) { - rth = FIB_RES_NH(*res).nh_rth_input; + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; @@ -1574,7 +1582,7 @@ local_input: do_cache = false; if (res.fi) { if (!itag) { - rth = FIB_RES_NH(res).nh_rth_input; + rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); err = 0; @@ -1742,7 +1750,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, if (fi) { fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); if (!fnhe) { - rth = FIB_RES_NH(*res).nh_rth_output; + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_output); if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; -- cgit v0.10.2 From d26b3a7c4b3b26319f18bb645de93eba8f4bdcd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Jul 2012 05:45:30 +0000 Subject: ipv4: percpu nh_rth_output cache Input path is mostly run under RCU and doesnt touch dst refcnt But output path on forwarding or UDP workloads hits badly dst refcount, and we have lot of false sharing, for example in ipv4_mtu() when reading rt->rt_pmtu Using a percpu cache for nh_rth_output gives a nice performance increase at a small cost. 24 udpflood test on my 24 cpu machine (dummy0 output device) (each process sends 1.000.000 udp frames, 24 processes are started) before : 5.24 s after : 2.06 s For reference, time on linux-3.5 : 6.60 s Signed-off-by: Eric Dumazet Tested-by: Alexander Duyck Signed-off-by: David S. Miller diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e521a03..e331746 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -21,6 +21,7 @@ #include #include #include +#include struct fib_config { u8 fc_dst_len; @@ -81,7 +82,7 @@ struct fib_nh { __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; - struct rtable __rcu *nh_rth_output; + struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket *nh_exceptions; }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 625cf18..fe2ca02 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -176,6 +176,23 @@ static void rt_nexthop_free(struct rtable __rcu **rtp) dst_free(&rt->dst); } +static void rt_nexthop_free_cpus(struct rtable __rcu * __percpu *rtp) +{ + int cpu; + + if (!rtp) + return; + + for_each_possible_cpu(cpu) { + struct rtable *rt; + + rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); + if (rt) + dst_free(&rt->dst); + } + free_percpu(rtp); +} + /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { @@ -186,7 +203,7 @@ static void free_fib_info_rcu(struct rcu_head *head) dev_put(nexthop_nh->nh_dev); if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); - rt_nexthop_free(&nexthop_nh->nh_rth_output); + rt_nexthop_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_nexthop_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); @@ -817,6 +834,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_nhs = nhs; change_nexthops(fi) { nexthop_nh->nh_parent = fi; + nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); } endfor_nexthops(fi) if (cfg->fc_mx) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2bd1074..4f6276c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1206,11 +1206,15 @@ static inline void rt_free(struct rtable *rt) static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) { - struct rtable *orig, *prev, **p = (struct rtable **)&nh->nh_rth_output; + struct rtable *orig, *prev, **p; - if (rt_is_input_route(rt)) + if (rt_is_input_route(rt)) { p = (struct rtable **)&nh->nh_rth_input; - + } else { + if (!nh->nh_pcpu_rth_output) + goto nocache; + p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); + } orig = *p; prev = cmpxchg(p, orig, rt); @@ -1223,6 +1227,7 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) * unsuccessful at storing this route into the cache * we really need to set it. */ +nocache: rt->dst.flags |= DST_NOCACHE; } } @@ -1749,8 +1754,11 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fnhe = NULL; if (fi) { fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); - if (!fnhe) { - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_output); + if (!fnhe && FIB_RES_NH(*res).nh_pcpu_rth_output) { + struct rtable __rcu **prth; + + prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output); + rth = rcu_dereference(*prth); if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; -- cgit v0.10.2 From c5038a8327b980a5b279fa193163c468011de009 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 31 Jul 2012 15:02:02 -0700 Subject: ipv4: Cache routes in nexthop exception entries. Signed-off-by: David S. Miller diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e331746..926142e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -55,6 +55,7 @@ struct fib_nh_exception { u32 fnhe_pmtu; __be32 fnhe_gw; unsigned long fnhe_expires; + struct rtable __rcu *fnhe_rth; unsigned long fnhe_stamp; }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index fe2ca02..da80dc1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -140,6 +140,21 @@ const struct fib_prop fib_props[RTN_MAX + 1] = { }, }; +static void rt_fibinfo_free(struct rtable __rcu **rtp) +{ + struct rtable *rt = rcu_dereference_protected(*rtp, 1); + + if (!rt) + return; + + /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); + * because we waited an RCU grace period before calling + * free_fib_info_rcu() + */ + + dst_free(&rt->dst); +} + static void free_nh_exceptions(struct fib_nh *nh) { struct fnhe_hash_bucket *hash = nh->nh_exceptions; @@ -153,6 +168,9 @@ static void free_nh_exceptions(struct fib_nh *nh) struct fib_nh_exception *next; next = rcu_dereference_protected(fnhe->fnhe_next, 1); + + rt_fibinfo_free(&fnhe->fnhe_rth); + kfree(fnhe); fnhe = next; @@ -161,22 +179,7 @@ static void free_nh_exceptions(struct fib_nh *nh) kfree(hash); } -static void rt_nexthop_free(struct rtable __rcu **rtp) -{ - struct rtable *rt = rcu_dereference_protected(*rtp, 1); - - if (!rt) - return; - - /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); - * because we waited an RCU grace period before calling - * free_fib_info_rcu() - */ - - dst_free(&rt->dst); -} - -static void rt_nexthop_free_cpus(struct rtable __rcu * __percpu *rtp) +static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) { int cpu; @@ -203,8 +206,8 @@ static void free_fib_info_rcu(struct rcu_head *head) dev_put(nexthop_nh->nh_dev); if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); - rt_nexthop_free_cpus(nexthop_nh->nh_pcpu_rth_output); - rt_nexthop_free(&nexthop_nh->nh_rth_input); + rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); + rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4f6276c..b102eeb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -587,11 +587,17 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, build_sk_flow_key(fl4, sk); } -static DEFINE_SEQLOCK(fnhe_seqlock); +static inline void rt_free(struct rtable *rt) +{ + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} + +static DEFINE_SPINLOCK(fnhe_lock); static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) { struct fib_nh_exception *fnhe, *oldest; + struct rtable *orig; oldest = rcu_dereference(hash->chain); for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; @@ -599,6 +605,11 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) oldest = fnhe; } + orig = rcu_dereference(oldest->fnhe_rth); + if (orig) { + RCU_INIT_POINTER(oldest->fnhe_rth, NULL); + rt_free(orig); + } return oldest; } @@ -620,7 +631,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, int depth; u32 hval = fnhe_hashfun(daddr); - write_seqlock_bh(&fnhe_seqlock); + spin_lock_bh(&fnhe_lock); hash = nh->nh_exceptions; if (!hash) { @@ -667,7 +678,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_stamp = jiffies; out_unlock: - write_sequnlock_bh(&fnhe_seqlock); + spin_unlock_bh(&fnhe_lock); return; } @@ -1167,41 +1178,40 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr) { - __be32 fnhe_daddr, gw; - unsigned long expires; - unsigned int seq; - u32 pmtu; - -restart: - seq = read_seqbegin(&fnhe_seqlock); - fnhe_daddr = fnhe->fnhe_daddr; - gw = fnhe->fnhe_gw; - pmtu = fnhe->fnhe_pmtu; - expires = fnhe->fnhe_expires; - if (read_seqretry(&fnhe_seqlock, seq)) - goto restart; - - if (daddr != fnhe_daddr) - return; + spin_lock_bh(&fnhe_lock); - if (pmtu) { - unsigned long diff = expires - jiffies; + if (daddr == fnhe->fnhe_daddr) { + struct rtable *orig; - if (time_before(jiffies, expires)) { - rt->rt_pmtu = pmtu; - dst_set_expires(&rt->dst, diff); + if (fnhe->fnhe_pmtu) { + unsigned long expires = fnhe->fnhe_expires; + unsigned long diff = expires - jiffies; + + if (time_before(jiffies, expires)) { + rt->rt_pmtu = fnhe->fnhe_pmtu; + dst_set_expires(&rt->dst, diff); + } + } + if (fnhe->fnhe_gw) { + rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_gateway = fnhe->fnhe_gw; } - } - if (gw) { - rt->rt_flags |= RTCF_REDIRECTED; - rt->rt_gateway = gw; - } - fnhe->fnhe_stamp = jiffies; -} -static inline void rt_free(struct rtable *rt) -{ - call_rcu(&rt->dst.rcu_head, dst_rcu_free); + orig = rcu_dereference(fnhe->fnhe_rth); + rcu_assign_pointer(fnhe->fnhe_rth, rt); + if (orig) + rt_free(orig); + + fnhe->fnhe_stamp = jiffies; + } else { + /* Routes we intend to cache in nexthop exception have + * the DST_NOCACHE bit clear. However, if we are + * unsuccessful at storing this route into the cache + * we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; + } + spin_unlock_bh(&fnhe_lock); } static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) @@ -1249,13 +1259,13 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) rt->rt_gateway = nh->nh_gw; - if (unlikely(fnhe)) - rt_bind_exception(rt, fnhe, daddr); dst_init_metrics(&rt->dst, fi->fib_metrics, true); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif - if (!(rt->dst.flags & DST_NOCACHE)) + if (unlikely(fnhe)) + rt_bind_exception(rt, fnhe, daddr); + else if (!(rt->dst.flags & DST_NOCACHE)) rt_cache_route(nh, rt); } @@ -1753,22 +1763,23 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fnhe = NULL; if (fi) { - fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); - if (!fnhe && FIB_RES_NH(*res).nh_pcpu_rth_output) { - struct rtable __rcu **prth; + struct rtable __rcu **prth; + fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); + if (fnhe) + prth = &fnhe->fnhe_rth; + else prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output); - rth = rcu_dereference(*prth); - if (rt_cache_valid(rth)) { - dst_hold(&rth->dst); - return rth; - } + rth = rcu_dereference(*prth); + if (rt_cache_valid(rth)) { + dst_hold(&rth->dst); + return rth; } } rth = rt_dst_alloc(dev_out, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(in_dev, NOXFRM), - fi && !fnhe); + fi); if (!rth) return ERR_PTR(-ENOBUFS); -- cgit v0.10.2 From caacf05e5ad1abf0a2864863da4e33024bc68ec6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 31 Jul 2012 15:06:50 -0700 Subject: ipv4: Properly purge netdev references on uncached routes. When a device is unregistered, we have to purge all of the references to it that may exist in the entire system. If a route is uncached, we currently have no way of accomplishing this. So create a global list that is scanned when a network device goes down. This mirrors the logic in net/core/dst.c's dst_ifdown(). Signed-off-by: David S. Miller diff --git a/include/net/route.h b/include/net/route.h index 8c52bc6..776a27f 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -57,6 +57,8 @@ struct rtable { /* Miscellaneous cached information */ u32 rt_pmtu; + + struct list_head rt_uncached; }; static inline bool rt_is_input_route(const struct rtable *rt) @@ -107,6 +109,7 @@ extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; extern int ip_rt_init(void); extern void rt_cache_flush(struct net *net, int how); +extern void rt_flush_dev(struct net_device *dev); extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, struct sock *sk); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 8732cc7..c43ae3f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1046,6 +1046,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2, -1); + rt_flush_dev(dev); return NOTIFY_DONE; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b102eeb..c035251 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -147,6 +147,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); +static void ipv4_dst_destroy(struct dst_entry *dst); static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how) @@ -170,6 +171,7 @@ static struct dst_ops ipv4_dst_ops = { .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, + .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, @@ -1175,9 +1177,11 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) return NULL; } -static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, +static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr) { + bool ret = false; + spin_lock_bh(&fnhe_lock); if (daddr == fnhe->fnhe_daddr) { @@ -1203,6 +1207,7 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, rt_free(orig); fnhe->fnhe_stamp = jiffies; + ret = true; } else { /* Routes we intend to cache in nexthop exception have * the DST_NOCACHE bit clear. However, if we are @@ -1212,11 +1217,14 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, rt->dst.flags |= DST_NOCACHE; } spin_unlock_bh(&fnhe_lock); + + return ret; } -static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) +static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) { struct rtable *orig, *prev, **p; + bool ret = true; if (rt_is_input_route(rt)) { p = (struct rtable **)&nh->nh_rth_input; @@ -1239,6 +1247,48 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) */ nocache: rt->dst.flags |= DST_NOCACHE; + ret = false; + } + + return ret; +} + +static DEFINE_SPINLOCK(rt_uncached_lock); +static LIST_HEAD(rt_uncached_list); + +static void rt_add_uncached_list(struct rtable *rt) +{ + spin_lock_bh(&rt_uncached_lock); + list_add_tail(&rt->rt_uncached, &rt_uncached_list); + spin_unlock_bh(&rt_uncached_lock); +} + +static void ipv4_dst_destroy(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable *) dst; + + if (dst->flags & DST_NOCACHE) { + spin_lock_bh(&rt_uncached_lock); + list_del(&rt->rt_uncached); + spin_unlock_bh(&rt_uncached_lock); + } +} + +void rt_flush_dev(struct net_device *dev) +{ + if (!list_empty(&rt_uncached_list)) { + struct net *net = dev_net(dev); + struct rtable *rt; + + spin_lock_bh(&rt_uncached_lock); + list_for_each_entry(rt, &rt_uncached_list, rt_uncached) { + if (rt->dst.dev != dev) + continue; + rt->dst.dev = net->loopback_dev; + dev_hold(rt->dst.dev); + dev_put(dev); + } + spin_unlock_bh(&rt_uncached_lock); } } @@ -1254,6 +1304,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, struct fib_nh_exception *fnhe, struct fib_info *fi, u16 type, u32 itag) { + bool cached = false; + if (fi) { struct fib_nh *nh = &FIB_RES_NH(*res); @@ -1264,10 +1316,12 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->dst.tclassid = nh->nh_tclassid; #endif if (unlikely(fnhe)) - rt_bind_exception(rt, fnhe, daddr); + cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) - rt_cache_route(nh, rt); + cached = rt_cache_route(nh, rt); } + if (unlikely(!cached)) + rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES @@ -1334,6 +1388,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1459,6 +1514,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); rth->dst.input = ip_forward; rth->dst.output = ip_output; @@ -1625,6 +1681,7 @@ local_input: rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; @@ -1792,6 +1849,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_iif = orig_oif ? : 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); RT_CACHE_STAT_INC(out_slow_tot); @@ -2071,6 +2129,8 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_type = ort->rt_type; rt->rt_gateway = ort->rt_gateway; + INIT_LIST_HEAD(&rt->rt_uncached); + dst_free(new); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index c628184..681ea2f 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -92,6 +92,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_type = rt->rt_type; xdst->u.rt.rt_gateway = rt->rt_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; + INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); return 0; } -- cgit v0.10.2