From aa9d85605f5ab070b64842b3eba797cf81698ae1 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:04 +0200 Subject: net: rename netdev_upper to netdev_adjacent Rename the structure to reflect the upcoming addition of lower_dev_list. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/net/core/dev.c b/net/core/dev.c index 1ed2b66..5072e2c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4367,7 +4367,7 @@ softnet_break: goto out; } -struct netdev_upper { +struct netdev_adjacent { struct net_device *dev; bool master; struct list_head list; @@ -4378,7 +4378,7 @@ struct netdev_upper { static void __append_search_uppers(struct list_head *search_list, struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; list_for_each_entry(upper, &dev->upper_dev_list, list) { /* check if this upper is not already in search list */ @@ -4391,8 +4391,8 @@ static bool __netdev_search_upper_dev(struct net_device *dev, struct net_device *upper_dev) { LIST_HEAD(search_list); - struct netdev_upper *upper; - struct netdev_upper *tmp; + struct netdev_adjacent *upper; + struct netdev_adjacent *tmp; bool ret = false; __append_search_uppers(&search_list, dev); @@ -4408,10 +4408,10 @@ static bool __netdev_search_upper_dev(struct net_device *dev, return ret; } -static struct netdev_upper *__netdev_find_upper(struct net_device *dev, +static struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; list_for_each_entry(upper, &dev->upper_dev_list, list) { if (upper->dev == upper_dev) @@ -4462,7 +4462,7 @@ EXPORT_SYMBOL(netdev_has_any_upper_dev); */ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); @@ -4470,7 +4470,7 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) return NULL; upper = list_first_entry(&dev->upper_dev_list, - struct netdev_upper, list); + struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; @@ -4486,10 +4486,10 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get); */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; upper = list_first_or_null_rcu(&dev->upper_dev_list, - struct netdev_upper, list); + struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; @@ -4499,7 +4499,7 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); @@ -4580,7 +4580,7 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); -- cgit v0.10.2 From 5d261913ca3daf6c2d21d38924235667b3d07c40 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:05 +0200 Subject: net: add lower_dev_list to net_device and make a full mesh This patch adds lower_dev_list list_head to net_device, which is the same as upper_dev_list, only for lower devices, and begins to use it in the same way as the upper list. It also changes the way the whole adjacent device lists work - now they contain *all* of upper/lower devices, not only the first level. The first level devices are distinguished by the bool neighbour field in netdev_adjacent, also added by this patch. There are cases when a device can be added several times to the adjacent list, the simplest would be: /---- eth0.10 ---\ eth0- --- bond0 \---- eth0.20 ---/ where both bond0 and eth0 'see' each other in the adjacent lists two times. To avoid duplication of netdev_adjacent structures ref_nr is being kept as the number of times the device was added to the list. The 'full view' is achieved by adding, on link creation, all of the upper_dev's upper_dev_list devices as upper devices to all of the lower_dev's lower_dev_list devices (and to the lower_dev itself), and vice versa. On unlink they are removed using the same logic. I've tested it with thousands vlans/bonds/bridges, everything works ok and no observable lags even on a huge number of interfaces. Memory footprint for 128 devices interconnected with each other via both upper and lower (which is impossible, but for the comparison) lists would be: 128*128*2*sizeof(netdev_adjacent) = 1.5MB but in the real world we usualy have at most several devices with slaves and a lot of vlans, so the footprint will be much lower. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 077363d..5ccf5b7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1125,6 +1125,7 @@ struct net_device { struct list_head napi_list; struct list_head unreg_list; struct list_head upper_dev_list; /* List of upper devices */ + struct list_head lower_dev_list; /* currently active device features */ diff --git a/net/core/dev.c b/net/core/dev.c index 5072e2c..2aa914e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4369,7 +4369,16 @@ softnet_break: struct netdev_adjacent { struct net_device *dev; + + /* upper master flag, there can only be one master device per list */ bool master; + + /* indicates that this dev is our first-level lower/upper device */ + bool neighbour; + + /* counter for the number of times this device was added to us */ + u16 ref_nr; + struct list_head list; struct rcu_head rcu; struct list_head search_list; @@ -4408,18 +4417,34 @@ static bool __netdev_search_upper_dev(struct net_device *dev, return ret; } -static struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, - struct net_device *upper_dev) +static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, + struct net_device *adj_dev, + bool upper) { - struct netdev_adjacent *upper; + struct netdev_adjacent *adj; + struct list_head *dev_list; - list_for_each_entry(upper, &dev->upper_dev_list, list) { - if (upper->dev == upper_dev) - return upper; + dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list; + + list_for_each_entry(adj, dev_list, list) { + if (adj->dev == adj_dev) + return adj; } return NULL; } +static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_find_adj(dev, udev, true); +} + +static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, + struct net_device *ldev) +{ + return __netdev_find_adj(dev, ldev, false); +} + /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device @@ -4496,10 +4521,149 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); +static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, + bool neighbour, bool master, + bool upper) +{ + struct netdev_adjacent *adj; + + adj = __netdev_find_adj(dev, adj_dev, upper); + + if (adj) { + BUG_ON(neighbour); + adj->ref_nr++; + return 0; + } + + adj = kmalloc(sizeof(*adj), GFP_KERNEL); + if (!adj) + return -ENOMEM; + + adj->dev = adj_dev; + adj->master = master; + adj->neighbour = neighbour; + adj->ref_nr = 1; + INIT_LIST_HEAD(&adj->search_list); + + dev_hold(adj_dev); + pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", + adj_dev->name, upper ? "upper" : "lower", dev->name, + adj_dev->name); + + if (!upper) { + list_add_tail_rcu(&adj->list, &dev->lower_dev_list); + return 0; + } + + /* Ensure that master upper link is always the first item in list. */ + if (master) + list_add_rcu(&adj->list, &dev->upper_dev_list); + else + list_add_tail_rcu(&adj->list, &dev->upper_dev_list); + + return 0; +} + +static inline int __netdev_upper_dev_insert(struct net_device *dev, + struct net_device *udev, + bool master, bool neighbour) +{ + return __netdev_adjacent_dev_insert(dev, udev, neighbour, master, + true); +} + +static inline int __netdev_lower_dev_insert(struct net_device *dev, + struct net_device *ldev, + bool neighbour) +{ + return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false, + false); +} + +void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, bool upper) +{ + struct netdev_adjacent *adj; + + if (upper) + adj = __netdev_find_upper(dev, adj_dev); + else + adj = __netdev_find_lower(dev, adj_dev); + + if (!adj) + BUG(); + + if (adj->ref_nr > 1) { + adj->ref_nr--; + return; + } + + list_del_rcu(&adj->list); + pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", + adj_dev->name, upper ? "upper" : "lower", dev->name, + adj_dev->name); + dev_put(adj_dev); + kfree_rcu(adj, rcu); +} + +static inline void __netdev_upper_dev_remove(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_adjacent_dev_remove(dev, udev, true); +} + +static inline void __netdev_lower_dev_remove(struct net_device *dev, + struct net_device *ldev) +{ + return __netdev_adjacent_dev_remove(dev, ldev, false); +} + +int __netdev_adjacent_dev_insert_link(struct net_device *dev, + struct net_device *upper_dev, + bool master, bool neighbour) +{ + int ret; + + ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour); + if (ret) + return ret; + + ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour); + if (ret) { + __netdev_upper_dev_remove(dev, upper_dev); + return ret; + } + + return 0; +} + +static inline int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_adjacent_dev_insert_link(dev, udev, false, false); +} + +static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *udev, + bool master) +{ + return __netdev_adjacent_dev_insert_link(dev, udev, master, true); +} + +void __netdev_adjacent_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_upper_dev_remove(dev, upper_dev); + __netdev_lower_dev_remove(upper_dev, dev); +} + + static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master) { - struct netdev_adjacent *upper; + struct netdev_adjacent *i, *j, *to_i, *to_j; + int ret = 0; ASSERT_RTNL(); @@ -4516,22 +4680,76 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - upper = kmalloc(sizeof(*upper), GFP_KERNEL); - if (!upper) - return -ENOMEM; + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master); + if (ret) + return ret; - upper->dev = upper_dev; - upper->master = master; - INIT_LIST_HEAD(&upper->search_list); + /* Now that we linked these devs, make all the upper_dev's + * upper_dev_list visible to every dev's lower_dev_list and vice + * versa, and don't forget the devices itself. All of these + * links are non-neighbours. + */ + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + list_for_each_entry(j, &dev->lower_dev_list, list) { + ret = __netdev_adjacent_dev_link(i->dev, j->dev); + if (ret) + goto rollback_mesh; + } + } + + /* add dev to every upper_dev's upper device */ + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + ret = __netdev_adjacent_dev_link(dev, i->dev); + if (ret) + goto rollback_upper_mesh; + } + + /* add upper_dev to every dev's lower device */ + list_for_each_entry(i, &dev->lower_dev_list, list) { + ret = __netdev_adjacent_dev_link(i->dev, upper_dev); + if (ret) + goto rollback_lower_mesh; + } - /* Ensure that master upper link is always the first item in list. */ - if (master) - list_add_rcu(&upper->list, &dev->upper_dev_list); - else - list_add_tail_rcu(&upper->list, &dev->upper_dev_list); - dev_hold(upper_dev); call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); return 0; + +rollback_lower_mesh: + to_i = i; + list_for_each_entry(i, &dev->lower_dev_list, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + } + + i = NULL; + +rollback_upper_mesh: + to_i = i; + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(dev, i->dev); + } + + i = j = NULL; + +rollback_mesh: + to_i = i; + to_j = j; + list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + if (i == to_i && j == to_j) + break; + __netdev_adjacent_dev_unlink(i->dev, j->dev); + } + if (i == to_i) + break; + } + + __netdev_adjacent_dev_unlink(dev, upper_dev); + + return ret; } /** @@ -4580,16 +4798,28 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_adjacent *upper; - + struct netdev_adjacent *i, *j; ASSERT_RTNL(); - upper = __netdev_find_upper(dev, upper_dev); - if (!upper) - return; - list_del_rcu(&upper->list); - dev_put(upper_dev); - kfree_rcu(upper, rcu); + __netdev_adjacent_dev_unlink(dev, upper_dev); + + /* Here is the tricky part. We must remove all dev's lower + * devices from all upper_dev's upper devices and vice + * versa, to maintain the graph relationship. + */ + list_for_each_entry(i, &dev->lower_dev_list, list) + list_for_each_entry(j, &upper_dev->upper_dev_list, list) + __netdev_adjacent_dev_unlink(i->dev, j->dev); + + /* remove also the devices itself from lower/upper device + * list + */ + list_for_each_entry(i, &dev->lower_dev_list, list) + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + + list_for_each_entry(i, &upper_dev->upper_dev_list, list) + __netdev_adjacent_dev_unlink(dev, i->dev); + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -5850,6 +6080,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->upper_dev_list); + INIT_LIST_HEAD(&dev->lower_dev_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); -- cgit v0.10.2 From 620f3186caa8124e0efaf329751cf51c5d55c731 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:06 +0200 Subject: net: remove search_list from netdev_adjacent We already don't need it cause we see every upper/lower device in the list already. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/net/core/dev.c b/net/core/dev.c index 2aa914e..749925a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4381,42 +4381,8 @@ struct netdev_adjacent { struct list_head list; struct rcu_head rcu; - struct list_head search_list; }; -static void __append_search_uppers(struct list_head *search_list, - struct net_device *dev) -{ - struct netdev_adjacent *upper; - - list_for_each_entry(upper, &dev->upper_dev_list, list) { - /* check if this upper is not already in search list */ - if (list_empty(&upper->search_list)) - list_add_tail(&upper->search_list, search_list); - } -} - -static bool __netdev_search_upper_dev(struct net_device *dev, - struct net_device *upper_dev) -{ - LIST_HEAD(search_list); - struct netdev_adjacent *upper; - struct netdev_adjacent *tmp; - bool ret = false; - - __append_search_uppers(&search_list, dev); - list_for_each_entry(upper, &search_list, search_list) { - if (upper->dev == upper_dev) { - ret = true; - break; - } - __append_search_uppers(&search_list, upper->dev); - } - list_for_each_entry_safe(upper, tmp, &search_list, search_list) - INIT_LIST_HEAD(&upper->search_list); - return ret; -} - static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, struct net_device *adj_dev, bool upper) @@ -4544,7 +4510,6 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->master = master; adj->neighbour = neighbour; adj->ref_nr = 1; - INIT_LIST_HEAD(&adj->search_list); dev_hold(adj_dev); pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", @@ -4671,7 +4636,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (__netdev_search_upper_dev(upper_dev, dev)) + if (__netdev_find_upper(upper_dev, dev)) return -EBUSY; if (__netdev_find_upper(dev, upper_dev)) -- cgit v0.10.2 From 48311f46853c0361f9fba7e0e6bb1652d633c049 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:07 +0200 Subject: net: add netdev_upper_get_next_dev_rcu(dev, iter) This function returns the next dev in the dev->upper_dev_list after the struct list_head **iter position, and updates *iter accordingly. Returns NULL if there are no devices left. Caller must hold RCU read lock. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/net/core/dev.c b/net/core/dev.c index 749925a..6fbb0c9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4468,6 +4468,31 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); +/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->upper_dev_list) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device -- cgit v0.10.2 From 8b5be8561b804edf6b58fc27edbccf1d45863e08 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:08 +0200 Subject: net: add netdev_for_each_upper_dev_rcu() The new macro netdev_for_each_upper_dev_rcu(dev, upper, iter) iterates through the dev->upper_dev_list starting from the first element, using the netdev_upper_get_next_dev_rcu(dev, &iter). Must be called under RCU read lock. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5ccf5b7..3ad49b8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2768,6 +2768,16 @@ extern int bpf_jit_enable; extern bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); extern bool netdev_has_any_upper_dev(struct net_device *dev); +extern struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter); + +/* iterate through upper list, must be called under RCU read lock */ +#define netdev_for_each_upper_dev_rcu(dev, upper, iter) \ + for (iter = &(dev)->upper_dev_list, \ + upper = netdev_upper_get_next_dev_rcu(dev, &(iter)); \ + upper; \ + upper = netdev_upper_get_next_dev_rcu(dev, &(iter))) + extern struct net_device *netdev_master_upper_dev_get(struct net_device *dev); extern struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); extern int netdev_upper_dev_link(struct net_device *dev, -- cgit v0.10.2 From c752af2c555617017dbb3a66f81e974b66ad5e2d Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:09 +0200 Subject: bonding: use netdev_upper list in bond_vlan_used Convert bond_vlan_used() to traverse the upper device list to see if we have any vlans above us. It's protected by rcu, and in case we are holding rtnl_lock we should call vlan_uses_dev() instead - it's faster. CC: Jay Vosburgh CC: Andy Gospodarek Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index 4bf52d5..230197d 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -267,9 +267,22 @@ struct bonding { #endif /* CONFIG_DEBUG_FS */ }; +/* if we hold rtnl_lock() - call vlan_uses_dev() */ static inline bool bond_vlan_used(struct bonding *bond) { - return !list_empty(&bond->vlan_list); + struct net_device *upper; + struct list_head *iter; + + rcu_read_lock(); + netdev_for_each_upper_dev_rcu(bond->dev, upper, iter) { + if (upper->priv_flags & IFF_802_1Q_VLAN) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + + return false; } #define bond_slave_get_rcu(dev) \ -- cgit v0.10.2 From 27bc11e63888c7cb0bd6d443e98775254cf7dbdd Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:10 +0200 Subject: bonding: make bond_arp_send_all use upper device list Currently, bond_arp_send_all() is aware only of vlans, which breaks configurations like bond <- bridge (or any other 'upper' device) with IP (which is quite a common scenario for virt setups). To fix this we convert the bond_arp_send_all() to first verify if the rt device is the bond itself, and if not - to go through its list of upper vlans and their respectiv upper devices (if the vlan's upper device matches - tag the packet), if still not found - go through all of our upper list devices to see if any of them match the route device for the target. If the match is a vlan device - we also save its vlan_id and tag it in bond_arp_send(). Also, clean the function a bit to be more readable. CC: Vlad Yasevich CC: Jay Vosburgh CC: Andy Gospodarek Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 7407e65..28f4ad0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2444,30 +2444,16 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op, __be32 dest_ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) { - int i, vlan_id; - __be32 *targets = bond->params.arp_targets; - struct vlan_entry *vlan; - struct net_device *vlan_dev = NULL; + struct net_device *upper, *vlan_upper; + struct list_head *iter, *vlan_iter; struct rtable *rt; + __be32 *targets = bond->params.arp_targets, addr; + int i, vlan_id; - for (i = 0; (i < BOND_MAX_ARP_TARGETS); i++) { - __be32 addr; - if (!targets[i]) - break; + for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) { pr_debug("basa: target %pI4\n", &targets[i]); - if (!bond_vlan_used(bond)) { - pr_debug("basa: empty vlan: arp_send\n"); - addr = bond_confirm_addr(bond->dev, targets[i], 0); - bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], - addr, 0); - continue; - } - /* - * If VLANs are configured, we do a route lookup to - * determine which VLAN interface would be used, so we - * can tag the ARP with the proper VLAN tag. - */ + /* Find out through which dev should the packet go */ rt = ip_route_output(dev_net(bond->dev), targets[i], 0, RTO_ONLINK, 0); if (IS_ERR(rt)) { @@ -2478,47 +2464,61 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) continue; } - /* - * This target is not on a VLAN - */ - if (rt->dst.dev == bond->dev) { - ip_rt_put(rt); - pr_debug("basa: rtdev == bond->dev: arp_send\n"); - addr = bond_confirm_addr(bond->dev, targets[i], 0); - bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], - addr, 0); - continue; - } - vlan_id = 0; - list_for_each_entry(vlan, &bond->vlan_list, vlan_list) { - rcu_read_lock(); - vlan_dev = __vlan_find_dev_deep(bond->dev, - htons(ETH_P_8021Q), - vlan->vlan_id); - rcu_read_unlock(); - if (vlan_dev == rt->dst.dev) { - vlan_id = vlan->vlan_id; - pr_debug("basa: vlan match on %s %d\n", - vlan_dev->name, vlan_id); - break; + + /* bond device itself */ + if (rt->dst.dev == bond->dev) + goto found; + + rcu_read_lock(); + /* first we search only for vlan devices. for every vlan + * found we verify its upper dev list, searching for the + * rt->dst.dev. If found we save the tag of the vlan and + * proceed to send the packet. + * + * TODO: QinQ? + */ + netdev_for_each_upper_dev_rcu(bond->dev, vlan_upper, vlan_iter) { + if (!is_vlan_dev(vlan_upper)) + continue; + netdev_for_each_upper_dev_rcu(vlan_upper, upper, iter) { + if (upper == rt->dst.dev) { + vlan_id = vlan_dev_vlan_id(vlan_upper); + rcu_read_unlock(); + goto found; + } } } - if (vlan_id && vlan_dev) { - ip_rt_put(rt); - addr = bond_confirm_addr(vlan_dev, targets[i], 0); - bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], - addr, vlan_id); - continue; + /* if the device we're looking for is not on top of any of + * our upper vlans, then just search for any dev that + * matches, and in case it's a vlan - save the id + */ + netdev_for_each_upper_dev_rcu(bond->dev, upper, iter) { + if (upper == rt->dst.dev) { + /* if it's a vlan - get its VID */ + if (is_vlan_dev(upper)) + vlan_id = vlan_dev_vlan_id(upper); + + rcu_read_unlock(); + goto found; + } } + rcu_read_unlock(); - if (net_ratelimit()) { + /* Not our device - skip */ + if (net_ratelimit()) pr_warning("%s: no path to arp_ip_target %pI4 via rt.dev %s\n", bond->dev->name, &targets[i], rt->dst.dev ? rt->dst.dev->name : "NULL"); - } ip_rt_put(rt); + continue; + +found: + addr = bond_confirm_addr(rt->dst.dev, targets[i], 0); + ip_rt_put(rt); + bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], + addr, vlan_id); } } -- cgit v0.10.2 From 50223ce4be70367ca5d8135bfd4c976e148bc491 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:11 +0200 Subject: bonding: convert bond_has_this_ip() to use upper devices Currently, bond_has_this_ip() is aware only of vlan upper devices, and thus will return false if the address is associated with the upper bridge or any other device, and thus will break the arp logic. Fix this by using the upper device list. For every upper device we verify if the address associated with it is our address, and if yes - return true. CC: Jay Vosburgh CC: Andy Gospodarek Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 28f4ad0..f38d590 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2392,24 +2392,25 @@ re_arm: } } -static int bond_has_this_ip(struct bonding *bond, __be32 ip) +static bool bond_has_this_ip(struct bonding *bond, __be32 ip) { - struct vlan_entry *vlan; - struct net_device *vlan_dev; + struct net_device *upper; + struct list_head *iter; + bool ret = false; if (ip == bond_confirm_addr(bond->dev, 0, ip)) - return 1; + return true; - list_for_each_entry(vlan, &bond->vlan_list, vlan_list) { - rcu_read_lock(); - vlan_dev = __vlan_find_dev_deep(bond->dev, htons(ETH_P_8021Q), - vlan->vlan_id); - rcu_read_unlock(); - if (vlan_dev && ip == bond_confirm_addr(vlan_dev, 0, ip)) - return 1; + rcu_read_lock(); + netdev_for_each_upper_dev_rcu(bond->dev, upper, iter) { + if (ip == bond_confirm_addr(upper, 0, ip)) { + ret = true; + break; + } } + rcu_read_unlock(); - return 0; + return ret; } /* -- cgit v0.10.2 From a59d3d21ea7636d4cc7fb921104b9b4a59196839 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:12 +0200 Subject: bonding: use vlan_uses_dev() in __bond_release_one() We always hold the rtnl_lock() in __bond_release_one(), so use vlan_uses_dev() instead of bond_vlan_used(). CC: Jay Vosburgh CC: Andy Gospodarek Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index f38d590..55a48d3 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1954,7 +1954,7 @@ static int __bond_release_one(struct net_device *bond_dev, bond_set_carrier(bond); eth_hw_addr_random(bond_dev); - if (bond_vlan_used(bond)) { + if (vlan_uses_dev(bond_dev)) { pr_warning("%s: Warning: clearing HW address of %s while it still has VLANs.\n", bond_dev->name, bond_dev->name); pr_warning("%s: When re-adding slaves, make sure the bond's HW address matches its VLANs'.\n", -- cgit v0.10.2 From 7aa6498123a67001e950808ce6eb5930a4c5945f Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:13 +0200 Subject: bonding: split alb_send_learning_packets() Create alb_send_lp_vid(), which will handle the skb/lp creation, vlan tagging and sending, and use it in alb_send_learning_packets(). This way all the logic remains in alb_send_learning_packets(), which becomes a lot more cleaner and easier to understand. CC: Jay Vosburgh CC: Andy Gospodarek Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index 3a5db7b..3ca3c85 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -971,35 +971,53 @@ static void rlb_clear_vlan(struct bonding *bond, unsigned short vlan_id) /*********************** tlb/rlb shared functions *********************/ -static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[]) +static void alb_send_lp_vid(struct slave *slave, u8 mac_addr[], + u16 vid) { - struct bonding *bond = bond_get_bond_by_slave(slave); struct learning_pkt pkt; + struct sk_buff *skb; int size = sizeof(struct learning_pkt); - int i; + char *data; memset(&pkt, 0, size); memcpy(pkt.mac_dst, mac_addr, ETH_ALEN); memcpy(pkt.mac_src, mac_addr, ETH_ALEN); pkt.type = cpu_to_be16(ETH_P_LOOP); - for (i = 0; i < MAX_LP_BURST; i++) { - struct sk_buff *skb; - char *data; + skb = dev_alloc_skb(size); + if (!skb) + return; - skb = dev_alloc_skb(size); + data = skb_put(skb, size); + memcpy(data, &pkt, size); + + skb_reset_mac_header(skb); + skb->network_header = skb->mac_header + ETH_HLEN; + skb->protocol = pkt.type; + skb->priority = TC_PRIO_CONTROL; + skb->dev = slave->dev; + + if (vid) { + skb = vlan_put_tag(skb, htons(ETH_P_8021Q), vid); if (!skb) { + pr_err("%s: Error: failed to insert VLAN tag\n", + slave->bond->dev->name); return; } + } - data = skb_put(skb, size); - memcpy(data, &pkt, size); + dev_queue_xmit(skb); +} - skb_reset_mac_header(skb); - skb->network_header = skb->mac_header + ETH_HLEN; - skb->protocol = pkt.type; - skb->priority = TC_PRIO_CONTROL; - skb->dev = slave->dev; + +static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[]) +{ + struct bonding *bond = bond_get_bond_by_slave(slave); + u16 vlan_id; + int i; + + for (i = 0; i < MAX_LP_BURST; i++) { + vlan_id = 0; if (bond_vlan_used(bond)) { struct vlan_entry *vlan; @@ -1008,20 +1026,13 @@ static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[]) bond->alb_info.current_alb_vlan); bond->alb_info.current_alb_vlan = vlan; - if (!vlan) { - kfree_skb(skb); + if (!vlan) continue; - } - skb = vlan_put_tag(skb, htons(ETH_P_8021Q), vlan->vlan_id); - if (!skb) { - pr_err("%s: Error: failed to insert VLAN tag\n", - bond->dev->name); - continue; - } + vlan_id = vlan->vlan_id; } - dev_queue_xmit(skb); + alb_send_lp_vid(slave, mac_addr, vlan_id); } } -- cgit v0.10.2 From 5bf94b839a44aa9d8199607c32b95832f2892ee6 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:14 +0200 Subject: bonding: make alb_send_learning_packets() use upper dev list Currently, if there are vlans on top of bond, alb_send_learning_packets() will never send LPs from the bond itself (i.e. untagged), which might leave untagged clients unupdated. Also, the 'circular vlan' logic (i.e. update only MAX_LP_BURST vlans at a time, and save the last vlan for the next update) is really suboptimal - in case of lots of vlans it will take a lot of time to update every vlan. It is also never called in any hot path and sends only a few small packets - thus the optimization by itself is useless. So remove the whole current_alb_vlan/MAX_LP_BURST logic from alb_send_learning_packets(). Instead, we'll first send a packet untagged and then traverse the upper dev list, sending a tagged packet for each vlan found. Also, remove the MAX_LP_BURST define - we already don't need it. CC: Jay Vosburgh CC: Andy Gospodarek Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index 3ca3c85..b6a68b4 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1013,27 +1013,20 @@ static void alb_send_lp_vid(struct slave *slave, u8 mac_addr[], static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[]) { struct bonding *bond = bond_get_bond_by_slave(slave); - u16 vlan_id; - int i; - - for (i = 0; i < MAX_LP_BURST; i++) { - vlan_id = 0; - - if (bond_vlan_used(bond)) { - struct vlan_entry *vlan; + struct net_device *upper; + struct list_head *iter; - vlan = bond_next_vlan(bond, - bond->alb_info.current_alb_vlan); - - bond->alb_info.current_alb_vlan = vlan; - if (!vlan) - continue; - - vlan_id = vlan->vlan_id; - } + /* send untagged */ + alb_send_lp_vid(slave, mac_addr, 0); - alb_send_lp_vid(slave, mac_addr, vlan_id); + /* loop through vlans and send one packet for each */ + rcu_read_lock(); + netdev_for_each_upper_dev_rcu(bond->dev, upper, iter) { + if (upper->priv_flags & IFF_802_1Q_VLAN) + alb_send_lp_vid(slave, mac_addr, + vlan_dev_vlan_id(upper)); } + rcu_read_unlock(); } static int alb_set_slave_mac_addr(struct slave *slave, u8 addr[]) diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h index e7a5b8b..e139172 100644 --- a/drivers/net/bonding/bond_alb.h +++ b/drivers/net/bonding/bond_alb.h @@ -53,7 +53,6 @@ struct slave; #define TLB_NULL_INDEX 0xffffffff -#define MAX_LP_BURST 3 /* rlb defs */ #define RLB_HASH_TABLE_SIZE 256 -- cgit v0.10.2 From e868b0c938d9cc0d7ed4bd77d5c37676e833ed95 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:15 +0200 Subject: bonding: remove vlan_list/current_alb_vlan Currently there are no real users of vlan_list/current_alb_vlan, only the helpers which maintain them, so remove them. CC: Jay Vosburgh CC: Andy Gospodarek Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index b6a68b4..0182352 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1763,11 +1763,6 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr) void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id) { - if (bond->alb_info.current_alb_vlan && - (bond->alb_info.current_alb_vlan->vlan_id == vlan_id)) { - bond->alb_info.current_alb_vlan = NULL; - } - if (bond->alb_info.rlb_enabled) { rlb_clear_vlan(bond, vlan_id); } diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h index e139172..e02c9c5 100644 --- a/drivers/net/bonding/bond_alb.h +++ b/drivers/net/bonding/bond_alb.h @@ -169,7 +169,6 @@ struct alb_bond_info { * rx traffic should be * rebalanced */ - struct vlan_entry *current_alb_vlan; }; int bond_alb_initialize(struct bonding *bond, int rlb_enabled); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 55a48d3..b87ad76 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -283,116 +283,6 @@ const char *bond_mode_name(int mode) /*---------------------------------- VLAN -----------------------------------*/ /** - * bond_add_vlan - add a new vlan id on bond - * @bond: bond that got the notification - * @vlan_id: the vlan id to add - * - * Returns -ENOMEM if allocation failed. - */ -static int bond_add_vlan(struct bonding *bond, unsigned short vlan_id) -{ - struct vlan_entry *vlan; - - pr_debug("bond: %s, vlan id %d\n", - (bond ? bond->dev->name : "None"), vlan_id); - - vlan = kzalloc(sizeof(struct vlan_entry), GFP_KERNEL); - if (!vlan) - return -ENOMEM; - - INIT_LIST_HEAD(&vlan->vlan_list); - vlan->vlan_id = vlan_id; - - write_lock_bh(&bond->lock); - - list_add_tail(&vlan->vlan_list, &bond->vlan_list); - - write_unlock_bh(&bond->lock); - - pr_debug("added VLAN ID %d on bond %s\n", vlan_id, bond->dev->name); - - return 0; -} - -/** - * bond_del_vlan - delete a vlan id from bond - * @bond: bond that got the notification - * @vlan_id: the vlan id to delete - * - * returns -ENODEV if @vlan_id was not found in @bond. - */ -static int bond_del_vlan(struct bonding *bond, unsigned short vlan_id) -{ - struct vlan_entry *vlan; - int res = -ENODEV; - - pr_debug("bond: %s, vlan id %d\n", bond->dev->name, vlan_id); - - block_netpoll_tx(); - write_lock_bh(&bond->lock); - - list_for_each_entry(vlan, &bond->vlan_list, vlan_list) { - if (vlan->vlan_id == vlan_id) { - list_del(&vlan->vlan_list); - - if (bond_is_lb(bond)) - bond_alb_clear_vlan(bond, vlan_id); - - pr_debug("removed VLAN ID %d from bond %s\n", - vlan_id, bond->dev->name); - - kfree(vlan); - - res = 0; - goto out; - } - } - - pr_debug("couldn't find VLAN ID %d in bond %s\n", - vlan_id, bond->dev->name); - -out: - write_unlock_bh(&bond->lock); - unblock_netpoll_tx(); - return res; -} - -/** - * bond_next_vlan - safely skip to the next item in the vlans list. - * @bond: the bond we're working on - * @curr: item we're advancing from - * - * Returns %NULL if list is empty, bond->next_vlan if @curr is %NULL, - * or @curr->next otherwise (even if it is @curr itself again). - * - * Caller must hold bond->lock - */ -struct vlan_entry *bond_next_vlan(struct bonding *bond, struct vlan_entry *curr) -{ - struct vlan_entry *next, *last; - - if (list_empty(&bond->vlan_list)) - return NULL; - - if (!curr) { - next = list_entry(bond->vlan_list.next, - struct vlan_entry, vlan_list); - } else { - last = list_entry(bond->vlan_list.prev, - struct vlan_entry, vlan_list); - if (last == curr) { - next = list_entry(bond->vlan_list.next, - struct vlan_entry, vlan_list); - } else { - next = list_entry(curr->vlan_list.next, - struct vlan_entry, vlan_list); - } - } - - return next; -} - -/** * bond_dev_queue_xmit - Prepare skb for xmit. * * @bond: bond device that got this skb for tx. @@ -451,13 +341,6 @@ static int bond_vlan_rx_add_vid(struct net_device *bond_dev, goto unwind; } - res = bond_add_vlan(bond, vid); - if (res) { - pr_err("%s: Error: Failed to add vlan id %d\n", - bond_dev->name, vid); - goto unwind; - } - return 0; unwind: @@ -478,17 +361,12 @@ static int bond_vlan_rx_kill_vid(struct net_device *bond_dev, { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; - int res; bond_for_each_slave(bond, slave) vlan_vid_del(slave->dev, proto, vid); - res = bond_del_vlan(bond, vid); - if (res) { - pr_err("%s: Error: Failed to remove vlan id %d\n", - bond_dev->name, vid); - return res; - } + if (bond_is_lb(bond)) + bond_alb_clear_vlan(bond, vid); return 0; } @@ -4143,7 +4021,6 @@ static void bond_setup(struct net_device *bond_dev) /* Initialize pointers */ bond->dev = bond_dev; - INIT_LIST_HEAD(&bond->vlan_list); /* Initialize the device entry points */ ether_setup(bond_dev); @@ -4196,7 +4073,6 @@ static void bond_uninit(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *tmp_slave; - struct vlan_entry *vlan, *tmp; bond_netpoll_cleanup(bond_dev); @@ -4208,11 +4084,6 @@ static void bond_uninit(struct net_device *bond_dev) list_del(&bond->bond_list); bond_debug_unregister(bond); - - list_for_each_entry_safe(vlan, tmp, &bond->vlan_list, vlan_list) { - list_del(&vlan->vlan_list); - kfree(vlan); - } } /*------------------------- Module initialization ---------------------------*/ diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index 230197d..4abc925 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -185,11 +185,6 @@ struct bond_parm_tbl { #define BOND_MAX_MODENAME_LEN 20 -struct vlan_entry { - struct list_head vlan_list; - unsigned short vlan_id; -}; - struct slave { struct net_device *dev; /* first - useful for panic debug */ struct list_head list; @@ -254,7 +249,6 @@ struct bonding { struct ad_bond_info ad_info; struct alb_bond_info alb_info; struct bond_params params; - struct list_head vlan_list; struct workqueue_struct *wq; struct delayed_work mii_work; struct delayed_work arp_work; -- cgit v0.10.2 From 3e32582f7d5b6f364ddd31110bf7277d69fd4e91 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:16 +0200 Subject: bonding: pr_debug instead of pr_warn in bond_arp_send_all They're simply annoying and will spam dmesg constantly if we hit them, so convert to pr_debug so that we still can access them in case of debugging. CC: Jay Vosburgh CC: Andy Gospodarek Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b87ad76..c50679f 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2336,10 +2336,8 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) rt = ip_route_output(dev_net(bond->dev), targets[i], 0, RTO_ONLINK, 0); if (IS_ERR(rt)) { - if (net_ratelimit()) { - pr_warning("%s: no route to arp_ip_target %pI4\n", - bond->dev->name, &targets[i]); - } + pr_debug("%s: no route to arp_ip_target %pI4\n", + bond->dev->name, &targets[i]); continue; } @@ -2386,10 +2384,10 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) rcu_read_unlock(); /* Not our device - skip */ - if (net_ratelimit()) - pr_warning("%s: no path to arp_ip_target %pI4 via rt.dev %s\n", - bond->dev->name, &targets[i], - rt->dst.dev ? rt->dst.dev->name : "NULL"); + pr_debug("%s: no path to arp_ip_target %pI4 via rt.dev %s\n", + bond->dev->name, &targets[i], + rt->dst.dev ? rt->dst.dev->name : "NULL"); + ip_rt_put(rt); continue; -- cgit v0.10.2