From ca0a10672dad94aa1f89645f89eb6047b7bf2a19 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Mon, 30 Sep 2013 22:05:07 +0200 Subject: netfilter: ebt_ulog: fix info leaks The ulog messages leak heap bytes by the means of padding bytes and incompletely filled string arrays. Fix those by memset(0)'ing the whole struct before filling it. Signed-off-by: Mathias Krause Signed-off-by: Pablo Neira Ayuso diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 5180938..7c470c3 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -181,6 +181,7 @@ static void ebt_ulog_packet(struct net *net, unsigned int hooknr, ub->qlen++; pm = nlmsg_data(nlh); + memset(pm, 0, sizeof(*pm)); /* Fill in the ulog data */ pm->version = EBT_ULOG_VERSION; @@ -193,8 +194,6 @@ static void ebt_ulog_packet(struct net *net, unsigned int hooknr, pm->hook = hooknr; if (uloginfo->prefix != NULL) strcpy(pm->prefix, uloginfo->prefix); - else - *(pm->prefix) = '\0'; if (in) { strcpy(pm->physindev, in->name); @@ -204,16 +203,14 @@ static void ebt_ulog_packet(struct net *net, unsigned int hooknr, strcpy(pm->indev, br_port_get_rcu(in)->br->dev->name); else strcpy(pm->indev, in->name); - } else - pm->indev[0] = pm->physindev[0] = '\0'; + } if (out) { /* If out exists, then out is a bridge port */ strcpy(pm->physoutdev, out->name); /* rcu_read_lock()ed by nf_hook_slow */ strcpy(pm->outdev, br_port_get_rcu(out)->br->dev->name); - } else - pm->outdev[0] = pm->physoutdev[0] = '\0'; + } if (skb_copy_bits(skb, -ETH_HLEN, pm->data, copy_len) < 0) BUG(); -- cgit v0.10.2 From 278f2b3e2af5f32ea1afe34fa12a2518153e6e49 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Mon, 30 Sep 2013 22:05:08 +0200 Subject: netfilter: ipt_ULOG: fix info leaks The ulog messages leak heap bytes by the means of padding bytes and incompletely filled string arrays. Fix those by memset(0)'ing the whole struct before filling it. Signed-off-by: Mathias Krause Signed-off-by: Pablo Neira Ayuso diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index cbc2215..9cb993c 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -220,6 +220,7 @@ static void ipt_ulog_packet(struct net *net, ub->qlen++; pm = nlmsg_data(nlh); + memset(pm, 0, sizeof(*pm)); /* We might not have a timestamp, get one */ if (skb->tstamp.tv64 == 0) @@ -238,8 +239,6 @@ static void ipt_ulog_packet(struct net *net, } else if (loginfo->prefix[0] != '\0') strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); - else - *(pm->prefix) = '\0'; if (in && in->hard_header_len > 0 && skb->mac_header != skb->network_header && @@ -251,13 +250,9 @@ static void ipt_ulog_packet(struct net *net, if (in) strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); - else - pm->indev_name[0] = '\0'; if (out) strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); - else - pm->outdev_name[0] = '\0'; /* copy_len <= skb->len, so can't fail. */ if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) -- cgit v0.10.2 From b07c26511e94ab856f3700c56d582c0da36d5b4d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2013 14:54:11 -0700 Subject: openvswitch: fix vport-netdev unregister The combination of two commits: commit 8e4e1713e4 ("openvswitch: Simplify datapath locking.") commit 2537b4dd0a ("openvswitch:: link upper device for port devices") introduced a bug where upper_dev wasn't unlinked upon netdev_unregister notification The following steps: modprobe openvswitch ovs-dpctl add-dp test ip tuntap add dev tap1 mode tap ovs-dpctl add-if test tap1 ip tuntap del dev tap1 mode tap are causing multiple warnings: [ 62.747557] gre: GRE over IPv4 demultiplexor driver [ 62.749579] openvswitch: Open vSwitch switching datapath [ 62.755087] device test entered promiscuous mode [ 62.765911] device tap1 entered promiscuous mode [ 62.766033] IPv6: ADDRCONF(NETDEV_UP): tap1: link is not ready [ 62.769017] ------------[ cut here ]------------ [ 62.769022] WARNING: CPU: 1 PID: 3267 at net/core/dev.c:5501 rollback_registered_many+0x20f/0x240() [ 62.769023] Modules linked in: openvswitch gre vxlan ip_tunnel libcrc32c ip6table_filter ip6_tables ebtable_nat ebtables nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack xt_CHECKSUM iptable_mangle ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc vhost_net macvtap macvlan vhost kvm_intel kvm dm_crypt iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi hid_generic mxm_wmi eeepc_wmi asus_wmi sparse_keymap dm_multipath psmouse serio_raw usbhid hid parport_pc ppdev firewire_ohci lpc_ich firewire_core e1000e crc_itu_t binfmt_misc igb dca ptp pps_core mac_hid wmi lp parport i2o_config i2o_block video [ 62.769051] CPU: 1 PID: 3267 Comm: ip Not tainted 3.12.0-rc3+ #60 [ 62.769052] Hardware name: System manufacturer System Product Name/P8Z77 WS, BIOS 3007 07/26/2012 [ 62.769053] 0000000000000009 ffff8807f25cbd28 ffffffff8175e575 0000000000000006 [ 62.769055] 0000000000000000 ffff8807f25cbd68 ffffffff8105314c ffff8807f25cbd58 [ 62.769057] ffff8807f2634000 ffff8807f25cbdc8 ffff8807f25cbd88 ffff8807f25cbdc8 [ 62.769059] Call Trace: [ 62.769062] [] dump_stack+0x55/0x76 [ 62.769065] [] warn_slowpath_common+0x8c/0xc0 [ 62.769067] [] warn_slowpath_null+0x1a/0x20 [ 62.769069] [] rollback_registered_many+0x20f/0x240 [ 62.769071] [] rollback_registered+0x31/0x40 [ 62.769073] [] unregister_netdevice_queue+0x58/0x90 [ 62.769075] [] __tun_detach+0x140/0x340 [ 62.769077] [] tun_chr_close+0x36/0x60 [ 62.769080] [] __fput+0xff/0x260 [ 62.769082] [] ____fput+0xe/0x10 [ 62.769084] [] task_work_run+0xb5/0xe0 [ 62.769087] [] do_notify_resume+0x59/0x80 [ 62.769089] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 62.769091] [] int_signal+0x12/0x17 [ 62.769093] ---[ end trace 838756c62e156ffb ]--- [ 62.769481] ------------[ cut here ]------------ [ 62.769485] WARNING: CPU: 1 PID: 92 at fs/sysfs/inode.c:325 sysfs_hash_and_remove+0xa9/0xb0() [ 62.769486] sysfs: can not remove 'master', no directory [ 62.769486] Modules linked in: openvswitch gre vxlan ip_tunnel libcrc32c ip6table_filter ip6_tables ebtable_nat ebtables nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack xt_CHECKSUM iptable_mangle ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc vhost_net macvtap macvlan vhost kvm_intel kvm dm_crypt iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi hid_generic mxm_wmi eeepc_wmi asus_wmi sparse_keymap dm_multipath psmouse serio_raw usbhid hid parport_pc ppdev firewire_ohci lpc_ich firewire_core e1000e crc_itu_t binfmt_misc igb dca ptp pps_core mac_hid wmi lp parport i2o_config i2o_block video [ 62.769514] CPU: 1 PID: 92 Comm: kworker/1:2 Tainted: G W 3.12.0-rc3+ #60 [ 62.769515] Hardware name: System manufacturer System Product Name/P8Z77 WS, BIOS 3007 07/26/2012 [ 62.769518] Workqueue: events ovs_dp_notify_wq [openvswitch] [ 62.769519] 0000000000000009 ffff880807ad3ac8 ffffffff8175e575 0000000000000006 [ 62.769521] ffff880807ad3b18 ffff880807ad3b08 ffffffff8105314c ffff880807ad3b28 [ 62.769523] 0000000000000000 ffffffff81a87a1f ffff8807f2634000 ffff880037038500 [ 62.769525] Call Trace: [ 62.769528] [] dump_stack+0x55/0x76 [ 62.769529] [] warn_slowpath_common+0x8c/0xc0 [ 62.769531] [] warn_slowpath_fmt+0x46/0x50 [ 62.769533] [] sysfs_hash_and_remove+0xa9/0xb0 [ 62.769535] [] sysfs_remove_link+0x26/0x30 [ 62.769538] [] __netdev_adjacent_dev_remove+0xf7/0x150 [ 62.769540] [] __netdev_adjacent_dev_unlink_lists+0x27/0x50 [ 62.769542] [] __netdev_adjacent_dev_unlink_neighbour+0x3a/0x50 [ 62.769544] [] netdev_upper_dev_unlink+0x3d/0x140 [ 62.769548] [] netdev_destroy+0x4b/0x80 [openvswitch] [ 62.769550] [] ovs_vport_del+0x46/0x60 [openvswitch] [ 62.769552] [] ovs_dp_detach_port+0x44/0x60 [openvswitch] [ 62.769555] [] ovs_dp_notify_wq+0xb4/0x150 [openvswitch] [ 62.769557] [] process_one_work+0x1d8/0x6a0 [ 62.769559] [] ? process_one_work+0x178/0x6a0 [ 62.769562] [] worker_thread+0x11b/0x370 [ 62.769564] [] ? rescuer_thread+0x350/0x350 [ 62.769566] [] kthread+0xea/0xf0 [ 62.769568] [] ? flush_kthread_worker+0x150/0x150 [ 62.769570] [] ret_from_fork+0x7c/0xb0 [ 62.769572] [] ? flush_kthread_worker+0x150/0x150 [ 62.769573] ---[ end trace 838756c62e156ffc ]--- [ 62.769574] ------------[ cut here ]------------ [ 62.769576] WARNING: CPU: 1 PID: 92 at fs/sysfs/inode.c:325 sysfs_hash_and_remove+0xa9/0xb0() [ 62.769577] sysfs: can not remove 'upper_test', no directory [ 62.769577] Modules linked in: openvswitch gre vxlan ip_tunnel libcrc32c ip6table_filter ip6_tables ebtable_nat ebtables nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack xt_CHECKSUM iptable_mangle ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc vhost_net macvtap macvlan vhost kvm_intel kvm dm_crypt iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi hid_generic mxm_wmi eeepc_wmi asus_wmi sparse_keymap dm_multipath psmouse serio_raw usbhid hid parport_pc ppdev firewire_ohci lpc_ich firewire_core e1000e crc_itu_t binfmt_misc igb dca ptp pps_core mac_hid wmi lp parport i2o_config i2o_block video [ 62.769603] CPU: 1 PID: 92 Comm: kworker/1:2 Tainted: G W 3.12.0-rc3+ #60 [ 62.769604] Hardware name: System manufacturer System Product Name/P8Z77 WS, BIOS 3007 07/26/2012 [ 62.769606] Workqueue: events ovs_dp_notify_wq [openvswitch] [ 62.769607] 0000000000000009 ffff880807ad3ac8 ffffffff8175e575 0000000000000006 [ 62.769609] ffff880807ad3b18 ffff880807ad3b08 ffffffff8105314c ffff880807ad3b58 [ 62.769611] 0000000000000000 ffff880807ad3bd9 ffff8807f2634000 ffff880037038500 [ 62.769613] Call Trace: [ 62.769615] [] dump_stack+0x55/0x76 [ 62.769617] [] warn_slowpath_common+0x8c/0xc0 [ 62.769619] [] warn_slowpath_fmt+0x46/0x50 [ 62.769621] [] sysfs_hash_and_remove+0xa9/0xb0 [ 62.769622] [] sysfs_remove_link+0x26/0x30 [ 62.769624] [] __netdev_adjacent_dev_remove+0x122/0x150 [ 62.769627] [] __netdev_adjacent_dev_unlink_lists+0x27/0x50 [ 62.769629] [] __netdev_adjacent_dev_unlink_neighbour+0x3a/0x50 [ 62.769631] [] netdev_upper_dev_unlink+0x3d/0x140 [ 62.769633] [] netdev_destroy+0x4b/0x80 [openvswitch] [ 62.769636] [] ovs_vport_del+0x46/0x60 [openvswitch] [ 62.769638] [] ovs_dp_detach_port+0x44/0x60 [openvswitch] [ 62.769640] [] ovs_dp_notify_wq+0xb4/0x150 [openvswitch] [ 62.769642] [] process_one_work+0x1d8/0x6a0 [ 62.769644] [] ? process_one_work+0x178/0x6a0 [ 62.769646] [] worker_thread+0x11b/0x370 [ 62.769648] [] ? rescuer_thread+0x350/0x350 [ 62.769650] [] kthread+0xea/0xf0 [ 62.769652] [] ? flush_kthread_worker+0x150/0x150 [ 62.769654] [] ret_from_fork+0x7c/0xb0 [ 62.769656] [] ? flush_kthread_worker+0x150/0x150 [ 62.769657] ---[ end trace 838756c62e156ffd ]--- [ 62.769724] device tap1 left promiscuous mode This patch also affects moving devices between net namespaces. OVS used to ignore netns move notifications which caused problems. Like: ovs-dpctl add-if test tap1 ip link set tap1 netns 3512 and then removing tap1 inside the namespace will cause hang on missing dev_put. With this patch OVS will detach dev upon receiving netns move event. Signed-off-by: Alexei Starovoitov Signed-off-by: Jesse Gross diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index c323567..5c2dab2 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -65,8 +65,7 @@ void ovs_dp_notify_wq(struct work_struct *work) continue; netdev_vport = netdev_vport_priv(vport); - if (netdev_vport->dev->reg_state == NETREG_UNREGISTERED || - netdev_vport->dev->reg_state == NETREG_UNREGISTERING) + if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)) dp_detach_port_notify(vport); } } @@ -88,6 +87,10 @@ static int dp_device_event(struct notifier_block *unused, unsigned long event, return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { + /* upper_dev_unlink and decrement promisc immediately */ + ovs_netdev_detach_dev(vport); + + /* schedule vport destroy, dev_put and genl notification */ ovs_net = net_generic(dev_net(dev), ovs_net_id); queue_work(system_wq, &ovs_net->dp_notify_work); } diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 09d93c1..d21f77d 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -150,15 +150,25 @@ static void free_port_rcu(struct rcu_head *rcu) ovs_vport_free(vport_from_priv(netdev_vport)); } -static void netdev_destroy(struct vport *vport) +void ovs_netdev_detach_dev(struct vport *vport) { struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - rtnl_lock(); + ASSERT_RTNL(); netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; netdev_rx_handler_unregister(netdev_vport->dev); - netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp)); + netdev_upper_dev_unlink(netdev_vport->dev, + netdev_master_upper_dev_get(netdev_vport->dev)); dev_set_promiscuity(netdev_vport->dev, -1); +} + +static void netdev_destroy(struct vport *vport) +{ + struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + + rtnl_lock(); + if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); rtnl_unlock(); call_rcu(&netdev_vport->rcu, free_port_rcu); diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index dd298b5..8df01c11 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -39,5 +39,6 @@ netdev_vport_priv(const struct vport *vport) } const char *ovs_netdev_get_name(const struct vport *); +void ovs_netdev_detach_dev(struct vport *); #endif /* vport_netdev.h */ -- cgit v0.10.2 From 12e3594698f6c3ab6ebacc79f2fb2ad2bb5952b5 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 17 Oct 2013 15:07:40 +0200 Subject: xfrm: prevent ipcomp scratch buffer race condition In ipcomp_compress(), sortirq is enabled too early, allowing the per-cpu scratch buffer to be rewritten by ipcomp_decompress() (called on the same CPU in softirq context) between populating the buffer and copying the compressed data to the skb. v2: as pointed out by Steffen Klassert, if we also move the local_bh_disable() before reading the per-cpu pointers, we can get rid of get_cpu()/put_cpu(). v3: removed ipcomp_decompress part (as explained by Herbert Xu, it cannot be called from process context), get rid of cpu variable (thanks to Eric Dumazet) Signed-off-by: Michal Kubecek Reviewed-by: Eric Dumazet Acked-by: Herbert Xu Signed-off-by: Steffen Klassert diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 2906d52..3be02b6 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -141,14 +141,14 @@ static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) const int plen = skb->len; int dlen = IPCOMP_SCRATCH_SIZE; u8 *start = skb->data; - const int cpu = get_cpu(); - u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); - struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu); + struct crypto_comp *tfm; + u8 *scratch; int err; local_bh_disable(); + scratch = *this_cpu_ptr(ipcomp_scratches); + tfm = *this_cpu_ptr(ipcd->tfms); err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); - local_bh_enable(); if (err) goto out; @@ -158,13 +158,13 @@ static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) } memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen); - put_cpu(); + local_bh_enable(); pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr)); return 0; out: - put_cpu(); + local_bh_enable(); return err; } -- cgit v0.10.2 From b416c144f46af1a30ddfa4e4319a8f077381ad63 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 21 Oct 2013 13:14:53 +0100 Subject: netfilter: x_tables: fix ordering of jumpstack allocation and table update During kernel stability testing on an SMP ARMv7 system, Yalin Wang reported the following panic from the netfilter code: 1fe0: 0000001c 5e2d3b10 4007e779 4009e110 60000010 00000032 ff565656 ff545454 [] (ipt_do_table+0x448/0x584) from [] (nf_iterate+0x48/0x7c) [] (nf_iterate+0x48/0x7c) from [] (nf_hook_slow+0x58/0x104) [] (nf_hook_slow+0x58/0x104) from [] (ip_local_deliver+0x88/0xa8) [] (ip_local_deliver+0x88/0xa8) from [] (ip_rcv_finish+0x418/0x43c) [] (ip_rcv_finish+0x418/0x43c) from [] (__netif_receive_skb+0x4cc/0x598) [] (__netif_receive_skb+0x4cc/0x598) from [] (process_backlog+0x84/0x158) [] (process_backlog+0x84/0x158) from [] (net_rx_action+0x70/0x1dc) [] (net_rx_action+0x70/0x1dc) from [] (__do_softirq+0x11c/0x27c) [] (__do_softirq+0x11c/0x27c) from [] (do_softirq+0x44/0x50) [] (do_softirq+0x44/0x50) from [] (local_bh_enable_ip+0x8c/0xd0) [] (local_bh_enable_ip+0x8c/0xd0) from [] (inet_stream_connect+0x164/0x298) [] (inet_stream_connect+0x164/0x298) from [] (sys_connect+0x88/0xc8) [] (sys_connect+0x88/0xc8) from [] (ret_fast_syscall+0x0/0x30) Code: 2a000021 e59d2028 e59de01c e59f011c (e7824103) ---[ end trace da227214a82491bd ]--- Kernel panic - not syncing: Fatal exception in interrupt This comes about because CPU1 is executing xt_replace_table in response to a setsockopt syscall, resulting in: ret = xt_jumpstack_alloc(newinfo); --> newinfo->jumpstack = kzalloc(size, GFP_KERNEL); [...] table->private = newinfo; newinfo->initial_entries = private->initial_entries; Meanwhile, CPU0 is handling the network receive path and ends up in ipt_do_table, resulting in: private = table->private; [...] jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; On weakly ordered memory architectures, the writes to table->private and newinfo->jumpstack from CPU1 can be observed out of order by CPU0. Furthermore, on architectures which don't respect ordering of address dependencies (i.e. Alpha), the reads from CPU0 can also be re-ordered. This patch adds an smp_wmb() before the assignment to table->private (which is essentially publishing newinfo) to ensure that all writes to newinfo will be observed before plugging it into the table structure. A dependent-read barrier is also added on the consumer sides, to ensure the same ordering requirements are also respected there. Cc: Paul E. McKenney Reported-by: Wang, Yalin Tested-by: Wang, Yalin Signed-off-by: Will Deacon Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 85a4f21..59da7cd 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -271,6 +271,11 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); private = table->private; + /* + * Ensure we load private-> members after we've fetched the base + * pointer. + */ + smp_read_barrier_depends(); table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d23118d..718dfbd 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -327,6 +327,11 @@ ipt_do_table(struct sk_buff *skb, addend = xt_write_recseq_begin(); private = table->private; cpu = smp_processor_id(); + /* + * Ensure we load private-> members after we've fetched the base + * pointer. + */ + smp_read_barrier_depends(); table_base = private->entries[cpu]; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 44400c2..710238f 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -349,6 +349,11 @@ ip6t_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); private = table->private; + /* + * Ensure we load private-> members after we've fetched the base + * pointer. + */ + smp_read_barrier_depends(); cpu = smp_processor_id(); table_base = private->entries[cpu]; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 8b03028..227aa11 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -845,8 +845,13 @@ xt_replace_table(struct xt_table *table, return NULL; } - table->private = newinfo; newinfo->initial_entries = private->initial_entries; + /* + * Ensure contents of newinfo are visible before assigning to + * private. + */ + smp_wmb(); + table->private = newinfo; /* * Even though table entries have now been swapped, other CPU's -- cgit v0.10.2 From fecda03493646b53f53892fa3c38c75ba9310374 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 22 Oct 2013 18:34:56 +0200 Subject: net: sctp: fix ASCONF to allow non SCTP_ADDR_SRC addresses in ipv6 Commit 8a07eb0a50 ("sctp: Add ASCONF operation on the single-homed host") implemented possible use of IPv4 addresses with non SCTP_ADDR_SRC state as source address when sending ASCONF (ADD) packets, but IPv6 part for that was not implemented in 8a07eb0a50. Therefore, as this is not restricted to IPv4-only, fix this up to allow the same for IPv6 addresses in SCTP. Signed-off-by: Daniel Borkmann Cc: Michio Honda Acked-by: Michio Honda Acked-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e7b2d4f..96a5591 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -279,7 +279,9 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, sctp_v6_to_addr(&dst_saddr, &fl6->saddr, htons(bp->port)); rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { - if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC)) + if (!laddr->valid || laddr->state == SCTP_ADDR_DEL || + (laddr->state != SCTP_ADDR_SRC && + !asoc->src_out_of_asoc_ok)) continue; /* Do not compare against v4 addrs */ -- cgit v0.10.2 From 0e67d9903d71ce3c5889fa2e1788d4335794a0f6 Mon Sep 17 00:00:00 2001 From: Sebastian Siewior Date: Tue, 22 Oct 2013 20:36:25 +0200 Subject: net: wan: sbni: remove assembly crc32 code There is also a C function doing the same thing. Unless the asm code is 110% faster we could stick to the C function. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c index 5bbcb5e..388ddf6 100644 --- a/drivers/net/wan/sbni.c +++ b/drivers/net/wan/sbni.c @@ -148,10 +148,6 @@ static int enslave( struct net_device *, struct net_device * ); static int emancipate( struct net_device * ); #endif -#ifdef __i386__ -#define ASM_CRC 1 -#endif - static const char version[] = "Granch SBNI12 driver ver 5.0.1 Jun 22 2001 Denis I.Timofeev.\n"; @@ -1551,88 +1547,6 @@ __setup( "sbni=", sbni_setup ); /* -------------------------------------------------------------------------- */ -#ifdef ASM_CRC - -static u32 -calc_crc32( u32 crc, u8 *p, u32 len ) -{ - register u32 _crc; - _crc = crc; - - __asm__ __volatile__ ( - "xorl %%ebx, %%ebx\n" - "movl %2, %%esi\n" - "movl %3, %%ecx\n" - "movl $crc32tab, %%edi\n" - "shrl $2, %%ecx\n" - "jz 1f\n" - - ".align 4\n" - "0:\n" - "movb %%al, %%bl\n" - "movl (%%esi), %%edx\n" - "shrl $8, %%eax\n" - "xorb %%dl, %%bl\n" - "shrl $8, %%edx\n" - "xorl (%%edi,%%ebx,4), %%eax\n" - - "movb %%al, %%bl\n" - "shrl $8, %%eax\n" - "xorb %%dl, %%bl\n" - "shrl $8, %%edx\n" - "xorl (%%edi,%%ebx,4), %%eax\n" - - "movb %%al, %%bl\n" - "shrl $8, %%eax\n" - "xorb %%dl, %%bl\n" - "movb %%dh, %%dl\n" - "xorl (%%edi,%%ebx,4), %%eax\n" - - "movb %%al, %%bl\n" - "shrl $8, %%eax\n" - "xorb %%dl, %%bl\n" - "addl $4, %%esi\n" - "xorl (%%edi,%%ebx,4), %%eax\n" - - "decl %%ecx\n" - "jnz 0b\n" - - "1:\n" - "movl %3, %%ecx\n" - "andl $3, %%ecx\n" - "jz 2f\n" - - "movb %%al, %%bl\n" - "shrl $8, %%eax\n" - "xorb (%%esi), %%bl\n" - "xorl (%%edi,%%ebx,4), %%eax\n" - - "decl %%ecx\n" - "jz 2f\n" - - "movb %%al, %%bl\n" - "shrl $8, %%eax\n" - "xorb 1(%%esi), %%bl\n" - "xorl (%%edi,%%ebx,4), %%eax\n" - - "decl %%ecx\n" - "jz 2f\n" - - "movb %%al, %%bl\n" - "shrl $8, %%eax\n" - "xorb 2(%%esi), %%bl\n" - "xorl (%%edi,%%ebx,4), %%eax\n" - "2:\n" - : "=a" (_crc) - : "0" (_crc), "g" (p), "g" (len) - : "bx", "cx", "dx", "si", "di" - ); - - return _crc; -} - -#else /* ASM_CRC */ - static u32 calc_crc32( u32 crc, u8 *p, u32 len ) { @@ -1642,9 +1556,6 @@ calc_crc32( u32 crc, u8 *p, u32 len ) return crc; } -#endif /* ASM_CRC */ - - static u32 crc32tab[] __attribute__ ((aligned(8))) = { 0xD202EF8D, 0xA505DF1B, 0x3C0C8EA1, 0x4B0BBE37, 0xD56F2B94, 0xA2681B02, 0x3B614AB8, 0x4C667A2E, -- cgit v0.10.2 From 45e526e80e6fdc796d3bc05716d5c930a427df4d Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 23 Oct 2013 15:04:49 +0200 Subject: netconsole: fix NULL pointer dereference We need to disable the netconsole (enabled = 0) before setting nt->np.dev to NULL because otherwise we might still have users after the netpoll_cleanup() since nt->enabled is set afterwards and we can have a message which will result in a NULL pointer dereference. It is very easy to hit dereferences all over the netpoll_send_udp function by running the following two loops in parallel: while [ 1 ]; do echo 1 > enabled; echo 0 > enabled; done; while [ 1 ]; do echo 00:11:22:33:44:55 > remote_mac; done; (the second loop is to generate messages, it can be done by anything) We're safe to set nt->np.dev = NULL and nt->enabled = 0 with the spinlock since it's required in the write_msg() function. Signed-off-by: Nikolay Aleksandrov Reviewed-by: Veacelsav Falico Signed-off-by: David S. Miller diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index adeee61..1505dcb 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -310,6 +310,7 @@ static ssize_t store_enabled(struct netconsole_target *nt, const char *buf, size_t count) { + unsigned long flags; int enabled; int err; @@ -342,6 +343,13 @@ static ssize_t store_enabled(struct netconsole_target *nt, printk(KERN_INFO "netconsole: network logging started\n"); } else { /* 0 */ + /* We need to disable the netconsole before cleaning it up + * otherwise we might end up in write_msg() with + * nt->np.dev == NULL and nt->enabled == 1 + */ + spin_lock_irqsave(&target_list_lock, flags); + nt->enabled = 0; + spin_unlock_irqrestore(&target_list_lock, flags); netpoll_cleanup(&nt->np); } -- cgit v0.10.2 From c7c6effdeffcafd792b9f880ad52e48689eea4ad Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 24 Oct 2013 12:09:24 +0200 Subject: netconsole: fix multiple race conditions In every netconsole option that can be set through configfs there's a race when checking for nt->enabled since it can be modified at the same time. Probably the most damage can be done by store_enabled when racing with another instance of itself. Fix all the races with one stone by moving the mutex lock around the ->store call for all options. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 1505dcb..c9a1592 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -325,9 +325,7 @@ static ssize_t store_enabled(struct netconsole_target *nt, return -EINVAL; } - mutex_lock(&nt->mutex); if (enabled) { /* 1 */ - /* * Skip netpoll_parse_options() -- all the attributes are * already configured via configfs. Just print them out. @@ -335,13 +333,10 @@ static ssize_t store_enabled(struct netconsole_target *nt, netpoll_print_options(&nt->np); err = netpoll_setup(&nt->np); - if (err) { - mutex_unlock(&nt->mutex); + if (err) return err; - } printk(KERN_INFO "netconsole: network logging started\n"); - } else { /* 0 */ /* We need to disable the netconsole before cleaning it up * otherwise we might end up in write_msg() with @@ -354,7 +349,6 @@ static ssize_t store_enabled(struct netconsole_target *nt, } nt->enabled = enabled; - mutex_unlock(&nt->mutex); return strnlen(buf, count); } @@ -571,8 +565,10 @@ static ssize_t netconsole_target_attr_store(struct config_item *item, struct netconsole_target_attr *na = container_of(attr, struct netconsole_target_attr, attr); + mutex_lock(&nt->mutex); if (na->store) ret = na->store(nt, buf, count); + mutex_unlock(&nt->mutex); return ret; } -- cgit v0.10.2 From 8fb479a47c869820966e7298f38038aa334d889c Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Wed, 23 Oct 2013 23:36:30 +0200 Subject: netpoll: fix rx_hook() interface by passing the skb Right now skb->data is passed to rx_hook() even if the skb has not been linearised and without giving rx_hook() a way to linearise it. Change the rx_hook() interface and make it accept the skb and the offset to the UDP payload as arguments. rx_hook() is also renamed to rx_skb_hook() to ensure that out of the tree users notice the API change. In this way any rx_skb_hook() implementation can perform all the needed operations to properly (and safely) access the skb data. Signed-off-by: Antonio Quartulli Signed-off-by: David S. Miller diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index f3c7c24..fbfdb9d 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -24,7 +24,8 @@ struct netpoll { struct net_device *dev; char dev_name[IFNAMSIZ]; const char *name; - void (*rx_hook)(struct netpoll *, int, char *, int); + void (*rx_skb_hook)(struct netpoll *np, int source, struct sk_buff *skb, + int offset, int len); union inet_addr local_ip, remote_ip; bool ipv6; @@ -41,7 +42,7 @@ struct netpoll_info { unsigned long rx_flags; spinlock_t rx_lock; struct semaphore dev_lock; - struct list_head rx_np; /* netpolls that registered an rx_hook */ + struct list_head rx_np; /* netpolls that registered an rx_skb_hook */ struct sk_buff_head neigh_tx; /* list of neigh requests to reply to */ struct sk_buff_head txq; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index fc75c9e..8f97199 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -636,8 +636,9 @@ static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo netpoll_send_skb(np, send_skb); - /* If there are several rx_hooks for the same address, - we're fine by sending a single reply */ + /* If there are several rx_skb_hooks for the same + * address we're fine by sending a single reply + */ break; } spin_unlock_irqrestore(&npinfo->rx_lock, flags); @@ -719,8 +720,9 @@ static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo netpoll_send_skb(np, send_skb); - /* If there are several rx_hooks for the same address, - we're fine by sending a single reply */ + /* If there are several rx_skb_hooks for the same + * address, we're fine by sending a single reply + */ break; } spin_unlock_irqrestore(&npinfo->rx_lock, flags); @@ -756,11 +758,12 @@ static bool pkt_is_ns(struct sk_buff *skb) int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) { - int proto, len, ulen; - int hits = 0; + int proto, len, ulen, data_len; + int hits = 0, offset; const struct iphdr *iph; struct udphdr *uh; struct netpoll *np, *tmp; + uint16_t source; if (list_empty(&npinfo->rx_np)) goto out; @@ -820,7 +823,10 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) len -= iph->ihl*4; uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); + offset = (unsigned char *)(uh + 1) - skb->data; ulen = ntohs(uh->len); + data_len = skb->len - offset; + source = ntohs(uh->source); if (ulen != len) goto out; @@ -834,9 +840,7 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) if (np->local_port && np->local_port != ntohs(uh->dest)) continue; - np->rx_hook(np, ntohs(uh->source), - (char *)(uh+1), - ulen - sizeof(struct udphdr)); + np->rx_skb_hook(np, source, skb, offset, data_len); hits++; } } else { @@ -859,7 +863,10 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; uh = udp_hdr(skb); + offset = (unsigned char *)(uh + 1) - skb->data; ulen = ntohs(uh->len); + data_len = skb->len - offset; + source = ntohs(uh->source); if (ulen != skb->len) goto out; if (udp6_csum_init(skb, uh, IPPROTO_UDP)) @@ -872,9 +879,7 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) if (np->local_port && np->local_port != ntohs(uh->dest)) continue; - np->rx_hook(np, ntohs(uh->source), - (char *)(uh+1), - ulen - sizeof(struct udphdr)); + np->rx_skb_hook(np, source, skb, offset, data_len); hits++; } #endif @@ -1062,7 +1067,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) npinfo->netpoll = np; - if (np->rx_hook) { + if (np->rx_skb_hook) { spin_lock_irqsave(&npinfo->rx_lock, flags); npinfo->rx_flags |= NETPOLL_RX_ENABLED; list_add_tail(&np->rx, &npinfo->rx_np); -- cgit v0.10.2 From 01ba16d6ec85a1ec4669c75513a76b61ec53ee50 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 24 Oct 2013 10:14:27 +0200 Subject: ipv6: reset dst.expires value when clearing expire flag On receiving a packet too big icmp error we update the expire value by calling rt6_update_expires. This function uses dst_set_expires which is implemented that it can only reduce the expiration value of the dst entry. If we insert new routing non-expiry information into the ipv6 fib where we already have a matching rt6_info we only clear the RTF_EXPIRES flag in rt6i_flags and leave the dst.expires value as is. When new mtu information arrives for that cached dst_entry we again call dst_set_expires. This time it won't update the dst.expire value because we left the dst.expire value intact from the last update. So dst_set_expires won't touch dst.expires. Fix this by resetting dst.expires when clearing the RTF_EXPIRE flag. dst_set_expires checks for a zero expiration and updates the dst.expires. In the past this (not updating dst.expires) was necessary because dst.expire was placed in a union with the dst_entry *from reference and rt6_clean_expires did assign NULL to it. This split happend in ecd9883724b78cc72ed92c98bcb1a46c764fff21 ("ipv6: fix race condition regarding dst->expires and dst->from"). Reported-by: Steinar H. Gunderson Reported-by: Valentijn Sessink Cc: YOSHIFUJI Hideaki Acked-by: Eric Dumazet Tested-by: Valentijn Sessink Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 48ec25a..5e661a9 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -165,6 +165,7 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) static inline void rt6_clean_expires(struct rt6_info *rt) { rt->rt6i_flags &= ~RTF_EXPIRES; + rt->dst.expires = 0; } static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires) -- cgit v0.10.2 From e3bc10bd95d7fcc3f2ac690c6ff22833ea6781d6 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 24 Oct 2013 07:48:24 +0200 Subject: ipv6: ip6_dst_check needs to check for expired dst_entries On receiving a packet too big icmp error we check if our current cached dst_entry in the socket is still valid. This validation check did not care about the expiration of the (cached) route. The error path I traced down: The socket receives a packet too big mtu notification. It still has a valid dst_entry and thus issues the ip6_rt_pmtu_update on this dst_entry, setting RTF_EXPIRE and updates the dst.expiration value (which could fail because of not up-to-date expiration values, see previous patch). In some seldom cases we race with a) the ip6_fib gc or b) another routing lookup which would result in a recreation of the cached rt6_info from its parent non-cached rt6_info. While copying the rt6_info we reinitialize the metrics store by copying it over from the parent thus invalidating the just installed pmtu update (both dsts use the same key to the inetpeer storage). The dst_entry with the just invalidated metrics data would just get its RTF_EXPIRES flag cleared and would continue to stay valid for the socket. We should have not issued the pmtu update on the already expired dst_entry in the first placed. By checking the expiration on the dst entry and doing a relookup in case it is out of date we close the race because we would install a new rt6_info into the fib before we issue the pmtu update, thus closing this race. Not reliably updating the dst.expire value was fixed by the patch "ipv6: reset dst.expires value when clearing expire flag". Reported-by: Steinar H. Gunderson Reported-by: Valentijn Sessink Cc: YOSHIFUJI Hideaki Signed-off-by: Hannes Frederic Sowa Reviewed-by: Eric Dumazet Tested-by: Valentijn Sessink Signed-off-by: David S. Miller diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f54e3a1..04e17b3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1087,10 +1087,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev))) return NULL; - if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) - return dst; + if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + return NULL; - return NULL; + if (rt6_check_expired(rt)) + return NULL; + + return dst; } static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) -- cgit v0.10.2 From 4c8e84b22aa1bfee40b047d7810ba08615235c05 Mon Sep 17 00:00:00 2001 From: Freddy Xin Date: Thu, 24 Oct 2013 14:58:25 +0800 Subject: ax88179_178a: Remove AX_MEDIUM_ALWAYS_ONE bit in AX_MEDIUM_STATUS_MODE register to avoid TX throttling Remove AX_MEDIUM_ALWAYS_ONE in AX_MEDIUM_STATUS_MODE register. Setting this bit may cause TX throttling in Half-Duplex mode. Signed-off-by: Freddy Xin Signed-off-by: David S. Miller diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 846cc19..8e8d0fc 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -78,7 +78,6 @@ #define AX_MEDIUM_STATUS_MODE 0x22 #define AX_MEDIUM_GIGAMODE 0x01 #define AX_MEDIUM_FULL_DUPLEX 0x02 - #define AX_MEDIUM_ALWAYS_ONE 0x04 #define AX_MEDIUM_EN_125MHZ 0x08 #define AX_MEDIUM_RXFLOW_CTRLEN 0x10 #define AX_MEDIUM_TXFLOW_CTRLEN 0x20 @@ -1065,8 +1064,8 @@ static int ax88179_bind(struct usbnet *dev, struct usb_interface *intf) /* Configure default medium type => giga */ *tmp16 = AX_MEDIUM_RECEIVE_EN | AX_MEDIUM_TXFLOW_CTRLEN | - AX_MEDIUM_RXFLOW_CTRLEN | AX_MEDIUM_ALWAYS_ONE | - AX_MEDIUM_FULL_DUPLEX | AX_MEDIUM_GIGAMODE; + AX_MEDIUM_RXFLOW_CTRLEN | AX_MEDIUM_FULL_DUPLEX | + AX_MEDIUM_GIGAMODE; ax88179_write_cmd(dev, AX_ACCESS_MAC, AX_MEDIUM_STATUS_MODE, 2, 2, tmp16); @@ -1225,7 +1224,7 @@ static int ax88179_link_reset(struct usbnet *dev) } mode = AX_MEDIUM_RECEIVE_EN | AX_MEDIUM_TXFLOW_CTRLEN | - AX_MEDIUM_RXFLOW_CTRLEN | AX_MEDIUM_ALWAYS_ONE; + AX_MEDIUM_RXFLOW_CTRLEN; ax88179_read_cmd(dev, AX_ACCESS_MAC, PHYSICAL_LINK_STATUS, 1, 1, &link_sts); @@ -1339,8 +1338,8 @@ static int ax88179_reset(struct usbnet *dev) /* Configure default medium type => giga */ *tmp16 = AX_MEDIUM_RECEIVE_EN | AX_MEDIUM_TXFLOW_CTRLEN | - AX_MEDIUM_RXFLOW_CTRLEN | AX_MEDIUM_ALWAYS_ONE | - AX_MEDIUM_FULL_DUPLEX | AX_MEDIUM_GIGAMODE; + AX_MEDIUM_RXFLOW_CTRLEN | AX_MEDIUM_FULL_DUPLEX | + AX_MEDIUM_GIGAMODE; ax88179_write_cmd(dev, AX_ACCESS_MAC, AX_MEDIUM_STATUS_MODE, 2, 2, tmp16); -- cgit v0.10.2 From 598c45b309eb401510653fed45fe74efae93be4e Mon Sep 17 00:00:00 2001 From: Shahed Shaikh Date: Fri, 25 Oct 2013 10:38:36 -0400 Subject: qlcnic: Do not force adapter to perform LRO without destination IP check Forcing adapter to perform LRO without destination IP check degrades the performance. Signed-off-by: Shahed Shaikh Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c index f8adc7b..b64e2be 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c @@ -785,8 +785,6 @@ void qlcnic_82xx_config_intr_coalesce(struct qlcnic_adapter *adapter) #define QLCNIC_ENABLE_IPV4_LRO 1 #define QLCNIC_ENABLE_IPV6_LRO 2 -#define QLCNIC_NO_DEST_IPV4_CHECK (1 << 8) -#define QLCNIC_NO_DEST_IPV6_CHECK (2 << 8) int qlcnic_82xx_config_hw_lro(struct qlcnic_adapter *adapter, int enable) { @@ -806,11 +804,10 @@ int qlcnic_82xx_config_hw_lro(struct qlcnic_adapter *adapter, int enable) word = 0; if (enable) { - word = QLCNIC_ENABLE_IPV4_LRO | QLCNIC_NO_DEST_IPV4_CHECK; + word = QLCNIC_ENABLE_IPV4_LRO; if (adapter->ahw->extra_capability[0] & QLCNIC_FW_CAP2_HW_LRO_IPV6) - word |= QLCNIC_ENABLE_IPV6_LRO | - QLCNIC_NO_DEST_IPV6_CHECK; + word |= QLCNIC_ENABLE_IPV6_LRO; } req.words[0] = cpu_to_le64(word); -- cgit v0.10.2 From d6994ca798f5897a4342f727b21d77e01d92f093 Mon Sep 17 00:00:00 2001 From: Shahed Shaikh Date: Fri, 25 Oct 2013 10:38:37 -0400 Subject: qlcnic: Do not read QLCNIC_FW_CAPABILITY_MORE_CAPS bit for 83xx adapter Only 82xx adapter advertises QLCNIC_FW_CAPABILITY_MORE_CAPS bit. Reading this bit from 83xx adapter causes the driver to skip extra capabilities registers. Because of this, driver was not issuing qlcnic_fw_cmd_set_drv_version() for 83xx adapter. This bug was introduced in commit 8af3f33db05c6d0146ad14905145a5c923770856 ("qlcnic: Add support for 'set driver version' in 83XX"). Signed-off-by: Shahed Shaikh Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c index 3ca00e0..ace217c 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c @@ -2276,9 +2276,9 @@ int qlcnic_83xx_get_nic_info(struct qlcnic_adapter *adapter, temp = (cmd.rsp.arg[8] & 0x7FFE0000) >> 17; npar_info->max_linkspeed_reg_offset = temp; } - if (npar_info->capabilities & QLCNIC_FW_CAPABILITY_MORE_CAPS) - memcpy(ahw->extra_capability, &cmd.rsp.arg[16], - sizeof(ahw->extra_capability)); + + memcpy(ahw->extra_capability, &cmd.rsp.arg[16], + sizeof(ahw->extra_capability)); out: qlcnic_free_mbx_args(&cmd); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 9e61eb8..d8f4897e 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -1131,7 +1131,10 @@ qlcnic_initialize_nic(struct qlcnic_adapter *adapter) if (err == -EIO) return err; adapter->ahw->extra_capability[0] = temp; + } else { + adapter->ahw->extra_capability[0] = 0; } + adapter->ahw->max_mac_filters = nic_info.max_mac_filters; adapter->ahw->max_mtu = nic_info.max_mtu; @@ -2159,8 +2162,7 @@ void qlcnic_set_drv_version(struct qlcnic_adapter *adapter) else if (qlcnic_83xx_check(adapter)) fw_cmd = QLCNIC_CMD_83XX_SET_DRV_VER; - if ((ahw->capabilities & QLCNIC_FW_CAPABILITY_MORE_CAPS) && - (ahw->extra_capability[0] & QLCNIC_FW_CAPABILITY_SET_DRV_VER)) + if (ahw->extra_capability[0] & QLCNIC_FW_CAPABILITY_SET_DRV_VER) qlcnic_fw_cmd_set_drv_version(adapter, fw_cmd); } -- cgit v0.10.2 From e9e2a904ef0a4f46ee5c845f3ae04e62b917bb6d Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Thu, 24 Oct 2013 14:37:53 +0530 Subject: be2net: Warn users of possible broken functionality on BE2 cards with very old FW versions with latest driver On very old FW versions < 4.0, the mailbox command to set interrupts on the card succeeds even though it is not supported and should have failed, leading to a scenario where interrupts do not work. Hence warn users to upgrade to a suitable FW version to avoid seeing broken functionality. Signed-off-by: Somnath Kotur Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index db02023..c99dac6 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -696,6 +696,15 @@ static inline int qnq_async_evt_rcvd(struct be_adapter *adapter) return adapter->flags & BE_FLAGS_QNQ_ASYNC_EVT_RCVD; } +static inline int fw_major_num(const char *fw_ver) +{ + int fw_major = 0; + + sscanf(fw_ver, "%d.", &fw_major); + + return fw_major; +} + extern void be_cq_notify(struct be_adapter *adapter, u16 qid, bool arm, u16 num_popped); extern void be_link_status_update(struct be_adapter *adapter, u8 link_status); diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 2c38cc4..53ed58b 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -3247,6 +3247,12 @@ static int be_setup(struct be_adapter *adapter) be_cmd_get_fw_ver(adapter, adapter->fw_ver, adapter->fw_on_flash); + if (BE2_chip(adapter) && fw_major_num(adapter->fw_ver) < 4) { + dev_err(dev, "Firmware on card is old(%s), IRQs may not work.", + adapter->fw_ver); + dev_err(dev, "Please upgrade firmware to version >= 4.0\n"); + } + if (adapter->vlans_added) be_vid_config(adapter); -- cgit v0.10.2 From bc15afa39ecc16f01c3389d15d8f6015a427fe85 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 24 Oct 2013 08:44:25 -0700 Subject: tcp: fix SYNACK RTT estimation in Fast Open tp->lsndtime may not always be the SYNACK timestamp if a passive Fast Open socket sends data before handshake completes. And if the remote acknowledges both the data and the SYNACK, the RTT sample is already taken in tcp_ack(), so no need to call tcp_update_ack_rtt() in tcp_synack_rtt_meas() aagain. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a16b01b..305cd05 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2871,14 +2871,19 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, } /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ -static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) +static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) { struct tcp_sock *tp = tcp_sk(sk); s32 seq_rtt = -1; - if (tp->lsndtime && !tp->total_retrans) - seq_rtt = tcp_time_stamp - tp->lsndtime; - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); + if (synack_stamp && !tp->total_retrans) + seq_rtt = tcp_time_stamp - synack_stamp; + + /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets + * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() + */ + if (!tp->srtt) + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); } static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) @@ -5587,6 +5592,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct request_sock *req; int queued = 0; bool acceptable; + u32 synack_stamp; tp->rx_opt.saw_tstamp = 0; @@ -5669,9 +5675,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * so release it. */ if (req) { + synack_stamp = tcp_rsk(req)->snt_synack; tp->total_retrans = req->num_retrans; reqsk_fastopen_remove(sk, req, false); } else { + synack_stamp = tp->lsndtime; /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); tcp_init_congestion_control(sk); @@ -5694,7 +5702,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_synack_rtt_meas(sk, req); + tcp_synack_rtt_meas(sk, synack_stamp); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; -- cgit v0.10.2 From 2909d874f34eae157aecab0af27c6dc4a1751f8f Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 24 Oct 2013 08:55:25 -0700 Subject: tcp: only take RTT from timestamps if new data is acked Patch ed08495c3 "tcp: use RTT from SACK for RTO" has a bug that it does not check if the ACK acknowledge new data before taking the RTT sample from TCP timestamps. This patch adds the check back as required by the RFC. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 305cd05..6ffe41a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2856,7 +2856,8 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. */ - if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + flag & FLAG_ACKED) seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; if (seq_rtt < 0) -- cgit v0.10.2 From 2f715c1dde6e1760f3101358dc26f8c9489be0bf Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 24 Oct 2013 08:59:27 -0700 Subject: tcp: do not rearm RTO when future data are sacked Patch ed08495c3 "tcp: use RTT from SACK for RTO" always re-arms RTO upon obtaining a RTT sample from newly sacked data. But technically RTO should only be re-armed when the data sent before the last (re)transmission of write queue head are (s)acked. Otherwise the RTO may continue to extend during loss recovery on data sent in the future. Note that RTTs from ACK or timestamps do not have this problem, as the RTT source must be from data sent before. The new RTO re-arm policy is 1) Always re-arm RTO if SND.UNA is advanced 2) Re-arm RTO if sack RTT is available, provided the sacked data was sent before the last time write_queue_head was sent. Signed-off-by: Larry Brakmo Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6ffe41a..068c8fb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2987,6 +2987,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, s32 seq_rtt = -1; s32 ca_seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); + bool rtt_update; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); @@ -3063,14 +3064,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) || - (flag & FLAG_ACKED)) - tcp_rearm_rto(sk); + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt); if (flag & FLAG_ACKED) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + tcp_rearm_rto(sk); if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); @@ -3109,6 +3109,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ca_ops->pkts_acked(sk, pkts_acked, rtt_us); } + } else if (skb && rtt_update && sack_rtt >= 0 && + sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) { + /* Do not re-arm RTO if the sack RTT is measured from data sent + * after when the head was last (re)transmitted. Otherwise the + * timeout may continue to extend in loss recovery. + */ + tcp_rearm_rto(sk); } #if FASTRETRANS_DEBUG > 0 -- cgit v0.10.2 From ad86de802d0ea6776eccd0a2526ba31101d89267 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 24 Oct 2013 18:56:57 -0700 Subject: Documentation/networking: netdev-FAQ typo corrections Various typo fixes to netdev-FAQ.txt: - capitalize Linux - hyphenate dual-word adjectives - minor punctuation fixes Signed-off-by: Randy Dunlap Cc: Paul Gortmaker Acked-by: Paul Gortmaker Signed-off-by: David S. Miller diff --git a/Documentation/networking/netdev-FAQ.txt b/Documentation/networking/netdev-FAQ.txt index d9112f0..3a2c586 100644 --- a/Documentation/networking/netdev-FAQ.txt +++ b/Documentation/networking/netdev-FAQ.txt @@ -4,23 +4,23 @@ Information you need to know about netdev Q: What is netdev? -A: It is a mailing list for all network related linux stuff. This includes +A: It is a mailing list for all network-related Linux stuff. This includes anything found under net/ (i.e. core code like IPv6) and drivers/net - (i.e. hardware specific drivers) in the linux source tree. + (i.e. hardware specific drivers) in the Linux source tree. Note that some subsystems (e.g. wireless drivers) which have a high volume of traffic have their own specific mailing lists. - The netdev list is managed (like many other linux mailing lists) through + The netdev list is managed (like many other Linux mailing lists) through VGER ( http://vger.kernel.org/ ) and archives can be found below: http://marc.info/?l=linux-netdev http://www.spinics.net/lists/netdev/ - Aside from subsystems like that mentioned above, all network related linux - development (i.e. RFC, review, comments, etc) takes place on netdev. + Aside from subsystems like that mentioned above, all network-related Linux + development (i.e. RFC, review, comments, etc.) takes place on netdev. -Q: How do the changes posted to netdev make their way into linux? +Q: How do the changes posted to netdev make their way into Linux? A: There are always two trees (git repositories) in play. Both are driven by David Miller, the main network maintainer. There is the "net" tree, @@ -35,7 +35,7 @@ A: There are always two trees (git repositories) in play. Both are driven Q: How often do changes from these trees make it to the mainline Linus tree? A: To understand this, you need to know a bit of background information - on the cadence of linux development. Each new release starts off with + on the cadence of Linux development. Each new release starts off with a two week "merge window" where the main maintainers feed their new stuff to Linus for merging into the mainline tree. After the two weeks, the merge window is closed, and it is called/tagged "-rc1". No new @@ -46,7 +46,7 @@ A: To understand this, you need to know a bit of background information things are in a state of churn), and a week after the last vX.Y-rcN was done, the official "vX.Y" is released. - Relating that to netdev: At the beginning of the 2 week merge window, + Relating that to netdev: At the beginning of the 2-week merge window, the net-next tree will be closed - no new changes/features. The accumulated new content of the past ~10 weeks will be passed onto mainline/Linus via a pull request for vX.Y -- at the same time, @@ -59,12 +59,12 @@ A: To understand this, you need to know a bit of background information IMPORTANT: Do not send new net-next content to netdev during the period during which net-next tree is closed. - Shortly after the two weeks have passed, (and vX.Y-rc1 is released) the + Shortly after the two weeks have passed (and vX.Y-rc1 is released), the tree for net-next reopens to collect content for the next (vX.Y+1) release. If you aren't subscribed to netdev and/or are simply unsure if net-next has re-opened yet, simply check the net-next git repository link above for - any new networking related commits. + any new networking-related commits. The "net" tree continues to collect fixes for the vX.Y content, and is fed back to Linus at regular (~weekly) intervals. Meaning that the @@ -217,7 +217,7 @@ A: Attention to detail. Re-read your own work as if you were the to why it happens, and then if necessary, explain why the fix proposed is the best way to get things done. Don't mangle whitespace, and as is common, don't mis-indent function arguments that span multiple lines. - If it is your 1st patch, mail it to yourself so you can test apply + If it is your first patch, mail it to yourself so you can test apply it to an unpatched tree to confirm infrastructure didn't mangle it. Finally, go back and read Documentation/SubmittingPatches to be -- cgit v0.10.2 From fc59d5bdf1e3dca0336d155e55d812286db075ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 27 Oct 2013 16:26:43 -0700 Subject: pkt_sched: fq: clear time_next_packet for reused flows When a socket is freed/reallocated, we need to clear time_next_packet or else we can inherit a prior value and delay first packets of the new flow. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index a9dfdda..fdc041c 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -255,6 +255,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) f->socket_hash != sk->sk_hash)) { f->credit = q->initial_quantum; f->socket_hash = sk->sk_hash; + f->time_next_packet = 0ULL; } return f; } -- cgit v0.10.2 From eeb1b73378b560e00ff1da2ef09fed9254f4e128 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 25 Oct 2013 10:21:32 +0200 Subject: xfrm: Increase the garbage collector threshold With the removal of the routing cache, we lost the option to tweak the garbage collector threshold along with the maximum routing cache size. So git commit 703fb94ec ("xfrm: Fix the gc threshold value for ipv4") moved back to a static threshold. It turned out that the current threshold before we start garbage collecting is much to small for some workloads, so increase it from 1024 to 32768. This means that we start the garbage collector if we have more than 32768 dst entries in the system and refuse new allocations if we are above 65536. Reported-by: Wolfgang Walter Signed-off-by: Steffen Klassert diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index ccde542..4764ee4 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -236,7 +236,7 @@ static struct dst_ops xfrm4_dst_ops = { .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, - .gc_thresh = 1024, + .gc_thresh = 32768, }; static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 08ed277..dd503a3 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -285,7 +285,7 @@ static struct dst_ops xfrm6_dst_ops = { .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, - .gc_thresh = 1024, + .gc_thresh = 32768, }; static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { -- cgit v0.10.2 From 0d08c42cf9a71530fef5ebcfe368f38f2dd0476f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Oct 2013 17:26:17 -0700 Subject: tcp: gso: fix truesize tracking commit 6ff50cd55545 ("tcp: gso: do not generate out of order packets") had an heuristic that can trigger a warning in skb_try_coalesce(), because skb->truesize of the gso segments were exactly set to mss. This breaks the requirement that skb->truesize >= skb->len + truesizeof(struct sk_buff); It can trivially be reproduced by : ifconfig lo mtu 1500 ethtool -K lo tso off netperf As the skbs are looped into the TCP networking stack, skb_try_coalesce() warns us of these skb under-estimating their truesize. Signed-off-by: Eric Dumazet Reported-by: Alexei Starovoitov Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 3a7525e..533c58a 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -18,6 +18,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int sum_truesize = 0; struct tcphdr *th; unsigned int thlen; unsigned int seq; @@ -102,13 +103,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, if (copy_destructor) { skb->destructor = gso_skb->destructor; skb->sk = gso_skb->sk; - /* {tcp|sock}_wfree() use exact truesize accounting : - * sum(skb->truesize) MUST be exactly be gso_skb->truesize - * So we account mss bytes of 'true size' for each segment. - * The last segment will contain the remaining. - */ - skb->truesize = mss; - gso_skb->truesize -= mss; + sum_truesize += skb->truesize; } skb = skb->next; th = tcp_hdr(skb); @@ -125,7 +120,9 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, if (copy_destructor) { swap(gso_skb->sk, skb->sk); swap(gso_skb->destructor, skb->destructor); - swap(gso_skb->truesize, skb->truesize); + sum_truesize += skb->truesize; + atomic_add(sum_truesize - gso_skb->truesize, + &skb->sk->sk_wmem_alloc); } delta = htonl(oldlen + (skb_tail_pointer(skb) - -- cgit v0.10.2 From e3ed4eaef4932fd3867465784d11a36deaa6d22c Mon Sep 17 00:00:00 2001 From: Dmitry Kravkov Date: Sun, 27 Oct 2013 13:07:00 +0200 Subject: bnx2x: prevent FW assert on low mem during unload Buffers for FW statistics were allocated at an inappropriate time; In a machine where the driver encounters problems allocating all of its queues, the driver would still create FW requests for the statistics of the non-existing queues. The wrong order of memory allocation could lead to zeroed statistics messages being sent, leading to fw assert in case function 0 was down. This changes the order of allocations, guaranteeing that statistic requests will only be generated for actual queues. Signed-off-by: Dmitry Kravkov Signed-off-by: Yuval Mintz Signed-off-by: Ariel Elior Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 4ab4c89..74d6486 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -2545,10 +2545,6 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode) } } - /* Allocated memory for FW statistics */ - if (bnx2x_alloc_fw_stats_mem(bp)) - LOAD_ERROR_EXIT(bp, load_error0); - /* need to be done after alloc mem, since it's self adjusting to amount * of memory available for RSS queues */ @@ -2558,6 +2554,10 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode) LOAD_ERROR_EXIT(bp, load_error0); } + /* Allocated memory for FW statistics */ + if (bnx2x_alloc_fw_stats_mem(bp)) + LOAD_ERROR_EXIT(bp, load_error0); + /* request pf to initialize status blocks */ if (IS_VF(bp)) { rc = bnx2x_vfpf_init(bp); @@ -2812,8 +2812,8 @@ load_error1: if (IS_PF(bp)) bnx2x_clear_pf_load(bp); load_error0: - bnx2x_free_fp_mem(bp); bnx2x_free_fw_stats_mem(bp); + bnx2x_free_fp_mem(bp); bnx2x_free_mem(bp); return rc; -- cgit v0.10.2 From 826cb7b43b5bd8995f84edeacbbf569946a58f7c Mon Sep 17 00:00:00 2001 From: Ariel Elior Date: Sun, 27 Oct 2013 13:07:01 +0200 Subject: bnx2x: Disable VF access on PF removal When the bnx2x driver is rmmoded, if VFs of a given PF will be assigned to a VM then that PF will be unable to call `pci_disable_sriov()'. If for that same PF there would also exist unassigned VFs in the hypervisor, the result will be that after the removal there will still be virtual PCI functions on the hypervisor. If the bnx2x module were to be re-inserted, the result will be that the VFs on the hypervisor will be re-probed directly following the PF's probe, even though that in regular loading flow sriov is only enabled once PF is loaded. The probed VF will then try to access its bar, causing a PCI error as the HW is not in a state enabling such a request. This patch adds a missing disablement procedure to the PF's removal, one that sets registers viewable to the VF to indicate that the VFs have no permission to access the bar, thus resulting in probe errors instead of PCI errors. Signed-off-by: Ariel Elior Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index bf08ad6..5e07efb 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -2018,6 +2018,8 @@ failed: void bnx2x_iov_remove_one(struct bnx2x *bp) { + int vf_idx; + /* if SRIOV is not enabled there's nothing to do */ if (!IS_SRIOV(bp)) return; @@ -2026,6 +2028,18 @@ void bnx2x_iov_remove_one(struct bnx2x *bp) pci_disable_sriov(bp->pdev); DP(BNX2X_MSG_IOV, "sriov disabled\n"); + /* disable access to all VFs */ + for (vf_idx = 0; vf_idx < bp->vfdb->sriov.total; vf_idx++) { + bnx2x_pretend_func(bp, + HW_VF_HANDLE(bp, + bp->vfdb->sriov.first_vf_in_pf + + vf_idx)); + DP(BNX2X_MSG_IOV, "disabling internal access for vf %d\n", + bp->vfdb->sriov.first_vf_in_pf + vf_idx); + bnx2x_vf_enable_internal(bp, 0); + bnx2x_pretend_func(bp, BP_ABS_FUNC(bp)); + } + /* free vf database */ __bnx2x_iov_free_vfdb(bp); } @@ -3197,7 +3211,7 @@ int bnx2x_enable_sriov(struct bnx2x *bp) * the "acquire" messages to appear on the VF PF channel. */ DP(BNX2X_MSG_IOV, "about to call enable sriov\n"); - pci_disable_sriov(bp->pdev); + bnx2x_disable_sriov(bp); rc = pci_enable_sriov(bp->pdev, req_vfs); if (rc) { BNX2X_ERR("pci_enable_sriov failed with %d\n", rc); -- cgit v0.10.2 From 262e827fe745642589450ae241b7afd3912c3f25 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 27 Oct 2013 21:02:39 +0000 Subject: cxgb3: Fix length calculation in write_ofld_wr() on 32-bit architectures The length calculation here is now invalid on 32-bit architectures, since sk_buff::tail is a pointer and sk_buff::transport_header is an integer offset: drivers/net/ethernet/chelsio/cxgb3/sge.c: In function 'write_ofld_wr': drivers/net/ethernet/chelsio/cxgb3/sge.c:1603:9: warning: passing argument 4 of 'make_sgl' makes integer from pointer without a cast [enabled by default] adap->pdev); ^ drivers/net/ethernet/chelsio/cxgb3/sge.c:964:28: note: expected 'unsigned int' but argument is of type 'sk_buff_data_t' static inline unsigned int make_sgl(const struct sk_buff *skb, ^ Use the appropriate skb accessor functions. Compile-tested only. Signed-off-by: Ben Hutchings Fixes: 1a37e412a022 ('net: Use 16bits for *_headers fields of struct skbuff') Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c index 9c89dc8..632b318 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c @@ -1599,7 +1599,8 @@ static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb, flits = skb_transport_offset(skb) / 8; sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl; sgl_flits = make_sgl(skb, sgp, skb_transport_header(skb), - skb->tail - skb->transport_header, + skb_tail_pointer(skb) - + skb_transport_header(skb), adap->pdev); if (need_skb_unmap()) { setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits); -- cgit v0.10.2 From 059dfa6a93b779516321e5112db9d7621b1367ba Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 28 Oct 2013 12:07:57 +0000 Subject: xen-netback: use jiffies_64 value to calculate credit timeout time_after_eq() only works if the delta is < MAX_ULONG/2. For a 32bit Dom0, if netfront sends packets at a very low rate, the time between subsequent calls to tx_credit_exceeded() may exceed MAX_ULONG/2 and the test for timer_after_eq() will be incorrect. Credit will not be replenished and the guest may become unable to send packets (e.g., if prior to the long gap, all credit was exhausted). Use jiffies_64 variant to mitigate this problem for 32bit Dom0. Suggested-by: Jan Beulich Signed-off-by: Wei Liu Reviewed-by: David Vrabel Cc: Ian Campbell Cc: Jason Luan Signed-off-by: David S. Miller diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 5715318..400fea1 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -163,6 +163,7 @@ struct xenvif { unsigned long credit_usec; unsigned long remaining_credit; struct timer_list credit_timeout; + u64 credit_window_start; /* Statistics */ unsigned long rx_gso_checksum_fixup; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 01bb854..459935a 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -312,8 +312,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, vif->credit_bytes = vif->remaining_credit = ~0UL; vif->credit_usec = 0UL; init_timer(&vif->credit_timeout); - /* Initialize 'expires' now: it's used to track the credit window. */ - vif->credit_timeout.expires = jiffies; + vif->credit_window_start = get_jiffies_64(); dev->netdev_ops = &xenvif_netdev_ops; dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO; diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index f3e591c..900da4b 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1185,9 +1185,8 @@ out: static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) { - unsigned long now = jiffies; - unsigned long next_credit = - vif->credit_timeout.expires + + u64 now = get_jiffies_64(); + u64 next_credit = vif->credit_window_start + msecs_to_jiffies(vif->credit_usec / 1000); /* Timer could already be pending in rare cases. */ @@ -1195,8 +1194,8 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) return true; /* Passed the point where we can replenish credit? */ - if (time_after_eq(now, next_credit)) { - vif->credit_timeout.expires = now; + if (time_after_eq64(now, next_credit)) { + vif->credit_window_start = now; tx_add_credit(vif); } @@ -1208,6 +1207,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) tx_credit_callback; mod_timer(&vif->credit_timeout, next_credit); + vif->credit_window_start = next_credit; return true; } -- cgit v0.10.2 From d954777324ffcba0b2f8119c102237426c654eeb Mon Sep 17 00:00:00 2001 From: Holger Eitzenberger Date: Mon, 28 Oct 2013 14:42:33 +0100 Subject: netfilter: xt_NFQUEUE: fix --queue-bypass regression V3 of the NFQUEUE target ignores the --queue-bypass flag, causing packets to be dropped when the userspace listener isn't running. Regression is in since 8746ddcf12bb26 ("netfilter: xt_NFQUEUE: introduce CPU fanout"). Reported-by: Florian Westphal Signed-off-by: Holger Eitzenberger Signed-off-by: Pablo Neira Ayuso diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 1e2fae3..ed00fef 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -147,6 +147,7 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v3 *info = par->targinfo; u32 queue = info->queuenum; + int ret; if (info->queues_total > 1) { if (info->flags & NFQ_FLAG_CPU_FANOUT) { @@ -157,7 +158,11 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par) queue = nfqueue_hash(skb, par); } - return NF_QUEUE_NR(queue); + ret = NF_QUEUE_NR(queue); + if (info->flags & NFQ_FLAG_BYPASS) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + + return ret; } static struct xt_target nfqueue_tg_reg[] __read_mostly = { -- cgit v0.10.2 From 706e282b6975305285980ddd903bcd246d2e4fcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Dr=C3=BCing?= Date: Mon, 28 Oct 2013 18:33:12 +0100 Subject: net: x25: Fix dead URLs in Kconfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the URLs in the Kconfig file to the new pages at sangoma.com and cisco.com Signed-off-by: Michael Drüing Signed-off-by: David S. Miller diff --git a/net/x25/Kconfig b/net/x25/Kconfig index c959312c..e2fa133 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -16,8 +16,8 @@ config X25 if you want that) and the lower level data link layer protocol LAPB (say Y to "LAPB Data Link Driver" below if you want that). - You can read more about X.25 at and - . + You can read more about X.25 at and + . Information about X.25 for Linux is contained in the files and . -- cgit v0.10.2 From 06499098a02b9ed906a7b6060f2c60fb813918d4 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Mon, 28 Oct 2013 15:45:07 -0400 Subject: bridge: pass correct vlan id to multicast code Currently multicast code attempts to extrace the vlan id from the skb even when vlan filtering is disabled. This can lead to mdb entries being created with the wrong vlan id. Pass the already extracted vlan id to the multicast filtering code to make the correct id is used in creation as well as lookup. Signed-off-by: Vlad Yasevich Acked-by: Toshiaki Makita Signed-off-by: David S. Miller diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index ca04163..e6b7fec 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -64,7 +64,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) br_flood_deliver(br, skb, false); goto out; } - if (br_multicast_rcv(br, NULL, skb)) { + if (br_multicast_rcv(br, NULL, skb, vid)) { kfree_skb(skb); goto out; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index a2fd37e..7e73c32 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -80,7 +80,7 @@ int br_handle_frame_finish(struct sk_buff *skb) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid); if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) && - br_multicast_rcv(br, p, skb)) + br_multicast_rcv(br, p, skb, vid)) goto drop; if (p->state == BR_STATE_LEARNING) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 8b0b610..686284f 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -947,7 +947,8 @@ void br_multicast_disable_port(struct net_bridge_port *port) static int br_ip4_multicast_igmp3_report(struct net_bridge *br, struct net_bridge_port *port, - struct sk_buff *skb) + struct sk_buff *skb, + u16 vid) { struct igmpv3_report *ih; struct igmpv3_grec *grec; @@ -957,12 +958,10 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, int type; int err = 0; __be32 group; - u16 vid = 0; if (!pskb_may_pull(skb, sizeof(*ih))) return -EINVAL; - br_vlan_get_tag(skb, &vid); ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); len = sizeof(*ih); @@ -1005,7 +1004,8 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_mld2_report(struct net_bridge *br, struct net_bridge_port *port, - struct sk_buff *skb) + struct sk_buff *skb, + u16 vid) { struct icmp6hdr *icmp6h; struct mld2_grec *grec; @@ -1013,12 +1013,10 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, int len; int num; int err = 0; - u16 vid = 0; if (!pskb_may_pull(skb, sizeof(*icmp6h))) return -EINVAL; - br_vlan_get_tag(skb, &vid); icmp6h = icmp6_hdr(skb); num = ntohs(icmp6h->icmp6_dataun.un_data16[1]); len = sizeof(*icmp6h); @@ -1141,7 +1139,8 @@ static void br_multicast_query_received(struct net_bridge *br, static int br_ip4_multicast_query(struct net_bridge *br, struct net_bridge_port *port, - struct sk_buff *skb) + struct sk_buff *skb, + u16 vid) { const struct iphdr *iph = ip_hdr(skb); struct igmphdr *ih = igmp_hdr(skb); @@ -1153,7 +1152,6 @@ static int br_ip4_multicast_query(struct net_bridge *br, unsigned long now = jiffies; __be32 group; int err = 0; - u16 vid = 0; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || @@ -1189,7 +1187,6 @@ static int br_ip4_multicast_query(struct net_bridge *br, if (!group) goto out; - br_vlan_get_tag(skb, &vid); mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group, vid); if (!mp) goto out; @@ -1219,7 +1216,8 @@ out: #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_query(struct net_bridge *br, struct net_bridge_port *port, - struct sk_buff *skb) + struct sk_buff *skb, + u16 vid) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct mld_msg *mld; @@ -1231,7 +1229,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, unsigned long now = jiffies; const struct in6_addr *group = NULL; int err = 0; - u16 vid = 0; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || @@ -1265,7 +1262,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, if (!group) goto out; - br_vlan_get_tag(skb, &vid); mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group, vid); if (!mp) goto out; @@ -1439,7 +1435,8 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, static int br_multicast_ipv4_rcv(struct net_bridge *br, struct net_bridge_port *port, - struct sk_buff *skb) + struct sk_buff *skb, + u16 vid) { struct sk_buff *skb2 = skb; const struct iphdr *iph; @@ -1447,7 +1444,6 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, unsigned int len; unsigned int offset; int err; - u16 vid = 0; /* We treat OOM as packet loss for now. */ if (!pskb_may_pull(skb, sizeof(*iph))) @@ -1508,7 +1504,6 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = 0; - br_vlan_get_tag(skb2, &vid); BR_INPUT_SKB_CB(skb)->igmp = 1; ih = igmp_hdr(skb2); @@ -1519,10 +1514,10 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_add_group(br, port, ih->group, vid); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: - err = br_ip4_multicast_igmp3_report(br, port, skb2); + err = br_ip4_multicast_igmp3_report(br, port, skb2, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - err = br_ip4_multicast_query(br, port, skb2); + err = br_ip4_multicast_query(br, port, skb2, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(br, port, ih->group, vid); @@ -1540,7 +1535,8 @@ err_out: #if IS_ENABLED(CONFIG_IPV6) static int br_multicast_ipv6_rcv(struct net_bridge *br, struct net_bridge_port *port, - struct sk_buff *skb) + struct sk_buff *skb, + u16 vid) { struct sk_buff *skb2; const struct ipv6hdr *ip6h; @@ -1550,7 +1546,6 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, unsigned int len; int offset; int err; - u16 vid = 0; if (!pskb_may_pull(skb, sizeof(*ip6h))) return -EINVAL; @@ -1640,7 +1635,6 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, err = 0; - br_vlan_get_tag(skb, &vid); BR_INPUT_SKB_CB(skb)->igmp = 1; switch (icmp6_type) { @@ -1657,10 +1651,10 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, break; } case ICMPV6_MLD2_REPORT: - err = br_ip6_multicast_mld2_report(br, port, skb2); + err = br_ip6_multicast_mld2_report(br, port, skb2, vid); break; case ICMPV6_MGM_QUERY: - err = br_ip6_multicast_query(br, port, skb2); + err = br_ip6_multicast_query(br, port, skb2, vid); break; case ICMPV6_MGM_REDUCTION: { @@ -1681,7 +1675,7 @@ out: #endif int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, - struct sk_buff *skb) + struct sk_buff *skb, u16 vid) { BR_INPUT_SKB_CB(skb)->igmp = 0; BR_INPUT_SKB_CB(skb)->mrouters_only = 0; @@ -1691,10 +1685,10 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, switch (skb->protocol) { case htons(ETH_P_IP): - return br_multicast_ipv4_rcv(br, port, skb); + return br_multicast_ipv4_rcv(br, port, skb, vid); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - return br_multicast_ipv6_rcv(br, port, skb); + return br_multicast_ipv6_rcv(br, port, skb, vid); #endif } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e14c33b..2e8244e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -451,7 +451,8 @@ extern int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __us extern unsigned int br_mdb_rehash_seq; extern int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, - struct sk_buff *skb); + struct sk_buff *skb, + u16 vid); extern struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, struct sk_buff *skb, u16 vid); extern void br_multicast_add_port(struct net_bridge_port *port); @@ -522,7 +523,8 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br, #else static inline int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, - struct sk_buff *skb) + struct sk_buff *skb, + u16 vid) { return 0; } -- cgit v0.10.2 From ec9debbd9a88d8ea86c488d6ffcac419ee7d46d9 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 29 Oct 2013 15:11:07 +0800 Subject: virtio-net: correctly handle cpu hotplug notifier during resuming commit 3ab098df35f8b98b6553edc2e40234af512ba877 (virtio-net: don't respond to cpu hotplug notifier if we're not ready) tries to bypass the cpu hotplug notifier by checking the config_enable and does nothing is it was false. So it need to try to hold the config_lock mutex which may happen in atomic environment which leads the following warnings: [ 622.944441] CPU0 attaching NULL sched-domain. [ 622.944446] CPU1 attaching NULL sched-domain. [ 622.944485] CPU0 attaching NULL sched-domain. [ 622.950795] BUG: sleeping function called from invalid context at kernel/mutex.c:616 [ 622.950796] in_atomic(): 1, irqs_disabled(): 1, pid: 10, name: migration/1 [ 622.950796] no locks held by migration/1/10. [ 622.950798] CPU: 1 PID: 10 Comm: migration/1 Not tainted 3.12.0-rc5-wl-01249-gb91e82d #317 [ 622.950799] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 622.950802] 0000000000000000 ffff88001d42dba0 ffffffff81a32f22 ffff88001bfb9c70 [ 622.950803] ffff88001d42dbb0 ffffffff810edb02 ffff88001d42dc38 ffffffff81a396ed [ 622.950805] 0000000000000046 ffff88001d42dbe8 ffffffff810e861d 0000000000000000 [ 622.950805] Call Trace: [ 622.950810] [] dump_stack+0x54/0x74 [ 622.950815] [] __might_sleep+0x112/0x114 [ 622.950817] [] mutex_lock_nested+0x3c/0x3c6 [ 622.950818] [] ? up+0x39/0x3e [ 622.950821] [] ? acpi_os_signal_semaphore+0x21/0x2d [ 622.950824] [] ? acpi_ut_release_mutex+0x5e/0x62 [ 622.950828] [] virtnet_cpu_callback+0x33/0x87 [ 622.950830] [] notifier_call_chain+0x3c/0x5e [ 622.950832] [] __raw_notifier_call_chain+0xe/0x10 [ 622.950835] [] __cpu_notify+0x20/0x37 [ 622.950836] [] cpu_notify+0x13/0x15 [ 622.950838] [] take_cpu_down+0x27/0x3a [ 622.950841] [] stop_machine_cpu_stop+0x93/0xf1 [ 622.950842] [] cpu_stopper_thread+0xa0/0x12f [ 622.950844] [] ? cpu_stopper_thread+0x12f/0x12f [ 622.950847] [] ? lock_release_holdtime.part.7+0xa3/0xa8 [ 622.950848] [] ? cpu_stop_should_run+0x3f/0x47 [ 622.950850] [] smpboot_thread_fn+0x1c5/0x1e3 [ 622.950852] [] ? lg_global_unlock+0x67/0x67 [ 622.950854] [] kthread+0xd8/0xe0 [ 622.950857] [] ? wait_for_common+0x12f/0x164 [ 622.950859] [] ? kthread_create_on_node+0x124/0x124 [ 622.950861] [] ret_from_fork+0x7c/0xb0 [ 622.950862] [] ? kthread_create_on_node+0x124/0x124 [ 622.950876] smpboot: CPU 1 is now offline [ 623.194556] SMP alternatives: lockdep: fixing up alternatives [ 623.194559] smpboot: Booting Node 0 Processor 1 APIC 0x1 ... A correct fix is to unregister the hotcpu notifier during restore and register a new one in resume. Reported-by: Fengguang Wu Tested-by: Fengguang Wu Cc: Wanlong Gao Cc: Rusty Russell Cc: Michael S. Tsirkin Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Reviewed-by: Wanlong Gao Signed-off-by: David S. Miller diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 9fbdfcd..bbc9cb8 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1118,11 +1118,6 @@ static int virtnet_cpu_callback(struct notifier_block *nfb, { struct virtnet_info *vi = container_of(nfb, struct virtnet_info, nb); - mutex_lock(&vi->config_lock); - - if (!vi->config_enable) - goto done; - switch(action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: case CPU_DOWN_FAILED: @@ -1136,8 +1131,6 @@ static int virtnet_cpu_callback(struct notifier_block *nfb, break; } -done: - mutex_unlock(&vi->config_lock); return NOTIFY_OK; } @@ -1699,6 +1692,8 @@ static int virtnet_freeze(struct virtio_device *vdev) struct virtnet_info *vi = vdev->priv; int i; + unregister_hotcpu_notifier(&vi->nb); + /* Prevent config work handler from accessing the device */ mutex_lock(&vi->config_lock); vi->config_enable = false; @@ -1747,6 +1742,10 @@ static int virtnet_restore(struct virtio_device *vdev) virtnet_set_queues(vi, vi->curr_queue_pairs); rtnl_unlock(); + err = register_hotcpu_notifier(&vi->nb); + if (err) + return err; + return 0; } #endif -- cgit v0.10.2 From b4dfd326c29c241c2bb8463167217eb2438b7c3d Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 30 Oct 2013 10:50:37 +1100 Subject: ibm emac: Don't call napi_complete if napi_reschedule failed This patch fixes a bug which would trigger the BUG_ON() at net/core/dev.c:4156. It was found that this was due to continuing processing in the current poll call even when the call to napi_reschedule failed, indicating the device was already on the polling list. This resulted in an extra call to napi_complete which triggered the BUG_ON(). This patch ensures that we only contine processing rotting packets in the current mal_poll call if we are not already on the polling list. Signed-off-by: Alistair Popple Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c index dac564c..909f9b6 100644 --- a/drivers/net/ethernet/ibm/emac/mal.c +++ b/drivers/net/ethernet/ibm/emac/mal.c @@ -442,15 +442,11 @@ static int mal_poll(struct napi_struct *napi, int budget) if (unlikely(mc->ops->peek_rx(mc->dev) || test_bit(MAL_COMMAC_RX_STOPPED, &mc->flags))) { MAL_DBG2(mal, "rotting packet" NL); - if (napi_reschedule(napi)) - mal_disable_eob_irq(mal); - else - MAL_DBG2(mal, "already in poll list" NL); - - if (budget > 0) - goto again; - else + if (!napi_reschedule(napi)) goto more_work; + + mal_disable_eob_irq(mal); + goto again; } mc->ops->poll_tx(mc->dev); } -- cgit v0.10.2 From 32663b8b8948cc05f812ab82c1c7db2db3ddf717 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 30 Oct 2013 10:50:38 +1100 Subject: ibm emac: Fix locking for enable/disable eob irq Calls to mal_enable_eob_irq perform a read-write-modify of a dcr to enable device irqs which is protected by a spin lock. However calls to mal_disable_eob_irq do not take the corresponding lock. This patch resolves the problem by ensuring that calls to mal_disable_eob_irq also take the lock. Signed-off-by: Alistair Popple Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c index 909f9b6..e784751 100644 --- a/drivers/net/ethernet/ibm/emac/mal.c +++ b/drivers/net/ethernet/ibm/emac/mal.c @@ -263,7 +263,9 @@ static inline void mal_schedule_poll(struct mal_instance *mal) { if (likely(napi_schedule_prep(&mal->napi))) { MAL_DBG2(mal, "schedule_poll" NL); + spin_lock(&mal->lock); mal_disable_eob_irq(mal); + spin_unlock(&mal->lock); __napi_schedule(&mal->napi); } else MAL_DBG2(mal, "already in poll" NL); @@ -445,7 +447,9 @@ static int mal_poll(struct napi_struct *napi, int budget) if (!napi_reschedule(napi)) goto more_work; + spin_lock_irqsave(&mal->lock, flags); mal_disable_eob_irq(mal); + spin_unlock_irqrestore(&mal->lock, flags); goto again; } mc->ops->poll_tx(mc->dev); -- cgit v0.10.2 From b757a62e9f0df5c997c666dca4ad81197b5d8917 Mon Sep 17 00:00:00 2001 From: Nathan Hintz Date: Tue, 29 Oct 2013 19:32:01 -0700 Subject: bgmac: don't update slot on skb alloc/dma mapping error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't update the slot in "bgmac_dma_rx_skb_for_slot" unless both the skb alloc and dma mapping are successful; and free the newly allocated skb if a dma mapping error occurs. This relieves the caller of the need to deduce/execute the appropriate cleanup action required when an error occurs. Signed-off-by: Nathan Hintz Acked-by: Rafał Miłecki Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index 249468f..9e8a3e0 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -244,25 +244,33 @@ static int bgmac_dma_rx_skb_for_slot(struct bgmac *bgmac, struct bgmac_slot_info *slot) { struct device *dma_dev = bgmac->core->dma_dev; + struct sk_buff *skb; + dma_addr_t dma_addr; struct bgmac_rx_header *rx; /* Alloc skb */ - slot->skb = netdev_alloc_skb(bgmac->net_dev, BGMAC_RX_BUF_SIZE); - if (!slot->skb) + skb = netdev_alloc_skb(bgmac->net_dev, BGMAC_RX_BUF_SIZE); + if (!skb) return -ENOMEM; /* Poison - if everything goes fine, hardware will overwrite it */ - rx = (struct bgmac_rx_header *)slot->skb->data; + rx = (struct bgmac_rx_header *)skb->data; rx->len = cpu_to_le16(0xdead); rx->flags = cpu_to_le16(0xbeef); /* Map skb for the DMA */ - slot->dma_addr = dma_map_single(dma_dev, slot->skb->data, - BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE); - if (dma_mapping_error(dma_dev, slot->dma_addr)) { + dma_addr = dma_map_single(dma_dev, skb->data, + BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE); + if (dma_mapping_error(dma_dev, dma_addr)) { bgmac_err(bgmac, "DMA mapping error\n"); + dev_kfree_skb(skb); return -ENOMEM; } + + /* Update the slot */ + slot->skb = skb; + slot->dma_addr = dma_addr; + if (slot->dma_addr & 0xC0000000) bgmac_warn(bgmac, "DMA address using 0xC0000000 bit(s), it may need translation trick\n"); -- cgit v0.10.2 From c17cb8b55b104c549aa20a72fa44141ad2c65ec2 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 30 Oct 2013 16:46:15 +0900 Subject: doc:net: Fix typo in Documentation/networking Correct spelling typo in Documentation/networking Signed-off-by: Masanari Iida Acked-by: Randy Dunlap Signed-off-by: David S. Miller diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index d718bc2..bf5dbe3 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -18,8 +18,8 @@ Introduction Datagram Congestion Control Protocol (DCCP) is an unreliable, connection oriented protocol designed to solve issues present in UDP and TCP, particularly for real-time and multimedia (streaming) traffic. -It divides into a base protocol (RFC 4340) and plugable congestion control -modules called CCIDs. Like plugable TCP congestion control, at least one CCID +It divides into a base protocol (RFC 4340) and pluggable congestion control +modules called CCIDs. Like pluggable TCP congestion control, at least one CCID needs to be enabled in order for the protocol to function properly. In the Linux implementation, this is the TCP-like CCID2 (RFC 4341). Additional CCIDs, such as the TCP-friendly CCID3 (RFC 4342), are optional. diff --git a/Documentation/networking/e100.txt b/Documentation/networking/e100.txt index 13a3212..f862cf3 100644 --- a/Documentation/networking/e100.txt +++ b/Documentation/networking/e100.txt @@ -103,7 +103,7 @@ Additional Configurations PRO/100 Family of Adapters is e100. As an example, if you install the e100 driver for two PRO/100 adapters - (eth0 and eth1), add the following to a configuraton file in /etc/modprobe.d/ + (eth0 and eth1), add the following to a configuration file in /etc/modprobe.d/ alias eth0 e100 alias eth1 e100 diff --git a/Documentation/networking/ieee802154.txt b/Documentation/networking/ieee802154.txt index 09eb573..22bbc72 100644 --- a/Documentation/networking/ieee802154.txt +++ b/Documentation/networking/ieee802154.txt @@ -4,7 +4,7 @@ Introduction ============ -The IEEE 802.15.4 working group focuses on standartization of bottom +The IEEE 802.15.4 working group focuses on standardization of bottom two layers: Medium Access Control (MAC) and Physical (PHY). And there are mainly two options available for upper layers: - ZigBee - proprietary protocol from ZigBee Alliance @@ -66,7 +66,7 @@ net_device, with .type = ARPHRD_IEEE802154. Data is exchanged with socket family code via plain sk_buffs. On skb reception skb->cb must contain additional info as described in the struct ieee802154_mac_cb. During packet transmission the skb->cb is used to provide additional data to device's header_ops->create -function. Be aware, that this data can be overriden later (when socket code +function. Be aware that this data can be overridden later (when socket code submits skb to qdisc), so if you need something from that cb later, you should store info in the skb->data on your own. diff --git a/Documentation/networking/l2tp.txt b/Documentation/networking/l2tp.txt index e63fc1f..c74434d 100644 --- a/Documentation/networking/l2tp.txt +++ b/Documentation/networking/l2tp.txt @@ -197,7 +197,7 @@ state information because the file format is subject to change. It is implemented to provide extra debug information to help diagnose problems.) Users should use the netlink API. -/proc/net/pppol2tp is also provided for backwards compaibility with +/proc/net/pppol2tp is also provided for backwards compatibility with the original pppol2tp driver. It lists information about L2TPv2 tunnels and sessions only. Its use is discouraged. diff --git a/Documentation/networking/netdev-FAQ.txt b/Documentation/networking/netdev-FAQ.txt index 3a2c586..0fe1c6e 100644 --- a/Documentation/networking/netdev-FAQ.txt +++ b/Documentation/networking/netdev-FAQ.txt @@ -68,7 +68,7 @@ A: To understand this, you need to know a bit of background information The "net" tree continues to collect fixes for the vX.Y content, and is fed back to Linus at regular (~weekly) intervals. Meaning that the - focus for "net" is on stablilization and bugfixes. + focus for "net" is on stabilization and bugfixes. Finally, the vX.Y gets released, and the whole cycle starts over. diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt index 5333788..b261229 100644 --- a/Documentation/networking/netlink_mmap.txt +++ b/Documentation/networking/netlink_mmap.txt @@ -45,7 +45,7 @@ processing. Conversion of the reception path involves calling poll() on the file descriptor, once the socket is readable the frames from the ring are -processsed in order until no more messages are available, as indicated by +processed in order until no more messages are available, as indicated by a status word in the frame header. On kernel side, in order to make use of memory mapped I/O on receive, the @@ -56,7 +56,7 @@ Dumps of kernel databases automatically support memory mapped I/O. Conversion of the transmit path involves changing message construction to use memory from the TX ring instead of (usually) a buffer declared on the -stack and setting up the frame header approriately. Optionally poll() can +stack and setting up the frame header appropriately. Optionally poll() can be used to wait for free frames in the TX ring. Structured and definitions for using memory mapped I/O are contained in @@ -231,7 +231,7 @@ Ring setup: if (setsockopt(fd, NETLINK_TX_RING, &req, sizeof(req)) < 0) exit(1) - /* Calculate size of each invididual ring */ + /* Calculate size of each individual ring */ ring_size = req.nm_block_nr * req.nm_block_size; /* Map RX/TX rings. The TX ring is located after the RX ring */ diff --git a/Documentation/networking/operstates.txt b/Documentation/networking/operstates.txt index 9769457..355c6d8 100644 --- a/Documentation/networking/operstates.txt +++ b/Documentation/networking/operstates.txt @@ -89,8 +89,8 @@ packets. The name 'carrier' and the inversion are historical, think of it as lower layer. Note that for certain kind of soft-devices, which are not managing any -real hardware, there is possible to set this bit from userpsace. -One should use TVL IFLA_CARRIER to do so. +real hardware, it is possible to set this bit from userspace. One +should use TVL IFLA_CARRIER to do so. netif_carrier_ok() can be used to query that bit. diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 60d05eb..b89bc82e 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -144,7 +144,7 @@ An overview of the RxRPC protocol: (*) Calls use ACK packets to handle reliability. Data packets are also explicitly sequenced per call. - (*) There are two types of positive acknowledgement: hard-ACKs and soft-ACKs. + (*) There are two types of positive acknowledgment: hard-ACKs and soft-ACKs. A hard-ACK indicates to the far side that all the data received to a point has been received and processed; a soft-ACK indicates that the data has been received but may yet be discarded and re-requested. The sender may diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt index 457b8bb..cdd916d 100644 --- a/Documentation/networking/stmmac.txt +++ b/Documentation/networking/stmmac.txt @@ -160,7 +160,7 @@ Where: o pmt: core has the embedded power module (optional). o force_sf_dma_mode: force DMA to use the Store and Forward mode instead of the Threshold. - o force_thresh_dma_mode: force DMA to use the Shreshold mode other than + o force_thresh_dma_mode: force DMA to use the Threshold mode other than the Store and Forward mode. o riwt_off: force to disable the RX watchdog feature and switch to NAPI mode. o fix_mac_speed: this callback is used for modifying some syscfg registers @@ -175,7 +175,7 @@ Where: registers. o custom_cfg/custom_data: this is a custom configuration that can be passed while initializing the resources. - o bsp_priv: another private poiter. + o bsp_priv: another private pointer. For MDIO bus The we have: @@ -271,7 +271,7 @@ reset procedure etc). o dwmac1000_dma.c: dma functions for the GMAC chip; o dwmac1000.h: specific header file for the GMAC; o dwmac100_core: MAC 100 core and dma code; - o dwmac100_dma.c: dma funtions for the MAC chip; + o dwmac100_dma.c: dma functions for the MAC chip; o dwmac1000.h: specific header file for the MAC; o dwmac_lib.c: generic DMA functions shared among chips; o enh_desc.c: functions for handling enhanced descriptors; @@ -364,4 +364,4 @@ Auto-negotiated Link Parter Ability. 10) TODO: o XGMAC is not supported. o Complete the TBI & RTBI support. - o extened VLAN support for 3.70a SYNP GMAC. + o extend VLAN support for 3.70a SYNP GMAC. diff --git a/Documentation/networking/vortex.txt b/Documentation/networking/vortex.txt index 9a8041d..97282da 100644 --- a/Documentation/networking/vortex.txt +++ b/Documentation/networking/vortex.txt @@ -68,7 +68,7 @@ Module parameters There are several parameters which may be provided to the driver when its module is loaded. These are usually placed in /etc/modprobe.d/*.conf -configuretion files. Example: +configuration files. Example: options 3c59x debug=3 rx_copybreak=300 @@ -178,7 +178,7 @@ max_interrupt_work=N The driver's interrupt service routine can handle many receive and transmit packets in a single invocation. It does this in a loop. - The value of max_interrupt_work governs how mnay times the interrupt + The value of max_interrupt_work governs how many times the interrupt service routine will loop. The default value is 32 loops. If this is exceeded the interrupt service routine gives up and generates a warning message "eth0: Too much work in interrupt". diff --git a/Documentation/networking/x25-iface.txt b/Documentation/networking/x25-iface.txt index 78f662e..7f213b5 100644 --- a/Documentation/networking/x25-iface.txt +++ b/Documentation/networking/x25-iface.txt @@ -105,7 +105,7 @@ reduced by the following measures or a combination thereof: later. The lapb module interface was modified to support this. Its data_indication() method should now transparently pass the - netif_rx() return value to the (lapb mopdule) caller. + netif_rx() return value to the (lapb module) caller. (2) Drivers for kernel versions 2.2.x should always check the global variable netdev_dropping when a new frame is received. The driver should only call netif_rx() if netdev_dropping is zero. Otherwise -- cgit v0.10.2 From 5d0f801a2ccec3b1fdabc3392c8d99ed0413d216 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Mon, 28 Oct 2013 09:54:40 +0100 Subject: can: c_can: Fix RX message handling, handle lost message before EOB If we handle end of block messages with higher priority than a lost message, we can run into an endless interrupt loop. This is reproducable with a am335x processor and "cansequence -r" at 1Mbit. As soon as we loose a packet we can't escape from an interrupt loop. This patch fixes the problem by handling lost packets before EOB packets. Cc: linux-stable Signed-off-by: Markus Pargmann Signed-off-by: Marc Kleine-Budde diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c index a668cd4..e3fc07c 100644 --- a/drivers/net/can/c_can/c_can.c +++ b/drivers/net/can/c_can/c_can.c @@ -814,9 +814,6 @@ static int c_can_do_rx_poll(struct net_device *dev, int quota) msg_ctrl_save = priv->read_reg(priv, C_CAN_IFACE(MSGCTRL_REG, 0)); - if (msg_ctrl_save & IF_MCONT_EOB) - return num_rx_pkts; - if (msg_ctrl_save & IF_MCONT_MSGLST) { c_can_handle_lost_msg_obj(dev, 0, msg_obj); num_rx_pkts++; @@ -824,6 +821,9 @@ static int c_can_do_rx_poll(struct net_device *dev, int quota) continue; } + if (msg_ctrl_save & IF_MCONT_EOB) + return num_rx_pkts; + if (!(msg_ctrl_save & IF_MCONT_NEWDAT)) continue; -- cgit v0.10.2 From 896e23bd04ea50a146dffd342e2f96180f0812a5 Mon Sep 17 00:00:00 2001 From: Olivier Sobrie Date: Sun, 27 Oct 2013 22:07:53 +0100 Subject: can: kvaser_usb: fix usb endpoints detection Some devices, like the Kvaser Memorator Professional, have several bulk in endpoints. Only the first one found must be used by the driver. The same holds for the bulk out endpoint. The official Kvaser driver (leaf) was used as reference for this patch. Cc: linux-stable Signed-off-by: Olivier Sobrie Signed-off-by: Marc Kleine-Budde diff --git a/drivers/net/can/usb/kvaser_usb.c b/drivers/net/can/usb/kvaser_usb.c index 3b95465..4b2d5ed 100644 --- a/drivers/net/can/usb/kvaser_usb.c +++ b/drivers/net/can/usb/kvaser_usb.c @@ -1544,9 +1544,9 @@ static int kvaser_usb_init_one(struct usb_interface *intf, return 0; } -static void kvaser_usb_get_endpoints(const struct usb_interface *intf, - struct usb_endpoint_descriptor **in, - struct usb_endpoint_descriptor **out) +static int kvaser_usb_get_endpoints(const struct usb_interface *intf, + struct usb_endpoint_descriptor **in, + struct usb_endpoint_descriptor **out) { const struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *endpoint; @@ -1557,12 +1557,18 @@ static void kvaser_usb_get_endpoints(const struct usb_interface *intf, for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { endpoint = &iface_desc->endpoint[i].desc; - if (usb_endpoint_is_bulk_in(endpoint)) + if (!*in && usb_endpoint_is_bulk_in(endpoint)) *in = endpoint; - if (usb_endpoint_is_bulk_out(endpoint)) + if (!*out && usb_endpoint_is_bulk_out(endpoint)) *out = endpoint; + + /* use first bulk endpoint for in and out */ + if (*in && *out) + return 0; } + + return -ENODEV; } static int kvaser_usb_probe(struct usb_interface *intf, @@ -1576,8 +1582,8 @@ static int kvaser_usb_probe(struct usb_interface *intf, if (!dev) return -ENOMEM; - kvaser_usb_get_endpoints(intf, &dev->bulk_in, &dev->bulk_out); - if (!dev->bulk_in || !dev->bulk_out) { + err = kvaser_usb_get_endpoints(intf, &dev->bulk_in, &dev->bulk_out); + if (err) { dev_err(&intf->dev, "Cannot get usb endpoint(s)"); return err; } -- cgit v0.10.2 From 84502b5ef9849a9694673b15c31bd3ac693010ae Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 30 Oct 2013 11:16:28 +0100 Subject: xfrm: Fix null pointer dereference when decoding sessions On some codepaths the skb does not have a dst entry when xfrm_decode_session() is called. So check for a valid skb_dst() before dereferencing the device interface index. We use 0 as the device index if there is no valid skb_dst(), or at reverse decoding we use skb_iif as device interface index. Bug was introduced with git commit bafd4bd4dc ("xfrm: Decode sessions with output interface."). Reported-by: Meelis Roos Tested-by: Meelis Roos Signed-off-by: Steffen Klassert diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 4764ee4..e1a6393 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -104,10 +104,14 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) const struct iphdr *iph = ip_hdr(skb); u8 *xprth = skb_network_header(skb) + iph->ihl * 4; struct flowi4 *fl4 = &fl->u.ip4; + int oif = 0; + + if (skb_dst(skb)) + oif = skb_dst(skb)->dev->ifindex; memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; - fl4->flowi4_oif = skb_dst(skb)->dev->ifindex; + fl4->flowi4_oif = reverse ? skb->skb_iif : oif; if (!ip_is_fragment(iph)) { switch (iph->protocol) { diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index dd503a3..5f8e128 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -135,10 +135,14 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) struct ipv6_opt_hdr *exthdr; const unsigned char *nh = skb_network_header(skb); u8 nexthdr = nh[IP6CB(skb)->nhoff]; + int oif = 0; + + if (skb_dst(skb)) + oif = skb_dst(skb)->dev->ifindex; memset(fl6, 0, sizeof(struct flowi6)); fl6->flowi6_mark = skb->mark; - fl6->flowi6_oif = skb_dst(skb)->dev->ifindex; + fl6->flowi6_oif = reverse ? skb->skb_iif : oif; fl6->daddr = reverse ? hdr->saddr : hdr->daddr; fl6->saddr = reverse ? hdr->daddr : hdr->saddr; -- cgit v0.10.2 From 6f092343855a71e03b8d209815d8c45bf3a27fcd Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 1 Nov 2013 15:01:10 +0800 Subject: net: flow_dissector: fail on evil iph->ihl We don't validate iph->ihl which may lead a dead loop if we meet a IPIP skb whose iph->ihl is zero. Fix this by failing immediately when iph->ihl is evil (less than 5). This issue were introduced by commit ec5efe7946280d1e84603389a1030ccec0a767ae (rps: support IPIP encapsulation). Cc: Eric Dumazet Cc: Petr Matousek Cc: Michael S. Tsirkin Cc: Daniel Borkmann Signed-off-by: Jason Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 8d7d0dd..143b6fd 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -40,7 +40,7 @@ again: struct iphdr _iph; ip: iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (!iph) + if (!iph || iph->ihl < 5) return false; if (ip_is_fragment(iph)) -- cgit v0.10.2 From 7926c1d5be0b7cbe5b8d5c788d7d39237e7b212c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 31 Oct 2013 09:13:32 +0100 Subject: net: sctp: do not trigger BUG_ON in sctp_cmd_delete_tcb Introduced in f9e42b853523 ("net: sctp: sideeffect: throw BUG if primary_path is NULL"), we intended to find a buggy assoc that's part of the assoc hash table with a primary_path that is NULL. However, we better remove the BUG_ON for now and find a more suitable place to assert for these things as Mark reports that this also triggers the bug when duplication cookie processing happens, and the assoc is not part of the hash table (so all good in this case). Such a situation can for example easily be reproduced by: tc qdisc add dev eth0 root handle 1: prio bands 2 priomap 1 1 1 1 1 1 tc qdisc add dev eth0 parent 1:2 handle 20: netem loss 20% tc filter add dev eth0 protocol ip parent 1: prio 2 u32 match ip \ protocol 132 0xff match u8 0x0b 0xff at 32 flowid 1:2 This drops 20% of COOKIE-ACK packets. After some follow-up discussion with Vlad we came to the conclusion that for now we should still better remove this BUG_ON() assertion, and come up with two follow-ups later on, that is, i) find a more suitable place for this assertion, and possibly ii) have a special allocator/initializer for such kind of temporary assocs. Reported-by: Mark Thomas Signed-off-by: Vlad Yasevich Signed-off-by: Daniel Borkmann Acked-by: Neil Horman Signed-off-by: David S. Miller diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 666c668..1a6eef3 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -860,7 +860,6 @@ static void sctp_cmd_delete_tcb(sctp_cmd_seq_t *cmds, (!asoc->temp) && (sk->sk_shutdown != SHUTDOWN_MASK)) return; - BUG_ON(asoc->peer.primary_path == NULL); sctp_unhash_established(asoc); sctp_association_free(asoc); } -- cgit v0.10.2 From c32b7dfbb1dfb3f0a68f250deff65103c8bb704a Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Sun, 3 Nov 2013 10:04:07 +0200 Subject: net/mlx4_core: Fix call to __mlx4_unregister_mac In function mlx4_master_deactivate_admin_state() __mlx4_unregister_mac was called using the MAC index. It should be called with the value of the MAC itself. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index ea20182..bb11624 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1691,7 +1691,7 @@ static void mlx4_master_deactivate_admin_state(struct mlx4_priv *priv, int slave vp_oper->vlan_idx = NO_INDX; } if (NO_INDX != vp_oper->mac_idx) { - __mlx4_unregister_mac(&priv->dev, port, vp_oper->mac_idx); + __mlx4_unregister_mac(&priv->dev, port, vp_oper->state.mac); vp_oper->mac_idx = NO_INDX; } } -- cgit v0.10.2