From 444346385249603f8970314dcdb058d84a2530a4 Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Tue, 7 Apr 2015 11:32:55 -0700 Subject: i40e: stop VF rings Explicitly stop the rings belonging to each VF when disabling SR-IOV. Even though the VFs were gone, and the associated VSIs were removed, the rings were not stopped, and in some circumstances the hardware would continue to access the memory formerly used by the rings, causing memory corruption or DMAR errors, both of which would lead to general malaise of the kernel. To relieve this condition, explicitly stop all the rings associated with each VF before releasing its resources. Signed-off-by: Mitch Williams Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 4d69e1f..9b3fc83 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -733,6 +733,11 @@ void i40e_free_vfs(struct i40e_pf *pf) while (test_and_set_bit(__I40E_VF_DISABLE, &pf->state)) usleep_range(1000, 2000); + for (i = 0; i < pf->num_alloc_vfs; i++) + if (test_bit(I40E_VF_STAT_INIT, &pf->vf[i].vf_states)) + i40e_vsi_control_rings(pf->vsi[pf->vf[i].lan_vsi_idx], + false); + /* Disable IOV before freeing resources. This lets any VF drivers * running in the host get themselves cleaned up before we yank * the carpet out from underneath their feet. -- cgit v0.10.2 From 2619ef475f7607f1cf1c434d4adca2593a59a6e6 Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Tue, 7 Apr 2015 19:45:30 -0400 Subject: i40evf: fix bad indentation Not sure how this slipped through. Cosmetic change only. Signed-off-by: Mitch Williams Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 6d5f3b2..a31bfd1 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -2029,7 +2029,7 @@ static void i40evf_init_task(struct work_struct *work) if (err) { dev_err(&pdev->dev, "Failed to set MAC type (%d)\n", err); - goto err; + goto err; } err = i40evf_check_reset_complete(hw); if (err) { -- cgit v0.10.2 From e7c8c60bc5d48994a67e4b1c7bfb01d6979dbc54 Mon Sep 17 00:00:00 2001 From: Anjali Singhai Jain Date: Tue, 7 Apr 2015 19:45:31 -0400 Subject: i40e: Add support to program FDir SB rules for VF from PF through ethtool With this patch we can now add Flow director Sideband rules for a VF from it's PF. Here is an example on how it can be done when VF id = 5 and queue = 2: "ethtool -N ethx flow-type udp4 src-ip x.x.x.x dst-ip y.y.y.y src-port p1 dst-port p2 action 2 user-def 5" User-def specifies VF id and action specifies queue. 
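In short, the PF-side handling added below works like this: the ethtool user-def word carries the VF id, which is validated and mapped to that VF's VSI, and the requested action (queue) is checked against the VF's queue count. A minimal sketch follows, with struct and field names taken from the diff; the helper name and the simplified error returns are assumptions for illustration, not part of the applied patch.

/* Illustrative sketch only, not the applied hunk. */
static int fdir_apply_vf_userdef_sketch(struct i40e_pf *pf,
                                        struct ethtool_rx_flow_spec *fsp,
                                        struct i40e_fdir_filter *input)
{
        u16 vf_id;

        if (!ntohl(fsp->m_ext.data[1]))
                return 0;                       /* no VF requested */

        vf_id = ntohl(fsp->h_ext.data[1]);
        if (vf_id >= pf->num_alloc_vfs)
                return -EINVAL;                 /* VF id out of range */

        /* steer the rule to the VF's VSI and validate the queue index */
        input->dest_vsi = pf->vf[vf_id].lan_vsi_id;
        if (input->q_index >= pf->vf[vf_id].num_queue_pairs)
                return -EINVAL;                 /* queue not valid for this VF */

        return 0;
}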
Change-ID: Ib37d6dff3823a4d85caffde638473891c38c2b89 Signed-off-by: Anjali Singhai Jain Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index c848b18..7d60587 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -1914,6 +1914,16 @@ static int i40e_get_ethtool_fdir_entry(struct i40e_pf *pf, else fsp->ring_cookie = rule->q_index; + if (rule->dest_vsi != pf->vsi[pf->lan_vsi]->id) { + struct i40e_vsi *vsi; + + vsi = i40e_find_vsi_from_id(pf, rule->dest_vsi); + if (vsi && vsi->type == I40E_VSI_SRIOV) { + fsp->h_ext.data[1] = htonl(vsi->vf_id); + fsp->m_ext.data[1] = htonl(0x1); + } + } + return 0; } @@ -2207,6 +2217,7 @@ static int i40e_add_fdir_ethtool(struct i40e_vsi *vsi, struct i40e_fdir_filter *input; struct i40e_pf *pf; int ret = -EINVAL; + u16 vf_id; if (!vsi) return -EINVAL; @@ -2267,7 +2278,22 @@ static int i40e_add_fdir_ethtool(struct i40e_vsi *vsi, input->dst_ip[0] = fsp->h_u.tcp_ip4_spec.ip4src; input->src_ip[0] = fsp->h_u.tcp_ip4_spec.ip4dst; + if (ntohl(fsp->m_ext.data[1])) { + if (ntohl(fsp->h_ext.data[1]) >= pf->num_alloc_vfs) { + netif_info(pf, drv, vsi->netdev, "Invalid VF id\n"); + goto free_input; + } + vf_id = ntohl(fsp->h_ext.data[1]); + /* Find vsi id from vf id and override dest vsi */ + input->dest_vsi = pf->vf[vf_id].lan_vsi_id; + if (input->q_index >= pf->vf[vf_id].num_queue_pairs) { + netif_info(pf, drv, vsi->netdev, "Invalid queue id\n"); + goto free_input; + } + } + ret = i40e_add_del_fdir(vsi, input, true); +free_input: if (ret) kfree(input); else -- cgit v0.10.2 From ed636960c3b5a44e59eccafef94837d14e3bc10f Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Tue, 7 Apr 2015 19:45:32 -0400 Subject: i40evf: remove aq_pending The aq_pending field in the adapter structure is actually redundant with the current_op field. Remove the aq_pending field and expunge all traces of it from the official record. This simplifies the code significantly, especially in the virtual channel completion routine. Change-ID: Ib2957c8c19882bd0cecc6fcd133912c24b46a1ff Signed-off-by: Mitch Williams Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h b/drivers/net/ethernet/intel/i40evf/i40evf.h index 34c8565..1b98c25 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf.h +++ b/drivers/net/ethernet/intel/i40evf/i40evf.h @@ -225,7 +225,6 @@ struct i40evf_adapter { #define I40E_FLAG_RX_CSUM_ENABLED I40EVF_FLAG_RX_CSUM_ENABLED /* flags for admin queue service task */ u32 aq_required; - u32 aq_pending; #define I40EVF_FLAG_AQ_ENABLE_QUEUES (u32)(1) #define I40EVF_FLAG_AQ_DISABLE_QUEUES (u32)(1 << 1) #define I40EVF_FLAG_AQ_ADD_MAC_FILTER (u32)(1 << 2) diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index a31bfd1..7c53aca 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -1008,7 +1008,6 @@ void i40evf_down(struct i40evf_adapter *adapter) adapter->state != __I40EVF_RESETTING) { /* cancel any current operation */ adapter->current_op = I40E_VIRTCHNL_OP_UNKNOWN; - adapter->aq_pending = 0; /* Schedule operations to close down the HW. Don't wait * here for this to complete. The watchdog is still running * and it will take care of this. 
@@ -1335,7 +1334,6 @@ static void i40evf_watchdog_task(struct work_struct *work) */ return; } - adapter->aq_pending = 0; adapter->aq_required = 0; adapter->current_op = I40E_VIRTCHNL_OP_UNKNOWN; goto watchdog_done; @@ -1355,7 +1353,6 @@ static void i40evf_watchdog_task(struct work_struct *work) adapter->flags |= I40EVF_FLAG_RESET_PENDING; dev_err(&adapter->pdev->dev, "Hardware reset detected\n"); schedule_work(&adapter->reset_task); - adapter->aq_pending = 0; adapter->aq_required = 0; adapter->current_op = I40E_VIRTCHNL_OP_UNKNOWN; goto watchdog_done; @@ -1364,7 +1361,7 @@ static void i40evf_watchdog_task(struct work_struct *work) /* Process admin queue tasks. After init, everything gets done * here so we don't race on the admin queue. */ - if (adapter->aq_pending) { + if (adapter->current_op) { if (!i40evf_asq_done(hw)) { dev_dbg(&adapter->pdev->dev, "Admin queue timeout\n"); i40evf_send_api_ver(adapter); @@ -2249,7 +2246,6 @@ static void i40evf_shutdown(struct pci_dev *pdev) /* Prevent the watchdog from running. */ adapter->state = __I40EVF_REMOVE; adapter->aq_required = 0; - adapter->aq_pending = 0; #ifdef CONFIG_PM pci_save_state(pdev); @@ -2467,7 +2463,6 @@ static void i40evf_remove(struct pci_dev *pdev) /* Shut down all the garbage mashers on the detention level */ adapter->state = __I40EVF_REMOVE; adapter->aq_required = 0; - adapter->aq_pending = 0; i40evf_request_reset(adapter); msleep(20); /* If the FW isn't responding, kick it once, but only once. */ diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c index 4240a49..61e0905 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c @@ -250,7 +250,6 @@ void i40evf_configure_queues(struct i40evf_adapter *adapter) vqpi++; } - adapter->aq_pending |= I40EVF_FLAG_AQ_CONFIGURE_QUEUES; adapter->aq_required &= ~I40EVF_FLAG_AQ_CONFIGURE_QUEUES; i40evf_send_pf_msg(adapter, I40E_VIRTCHNL_OP_CONFIG_VSI_QUEUES, (u8 *)vqci, len); @@ -277,7 +276,6 @@ void i40evf_enable_queues(struct i40evf_adapter *adapter) vqs.vsi_id = adapter->vsi_res->vsi_id; vqs.tx_queues = (1 << adapter->num_active_queues) - 1; vqs.rx_queues = vqs.tx_queues; - adapter->aq_pending |= I40EVF_FLAG_AQ_ENABLE_QUEUES; adapter->aq_required &= ~I40EVF_FLAG_AQ_ENABLE_QUEUES; i40evf_send_pf_msg(adapter, I40E_VIRTCHNL_OP_ENABLE_QUEUES, (u8 *)&vqs, sizeof(vqs)); @@ -303,7 +301,6 @@ void i40evf_disable_queues(struct i40evf_adapter *adapter) vqs.vsi_id = adapter->vsi_res->vsi_id; vqs.tx_queues = (1 << adapter->num_active_queues) - 1; vqs.rx_queues = vqs.tx_queues; - adapter->aq_pending |= I40EVF_FLAG_AQ_DISABLE_QUEUES; adapter->aq_required &= ~I40EVF_FLAG_AQ_DISABLE_QUEUES; i40evf_send_pf_msg(adapter, I40E_VIRTCHNL_OP_DISABLE_QUEUES, (u8 *)&vqs, sizeof(vqs)); @@ -354,7 +351,6 @@ void i40evf_map_queues(struct i40evf_adapter *adapter) vimi->vecmap[v_idx].txq_map = 0; vimi->vecmap[v_idx].rxq_map = 0; - adapter->aq_pending |= I40EVF_FLAG_AQ_MAP_VECTORS; adapter->aq_required &= ~I40EVF_FLAG_AQ_MAP_VECTORS; i40evf_send_pf_msg(adapter, I40E_VIRTCHNL_OP_CONFIG_IRQ_MAP, (u8 *)vimi, len); @@ -415,7 +411,6 @@ void i40evf_add_ether_addrs(struct i40evf_adapter *adapter) f->add = false; } } - adapter->aq_pending |= I40EVF_FLAG_AQ_ADD_MAC_FILTER; adapter->aq_required &= ~I40EVF_FLAG_AQ_ADD_MAC_FILTER; i40evf_send_pf_msg(adapter, I40E_VIRTCHNL_OP_ADD_ETHER_ADDRESS, (u8 *)veal, len); @@ -476,7 +471,6 @@ void i40evf_del_ether_addrs(struct i40evf_adapter *adapter) kfree(f); } } 
- adapter->aq_pending |= I40EVF_FLAG_AQ_DEL_MAC_FILTER; adapter->aq_required &= ~I40EVF_FLAG_AQ_DEL_MAC_FILTER; i40evf_send_pf_msg(adapter, I40E_VIRTCHNL_OP_DEL_ETHER_ADDRESS, (u8 *)veal, len); @@ -537,7 +531,6 @@ void i40evf_add_vlans(struct i40evf_adapter *adapter) f->add = false; } } - adapter->aq_pending |= I40EVF_FLAG_AQ_ADD_VLAN_FILTER; adapter->aq_required &= ~I40EVF_FLAG_AQ_ADD_VLAN_FILTER; i40evf_send_pf_msg(adapter, I40E_VIRTCHNL_OP_ADD_VLAN, (u8 *)vvfl, len); kfree(vvfl); @@ -598,7 +591,6 @@ void i40evf_del_vlans(struct i40evf_adapter *adapter) kfree(f); } } - adapter->aq_pending |= I40EVF_FLAG_AQ_DEL_VLAN_FILTER; adapter->aq_required &= ~I40EVF_FLAG_AQ_DEL_VLAN_FILTER; i40evf_send_pf_msg(adapter, I40E_VIRTCHNL_OP_DEL_VLAN, (u8 *)vvfl, len); kfree(vvfl); @@ -720,9 +712,6 @@ void i40evf_virtchnl_completion(struct i40evf_adapter *adapter, __func__, v_retval, v_opcode); } switch (v_opcode) { - case I40E_VIRTCHNL_OP_VERSION: - /* no action, but also not an error */ - break; case I40E_VIRTCHNL_OP_GET_STATS: { struct i40e_eth_stats *stats = (struct i40e_eth_stats *)msg; @@ -740,39 +729,30 @@ void i40evf_virtchnl_completion(struct i40evf_adapter *adapter, adapter->current_stats = *stats; } break; - case I40E_VIRTCHNL_OP_ADD_ETHER_ADDRESS: - adapter->aq_pending &= ~(I40EVF_FLAG_AQ_ADD_MAC_FILTER); - break; - case I40E_VIRTCHNL_OP_DEL_ETHER_ADDRESS: - adapter->aq_pending &= ~(I40EVF_FLAG_AQ_DEL_MAC_FILTER); - break; - case I40E_VIRTCHNL_OP_ADD_VLAN: - adapter->aq_pending &= ~(I40EVF_FLAG_AQ_ADD_VLAN_FILTER); - break; - case I40E_VIRTCHNL_OP_DEL_VLAN: - adapter->aq_pending &= ~(I40EVF_FLAG_AQ_DEL_VLAN_FILTER); - break; case I40E_VIRTCHNL_OP_ENABLE_QUEUES: - adapter->aq_pending &= ~(I40EVF_FLAG_AQ_ENABLE_QUEUES); /* enable transmits */ i40evf_irq_enable(adapter, true); netif_tx_start_all_queues(adapter->netdev); netif_carrier_on(adapter->netdev); break; case I40E_VIRTCHNL_OP_DISABLE_QUEUES: - adapter->aq_pending &= ~(I40EVF_FLAG_AQ_DISABLE_QUEUES); i40evf_free_all_tx_resources(adapter); i40evf_free_all_rx_resources(adapter); break; - case I40E_VIRTCHNL_OP_CONFIG_VSI_QUEUES: - adapter->aq_pending &= ~(I40EVF_FLAG_AQ_CONFIGURE_QUEUES); - break; + case I40E_VIRTCHNL_OP_VERSION: + case I40E_VIRTCHNL_OP_GET_VF_RESOURCES: case I40E_VIRTCHNL_OP_CONFIG_IRQ_MAP: - adapter->aq_pending &= ~(I40EVF_FLAG_AQ_MAP_VECTORS); + /* Don't display an error if we get these out of sequence. + * If the firmware needed to get kicked, we'll get these and + * it's no problem. + */ + if (v_opcode != adapter->current_op) + return; break; default: - dev_info(&adapter->pdev->dev, "Received unexpected message %d from PF\n", - v_opcode); + if (v_opcode != adapter->current_op) + dev_warn(&adapter->pdev->dev, "Expected response %d from PF, received %d\n", + adapter->current_op, v_opcode); break; } /* switch v_opcode */ adapter->current_op = I40E_VIRTCHNL_OP_UNKNOWN; -- cgit v0.10.2 From 055b295d99adb246b2b9284e1d1e602630c119e3 Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Tue, 7 Apr 2015 19:45:33 -0400 Subject: i40e: notify VFs of link state Gratuitously notify VFs of link state when they activate their queues. In general, this is the last thing that a VF driver will do as it opens its interface, so this is a good time to notify the VF. Currently, VF devices assume link is up unless told otherwise, which means that VFs instantiated on a PF with no link will report the wrong state. This change corrects that issue. 
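In short, the PF now pushes a link event to a single VF right after servicing that VF's ENABLE_QUEUES request. A minimal sketch of the per-VF notification, assuming the struct and field names shown in the diff below; the helper name here is illustrative, the function the patch actually adds is i40e_vc_notify_vf_link_state().

/* Simplified restatement, not the applied hunk. */
static void notify_one_vf_link_state_sketch(struct i40e_vf *vf)
{
        struct i40e_pf *pf = vf->pf;
        struct i40e_link_status *ls = &pf->hw.phy.link_info;
        struct i40e_virtchnl_pf_event pfe = {};
        int abs_vf_id = vf->vf_id + pf->hw.func_caps.vf_base_id;

        pfe.event = I40E_VIRTCHNL_EVENT_LINK_CHANGE;
        pfe.severity = I40E_PF_EVENT_SEVERITY_INFO;
        if (vf->link_forced) {
                /* administratively forced link state wins */
                pfe.event_data.link_event.link_status = vf->link_up;
                pfe.event_data.link_event.link_speed =
                        vf->link_up ? I40E_LINK_SPEED_40GB : 0;
        } else {
                /* otherwise report the PF's real link state */
                pfe.event_data.link_event.link_status =
                        ls->link_info & I40E_AQ_LINK_UP;
                pfe.event_data.link_event.link_speed = ls->link_speed;
        }
        i40e_aq_send_msg_to_vf(&pf->hw, abs_vf_id, I40E_VIRTCHNL_OP_EVENT,
                               0, (u8 *)&pfe, sizeof(pfe), NULL);
}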
Change-ID: Iea53622904ecc681ac3f8938d81c30033ef9a0a6 Signed-off-by: Mitch Williams Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 9b3fc83..8df2b52 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -25,6 +25,7 @@ ******************************************************************************/ #include "i40e.h" +static void i40e_vc_notify_vf_link_state(struct i40e_vf *vf); /***********************misc routines*****************************/ @@ -1767,6 +1768,7 @@ int i40e_vc_process_vf_msg(struct i40e_pf *pf, u16 vf_id, u32 v_opcode, break; case I40E_VIRTCHNL_OP_ENABLE_QUEUES: ret = i40e_vc_enable_queues_msg(vf, msg, msglen); + i40e_vc_notify_vf_link_state(vf); break; case I40E_VIRTCHNL_OP_DISABLE_QUEUES: ret = i40e_vc_disable_queues_msg(vf, msg, msglen); @@ -1875,35 +1877,45 @@ static void i40e_vc_vf_broadcast(struct i40e_pf *pf, /** * i40e_vc_notify_link_state - * @pf: pointer to the PF structure + * @vf: pointer to the VF structure * - * send a link status message to all VFs on a given PF + * send a link status message to a single VF **/ -void i40e_vc_notify_link_state(struct i40e_pf *pf) +static void i40e_vc_notify_vf_link_state(struct i40e_vf *vf) { struct i40e_virtchnl_pf_event pfe; + struct i40e_pf *pf = vf->pf; struct i40e_hw *hw = &pf->hw; - struct i40e_vf *vf = pf->vf; struct i40e_link_status *ls = &pf->hw.phy.link_info; - int i; + int abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; pfe.event = I40E_VIRTCHNL_EVENT_LINK_CHANGE; pfe.severity = I40E_PF_EVENT_SEVERITY_INFO; - for (i = 0; i < pf->num_alloc_vfs; i++, vf++) { - int abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; - if (vf->link_forced) { - pfe.event_data.link_event.link_status = vf->link_up; - pfe.event_data.link_event.link_speed = - (vf->link_up ? I40E_LINK_SPEED_40GB : 0); - } else { - pfe.event_data.link_event.link_status = - ls->link_info & I40E_AQ_LINK_UP; - pfe.event_data.link_event.link_speed = ls->link_speed; - } - i40e_aq_send_msg_to_vf(hw, abs_vf_id, I40E_VIRTCHNL_OP_EVENT, - 0, (u8 *)&pfe, sizeof(pfe), - NULL); + if (vf->link_forced) { + pfe.event_data.link_event.link_status = vf->link_up; + pfe.event_data.link_event.link_speed = + (vf->link_up ? I40E_LINK_SPEED_40GB : 0); + } else { + pfe.event_data.link_event.link_status = + ls->link_info & I40E_AQ_LINK_UP; + pfe.event_data.link_event.link_speed = ls->link_speed; } + i40e_aq_send_msg_to_vf(hw, abs_vf_id, I40E_VIRTCHNL_OP_EVENT, + 0, (u8 *)&pfe, sizeof(pfe), NULL); +} + +/** + * i40e_vc_notify_link_state + * @pf: pointer to the PF structure + * + * send a link status message to all VFs on a given PF + **/ +void i40e_vc_notify_link_state(struct i40e_pf *pf) +{ + int i; + + for (i = 0; i < pf->num_alloc_vfs; i++) + i40e_vc_notify_vf_link_state(&pf->vf[i]); } /** -- cgit v0.10.2 From 532b045588f5a0f65df90e5d3de1afe89926449e Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Tue, 7 Apr 2015 19:45:34 -0400 Subject: i40e: move VF notification routines up Move the VF notification functions to the top of the file. This eliminates an unnecessary declaration. 
Change-ID: I036171f14180ee9f0ce4e0a21334d6a217d06c94 Signed-off-by: Mitch Williams Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 8df2b52..98e0921 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -25,8 +25,130 @@ ******************************************************************************/ #include "i40e.h" -static void i40e_vc_notify_vf_link_state(struct i40e_vf *vf); +/*********************notification routines***********************/ + +/** + * i40e_vc_vf_broadcast + * @pf: pointer to the PF structure + * @opcode: operation code + * @retval: return value + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * send a message to all VFs on a given PF + **/ +static void i40e_vc_vf_broadcast(struct i40e_pf *pf, + enum i40e_virtchnl_ops v_opcode, + i40e_status v_retval, u8 *msg, + u16 msglen) +{ + struct i40e_hw *hw = &pf->hw; + struct i40e_vf *vf = pf->vf; + int i; + + for (i = 0; i < pf->num_alloc_vfs; i++, vf++) { + int abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; + /* Not all vfs are enabled so skip the ones that are not */ + if (!test_bit(I40E_VF_STAT_INIT, &vf->vf_states) && + !test_bit(I40E_VF_STAT_ACTIVE, &vf->vf_states)) + continue; + + /* Ignore return value on purpose - a given VF may fail, but + * we need to keep going and send to all of them + */ + i40e_aq_send_msg_to_vf(hw, abs_vf_id, v_opcode, v_retval, + msg, msglen, NULL); + } +} + +/** + * i40e_vc_notify_link_state + * @vf: pointer to the VF structure + * + * send a link status message to a single VF + **/ +static void i40e_vc_notify_vf_link_state(struct i40e_vf *vf) +{ + struct i40e_virtchnl_pf_event pfe; + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + struct i40e_link_status *ls = &pf->hw.phy.link_info; + int abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; + + pfe.event = I40E_VIRTCHNL_EVENT_LINK_CHANGE; + pfe.severity = I40E_PF_EVENT_SEVERITY_INFO; + if (vf->link_forced) { + pfe.event_data.link_event.link_status = vf->link_up; + pfe.event_data.link_event.link_speed = + (vf->link_up ? 
I40E_LINK_SPEED_40GB : 0); + } else { + pfe.event_data.link_event.link_status = + ls->link_info & I40E_AQ_LINK_UP; + pfe.event_data.link_event.link_speed = ls->link_speed; + } + i40e_aq_send_msg_to_vf(hw, abs_vf_id, I40E_VIRTCHNL_OP_EVENT, + 0, (u8 *)&pfe, sizeof(pfe), NULL); +} + +/** + * i40e_vc_notify_link_state + * @pf: pointer to the PF structure + * + * send a link status message to all VFs on a given PF + **/ +void i40e_vc_notify_link_state(struct i40e_pf *pf) +{ + int i; + + for (i = 0; i < pf->num_alloc_vfs; i++) + i40e_vc_notify_vf_link_state(&pf->vf[i]); +} + +/** + * i40e_vc_notify_reset + * @pf: pointer to the PF structure + * + * indicate a pending reset to all VFs on a given PF + **/ +void i40e_vc_notify_reset(struct i40e_pf *pf) +{ + struct i40e_virtchnl_pf_event pfe; + + pfe.event = I40E_VIRTCHNL_EVENT_RESET_IMPENDING; + pfe.severity = I40E_PF_EVENT_SEVERITY_CERTAIN_DOOM; + i40e_vc_vf_broadcast(pf, I40E_VIRTCHNL_OP_EVENT, 0, + (u8 *)&pfe, sizeof(struct i40e_virtchnl_pf_event)); +} + +/** + * i40e_vc_notify_vf_reset + * @vf: pointer to the VF structure + * + * indicate a pending reset to the given VF + **/ +void i40e_vc_notify_vf_reset(struct i40e_vf *vf) +{ + struct i40e_virtchnl_pf_event pfe; + int abs_vf_id; + + /* validate the request */ + if (!vf || vf->vf_id >= vf->pf->num_alloc_vfs) + return; + + /* verify if the VF is in either init or active before proceeding */ + if (!test_bit(I40E_VF_STAT_INIT, &vf->vf_states) && + !test_bit(I40E_VF_STAT_ACTIVE, &vf->vf_states)) + return; + + abs_vf_id = vf->vf_id + vf->pf->hw.func_caps.vf_base_id; + + pfe.event = I40E_VIRTCHNL_EVENT_RESET_IMPENDING; + pfe.severity = I40E_PF_EVENT_SEVERITY_CERTAIN_DOOM; + i40e_aq_send_msg_to_vf(&vf->pf->hw, abs_vf_id, I40E_VIRTCHNL_OP_EVENT, + 0, (u8 *)&pfe, + sizeof(struct i40e_virtchnl_pf_event), NULL); +} /***********************misc routines*****************************/ /** @@ -1842,128 +1964,6 @@ int i40e_vc_process_vflr_event(struct i40e_pf *pf) } /** - * i40e_vc_vf_broadcast - * @pf: pointer to the PF structure - * @opcode: operation code - * @retval: return value - * @msg: pointer to the msg buffer - * @msglen: msg length - * - * send a message to all VFs on a given PF - **/ -static void i40e_vc_vf_broadcast(struct i40e_pf *pf, - enum i40e_virtchnl_ops v_opcode, - i40e_status v_retval, u8 *msg, - u16 msglen) -{ - struct i40e_hw *hw = &pf->hw; - struct i40e_vf *vf = pf->vf; - int i; - - for (i = 0; i < pf->num_alloc_vfs; i++, vf++) { - int abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; - /* Not all VFs are enabled so skip the ones that are not */ - if (!test_bit(I40E_VF_STAT_INIT, &vf->vf_states) && - !test_bit(I40E_VF_STAT_ACTIVE, &vf->vf_states)) - continue; - - /* Ignore return value on purpose - a given VF may fail, but - * we need to keep going and send to all of them - */ - i40e_aq_send_msg_to_vf(hw, abs_vf_id, v_opcode, v_retval, - msg, msglen, NULL); - } -} - -/** - * i40e_vc_notify_link_state - * @vf: pointer to the VF structure - * - * send a link status message to a single VF - **/ -static void i40e_vc_notify_vf_link_state(struct i40e_vf *vf) -{ - struct i40e_virtchnl_pf_event pfe; - struct i40e_pf *pf = vf->pf; - struct i40e_hw *hw = &pf->hw; - struct i40e_link_status *ls = &pf->hw.phy.link_info; - int abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; - - pfe.event = I40E_VIRTCHNL_EVENT_LINK_CHANGE; - pfe.severity = I40E_PF_EVENT_SEVERITY_INFO; - if (vf->link_forced) { - pfe.event_data.link_event.link_status = vf->link_up; - pfe.event_data.link_event.link_speed = - 
(vf->link_up ? I40E_LINK_SPEED_40GB : 0); - } else { - pfe.event_data.link_event.link_status = - ls->link_info & I40E_AQ_LINK_UP; - pfe.event_data.link_event.link_speed = ls->link_speed; - } - i40e_aq_send_msg_to_vf(hw, abs_vf_id, I40E_VIRTCHNL_OP_EVENT, - 0, (u8 *)&pfe, sizeof(pfe), NULL); -} - -/** - * i40e_vc_notify_link_state - * @pf: pointer to the PF structure - * - * send a link status message to all VFs on a given PF - **/ -void i40e_vc_notify_link_state(struct i40e_pf *pf) -{ - int i; - - for (i = 0; i < pf->num_alloc_vfs; i++) - i40e_vc_notify_vf_link_state(&pf->vf[i]); -} - -/** - * i40e_vc_notify_reset - * @pf: pointer to the PF structure - * - * indicate a pending reset to all VFs on a given PF - **/ -void i40e_vc_notify_reset(struct i40e_pf *pf) -{ - struct i40e_virtchnl_pf_event pfe; - - pfe.event = I40E_VIRTCHNL_EVENT_RESET_IMPENDING; - pfe.severity = I40E_PF_EVENT_SEVERITY_CERTAIN_DOOM; - i40e_vc_vf_broadcast(pf, I40E_VIRTCHNL_OP_EVENT, I40E_SUCCESS, - (u8 *)&pfe, sizeof(struct i40e_virtchnl_pf_event)); -} - -/** - * i40e_vc_notify_vf_reset - * @vf: pointer to the VF structure - * - * indicate a pending reset to the given VF - **/ -void i40e_vc_notify_vf_reset(struct i40e_vf *vf) -{ - struct i40e_virtchnl_pf_event pfe; - int abs_vf_id; - - /* validate the request */ - if (!vf || vf->vf_id >= vf->pf->num_alloc_vfs) - return; - - /* verify if the VF is in either init or active before proceeding */ - if (!test_bit(I40E_VF_STAT_INIT, &vf->vf_states) && - !test_bit(I40E_VF_STAT_ACTIVE, &vf->vf_states)) - return; - - abs_vf_id = vf->vf_id + vf->pf->hw.func_caps.vf_base_id; - - pfe.event = I40E_VIRTCHNL_EVENT_RESET_IMPENDING; - pfe.severity = I40E_PF_EVENT_SEVERITY_CERTAIN_DOOM; - i40e_aq_send_msg_to_vf(&vf->pf->hw, abs_vf_id, I40E_VIRTCHNL_OP_EVENT, - I40E_SUCCESS, (u8 *)&pfe, - sizeof(struct i40e_virtchnl_pf_event), NULL); -} - -/** * i40e_ndo_set_vf_mac * @netdev: network interface device structure * @vf_id: VF identifier -- cgit v0.10.2 From 57175ac1987bf770122f6fe5056fe73269add00f Mon Sep 17 00:00:00 2001 From: Anjali Singhai Jain Date: Tue, 7 Apr 2015 19:45:35 -0400 Subject: i40e: For VF reset (VFR and VFLR) add some more delay With a HW issue that was recently discovered, after a VFLR HW might be indicating to us a reset completion little too early. So wait another 10 msec for cache to be cleaned up. Change-ID: I6a24dcf5dd7ffcd6500246e717411ef58532d1e9 Signed-off-by: Anjali Singhai Jain Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 98e0921..78d1c4f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -812,6 +812,9 @@ void i40e_reset_vf(struct i40e_vf *vf, bool flr) } } + if (flr) + usleep_range(10000, 20000); + if (!rsd) dev_err(&pf->pdev->dev, "VF reset check timeout on VF %d\n", vf->vf_id); -- cgit v0.10.2 From f18ae1009276b604ff5776b41af981f25db52240 Mon Sep 17 00:00:00 2001 From: Vasu Dev Date: Tue, 7 Apr 2015 19:45:36 -0400 Subject: i40e: print FCoE capability reported by the device function This is to allow quick check for FCoE capability is enabled or not in device function before any SW overrides. 
Change-ID: I5f78ba798d566f143161273156916c6f4074496e Signed-off-by: Vasu Dev Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index d596f66..44dc4b1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -2546,6 +2546,9 @@ static void i40e_parse_discover_capabilities(struct i40e_hw *hw, void *buff, } } + if (p->fcoe) + i40e_debug(hw, I40E_DEBUG_ALL, "device is FCoE capable\n"); + /* Software override ensuring FCoE is disabled if npar or mfp * mode because it is not supported in these modes. */ -- cgit v0.10.2 From 3169c323cf9dc3b00b0a6a3d434968832daa4147 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Tue, 7 Apr 2015 19:45:37 -0400 Subject: i40e: enable user dump of internal hardware state This is a feature to enable better debugging of user reported issues by allowing a bash script to acquire information about the internal hardware state. The data output to the kernel log is collected by the script and can then be sent to Intel. This is a critical debugging feature for helping us interpret and reproduce complex customer setups. Change-ID: Ie8b3ab09086d6870a709015f51ada05af10b41bb Signed-off-by: Jesse Brandeburg Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 44dc4b1..ce522d7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -3506,6 +3506,63 @@ void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status) } /** + * i40e_aq_debug_dump + * @hw: pointer to the hardware structure + * @cluster_id: specific cluster to dump + * @table_id: table id within cluster + * @start_index: index of line in the block to read + * @buff_size: dump buffer size + * @buff: dump buffer + * @ret_buff_size: actual buffer size returned + * @ret_next_table: next block to read + * @ret_next_index: next index to read + * + * Dump internal FW/HW data for debug purposes. 
+ * + **/ +i40e_status i40e_aq_debug_dump(struct i40e_hw *hw, u8 cluster_id, + u8 table_id, u32 start_index, u16 buff_size, + void *buff, u16 *ret_buff_size, + u8 *ret_next_table, u32 *ret_next_index, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_debug_dump_internals *cmd = + (struct i40e_aqc_debug_dump_internals *)&desc.params.raw; + struct i40e_aqc_debug_dump_internals *resp = + (struct i40e_aqc_debug_dump_internals *)&desc.params.raw; + i40e_status status; + + if (buff_size == 0 || !buff) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_debug_dump_internals); + /* Indirect Command */ + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + + cmd->cluster_id = cluster_id; + cmd->table_id = table_id; + cmd->idx = cpu_to_le32(start_index); + + desc.datalen = cpu_to_le16(buff_size); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + if (!status) { + if (ret_buff_size) + *ret_buff_size = le16_to_cpu(desc.datalen); + if (ret_next_table) + *ret_next_table = resp->table_id; + if (ret_next_index) + *ret_next_index = le32_to_cpu(resp->idx); + } + + return status; +} + +/** * i40e_read_bw_from_alt_ram * @hw: pointer to the hardware structure * @max_bw: pointer for max_bw read diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index daa8826..34170ea 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -1388,6 +1388,50 @@ static ssize_t i40e_dbg_command_write(struct file *filp, r_cfg->app[i].selector, r_cfg->app[i].protocolid); } + } else if (strncmp(&cmd_buf[5], "debug fwdata", 12) == 0) { + int cluster_id, table_id; + int index, ret; + u16 buff_len = 4096; + u32 next_index; + u8 next_table; + u8 *buff; + u16 rlen; + + cnt = sscanf(&cmd_buf[18], "%i %i %i", + &cluster_id, &table_id, &index); + if (cnt != 3) { + dev_info(&pf->pdev->dev, + "dump debug fwdata \n"); + goto command_write_done; + } + + dev_info(&pf->pdev->dev, + "AQ debug dump fwdata params %x %x %x %x\n", + cluster_id, table_id, index, buff_len); + buff = kzalloc(buff_len, GFP_KERNEL); + if (!buff) + goto command_write_done; + + ret = i40e_aq_debug_dump(&pf->hw, cluster_id, table_id, + index, buff_len, buff, &rlen, + &next_table, &next_index, + NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "debug dump fwdata AQ Failed %d 0x%x\n", + ret, pf->hw.aq.asq_last_status); + kfree(buff); + buff = NULL; + goto command_write_done; + } + dev_info(&pf->pdev->dev, + "AQ debug dump fwdata rlen=0x%x next_table=0x%x next_index=0x%x\n", + rlen, next_table, next_index); + print_hex_dump(KERN_INFO, "AQ buffer WB: ", + DUMP_PREFIX_OFFSET, 16, 1, + buff, rlen, true); + kfree(buff); + buff = NULL; } else { dev_info(&pf->pdev->dev, "dump desc tx [], dump desc rx [],\n"); @@ -1903,6 +1947,7 @@ static ssize_t i40e_dbg_command_write(struct file *filp, dev_info(&pf->pdev->dev, " dump desc rx []\n"); dev_info(&pf->pdev->dev, " dump desc aq\n"); dev_info(&pf->pdev->dev, " dump reset stats\n"); + dev_info(&pf->pdev->dev, " dump debug fwdata \n"); dev_info(&pf->pdev->dev, " msg_enable [level]\n"); dev_info(&pf->pdev->dev, " read \n"); dev_info(&pf->pdev->dev, " write \n"); diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index fea0d37..7b34f1e 100644 --- 
a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -303,4 +303,9 @@ i40e_status i40e_aq_add_rem_control_packet_filter(struct i40e_hw *hw, u16 vsi_seid, u16 queue, bool is_add, struct i40e_control_filter_stats *stats, struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_debug_dump(struct i40e_hw *hw, u8 cluster_id, + u8 table_id, u32 start_index, u16 buff_size, + void *buff, u16 *ret_buff_size, + u8 *ret_next_table, u32 *ret_next_index, + struct i40e_asq_cmd_details *cmd_details); #endif /* _I40E_PROTOTYPE_H_ */ -- cgit v0.10.2 From 73b23402bb232ce655cf45aaca4f7feee317deb1 Mon Sep 17 00:00:00 2001 From: Kevin Scott Date: Tue, 7 Apr 2015 19:45:38 -0400 Subject: i40e/i40evf: Save WR_CSR_PROT field from DEV/FUNC capabilities Store the 8 bytes of the WR_CSR_PROT field returned as part of the get device/function capabilities AQ command. Change-ID: Ifcaeea2ff29885fa769e4f384c7db88a25e8afd0 Signed-off-by: Kevin Scott Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index ce522d7..0bae22d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -2397,6 +2397,7 @@ i40e_aq_erase_nvm_exit: #define I40E_DEV_FUNC_CAP_LED 0x61 #define I40E_DEV_FUNC_CAP_SDP 0x62 #define I40E_DEV_FUNC_CAP_MDIO 0x63 +#define I40E_DEV_FUNC_CAP_WR_CSR_PROT 0x64 /** * i40e_parse_discover_capabilities @@ -2541,6 +2542,10 @@ static void i40e_parse_discover_capabilities(struct i40e_hw *hw, void *buff, p->fd_filters_guaranteed = number; p->fd_filters_best_effort = logical_id; break; + case I40E_DEV_FUNC_CAP_WR_CSR_PROT: + p->wr_csr_prot = (u64)number; + p->wr_csr_prot |= (u64)logical_id << 32; + break; default: break; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h index 67c7bc9e9..568e855 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_type.h +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h @@ -242,6 +242,7 @@ struct i40e_hw_capabilities { u8 rx_buf_chain_len; u32 enabled_tcmap; u32 maxtc; + u64 wr_csr_prot; }; struct i40e_mac_info { diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h b/drivers/net/ethernet/intel/i40evf/i40e_type.h index 9c79cb6..ec9d83a 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_type.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h @@ -242,6 +242,7 @@ struct i40e_hw_capabilities { u8 rx_buf_chain_len; u32 enabled_tcmap; u32 maxtc; + u64 wr_csr_prot; }; struct i40e_mac_info { -- cgit v0.10.2 From c1147280d2a336ac6723a8dc8b74f809fd6db02c Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Tue, 7 Apr 2015 19:45:39 -0400 Subject: i40e: handle possible memory allocation failure The init_interrupt_scheme function had a possible failure path to allocate memory that was found by smatch. This adds the correct handling to the function to abort probe if the memory allocation fails. 
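A minimal sketch of the pattern applied below, assuming the names from the diff: the interrupt-scheme setup returns an error instead of void, the kzalloc() failure is reported, and i40e_probe() unwinds through its existing err_switch_setup label. The hard-coded vector count is an illustrative assumption only.

/* Sketch, not the applied hunk. */
static int init_interrupt_scheme_sketch(struct i40e_pf *pf)
{
        int vectors = 1;        /* simplified; the real code derives this */
        ssize_t size = sizeof(struct i40e_lump_tracking) +
                       sizeof(u16) * vectors;

        pf->irq_pile = kzalloc(size, GFP_KERNEL);
        if (!pf->irq_pile) {
                dev_err(&pf->pdev->dev, "error allocating irq_pile memory\n");
                return -ENOMEM;         /* report failure instead of dereferencing NULL */
        }
        pf->irq_pile->num_entries = vectors;
        pf->irq_pile->search_hint = 0;
        return 0;
}

/* i40e_probe() then checks the result:
 *      err = i40e_init_interrupt_scheme(pf);
 *      if (err)
 *              goto err_switch_setup;
 */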
Change-ID: I2bf1d826a244209619da4c452d0d58b3eb5e26a3 Signed-off-by: Jesse Brandeburg Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 63de3f4..b297294 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7301,7 +7301,7 @@ err_out: * i40e_init_interrupt_scheme - Determine proper interrupt scheme * @pf: board private structure to initialize **/ -static void i40e_init_interrupt_scheme(struct i40e_pf *pf) +static int i40e_init_interrupt_scheme(struct i40e_pf *pf) { int vectors = 0; ssize_t size; @@ -7343,11 +7343,17 @@ static void i40e_init_interrupt_scheme(struct i40e_pf *pf) /* set up vector assignment tracking */ size = sizeof(struct i40e_lump_tracking) + (sizeof(u16) * vectors); pf->irq_pile = kzalloc(size, GFP_KERNEL); + if (!pf->irq_pile) { + dev_err(&pf->pdev->dev, "error allocating irq_pile memory\n"); + return -ENOMEM; + } pf->irq_pile->num_entries = vectors; pf->irq_pile->search_hint = 0; - /* track first vector for misc interrupts */ + /* track first vector for misc interrupts, ignore return */ (void)i40e_get_lump(pf, pf->irq_pile, 1, I40E_PILE_VALID_BIT - 1); + + return 0; } /** @@ -9827,7 +9833,9 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* set up the main switch operations */ i40e_determine_queue_usage(pf); - i40e_init_interrupt_scheme(pf); + err = i40e_init_interrupt_scheme(pf); + if (err) + goto err_switch_setup; /* The number of VSIs reported by the FW is the minimum guaranteed * to us; HW supports far more and we share the remaining pool with -- cgit v0.10.2 From 0f575bf3382d5bdb58ba571b1fd573b994bb11ba Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Tue, 7 Apr 2015 19:45:40 -0400 Subject: i40e: get rid of unused locals These changes just remove unused variables and any code that uses them as the results of storing into these variables doesn't have any side effects that I can see or provide any benefit. Change-ID: I8a5ec7132ff1443d23aae729cef94beaaaf19e3a Signed-off-by: Jesse Brandeburg Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.c b/drivers/net/ethernet/intel/i40e/i40e_dcb.c index 6e14667..2547aa2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_dcb.c +++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.c @@ -419,7 +419,7 @@ static void i40e_cee_to_dcb_v1_config( { u16 status, tlv_status = le16_to_cpu(cee_cfg->tlv_status); u16 app_prio = le16_to_cpu(cee_cfg->oper_app_prio); - u8 i, tc, err, sync, oper; + u8 i, tc, err; /* CEE PG data to ETS config */ dcbcfg->etscfg.maxtcs = cee_cfg->oper_num_tc; @@ -456,9 +456,7 @@ static void i40e_cee_to_dcb_v1_config( status = (tlv_status & I40E_AQC_CEE_APP_STATUS_MASK) >> I40E_AQC_CEE_APP_STATUS_SHIFT; err = (status & I40E_TLV_STATUS_ERR) ? 1 : 0; - sync = (status & I40E_TLV_STATUS_SYNC) ? 1 : 0; - oper = (status & I40E_TLV_STATUS_OPER) ? 
1 : 0; - /* Add APPs if Error is False and Oper/Sync is True */ + /* Add APPs if Error is False */ if (!err) { /* CEE operating configuration supports FCoE/iSCSI/FIP only */ dcbcfg->numapps = I40E_CEE_OPER_MAX_APPS; diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c index e49acd2..554e49d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c +++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c @@ -821,13 +821,12 @@ static enum i40e_nvmupd_cmd i40e_nvmupd_validate_command(struct i40e_hw *hw, int *errno) { enum i40e_nvmupd_cmd upd_cmd; - u8 transaction, module; + u8 transaction; /* anything that doesn't match a recognized case is an error */ upd_cmd = I40E_NVMUPD_INVALID; transaction = i40e_nvmupd_get_transaction(cmd->config); - module = i40e_nvmupd_get_module(cmd->config); /* limits on data size */ if ((cmd->data_size < 1) || -- cgit v0.10.2 From edf5cffd58d8e1a4b1bd5de43bc9f0553f9910fc Mon Sep 17 00:00:00 2001 From: Greg Rose Date: Tue, 7 Apr 2015 19:45:41 -0400 Subject: i40e: Use new 40G speeds The kernel has added SPEED_40000 for ethtool. Go ahead and use the new #define. Change-ID: Ic7e16e5c9e91085afe539f11ee1b7668adc4d0ef Signed-off-by: Greg Rose Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 7d60587..4cbaaeb 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -356,8 +356,7 @@ static void i40e_get_settings_link_up(struct i40e_hw *hw, /* Set speed and duplex */ switch (link_speed) { case I40E_LINK_SPEED_40GB: - /* need a SPEED_40000 in ethtool.h */ - ethtool_cmd_speed_set(ecmd, 40000); + ethtool_cmd_speed_set(ecmd, SPEED_40000); break; case I40E_LINK_SPEED_20GB: ethtool_cmd_speed_set(ecmd, SPEED_20000); -- cgit v0.10.2 From 2aea6dcb7864e174dada15728c43c5330637d424 Mon Sep 17 00:00:00 2001 From: Catherine Sullivan Date: Tue, 7 Apr 2015 19:45:42 -0400 Subject: i40e: Bump version to 1.3.2 Bump. Change-ID: Id14baae72332d0f1a9bc5d351ea1a85cb0295ec3 Signed-off-by: Catherine Sullivan Tested-by: Jim Young Signed-off-by: Jeff Kirsher diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index b297294..24481cd 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -39,7 +39,7 @@ static const char i40e_driver_string[] = #define DRV_VERSION_MAJOR 1 #define DRV_VERSION_MINOR 3 -#define DRV_VERSION_BUILD 1 +#define DRV_VERSION_BUILD 2 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \ __stringify(DRV_VERSION_MINOR) "." \ __stringify(DRV_VERSION_BUILD) DRV_KERN -- cgit v0.10.2 From 48b63776c8255d5b0a84c5caafab5560d9cb7f3b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:42:42 +0200 Subject: net: hip04: Make tx coalesce timer actually work The code sets the expiry value of the timer to a relative value and starts it with hrtimer_start_expires. That's fine, but that only works once. The timer is started in relative mode, so the expiry value gets overwritten with the absolut expiry time (now + expiry). So once the timer expired, a new call to hrtimer_start_expires results in an immidiately expired timer, because the expiry value is already in the past. Use the proper mechanisms to (re)start the timer in the intended way. Signed-off-by: Thomas Gleixner Cc: "David S. 
Miller" Cc: dingtianhong Cc: Arnd Bergmann Cc: Zhangfei Gao Cc: Dan Carpenter Cc: netdev@vger.kernel.org Acked-by: Ding Tianhong Acked-by: Arnd Bergmann Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c index b72d238..3b39fdd 100644 --- a/drivers/net/ethernet/hisilicon/hip04_eth.c +++ b/drivers/net/ethernet/hisilicon/hip04_eth.c @@ -413,6 +413,15 @@ out: return count; } +static void hip04_start_tx_timer(struct hip04_priv *priv) +{ + unsigned long ns = priv->tx_coalesce_usecs * NSEC_PER_USEC / 2; + + /* allow timer to fire after half the time at the earliest */ + hrtimer_start_range_ns(&priv->tx_coalesce_timer, ns_to_ktime(ns), + ns, HRTIMER_MODE_REL); +} + static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev) { struct hip04_priv *priv = netdev_priv(ndev); @@ -466,8 +475,7 @@ static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev) } } else if (!hrtimer_is_queued(&priv->tx_coalesce_timer)) { /* cleanup not pending yet, start a new timer */ - hrtimer_start_expires(&priv->tx_coalesce_timer, - HRTIMER_MODE_REL); + hip04_start_tx_timer(priv); } return NETDEV_TX_OK; @@ -549,7 +557,7 @@ done: /* clean up tx descriptors and start a new timer if necessary */ tx_remaining = hip04_tx_reclaim(ndev, false); if (rx < budget && tx_remaining) - hrtimer_start_expires(&priv->tx_coalesce_timer, HRTIMER_MODE_REL); + hip04_start_tx_timer(priv); return rx; } @@ -809,7 +817,6 @@ static int hip04_mac_probe(struct platform_device *pdev) struct hip04_priv *priv; struct resource *res; unsigned int irq; - ktime_t txtime; int ret; ndev = alloc_etherdev(sizeof(struct hip04_priv)); @@ -846,9 +853,6 @@ static int hip04_mac_probe(struct platform_device *pdev) */ priv->tx_coalesce_frames = TX_DESC_NUM * 3 / 4; priv->tx_coalesce_usecs = 200; - /* allow timer to fire after half the time at the earliest */ - txtime = ktime_set(0, priv->tx_coalesce_usecs * NSEC_PER_USEC / 2); - hrtimer_set_expires_range(&priv->tx_coalesce_timer, txtime, txtime); priv->tx_coalesce_timer.function = tx_done; priv->map = syscon_node_to_regmap(arg.np); -- cgit v0.10.2 From 074975d0374333f656c48487aa046a21a9b9d7a1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Apr 2015 18:45:00 -0700 Subject: bnx2x: Fix busy_poll vs netpoll Commit 9a2620c877454 ("bnx2x: prevent WARN during driver unload") switched the napi/busy_lock locking mechanism from spin_lock() into spin_lock_bh(), breaking inter-operability with netconsole, as netpoll disables interrupts prior to calling our napi mechanism. This switches the driver into using atomic assignments instead of the spinlock mechanisms previously employed. Based on initial patch from Yuval Mintz & Ariel Elior I basically added softirq starvation avoidance, and mixture of atomic operations, plain writes and barriers. Note this slightly reduces the overhead for this driver when no busy_poll sockets are in use. Fixes: 9a2620c877454 ("bnx2x: prevent WARN during driver unload") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h index 4085c4b..355d5fe 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h @@ -531,20 +531,8 @@ struct bnx2x_fastpath { struct napi_struct napi; #ifdef CONFIG_NET_RX_BUSY_POLL - unsigned int state; -#define BNX2X_FP_STATE_IDLE 0 -#define BNX2X_FP_STATE_NAPI (1 << 0) /* NAPI owns this FP */ -#define BNX2X_FP_STATE_POLL (1 << 1) /* poll owns this FP */ -#define BNX2X_FP_STATE_DISABLED (1 << 2) -#define BNX2X_FP_STATE_NAPI_YIELD (1 << 3) /* NAPI yielded this FP */ -#define BNX2X_FP_STATE_POLL_YIELD (1 << 4) /* poll yielded this FP */ -#define BNX2X_FP_OWNED (BNX2X_FP_STATE_NAPI | BNX2X_FP_STATE_POLL) -#define BNX2X_FP_YIELD (BNX2X_FP_STATE_NAPI_YIELD | BNX2X_FP_STATE_POLL_YIELD) -#define BNX2X_FP_LOCKED (BNX2X_FP_OWNED | BNX2X_FP_STATE_DISABLED) -#define BNX2X_FP_USER_PEND (BNX2X_FP_STATE_POLL | BNX2X_FP_STATE_POLL_YIELD) - /* protect state */ - spinlock_t lock; -#endif /* CONFIG_NET_RX_BUSY_POLL */ + unsigned long busy_poll_state; +#endif union host_hc_status_block status_blk; /* chip independent shortcuts into sb structure */ @@ -619,104 +607,83 @@ struct bnx2x_fastpath { #define bnx2x_fp_qstats(bp, fp) (&((bp)->fp_stats[(fp)->index].eth_q_stats)) #ifdef CONFIG_NET_RX_BUSY_POLL -static inline void bnx2x_fp_init_lock(struct bnx2x_fastpath *fp) + +enum bnx2x_fp_state { + BNX2X_STATE_FP_NAPI = BIT(0), /* NAPI handler owns the queue */ + + BNX2X_STATE_FP_NAPI_REQ_BIT = 1, /* NAPI would like to own the queue */ + BNX2X_STATE_FP_NAPI_REQ = BIT(1), + + BNX2X_STATE_FP_POLL_BIT = 2, + BNX2X_STATE_FP_POLL = BIT(2), /* busy_poll owns the queue */ + + BNX2X_STATE_FP_DISABLE_BIT = 3, /* queue is dismantled */ +}; + +static inline void bnx2x_fp_busy_poll_init(struct bnx2x_fastpath *fp) { - spin_lock_init(&fp->lock); - fp->state = BNX2X_FP_STATE_IDLE; + WRITE_ONCE(fp->busy_poll_state, 0); } /* called from the device poll routine to get ownership of a FP */ static inline bool bnx2x_fp_lock_napi(struct bnx2x_fastpath *fp) { - bool rc = true; - - spin_lock_bh(&fp->lock); - if (fp->state & BNX2X_FP_LOCKED) { - WARN_ON(fp->state & BNX2X_FP_STATE_NAPI); - fp->state |= BNX2X_FP_STATE_NAPI_YIELD; - rc = false; - } else { - /* we don't care if someone yielded */ - fp->state = BNX2X_FP_STATE_NAPI; + unsigned long prev, old = READ_ONCE(fp->busy_poll_state); + + while (1) { + switch (old) { + case BNX2X_STATE_FP_POLL: + /* make sure bnx2x_fp_lock_poll() wont starve us */ + set_bit(BNX2X_STATE_FP_NAPI_REQ_BIT, + &fp->busy_poll_state); + /* fallthrough */ + case BNX2X_STATE_FP_POLL | BNX2X_STATE_FP_NAPI_REQ: + return false; + default: + break; + } + prev = cmpxchg(&fp->busy_poll_state, old, BNX2X_STATE_FP_NAPI); + if (unlikely(prev != old)) { + old = prev; + continue; + } + return true; } - spin_unlock_bh(&fp->lock); - return rc; } -/* returns true is someone tried to get the FP while napi had it */ -static inline bool bnx2x_fp_unlock_napi(struct bnx2x_fastpath *fp) +static inline void bnx2x_fp_unlock_napi(struct bnx2x_fastpath *fp) { - bool rc = false; - - spin_lock_bh(&fp->lock); - WARN_ON(fp->state & - (BNX2X_FP_STATE_POLL | BNX2X_FP_STATE_NAPI_YIELD)); - - if (fp->state & BNX2X_FP_STATE_POLL_YIELD) - rc = true; - - /* state ==> idle, unless currently disabled */ - fp->state &= BNX2X_FP_STATE_DISABLED; - spin_unlock_bh(&fp->lock); - return rc; + smp_wmb(); + fp->busy_poll_state = 0; } /* called from bnx2x_low_latency_poll() */ static inline bool 
bnx2x_fp_lock_poll(struct bnx2x_fastpath *fp) { - bool rc = true; - - spin_lock_bh(&fp->lock); - if ((fp->state & BNX2X_FP_LOCKED)) { - fp->state |= BNX2X_FP_STATE_POLL_YIELD; - rc = false; - } else { - /* preserve yield marks */ - fp->state |= BNX2X_FP_STATE_POLL; - } - spin_unlock_bh(&fp->lock); - return rc; + return cmpxchg(&fp->busy_poll_state, 0, BNX2X_STATE_FP_POLL) == 0; } -/* returns true if someone tried to get the FP while it was locked */ -static inline bool bnx2x_fp_unlock_poll(struct bnx2x_fastpath *fp) +static inline void bnx2x_fp_unlock_poll(struct bnx2x_fastpath *fp) { - bool rc = false; - - spin_lock_bh(&fp->lock); - WARN_ON(fp->state & BNX2X_FP_STATE_NAPI); - - if (fp->state & BNX2X_FP_STATE_POLL_YIELD) - rc = true; - - /* state ==> idle, unless currently disabled */ - fp->state &= BNX2X_FP_STATE_DISABLED; - spin_unlock_bh(&fp->lock); - return rc; + smp_mb__before_atomic(); + clear_bit(BNX2X_STATE_FP_POLL_BIT, &fp->busy_poll_state); } -/* true if a socket is polling, even if it did not get the lock */ +/* true if a socket is polling */ static inline bool bnx2x_fp_ll_polling(struct bnx2x_fastpath *fp) { - WARN_ON(!(fp->state & BNX2X_FP_OWNED)); - return fp->state & BNX2X_FP_USER_PEND; + return READ_ONCE(fp->busy_poll_state) & BNX2X_STATE_FP_POLL; } /* false if fp is currently owned */ static inline bool bnx2x_fp_ll_disable(struct bnx2x_fastpath *fp) { - int rc = true; - - spin_lock_bh(&fp->lock); - if (fp->state & BNX2X_FP_OWNED) - rc = false; - fp->state |= BNX2X_FP_STATE_DISABLED; - spin_unlock_bh(&fp->lock); + set_bit(BNX2X_STATE_FP_DISABLE_BIT, &fp->busy_poll_state); + return !bnx2x_fp_ll_polling(fp); - return rc; } #else -static inline void bnx2x_fp_init_lock(struct bnx2x_fastpath *fp) +static inline void bnx2x_fp_busy_poll_init(struct bnx2x_fastpath *fp) { } @@ -725,9 +692,8 @@ static inline bool bnx2x_fp_lock_napi(struct bnx2x_fastpath *fp) return true; } -static inline bool bnx2x_fp_unlock_napi(struct bnx2x_fastpath *fp) +static inline void bnx2x_fp_unlock_napi(struct bnx2x_fastpath *fp) { - return false; } static inline bool bnx2x_fp_lock_poll(struct bnx2x_fastpath *fp) @@ -735,9 +701,8 @@ static inline bool bnx2x_fp_lock_poll(struct bnx2x_fastpath *fp) return false; } -static inline bool bnx2x_fp_unlock_poll(struct bnx2x_fastpath *fp) +static inline void bnx2x_fp_unlock_poll(struct bnx2x_fastpath *fp) { - return false; } static inline bool bnx2x_fp_ll_polling(struct bnx2x_fastpath *fp) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 0a9faa1..2f63467 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1849,7 +1849,7 @@ static void bnx2x_napi_enable_cnic(struct bnx2x *bp) int i; for_each_rx_queue_cnic(bp, i) { - bnx2x_fp_init_lock(&bp->fp[i]); + bnx2x_fp_busy_poll_init(&bp->fp[i]); napi_enable(&bnx2x_fp(bp, i, napi)); } } @@ -1859,7 +1859,7 @@ static void bnx2x_napi_enable(struct bnx2x *bp) int i; for_each_eth_queue(bp, i) { - bnx2x_fp_init_lock(&bp->fp[i]); + bnx2x_fp_busy_poll_init(&bp->fp[i]); napi_enable(&bnx2x_fp(bp, i, napi)); } } @@ -3191,9 +3191,10 @@ static int bnx2x_poll(struct napi_struct *napi, int budget) } } + bnx2x_fp_unlock_napi(fp); + /* Fall out from the NAPI loop if needed */ - if (!bnx2x_fp_unlock_napi(fp) && - !(bnx2x_has_rx_work(fp) || bnx2x_has_tx_work(fp))) { + if (!(bnx2x_has_rx_work(fp) || bnx2x_has_tx_work(fp))) { /* No need to update SB for FCoE L2 ring as long as * it's connected to the default SB and 
the SB -- cgit v0.10.2 From f72f116a2a70f616ea44f86775ae6404c84ea8ef Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 14 Apr 2015 13:24:33 -0700 Subject: cxgb4: drop __GFP_NOFAIL allocation set_filter_wr is requesting __GFP_NOFAIL allocation although it can return ENOMEM without any problems obviously (t4_l2t_set_switching does that already). So the non-failing requirement is too strong without any obvious reason. Drop __GFP_NOFAIL and reorganize the code to have the failure paths easier. The same applies to _c4iw_write_mem_dma_aligned which uses __GFP_NOFAIL and then checks the return value and returns -ENOMEM on failure. This doesn't make any sense what so ever. Either the allocation cannot fail or it can. del_filter_wr seems to be safe as well because the filter entry is not marked as pending and the return value is propagated up the stack up to c4iw_destroy_listen. Signed-off-by: Michal Hocko Cc: David Rientjes Cc: Johannes Weiner Cc: Dave Chinner Cc: "Theodore Ts'o" Cc: Mel Gorman Cc: Tetsuo Handa Cc: "David S. Miller" Cc: Hariprasad S Cc: Jan Kara Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: David S. Miller diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 6791fd1..3ef0cf9 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -73,7 +73,7 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, c4iw_init_wr_wait(&wr_wait); wr_len = roundup(sizeof(*req) + sizeof(*sgl), 16); - skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL); + skb = alloc_skb(wr_len, GFP_KERNEL); if (!skb) return -ENOMEM; set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 6de0544..803d91b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -1140,6 +1140,10 @@ static int set_filter_wr(struct adapter *adapter, int fidx) struct fw_filter_wr *fwr; unsigned int ftid; + skb = alloc_skb(sizeof(*fwr), GFP_KERNEL); + if (!skb) + return -ENOMEM; + /* If the new filter requires loopback Destination MAC and/or VLAN * rewriting then we need to allocate a Layer 2 Table (L2T) entry for * the filter. 
@@ -1147,19 +1151,21 @@ static int set_filter_wr(struct adapter *adapter, int fidx) if (f->fs.newdmac || f->fs.newvlan) { /* allocate L2T entry for new filter */ f->l2t = t4_l2t_alloc_switching(adapter->l2t); - if (f->l2t == NULL) + if (f->l2t == NULL) { + kfree_skb(skb); return -EAGAIN; + } if (t4_l2t_set_switching(adapter, f->l2t, f->fs.vlan, f->fs.eport, f->fs.dmac)) { cxgb4_l2t_release(f->l2t); f->l2t = NULL; + kfree_skb(skb); return -ENOMEM; } } ftid = adapter->tids.ftid_base + fidx; - skb = alloc_skb(sizeof(*fwr), GFP_KERNEL | __GFP_NOFAIL); fwr = (struct fw_filter_wr *)__skb_put(skb, sizeof(*fwr)); memset(fwr, 0, sizeof(*fwr)); @@ -1257,7 +1263,10 @@ static int del_filter_wr(struct adapter *adapter, int fidx) len = sizeof(*fwr); ftid = adapter->tids.ftid_base + fidx; - skb = alloc_skb(len, GFP_KERNEL | __GFP_NOFAIL); + skb = alloc_skb(len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + fwr = (struct fw_filter_wr *)__skb_put(skb, len); t4_mk_filtdelwr(ftid, fwr, adapter->sge.fw_evtq.abs_id); -- cgit v0.10.2 From c3de6317d748e23b9e46ba36e10483728d00d144 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 14 Apr 2015 15:57:13 -0700 Subject: bpf: fix verifier memory corruption Due to missing bounds check the DAG pass of the BPF verifier can corrupt the memory which can cause random crashes during program loading: [8.449451] BUG: unable to handle kernel paging request at ffffffffffffffff [8.451293] IP: [] kmem_cache_alloc_trace+0x8d/0x2f0 [8.452329] Oops: 0000 [#1] SMP [8.452329] Call Trace: [8.452329] [] bpf_check+0x852/0x2000 [8.452329] [] bpf_prog_load+0x1e4/0x310 [8.452329] [] ? might_fault+0x5f/0xb0 [8.452329] [] SyS_bpf+0x806/0xa30 Fixes: f1bca824dabb ("bpf: add search pruning optimization to verifier") Signed-off-by: Alexei Starovoitov Acked-by: Hannes Frederic Sowa Acked-by: Daniel Borkmann Signed-off-by: David S. Miller diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 630a7ba..66bec36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1397,7 +1397,8 @@ peek_stack: /* tell verifier to check for equivalent states * after every call and jump */ - env->explored_states[t + 1] = STATE_LIST_MARK; + if (t + 1 < insn_cnt) + env->explored_states[t + 1] = STATE_LIST_MARK; } else { /* conditional jump with two edges */ ret = push_insn(t, t + 1, FALLTHROUGH, env); -- cgit v0.10.2 From 540207ae69777b85d167df28f469e77f0fcbb8f9 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 15 Apr 2015 11:48:49 -0700 Subject: fou: avoid missing unlock in failure path Fixes: 7a6c8c34e5b7 ("fou: implement FOU_CMD_GET") Reported-by: Dan Carpenter Cc: Dan Carpenter Signed-off-by: Cong Wang Signed-off-by: David S. Miller diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index af150b4..34968cd 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -711,11 +711,10 @@ static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, FOU_CMD_GET); if (ret) - goto done; + break; } mutex_unlock(&fn->fou_lock); -done: cb->args[0] = idx; return skb->len; } -- cgit v0.10.2 From 538cc282a3efb62cdb62ff9e6e44918c11a3c886 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 15 Apr 2015 22:12:42 -0700 Subject: dsa: mv88e6xxx: Fix error handling in mv88e6xxx_set_port_state Return correct error code if _mv88e6xxx_reg_read returns an error. Fixes: facd95b2e0ec0 ("net: dsa: mv88e6xxx: Add Hardware bridging support") Signed-off-by: Guenter Roeck Reviewed-by: Andrew Lunn Reported-by: kbuild test robot Signed-off-by: David S. 
Miller diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c index fc8d3b6..f64186a 100644 --- a/drivers/net/dsa/mv88e6xxx.c +++ b/drivers/net/dsa/mv88e6xxx.c @@ -908,8 +908,10 @@ static int mv88e6xxx_set_port_state(struct dsa_switch *ds, int port, u8 state) mutex_lock(&ps->smi_mutex); reg = _mv88e6xxx_reg_read(ds, REG_PORT(port), PORT_CONTROL); - if (reg < 0) + if (reg < 0) { + ret = reg; goto abort; + } oldstate = reg & PORT_CONTROL_STATE_MASK; if (oldstate != state) { -- cgit v0.10.2 From 3122a92e80957e30ea9afd963128c6e6c0f5c2ac Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 16 Apr 2015 20:21:02 +0800 Subject: rocker: fix error return code in rocker_probe() Fix to return -EINVAL from the invalid PCI region size error handling case instead of 0, as done elsewhere in this function. Signed-off-by: Wei Yongjun Acked-by: Jiri Pirko Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index a87b177..a570a60 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4759,6 +4759,7 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (pci_resource_len(pdev, 0) < ROCKER_PCI_BAR0_SIZE) { dev_err(&pdev->dev, "invalid PCI region size\n"); + err = -EINVAL; goto err_pci_resource_len_check; } -- cgit v0.10.2 From b892e98090fed57797528bb17abcca260f23df2d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 16 Apr 2015 21:06:47 +0800 Subject: ethernet: remove unused including Remove including that don't need it. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index 4a42e96..f66641d 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -41,7 +41,6 @@ #include #include #include -#include #include "qca_7k.h" #include "qca_debug.h" -- cgit v0.10.2 From 5a950ad58d412d76d33f4f4399d69308d511c1a4 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 16 Apr 2015 21:17:35 +0800 Subject: netns: remove duplicated include from net_namespace.c Remove duplicated include. Signed-off-by: Wei Yongjun Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a3abb719..78fc04a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include -- cgit v0.10.2 From 31976cfdc7e2e8b0b23d731703e4f1e21ff79466 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 16 Apr 2015 06:49:50 -0700 Subject: dsa: mv88e6xxx: Drop duplicate declaration of 'ret' variable A duplicate declaration of 'ret' can result in hiding an error code. Drop it. Fixes: 17ee3e04ddbf ("net: dsa: Provide additional RMON statistics") Signed-off-by: Guenter Roeck Signed-off-by: David S. 
Miller diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c index f64186a..12f186c 100644 --- a/drivers/net/dsa/mv88e6xxx.c +++ b/drivers/net/dsa/mv88e6xxx.c @@ -602,8 +602,6 @@ static void _mv88e6xxx_get_ethtool_stats(struct dsa_switch *ds, u32 high = 0; if (s->reg >= 0x100) { - int ret; - ret = mv88e6xxx_reg_read(ds, REG_PORT(port), s->reg - 0x100); if (ret < 0) -- cgit v0.10.2 From e743471f8d9c6ad00eb10a6d1e05803231e1de83 Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Thu, 16 Apr 2015 17:56:03 +0100 Subject: stmmac: fix oops on rmmod after assigning ip addr An oops exists in the flow of stmmac_release(). phy_ethtool_get_wol() depends on phydev->drv. phydev->drv will be null after stmmac_mdio_unreg() completes. Steps to reproduce on Quark X1000: 1. ifconfig eth0 192.168.0.1 2. rmmod stmmac_pci To fix this stmmac_mdio_unreg() should be run after unregister_netdev(). Signed-off-by: Bryan O'Donoghue Reported-by: Dan O'Donovan Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 06103ca..6065173 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2970,15 +2970,15 @@ int stmmac_dvr_remove(struct net_device *ndev) priv->hw->dma->stop_tx(priv->ioaddr); stmmac_set_mac(priv->ioaddr, false); - if (priv->pcs != STMMAC_PCS_RGMII && priv->pcs != STMMAC_PCS_TBI && - priv->pcs != STMMAC_PCS_RTBI) - stmmac_mdio_unregister(ndev); netif_carrier_off(ndev); unregister_netdev(ndev); if (priv->stmmac_rst) reset_control_assert(priv->stmmac_rst); clk_disable_unprepare(priv->pclk); clk_disable_unprepare(priv->stmmac_clk); + if (priv->pcs != STMMAC_PCS_RGMII && priv->pcs != STMMAC_PCS_TBI && + priv->pcs != STMMAC_PCS_RTBI) + stmmac_mdio_unregister(ndev); free_netdev(ndev); return 0; -- cgit v0.10.2 From 13967f0c2abc088c27718319b9a571ccbf43fa89 Mon Sep 17 00:00:00 2001 From: Vince Bridgers Date: Wed, 15 Apr 2015 11:17:38 -0500 Subject: stmmac: Add properties for transmit and receive fifo sizes The Synopsys stmmac fifo sizes are configurable, and need to be known in order to configure certain controller features. This patch adds tx-fifo-depth and rx-fifo-depth properties to the stmmac document file. Signed-off-by: Vince Bridgers Signed-off-by: David S. Miller diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt index 3fc3605..41b3f3f 100644 --- a/Documentation/devicetree/bindings/net/ethernet.txt +++ b/Documentation/devicetree/bindings/net/ethernet.txt @@ -19,6 +19,12 @@ The following properties are common to the Ethernet controllers: - phy: the same as "phy-handle" property, not recommended for new bindings. - phy-device: the same as "phy-handle" property, not recommended for new bindings. +- rx-fifo-depth: the size of the controller's receive fifo in bytes. This + is used for components that can have configurable receive fifo sizes, + and is useful for determining certain configuration settings such as + flow control thresholds. +- tx-fifo-depth: the size of the controller's transmit fifo in bytes. This + is used for components that can have configurable fifo sizes. Child nodes of the Ethernet controller are typically the individual PHY devices connected via the MDIO bus (sometimes the MDIO bus controller is separate). 
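As a reference for consumers of these new bindings, the sketch below shows how a driver's devicetree parsing path could pick the properties up; the structure and function names are illustrative assumptions, not part of the binding (the actual stmmac change follows later in this series).

/* Minimal sketch: reading the optional fifo-depth properties.
 * example_plat_data and example_read_fifo_depths() are made-up names
 * used only for illustration.
 */
#include <linux/of.h>
#include <linux/types.h>

struct example_plat_data {
	u32 tx_fifo_size;	/* bytes, 0 = not specified */
	u32 rx_fifo_size;	/* bytes, 0 = not specified */
};

static void example_read_fifo_depths(const struct device_node *np,
				     struct example_plat_data *plat)
{
	/* Both properties are optional: of_property_read_u32() returns an
	 * error and leaves the output untouched when a property is absent,
	 * so callers can treat 0 as "size not provided".
	 */
	of_property_read_u32(np, "tx-fifo-depth", &plat->tx_fifo_size);
	of_property_read_u32(np, "rx-fifo-depth", &plat->rx_fifo_size);
}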
diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt index 29aca85..f34fc3c 100644 --- a/Documentation/devicetree/bindings/net/stmmac.txt +++ b/Documentation/devicetree/bindings/net/stmmac.txt @@ -45,6 +45,8 @@ Optional properties: If not passed then the system clock will be used and this is fine on some platforms. - snps,burst_len: The AXI burst lenth value of the AXI BUS MODE register. +- tx-fifo-depth: See ethernet.txt file in the same directory +- rx-fifo-depth: See ethernet.txt file in the same directory Examples: @@ -59,6 +61,8 @@ Examples: phy-mode = "gmii"; snps,multicast-filter-bins = <256>; snps,perfect-filter-entries = <128>; + rx-fifo-depth = <16384>; + tx-fifo-depth = <16384>; clocks = <&clock>; clock-names = "stmmaceth"; }; -- cgit v0.10.2 From 2453beb632e3e8c194c87953a536a4c58b149867 Mon Sep 17 00:00:00 2001 From: Vince Bridgers Date: Wed, 15 Apr 2015 11:17:39 -0500 Subject: stmmac: Add defines and documentation for enabling flow control Add defines and documentation for enabling flow control on the stmmac. Flow control was not implemented correctly on the stmmac driver and is currently non-functional as a result. This is the first in a series of small patches to correctly implement this feature. Signed-off-by: Vince Bridgers Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h index 64d8f56..b3fe057 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h @@ -172,6 +172,7 @@ enum inter_frame_gap { /* GMAC FLOW CTRL defines */ #define GMAC_FLOW_CTRL_PT_MASK 0xffff0000 /* Pause Time Mask */ #define GMAC_FLOW_CTRL_PT_SHIFT 16 +#define GMAC_FLOW_CTRL_UP 0x00000008 /* Unicast pause frame enable */ #define GMAC_FLOW_CTRL_RFE 0x00000004 /* Rx Flow Control Enable */ #define GMAC_FLOW_CTRL_TFE 0x00000002 /* Tx Flow Control Enable */ #define GMAC_FLOW_CTRL_FCB_BPA 0x00000001 /* Flow Control Busy ... */ @@ -246,6 +247,56 @@ enum ttc_control { #define DMA_CONTROL_FEF 0x00000080 #define DMA_CONTROL_FUF 0x00000040 +/* Receive flow control activation field + * RFA field in DMA control register, bits 23,10:9 + */ +#define DMA_CONTROL_RFA_MASK 0x00800600 + +/* Receive flow control deactivation field + * RFD field in DMA control register, bits 22,12:11 + */ +#define DMA_CONTROL_RFD_MASK 0x00401800 + +/* RFD and RFA fields are encoded as follows + * + * Bit Field + * 0,00 - Full minus 1KB (only valid when rxfifo >= 4KB and EFC enabled) + * 0,01 - Full minus 2KB (only valid when rxfifo >= 4KB and EFC enabled) + * 0,10 - Full minus 3KB (only valid when rxfifo >= 4KB and EFC enabled) + * 0,11 - Full minus 4KB (only valid when rxfifo > 4KB and EFC enabled) + * 1,00 - Full minus 5KB (only valid when rxfifo > 8KB and EFC enabled) + * 1,01 - Full minus 6KB (only valid when rxfifo > 8KB and EFC enabled) + * 1,10 - Full minus 7KB (only valid when rxfifo > 8KB and EFC enabled) + * 1,11 - Reserved + * + * RFD should always be > RFA for a given FIFO size. RFD == RFA may work, + * but packet throughput performance may not be as expected. + * + * Be sure that bit 3 in GMAC Register 6 is set for Unicast Pause frame + * detection (IEEE Specification Requirement, Annex 31B, 31B.1, Pause + * Description). + * + * Be sure that DZPA (bit 7 in Flow Control Register, GMAC Register 6), + * is set to 0. This allows pause frames with a quanta of 0 to be sent + * as an XOFF message to the link peer. 
+ */ + +#define RFA_FULL_MINUS_1K 0x00000000 +#define RFA_FULL_MINUS_2K 0x00000200 +#define RFA_FULL_MINUS_3K 0x00000400 +#define RFA_FULL_MINUS_4K 0x00000600 +#define RFA_FULL_MINUS_5K 0x00800000 +#define RFA_FULL_MINUS_6K 0x00800200 +#define RFA_FULL_MINUS_7K 0x00800400 + +#define RFD_FULL_MINUS_1K 0x00000000 +#define RFD_FULL_MINUS_2K 0x00000800 +#define RFD_FULL_MINUS_3K 0x00001000 +#define RFD_FULL_MINUS_4K 0x00001800 +#define RFD_FULL_MINUS_5K 0x00400000 +#define RFD_FULL_MINUS_6K 0x00400800 +#define RFD_FULL_MINUS_7K 0x00401000 + enum rtc_control { DMA_CONTROL_RTC_64 = 0x00000000, DMA_CONTROL_RTC_32 = 0x00000008, -- cgit v0.10.2 From e7877f52fd4a8d7012f9b0faecc047a50c132a79 Mon Sep 17 00:00:00 2001 From: Vince Bridgers Date: Wed, 15 Apr 2015 11:17:40 -0500 Subject: stmmac: Read tx-fifo-depth and rx-fifo-depth from the devicetree Read the tx-fifo-depth and rx-fifo-depth from the devicetree. The Synopsys stmmac controller fifos are configurable per product instance, and the fifo sizes are needed to configure certain features correctly such as flow control. Signed-off-by: Vince Bridgers Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index f9b42f1..705bbdf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -181,6 +181,10 @@ static int stmmac_probe_config_dt(struct platform_device *pdev, sizeof(struct stmmac_mdio_bus_data), GFP_KERNEL); + of_property_read_u32(np, "tx-fifo-depth", &plat->tx_fifo_size); + + of_property_read_u32(np, "rx-fifo-depth", &plat->rx_fifo_size); + plat->force_sf_dma_mode = of_property_read_bool(np, "snps,force_sf_dma_mode"); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index cd63851..7f484a2 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -114,6 +114,8 @@ struct plat_stmmacenet_data { int maxmtu; int multicast_filter_bins; int unicast_filter_entries; + int tx_fifo_size; + int rx_fifo_size; void (*fix_mac_speed)(void *priv, unsigned int speed); void (*bus_setup)(void __iomem *ioaddr); void *(*setup)(struct platform_device *pdev); -- cgit v0.10.2 From 545d655ebbea65986cb762905b81bff54f42eb6d Mon Sep 17 00:00:00 2001 From: Vince Bridgers Date: Wed, 15 Apr 2015 11:17:41 -0500 Subject: stmmac: Enable unicast pause frame detect in GMAC Register 6 Unicast pause frame detect was not being enabled for the Synopsys stmmac. This patch sets Unicast pause frame detect in MAC register 6 so that pause frame detection by the stmmac conforms to IEEE 802.3, Annex 31B.3.3 Receive Operation - Specifically, a MAC shall respond to pause frames containing either the reserved multicast address or the unique physical address associated with this station. Signed-off-by: Vince Bridgers Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index 0adcf73..371a669 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -201,7 +201,10 @@ static void dwmac1000_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, unsigned int fc, unsigned int pause_time) { void __iomem *ioaddr = hw->pcsr; - unsigned int flow = 0; + /* Set flow such that DZPQ in Mac Register 6 is 0, + * and unicast pause detect is enabled. 
+ */ + unsigned int flow = GMAC_FLOW_CTRL_UP; pr_debug("GMAC Flow-Control:\n"); if (fc & FLOW_RX) { -- cgit v0.10.2 From f88203a229cca0b3634738b7dae47419d1da6dc8 Mon Sep 17 00:00:00 2001 From: Vince Bridgers Date: Wed, 15 Apr 2015 11:17:42 -0500 Subject: stmmac: Configure Flow Control to work correctly based on rxfifo size Configure flow control correctly, and based on the receive fifo size read as a property from the devicetree since the Synopsys stmmac fifo sizes are configurable based on a particular chip's implementation. This patch maintains the previous incorrect behavior unless the receive fifo size is found in the devicetree. Signed-off-by: Vince Bridgers Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index cd77289..623c6ed 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -150,7 +150,7 @@ struct stmmac_extra_stats { #define MAC_CSR_H_FRQ_MASK 0x20 #define HASH_TABLE_SIZE 64 -#define PAUSE_TIME 0x200 +#define PAUSE_TIME 0xffff /* Flow Control defines */ #define FLOW_OFF 0 @@ -357,7 +357,8 @@ struct stmmac_dma_ops { void (*dump_regs) (void __iomem *ioaddr); /* Set tx/rx threshold in the csr6 register * An invalid value enables the store-and-forward mode */ - void (*dma_mode) (void __iomem *ioaddr, int txmode, int rxmode); + void (*dma_mode)(void __iomem *ioaddr, int txmode, int rxmode, + int rxfifosz); /* To track extra statistic (if supported) */ void (*dma_diagnostic_fr) (void *data, struct stmmac_extra_stats *x, void __iomem *ioaddr); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c index 59d92e8..0e8937c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c @@ -106,8 +106,29 @@ static int dwmac1000_dma_init(void __iomem *ioaddr, int pbl, int fb, int mb, return 0; } +static u32 dwmac1000_configure_fc(u32 csr6, int rxfifosz) +{ + csr6 &= ~DMA_CONTROL_RFA_MASK; + csr6 &= ~DMA_CONTROL_RFD_MASK; + + /* Leave flow control disabled if receive fifo size is less than + * 4K or 0. Otherwise, send XOFF when fifo is 1K less than full, + * and send XON when 2K less than full. + */ + if (rxfifosz < 4096) { + csr6 &= ~DMA_CONTROL_EFC; + pr_debug("GMAC: disabling flow control, rxfifo too small(%d)\n", + rxfifosz); + } else { + csr6 |= DMA_CONTROL_EFC; + csr6 |= RFA_FULL_MINUS_1K; + csr6 |= RFD_FULL_MINUS_2K; + } + return csr6; +} + static void dwmac1000_dma_operation_mode(void __iomem *ioaddr, int txmode, - int rxmode) + int rxmode, int rxfifosz) { u32 csr6 = readl(ioaddr + DMA_CONTROL); @@ -153,6 +174,9 @@ static void dwmac1000_dma_operation_mode(void __iomem *ioaddr, int txmode, csr6 |= DMA_CONTROL_RTC_128; } + /* Configure flow control based on rx fifo size */ + csr6 = dwmac1000_configure_fc(csr6, rxfifosz); + writel(csr6, ioaddr + DMA_CONTROL); } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c index 7d1dce9..9d0971c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c @@ -72,7 +72,7 @@ static int dwmac100_dma_init(void __iomem *ioaddr, int pbl, int fb, int mb, * control register. 
*/ static void dwmac100_dma_operation_mode(void __iomem *ioaddr, int txmode, - int rxmode) + int rxmode, int rxfifosz) { u32 csr6 = readl(ioaddr + DMA_CONTROL); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 6065173..05c146f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1277,8 +1277,10 @@ static void free_dma_desc_resources(struct stmmac_priv *priv) */ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) { + int rxfifosz = priv->plat->rx_fifo_size; + if (priv->plat->force_thresh_dma_mode) - priv->hw->dma->dma_mode(priv->ioaddr, tc, tc); + priv->hw->dma->dma_mode(priv->ioaddr, tc, tc, rxfifosz); else if (priv->plat->force_sf_dma_mode || priv->plat->tx_coe) { /* * In case of GMAC, SF mode can be enabled @@ -1287,10 +1289,12 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) * 2) There is no bugged Jumbo frame support * that needs to not insert csum in the TDES. */ - priv->hw->dma->dma_mode(priv->ioaddr, SF_DMA_MODE, SF_DMA_MODE); + priv->hw->dma->dma_mode(priv->ioaddr, SF_DMA_MODE, SF_DMA_MODE, + rxfifosz); priv->xstats.threshold = SF_DMA_MODE; } else - priv->hw->dma->dma_mode(priv->ioaddr, tc, SF_DMA_MODE); + priv->hw->dma->dma_mode(priv->ioaddr, tc, SF_DMA_MODE, + rxfifosz); } /** @@ -1442,6 +1446,7 @@ static void stmmac_tx_err(struct stmmac_priv *priv) static void stmmac_dma_interrupt(struct stmmac_priv *priv) { int status; + int rxfifosz = priv->plat->rx_fifo_size; status = priv->hw->dma->dma_interrupt(priv->ioaddr, &priv->xstats); if (likely((status & handle_rx)) || (status & handle_tx)) { @@ -1456,10 +1461,11 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv) (tc <= 256)) { tc += 64; if (priv->plat->force_thresh_dma_mode) - priv->hw->dma->dma_mode(priv->ioaddr, tc, tc); + priv->hw->dma->dma_mode(priv->ioaddr, tc, tc, + rxfifosz); else priv->hw->dma->dma_mode(priv->ioaddr, tc, - SF_DMA_MODE); + SF_DMA_MODE, rxfifosz); priv->xstats.threshold = tc; } } else if (unlikely(status == tx_hard_error)) -- cgit v0.10.2 From a166151cbe33b53221c24259e4a7201064b3ba79 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 15 Apr 2015 12:55:45 -0700 Subject: bpf: fix bpf helpers to use skb->mac_header relative offsets For the short-term solution, lets fix bpf helper functions to use skb->mac_header relative offsets instead of skb->data in order to get the same eBPF programs with cls_bpf and act_bpf work on ingress and egress qdisc path. We need to ensure that mac_header is set before calling into programs. This is effectively the first option from below referenced discussion. More long term solution for LD_ABS|LD_IND instructions will be more intrusive but also more beneficial than this, and implemented later as it's too risky at this point in time. I.e., we plan to look into the option of moving skb_pull() out of eth_type_trans() and into netif_receive_skb() as has been suggested as second option. Meanwhile, this solution ensures ingress can be used with eBPF, too, and that we won't run into ABI troubles later. For dealing with negative offsets inside eBPF helper functions, we've implemented bpf_skb_clone_unwritable() to test for unwriteable headers. 
Reference: http://thread.gmane.org/gmane.linux.network/359129/focus=359694 Fixes: 608cd71a9c7c ("tc: bpf: generalize pedit action") Fixes: 91bc4822c3d6 ("tc: bpf: add checksum helpers") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5c1cee1..a9ebdf5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -177,7 +177,7 @@ enum bpf_func_id { /** * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet * @skb: pointer to skb - * @offset: offset within packet from skb->data + * @offset: offset within packet from skb->mac_header * @from: pointer where to copy bytes from * @len: number of bytes to store into packet * @flags: bit 0 - if true, recompute skb->csum diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h index 34c7936..c97340e 100644 --- a/include/uapi/linux/filter.h +++ b/include/uapi/linux/filter.h @@ -79,8 +79,11 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ #define SKF_AD_RANDOM 56 #define SKF_AD_VLAN_TPID 60 #define SKF_AD_MAX 64 -#define SKF_NET_OFF (-0x100000) -#define SKF_LL_OFF (-0x200000) +#define SKF_NET_OFF (-0x100000) +#define SKF_LL_OFF (-0x200000) + +#define BPF_NET_OFF SKF_NET_OFF +#define BPF_LL_OFF SKF_LL_OFF #endif /* _UAPI__LINUX_FILTER_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index b669e75..bf831a8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1175,12 +1175,27 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } +/** + * bpf_skb_clone_not_writable - is the header of a clone not writable + * @skb: buffer to check + * @len: length up to which to write, can be negative + * + * Returns true if modifying the header part of the cloned buffer + * does require the data to be copied. I.e. this version works with + * negative lengths needed for eBPF case! 
+ */ +static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len) +{ + return skb_header_cloned(skb) || + (int) skb_headroom(skb) + len > skb->hdr_len; +} + #define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; + int offset = (int) r2; void *from = (void *) (long) r3; unsigned int len = (unsigned int) r4; char buf[16]; @@ -1194,10 +1209,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) * * so check for invalid 'offset' and too large 'len' */ - if (unlikely(offset > 0xffff || len > sizeof(buf))) + if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) return -EFAULT; - if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len)) + offset -= skb->data - skb_mac_header(skb); + if (unlikely(skb_cloned(skb) && + bpf_skb_clone_unwritable(skb, offset + len))) return -EFAULT; ptr = skb_header_pointer(skb, offset, len, buf); @@ -1232,15 +1249,18 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = { #define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f) #define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10) -static u64 bpf_l3_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags) +static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; + int offset = (int) r2; __sum16 sum, *ptr; - if (unlikely(offset > 0xffff)) + if (unlikely((u32) offset > 0xffff)) return -EFAULT; - if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum))) + offset -= skb->data - skb_mac_header(skb); + if (unlikely(skb_cloned(skb) && + bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1276,16 +1296,19 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_l4_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags) +static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags); + int offset = (int) r2; __sum16 sum, *ptr; - if (unlikely(offset > 0xffff)) + if (unlikely((u32) offset > 0xffff)) return -EFAULT; - if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum))) + offset -= skb->data - skb_mac_header(skb); + if (unlikely(skb_cloned(skb) && + bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 4d2cede..dc6a2d3 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -38,6 +38,9 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_bpf *prog = act->priv; int action, filter_res; + if (unlikely(!skb_mac_header_was_set(skb))) + return TC_ACT_UNSPEC; + spin_lock(&prog->tcf_lock); prog->tcf_tm.lastuse = jiffies; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5c4171c..91bd9c1 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -66,6 +66,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_bpf_prog *prog; int ret = -1; + if (unlikely(!skb_mac_header_was_set(skb))) + return -1; + /* Needed here for accessing maps. 
*/ rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c index 7cf3f42..7c27710 100644 --- a/samples/bpf/tcbpf1_kern.c +++ b/samples/bpf/tcbpf1_kern.c @@ -4,6 +4,8 @@ #include #include #include +#include + #include "bpf_helpers.h" /* compiler workaround */ @@ -14,18 +16,12 @@ static inline void set_dst_mac(struct __sk_buff *skb, char *mac) bpf_skb_store_bytes(skb, 0, mac, ETH_ALEN, 1); } -/* use 1 below for ingress qdisc and 0 for egress */ -#if 0 -#undef ETH_HLEN -#define ETH_HLEN 0 -#endif - #define IP_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check)) #define TOS_OFF (ETH_HLEN + offsetof(struct iphdr, tos)) static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) { - __u8 old_tos = load_byte(skb, TOS_OFF); + __u8 old_tos = load_byte(skb, BPF_LL_OFF + TOS_OFF); bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2); bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0); @@ -38,7 +34,7 @@ static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) { - __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF)); + __u32 old_ip = _htonl(load_word(skb, BPF_LL_OFF + IP_SRC_OFF)); bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip)); bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); @@ -48,7 +44,7 @@ static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) #define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest)) static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) { - __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF)); + __u16 old_port = htons(load_half(skb, BPF_LL_OFF + TCP_DPORT_OFF)); bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port)); bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0); @@ -57,7 +53,7 @@ static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) SEC("classifier") int bpf_prog1(struct __sk_buff *skb) { - __u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); + __u8 proto = load_byte(skb, BPF_LL_OFF + ETH_HLEN + offsetof(struct iphdr, protocol)); long *value; if (proto == IPPROTO_TCP) { -- cgit v0.10.2 From 725f9dcd58dedfea49ef958babf6c0bf6b7594a9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 15 Apr 2015 16:19:33 -0700 Subject: bpf: fix two bugs in verification logic when accessing 'ctx' pointer 1. first bug is a silly mistake. It broke tracing examples and prevented simple bpf programs from loading. In the following code: if (insn->imm == 0 && BPF_SIZE(insn->code) == BPF_W) { } else if (...) { // this part should have been executed when // insn->code == BPF_W and insn->imm != 0 } Obviously it's not doing that. So simple instructions like: r2 = *(u64 *)(r1 + 8) will be rejected. Note the comments in the code around these branches were and still valid and indicate the true intent. Replace it with: if (BPF_SIZE(insn->code) != BPF_W) continue; if (insn->imm == 0) { } else if (...) { // now this code will be executed when // insn->code == BPF_W and insn->imm != 0 } 2. second bug is more subtle. If malicious code is using the same dest register as source register, the checks designed to prevent the same instruction to be used with different pointer types will fail to trigger, since we were assigning src_reg_type when it was already overwritten by check_mem_access(). 
The fix is trivial. Just move line: src_reg_type = regs[insn->src_reg].type; before check_mem_access(). Add new 'access skb fields bad4' test to check this case. Fixes: 9bac3d6d548e ("bpf: allow extended BPF programs access skb fields") Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 66bec36..47dcd3a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1637,6 +1637,8 @@ static int do_check(struct verifier_env *env) if (err) return err; + src_reg_type = regs[insn->src_reg].type; + /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ @@ -1646,9 +1648,12 @@ static int do_check(struct verifier_env *env) if (err) return err; - src_reg_type = regs[insn->src_reg].type; + if (BPF_SIZE(insn->code) != BPF_W) { + insn_idx++; + continue; + } - if (insn->imm == 0 && BPF_SIZE(insn->code) == BPF_W) { + if (insn->imm == 0) { /* saw a valid insn * dst_reg = *(u32 *)(src_reg + off) * use reserved 'imm' field to mark this insn diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index 9ab6456..12f3780 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -721,6 +721,28 @@ static struct bpf_test tests[] = { .errstr = "different pointers", .result = REJECT, }, + { + "access skb fields bad4", + .insns = { + BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 3), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_JMP_IMM(BPF_JA, 0, 0, -13), + }, + .fixup = {7}, + .errstr = "different pointers", + .result = REJECT, + }, }; static int probe_filter_length(struct bpf_insn *fp) -- cgit v0.10.2 From 4c0ee414e877b899f7fc80aafb98d9425c02797f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 16 Apr 2015 16:12:53 +0800 Subject: Revert "net: Reset secmark when scrubbing packet" This patch reverts commit b8fb4e0648a2ab3734140342002f68fb0c7d1602 because the secmark must be preserved even when a packet crosses namespace boundaries. The reason is that security labels apply to the system as a whole and is not per-namespace. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3b6e583..f9800f4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4133,7 +4133,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb_dst_drop(skb); skb->mark = 0; skb_sender_cpu_clear(skb); - skb_init_secmark(skb); secpath_reset(skb); nf_reset(skb); nf_reset_trace(skb); -- cgit v0.10.2 From 213dd74aee765d4e5f3f4b9607fef0cf97faa2af Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 16 Apr 2015 09:03:27 +0800 Subject: skbuff: Do not scrub skb mark within the same name space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Wed, Apr 15, 2015 at 05:41:26PM +0200, Nicolas Dichtel wrote: > Le 15/04/2015 15:57, Herbert Xu a écrit : > >On Wed, Apr 15, 2015 at 06:22:29PM +0800, Herbert Xu wrote: > [snip] > >Subject: skbuff: Do not scrub skb mark within the same name space > > > >The commit ea23192e8e577dfc51e0f4fc5ca113af334edff9 ("tunnels: > Maybe add a Fixes tag? 
> Fixes: ea23192e8e57 ("tunnels: harmonize cleanup done on skb on rx path") > > >harmonize cleanup done on skb on rx path") broke anyone trying to > >use netfilter marking across IPv4 tunnels. While most of the > >fields that are cleared by skb_scrub_packet don't matter, the > >netfilter mark must be preserved. > > > >This patch rearranges skb_scurb_packet to preserve the mark field. > nit: s/scurb/scrub > > Else it's fine for me. Sure. PS I used the wrong email for James the first time around. So let me repeat the question here. Should secmark be preserved or cleared across tunnels within the same name space? In fact, do our security models even support name spaces? ---8<--- The commit ea23192e8e577dfc51e0f4fc5ca113af334edff9 ("tunnels: harmonize cleanup done on skb on rx path") broke anyone trying to use netfilter marking across IPv4 tunnels. While most of the fields that are cleared by skb_scrub_packet don't matter, the netfilter mark must be preserved. This patch rearranges skb_scrub_packet to preserve the mark field. Fixes: ea23192e8e57 ("tunnels: harmonize cleanup done on skb on rx path") Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f9800f4..d1967da 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4124,18 +4124,21 @@ EXPORT_SYMBOL(skb_try_coalesce); */ void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - if (xnet) - skb_orphan(skb); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - skb->mark = 0; skb_sender_cpu_clear(skb); secpath_reset(skb); nf_reset(skb); nf_reset_trace(skb); + + if (!xnet) + return; + + skb_orphan(skb); + skb->mark = 0; } EXPORT_SYMBOL_GPL(skb_scrub_packet); -- cgit v0.10.2 From c3ffe6d2c9733729307799a60e1ae09a9878e697 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 16 Apr 2015 20:49:14 +0200 Subject: net: dsa: mv88e6xxx: Add missing initialization in mv88e6xxx_set_port_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/net/dsa/mv88e6xxx.c: In function ‘mv88e6xxx_set_port_state’: drivers/net/dsa/mv88e6xxx.c:905: warning: ‘ret’ may be used uninitialized in this function If oldstate == state, mv88e6xxx_set_port_state() will return an uninitialized value. Pre-initialize ret to zero to fix this. Signed-off-by: Geert Uytterhoeven Signed-off-by: David S. Miller diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c index 12f186c..9f0c2b9 100644 --- a/drivers/net/dsa/mv88e6xxx.c +++ b/drivers/net/dsa/mv88e6xxx.c @@ -900,7 +900,7 @@ static int _mv88e6xxx_flush_fid(struct dsa_switch *ds, int fid) static int mv88e6xxx_set_port_state(struct dsa_switch *ds, int port, u8 state) { struct mv88e6xxx_priv_state *ps = ds_to_priv(ds); - int reg, ret; + int reg, ret = 0; u8 oldstate; mutex_lock(&ps->smi_mutex); -- cgit v0.10.2 From fad9dfefea6405039491e7e4fc21fb6e59e7d26c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Apr 2015 16:12:28 -0700 Subject: tcp: tcp_get_info() should fetch socket fields once tcp_get_info() can be called without holding socket lock, so any socket fields can change under us. Use READ_ONCE() to fetch sk_pacing_rate and sk_max_pacing_rate Fixes: 977cb0ecf82e ("tcp: add pacing_rate information into tcp_info") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 18e3a12..59c8a02 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2595,6 +2595,7 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; + u32 rate; memset(info, 0, sizeof(*info)); @@ -2655,10 +2656,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_total_retrans = tp->total_retrans; - info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ? - sk->sk_pacing_rate : ~0ULL; - info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ? - sk->sk_max_pacing_rate : ~0ULL; + rate = READ_ONCE(sk->sk_pacing_rate); + info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL; + + rate = READ_ONCE(sk->sk_max_pacing_rate); + info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL; } EXPORT_SYMBOL_GPL(tcp_get_info); -- cgit v0.10.2 From 521f1cf1dbb9d5ad858dca5dc75d1b45f64b6589 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Apr 2015 18:10:35 -0700 Subject: inet_diag: fix access to tcp cc information Two different problems are fixed here : 1) inet_sk_diag_fill() might be called without socket lock held. icsk->icsk_ca_ops can change under us and module be unloaded. -> Access to freed memory. Fix this using rcu_read_lock() to prevent module unload. 2) Some TCP Congestion Control modules provide information but again this is not safe against icsk->icsk_ca_ops change and nla_put() errors were ignored. Some sockets could not get the additional info if skb was almost full. Fix this by returning a status from get_info() handlers and using rcu protection as well. Signed-off-by: Eric Dumazet Acked-by: Daniel Borkmann Signed-off-by: David S. Miller diff --git a/include/net/tcp.h b/include/net/tcp.h index 9598871..051dc5c2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -829,7 +829,7 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us); /* get info for inet_diag (optional) */ - void (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb); + int (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb); char name[TCP_CA_NAME_MAX]; struct module *owner; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 70e8b3c..bb77ebd 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -111,6 +111,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, const struct nlmsghdr *unlh) { const struct inet_sock *inet = inet_sk(sk); + const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; int ext = req->idiag_ext; struct inet_diag_msg *r; @@ -208,16 +209,31 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, info = nla_data(attr); } - if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) - if (nla_put_string(skb, INET_DIAG_CONG, - icsk->icsk_ca_ops->name) < 0) + if (ext & (1 << (INET_DIAG_CONG - 1))) { + int err = 0; + + rcu_read_lock(); + ca_ops = READ_ONCE(icsk->icsk_ca_ops); + if (ca_ops) + err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name); + rcu_read_unlock(); + if (err < 0) goto errout; + } handler->idiag_get_info(sk, r, info); - if (sk->sk_state < TCP_TIME_WAIT && - icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) - icsk->icsk_ca_ops->get_info(sk, ext, skb); + if (sk->sk_state < TCP_TIME_WAIT) { + int err = 0; + + rcu_read_lock(); + ca_ops = READ_ONCE(icsk->icsk_ca_ops); + if (ca_ops && 
ca_ops->get_info) + err = ca_ops->get_info(sk, ext, skb); + rcu_read_unlock(); + if (err < 0) + goto errout; + } out: nlmsg_end(skb, nlh); diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index b504371..4376016 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -277,7 +277,7 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) } } -static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct dctcp *ca = inet_csk_ca(sk); @@ -297,8 +297,9 @@ static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) info.dctcp_ab_tot = ca->acked_bytes_total; } - nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); + return nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); } + return 0; } static struct tcp_congestion_ops dctcp __read_mostly = { diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 1d5a30a..67476f0 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -300,8 +300,7 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) } /* Extract info for Tcp socket info provided via netlink. */ -static void tcp_illinois_info(struct sock *sk, u32 ext, - struct sk_buff *skb) +static int tcp_illinois_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct illinois *ca = inet_csk_ca(sk); @@ -318,8 +317,9 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, do_div(t, info.tcpv_rttcnt); info.tcpv_rtt = t; } - nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); } + return 0; } static struct tcp_congestion_ops tcp_illinois __read_mostly = { diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index a6afde6..c71a1b8 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -286,7 +286,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) } /* Extract info for Tcp socket info provided via netlink. */ -void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct vegas *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { @@ -297,8 +297,9 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) .tcpv_minrtt = ca->minRTT, }; - nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); } + return 0; } EXPORT_SYMBOL_GPL(tcp_vegas_get_info); diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index 0531b99..e8a6b33 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -19,6 +19,6 @@ void tcp_vegas_init(struct sock *sk); void tcp_vegas_state(struct sock *sk, u8 ca_state); void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); -void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); +int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); #endif /* __TCP_VEGAS_H */ diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index bb63fba..b3c57cc 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -256,8 +256,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) } /* Extract info for Tcp socket info provided via netlink. 
*/ -static void tcp_westwood_info(struct sock *sk, u32 ext, - struct sk_buff *skb) +static int tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct westwood *ca = inet_csk_ca(sk); @@ -268,8 +267,9 @@ static void tcp_westwood_info(struct sock *sk, u32 ext, .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), }; - nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); } + return 0; } static struct tcp_congestion_ops tcp_westwood __read_mostly = { -- cgit v0.10.2 From f40ae91307c275fc8b17420fa74145e9937c3c0b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 17 Apr 2015 13:32:09 +0800 Subject: act_mirred: Fix bogus header when redirecting from VLAN When you redirect a VLAN device to any device, you end up with crap in af_packet on the xmit path because hard_header_len is not equal to skb->mac_len. So the redirected packet contains four extra bytes at the start which then gets interpreted as part of the MAC address. This patch fixes this by only pushing skb->mac_len. We also need to fix ifb because it tries to undo the pushing done by act_mirred. Signed-off-by: Herbert Xu Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 34f846b..94570aa 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -105,7 +105,7 @@ static void ri_tasklet(unsigned long dev) if (from & AT_EGRESS) { dev_queue_xmit(skb); } else if (from & AT_INGRESS) { - skb_pull(skb, skb->dev->hard_header_len); + skb_pull(skb, skb->mac_len); netif_receive_skb(skb); } else BUG(); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5953517..3f63cea 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -157,7 +157,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, if (!(at & AT_EGRESS)) { if (m->tcfm_ok_push) - skb_push(skb2, skb2->dev->hard_header_len); + skb_push(skb2, skb->mac_len); } /* mirror is always swallowed */ -- cgit v0.10.2 From 8b86a61da37cbbcf4bd6e87fda494a59b1cf16c4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2015 15:45:04 +0200 Subject: net: remove unused 'dev' argument from netif_needs_gso() In commit 04ffcb255f22 ("net: Add ndo_gso_check") Tom originally added the 'dev' argument to be able to call ndo_gso_check(). Then later, when generalizing this in commit 5f35227ea34b ("net: Generalize ndo_gso_check to ndo_features_check") Jesse removed the call to ndo_gso_check() in netif_needs_gso() by calling the new ndo_features_check() in a different place. This made the 'dev' argument unused. Remove the unused argument and go back to the code as before. Cc: Tom Herbert Cc: Jesse Gross Signed-off-by: Johannes Berg Signed-off-by: David S. 
Miller diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 8362aef..58c6ba5 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -313,7 +313,7 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) */ if (q->flags & IFF_VNET_HDR) features |= vlan->tap_features; - if (netif_needs_gso(dev, skb, features)) { + if (netif_needs_gso(skb, features)) { struct sk_buff *segs = __skb_gso_segment(skb, features, false); if (IS_ERR(segs)) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 720aaf6..8dcf310 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -560,7 +560,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(!netif_carrier_ok(dev) || (slots > 1 && !xennet_can_sg(dev)) || - netif_needs_gso(dev, skb, netif_skb_features(skb)))) { + netif_needs_gso(skb, netif_skb_features(skb)))) { spin_unlock_irqrestore(&queue->tx_lock, flags); goto drop; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b5679ae..bcbde79 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3713,7 +3713,7 @@ static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features) (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST)); } -static inline bool netif_needs_gso(struct net_device *dev, struct sk_buff *skb, +static inline bool netif_needs_gso(struct sk_buff *skb, netdev_features_t features) { return skb_is_gso(skb) && (!skb_gso_ok(skb, features) || diff --git a/net/core/dev.c b/net/core/dev.c index af4a1b0a..1796cef5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2713,7 +2713,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (unlikely(!skb)) goto out_null; - if (netif_needs_gso(dev, skb, features)) { + if (netif_needs_gso(skb, features)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); -- cgit v0.10.2 From 93ea337852d6c9f292e2ccfc394be0f2859b99ae Mon Sep 17 00:00:00 2001 From: Andreas Oetken Date: Thu, 16 Apr 2015 23:48:08 +0200 Subject: altera tse: Fix network-delays and -retransmissions after high throughput. Fix bug which occurs when more than packets are available during napi-poll, leading to "delays" and retransmissions on the network. Check for (count < limit) before checking the get_rx_status in tse_rx-function. Function get_rx_status is reading from the response-fifo. If there is currently a response in the fifo, reading the last byte of the response pops the value from the fifo. If the limit is checked as second condition and the limit is reached the fifo is popped but the packet is not processed. Signed-off-by: Andreas Oetken Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c index 79ea358..90a7630 100644 --- a/drivers/net/ethernet/altera/altera_tse_main.c +++ b/drivers/net/ethernet/altera/altera_tse_main.c @@ -376,8 +376,13 @@ static int tse_rx(struct altera_tse_private *priv, int limit) u16 pktlength; u16 pktstatus; - while (((rxstatus = priv->dmaops->get_rx_status(priv)) != 0) && - (count < limit)) { + /* Check for count < limit first as get_rx_status is changing + * the response-fifo so we must process the next packet + * after calling get_rx_status if a response is pending. + * (reading the last byte of the response pops the value from the fifo.) 
+ */ + while ((count < limit) && + ((rxstatus = priv->dmaops->get_rx_status(priv)) != 0)) { pktstatus = rxstatus >> 16; pktlength = rxstatus & 0xffff; -- cgit v0.10.2 From 1d20a16062e771b6e26b843c0cde3b17c1146e00 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 17 Apr 2015 15:15:40 -0400 Subject: sfc: Fix memcpy() with const destination compiler warning. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/net/ethernet/sfc/selftest.c: In function ‘efx_iterate_state’: drivers/net/ethernet/sfc/selftest.c:388:9: warning: passing argument 1 of ‘memcpy’ discards ‘const’ qualifier from pointer target type [-Wdiscarded-array-qualifiers] This is because the msg[] member of struct efx_loopback_payload is marked as 'const'. Remove that. Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/sfc/selftest.c b/drivers/net/ethernet/sfc/selftest.c index 10b6173..b605dfd5 100644 --- a/drivers/net/ethernet/sfc/selftest.c +++ b/drivers/net/ethernet/sfc/selftest.c @@ -46,7 +46,7 @@ struct efx_loopback_payload { struct iphdr ip; struct udphdr udp; __be16 iteration; - const char msg[64]; + char msg[64]; } __packed; /* Loopback test source MAC address */ -- cgit v0.10.2 From 2c1539597419df6dfeb8ed774d60198b5830b8f7 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Thu, 16 Apr 2015 16:34:34 +0300 Subject: IB/ipoib: Fix ndo_get_iflink Currently, iflink of the parent interface was always accessed, even when interface didn't have a parent and hence we crashed there. Handle the interface types properly: for a child interface, return the ifindex of the parent, for parent interface, return its ifindex. For child devices, make sure to set the parent pointer prior to invoking register_netdevice(), this allows the new ndo to be called by the stack immediately after the child device is registered. Fixes: 5aa7add8f14b ('infiniband/ipoib: implement ndo_get_iflink') Reported-by: Honggang Li Signed-off-by: Erez Shitrit Signed-off-by: Honggang Li Reviewed-By: Jason Gunthorpe + Acked-by: Nicolas Dichtel Signed-off-by: David S. 
Miller diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 657b89b..915ad04 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -846,6 +846,11 @@ static int ipoib_get_iflink(const struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); + /* parent interface */ + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + return dev->ifindex; + + /* child/vlan interface */ return priv->parent->ifindex; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 4dd1313..fca1a88 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -58,6 +58,7 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, /* MTU will be reset when mcast join happens */ priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + priv->parent = ppriv->dev; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); result = ipoib_set_dev_features(priv, ppriv->ca); @@ -84,8 +85,6 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, goto register_failed; } - priv->parent = ppriv->dev; - ipoib_create_debug_files(priv->dev); /* RTNL childs don't need proprietary sysfs entries */ -- cgit v0.10.2 From 2591ffd3085e34a231480efe8f1cac83ebb81725 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 17 Apr 2015 19:06:30 +0200 Subject: netns: remove BUG_ONs from net_generic() This inline has ~500 callsites. On 04/14/2015 08:37 PM, David Miller wrote: > That BUG_ON() was added 7 years ago, and I don't remember it ever > triggering or helping us diagnose something, so just remove it and > keep the function inlined. On x86 allyesconfig build: text data bss dec hex filename 82447071 22255384 20627456 125329911 77861f7 vmlinux4 82441375 22255384 20627456 125324215 7784bb7 vmlinux5prime Signed-off-by: Denys Vlasenko CC: Eric W. Biederman CC: David S. Miller CC: Jan Engelhardt CC: Jiri Pirko CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org Signed-off-by: David S. Miller diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h index 0931618..70e1585 100644 --- a/include/net/netns/generic.h +++ b/include/net/netns/generic.h @@ -38,11 +38,9 @@ static inline void *net_generic(const struct net *net, int id) rcu_read_lock(); ng = rcu_dereference(net->gen); - BUG_ON(id == 0 || id > ng->len); ptr = ng->ptr[id - 1]; rcu_read_unlock(); - BUG_ON(!ptr); return ptr; } #endif -- cgit v0.10.2 From e3122b7fae7b4e3d1d49fa84f6515bcbe6cbc6fc Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 17 Apr 2015 15:12:25 -0400 Subject: net: dsa: use DEVICE_ATTR_RW to declare temp1_max Since commit da4759c (sysfs: Use only return value from is_visible for the file mode), it is possible to reduce the permissions of a file. So declare temp1_max with the DEVICE_ATTR_RW macro and remove the write permission in dsa_hwmon_attrs_visible if set_temp_limit isn't provided. Signed-off-by: Vivien Didelot Reviewed-by: Guenter Roeck Signed-off-by: David S. 
Miller diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 5eaadab..079a224 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -124,7 +124,7 @@ static ssize_t temp1_max_store(struct device *dev, return count; } -static DEVICE_ATTR(temp1_max, S_IRUGO, temp1_max_show, temp1_max_store); +static DEVICE_ATTR_RW(temp1_max); static ssize_t temp1_max_alarm_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -159,8 +159,8 @@ static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj, if (index == 1) { if (!drv->get_temp_limit) mode = 0; - else if (drv->set_temp_limit) - mode |= S_IWUSR; + else if (!drv->set_temp_limit) + mode &= ~S_IWUSR; } else if (index == 2 && !drv->get_temp_alarm) { mode = 0; } -- cgit v0.10.2
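For context on the pattern being adjusted above, here is a minimal sketch of a read/write device attribute wired into an attribute group whose is_visible() callback drops write permission when no setter is available; all names are illustrative, not the exact dsa.c code.

#include <linux/device.h>
#include <linux/stat.h>
#include <linux/sysfs.h>

static bool have_set_temp_limit;	/* assumed capability flag */

static ssize_t temp1_max_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", 0);	/* placeholder value */
}

static ssize_t temp1_max_store(struct device *dev,
			       struct device_attribute *attr,
			       const char *buf, size_t count)
{
	return count;			/* placeholder: accept and ignore */
}

/* DEVICE_ATTR_RW() declares dev_attr_temp1_max with mode 0644 and binds
 * it to temp1_max_show()/temp1_max_store() by naming convention.
 */
static DEVICE_ATTR_RW(temp1_max);

static struct attribute *example_hwmon_attrs[] = {
	&dev_attr_temp1_max.attr,
	NULL
};

static umode_t example_attrs_visible(struct kobject *kobj,
				     struct attribute *attr, int index)
{
	umode_t mode = attr->mode;

	/* Since commit da4759c ("sysfs: Use only return value from
	 * is_visible for the file mode"), the value returned here is used
	 * as the file mode, so clearing S_IWUSR is enough to make the
	 * attribute read-only when the driver cannot set a limit.
	 */
	if (!have_set_temp_limit)
		mode &= ~S_IWUSR;

	return mode;
}

static const struct attribute_group example_hwmon_group = {
	.attrs		= example_hwmon_attrs,
	.is_visible	= example_attrs_visible,
};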