From 35205b211c8d17a8a0b5e8926cb7c73e9a7ef1ad Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 5 Mar 2013 01:03:47 +0000
Subject: sfc: Disable soft interrupt handling during efx_device_detach_sync()

efx_device_detach_sync() locks all TX queues before marking the device
detached and thus disabling further TX scheduling.  But it can still
be interrupted by TX completions which then result in TX scheduling in
soft interrupt context.  This will deadlock when it tries to acquire
a TX queue lock that efx_device_detach_sync() already acquired.

To avoid deadlock, we must use netif_tx_{,un}lock_bh().

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index 50247df..d2f790d 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -171,9 +171,9 @@ static inline void efx_device_detach_sync(struct efx_nic *efx)
 	 * TX scheduler is stopped when we're done and before
 	 * netif_device_present() becomes false.
 	 */
-	netif_tx_lock(dev);
+	netif_tx_lock_bh(dev);
 	netif_device_detach(dev);
-	netif_tx_unlock(dev);
+	netif_tx_unlock_bh(dev);
 }
 
 #endif /* EFX_EFX_H */
-- 
cgit v0.10.2


From c73e787a8db9117d59b5180baf83203a42ecadca Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 5 Mar 2013 17:49:39 +0000
Subject: sfc: Correct efx_rx_buffer::page_offset when EFX_PAGE_IP_ALIGN != 0

RX DMA buffers start at an offset of EFX_PAGE_IP_ALIGN bytes from the
start of a cache line.  This offset obviously needs to be included in
the virtual address, but this was missed in commit b590ace09d51
('sfc: Fix efx_rx_buf_offset() in the presence of swiotlb') since
EFX_PAGE_IP_ALIGN is equal to 0 on both x86 and powerpc.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 879ff58..bb579a6 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -215,7 +215,7 @@ static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue)
 		rx_buf = efx_rx_buffer(rx_queue, index);
 		rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN;
 		rx_buf->u.page = page;
-		rx_buf->page_offset = page_offset;
+		rx_buf->page_offset = page_offset + EFX_PAGE_IP_ALIGN;
 		rx_buf->len = efx->rx_buffer_len - EFX_PAGE_IP_ALIGN;
 		rx_buf->flags = EFX_RX_BUF_PAGE;
 		++rx_queue->added_count;
-- 
cgit v0.10.2


From 4a74dc65e3ad825a66dfbcb256f98c550f96445b Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 5 Mar 2013 20:13:54 +0000
Subject: sfc: Allow efx_channel_type::receive_skb() to reject a packet

Instead of having efx_ptp_rx() call netif_receive_skb() for an invalid
PTP packet, make it return false for rejected packets and have
efx_rx_deliver() pass them up.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 0a90abd..cdcf510 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -410,7 +410,7 @@ struct efx_channel_type {
 	void (*post_remove)(struct efx_channel *);
 	void (*get_name)(struct efx_channel *, char *buf, size_t len);
 	struct efx_channel *(*copy)(const struct efx_channel *);
-	void (*receive_skb)(struct efx_channel *, struct sk_buff *);
+	bool (*receive_skb)(struct efx_channel *, struct sk_buff *);
 	bool keep_eventq;
 };
 
diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index 3f93624..faf4baf 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -1006,7 +1006,7 @@ bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
  * the receive timestamp from the MC - this will probably occur after the
  * packet arrival because of the processing in the MC.
  */
-static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
+static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
 {
 	struct efx_nic *efx = channel->efx;
 	struct efx_ptp_data *ptp = efx->ptp_data;
@@ -1019,18 +1019,15 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
 	/* Correct version? */
 	if (ptp->mode == MC_CMD_PTP_MODE_V1) {
 		if (skb->len < PTP_V1_MIN_LENGTH) {
-			netif_receive_skb(skb);
-			return;
+			return false;
 		}
 		version = ntohs(*(__be16 *)&skb->data[PTP_V1_VERSION_OFFSET]);
 		if (version != PTP_VERSION_V1) {
-			netif_receive_skb(skb);
-			return;
+			return false;
 		}
 	} else {
 		if (skb->len < PTP_V2_MIN_LENGTH) {
-			netif_receive_skb(skb);
-			return;
+			return false;
 		}
 		version = skb->data[PTP_V2_VERSION_OFFSET];
 
@@ -1041,8 +1038,7 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
 		BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);
 
 		if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) {
-			netif_receive_skb(skb);
-			return;
+			return false;
 		}
 	}
 
@@ -1073,6 +1069,8 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
 
 	skb_queue_tail(&ptp->rxq, skb);
 	queue_work(ptp->workwq, &ptp->work);
+
+	return true;
 }
 
 /* Transmit a PTP packet.  This has to be transmitted by the MC
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index bb579a6..f31c23e 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -575,12 +575,14 @@ static void efx_rx_deliver(struct efx_channel *channel,
 	/* Record the rx_queue */
 	skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
-	/* Pass the packet up */
 	if (channel->type->receive_skb)
-		channel->type->receive_skb(channel, skb);
-	else
-		netif_receive_skb(skb);
+		if (channel->type->receive_skb(channel, skb))
+			goto handled;
+
+	/* Pass the packet up */
+	netif_receive_skb(skb);
 
+handled:
 	/* Update allocation strategy method */
 	channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB;
 }
-- 
cgit v0.10.2


From c939a316459783e5cd6c6bd9dc90ea11b18ecd7f Mon Sep 17 00:00:00 2001
From: Laurence Evans <levans@solarflare.com>
Date: Thu, 15 Nov 2012 10:56:07 +0000
Subject: sfc: PTP changes to support improved UUID filtering mode

There is a long-standing problem with the packet-timestamp matching in
the driver. When a PTP packet is received by the MC, the FPGA
timestamps the packet and the MC sends the timestamp and 6 bytes of
the UUID to the driver. The driver then matches the timestamp against
received packets using the same 6 bytes of UUID.

The problem comes from the choice of which 6 bytes to use. The PTP
spec is slightly contradictory and misleading in one of the two places
where the UUIDs are discussed. From section 7.2.2.2 of the spec, a
PTPD2 UUID can be either a EUI-64 or a EUI-64 constructed from a
EUI-48. The typical ethernet based implementation uses a EUI-64
constructed from a EUI-48. This works by taking the first 3 bytes of
the MAC address of the NIC being used for PTP (the OUI), then
inserting 0xFF, 0xFE, then taking the last 3 bytes of the MAC address
giving
          MAC[0], MAC[1], MAC[2], 0xFF, 0xFE, MAC[3], MAC[4], MAC[5]
The current MC firmware and driver discard the first two bytes of this
UUID and packets are matched against timestamps using bytes 2 to 7 so
there is a small risk that in a deployment of Solarflare PTP NICs used
with other vendors NICs, that a PTP packet could be matched against
the wrong timestamp. This applies to all other organisations whose
third byte of the OUI is 0x53. It's a long list but I notice that it
includes Cisco.

The necessary modifications to use bytes 0-2 and 5-7 of the UUID to
match against are quite small but introduce incompatibility between
older version of the firmware and driver.

When PTP is enabled via SO_TIMESTAMPING specifying PTP V2, the driver
will try to enable PTP in the firmware using the enhanced mode
(above). If the firmware returns an error, the driver will enable PTP
in the firmware using the old mode.

[bwh: Fix some style errors; remove private ioctl bits]
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/mcdi_pcol.h b/drivers/net/ethernet/sfc/mcdi_pcol.h
index 9d426d0..c5c9747 100644
--- a/drivers/net/ethernet/sfc/mcdi_pcol.h
+++ b/drivers/net/ethernet/sfc/mcdi_pcol.h
@@ -553,6 +553,7 @@
 #define          MC_CMD_PTP_MODE_V1_VLAN 0x1 /* enum */
 #define          MC_CMD_PTP_MODE_V2 0x2 /* enum */
 #define          MC_CMD_PTP_MODE_V2_VLAN 0x3 /* enum */
+#define          MC_CMD_PTP_MODE_V2_ENHANCED 0x4 /* enum */
 
 /* MC_CMD_PTP_IN_DISABLE msgrequest */
 #define    MC_CMD_PTP_IN_DISABLE_LEN 8
diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index faf4baf..2b40cbd 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -99,6 +99,9 @@
 #define PTP_V2_VERSION_LENGTH	1
 #define PTP_V2_VERSION_OFFSET	29
 
+#define PTP_V2_UUID_LENGTH	8
+#define PTP_V2_UUID_OFFSET	48
+
 /* Although PTP V2 UUIDs are comprised a ClockIdentity (8) and PortNumber (2),
  * the MC only captures the last six bytes of the clock identity. These values
  * reflect those, not the ones used in the standard.  The standard permits
@@ -1011,7 +1014,7 @@ static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
 	struct efx_nic *efx = channel->efx;
 	struct efx_ptp_data *ptp = efx->ptp_data;
 	struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb;
-	u8 *data;
+	u8 *match_data_012, *match_data_345;
 	unsigned int version;
 
 	match->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS);
@@ -1025,21 +1028,35 @@ static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
 		if (version != PTP_VERSION_V1) {
 			return false;
 		}
+
+		/* PTP V1 uses all six bytes of the UUID to match the packet
+		 * to the timestamp
+		 */
+		match_data_012 = skb->data + PTP_V1_UUID_OFFSET;
+		match_data_345 = skb->data + PTP_V1_UUID_OFFSET + 3;
 	} else {
 		if (skb->len < PTP_V2_MIN_LENGTH) {
 			return false;
 		}
 		version = skb->data[PTP_V2_VERSION_OFFSET];
-
-		BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2);
-		BUILD_BUG_ON(PTP_V1_UUID_OFFSET != PTP_V2_MC_UUID_OFFSET);
-		BUILD_BUG_ON(PTP_V1_UUID_LENGTH != PTP_V2_MC_UUID_LENGTH);
-		BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET);
-		BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);
-
 		if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) {
 			return false;
 		}
+
+		/* The original V2 implementation uses bytes 2-7 of
+		 * the UUID to match the packet to the timestamp. This
+		 * discards two of the bytes of the MAC address used
+		 * to create the UUID (SF bug 33070).  The PTP V2
+		 * enhanced mode fixes this issue and uses bytes 0-2
+		 * and byte 5-7 of the UUID.
+		 */
+		match_data_345 = skb->data + PTP_V2_UUID_OFFSET + 5;
+		if (ptp->mode == MC_CMD_PTP_MODE_V2) {
+			match_data_012 = skb->data + PTP_V2_UUID_OFFSET + 2;
+		} else {
+			match_data_012 = skb->data + PTP_V2_UUID_OFFSET + 0;
+			BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2_ENHANCED);
+		}
 	}
 
 	/* Does this packet require timestamping? */
@@ -1052,14 +1069,19 @@ static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
 		timestamps = skb_hwtstamps(skb);
 		memset(timestamps, 0, sizeof(*timestamps));
 
+		/* We expect the sequence number to be in the same position in
+		 * the packet for PTP V1 and V2
+		 */
+		BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET);
+		BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);
+
 		/* Extract UUID/Sequence information */
-		data = skb->data + PTP_V1_UUID_OFFSET;
-		match->words[0] = (data[0]         |
-				   (data[1] << 8)  |
-				   (data[2] << 16) |
-				   (data[3] << 24));
-		match->words[1] = (data[4]         |
-				   (data[5] << 8)  |
+		match->words[0] = (match_data_012[0]         |
+				   (match_data_012[1] << 8)  |
+				   (match_data_012[2] << 16) |
+				   (match_data_345[0] << 24));
+		match->words[1] = (match_data_345[1]         |
+				   (match_data_345[2] << 8)  |
 				   (skb->data[PTP_V1_SEQUENCE_OFFSET +
 					      PTP_V1_SEQUENCE_LENGTH - 1] <<
 				    16));
@@ -1165,7 +1187,7 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)
 	 * timestamped
 	 */
 		init->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
-		new_mode = MC_CMD_PTP_MODE_V2;
+		new_mode = MC_CMD_PTP_MODE_V2_ENHANCED;
 		enable_wanted = true;
 		break;
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
@@ -1184,7 +1206,14 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)
 	if (init->tx_type != HWTSTAMP_TX_OFF)
 		enable_wanted = true;
 
+	/* Old versions of the firmware do not support the improved
+	 * UUID filtering option (SF bug 33070).  If the firmware does
+	 * not accept the enhanced mode, fall back to the standard PTP
+	 * v2 UUID filtering.
+	 */
 	rc = efx_ptp_change_mode(efx, enable_wanted, new_mode);
+	if ((rc != 0) && (new_mode == MC_CMD_PTP_MODE_V2_ENHANCED))
+		rc = efx_ptp_change_mode(efx, enable_wanted, MC_CMD_PTP_MODE_V2);
 	if (rc != 0)
 		return rc;
 
-- 
cgit v0.10.2


From 9230451af9efcf5e3d60ce7f4fec2468e8ce54b1 Mon Sep 17 00:00:00 2001
From: Laurence Evans <levans@solarflare.com>
Date: Mon, 11 Feb 2013 13:55:08 +0000
Subject: sfc: tidy up PTP synchronize function efx_ptp_process_times()

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index 2b40cbd..d1858c0 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -432,13 +432,10 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
 	unsigned number_readings = (response_length /
 			       MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN);
 	unsigned i;
-	unsigned min;
-	unsigned min_set = 0;
 	unsigned total;
 	unsigned ngood = 0;
 	unsigned last_good = 0;
 	struct efx_ptp_data *ptp = efx->ptp_data;
-	bool min_valid = false;
 	u32 last_sec;
 	u32 start_sec;
 	struct timespec delta;
@@ -446,35 +443,17 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
 	if (number_readings == 0)
 		return -EAGAIN;
 
-	/* Find minimum value in this set of results, discarding clearly
-	 * erroneous results.
+	/* Read the set of results and increment stats for any results that
+	 * appera to be erroneous.
 	 */
 	for (i = 0; i < number_readings; i++) {
 		efx_ptp_read_timeset(synch_buf, &ptp->timeset[i]);
 		synch_buf += MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN;
-		if (ptp->timeset[i].window > SYNCHRONISATION_GRANULARITY_NS) {
-			if (min_valid) {
-				if (ptp->timeset[i].window < min_set)
-					min_set = ptp->timeset[i].window;
-			} else {
-				min_valid = true;
-				min_set = ptp->timeset[i].window;
-			}
-		}
-	}
-
-	if (min_valid) {
-		if (ptp->base_sync_valid && (min_set > ptp->base_sync_ns))
-			min = ptp->base_sync_ns;
-		else
-			min = min_set;
-	} else {
-		min = SYNCHRONISATION_GRANULARITY_NS;
 	}
 
-	/* Discard excessively long synchronise durations.  The MC times
-	 * when it finishes reading the host time so the corrected window
-	 * time should be fairly constant for a given platform.
+	/* Find the last good host-MC synchronization result. The MC times
+	 * when it finishes reading the host time so the corrected window time
+	 * should be fairly constant for a given platform.
 	 */
 	total = 0;
 	for (i = 0; i < number_readings; i++)
@@ -492,8 +471,8 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
 
 	if (ngood == 0) {
 		netif_warn(efx, drv, efx->net_dev,
-			   "PTP no suitable synchronisations %dns %dns\n",
-			   ptp->base_sync_ns, min_set);
+			   "PTP no suitable synchronisations %dns\n",
+			   ptp->base_sync_ns);
 		return -EAGAIN;
 	}
 
-- 
cgit v0.10.2


From 97d48a10c670f87bba9e5b2241e32f2eccd3fef0 Mon Sep 17 00:00:00 2001
From: Alexandre Rames <arames@solarflare.com>
Date: Fri, 11 Jan 2013 12:26:21 +0000
Subject: sfc: Remove rx_alloc_method SKB

[bwh: Remove more dead code, and make efx_ptp_rx() pull the data it
 needs into the header area.]
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 0bc0099..11a8108 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -247,10 +247,8 @@ static int efx_process_channel(struct efx_channel *channel, int budget)
 			__efx_rx_packet(channel, channel->rx_pkt);
 			channel->rx_pkt = NULL;
 		}
-		if (rx_queue->enabled) {
-			efx_rx_strategy(channel);
+		if (rx_queue->enabled)
 			efx_fast_push_rx_descriptors(rx_queue);
-		}
 	}
 
 	return spent;
@@ -655,16 +653,12 @@ static void efx_start_datapath(struct efx_nic *efx)
 		efx_for_each_channel_tx_queue(tx_queue, channel)
 			efx_init_tx_queue(tx_queue);
 
-		/* The rx buffer allocation strategy is MTU dependent */
-		efx_rx_strategy(channel);
-
 		efx_for_each_channel_rx_queue(rx_queue, channel) {
 			efx_init_rx_queue(rx_queue);
 			efx_nic_generate_fill_event(rx_queue);
 		}
 
 		WARN_ON(channel->rx_pkt != NULL);
-		efx_rx_strategy(channel);
 	}
 
 	if (netif_device_present(efx->net_dev))
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index d2f790d..64c555e 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -37,7 +37,6 @@ extern int efx_probe_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_remove_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_init_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_fini_rx_queue(struct efx_rx_queue *rx_queue);
-extern void efx_rx_strategy(struct efx_channel *channel);
 extern void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue);
 extern void efx_rx_slow_fill(unsigned long context);
 extern void __efx_rx_packet(struct efx_channel *channel,
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index cdcf510..c83fe09 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -206,25 +206,19 @@ struct efx_tx_queue {
 /**
  * struct efx_rx_buffer - An Efx RX data buffer
  * @dma_addr: DMA base address of the buffer
- * @skb: The associated socket buffer. Valid iff !(@flags & %EFX_RX_BUF_PAGE).
+ * @page: The associated page buffer.
  *	Will be %NULL if the buffer slot is currently free.
- * @page: The associated page buffer. Valif iff @flags & %EFX_RX_BUF_PAGE.
- *	Will be %NULL if the buffer slot is currently free.
- * @page_offset: Offset within page. Valid iff @flags & %EFX_RX_BUF_PAGE.
+ * @page_offset: Offset within page
  * @len: Buffer length, in bytes.
  * @flags: Flags for buffer and packet state.
  */
 struct efx_rx_buffer {
 	dma_addr_t dma_addr;
-	union {
-		struct sk_buff *skb;
-		struct page *page;
-	} u;
+	struct page *page;
 	u16 page_offset;
 	u16 len;
 	u16 flags;
 };
-#define EFX_RX_BUF_PAGE		0x0001
 #define EFX_RX_PKT_CSUMMED	0x0002
 #define EFX_RX_PKT_DISCARD	0x0004
 
@@ -266,8 +260,6 @@ struct efx_rx_page_state {
  * @min_fill: RX descriptor minimum non-zero fill level.
  *	This records the minimum fill level observed when a ring
  *	refill was triggered.
- * @alloc_page_count: RX allocation strategy counter.
- * @alloc_skb_count: RX allocation strategy counter.
  * @slow_fill: Timer used to defer efx_nic_generate_fill_event().
  */
 struct efx_rx_queue {
@@ -286,8 +278,6 @@ struct efx_rx_queue {
 	unsigned int fast_fill_trigger;
 	unsigned int min_fill;
 	unsigned int min_overfill;
-	unsigned int alloc_page_count;
-	unsigned int alloc_skb_count;
 	struct timer_list slow_fill;
 	unsigned int slow_fill_count;
 };
@@ -336,10 +326,6 @@ enum efx_rx_alloc_method {
  * @event_test_cpu: Last CPU to handle interrupt or test event for this channel
  * @irq_count: Number of IRQs since last adaptive moderation decision
  * @irq_mod_score: IRQ moderation score
- * @rx_alloc_level: Watermark based heuristic counter for pushing descriptors
- *	and diagnostic counters
- * @rx_alloc_push_pages: RX allocation method currently in use for pushing
- *	descriptors
  * @n_rx_tobe_disc: Count of RX_TOBE_DISC errors
  * @n_rx_ip_hdr_chksum_err: Count of RX IP header checksum errors
  * @n_rx_tcp_udp_chksum_err: Count of RX TCP and UDP checksum errors
@@ -371,9 +357,6 @@ struct efx_channel {
 	unsigned int rfs_filters_added;
 #endif
 
-	int rx_alloc_level;
-	int rx_alloc_push_pages;
-
 	unsigned n_rx_tobe_disc;
 	unsigned n_rx_ip_hdr_chksum_err;
 	unsigned n_rx_tcp_udp_chksum_err;
diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index d1858c0..07f6baa 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -1000,7 +1000,7 @@ static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
 
 	/* Correct version? */
 	if (ptp->mode == MC_CMD_PTP_MODE_V1) {
-		if (skb->len < PTP_V1_MIN_LENGTH) {
+		if (!pskb_may_pull(skb, PTP_V1_MIN_LENGTH)) {
 			return false;
 		}
 		version = ntohs(*(__be16 *)&skb->data[PTP_V1_VERSION_OFFSET]);
@@ -1014,7 +1014,7 @@ static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
 		match_data_012 = skb->data + PTP_V1_UUID_OFFSET;
 		match_data_345 = skb->data + PTP_V1_UUID_OFFSET + 3;
 	} else {
-		if (skb->len < PTP_V2_MIN_LENGTH) {
+		if (!pskb_may_pull(skb, PTP_V2_MIN_LENGTH)) {
 			return false;
 		}
 		version = skb->data[PTP_V2_VERSION_OFFSET];
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index f31c23e..e7aa28e 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -33,46 +33,6 @@
 /* Size of buffer allocated for skb header area. */
 #define EFX_SKB_HEADERS  64u
 
-/*
- * rx_alloc_method - RX buffer allocation method
- *
- * This driver supports two methods for allocating and using RX buffers:
- * each RX buffer may be backed by an skb or by an order-n page.
- *
- * When GRO is in use then the second method has a lower overhead,
- * since we don't have to allocate then free skbs on reassembled frames.
- *
- * Values:
- *   - RX_ALLOC_METHOD_AUTO = 0
- *   - RX_ALLOC_METHOD_SKB  = 1
- *   - RX_ALLOC_METHOD_PAGE = 2
- *
- * The heuristic for %RX_ALLOC_METHOD_AUTO is a simple hysteresis count
- * controlled by the parameters below.
- *
- *   - Since pushing and popping descriptors are separated by the rx_queue
- *     size, so the watermarks should be ~rxd_size.
- *   - The performance win by using page-based allocation for GRO is less
- *     than the performance hit of using page-based allocation of non-GRO,
- *     so the watermarks should reflect this.
- *
- * Per channel we maintain a single variable, updated by each channel:
- *
- *   rx_alloc_level += (gro_performed ? RX_ALLOC_FACTOR_GRO :
- *                      RX_ALLOC_FACTOR_SKB)
- * Per NAPI poll interval, we constrain rx_alloc_level to 0..MAX (which
- * limits the hysteresis), and update the allocation strategy:
- *
- *   rx_alloc_method = (rx_alloc_level > RX_ALLOC_LEVEL_GRO ?
- *                      RX_ALLOC_METHOD_PAGE : RX_ALLOC_METHOD_SKB)
- */
-static int rx_alloc_method = RX_ALLOC_METHOD_AUTO;
-
-#define RX_ALLOC_LEVEL_GRO 0x2000
-#define RX_ALLOC_LEVEL_MAX 0x3000
-#define RX_ALLOC_FACTOR_GRO 1
-#define RX_ALLOC_FACTOR_SKB (-2)
-
 /* This is the percentage fill level below which new RX descriptors
  * will be added to the RX descriptor ring.
  */
@@ -99,10 +59,7 @@ static inline unsigned int efx_rx_buf_size(struct efx_nic *efx)
 
 static u8 *efx_rx_buf_eh(struct efx_nic *efx, struct efx_rx_buffer *buf)
 {
-	if (buf->flags & EFX_RX_BUF_PAGE)
-		return page_address(buf->u.page) + efx_rx_buf_offset(efx, buf);
-	else
-		return (u8 *)buf->u.skb->data + efx->type->rx_buffer_hash_size;
+	return page_address(buf->page) + efx_rx_buf_offset(efx, buf);
 }
 
 static inline u32 efx_rx_buf_hash(const u8 *eh)
@@ -120,56 +77,7 @@ static inline u32 efx_rx_buf_hash(const u8 *eh)
 }
 
 /**
- * efx_init_rx_buffers_skb - create EFX_RX_BATCH skb-based RX buffers
- *
- * @rx_queue:		Efx RX queue
- *
- * This allocates EFX_RX_BATCH skbs, maps them for DMA, and populates a
- * struct efx_rx_buffer for each one. Return a negative error code or 0
- * on success. May fail having only inserted fewer than EFX_RX_BATCH
- * buffers.
- */
-static int efx_init_rx_buffers_skb(struct efx_rx_queue *rx_queue)
-{
-	struct efx_nic *efx = rx_queue->efx;
-	struct net_device *net_dev = efx->net_dev;
-	struct efx_rx_buffer *rx_buf;
-	struct sk_buff *skb;
-	int skb_len = efx->rx_buffer_len;
-	unsigned index, count;
-
-	for (count = 0; count < EFX_RX_BATCH; ++count) {
-		index = rx_queue->added_count & rx_queue->ptr_mask;
-		rx_buf = efx_rx_buffer(rx_queue, index);
-
-		rx_buf->u.skb = skb = netdev_alloc_skb(net_dev, skb_len);
-		if (unlikely(!skb))
-			return -ENOMEM;
-
-		/* Adjust the SKB for padding */
-		skb_reserve(skb, NET_IP_ALIGN);
-		rx_buf->len = skb_len - NET_IP_ALIGN;
-		rx_buf->flags = 0;
-
-		rx_buf->dma_addr = dma_map_single(&efx->pci_dev->dev,
-						  skb->data, rx_buf->len,
-						  DMA_FROM_DEVICE);
-		if (unlikely(dma_mapping_error(&efx->pci_dev->dev,
-					       rx_buf->dma_addr))) {
-			dev_kfree_skb_any(skb);
-			rx_buf->u.skb = NULL;
-			return -EIO;
-		}
-
-		++rx_queue->added_count;
-		++rx_queue->alloc_skb_count;
-	}
-
-	return 0;
-}
-
-/**
- * efx_init_rx_buffers_page - create EFX_RX_BATCH page-based RX buffers
+ * efx_init_rx_buffers - create EFX_RX_BATCH page-based RX buffers
  *
  * @rx_queue:		Efx RX queue
  *
@@ -178,7 +86,7 @@ static int efx_init_rx_buffers_skb(struct efx_rx_queue *rx_queue)
  * code or 0 on success. If a single page can be split between two buffers,
  * then the page will either be inserted fully, or not at at all.
  */
-static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue)
+static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
 {
 	struct efx_nic *efx = rx_queue->efx;
 	struct efx_rx_buffer *rx_buf;
@@ -214,12 +122,11 @@ static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue)
 		index = rx_queue->added_count & rx_queue->ptr_mask;
 		rx_buf = efx_rx_buffer(rx_queue, index);
 		rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN;
-		rx_buf->u.page = page;
+		rx_buf->page = page;
 		rx_buf->page_offset = page_offset + EFX_PAGE_IP_ALIGN;
 		rx_buf->len = efx->rx_buffer_len - EFX_PAGE_IP_ALIGN;
-		rx_buf->flags = EFX_RX_BUF_PAGE;
+		rx_buf->flags = 0;
 		++rx_queue->added_count;
-		++rx_queue->alloc_page_count;
 		++state->refcnt;
 
 		if ((~count & 1) && (efx->rx_buffer_len <= EFX_RX_HALF_PAGE)) {
@@ -239,10 +146,10 @@ static void efx_unmap_rx_buffer(struct efx_nic *efx,
 				struct efx_rx_buffer *rx_buf,
 				unsigned int used_len)
 {
-	if ((rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.page) {
+	if (rx_buf->page) {
 		struct efx_rx_page_state *state;
 
-		state = page_address(rx_buf->u.page);
+		state = page_address(rx_buf->page);
 		if (--state->refcnt == 0) {
 			dma_unmap_page(&efx->pci_dev->dev,
 				       state->dma_addr,
@@ -253,21 +160,15 @@ static void efx_unmap_rx_buffer(struct efx_nic *efx,
 						rx_buf->dma_addr, used_len,
 						DMA_FROM_DEVICE);
 		}
-	} else if (!(rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.skb) {
-		dma_unmap_single(&efx->pci_dev->dev, rx_buf->dma_addr,
-				 rx_buf->len, DMA_FROM_DEVICE);
 	}
 }
 
 static void efx_free_rx_buffer(struct efx_nic *efx,
 			       struct efx_rx_buffer *rx_buf)
 {
-	if ((rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.page) {
-		__free_pages(rx_buf->u.page, efx->rx_buffer_order);
-		rx_buf->u.page = NULL;
-	} else if (!(rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.skb) {
-		dev_kfree_skb_any(rx_buf->u.skb);
-		rx_buf->u.skb = NULL;
+	if (rx_buf->page) {
+		__free_pages(rx_buf->page, efx->rx_buffer_order);
+		rx_buf->page = NULL;
 	}
 }
 
@@ -283,7 +184,7 @@ static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
 static void efx_resurrect_rx_buffer(struct efx_rx_queue *rx_queue,
 				    struct efx_rx_buffer *rx_buf)
 {
-	struct efx_rx_page_state *state = page_address(rx_buf->u.page);
+	struct efx_rx_page_state *state = page_address(rx_buf->page);
 	struct efx_rx_buffer *new_buf;
 	unsigned fill_level, index;
 
@@ -298,14 +199,13 @@ static void efx_resurrect_rx_buffer(struct efx_rx_queue *rx_queue,
 	}
 
 	++state->refcnt;
-	get_page(rx_buf->u.page);
+	get_page(rx_buf->page);
 
 	index = rx_queue->added_count & rx_queue->ptr_mask;
 	new_buf = efx_rx_buffer(rx_queue, index);
 	new_buf->dma_addr = rx_buf->dma_addr ^ (PAGE_SIZE >> 1);
-	new_buf->u.page = rx_buf->u.page;
+	new_buf->page = rx_buf->page;
 	new_buf->len = rx_buf->len;
-	new_buf->flags = EFX_RX_BUF_PAGE;
 	++rx_queue->added_count;
 }
 
@@ -319,18 +219,17 @@ static void efx_recycle_rx_buffer(struct efx_channel *channel,
 	struct efx_rx_buffer *new_buf;
 	unsigned index;
 
-	rx_buf->flags &= EFX_RX_BUF_PAGE;
+	rx_buf->flags = 0;
 
-	if ((rx_buf->flags & EFX_RX_BUF_PAGE) &&
-	    efx->rx_buffer_len <= EFX_RX_HALF_PAGE &&
-	    page_count(rx_buf->u.page) == 1)
+	if (efx->rx_buffer_len <= EFX_RX_HALF_PAGE &&
+	    page_count(rx_buf->page) == 1)
 		efx_resurrect_rx_buffer(rx_queue, rx_buf);
 
 	index = rx_queue->added_count & rx_queue->ptr_mask;
 	new_buf = efx_rx_buffer(rx_queue, index);
 
 	memcpy(new_buf, rx_buf, sizeof(*new_buf));
-	rx_buf->u.page = NULL;
+	rx_buf->page = NULL;
 	++rx_queue->added_count;
 }
 
@@ -348,7 +247,6 @@ static void efx_recycle_rx_buffer(struct efx_channel *channel,
  */
 void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
 {
-	struct efx_channel *channel = efx_rx_queue_channel(rx_queue);
 	unsigned fill_level;
 	int space, rc = 0;
 
@@ -369,16 +267,13 @@ void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
 
 	netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
 		   "RX queue %d fast-filling descriptor ring from"
-		   " level %d to level %d using %s allocation\n",
+		   " level %d to level %d\n",
 		   efx_rx_queue_index(rx_queue), fill_level,
-		   rx_queue->max_fill,
-		   channel->rx_alloc_push_pages ? "page" : "skb");
+		   rx_queue->max_fill);
+
 
 	do {
-		if (channel->rx_alloc_push_pages)
-			rc = efx_init_rx_buffers_page(rx_queue);
-		else
-			rc = efx_init_rx_buffers_skb(rx_queue);
+		rc = efx_init_rx_buffers(rx_queue);
 		if (unlikely(rc)) {
 			/* Ensure that we don't leave the rx queue empty */
 			if (rx_queue->added_count == rx_queue->removed_count)
@@ -408,7 +303,7 @@ void efx_rx_slow_fill(unsigned long context)
 
 static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
 				     struct efx_rx_buffer *rx_buf,
-				     int len, bool *leak_packet)
+				     int len)
 {
 	struct efx_nic *efx = rx_queue->efx;
 	unsigned max_len = rx_buf->len - efx->type->rx_buffer_padding;
@@ -428,11 +323,6 @@ static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
 				  "RX event (0x%x > 0x%x+0x%x). Leaking\n",
 				  efx_rx_queue_index(rx_queue), len, max_len,
 				  efx->type->rx_buffer_padding);
-		/* If this buffer was skb-allocated, then the meta
-		 * data at the end of the skb will be trashed. So
-		 * we have no choice but to leak the fragment.
-		 */
-		*leak_packet = !(rx_buf->flags & EFX_RX_BUF_PAGE);
 		efx_schedule_reset(efx, RESET_TYPE_RX_RECOVERY);
 	} else {
 		if (net_ratelimit())
@@ -454,51 +344,78 @@ static void efx_rx_packet_gro(struct efx_channel *channel,
 {
 	struct napi_struct *napi = &channel->napi_str;
 	gro_result_t gro_result;
+	struct efx_nic *efx = channel->efx;
+	struct page *page = rx_buf->page;
+	struct sk_buff *skb;
 
-	if (rx_buf->flags & EFX_RX_BUF_PAGE) {
-		struct efx_nic *efx = channel->efx;
-		struct page *page = rx_buf->u.page;
-		struct sk_buff *skb;
-
-		rx_buf->u.page = NULL;
+	rx_buf->page = NULL;
 
-		skb = napi_get_frags(napi);
-		if (!skb) {
-			put_page(page);
-			return;
-		}
+	skb = napi_get_frags(napi);
+	if (!skb) {
+		put_page(page);
+		return;
+	}
 
-		if (efx->net_dev->features & NETIF_F_RXHASH)
-			skb->rxhash = efx_rx_buf_hash(eh);
+	if (efx->net_dev->features & NETIF_F_RXHASH)
+		skb->rxhash = efx_rx_buf_hash(eh);
 
-		skb_fill_page_desc(skb, 0, page,
-				   efx_rx_buf_offset(efx, rx_buf), rx_buf->len);
+	skb_fill_page_desc(skb, 0, page,
+			   efx_rx_buf_offset(efx, rx_buf), rx_buf->len);
 
-		skb->len = rx_buf->len;
-		skb->data_len = rx_buf->len;
-		skb->truesize += rx_buf->len;
-		skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
-				  CHECKSUM_UNNECESSARY : CHECKSUM_NONE);
+	skb->len = rx_buf->len;
+	skb->data_len = rx_buf->len;
+	skb->truesize += rx_buf->len;
+	skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
+			  CHECKSUM_UNNECESSARY : CHECKSUM_NONE);
 
-		skb_record_rx_queue(skb, channel->rx_queue.core_index);
+	skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
 		gro_result = napi_gro_frags(napi);
-	} else {
-		struct sk_buff *skb = rx_buf->u.skb;
 
-		EFX_BUG_ON_PARANOID(!(rx_buf->flags & EFX_RX_PKT_CSUMMED));
-		rx_buf->u.skb = NULL;
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	if (gro_result != GRO_DROP)
+		channel->irq_mod_score += 2;
+}
 
-		gro_result = napi_gro_receive(napi, skb);
-	}
+/* Allocate and construct an SKB around a struct page.*/
+static struct sk_buff *efx_rx_mk_skb(struct efx_channel *channel,
+				     struct efx_rx_buffer *rx_buf,
+				     u8 *eh, int hdr_len)
+{
+	struct efx_nic *efx = channel->efx;
+	struct sk_buff *skb;
 
-	if (gro_result == GRO_NORMAL) {
-		channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB;
-	} else if (gro_result != GRO_DROP) {
-		channel->rx_alloc_level += RX_ALLOC_FACTOR_GRO;
-		channel->irq_mod_score += 2;
+	/* Allocate an SKB to store the headers */
+	skb = netdev_alloc_skb(efx->net_dev, hdr_len + EFX_PAGE_SKB_ALIGN);
+	if (unlikely(skb == NULL))
+		return NULL;
+
+	EFX_BUG_ON_PARANOID(rx_buf->len < hdr_len);
+
+	skb_reserve(skb, EFX_PAGE_SKB_ALIGN);
+
+	skb->len = rx_buf->len;
+	skb->truesize = rx_buf->len + sizeof(struct sk_buff);
+	memcpy(skb->data, eh, hdr_len);
+	skb->tail += hdr_len;
+
+	/* Append the remaining page onto the frag list */
+	if (rx_buf->len > hdr_len) {
+		skb->data_len = skb->len - hdr_len;
+		skb_fill_page_desc(skb, 0, rx_buf->page,
+				   efx_rx_buf_offset(efx, rx_buf) + hdr_len,
+				   skb->data_len);
+	} else {
+		__free_pages(rx_buf->page, efx->rx_buffer_order);
+		skb->data_len = 0;
 	}
+
+	/* Ownership has transferred from the rx_buf to skb */
+	rx_buf->page = NULL;
+
+	/* Move past the ethernet header */
+	skb->protocol = eth_type_trans(skb, efx->net_dev);
+
+	return skb;
 }
 
 void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
@@ -507,7 +424,6 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	struct efx_nic *efx = rx_queue->efx;
 	struct efx_channel *channel = efx_rx_queue_channel(rx_queue);
 	struct efx_rx_buffer *rx_buf;
-	bool leak_packet = false;
 
 	rx_buf = efx_rx_buffer(rx_queue, index);
 	rx_buf->flags |= flags;
@@ -519,7 +435,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	rx_queue->removed_count++;
 
 	/* Validate the length encoded in the event vs the descriptor pushed */
-	efx_rx_packet__check_len(rx_queue, rx_buf, len, &leak_packet);
+	efx_rx_packet__check_len(rx_queue, rx_buf, len);
 
 	netif_vdbg(efx, rx_status, efx->net_dev,
 		   "RX queue %d received id %x at %llx+%x %s%s\n",
@@ -530,10 +446,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 
 	/* Discard packet, if instructed to do so */
 	if (unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) {
-		if (unlikely(leak_packet))
-			channel->n_skbuff_leaks++;
-		else
-			efx_recycle_rx_buffer(channel, rx_buf);
+		efx_recycle_rx_buffer(channel, rx_buf);
 
 		/* Don't hold off the previous receive */
 		rx_buf = NULL;
@@ -560,31 +473,28 @@ out:
 	channel->rx_pkt = rx_buf;
 }
 
-static void efx_rx_deliver(struct efx_channel *channel,
+static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
 			   struct efx_rx_buffer *rx_buf)
 {
 	struct sk_buff *skb;
+	u16 hdr_len = min_t(u16, rx_buf->len, EFX_SKB_HEADERS);
 
-	/* We now own the SKB */
-	skb = rx_buf->u.skb;
-	rx_buf->u.skb = NULL;
+	skb = efx_rx_mk_skb(channel, rx_buf, eh, hdr_len);
+	if (unlikely(skb == NULL)) {
+		efx_free_rx_buffer(channel->efx, rx_buf);
+		return;
+	}
+	skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
 	/* Set the SKB flags */
 	skb_checksum_none_assert(skb);
 
-	/* Record the rx_queue */
-	skb_record_rx_queue(skb, channel->rx_queue.core_index);
-
 	if (channel->type->receive_skb)
 		if (channel->type->receive_skb(channel, skb))
-			goto handled;
+			return;
 
 	/* Pass the packet up */
 	netif_receive_skb(skb);
-
-handled:
-	/* Update allocation strategy method */
-	channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB;
 }
 
 /* Handle a received packet.  Second half: Touches packet payload. */
@@ -602,60 +512,13 @@ void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf)
 		return;
 	}
 
-	if (!(rx_buf->flags & EFX_RX_BUF_PAGE)) {
-		struct sk_buff *skb = rx_buf->u.skb;
-
-		prefetch(skb_shinfo(skb));
-
-		skb_reserve(skb, efx->type->rx_buffer_hash_size);
-		skb_put(skb, rx_buf->len);
-
-		if (efx->net_dev->features & NETIF_F_RXHASH)
-			skb->rxhash = efx_rx_buf_hash(eh);
-
-		/* Move past the ethernet header. rx_buf->data still points
-		 * at the ethernet header */
-		skb->protocol = eth_type_trans(skb, efx->net_dev);
-
-		skb_record_rx_queue(skb, channel->rx_queue.core_index);
-	}
-
 	if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM)))
 		rx_buf->flags &= ~EFX_RX_PKT_CSUMMED;
 
-	if (likely(rx_buf->flags & (EFX_RX_BUF_PAGE | EFX_RX_PKT_CSUMMED)) &&
-	    !channel->type->receive_skb)
+	if (!channel->type->receive_skb)
 		efx_rx_packet_gro(channel, rx_buf, eh);
 	else
-		efx_rx_deliver(channel, rx_buf);
-}
-
-void efx_rx_strategy(struct efx_channel *channel)
-{
-	enum efx_rx_alloc_method method = rx_alloc_method;
-
-	if (channel->type->receive_skb) {
-		channel->rx_alloc_push_pages = false;
-		return;
-	}
-
-	/* Only makes sense to use page based allocation if GRO is enabled */
-	if (!(channel->efx->net_dev->features & NETIF_F_GRO)) {
-		method = RX_ALLOC_METHOD_SKB;
-	} else if (method == RX_ALLOC_METHOD_AUTO) {
-		/* Constrain the rx_alloc_level */
-		if (channel->rx_alloc_level < 0)
-			channel->rx_alloc_level = 0;
-		else if (channel->rx_alloc_level > RX_ALLOC_LEVEL_MAX)
-			channel->rx_alloc_level = RX_ALLOC_LEVEL_MAX;
-
-		/* Decide on the allocation method */
-		method = ((channel->rx_alloc_level > RX_ALLOC_LEVEL_GRO) ?
-			  RX_ALLOC_METHOD_PAGE : RX_ALLOC_METHOD_SKB);
-	}
-
-	/* Push the option */
-	channel->rx_alloc_push_pages = (method == RX_ALLOC_METHOD_PAGE);
+		efx_rx_deliver(channel, eh, rx_buf);
 }
 
 int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
@@ -756,9 +619,6 @@ void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
 }
 
 
-module_param(rx_alloc_method, int, 0644);
-MODULE_PARM_DESC(rx_alloc_method, "Allocation method used for RX buffers");
-
 module_param(rx_refill_threshold, uint, 0444);
 MODULE_PARM_DESC(rx_refill_threshold,
 		 "RX descriptor ring refill threshold (%)");
-- 
cgit v0.10.2


From 7de07a4deb8e7707892b952daa59eff67701f0c3 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 14 Jan 2013 17:43:15 +0000
Subject: sfc: More sensible semantics for efx_filter_insert_filter() replace
 flag

The 'replace' flag to efx_filter_insert_filter() controls whether the
new filter may replace *any* filter, and is checked even before
priority comparison.  But lower-priority filters should never
block insertion of higher-priority filters.

Change the priority checking so that lower-priority filters are
replaced regardless of the value of the flag, and rename the
flag to 'replace_equal'.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/filter.c b/drivers/net/ethernet/sfc/filter.c
index 8af42cd..2fdd3a5 100644
--- a/drivers/net/ethernet/sfc/filter.c
+++ b/drivers/net/ethernet/sfc/filter.c
@@ -650,14 +650,22 @@ u32 efx_filter_get_rx_id_limit(struct efx_nic *efx)
  * efx_filter_insert_filter - add or replace a filter
  * @efx: NIC in which to insert the filter
  * @spec: Specification for the filter
- * @replace: Flag for whether the specified filter may replace a filter
- *	with an identical match expression and equal or lower priority
+ * @replace_equal: Flag for whether the specified filter may replace an
+ *	existing filter with equal priority
  *
  * On success, return the filter ID.
  * On failure, return a negative error code.
+ *
+ * If an existing filter has equal match values to the new filter
+ * spec, then the new filter might replace it, depending on the
+ * relative priorities.  If the existing filter has lower priority, or
+ * if @replace_equal is set and it has equal priority, then it is
+ * replaced.  Otherwise the function fails, returning -%EPERM if
+ * the existing filter has higher priority or -%EEXIST if it has
+ * equal priority.
  */
 s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
-			     bool replace)
+			     bool replace_equal)
 {
 	struct efx_filter_state *state = efx->filter_state;
 	struct efx_filter_table *table = efx_filter_spec_table(state, spec);
@@ -687,7 +695,7 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
 
 	if (test_bit(filter_idx, table->used_bitmap)) {
 		/* Should we replace the existing filter? */
-		if (!replace) {
+		if (spec->priority == saved_spec->priority && !replace_equal) {
 			rc = -EEXIST;
 			goto out;
 		}
-- 
cgit v0.10.2


From e3a699fab34a724fa8693c1274d3ddb3e213a134 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 14 Jan 2013 21:23:14 +0000
Subject: sfc: Remove redundant parameter to efx_filter_search()

The 'for_insert' parameter is redundant since there are no longer
any other operations that need to search based on a filter spec.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/filter.c b/drivers/net/ethernet/sfc/filter.c
index 2fdd3a5..3d94ed7 100644
--- a/drivers/net/ethernet/sfc/filter.c
+++ b/drivers/net/ethernet/sfc/filter.c
@@ -522,7 +522,7 @@ static bool efx_filter_equal(const struct efx_filter_spec *left,
 
 static int efx_filter_search(struct efx_filter_table *table,
 			     struct efx_filter_spec *spec, u32 key,
-			     bool for_insert, unsigned int *depth_required)
+			     unsigned int *depth_required)
 {
 	unsigned hash, incr, filter_idx, depth, depth_max;
 
@@ -531,25 +531,20 @@ static int efx_filter_search(struct efx_filter_table *table,
 
 	filter_idx = hash & (table->size - 1);
 	depth = 1;
-	depth_max = (for_insert ?
-		     (spec->priority <= EFX_FILTER_PRI_HINT ?
-		      FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX) :
-		     table->search_depth[spec->type]);
+	depth_max = (spec->priority <= EFX_FILTER_PRI_HINT ?
+		     FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX);
 
 	for (;;) {
-		/* Return success if entry is used and matches this spec
-		 * or entry is unused and we are trying to insert.
-		 */
-		if (test_bit(filter_idx, table->used_bitmap) ?
-		    efx_filter_equal(spec, &table->spec[filter_idx]) :
-		    for_insert) {
+		/* Return success if entry is unused or matches this spec */
+		if (!test_bit(filter_idx, table->used_bitmap) ||
+		    efx_filter_equal(spec, &table->spec[filter_idx])) {
 			*depth_required = depth;
 			return filter_idx;
 		}
 
 		/* Return failure if we reached the maximum search depth */
 		if (depth == depth_max)
-			return for_insert ? -EBUSY : -ENOENT;
+			return -EBUSY;
 
 		filter_idx = (filter_idx + incr) & (table->size - 1);
 		++depth;
@@ -686,7 +681,7 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
 
 	spin_lock_bh(&state->lock);
 
-	rc = efx_filter_search(table, spec, key, true, &depth);
+	rc = efx_filter_search(table, spec, key, &depth);
 	if (rc < 0)
 		goto out;
 	filter_idx = rc;
-- 
cgit v0.10.2


From 385904f819e31fcf5a5aa53fa91f3352bffa6d19 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 14 Jan 2013 21:23:15 +0000
Subject: sfc: Don't use efx_filter_{build,hash,increment}() for default MAC
 filters

These functions happen to work for default MAC filters: they generate
an initial index of 1/0 for unicast/multicast respectively and an
increment of 1 for either, so a search succeeds at depth 2.  But this
is a matter of luck rather than design, and it really won't work well
with the bug fix we're about to do.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/filter.c b/drivers/net/ethernet/sfc/filter.c
index 3d94ed7..8d83d98 100644
--- a/drivers/net/ethernet/sfc/filter.c
+++ b/drivers/net/ethernet/sfc/filter.c
@@ -463,13 +463,6 @@ static u32 efx_filter_build(efx_oword_t *filter, struct efx_filter_spec *spec)
 		break;
 	}
 
-	case EFX_FILTER_TABLE_RX_DEF:
-		/* One filter spec per type */
-		BUILD_BUG_ON(EFX_FILTER_INDEX_UC_DEF != 0);
-		BUILD_BUG_ON(EFX_FILTER_INDEX_MC_DEF !=
-			     EFX_FILTER_MC_DEF - EFX_FILTER_UC_DEF);
-		return spec->type - EFX_FILTER_UC_DEF;
-
 	case EFX_FILTER_TABLE_RX_MAC: {
 		bool is_wild = spec->type == EFX_FILTER_MAC_WILD;
 		EFX_POPULATE_OWORD_7(
@@ -667,25 +660,35 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
 	struct efx_filter_spec *saved_spec;
 	efx_oword_t filter;
 	unsigned int filter_idx, depth = 0;
-	u32 key;
 	int rc;
 
 	if (!table || table->size == 0)
 		return -EINVAL;
 
-	key = efx_filter_build(&filter, spec);
-
 	netif_vdbg(efx, hw, efx->net_dev,
 		   "%s: type %d search_depth=%d", __func__, spec->type,
 		   table->search_depth[spec->type]);
 
-	spin_lock_bh(&state->lock);
+	if (table->id == EFX_FILTER_TABLE_RX_DEF) {
+		/* One filter spec per type */
+		BUILD_BUG_ON(EFX_FILTER_INDEX_UC_DEF != 0);
+		BUILD_BUG_ON(EFX_FILTER_INDEX_MC_DEF !=
+			     EFX_FILTER_MC_DEF - EFX_FILTER_UC_DEF);
+		filter_idx = spec->type - EFX_FILTER_INDEX_UC_DEF;
+
+		spin_lock_bh(&state->lock);
+	} else {
+		u32 key = efx_filter_build(&filter, spec);
+
+		spin_lock_bh(&state->lock);
+
+		rc = efx_filter_search(table, spec, key, &depth);
+		if (rc < 0)
+			goto out;
+		filter_idx = rc;
+		BUG_ON(filter_idx >= table->size);
+	}
 
-	rc = efx_filter_search(table, spec, key, &depth);
-	if (rc < 0)
-		goto out;
-	filter_idx = rc;
-	BUG_ON(filter_idx >= table->size);
 	saved_spec = &table->spec[filter_idx];
 
 	if (test_bit(filter_idx, table->used_bitmap)) {
-- 
cgit v0.10.2


From 297891cecc23b85c6b788b0ec6f2f7e71d89003e Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 14 Jan 2013 21:23:16 +0000
Subject: sfc: Merge efx_filter_search() into efx_filter_insert()

efx_filter_search() is only called from efx_filter_insert(), and
neither function is very long.  The following bug fix requires a more
sophisticated search with a third result, which is going to be easier
to implement as part of the same function.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/filter.c b/drivers/net/ethernet/sfc/filter.c
index 8d83d98..769560a 100644
--- a/drivers/net/ethernet/sfc/filter.c
+++ b/drivers/net/ethernet/sfc/filter.c
@@ -513,37 +513,6 @@ static bool efx_filter_equal(const struct efx_filter_spec *left,
 	return true;
 }
 
-static int efx_filter_search(struct efx_filter_table *table,
-			     struct efx_filter_spec *spec, u32 key,
-			     unsigned int *depth_required)
-{
-	unsigned hash, incr, filter_idx, depth, depth_max;
-
-	hash = efx_filter_hash(key);
-	incr = efx_filter_increment(key);
-
-	filter_idx = hash & (table->size - 1);
-	depth = 1;
-	depth_max = (spec->priority <= EFX_FILTER_PRI_HINT ?
-		     FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX);
-
-	for (;;) {
-		/* Return success if entry is unused or matches this spec */
-		if (!test_bit(filter_idx, table->used_bitmap) ||
-		    efx_filter_equal(spec, &table->spec[filter_idx])) {
-			*depth_required = depth;
-			return filter_idx;
-		}
-
-		/* Return failure if we reached the maximum search depth */
-		if (depth == depth_max)
-			return -EBUSY;
-
-		filter_idx = (filter_idx + incr) & (table->size - 1);
-		++depth;
-	}
-}
-
 /*
  * Construct/deconstruct external filter IDs.  At least the RX filter
  * IDs must be ordered by matching priority, for RX NFC semantics.
@@ -679,14 +648,33 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
 		spin_lock_bh(&state->lock);
 	} else {
 		u32 key = efx_filter_build(&filter, spec);
+		unsigned int hash = efx_filter_hash(key);
+		unsigned int incr = efx_filter_increment(key);
+		unsigned int depth_max =
+			spec->priority <= EFX_FILTER_PRI_HINT ?
+			FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX;
+
+		filter_idx = hash & (table->size - 1);
+		depth = 1;
 
 		spin_lock_bh(&state->lock);
 
-		rc = efx_filter_search(table, spec, key, &depth);
-		if (rc < 0)
-			goto out;
-		filter_idx = rc;
-		BUG_ON(filter_idx >= table->size);
+		for (;;) {
+			if (!test_bit(filter_idx, table->used_bitmap) ||
+			    efx_filter_equal(spec, &table->spec[filter_idx]))
+				break;
+
+			/* Return failure if we reached the maximum search
+			 * depth
+			 */
+			if (depth == depth_max) {
+				rc = -EBUSY;
+				goto out;
+			}
+
+			filter_idx = (filter_idx + incr) & (table->size - 1);
+			++depth;
+		}
 	}
 
 	saved_spec = &table->spec[filter_idx];
-- 
cgit v0.10.2


From d9ccfdd4b3b675750518e035549f5c778d4027f2 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 14 Jan 2013 21:23:17 +0000
Subject: sfc: Fix replacement detection in efx_filter_insert_filter()

efx_filter_insert_filter() uses the first table entry in the hash chain
that either has the same match values or is empty.  This means that
replacement doesn't always work correctly:

1. Insert filter F1 with match values M1, hashing to H1, at first
   possible entry E1.
2. Insert filter F2 with match values M2, hashing to H1, at second
   possible entry E2.
3. Remove filter F1.
4. Insert filter F3 with match values M2, hashing to H1, at first
   possible entry E1.

F3 should have either replaced F2 or been rejected (depending on
priority and the replace_equal parameter).

Instead, search for both a matching filter that the inserted filter
would replace, and an available insertion point, up to the applicable
maximum search depths.  If we insert at lower depth than a replaced
filter, clear the replaced filter.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/filter.c b/drivers/net/ethernet/sfc/filter.c
index 769560a..0de8daf 100644
--- a/drivers/net/ethernet/sfc/filter.c
+++ b/drivers/net/ethernet/sfc/filter.c
@@ -66,6 +66,10 @@ struct efx_filter_state {
 #endif
 };
 
+static void efx_filter_table_clear_entry(struct efx_nic *efx,
+					 struct efx_filter_table *table,
+					 unsigned int filter_idx);
+
 /* The filter hash function is LFSR polynomial x^16 + x^3 + 1 of a 32-bit
  * key derived from the n-tuple.  The initial LFSR state is 0xffff. */
 static u16 efx_filter_hash(u32 key)
@@ -626,9 +630,9 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
 {
 	struct efx_filter_state *state = efx->filter_state;
 	struct efx_filter_table *table = efx_filter_spec_table(state, spec);
-	struct efx_filter_spec *saved_spec;
 	efx_oword_t filter;
-	unsigned int filter_idx, depth = 0;
+	int rep_index, ins_index;
+	unsigned int depth = 0;
 	int rc;
 
 	if (!table || table->size == 0)
@@ -643,44 +647,74 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
 		BUILD_BUG_ON(EFX_FILTER_INDEX_UC_DEF != 0);
 		BUILD_BUG_ON(EFX_FILTER_INDEX_MC_DEF !=
 			     EFX_FILTER_MC_DEF - EFX_FILTER_UC_DEF);
-		filter_idx = spec->type - EFX_FILTER_INDEX_UC_DEF;
+		rep_index = spec->type - EFX_FILTER_INDEX_UC_DEF;
+		ins_index = rep_index;
 
 		spin_lock_bh(&state->lock);
 	} else {
+		/* Search concurrently for
+		 * (1) a filter to be replaced (rep_index): any filter
+		 *     with the same match values, up to the current
+		 *     search depth for this type, and
+		 * (2) the insertion point (ins_index): (1) or any
+		 *     free slot before it or up to the maximum search
+		 *     depth for this priority
+		 * We fail if we cannot find (2).
+		 *
+		 * We can stop once either
+		 * (a) we find (1), in which case we have definitely
+		 *     found (2) as well; or
+		 * (b) we have searched exhaustively for (1), and have
+		 *     either found (2) or searched exhaustively for it
+		 */
 		u32 key = efx_filter_build(&filter, spec);
 		unsigned int hash = efx_filter_hash(key);
 		unsigned int incr = efx_filter_increment(key);
-		unsigned int depth_max =
+		unsigned int max_rep_depth = table->search_depth[spec->type];
+		unsigned int max_ins_depth =
 			spec->priority <= EFX_FILTER_PRI_HINT ?
 			FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX;
+		unsigned int i = hash & (table->size - 1);
 
-		filter_idx = hash & (table->size - 1);
+		ins_index = -1;
 		depth = 1;
 
 		spin_lock_bh(&state->lock);
 
 		for (;;) {
-			if (!test_bit(filter_idx, table->used_bitmap) ||
-			    efx_filter_equal(spec, &table->spec[filter_idx]))
+			if (!test_bit(i, table->used_bitmap)) {
+				if (ins_index < 0)
+					ins_index = i;
+			} else if (efx_filter_equal(spec, &table->spec[i])) {
+				/* Case (a) */
+				if (ins_index < 0)
+					ins_index = i;
+				rep_index = i;
 				break;
+			}
 
-			/* Return failure if we reached the maximum search
-			 * depth
-			 */
-			if (depth == depth_max) {
-				rc = -EBUSY;
-				goto out;
+			if (depth >= max_rep_depth &&
+			    (ins_index >= 0 || depth >= max_ins_depth)) {
+				/* Case (b) */
+				if (ins_index < 0) {
+					rc = -EBUSY;
+					goto out;
+				}
+				rep_index = -1;
+				break;
 			}
 
-			filter_idx = (filter_idx + incr) & (table->size - 1);
+			i = (i + incr) & (table->size - 1);
 			++depth;
 		}
 	}
 
-	saved_spec = &table->spec[filter_idx];
+	/* If we found a filter to be replaced, check whether we
+	 * should do so
+	 */
+	if (rep_index >= 0) {
+		struct efx_filter_spec *saved_spec = &table->spec[rep_index];
 
-	if (test_bit(filter_idx, table->used_bitmap)) {
-		/* Should we replace the existing filter? */
 		if (spec->priority == saved_spec->priority && !replace_equal) {
 			rc = -EEXIST;
 			goto out;
@@ -689,11 +723,14 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
 			rc = -EPERM;
 			goto out;
 		}
-	} else {
-		__set_bit(filter_idx, table->used_bitmap);
+	}
+
+	/* Insert the filter */
+	if (ins_index != rep_index) {
+		__set_bit(ins_index, table->used_bitmap);
 		++table->used;
 	}
-	*saved_spec = *spec;
+	table->spec[ins_index] = *spec;
 
 	if (table->id == EFX_FILTER_TABLE_RX_DEF) {
 		efx_filter_push_rx_config(efx);
@@ -707,13 +744,19 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
 		}
 
 		efx_writeo(efx, &filter,
-			   table->offset + table->step * filter_idx);
+			   table->offset + table->step * ins_index);
+
+		/* If we were able to replace a filter by inserting
+		 * at a lower depth, clear the replaced filter
+		 */
+		if (ins_index != rep_index && rep_index >= 0)
+			efx_filter_table_clear_entry(efx, table, rep_index);
 	}
 
 	netif_vdbg(efx, hw, efx->net_dev,
 		   "%s: filter type %d index %d rxq %u set",
-		   __func__, spec->type, filter_idx, spec->dmaq_id);
-	rc = efx_filter_make_id(spec, filter_idx);
+		   __func__, spec->type, ins_index, spec->dmaq_id);
+	rc = efx_filter_make_id(spec, ins_index);
 
 out:
 	spin_unlock_bh(&state->lock);
-- 
cgit v0.10.2


From 634ab72c39b3df78ac7d6ccd4a50133059bae14f Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 6 Mar 2013 19:39:20 +0000
Subject: sfc: Disable RSS when using SR-IOV and only 1 RX queue on the PF

On Siena, VFs share RSS configuration with the PF.  We attempted to
support configurations where the PF only uses 1 RX queue and VFs use
multiple RX queues, by (1) setting up RSS for the number of RX queues
per VF (2) disabling RSS in the PF's RX default filters.

Unfortunately commit cd2d5b529cdb ('sfc: Add SR-IOV back-end support
for SFC9000 family') only included (1).  This is (2).

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/filter.c b/drivers/net/ethernet/sfc/filter.c
index 0de8daf..61b4408 100644
--- a/drivers/net/ethernet/sfc/filter.c
+++ b/drivers/net/ethernet/sfc/filter.c
@@ -414,8 +414,12 @@ static void efx_filter_reset_rx_def(struct efx_nic *efx, unsigned filter_idx)
 	struct efx_filter_table *table = &state->table[EFX_FILTER_TABLE_RX_DEF];
 	struct efx_filter_spec *spec = &table->spec[filter_idx];
 
+	/* If there's only one channel then disable RSS for non VF
+	 * traffic, thereby allowing VFs to use RSS when the PF can't.
+	 */
 	efx_filter_init_rx(spec, EFX_FILTER_PRI_MANUAL,
-			   EFX_FILTER_FLAG_RX_RSS, 0);
+			   efx->n_rx_channels > 1 ? EFX_FILTER_FLAG_RX_RSS : 0,
+			   0);
 	spec->type = EFX_FILTER_UC_DEF + filter_idx;
 	table->used_bitmap[0] |= 1 << filter_idx;
 }
-- 
cgit v0.10.2


From 626950db84c065925ee10c2e833da265cbda8800 Mon Sep 17 00:00:00 2001
From: Alexandre Rames <arames@solarflare.com>
Date: Mon, 14 Jan 2013 17:20:22 +0000
Subject: sfc: Add AER and EEH support for Siena

The Linux side of EEH is triggered by MMIO reads, but this
driver's data path does not issue any MMIO reads (except in
legacy interrupt mode).  Therefore add a monitor function
to poll EEH periodically.

When preparing to reset the device based on our own error
detection, also poll EEH and defer to its recovery mechanism
if appropriate.

[bwh: Use a separate condition for the initial link poll; fix some
 style errors]
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 11a8108..5e1ddc5 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -21,7 +21,9 @@
 #include <linux/ethtool.h>
 #include <linux/topology.h>
 #include <linux/gfp.h>
+#include <linux/pci.h>
 #include <linux/cpu_rmap.h>
+#include <linux/aer.h>
 #include "net_driver.h"
 #include "efx.h"
 #include "nic.h"
@@ -71,17 +73,19 @@ const char *const efx_loopback_mode_names[] = {
 
 const unsigned int efx_reset_type_max = RESET_TYPE_MAX;
 const char *const efx_reset_type_names[] = {
-	[RESET_TYPE_INVISIBLE]     = "INVISIBLE",
-	[RESET_TYPE_ALL]           = "ALL",
-	[RESET_TYPE_WORLD]         = "WORLD",
-	[RESET_TYPE_DISABLE]       = "DISABLE",
-	[RESET_TYPE_TX_WATCHDOG]   = "TX_WATCHDOG",
-	[RESET_TYPE_INT_ERROR]     = "INT_ERROR",
-	[RESET_TYPE_RX_RECOVERY]   = "RX_RECOVERY",
-	[RESET_TYPE_RX_DESC_FETCH] = "RX_DESC_FETCH",
-	[RESET_TYPE_TX_DESC_FETCH] = "TX_DESC_FETCH",
-	[RESET_TYPE_TX_SKIP]       = "TX_SKIP",
-	[RESET_TYPE_MC_FAILURE]    = "MC_FAILURE",
+	[RESET_TYPE_INVISIBLE]          = "INVISIBLE",
+	[RESET_TYPE_ALL]                = "ALL",
+	[RESET_TYPE_RECOVER_OR_ALL]     = "RECOVER_OR_ALL",
+	[RESET_TYPE_WORLD]              = "WORLD",
+	[RESET_TYPE_RECOVER_OR_DISABLE] = "RECOVER_OR_DISABLE",
+	[RESET_TYPE_DISABLE]            = "DISABLE",
+	[RESET_TYPE_TX_WATCHDOG]        = "TX_WATCHDOG",
+	[RESET_TYPE_INT_ERROR]          = "INT_ERROR",
+	[RESET_TYPE_RX_RECOVERY]        = "RX_RECOVERY",
+	[RESET_TYPE_RX_DESC_FETCH]      = "RX_DESC_FETCH",
+	[RESET_TYPE_TX_DESC_FETCH]      = "TX_DESC_FETCH",
+	[RESET_TYPE_TX_SKIP]            = "TX_SKIP",
+	[RESET_TYPE_MC_FAILURE]         = "MC_FAILURE",
 };
 
 #define EFX_MAX_MTU (9 * 1024)
@@ -117,9 +121,12 @@ MODULE_PARM_DESC(separate_tx_channels,
 static int napi_weight = 64;
 
 /* This is the time (in jiffies) between invocations of the hardware
- * monitor.  On Falcon-based NICs, this will:
+ * monitor.
+ * On Falcon-based NICs, this will:
  * - Check the on-board hardware monitor;
  * - Poll the link state and reconfigure the hardware as necessary.
+ * On Siena-based NICs for power systems with EEH support, this will give EEH a
+ * chance to start.
  */
 static unsigned int efx_monitor_interval = 1 * HZ;
 
@@ -203,13 +210,14 @@ static void efx_stop_all(struct efx_nic *efx);
 #define EFX_ASSERT_RESET_SERIALISED(efx)		\
 	do {						\
 		if ((efx->state == STATE_READY) ||	\
+		    (efx->state == STATE_RECOVERY) ||	\
 		    (efx->state == STATE_DISABLED))	\
 			ASSERT_RTNL();			\
 	} while (0)
 
 static int efx_check_disabled(struct efx_nic *efx)
 {
-	if (efx->state == STATE_DISABLED) {
+	if (efx->state == STATE_DISABLED || efx->state == STATE_RECOVERY) {
 		netif_err(efx, drv, efx->net_dev,
 			  "device is disabled due to earlier errors\n");
 		return -EIO;
@@ -677,7 +685,7 @@ static void efx_stop_datapath(struct efx_nic *efx)
 	BUG_ON(efx->port_enabled);
 
 	/* Only perform flush if dma is enabled */
-	if (dev->is_busmaster) {
+	if (dev->is_busmaster && efx->state != STATE_RECOVERY) {
 		rc = efx_nic_flush_queues(efx);
 
 		if (rc && EFX_WORKAROUND_7803(efx)) {
@@ -1590,13 +1598,15 @@ static void efx_start_all(struct efx_nic *efx)
 	efx_start_port(efx);
 	efx_start_datapath(efx);
 
-	/* Start the hardware monitor if there is one. Otherwise (we're link
-	 * event driven), we have to poll the PHY because after an event queue
-	 * flush, we could have a missed a link state change */
-	if (efx->type->monitor != NULL) {
+	/* Start the hardware monitor if there is one */
+	if (efx->type->monitor != NULL)
 		queue_delayed_work(efx->workqueue, &efx->monitor_work,
 				   efx_monitor_interval);
-	} else {
+
+	/* If link state detection is normally event-driven, we have
+	 * to poll now because we could have missed a change
+	 */
+	if (efx_nic_rev(efx) >= EFX_REV_SIENA_A0) {
 		mutex_lock(&efx->mac_lock);
 		if (efx->phy_op->poll(efx))
 			efx_link_status_changed(efx);
@@ -2303,7 +2313,9 @@ int efx_reset(struct efx_nic *efx, enum reset_type method)
 
 out:
 	/* Leave device stopped if necessary */
-	disabled = rc || method == RESET_TYPE_DISABLE;
+	disabled = rc ||
+		method == RESET_TYPE_DISABLE ||
+		method == RESET_TYPE_RECOVER_OR_DISABLE;
 	rc2 = efx_reset_up(efx, method, !disabled);
 	if (rc2) {
 		disabled = true;
@@ -2322,13 +2334,48 @@ out:
 	return rc;
 }
 
+/* Try recovery mechanisms.
+ * For now only EEH is supported.
+ * Returns 0 if the recovery mechanisms are unsuccessful.
+ * Returns a non-zero value otherwise.
+ */
+static int efx_try_recovery(struct efx_nic *efx)
+{
+#ifdef CONFIG_EEH
+	/* A PCI error can occur and not be seen by EEH because nothing
+	 * happens on the PCI bus. In this case the driver may fail and
+	 * schedule a 'recover or reset', leading to this recovery handler.
+	 * Manually call the eeh failure check function.
+	 */
+	struct eeh_dev *eehdev =
+		of_node_to_eeh_dev(pci_device_to_OF_node(efx->pci_dev));
+
+	if (eeh_dev_check_failure(eehdev)) {
+		/* The EEH mechanisms will handle the error and reset the
+		 * device if necessary.
+		 */
+		return 1;
+	}
+#endif
+	return 0;
+}
+
 /* The worker thread exists so that code that cannot sleep can
  * schedule a reset for later.
  */
 static void efx_reset_work(struct work_struct *data)
 {
 	struct efx_nic *efx = container_of(data, struct efx_nic, reset_work);
-	unsigned long pending = ACCESS_ONCE(efx->reset_pending);
+	unsigned long pending;
+	enum reset_type method;
+
+	pending = ACCESS_ONCE(efx->reset_pending);
+	method = fls(pending) - 1;
+
+	if ((method == RESET_TYPE_RECOVER_OR_DISABLE ||
+	     method == RESET_TYPE_RECOVER_OR_ALL) &&
+	    efx_try_recovery(efx))
+		return;
 
 	if (!pending)
 		return;
@@ -2340,7 +2387,7 @@ static void efx_reset_work(struct work_struct *data)
 	 * it cannot change again.
 	 */
 	if (efx->state == STATE_READY)
-		(void)efx_reset(efx, fls(pending) - 1);
+		(void)efx_reset(efx, method);
 
 	rtnl_unlock();
 }
@@ -2349,11 +2396,20 @@ void efx_schedule_reset(struct efx_nic *efx, enum reset_type type)
 {
 	enum reset_type method;
 
+	if (efx->state == STATE_RECOVERY) {
+		netif_dbg(efx, drv, efx->net_dev,
+			  "recovering: skip scheduling %s reset\n",
+			  RESET_TYPE(type));
+		return;
+	}
+
 	switch (type) {
 	case RESET_TYPE_INVISIBLE:
 	case RESET_TYPE_ALL:
+	case RESET_TYPE_RECOVER_OR_ALL:
 	case RESET_TYPE_WORLD:
 	case RESET_TYPE_DISABLE:
+	case RESET_TYPE_RECOVER_OR_DISABLE:
 		method = type;
 		netif_dbg(efx, drv, efx->net_dev, "scheduling %s reset\n",
 			  RESET_TYPE(method));
@@ -2563,6 +2619,8 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
 	efx_fini_struct(efx);
 	pci_set_drvdata(pci_dev, NULL);
 	free_netdev(efx->net_dev);
+
+	pci_disable_pcie_error_reporting(pci_dev);
 };
 
 /* NIC VPD information
@@ -2735,6 +2793,11 @@ static int efx_pci_probe(struct pci_dev *pci_dev,
 		netif_warn(efx, probe, efx->net_dev,
 			   "failed to create MTDs (%d)\n", rc);
 
+	rc = pci_enable_pcie_error_reporting(pci_dev);
+	if (rc && rc != -EINVAL)
+		netif_warn(efx, probe, efx->net_dev,
+			   "pci_enable_pcie_error_reporting failed (%d)\n", rc);
+
 	return 0;
 
  fail4:
@@ -2859,12 +2922,112 @@ static const struct dev_pm_ops efx_pm_ops = {
 	.restore	= efx_pm_resume,
 };
 
+/* A PCI error affecting this device was detected.
+ * At this point MMIO and DMA may be disabled.
+ * Stop the software path and request a slot reset.
+ */
+pci_ers_result_t efx_io_error_detected(struct pci_dev *pdev,
+				       enum pci_channel_state state)
+{
+	pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED;
+	struct efx_nic *efx = pci_get_drvdata(pdev);
+
+	if (state == pci_channel_io_perm_failure)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	rtnl_lock();
+
+	if (efx->state != STATE_DISABLED) {
+		efx->state = STATE_RECOVERY;
+		efx->reset_pending = 0;
+
+		efx_device_detach_sync(efx);
+
+		efx_stop_all(efx);
+		efx_stop_interrupts(efx, false);
+
+		status = PCI_ERS_RESULT_NEED_RESET;
+	} else {
+		/* If the interface is disabled we don't want to do anything
+		 * with it.
+		 */
+		status = PCI_ERS_RESULT_RECOVERED;
+	}
+
+	rtnl_unlock();
+
+	pci_disable_device(pdev);
+
+	return status;
+}
+
+/* Fake a successfull reset, which will be performed later in efx_io_resume. */
+pci_ers_result_t efx_io_slot_reset(struct pci_dev *pdev)
+{
+	struct efx_nic *efx = pci_get_drvdata(pdev);
+	pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED;
+	int rc;
+
+	if (pci_enable_device(pdev)) {
+		netif_err(efx, hw, efx->net_dev,
+			  "Cannot re-enable PCI device after reset.\n");
+		status =  PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	rc = pci_cleanup_aer_uncorrect_error_status(pdev);
+	if (rc) {
+		netif_err(efx, hw, efx->net_dev,
+		"pci_cleanup_aer_uncorrect_error_status failed (%d)\n", rc);
+		/* Non-fatal error. Continue. */
+	}
+
+	return status;
+}
+
+/* Perform the actual reset and resume I/O operations. */
+static void efx_io_resume(struct pci_dev *pdev)
+{
+	struct efx_nic *efx = pci_get_drvdata(pdev);
+	int rc;
+
+	rtnl_lock();
+
+	if (efx->state == STATE_DISABLED)
+		goto out;
+
+	rc = efx_reset(efx, RESET_TYPE_ALL);
+	if (rc) {
+		netif_err(efx, hw, efx->net_dev,
+			  "efx_reset failed after PCI error (%d)\n", rc);
+	} else {
+		efx->state = STATE_READY;
+		netif_dbg(efx, hw, efx->net_dev,
+			  "Done resetting and resuming IO after PCI error.\n");
+	}
+
+out:
+	rtnl_unlock();
+}
+
+/* For simplicity and reliability, we always require a slot reset and try to
+ * reset the hardware when a pci error affecting the device is detected.
+ * We leave both the link_reset and mmio_enabled callback unimplemented:
+ * with our request for slot reset the mmio_enabled callback will never be
+ * called, and the link_reset callback is not used by AER or EEH mechanisms.
+ */
+static struct pci_error_handlers efx_err_handlers = {
+	.error_detected = efx_io_error_detected,
+	.slot_reset	= efx_io_slot_reset,
+	.resume		= efx_io_resume,
+};
+
 static struct pci_driver efx_pci_driver = {
 	.name		= KBUILD_MODNAME,
 	.id_table	= efx_pci_table,
 	.probe		= efx_pci_probe,
 	.remove		= efx_pci_remove,
 	.driver.pm	= &efx_pm_ops,
+	.err_handler	= &efx_err_handlers,
 };
 
 /**************************************************************************
diff --git a/drivers/net/ethernet/sfc/enum.h b/drivers/net/ethernet/sfc/enum.h
index 182dbe2..ab8fb58 100644
--- a/drivers/net/ethernet/sfc/enum.h
+++ b/drivers/net/ethernet/sfc/enum.h
@@ -137,8 +137,12 @@ enum efx_loopback_mode {
  * Reset methods are numbered in order of increasing scope.
  *
  * @RESET_TYPE_INVISIBLE: Reset datapath and MAC (Falcon only)
+ * @RESET_TYPE_RECOVER_OR_ALL: Try to recover. Apply RESET_TYPE_ALL
+ * if unsuccessful.
  * @RESET_TYPE_ALL: Reset datapath, MAC and PHY
  * @RESET_TYPE_WORLD: Reset as much as possible
+ * @RESET_TYPE_RECOVER_OR_DISABLE: Try to recover. Apply RESET_TYPE_DISABLE if
+ * unsuccessful.
  * @RESET_TYPE_DISABLE: Reset datapath, MAC and PHY; leave NIC disabled
  * @RESET_TYPE_TX_WATCHDOG: reset due to TX watchdog
  * @RESET_TYPE_INT_ERROR: reset due to internal error
@@ -150,9 +154,11 @@ enum efx_loopback_mode {
  */
 enum reset_type {
 	RESET_TYPE_INVISIBLE = 0,
-	RESET_TYPE_ALL = 1,
-	RESET_TYPE_WORLD = 2,
-	RESET_TYPE_DISABLE = 3,
+	RESET_TYPE_RECOVER_OR_ALL = 1,
+	RESET_TYPE_ALL = 2,
+	RESET_TYPE_WORLD = 3,
+	RESET_TYPE_RECOVER_OR_DISABLE = 4,
+	RESET_TYPE_DISABLE = 5,
 	RESET_TYPE_MAX_METHOD,
 	RESET_TYPE_TX_WATCHDOG,
 	RESET_TYPE_INT_ERROR,
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index c83fe09..9e90081 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -429,6 +429,7 @@ enum nic_state {
 	STATE_UNINIT = 0,	/* device being probed/removed or is frozen */
 	STATE_READY = 1,	/* hardware ready and netdev registered */
 	STATE_DISABLED = 2,	/* device disabled due to hardware errors */
+	STATE_RECOVERY = 3,	/* device recovering from PCI error */
 };
 
 /*
diff --git a/drivers/net/ethernet/sfc/siena.c b/drivers/net/ethernet/sfc/siena.c
index ba40f67..e07ff0d 100644
--- a/drivers/net/ethernet/sfc/siena.c
+++ b/drivers/net/ethernet/sfc/siena.c
@@ -202,7 +202,7 @@ out:
 
 static enum reset_type siena_map_reset_reason(enum reset_type reason)
 {
-	return RESET_TYPE_ALL;
+	return RESET_TYPE_RECOVER_OR_ALL;
 }
 
 static int siena_map_reset_flags(u32 *flags)
@@ -245,6 +245,22 @@ static int siena_reset_hw(struct efx_nic *efx, enum reset_type method)
 		return efx_mcdi_reset_port(efx);
 }
 
+#ifdef CONFIG_EEH
+/* When a PCI device is isolated from the bus, a subsequent MMIO read is
+ * required for the kernel EEH mechanisms to notice. As the Solarflare driver
+ * was written to minimise MMIO read (for latency) then a periodic call to check
+ * the EEH status of the device is required so that device recovery can happen
+ * in a timely fashion.
+ */
+static void siena_monitor(struct efx_nic *efx)
+{
+	struct eeh_dev *eehdev =
+		of_node_to_eeh_dev(pci_device_to_OF_node(efx->pci_dev));
+
+	eeh_dev_check_failure(eehdev);
+}
+#endif
+
 static int siena_probe_nvconfig(struct efx_nic *efx)
 {
 	u32 caps = 0;
@@ -665,7 +681,11 @@ const struct efx_nic_type siena_a0_nic_type = {
 	.init = siena_init_nic,
 	.dimension_resources = siena_dimension_resources,
 	.fini = efx_port_dummy_op_void,
+#ifdef CONFIG_EEH
+	.monitor = siena_monitor,
+#else
 	.monitor = NULL,
+#endif
 	.map_reset_reason = siena_map_reset_reason,
 	.map_reset_flags = siena_map_reset_flags,
 	.reset = siena_reset_hw,
-- 
cgit v0.10.2


From 80c2e716d555912168f93853f96a24d0de75897b Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 23 Jan 2013 21:52:13 +0000
Subject: sfc: Document current usage of efx_rx_buffer::len and
 efx_nic::rx_buffer_len

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 9e90081..f74411f 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -209,7 +209,8 @@ struct efx_tx_queue {
  * @page: The associated page buffer.
  *	Will be %NULL if the buffer slot is currently free.
  * @page_offset: Offset within page
- * @len: Buffer length, in bytes.
+ * @len: If pending: length for DMA descriptor.
+ *	If completed: received length, excluding hash prefix.
  * @flags: Flags for buffer and packet state.
  */
 struct efx_rx_buffer {
@@ -668,7 +669,8 @@ struct vfdi_status;
  * @n_channels: Number of channels in use
  * @n_rx_channels: Number of channels used for RX (= number of RX queues)
  * @n_tx_channels: Number of channels used for TX
- * @rx_buffer_len: RX buffer length
+ * @rx_buffer_len: RX buffer length, including start alignment but excluding
+ *	any metadata
  * @rx_buffer_order: Order (log2) of number of pages for each RX buffer
  * @rx_hash_key: Toeplitz hash key for RSS
  * @rx_indir_table: Indirection table for RSS
-- 
cgit v0.10.2


From 272baeeb6a98f5f746c2eeab4973c2df89e9d7ea Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 29 Jan 2013 23:33:14 +0000
Subject: sfc: Properly distinguish RX buffer and DMA lengths

Replace efx_nic::rx_buffer_len with efx_nic::rx_dma_len, the maximum
RX DMA length.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 5e1ddc5..34b56ec 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -639,12 +639,11 @@ static void efx_start_datapath(struct efx_nic *efx)
 	 * support the current MTU, including padding for header
 	 * alignment and overruns.
 	 */
-	efx->rx_buffer_len = (max(EFX_PAGE_IP_ALIGN, NET_IP_ALIGN) +
-			      EFX_MAX_FRAME_LEN(efx->net_dev->mtu) +
-			      efx->type->rx_buffer_hash_size +
-			      efx->type->rx_buffer_padding);
-	efx->rx_buffer_order = get_order(efx->rx_buffer_len +
-					 sizeof(struct efx_rx_page_state));
+	efx->rx_dma_len = (efx->type->rx_buffer_hash_size +
+			   EFX_MAX_FRAME_LEN(efx->net_dev->mtu) +
+			   efx->type->rx_buffer_padding);
+	efx->rx_buffer_order = get_order(sizeof(struct efx_rx_page_state) +
+					 EFX_PAGE_IP_ALIGN + efx->rx_dma_len);
 
 	/* We must keep at least one descriptor in a TX ring empty.
 	 * We could avoid this when the queue size does not exactly
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index f74411f..fc6770e 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -669,8 +669,7 @@ struct vfdi_status;
  * @n_channels: Number of channels in use
  * @n_rx_channels: Number of channels used for RX (= number of RX queues)
  * @n_tx_channels: Number of channels used for TX
- * @rx_buffer_len: RX buffer length, including start alignment but excluding
- *	any metadata
+ * @rx_dma_len: Current maximum RX DMA length
  * @rx_buffer_order: Order (log2) of number of pages for each RX buffer
  * @rx_hash_key: Toeplitz hash key for RSS
  * @rx_indir_table: Indirection table for RSS
@@ -786,7 +785,7 @@ struct efx_nic {
 	unsigned rss_spread;
 	unsigned tx_channel_offset;
 	unsigned n_tx_channels;
-	unsigned int rx_buffer_len;
+	unsigned int rx_dma_len;
 	unsigned int rx_buffer_order;
 	u8 rx_hash_key[40];
 	u32 rx_indir_table[128];
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index e7aa28e..31361db 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -27,8 +27,9 @@
 /* Number of RX descriptors pushed at once. */
 #define EFX_RX_BATCH  8
 
-/* Maximum size of a buffer sharing a page */
-#define EFX_RX_HALF_PAGE ((PAGE_SIZE >> 1) - sizeof(struct efx_rx_page_state))
+/* Maximum length for an RX descriptor sharing a page */
+#define EFX_RX_HALF_PAGE ((PAGE_SIZE >> 1) - sizeof(struct efx_rx_page_state) \
+			  - EFX_PAGE_IP_ALIGN)
 
 /* Size of buffer allocated for skb header area. */
 #define EFX_SKB_HEADERS  64u
@@ -52,10 +53,6 @@ static inline unsigned int efx_rx_buf_offset(struct efx_nic *efx,
 {
 	return buf->page_offset + efx->type->rx_buffer_hash_size;
 }
-static inline unsigned int efx_rx_buf_size(struct efx_nic *efx)
-{
-	return PAGE_SIZE << efx->rx_buffer_order;
-}
 
 static u8 *efx_rx_buf_eh(struct efx_nic *efx, struct efx_rx_buffer *buf)
 {
@@ -105,7 +102,7 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
 		if (unlikely(page == NULL))
 			return -ENOMEM;
 		dma_addr = dma_map_page(&efx->pci_dev->dev, page, 0,
-					efx_rx_buf_size(efx),
+					PAGE_SIZE << efx->rx_buffer_order,
 					DMA_FROM_DEVICE);
 		if (unlikely(dma_mapping_error(&efx->pci_dev->dev, dma_addr))) {
 			__free_pages(page, efx->rx_buffer_order);
@@ -124,12 +121,12 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
 		rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN;
 		rx_buf->page = page;
 		rx_buf->page_offset = page_offset + EFX_PAGE_IP_ALIGN;
-		rx_buf->len = efx->rx_buffer_len - EFX_PAGE_IP_ALIGN;
+		rx_buf->len = efx->rx_dma_len;
 		rx_buf->flags = 0;
 		++rx_queue->added_count;
 		++state->refcnt;
 
-		if ((~count & 1) && (efx->rx_buffer_len <= EFX_RX_HALF_PAGE)) {
+		if ((~count & 1) && (efx->rx_dma_len <= EFX_RX_HALF_PAGE)) {
 			/* Use the second half of the page */
 			get_page(page);
 			dma_addr += (PAGE_SIZE >> 1);
@@ -153,7 +150,7 @@ static void efx_unmap_rx_buffer(struct efx_nic *efx,
 		if (--state->refcnt == 0) {
 			dma_unmap_page(&efx->pci_dev->dev,
 				       state->dma_addr,
-				       efx_rx_buf_size(efx),
+				       PAGE_SIZE << efx->rx_buffer_order,
 				       DMA_FROM_DEVICE);
 		} else if (used_len) {
 			dma_sync_single_for_cpu(&efx->pci_dev->dev,
@@ -221,7 +218,7 @@ static void efx_recycle_rx_buffer(struct efx_channel *channel,
 
 	rx_buf->flags = 0;
 
-	if (efx->rx_buffer_len <= EFX_RX_HALF_PAGE &&
+	if (efx->rx_dma_len <= EFX_RX_HALF_PAGE &&
 	    page_count(rx_buf->page) == 1)
 		efx_resurrect_rx_buffer(rx_queue, rx_buf);
 
-- 
cgit v0.10.2


From 9bc2fc9b5272cc888fb10d5839f7188fa0bfdc90 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 29 Jan 2013 23:33:14 +0000
Subject: sfc: Make RX queue descriptor counts unsigned for consistency

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index fc6770e..3fd6dbe 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -272,9 +272,9 @@ struct efx_rx_queue {
 	bool enabled;
 	bool flush_pending;
 
-	int added_count;
-	int notified_count;
-	int removed_count;
+	unsigned int added_count;
+	unsigned int notified_count;
+	unsigned int removed_count;
 	unsigned int max_fill;
 	unsigned int fast_fill_trigger;
 	unsigned int min_fill;
-- 
cgit v0.10.2


From ff734ef4bca05fd5cd51b83d2e2a9f008b64f9a3 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 29 Jan 2013 23:33:14 +0000
Subject: sfc: Wrap __efx_rx_packet() with efx_rx_flush_packet()

The pipeline mechanism will need to change a bit for scattered
packets.  Add a wrapper to insulate efx_process_channel() from this.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 34b56ec..f8013c3 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -250,11 +250,7 @@ static int efx_process_channel(struct efx_channel *channel, int budget)
 		struct efx_rx_queue *rx_queue =
 			efx_channel_get_rx_queue(channel);
 
-		/* Deliver last RX packet. */
-		if (channel->rx_pkt) {
-			__efx_rx_packet(channel, channel->rx_pkt);
-			channel->rx_pkt = NULL;
-		}
+		efx_rx_flush_packet(channel);
 		if (rx_queue->enabled)
 			efx_fast_push_rx_descriptors(rx_queue);
 	}
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index 64c555e..00e7077 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -43,6 +43,13 @@ extern void __efx_rx_packet(struct efx_channel *channel,
 			    struct efx_rx_buffer *rx_buf);
 extern void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 			  unsigned int len, u16 flags);
+static inline void efx_rx_flush_packet(struct efx_channel *channel)
+{
+	if (channel->rx_pkt) {
+		__efx_rx_packet(channel, channel->rx_pkt);
+		channel->rx_pkt = NULL;
+	}
+}
 extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
 
 #define EFX_MAX_DMAQ_SIZE 4096UL
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 31361db..60f4eb7 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -465,8 +465,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	 */
 	rx_buf->len = len - efx->type->rx_buffer_hash_size;
 out:
-	if (channel->rx_pkt)
-		__efx_rx_packet(channel, channel->rx_pkt);
+	efx_rx_flush_packet(channel);
 	channel->rx_pkt = rx_buf;
 }
 
-- 
cgit v0.10.2


From b184f16b7feb9ede7d658ee6f2c77434d580d764 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 29 Jan 2013 23:33:15 +0000
Subject: sfc: Replace efx_rx_buf_eh() with simpler efx_rx_buf_va()

efx_rx_buf_va() returns the virtual address of the current start of
the buffer.  The callers must add the hash prefix size themselves.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 60f4eb7..23d67d1 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -54,9 +54,9 @@ static inline unsigned int efx_rx_buf_offset(struct efx_nic *efx,
 	return buf->page_offset + efx->type->rx_buffer_hash_size;
 }
 
-static u8 *efx_rx_buf_eh(struct efx_nic *efx, struct efx_rx_buffer *buf)
+static inline u8 *efx_rx_buf_va(struct efx_rx_buffer *buf)
 {
-	return page_address(buf->page) + efx_rx_buf_offset(efx, buf);
+	return page_address(buf->page) + buf->page_offset;
 }
 
 static inline u32 efx_rx_buf_hash(const u8 *eh)
@@ -458,7 +458,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	/* Prefetch nice and early so data will (hopefully) be in cache by
 	 * the time we look at it.
 	 */
-	prefetch(efx_rx_buf_eh(efx, rx_buf));
+	prefetch(efx_rx_buf_va(rx_buf) + efx->type->rx_buffer_hash_size);
 
 	/* Pipeline receives so that we give time for packet headers to be
 	 * prefetched into cache.
@@ -497,7 +497,7 @@ static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
 void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf)
 {
 	struct efx_nic *efx = channel->efx;
-	u8 *eh = efx_rx_buf_eh(efx, rx_buf);
+	u8 *eh = efx_rx_buf_va(rx_buf) + efx->type->rx_buffer_hash_size;
 
 	/* If we're in loopback test, then pass the packet directly to the
 	 * loopback layer, and free the rx_buf here
-- 
cgit v0.10.2


From 5036b7c7b9137bd084f28438396432348f20e0bc Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 29 Jan 2013 23:33:15 +0000
Subject: sfc: Explicitly prefetch RX hash prefix, not just Ethernet heade

Currently we prefetch from the Ethernet header, but we will also read
the hash prefix.  In practice they should be in the same cache line
and this won't hurt, but it is still pointless to add on the hash
prefix size.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 23d67d1..8e78a2f 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -458,7 +458,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	/* Prefetch nice and early so data will (hopefully) be in cache by
 	 * the time we look at it.
 	 */
-	prefetch(efx_rx_buf_va(rx_buf) + efx->type->rx_buffer_hash_size);
+	prefetch(efx_rx_buf_va(rx_buf));
 
 	/* Pipeline receives so that we give time for packet headers to be
 	 * prefetched into cache.
-- 
cgit v0.10.2


From b74e3e8cd6f952faf8797fca81a5a2ceace6b9aa Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 29 Jan 2013 23:33:15 +0000
Subject: sfc: Update RX buffer address together with length

Adjust rx_buf->page_offset when we eat the RX hash prefix.  Remove
efx_rx_buf_offset(), which is now redundant.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 3fd6dbe..1bc911f 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -208,7 +208,8 @@ struct efx_tx_queue {
  * @dma_addr: DMA base address of the buffer
  * @page: The associated page buffer.
  *	Will be %NULL if the buffer slot is currently free.
- * @page_offset: Offset within page
+ * @page_offset: If pending: offset in @page of DMA base address.
+ *	If completed: offset in @page of Ethernet header.
  * @len: If pending: length for DMA descriptor.
  *	If completed: received length, excluding hash prefix.
  * @flags: Flags for buffer and packet state.
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 8e78a2f..0451872 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -47,13 +47,6 @@ static unsigned int rx_refill_threshold;
  */
 #define EFX_RXD_HEAD_ROOM 2
 
-/* Offset of ethernet header within page */
-static inline unsigned int efx_rx_buf_offset(struct efx_nic *efx,
-					     struct efx_rx_buffer *buf)
-{
-	return buf->page_offset + efx->type->rx_buffer_hash_size;
-}
-
 static inline u8 *efx_rx_buf_va(struct efx_rx_buffer *buf)
 {
 	return page_address(buf->page) + buf->page_offset;
@@ -356,8 +349,7 @@ static void efx_rx_packet_gro(struct efx_channel *channel,
 	if (efx->net_dev->features & NETIF_F_RXHASH)
 		skb->rxhash = efx_rx_buf_hash(eh);
 
-	skb_fill_page_desc(skb, 0, page,
-			   efx_rx_buf_offset(efx, rx_buf), rx_buf->len);
+	skb_fill_page_desc(skb, 0, page, rx_buf->page_offset, rx_buf->len);
 
 	skb->len = rx_buf->len;
 	skb->data_len = rx_buf->len;
@@ -399,7 +391,7 @@ static struct sk_buff *efx_rx_mk_skb(struct efx_channel *channel,
 	if (rx_buf->len > hdr_len) {
 		skb->data_len = skb->len - hdr_len;
 		skb_fill_page_desc(skb, 0, rx_buf->page,
-				   efx_rx_buf_offset(efx, rx_buf) + hdr_len,
+				   rx_buf->page_offset + hdr_len,
 				   skb->data_len);
 	} else {
 		__free_pages(rx_buf->page, efx->rx_buffer_order);
@@ -460,10 +452,12 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	 */
 	prefetch(efx_rx_buf_va(rx_buf));
 
+	rx_buf->page_offset += efx->type->rx_buffer_hash_size;
+	rx_buf->len = len - efx->type->rx_buffer_hash_size;
+
 	/* Pipeline receives so that we give time for packet headers to be
 	 * prefetched into cache.
 	 */
-	rx_buf->len = len - efx->type->rx_buffer_hash_size;
 out:
 	efx_rx_flush_packet(channel);
 	channel->rx_pkt = rx_buf;
@@ -497,7 +491,7 @@ static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
 void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf)
 {
 	struct efx_nic *efx = channel->efx;
-	u8 *eh = efx_rx_buf_va(rx_buf) + efx->type->rx_buffer_hash_size;
+	u8 *eh = efx_rx_buf_va(rx_buf);
 
 	/* If we're in loopback test, then pass the packet directly to the
 	 * loopback layer, and free the rx_buf here
-- 
cgit v0.10.2


From 85740cdf0b84224a9fce62dc9150008ef8d6ab4e Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 29 Jan 2013 23:33:15 +0000
Subject: sfc: Enable RX DMA scattering where possible

Enable RX DMA scattering iff an RX buffer large enough for the current
MTU will not fit into a single page and the NIC supports DMA
scattering for kernel-mode RX queues.

On Falcon and Siena, the RX_USR_BUF_SIZE field is used as the DMA
limit for both all RX queues with scatter enabled.  Set it to 1824,
matching what Onload uses now.

Maintain a statistic for frames truncated due to lack of descriptors
(rx_nodesc_trunc).  This is distinct from rx_frm_trunc which may be
incremented when scattering is disabled and implies an over-length
frame.

Whenever an MTU change causes scattering to be turned on or off,
update filters that point to the PF queues, but leave others
unchanged, as VF drivers assume scattering is off.

Add n_frags parameters to various functions, and make them iterate:
- efx_rx_packet()
- efx_recycle_rx_buffers()
- efx_rx_mk_skb()
- efx_rx_deliver()

Make efx_handle_rx_event() responsible for updating
efx_rx_queue::removed_count.

Change the RX pipeline state to a starting ring index and number of
fragments, and make __efx_rx_packet() responsible for clearing it.

Based on earlier versions by David Riddoch and Jon Cooper.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index f8013c3..1213af5 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -88,8 +88,6 @@ const char *const efx_reset_type_names[] = {
 	[RESET_TYPE_MC_FAILURE]         = "MC_FAILURE",
 };
 
-#define EFX_MAX_MTU (9 * 1024)
-
 /* Reset workqueue. If any NIC has a hardware failure then a reset will be
  * queued onto this work queue. This is not a per-nic work queue, because
  * efx_reset_work() acquires the rtnl lock, so resets are naturally serialised.
@@ -627,9 +625,11 @@ fail:
  */
 static void efx_start_datapath(struct efx_nic *efx)
 {
+	bool old_rx_scatter = efx->rx_scatter;
 	struct efx_tx_queue *tx_queue;
 	struct efx_rx_queue *rx_queue;
 	struct efx_channel *channel;
+	size_t rx_buf_len;
 
 	/* Calculate the rx buffer allocation parameters required to
 	 * support the current MTU, including padding for header
@@ -638,8 +638,32 @@ static void efx_start_datapath(struct efx_nic *efx)
 	efx->rx_dma_len = (efx->type->rx_buffer_hash_size +
 			   EFX_MAX_FRAME_LEN(efx->net_dev->mtu) +
 			   efx->type->rx_buffer_padding);
-	efx->rx_buffer_order = get_order(sizeof(struct efx_rx_page_state) +
-					 EFX_PAGE_IP_ALIGN + efx->rx_dma_len);
+	rx_buf_len = (sizeof(struct efx_rx_page_state) +
+		      EFX_PAGE_IP_ALIGN + efx->rx_dma_len);
+	if (rx_buf_len <= PAGE_SIZE) {
+		efx->rx_scatter = false;
+		efx->rx_buffer_order = 0;
+		if (rx_buf_len <= PAGE_SIZE / 2)
+			efx->rx_buffer_truesize = PAGE_SIZE / 2;
+		else
+			efx->rx_buffer_truesize = PAGE_SIZE;
+	} else if (efx->type->can_rx_scatter) {
+		BUILD_BUG_ON(sizeof(struct efx_rx_page_state) +
+			     EFX_PAGE_IP_ALIGN + EFX_RX_USR_BUF_SIZE >
+			     PAGE_SIZE / 2);
+		efx->rx_scatter = true;
+		efx->rx_dma_len = EFX_RX_USR_BUF_SIZE;
+		efx->rx_buffer_order = 0;
+		efx->rx_buffer_truesize = PAGE_SIZE / 2;
+	} else {
+		efx->rx_scatter = false;
+		efx->rx_buffer_order = get_order(rx_buf_len);
+		efx->rx_buffer_truesize = PAGE_SIZE << efx->rx_buffer_order;
+	}
+
+	/* RX filters also have scatter-enabled flags */
+	if (efx->rx_scatter != old_rx_scatter)
+		efx_filter_update_rx_scatter(efx);
 
 	/* We must keep at least one descriptor in a TX ring empty.
 	 * We could avoid this when the queue size does not exactly
@@ -661,7 +685,7 @@ static void efx_start_datapath(struct efx_nic *efx)
 			efx_nic_generate_fill_event(rx_queue);
 		}
 
-		WARN_ON(channel->rx_pkt != NULL);
+		WARN_ON(channel->rx_pkt_n_frags);
 	}
 
 	if (netif_device_present(efx->net_dev))
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index 00e7077..211da79 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -39,16 +39,14 @@ extern void efx_init_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_fini_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue);
 extern void efx_rx_slow_fill(unsigned long context);
-extern void __efx_rx_packet(struct efx_channel *channel,
-			    struct efx_rx_buffer *rx_buf);
-extern void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
+extern void __efx_rx_packet(struct efx_channel *channel);
+extern void efx_rx_packet(struct efx_rx_queue *rx_queue,
+			  unsigned int index, unsigned int n_frags,
 			  unsigned int len, u16 flags);
 static inline void efx_rx_flush_packet(struct efx_channel *channel)
 {
-	if (channel->rx_pkt) {
-		__efx_rx_packet(channel, channel->rx_pkt);
-		channel->rx_pkt = NULL;
-	}
+	if (channel->rx_pkt_n_frags)
+		__efx_rx_packet(channel);
 }
 extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
 
@@ -73,6 +71,7 @@ extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
 extern int efx_probe_filters(struct efx_nic *efx);
 extern void efx_restore_filters(struct efx_nic *efx);
 extern void efx_remove_filters(struct efx_nic *efx);
+extern void efx_filter_update_rx_scatter(struct efx_nic *efx);
 extern s32 efx_filter_insert_filter(struct efx_nic *efx,
 				    struct efx_filter_spec *spec,
 				    bool replace);
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index 8e61cd0..6e76817 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -154,6 +154,7 @@ static const struct efx_ethtool_stat efx_ethtool_stats[] = {
 	EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_tcp_udp_chksum_err),
 	EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_mcast_mismatch),
 	EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_frm_trunc),
+	EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_nodesc_trunc),
 };
 
 /* Number of ethtool statistics */
@@ -978,7 +979,8 @@ static int efx_ethtool_set_class_rule(struct efx_nic *efx,
 	     rule->m_ext.data[1]))
 		return -EINVAL;
 
-	efx_filter_init_rx(&spec, EFX_FILTER_PRI_MANUAL, 0,
+	efx_filter_init_rx(&spec, EFX_FILTER_PRI_MANUAL,
+			   efx->rx_scatter ? EFX_FILTER_FLAG_RX_SCATTER : 0,
 			   (rule->ring_cookie == RX_CLS_FLOW_DISC) ?
 			   0xfff : rule->ring_cookie);
 
diff --git a/drivers/net/ethernet/sfc/falcon.c b/drivers/net/ethernet/sfc/falcon.c
index 49bcd19..4486102 100644
--- a/drivers/net/ethernet/sfc/falcon.c
+++ b/drivers/net/ethernet/sfc/falcon.c
@@ -1546,10 +1546,6 @@ static int falcon_probe_nic(struct efx_nic *efx)
 
 static void falcon_init_rx_cfg(struct efx_nic *efx)
 {
-	/* Prior to Siena the RX DMA engine will split each frame at
-	 * intervals of RX_USR_BUF_SIZE (32-byte units). We set it to
-	 * be so large that that never happens. */
-	const unsigned huge_buf_size = (3 * 4096) >> 5;
 	/* RX control FIFO thresholds (32 entries) */
 	const unsigned ctrl_xon_thr = 20;
 	const unsigned ctrl_xoff_thr = 25;
@@ -1557,10 +1553,15 @@ static void falcon_init_rx_cfg(struct efx_nic *efx)
 
 	efx_reado(efx, &reg, FR_AZ_RX_CFG);
 	if (efx_nic_rev(efx) <= EFX_REV_FALCON_A1) {
-		/* Data FIFO size is 5.5K */
+		/* Data FIFO size is 5.5K.  The RX DMA engine only
+		 * supports scattering for user-mode queues, but will
+		 * split DMA writes at intervals of RX_USR_BUF_SIZE
+		 * (32-byte units) even for kernel-mode queues.  We
+		 * set it to be so large that that never happens.
+		 */
 		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_DESC_PUSH_EN, 0);
 		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_USR_BUF_SIZE,
-				    huge_buf_size);
+				    (3 * 4096) >> 5);
 		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XON_MAC_TH, 512 >> 8);
 		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XOFF_MAC_TH, 2048 >> 8);
 		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XON_TX_TH, ctrl_xon_thr);
@@ -1569,7 +1570,7 @@ static void falcon_init_rx_cfg(struct efx_nic *efx)
 		/* Data FIFO size is 80K; register fields moved */
 		EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_DESC_PUSH_EN, 0);
 		EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_USR_BUF_SIZE,
-				    huge_buf_size);
+				    EFX_RX_USR_BUF_SIZE >> 5);
 		/* Send XON and XOFF at ~3 * max MTU away from empty/full */
 		EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_XON_MAC_TH, 27648 >> 8);
 		EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_XOFF_MAC_TH, 54272 >> 8);
@@ -1815,6 +1816,7 @@ const struct efx_nic_type falcon_a1_nic_type = {
 	.evq_rptr_tbl_base = FR_AA_EVQ_RPTR_KER,
 	.max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH),
 	.rx_buffer_padding = 0x24,
+	.can_rx_scatter = false,
 	.max_interrupt_mode = EFX_INT_MODE_MSI,
 	.phys_addr_channels = 4,
 	.timer_period_max =  1 << FRF_AB_TC_TIMER_VAL_WIDTH,
@@ -1865,6 +1867,7 @@ const struct efx_nic_type falcon_b0_nic_type = {
 	.max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH),
 	.rx_buffer_hash_size = 0x10,
 	.rx_buffer_padding = 0,
+	.can_rx_scatter = true,
 	.max_interrupt_mode = EFX_INT_MODE_MSIX,
 	.phys_addr_channels = 32, /* Hardware limit is 64, but the legacy
 				   * interrupt handler only supports 32
diff --git a/drivers/net/ethernet/sfc/filter.c b/drivers/net/ethernet/sfc/filter.c
index 61b4408..2397f0e 100644
--- a/drivers/net/ethernet/sfc/filter.c
+++ b/drivers/net/ethernet/sfc/filter.c
@@ -172,6 +172,25 @@ static void efx_filter_push_rx_config(struct efx_nic *efx)
 			filter_ctl, FRF_CZ_MULTICAST_NOMATCH_RSS_ENABLED,
 			!!(table->spec[EFX_FILTER_INDEX_MC_DEF].flags &
 			   EFX_FILTER_FLAG_RX_RSS));
+
+		/* There is a single bit to enable RX scatter for all
+		 * unmatched packets.  Only set it if scatter is
+		 * enabled in both filter specs.
+		 */
+		EFX_SET_OWORD_FIELD(
+			filter_ctl, FRF_BZ_SCATTER_ENBL_NO_MATCH_Q,
+			!!(table->spec[EFX_FILTER_INDEX_UC_DEF].flags &
+			   table->spec[EFX_FILTER_INDEX_MC_DEF].flags &
+			   EFX_FILTER_FLAG_RX_SCATTER));
+	} else if (efx_nic_rev(efx) >= EFX_REV_FALCON_B0) {
+		/* We don't expose 'default' filters because unmatched
+		 * packets always go to the queue number found in the
+		 * RSS table.  But we still need to set the RX scatter
+		 * bit here.
+		 */
+		EFX_SET_OWORD_FIELD(
+			filter_ctl, FRF_BZ_SCATTER_ENBL_NO_MATCH_Q,
+			efx->rx_scatter);
 	}
 
 	efx_writeo(efx, &filter_ctl, FR_BZ_RX_FILTER_CTL);
@@ -413,13 +432,18 @@ static void efx_filter_reset_rx_def(struct efx_nic *efx, unsigned filter_idx)
 	struct efx_filter_state *state = efx->filter_state;
 	struct efx_filter_table *table = &state->table[EFX_FILTER_TABLE_RX_DEF];
 	struct efx_filter_spec *spec = &table->spec[filter_idx];
+	enum efx_filter_flags flags = 0;
 
 	/* If there's only one channel then disable RSS for non VF
 	 * traffic, thereby allowing VFs to use RSS when the PF can't.
 	 */
-	efx_filter_init_rx(spec, EFX_FILTER_PRI_MANUAL,
-			   efx->n_rx_channels > 1 ? EFX_FILTER_FLAG_RX_RSS : 0,
-			   0);
+	if (efx->n_rx_channels > 1)
+		flags |= EFX_FILTER_FLAG_RX_RSS;
+
+	if (efx->rx_scatter)
+		flags |= EFX_FILTER_FLAG_RX_SCATTER;
+
+	efx_filter_init_rx(spec, EFX_FILTER_PRI_MANUAL, flags, 0);
 	spec->type = EFX_FILTER_UC_DEF + filter_idx;
 	table->used_bitmap[0] |= 1 << filter_idx;
 }
@@ -1101,6 +1125,50 @@ void efx_remove_filters(struct efx_nic *efx)
 	kfree(state);
 }
 
+/* Update scatter enable flags for filters pointing to our own RX queues */
+void efx_filter_update_rx_scatter(struct efx_nic *efx)
+{
+	struct efx_filter_state *state = efx->filter_state;
+	enum efx_filter_table_id table_id;
+	struct efx_filter_table *table;
+	efx_oword_t filter;
+	unsigned int filter_idx;
+
+	spin_lock_bh(&state->lock);
+
+	for (table_id = EFX_FILTER_TABLE_RX_IP;
+	     table_id <= EFX_FILTER_TABLE_RX_DEF;
+	     table_id++) {
+		table = &state->table[table_id];
+
+		for (filter_idx = 0; filter_idx < table->size; filter_idx++) {
+			if (!test_bit(filter_idx, table->used_bitmap) ||
+			    table->spec[filter_idx].dmaq_id >=
+			    efx->n_rx_channels)
+				continue;
+
+			if (efx->rx_scatter)
+				table->spec[filter_idx].flags |=
+					EFX_FILTER_FLAG_RX_SCATTER;
+			else
+				table->spec[filter_idx].flags &=
+					~EFX_FILTER_FLAG_RX_SCATTER;
+
+			if (table_id == EFX_FILTER_TABLE_RX_DEF)
+				/* Pushed by efx_filter_push_rx_config() */
+				continue;
+
+			efx_filter_build(&filter, &table->spec[filter_idx]);
+			efx_writeo(efx, &filter,
+				   table->offset + table->step * filter_idx);
+		}
+	}
+
+	efx_filter_push_rx_config(efx);
+
+	spin_unlock_bh(&state->lock);
+}
+
 #ifdef CONFIG_RFS_ACCEL
 
 int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 1bc911f..e41b54b 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -69,6 +69,12 @@
 #define EFX_TXQ_TYPES		4
 #define EFX_MAX_TX_QUEUES	(EFX_TXQ_TYPES * EFX_MAX_CHANNELS)
 
+/* Maximum possible MTU the driver supports */
+#define EFX_MAX_MTU (9 * 1024)
+
+/* Size of an RX scatter buffer.  Small enough to pack 2 into a 4K page. */
+#define EFX_RX_USR_BUF_SIZE 1824
+
 /* Forward declare Precision Time Protocol (PTP) support structure. */
 struct efx_ptp_data;
 
@@ -212,7 +218,8 @@ struct efx_tx_queue {
  *	If completed: offset in @page of Ethernet header.
  * @len: If pending: length for DMA descriptor.
  *	If completed: received length, excluding hash prefix.
- * @flags: Flags for buffer and packet state.
+ * @flags: Flags for buffer and packet state.  These are only set on the
+ *	first buffer of a scattered packet.
  */
 struct efx_rx_buffer {
 	dma_addr_t dma_addr;
@@ -256,6 +263,7 @@ struct efx_rx_page_state {
  * @added_count: Number of buffers added to the receive queue.
  * @notified_count: Number of buffers given to NIC (<= @added_count).
  * @removed_count: Number of buffers removed from the receive queue.
+ * @scatter_n: Number of buffers used by current packet
  * @max_fill: RX descriptor maximum fill level (<= ring size)
  * @fast_fill_trigger: RX descriptor fill level that will trigger a fast fill
  *	(<= @max_fill)
@@ -276,6 +284,7 @@ struct efx_rx_queue {
 	unsigned int added_count;
 	unsigned int notified_count;
 	unsigned int removed_count;
+	unsigned int scatter_n;
 	unsigned int max_fill;
 	unsigned int fast_fill_trigger;
 	unsigned int min_fill;
@@ -335,6 +344,12 @@ enum efx_rx_alloc_method {
  * @n_rx_frm_trunc: Count of RX_FRM_TRUNC errors
  * @n_rx_overlength: Count of RX_OVERLENGTH errors
  * @n_skbuff_leaks: Count of skbuffs leaked due to RX overrun
+ * @n_rx_nodesc_trunc: Number of RX packets truncated and then dropped due to
+ *	lack of descriptors
+ * @rx_pkt_n_frags: Number of fragments in next packet to be delivered by
+ *	__efx_rx_packet(), or zero if there is none
+ * @rx_pkt_index: Ring index of first buffer for next packet to be delivered
+ *	by __efx_rx_packet(), if @rx_pkt_n_frags != 0
  * @rx_queue: RX queue for this channel
  * @tx_queue: TX queues for this channel
  */
@@ -366,11 +381,10 @@ struct efx_channel {
 	unsigned n_rx_frm_trunc;
 	unsigned n_rx_overlength;
 	unsigned n_skbuff_leaks;
+	unsigned int n_rx_nodesc_trunc;
 
-	/* Used to pipeline received packets in order to optimise memory
-	 * access with prefetches.
-	 */
-	struct efx_rx_buffer *rx_pkt;
+	unsigned int rx_pkt_n_frags;
+	unsigned int rx_pkt_index;
 
 	struct efx_rx_queue rx_queue;
 	struct efx_tx_queue tx_queue[EFX_TXQ_TYPES];
@@ -672,8 +686,11 @@ struct vfdi_status;
  * @n_tx_channels: Number of channels used for TX
  * @rx_dma_len: Current maximum RX DMA length
  * @rx_buffer_order: Order (log2) of number of pages for each RX buffer
+ * @rx_buffer_truesize: Amortised allocation size of an RX buffer,
+ *	for use in sk_buff::truesize
  * @rx_hash_key: Toeplitz hash key for RSS
  * @rx_indir_table: Indirection table for RSS
+ * @rx_scatter: Scatter mode enabled for receives
  * @int_error_count: Number of internal errors seen recently
  * @int_error_expire: Time at which error count will be expired
  * @irq_status: Interrupt status buffer
@@ -788,8 +805,10 @@ struct efx_nic {
 	unsigned n_tx_channels;
 	unsigned int rx_dma_len;
 	unsigned int rx_buffer_order;
+	unsigned int rx_buffer_truesize;
 	u8 rx_hash_key[40];
 	u32 rx_indir_table[128];
+	bool rx_scatter;
 
 	unsigned int_error_count;
 	unsigned long int_error_expire;
@@ -920,8 +939,9 @@ static inline unsigned int efx_port_num(struct efx_nic *efx)
  * @evq_ptr_tbl_base: Event queue pointer table base address
  * @evq_rptr_tbl_base: Event queue read-pointer table base address
  * @max_dma_mask: Maximum possible DMA mask
- * @rx_buffer_hash_size: Size of hash at start of RX buffer
- * @rx_buffer_padding: Size of padding at end of RX buffer
+ * @rx_buffer_hash_size: Size of hash at start of RX packet
+ * @rx_buffer_padding: Size of padding at end of RX packet
+ * @can_rx_scatter: NIC is able to scatter packet to multiple buffers
  * @max_interrupt_mode: Highest capability interrupt mode supported
  *	from &enum efx_init_mode.
  * @phys_addr_channels: Number of channels with physically addressed
@@ -969,6 +989,7 @@ struct efx_nic_type {
 	u64 max_dma_mask;
 	unsigned int rx_buffer_hash_size;
 	unsigned int rx_buffer_padding;
+	bool can_rx_scatter;
 	unsigned int max_interrupt_mode;
 	unsigned int phys_addr_channels;
 	unsigned int timer_period_max;
diff --git a/drivers/net/ethernet/sfc/nic.c b/drivers/net/ethernet/sfc/nic.c
index 0ad790c..f9f5df8 100644
--- a/drivers/net/ethernet/sfc/nic.c
+++ b/drivers/net/ethernet/sfc/nic.c
@@ -591,12 +591,22 @@ void efx_nic_init_rx(struct efx_rx_queue *rx_queue)
 	struct efx_nic *efx = rx_queue->efx;
 	bool is_b0 = efx_nic_rev(efx) >= EFX_REV_FALCON_B0;
 	bool iscsi_digest_en = is_b0;
+	bool jumbo_en;
+
+	/* For kernel-mode queues in Falcon A1, the JUMBO flag enables
+	 * DMA to continue after a PCIe page boundary (and scattering
+	 * is not possible).  In Falcon B0 and Siena, it enables
+	 * scatter.
+	 */
+	jumbo_en = !is_b0 || efx->rx_scatter;
 
 	netif_dbg(efx, hw, efx->net_dev,
 		  "RX queue %d ring in special buffers %d-%d\n",
 		  efx_rx_queue_index(rx_queue), rx_queue->rxd.index,
 		  rx_queue->rxd.index + rx_queue->rxd.entries - 1);
 
+	rx_queue->scatter_n = 0;
+
 	/* Pin RX descriptor ring */
 	efx_init_special_buffer(efx, &rx_queue->rxd);
 
@@ -613,8 +623,7 @@ void efx_nic_init_rx(struct efx_rx_queue *rx_queue)
 			      FRF_AZ_RX_DESCQ_SIZE,
 			      __ffs(rx_queue->rxd.entries),
 			      FRF_AZ_RX_DESCQ_TYPE, 0 /* kernel queue */ ,
-			      /* For >=B0 this is scatter so disable */
-			      FRF_AZ_RX_DESCQ_JUMBO, !is_b0,
+			      FRF_AZ_RX_DESCQ_JUMBO, jumbo_en,
 			      FRF_AZ_RX_DESCQ_EN, 1);
 	efx_writeo_table(efx, &rx_desc_ptr, efx->type->rxd_ptr_tbl_base,
 			 efx_rx_queue_index(rx_queue));
@@ -968,13 +977,24 @@ static u16 efx_handle_rx_not_ok(struct efx_rx_queue *rx_queue,
 		EFX_RX_PKT_DISCARD : 0;
 }
 
-/* Handle receive events that are not in-order. */
-static void
+/* Handle receive events that are not in-order. Return true if this
+ * can be handled as a partial packet discard, false if it's more
+ * serious.
+ */
+static bool
 efx_handle_rx_bad_index(struct efx_rx_queue *rx_queue, unsigned index)
 {
+	struct efx_channel *channel = efx_rx_queue_channel(rx_queue);
 	struct efx_nic *efx = rx_queue->efx;
 	unsigned expected, dropped;
 
+	if (rx_queue->scatter_n &&
+	    index == ((rx_queue->removed_count + rx_queue->scatter_n - 1) &
+		      rx_queue->ptr_mask)) {
+		++channel->n_rx_nodesc_trunc;
+		return true;
+	}
+
 	expected = rx_queue->removed_count & rx_queue->ptr_mask;
 	dropped = (index - expected) & rx_queue->ptr_mask;
 	netif_info(efx, rx_err, efx->net_dev,
@@ -983,6 +1003,7 @@ efx_handle_rx_bad_index(struct efx_rx_queue *rx_queue, unsigned index)
 
 	efx_schedule_reset(efx, EFX_WORKAROUND_5676(efx) ?
 			   RESET_TYPE_RX_RECOVERY : RESET_TYPE_DISABLE);
+	return false;
 }
 
 /* Handle a packet received event
@@ -998,7 +1019,7 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)
 	unsigned int rx_ev_desc_ptr, rx_ev_byte_cnt;
 	unsigned int rx_ev_hdr_type, rx_ev_mcast_pkt;
 	unsigned expected_ptr;
-	bool rx_ev_pkt_ok;
+	bool rx_ev_pkt_ok, rx_ev_sop, rx_ev_cont;
 	u16 flags;
 	struct efx_rx_queue *rx_queue;
 	struct efx_nic *efx = channel->efx;
@@ -1006,21 +1027,56 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)
 	if (unlikely(ACCESS_ONCE(efx->reset_pending)))
 		return;
 
-	/* Basic packet information */
-	rx_ev_byte_cnt = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_BYTE_CNT);
-	rx_ev_pkt_ok = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_PKT_OK);
-	rx_ev_hdr_type = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_HDR_TYPE);
-	WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT));
-	WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_SOP) != 1);
+	rx_ev_cont = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT);
+	rx_ev_sop = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_SOP);
 	WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_Q_LABEL) !=
 		channel->channel);
 
 	rx_queue = efx_channel_get_rx_queue(channel);
 
 	rx_ev_desc_ptr = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_DESC_PTR);
-	expected_ptr = rx_queue->removed_count & rx_queue->ptr_mask;
-	if (unlikely(rx_ev_desc_ptr != expected_ptr))
-		efx_handle_rx_bad_index(rx_queue, rx_ev_desc_ptr);
+	expected_ptr = ((rx_queue->removed_count + rx_queue->scatter_n) &
+			rx_queue->ptr_mask);
+
+	/* Check for partial drops and other errors */
+	if (unlikely(rx_ev_desc_ptr != expected_ptr) ||
+	    unlikely(rx_ev_sop != (rx_queue->scatter_n == 0))) {
+		if (rx_ev_desc_ptr != expected_ptr &&
+		    !efx_handle_rx_bad_index(rx_queue, rx_ev_desc_ptr))
+			return;
+
+		/* Discard all pending fragments */
+		if (rx_queue->scatter_n) {
+			efx_rx_packet(
+				rx_queue,
+				rx_queue->removed_count & rx_queue->ptr_mask,
+				rx_queue->scatter_n, 0, EFX_RX_PKT_DISCARD);
+			rx_queue->removed_count += rx_queue->scatter_n;
+			rx_queue->scatter_n = 0;
+		}
+
+		/* Return if there is no new fragment */
+		if (rx_ev_desc_ptr != expected_ptr)
+			return;
+
+		/* Discard new fragment if not SOP */
+		if (!rx_ev_sop) {
+			efx_rx_packet(
+				rx_queue,
+				rx_queue->removed_count & rx_queue->ptr_mask,
+				1, 0, EFX_RX_PKT_DISCARD);
+			++rx_queue->removed_count;
+			return;
+		}
+	}
+
+	++rx_queue->scatter_n;
+	if (rx_ev_cont)
+		return;
+
+	rx_ev_byte_cnt = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_BYTE_CNT);
+	rx_ev_pkt_ok = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_PKT_OK);
+	rx_ev_hdr_type = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_HDR_TYPE);
 
 	if (likely(rx_ev_pkt_ok)) {
 		/* If packet is marked as OK and packet type is TCP/IP or
@@ -1048,7 +1104,11 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)
 	channel->irq_mod_score += 2;
 
 	/* Handle received packet */
-	efx_rx_packet(rx_queue, rx_ev_desc_ptr, rx_ev_byte_cnt, flags);
+	efx_rx_packet(rx_queue,
+		      rx_queue->removed_count & rx_queue->ptr_mask,
+		      rx_queue->scatter_n, rx_ev_byte_cnt, flags);
+	rx_queue->removed_count += rx_queue->scatter_n;
+	rx_queue->scatter_n = 0;
 }
 
 /* If this flush done event corresponds to a &struct efx_tx_queue, then
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 0451872..88aa1ff 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -39,13 +39,17 @@
  */
 static unsigned int rx_refill_threshold;
 
+/* Each packet can consume up to ceil(max_frame_len / buffer_size) buffers */
+#define EFX_RX_MAX_FRAGS DIV_ROUND_UP(EFX_MAX_FRAME_LEN(EFX_MAX_MTU), \
+				      EFX_RX_USR_BUF_SIZE)
+
 /*
  * RX maximum head room required.
  *
- * This must be at least 1 to prevent overflow and at least 2 to allow
- * pipelined receives.
+ * This must be at least 1 to prevent overflow, plus one packet-worth
+ * to allow pipelined receives.
  */
-#define EFX_RXD_HEAD_ROOM 2
+#define EFX_RXD_HEAD_ROOM (1 + EFX_RX_MAX_FRAGS)
 
 static inline u8 *efx_rx_buf_va(struct efx_rx_buffer *buf)
 {
@@ -66,6 +70,15 @@ static inline u32 efx_rx_buf_hash(const u8 *eh)
 #endif
 }
 
+static inline struct efx_rx_buffer *
+efx_rx_buf_next(struct efx_rx_queue *rx_queue, struct efx_rx_buffer *rx_buf)
+{
+	if (unlikely(rx_buf == efx_rx_buffer(rx_queue, rx_queue->ptr_mask)))
+		return efx_rx_buffer(rx_queue, 0);
+	else
+		return rx_buf + 1;
+}
+
 /**
  * efx_init_rx_buffers - create EFX_RX_BATCH page-based RX buffers
  *
@@ -199,28 +212,34 @@ static void efx_resurrect_rx_buffer(struct efx_rx_queue *rx_queue,
 	++rx_queue->added_count;
 }
 
-/* Recycle the given rx buffer directly back into the rx_queue. There is
- * always room to add this buffer, because we've just popped a buffer. */
-static void efx_recycle_rx_buffer(struct efx_channel *channel,
-				  struct efx_rx_buffer *rx_buf)
+/* Recycle buffers directly back into the rx_queue. There is always
+ * room to add these buffer, because we've just popped them.
+ */
+static void efx_recycle_rx_buffers(struct efx_channel *channel,
+				   struct efx_rx_buffer *rx_buf,
+				   unsigned int n_frags)
 {
 	struct efx_nic *efx = channel->efx;
 	struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
 	struct efx_rx_buffer *new_buf;
 	unsigned index;
 
-	rx_buf->flags = 0;
+	do {
+		rx_buf->flags = 0;
 
-	if (efx->rx_dma_len <= EFX_RX_HALF_PAGE &&
-	    page_count(rx_buf->page) == 1)
-		efx_resurrect_rx_buffer(rx_queue, rx_buf);
+		if (efx->rx_dma_len <= EFX_RX_HALF_PAGE &&
+		    page_count(rx_buf->page) == 1)
+			efx_resurrect_rx_buffer(rx_queue, rx_buf);
 
-	index = rx_queue->added_count & rx_queue->ptr_mask;
-	new_buf = efx_rx_buffer(rx_queue, index);
+		index = rx_queue->added_count & rx_queue->ptr_mask;
+		new_buf = efx_rx_buffer(rx_queue, index);
 
-	memcpy(new_buf, rx_buf, sizeof(*new_buf));
-	rx_buf->page = NULL;
-	++rx_queue->added_count;
+		memcpy(new_buf, rx_buf, sizeof(*new_buf));
+		rx_buf->page = NULL;
+		++rx_queue->added_count;
+
+		rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
+	} while (--n_frags);
 }
 
 /**
@@ -328,46 +347,56 @@ static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
 /* Pass a received packet up through GRO.  GRO can handle pages
  * regardless of checksum state and skbs with a good checksum.
  */
-static void efx_rx_packet_gro(struct efx_channel *channel,
-			      struct efx_rx_buffer *rx_buf,
-			      const u8 *eh)
+static void
+efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf,
+		  unsigned int n_frags, u8 *eh)
 {
 	struct napi_struct *napi = &channel->napi_str;
 	gro_result_t gro_result;
 	struct efx_nic *efx = channel->efx;
-	struct page *page = rx_buf->page;
 	struct sk_buff *skb;
 
-	rx_buf->page = NULL;
-
 	skb = napi_get_frags(napi);
-	if (!skb) {
-		put_page(page);
+	if (unlikely(!skb)) {
+		while (n_frags--) {
+			put_page(rx_buf->page);
+			rx_buf->page = NULL;
+			rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
+		}
 		return;
 	}
 
 	if (efx->net_dev->features & NETIF_F_RXHASH)
 		skb->rxhash = efx_rx_buf_hash(eh);
-
-	skb_fill_page_desc(skb, 0, page, rx_buf->page_offset, rx_buf->len);
-
-	skb->len = rx_buf->len;
-	skb->data_len = rx_buf->len;
-	skb->truesize += rx_buf->len;
 	skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
 			  CHECKSUM_UNNECESSARY : CHECKSUM_NONE);
 
-	skb_record_rx_queue(skb, channel->rx_queue.core_index);
+	for (;;) {
+		skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
+				   rx_buf->page, rx_buf->page_offset,
+				   rx_buf->len);
+		rx_buf->page = NULL;
+		skb->len += rx_buf->len;
+		if (skb_shinfo(skb)->nr_frags == n_frags)
+			break;
 
-		gro_result = napi_gro_frags(napi);
+		rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
+	}
+
+	skb->data_len = skb->len;
+	skb->truesize += n_frags * efx->rx_buffer_truesize;
+
+	skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
+	gro_result = napi_gro_frags(napi);
 	if (gro_result != GRO_DROP)
 		channel->irq_mod_score += 2;
 }
 
-/* Allocate and construct an SKB around a struct page.*/
+/* Allocate and construct an SKB around page fragments */
 static struct sk_buff *efx_rx_mk_skb(struct efx_channel *channel,
 				     struct efx_rx_buffer *rx_buf,
+				     unsigned int n_frags,
 				     u8 *eh, int hdr_len)
 {
 	struct efx_nic *efx = channel->efx;
@@ -381,25 +410,32 @@ static struct sk_buff *efx_rx_mk_skb(struct efx_channel *channel,
 	EFX_BUG_ON_PARANOID(rx_buf->len < hdr_len);
 
 	skb_reserve(skb, EFX_PAGE_SKB_ALIGN);
+	memcpy(__skb_put(skb, hdr_len), eh, hdr_len);
 
-	skb->len = rx_buf->len;
-	skb->truesize = rx_buf->len + sizeof(struct sk_buff);
-	memcpy(skb->data, eh, hdr_len);
-	skb->tail += hdr_len;
-
-	/* Append the remaining page onto the frag list */
+	/* Append the remaining page(s) onto the frag list */
 	if (rx_buf->len > hdr_len) {
-		skb->data_len = skb->len - hdr_len;
-		skb_fill_page_desc(skb, 0, rx_buf->page,
-				   rx_buf->page_offset + hdr_len,
-				   skb->data_len);
+		rx_buf->page_offset += hdr_len;
+		rx_buf->len -= hdr_len;
+
+		for (;;) {
+			skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
+					   rx_buf->page, rx_buf->page_offset,
+					   rx_buf->len);
+			rx_buf->page = NULL;
+			skb->len += rx_buf->len;
+			skb->data_len += rx_buf->len;
+			if (skb_shinfo(skb)->nr_frags == n_frags)
+				break;
+
+			rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
+		}
 	} else {
 		__free_pages(rx_buf->page, efx->rx_buffer_order);
-		skb->data_len = 0;
+		rx_buf->page = NULL;
+		n_frags = 0;
 	}
 
-	/* Ownership has transferred from the rx_buf to skb */
-	rx_buf->page = NULL;
+	skb->truesize += n_frags * efx->rx_buffer_truesize;
 
 	/* Move past the ethernet header */
 	skb->protocol = eth_type_trans(skb, efx->net_dev);
@@ -408,7 +444,7 @@ static struct sk_buff *efx_rx_mk_skb(struct efx_channel *channel,
 }
 
 void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
-		   unsigned int len, u16 flags)
+		   unsigned int n_frags, unsigned int len, u16 flags)
 {
 	struct efx_nic *efx = rx_queue->efx;
 	struct efx_channel *channel = efx_rx_queue_channel(rx_queue);
@@ -417,35 +453,43 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	rx_buf = efx_rx_buffer(rx_queue, index);
 	rx_buf->flags |= flags;
 
-	/* This allows the refill path to post another buffer.
-	 * EFX_RXD_HEAD_ROOM ensures that the slot we are using
-	 * isn't overwritten yet.
-	 */
-	rx_queue->removed_count++;
-
-	/* Validate the length encoded in the event vs the descriptor pushed */
-	efx_rx_packet__check_len(rx_queue, rx_buf, len);
+	/* Validate the number of fragments and completed length */
+	if (n_frags == 1) {
+		efx_rx_packet__check_len(rx_queue, rx_buf, len);
+	} else if (unlikely(n_frags > EFX_RX_MAX_FRAGS) ||
+		   unlikely(len <= (n_frags - 1) * EFX_RX_USR_BUF_SIZE) ||
+		   unlikely(len > n_frags * EFX_RX_USR_BUF_SIZE) ||
+		   unlikely(!efx->rx_scatter)) {
+		/* If this isn't an explicit discard request, either
+		 * the hardware or the driver is broken.
+		 */
+		WARN_ON(!(len == 0 && rx_buf->flags & EFX_RX_PKT_DISCARD));
+		rx_buf->flags |= EFX_RX_PKT_DISCARD;
+	}
 
 	netif_vdbg(efx, rx_status, efx->net_dev,
-		   "RX queue %d received id %x at %llx+%x %s%s\n",
+		   "RX queue %d received ids %x-%x len %d %s%s\n",
 		   efx_rx_queue_index(rx_queue), index,
-		   (unsigned long long)rx_buf->dma_addr, len,
+		   (index + n_frags - 1) & rx_queue->ptr_mask, len,
 		   (rx_buf->flags & EFX_RX_PKT_CSUMMED) ? " [SUMMED]" : "",
 		   (rx_buf->flags & EFX_RX_PKT_DISCARD) ? " [DISCARD]" : "");
 
-	/* Discard packet, if instructed to do so */
+	/* Discard packet, if instructed to do so.  Process the
+	 * previous receive first.
+	 */
 	if (unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) {
-		efx_recycle_rx_buffer(channel, rx_buf);
-
-		/* Don't hold off the previous receive */
-		rx_buf = NULL;
-		goto out;
+		efx_rx_flush_packet(channel);
+		efx_recycle_rx_buffers(channel, rx_buf, n_frags);
+		return;
 	}
 
+	if (n_frags == 1)
+		rx_buf->len = len;
+
 	/* Release and/or sync DMA mapping - assumes all RX buffers
 	 * consumed in-order per RX queue
 	 */
-	efx_unmap_rx_buffer(efx, rx_buf, len);
+	efx_unmap_rx_buffer(efx, rx_buf, rx_buf->len);
 
 	/* Prefetch nice and early so data will (hopefully) be in cache by
 	 * the time we look at it.
@@ -453,23 +497,40 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	prefetch(efx_rx_buf_va(rx_buf));
 
 	rx_buf->page_offset += efx->type->rx_buffer_hash_size;
-	rx_buf->len = len - efx->type->rx_buffer_hash_size;
+	rx_buf->len -= efx->type->rx_buffer_hash_size;
+
+	if (n_frags > 1) {
+		/* Release/sync DMA mapping for additional fragments.
+		 * Fix length for last fragment.
+		 */
+		unsigned int tail_frags = n_frags - 1;
+
+		for (;;) {
+			rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
+			if (--tail_frags == 0)
+				break;
+			efx_unmap_rx_buffer(efx, rx_buf, EFX_RX_USR_BUF_SIZE);
+		}
+		rx_buf->len = len - (n_frags - 1) * EFX_RX_USR_BUF_SIZE;
+		efx_unmap_rx_buffer(efx, rx_buf, rx_buf->len);
+	}
 
 	/* Pipeline receives so that we give time for packet headers to be
 	 * prefetched into cache.
 	 */
-out:
 	efx_rx_flush_packet(channel);
-	channel->rx_pkt = rx_buf;
+	channel->rx_pkt_n_frags = n_frags;
+	channel->rx_pkt_index = index;
 }
 
 static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
-			   struct efx_rx_buffer *rx_buf)
+			   struct efx_rx_buffer *rx_buf,
+			   unsigned int n_frags)
 {
 	struct sk_buff *skb;
 	u16 hdr_len = min_t(u16, rx_buf->len, EFX_SKB_HEADERS);
 
-	skb = efx_rx_mk_skb(channel, rx_buf, eh, hdr_len);
+	skb = efx_rx_mk_skb(channel, rx_buf, n_frags, eh, hdr_len);
 	if (unlikely(skb == NULL)) {
 		efx_free_rx_buffer(channel->efx, rx_buf);
 		return;
@@ -488,9 +549,11 @@ static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
 }
 
 /* Handle a received packet.  Second half: Touches packet payload. */
-void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf)
+void __efx_rx_packet(struct efx_channel *channel)
 {
 	struct efx_nic *efx = channel->efx;
+	struct efx_rx_buffer *rx_buf =
+		efx_rx_buffer(&channel->rx_queue, channel->rx_pkt_index);
 	u8 *eh = efx_rx_buf_va(rx_buf);
 
 	/* If we're in loopback test, then pass the packet directly to the
@@ -499,16 +562,18 @@ void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf)
 	if (unlikely(efx->loopback_selftest)) {
 		efx_loopback_rx_packet(efx, eh, rx_buf->len);
 		efx_free_rx_buffer(efx, rx_buf);
-		return;
+		goto out;
 	}
 
 	if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM)))
 		rx_buf->flags &= ~EFX_RX_PKT_CSUMMED;
 
 	if (!channel->type->receive_skb)
-		efx_rx_packet_gro(channel, rx_buf, eh);
+		efx_rx_packet_gro(channel, rx_buf, channel->rx_pkt_n_frags, eh);
 	else
-		efx_rx_deliver(channel, eh, rx_buf);
+		efx_rx_deliver(channel, eh, rx_buf, channel->rx_pkt_n_frags);
+out:
+	channel->rx_pkt_n_frags = 0;
 }
 
 int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
diff --git a/drivers/net/ethernet/sfc/siena.c b/drivers/net/ethernet/sfc/siena.c
index e07ff0d..5166924 100644
--- a/drivers/net/ethernet/sfc/siena.c
+++ b/drivers/net/ethernet/sfc/siena.c
@@ -414,6 +414,8 @@ static int siena_init_nic(struct efx_nic *efx)
 	EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_HASH_INSRT_HDR, 1);
 	EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_HASH_ALG, 1);
 	EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_IP_HASH, 1);
+	EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_USR_BUF_SIZE,
+			    EFX_RX_USR_BUF_SIZE >> 5);
 	efx_writeo(efx, &temp, FR_AZ_RX_CFG);
 
 	/* Set hash key for IPv4 */
@@ -718,6 +720,7 @@ const struct efx_nic_type siena_a0_nic_type = {
 	.max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH),
 	.rx_buffer_hash_size = 0x10,
 	.rx_buffer_padding = 0,
+	.can_rx_scatter = true,
 	.max_interrupt_mode = EFX_INT_MODE_MSIX,
 	.phys_addr_channels = 32, /* Hardware limit is 64, but the legacy
 				   * interrupt handler only supports 32
-- 
cgit v0.10.2


From 2768935a46603bb9bdd121864b1f2b2e8a71cccc Mon Sep 17 00:00:00 2001
From: Daniel Pieczko <dpieczko@solarflare.com>
Date: Wed, 13 Feb 2013 10:54:41 +0000
Subject: sfc: reuse pages to avoid DMA mapping/unmapping costs

On POWER systems, DMA mapping/unmapping operations are very expensive.
These changes reduce these costs by trying to reuse DMA mapped pages.

After all the buffers associated with a page have been processed and
passed up, the page is placed into a ring (if there is room).  For
each page that is required for a refill operation, a page in the ring
is examined to determine if its page count has fallen to 1, ie. the
kernel has released its reference to these packets.  If this is the
case, the page can be immediately added back into the RX descriptor
ring, without having to re-map it for DMA.

If the kernel is still holding a reference to this page, it is removed
from the ring and unmapped for DMA.  Then a new page, which can
immediately be used by RX buffers in the descriptor ring, is allocated
and DMA mapped.

The time a page needs to spend in the recycle ring before the kernel
has released its page references is based on the number of buffers
that use this page.  As large pages can hold more RX buffers, the RX
recycle ring can be shorter.  This reduces memory usage on POWER
systems, while maintaining the performance gain achieved by recycling
pages, following the driver change to pack more than two RX buffers
into large pages.

When an IOMMU is not present, the recycle ring can be small to reduce
memory usage, since DMA mapping operations are inexpensive.

With a small recycle ring, attempting to refill the descriptor queue
with more buffers than the equivalent size of the recycle ring could
ultimately lead to memory leaks if page entries in the recycle ring
were overwritten.  To prevent this, the check to see if the recycle
ring is full is changed to check if the next entry to be written is
NULL.

[bwh: Combine and rebase several commits so this is complete
 before the following buffer-packing changes.  Remove module
 parameter.]
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 1213af5..a70c458 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -661,6 +661,8 @@ static void efx_start_datapath(struct efx_nic *efx)
 		efx->rx_buffer_truesize = PAGE_SIZE << efx->rx_buffer_order;
 	}
 
+	efx->rx_bufs_per_page = (rx_buf_len <= PAGE_SIZE / 2) ? 2 : 1;
+
 	/* RX filters also have scatter-enabled flags */
 	if (efx->rx_scatter != old_rx_scatter)
 		efx_filter_update_rx_scatter(efx);
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index e41b54b..370c5bc 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -264,12 +264,22 @@ struct efx_rx_page_state {
  * @notified_count: Number of buffers given to NIC (<= @added_count).
  * @removed_count: Number of buffers removed from the receive queue.
  * @scatter_n: Number of buffers used by current packet
+ * @page_ring: The ring to store DMA mapped pages for reuse.
+ * @page_add: Counter to calculate the write pointer for the recycle ring.
+ * @page_remove: Counter to calculate the read pointer for the recycle ring.
+ * @page_recycle_count: The number of pages that have been recycled.
+ * @page_recycle_failed: The number of pages that couldn't be recycled because
+ *      the kernel still held a reference to them.
+ * @page_recycle_full: The number of pages that were released because the
+ *      recycle ring was full.
+ * @page_ptr_mask: The number of pages in the RX recycle ring minus 1.
  * @max_fill: RX descriptor maximum fill level (<= ring size)
  * @fast_fill_trigger: RX descriptor fill level that will trigger a fast fill
  *	(<= @max_fill)
  * @min_fill: RX descriptor minimum non-zero fill level.
  *	This records the minimum fill level observed when a ring
  *	refill was triggered.
+ * @recycle_count: RX buffer recycle counter.
  * @slow_fill: Timer used to defer efx_nic_generate_fill_event().
  */
 struct efx_rx_queue {
@@ -285,10 +295,18 @@ struct efx_rx_queue {
 	unsigned int notified_count;
 	unsigned int removed_count;
 	unsigned int scatter_n;
+	struct page **page_ring;
+	unsigned int page_add;
+	unsigned int page_remove;
+	unsigned int page_recycle_count;
+	unsigned int page_recycle_failed;
+	unsigned int page_recycle_full;
+	unsigned int page_ptr_mask;
 	unsigned int max_fill;
 	unsigned int fast_fill_trigger;
 	unsigned int min_fill;
 	unsigned int min_overfill;
+	unsigned int recycle_count;
 	struct timer_list slow_fill;
 	unsigned int slow_fill_count;
 };
@@ -806,6 +824,7 @@ struct efx_nic {
 	unsigned int rx_dma_len;
 	unsigned int rx_buffer_order;
 	unsigned int rx_buffer_truesize;
+	unsigned int rx_bufs_per_page;
 	u8 rx_hash_key[40];
 	u32 rx_indir_table[128];
 	bool rx_scatter;
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 88aa1ff..eea56f3 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -16,6 +16,7 @@
 #include <linux/udp.h>
 #include <linux/prefetch.h>
 #include <linux/moduleparam.h>
+#include <linux/iommu.h>
 #include <net/ip.h>
 #include <net/checksum.h>
 #include "net_driver.h"
@@ -27,6 +28,13 @@
 /* Number of RX descriptors pushed at once. */
 #define EFX_RX_BATCH  8
 
+/* Number of RX buffers to recycle pages for.  When creating the RX page recycle
+ * ring, this number is divided by the number of buffers per page to calculate
+ * the number of pages to store in the RX page recycle ring.
+ */
+#define EFX_RECYCLE_RING_SIZE_IOMMU 4096
+#define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_BATCH)
+
 /* Maximum length for an RX descriptor sharing a page */
 #define EFX_RX_HALF_PAGE ((PAGE_SIZE >> 1) - sizeof(struct efx_rx_page_state) \
 			  - EFX_PAGE_IP_ALIGN)
@@ -79,6 +87,56 @@ efx_rx_buf_next(struct efx_rx_queue *rx_queue, struct efx_rx_buffer *rx_buf)
 		return rx_buf + 1;
 }
 
+static inline void efx_sync_rx_buffer(struct efx_nic *efx,
+				      struct efx_rx_buffer *rx_buf,
+				      unsigned int len)
+{
+	dma_sync_single_for_cpu(&efx->pci_dev->dev, rx_buf->dma_addr, len,
+				DMA_FROM_DEVICE);
+}
+
+/* Return true if this is the last RX buffer using a page. */
+static inline bool efx_rx_is_last_buffer(struct efx_nic *efx,
+					 struct efx_rx_buffer *rx_buf)
+{
+	return (rx_buf->page_offset >= (PAGE_SIZE >> 1) ||
+		efx->rx_dma_len > EFX_RX_HALF_PAGE);
+}
+
+/* Check the RX page recycle ring for a page that can be reused. */
+static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue)
+{
+	struct efx_nic *efx = rx_queue->efx;
+	struct page *page;
+	struct efx_rx_page_state *state;
+	unsigned index;
+
+	index = rx_queue->page_remove & rx_queue->page_ptr_mask;
+	page = rx_queue->page_ring[index];
+	if (page == NULL)
+		return NULL;
+
+	rx_queue->page_ring[index] = NULL;
+	/* page_remove cannot exceed page_add. */
+	if (rx_queue->page_remove != rx_queue->page_add)
+		++rx_queue->page_remove;
+
+	/* If page_count is 1 then we hold the only reference to this page. */
+	if (page_count(page) == 1) {
+		++rx_queue->page_recycle_count;
+		return page;
+	} else {
+		state = page_address(page);
+		dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
+			       PAGE_SIZE << efx->rx_buffer_order,
+			       DMA_FROM_DEVICE);
+		put_page(page);
+		++rx_queue->page_recycle_failed;
+	}
+
+	return NULL;
+}
+
 /**
  * efx_init_rx_buffers - create EFX_RX_BATCH page-based RX buffers
  *
@@ -103,20 +161,28 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
 	BUILD_BUG_ON(EFX_RX_BATCH & 1);
 
 	for (count = 0; count < EFX_RX_BATCH; ++count) {
-		page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC,
-				   efx->rx_buffer_order);
-		if (unlikely(page == NULL))
-			return -ENOMEM;
-		dma_addr = dma_map_page(&efx->pci_dev->dev, page, 0,
-					PAGE_SIZE << efx->rx_buffer_order,
-					DMA_FROM_DEVICE);
-		if (unlikely(dma_mapping_error(&efx->pci_dev->dev, dma_addr))) {
-			__free_pages(page, efx->rx_buffer_order);
-			return -EIO;
+		page = efx_reuse_page(rx_queue);
+		if (page == NULL) {
+			page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC,
+					   efx->rx_buffer_order);
+			if (unlikely(page == NULL))
+				return -ENOMEM;
+			dma_addr =
+				dma_map_page(&efx->pci_dev->dev, page, 0,
+					     PAGE_SIZE << efx->rx_buffer_order,
+					     DMA_FROM_DEVICE);
+			if (unlikely(dma_mapping_error(&efx->pci_dev->dev,
+						       dma_addr))) {
+				__free_pages(page, efx->rx_buffer_order);
+				return -EIO;
+			}
+			state = page_address(page);
+			state->dma_addr = dma_addr;
+		} else {
+			state = page_address(page);
+			dma_addr = state->dma_addr;
 		}
-		state = page_address(page);
-		state->refcnt = 0;
-		state->dma_addr = dma_addr;
+		get_page(page);
 
 		dma_addr += sizeof(struct efx_rx_page_state);
 		page_offset = sizeof(struct efx_rx_page_state);
@@ -128,9 +194,7 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
 		rx_buf->page = page;
 		rx_buf->page_offset = page_offset + EFX_PAGE_IP_ALIGN;
 		rx_buf->len = efx->rx_dma_len;
-		rx_buf->flags = 0;
 		++rx_queue->added_count;
-		++state->refcnt;
 
 		if ((~count & 1) && (efx->rx_dma_len <= EFX_RX_HALF_PAGE)) {
 			/* Use the second half of the page */
@@ -145,99 +209,91 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
 	return 0;
 }
 
+/* Unmap a DMA-mapped page.  This function is only called for the final RX
+ * buffer in a page.
+ */
 static void efx_unmap_rx_buffer(struct efx_nic *efx,
-				struct efx_rx_buffer *rx_buf,
-				unsigned int used_len)
+				struct efx_rx_buffer *rx_buf)
 {
-	if (rx_buf->page) {
-		struct efx_rx_page_state *state;
-
-		state = page_address(rx_buf->page);
-		if (--state->refcnt == 0) {
-			dma_unmap_page(&efx->pci_dev->dev,
-				       state->dma_addr,
-				       PAGE_SIZE << efx->rx_buffer_order,
-				       DMA_FROM_DEVICE);
-		} else if (used_len) {
-			dma_sync_single_for_cpu(&efx->pci_dev->dev,
-						rx_buf->dma_addr, used_len,
-						DMA_FROM_DEVICE);
-		}
+	struct page *page = rx_buf->page;
+
+	if (page) {
+		struct efx_rx_page_state *state = page_address(page);
+		dma_unmap_page(&efx->pci_dev->dev,
+			       state->dma_addr,
+			       PAGE_SIZE << efx->rx_buffer_order,
+			       DMA_FROM_DEVICE);
 	}
 }
 
-static void efx_free_rx_buffer(struct efx_nic *efx,
-			       struct efx_rx_buffer *rx_buf)
+static void efx_free_rx_buffer(struct efx_rx_buffer *rx_buf)
 {
 	if (rx_buf->page) {
-		__free_pages(rx_buf->page, efx->rx_buffer_order);
+		put_page(rx_buf->page);
 		rx_buf->page = NULL;
 	}
 }
 
-static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
-			       struct efx_rx_buffer *rx_buf)
+/* Attempt to recycle the page if there is an RX recycle ring; the page can
+ * only be added if this is the final RX buffer, to prevent pages being used in
+ * the descriptor ring and appearing in the recycle ring simultaneously.
+ */
+static void efx_recycle_rx_page(struct efx_channel *channel,
+				struct efx_rx_buffer *rx_buf)
 {
-	efx_unmap_rx_buffer(rx_queue->efx, rx_buf, 0);
-	efx_free_rx_buffer(rx_queue->efx, rx_buf);
-}
+	struct page *page = rx_buf->page;
+	struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
+	struct efx_nic *efx = rx_queue->efx;
+	unsigned index;
 
-/* Attempt to resurrect the other receive buffer that used to share this page,
- * which had previously been passed up to the kernel and freed. */
-static void efx_resurrect_rx_buffer(struct efx_rx_queue *rx_queue,
-				    struct efx_rx_buffer *rx_buf)
-{
-	struct efx_rx_page_state *state = page_address(rx_buf->page);
-	struct efx_rx_buffer *new_buf;
-	unsigned fill_level, index;
-
-	/* +1 because efx_rx_packet() incremented removed_count. +1 because
-	 * we'd like to insert an additional descriptor whilst leaving
-	 * EFX_RXD_HEAD_ROOM for the non-recycle path */
-	fill_level = (rx_queue->added_count - rx_queue->removed_count + 2);
-	if (unlikely(fill_level > rx_queue->max_fill)) {
-		/* We could place "state" on a list, and drain the list in
-		 * efx_fast_push_rx_descriptors(). For now, this will do. */
+	/* Only recycle the page after processing the final buffer. */
+	if (!efx_rx_is_last_buffer(efx, rx_buf))
 		return;
-	}
 
-	++state->refcnt;
-	get_page(rx_buf->page);
+	index = rx_queue->page_add & rx_queue->page_ptr_mask;
+	if (rx_queue->page_ring[index] == NULL) {
+		unsigned read_index = rx_queue->page_remove &
+			rx_queue->page_ptr_mask;
 
-	index = rx_queue->added_count & rx_queue->ptr_mask;
-	new_buf = efx_rx_buffer(rx_queue, index);
-	new_buf->dma_addr = rx_buf->dma_addr ^ (PAGE_SIZE >> 1);
-	new_buf->page = rx_buf->page;
-	new_buf->len = rx_buf->len;
-	++rx_queue->added_count;
+		/* The next slot in the recycle ring is available, but
+		 * increment page_remove if the read pointer currently
+		 * points here.
+		 */
+		if (read_index == index)
+			++rx_queue->page_remove;
+		rx_queue->page_ring[index] = page;
+		++rx_queue->page_add;
+		return;
+	}
+	++rx_queue->page_recycle_full;
+	efx_unmap_rx_buffer(efx, rx_buf);
+	put_page(rx_buf->page);
 }
 
-/* Recycle buffers directly back into the rx_queue. There is always
- * room to add these buffer, because we've just popped them.
- */
+static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
+			       struct efx_rx_buffer *rx_buf)
+{
+	/* Release the page reference we hold for the buffer. */
+	if (rx_buf->page)
+		put_page(rx_buf->page);
+
+	/* If this is the last buffer in a page, unmap and free it. */
+	if (efx_rx_is_last_buffer(rx_queue->efx, rx_buf)) {
+		efx_unmap_rx_buffer(rx_queue->efx, rx_buf);
+		efx_free_rx_buffer(rx_buf);
+	}
+	rx_buf->page = NULL;
+}
+
+/* Recycle the pages that are used by buffers that have just been received. */
 static void efx_recycle_rx_buffers(struct efx_channel *channel,
 				   struct efx_rx_buffer *rx_buf,
 				   unsigned int n_frags)
 {
-	struct efx_nic *efx = channel->efx;
 	struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
-	struct efx_rx_buffer *new_buf;
-	unsigned index;
 
 	do {
-		rx_buf->flags = 0;
-
-		if (efx->rx_dma_len <= EFX_RX_HALF_PAGE &&
-		    page_count(rx_buf->page) == 1)
-			efx_resurrect_rx_buffer(rx_queue, rx_buf);
-
-		index = rx_queue->added_count & rx_queue->ptr_mask;
-		new_buf = efx_rx_buffer(rx_queue, index);
-
-		memcpy(new_buf, rx_buf, sizeof(*new_buf));
-		rx_buf->page = NULL;
-		++rx_queue->added_count;
-
+		efx_recycle_rx_page(channel, rx_buf);
 		rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
 	} while (--n_frags);
 }
@@ -451,7 +507,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	struct efx_rx_buffer *rx_buf;
 
 	rx_buf = efx_rx_buffer(rx_queue, index);
-	rx_buf->flags |= flags;
+	rx_buf->flags = flags;
 
 	/* Validate the number of fragments and completed length */
 	if (n_frags == 1) {
@@ -479,6 +535,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	 */
 	if (unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) {
 		efx_rx_flush_packet(channel);
+		put_page(rx_buf->page);
 		efx_recycle_rx_buffers(channel, rx_buf, n_frags);
 		return;
 	}
@@ -486,10 +543,10 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	if (n_frags == 1)
 		rx_buf->len = len;
 
-	/* Release and/or sync DMA mapping - assumes all RX buffers
-	 * consumed in-order per RX queue
+	/* Release and/or sync the DMA mapping - assumes all RX buffers
+	 * consumed in-order per RX queue.
 	 */
-	efx_unmap_rx_buffer(efx, rx_buf, rx_buf->len);
+	efx_sync_rx_buffer(efx, rx_buf, rx_buf->len);
 
 	/* Prefetch nice and early so data will (hopefully) be in cache by
 	 * the time we look at it.
@@ -509,12 +566,16 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 			rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
 			if (--tail_frags == 0)
 				break;
-			efx_unmap_rx_buffer(efx, rx_buf, EFX_RX_USR_BUF_SIZE);
+			efx_sync_rx_buffer(efx, rx_buf, EFX_RX_USR_BUF_SIZE);
 		}
 		rx_buf->len = len - (n_frags - 1) * EFX_RX_USR_BUF_SIZE;
-		efx_unmap_rx_buffer(efx, rx_buf, rx_buf->len);
+		efx_sync_rx_buffer(efx, rx_buf, rx_buf->len);
 	}
 
+	/* All fragments have been DMA-synced, so recycle buffers and pages. */
+	rx_buf = efx_rx_buffer(rx_queue, index);
+	efx_recycle_rx_buffers(channel, rx_buf, n_frags);
+
 	/* Pipeline receives so that we give time for packet headers to be
 	 * prefetched into cache.
 	 */
@@ -532,7 +593,7 @@ static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
 
 	skb = efx_rx_mk_skb(channel, rx_buf, n_frags, eh, hdr_len);
 	if (unlikely(skb == NULL)) {
-		efx_free_rx_buffer(channel->efx, rx_buf);
+		efx_free_rx_buffer(rx_buf);
 		return;
 	}
 	skb_record_rx_queue(skb, channel->rx_queue.core_index);
@@ -561,7 +622,7 @@ void __efx_rx_packet(struct efx_channel *channel)
 	 */
 	if (unlikely(efx->loopback_selftest)) {
 		efx_loopback_rx_packet(efx, eh, rx_buf->len);
-		efx_free_rx_buffer(efx, rx_buf);
+		efx_free_rx_buffer(rx_buf);
 		goto out;
 	}
 
@@ -603,9 +664,32 @@ int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
 		kfree(rx_queue->buffer);
 		rx_queue->buffer = NULL;
 	}
+
 	return rc;
 }
 
+void efx_init_rx_recycle_ring(struct efx_nic *efx,
+			      struct efx_rx_queue *rx_queue)
+{
+	unsigned int bufs_in_recycle_ring, page_ring_size;
+
+	/* Set the RX recycle ring size */
+#ifdef CONFIG_PPC64
+	bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU;
+#else
+	if (efx->pci_dev->dev.iommu_group)
+		bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU;
+	else
+		bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_NOIOMMU;
+#endif /* CONFIG_PPC64 */
+
+	page_ring_size = roundup_pow_of_two(bufs_in_recycle_ring /
+					    efx->rx_bufs_per_page);
+	rx_queue->page_ring = kcalloc(page_ring_size,
+				      sizeof(*rx_queue->page_ring), GFP_KERNEL);
+	rx_queue->page_ptr_mask = page_ring_size - 1;
+}
+
 void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
 {
 	struct efx_nic *efx = rx_queue->efx;
@@ -619,6 +703,13 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
 	rx_queue->notified_count = 0;
 	rx_queue->removed_count = 0;
 	rx_queue->min_fill = -1U;
+	efx_init_rx_recycle_ring(efx, rx_queue);
+
+	rx_queue->page_remove = 0;
+	rx_queue->page_add = rx_queue->page_ptr_mask + 1;
+	rx_queue->page_recycle_count = 0;
+	rx_queue->page_recycle_failed = 0;
+	rx_queue->page_recycle_full = 0;
 
 	/* Initialise limit fields */
 	max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM;
@@ -642,6 +733,7 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
 void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
 {
 	int i;
+	struct efx_nic *efx = rx_queue->efx;
 	struct efx_rx_buffer *rx_buf;
 
 	netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev,
@@ -653,13 +745,32 @@ void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
 	del_timer_sync(&rx_queue->slow_fill);
 	efx_nic_fini_rx(rx_queue);
 
-	/* Release RX buffers NB start at index 0 not current HW ptr */
+	/* Release RX buffers from the current read ptr to the write ptr */
 	if (rx_queue->buffer) {
-		for (i = 0; i <= rx_queue->ptr_mask; i++) {
-			rx_buf = efx_rx_buffer(rx_queue, i);
+		for (i = rx_queue->removed_count; i < rx_queue->added_count;
+		     i++) {
+			unsigned index = i & rx_queue->ptr_mask;
+			rx_buf = efx_rx_buffer(rx_queue, index);
 			efx_fini_rx_buffer(rx_queue, rx_buf);
 		}
 	}
+
+	/* Unmap and release the pages in the recycle ring. Remove the ring. */
+	for (i = 0; i <= rx_queue->page_ptr_mask; i++) {
+		struct page *page = rx_queue->page_ring[i];
+		struct efx_rx_page_state *state;
+
+		if (page == NULL)
+			continue;
+
+		state = page_address(page);
+		dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
+			       PAGE_SIZE << efx->rx_buffer_order,
+			       DMA_FROM_DEVICE);
+		put_page(page);
+	}
+	kfree(rx_queue->page_ring);
+	rx_queue->page_ring = NULL;
 }
 
 void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
-- 
cgit v0.10.2


From 179ea7f039f68ae4247a340bfb59fd861e7def12 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Thu, 7 Mar 2013 16:31:17 +0000
Subject: sfc: Replace efx_rx_is_last_buffer() with a flag

This condition is brittle and we have lots of flags to spare.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 370c5bc..e22e75c 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -228,6 +228,7 @@ struct efx_rx_buffer {
 	u16 len;
 	u16 flags;
 };
+#define EFX_RX_BUF_LAST_IN_PAGE	0x0001
 #define EFX_RX_PKT_CSUMMED	0x0002
 #define EFX_RX_PKT_DISCARD	0x0004
 
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index eea56f3..4cc2ba4 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -95,14 +95,6 @@ static inline void efx_sync_rx_buffer(struct efx_nic *efx,
 				DMA_FROM_DEVICE);
 }
 
-/* Return true if this is the last RX buffer using a page. */
-static inline bool efx_rx_is_last_buffer(struct efx_nic *efx,
-					 struct efx_rx_buffer *rx_buf)
-{
-	return (rx_buf->page_offset >= (PAGE_SIZE >> 1) ||
-		efx->rx_dma_len > EFX_RX_HALF_PAGE);
-}
-
 /* Check the RX page recycle ring for a page that can be reused. */
 static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue)
 {
@@ -199,11 +191,14 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
 		if ((~count & 1) && (efx->rx_dma_len <= EFX_RX_HALF_PAGE)) {
 			/* Use the second half of the page */
 			get_page(page);
+			rx_buf->flags = 0;
 			dma_addr += (PAGE_SIZE >> 1);
 			page_offset += (PAGE_SIZE >> 1);
 			++count;
 			goto split;
 		}
+
+		rx_buf->flags = EFX_RX_BUF_LAST_IN_PAGE;
 	}
 
 	return 0;
@@ -247,7 +242,7 @@ static void efx_recycle_rx_page(struct efx_channel *channel,
 	unsigned index;
 
 	/* Only recycle the page after processing the final buffer. */
-	if (!efx_rx_is_last_buffer(efx, rx_buf))
+	if (!(rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE))
 		return;
 
 	index = rx_queue->page_add & rx_queue->page_ptr_mask;
@@ -278,7 +273,7 @@ static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
 		put_page(rx_buf->page);
 
 	/* If this is the last buffer in a page, unmap and free it. */
-	if (efx_rx_is_last_buffer(rx_queue->efx, rx_buf)) {
+	if (rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE) {
 		efx_unmap_rx_buffer(rx_queue->efx, rx_buf);
 		efx_free_rx_buffer(rx_buf);
 	}
@@ -507,7 +502,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	struct efx_rx_buffer *rx_buf;
 
 	rx_buf = efx_rx_buffer(rx_queue, index);
-	rx_buf->flags = flags;
+	rx_buf->flags |= flags;
 
 	/* Validate the number of fragments and completed length */
 	if (n_frags == 1) {
-- 
cgit v0.10.2


From 1648a23fa159e5c433aac06dc5e0d9db36146016 Mon Sep 17 00:00:00 2001
From: Daniel Pieczko <dpieczko@solarflare.com>
Date: Wed, 13 Feb 2013 10:54:41 +0000
Subject: sfc: allocate more RX buffers per page

Allocating 2 buffers per page is insanely inefficient when MTU is 1500
and PAGE_SIZE is 64K (as it usually is on POWER).  Allocate as many as
we can fit, and choose the refill batch size at run-time so that we
still always use a whole page at once.

[bwh: Fix loop condition to allow for compound pages; rebase]
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index a70c458..f050248 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -643,10 +643,6 @@ static void efx_start_datapath(struct efx_nic *efx)
 	if (rx_buf_len <= PAGE_SIZE) {
 		efx->rx_scatter = false;
 		efx->rx_buffer_order = 0;
-		if (rx_buf_len <= PAGE_SIZE / 2)
-			efx->rx_buffer_truesize = PAGE_SIZE / 2;
-		else
-			efx->rx_buffer_truesize = PAGE_SIZE;
 	} else if (efx->type->can_rx_scatter) {
 		BUILD_BUG_ON(sizeof(struct efx_rx_page_state) +
 			     EFX_PAGE_IP_ALIGN + EFX_RX_USR_BUF_SIZE >
@@ -654,14 +650,22 @@ static void efx_start_datapath(struct efx_nic *efx)
 		efx->rx_scatter = true;
 		efx->rx_dma_len = EFX_RX_USR_BUF_SIZE;
 		efx->rx_buffer_order = 0;
-		efx->rx_buffer_truesize = PAGE_SIZE / 2;
 	} else {
 		efx->rx_scatter = false;
 		efx->rx_buffer_order = get_order(rx_buf_len);
-		efx->rx_buffer_truesize = PAGE_SIZE << efx->rx_buffer_order;
 	}
 
-	efx->rx_bufs_per_page = (rx_buf_len <= PAGE_SIZE / 2) ? 2 : 1;
+	efx_rx_config_page_split(efx);
+	if (efx->rx_buffer_order)
+		netif_dbg(efx, drv, efx->net_dev,
+			  "RX buf len=%u; page order=%u batch=%u\n",
+			  efx->rx_dma_len, efx->rx_buffer_order,
+			  efx->rx_pages_per_batch);
+	else
+		netif_dbg(efx, drv, efx->net_dev,
+			  "RX buf len=%u step=%u bpp=%u; page batch=%u\n",
+			  efx->rx_dma_len, efx->rx_page_buf_step,
+			  efx->rx_bufs_per_page, efx->rx_pages_per_batch);
 
 	/* RX filters also have scatter-enabled flags */
 	if (efx->rx_scatter != old_rx_scatter)
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index 211da79..8372da2 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -33,6 +33,7 @@ extern int efx_setup_tc(struct net_device *net_dev, u8 num_tc);
 extern unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);
 
 /* RX */
+extern void efx_rx_config_page_split(struct efx_nic *efx);
 extern int efx_probe_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_remove_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_init_rx_queue(struct efx_rx_queue *rx_queue);
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index e22e75c..9bd433a 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -825,7 +825,9 @@ struct efx_nic {
 	unsigned int rx_dma_len;
 	unsigned int rx_buffer_order;
 	unsigned int rx_buffer_truesize;
+	unsigned int rx_page_buf_step;
 	unsigned int rx_bufs_per_page;
+	unsigned int rx_pages_per_batch;
 	u8 rx_hash_key[40];
 	u32 rx_indir_table[128];
 	bool rx_scatter;
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 4cc2ba4..a948b36 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -25,19 +25,15 @@
 #include "selftest.h"
 #include "workarounds.h"
 
-/* Number of RX descriptors pushed at once. */
-#define EFX_RX_BATCH  8
+/* Preferred number of descriptors to fill at once */
+#define EFX_RX_PREFERRED_BATCH 8U
 
 /* Number of RX buffers to recycle pages for.  When creating the RX page recycle
  * ring, this number is divided by the number of buffers per page to calculate
  * the number of pages to store in the RX page recycle ring.
  */
 #define EFX_RECYCLE_RING_SIZE_IOMMU 4096
-#define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_BATCH)
-
-/* Maximum length for an RX descriptor sharing a page */
-#define EFX_RX_HALF_PAGE ((PAGE_SIZE >> 1) - sizeof(struct efx_rx_page_state) \
-			  - EFX_PAGE_IP_ALIGN)
+#define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_PREFERRED_BATCH)
 
 /* Size of buffer allocated for skb header area. */
 #define EFX_SKB_HEADERS  64u
@@ -95,6 +91,19 @@ static inline void efx_sync_rx_buffer(struct efx_nic *efx,
 				DMA_FROM_DEVICE);
 }
 
+void efx_rx_config_page_split(struct efx_nic *efx)
+{
+	efx->rx_page_buf_step = ALIGN(efx->rx_dma_len + EFX_PAGE_IP_ALIGN,
+				      L1_CACHE_BYTES);
+	efx->rx_bufs_per_page = efx->rx_buffer_order ? 1 :
+		((PAGE_SIZE - sizeof(struct efx_rx_page_state)) /
+		 efx->rx_page_buf_step);
+	efx->rx_buffer_truesize = (PAGE_SIZE << efx->rx_buffer_order) /
+		efx->rx_bufs_per_page;
+	efx->rx_pages_per_batch = DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH,
+					       efx->rx_bufs_per_page);
+}
+
 /* Check the RX page recycle ring for a page that can be reused. */
 static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue)
 {
@@ -134,10 +143,10 @@ static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue)
  *
  * @rx_queue:		Efx RX queue
  *
- * This allocates memory for EFX_RX_BATCH receive buffers, maps them for DMA,
- * and populates struct efx_rx_buffers for each one. Return a negative error
- * code or 0 on success. If a single page can be split between two buffers,
- * then the page will either be inserted fully, or not at at all.
+ * This allocates a batch of pages, maps them for DMA, and populates
+ * struct efx_rx_buffers for each one. Return a negative error code or
+ * 0 on success. If a single page can be used for multiple buffers,
+ * then the page will either be inserted fully, or not at all.
  */
 static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
 {
@@ -149,10 +158,8 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
 	dma_addr_t dma_addr;
 	unsigned index, count;
 
-	/* We can split a page between two buffers */
-	BUILD_BUG_ON(EFX_RX_BATCH & 1);
-
-	for (count = 0; count < EFX_RX_BATCH; ++count) {
+	count = 0;
+	do {
 		page = efx_reuse_page(rx_queue);
 		if (page == NULL) {
 			page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC,
@@ -174,32 +181,26 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
 			state = page_address(page);
 			dma_addr = state->dma_addr;
 		}
-		get_page(page);
 
 		dma_addr += sizeof(struct efx_rx_page_state);
 		page_offset = sizeof(struct efx_rx_page_state);
 
-	split:
-		index = rx_queue->added_count & rx_queue->ptr_mask;
-		rx_buf = efx_rx_buffer(rx_queue, index);
-		rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN;
-		rx_buf->page = page;
-		rx_buf->page_offset = page_offset + EFX_PAGE_IP_ALIGN;
-		rx_buf->len = efx->rx_dma_len;
-		++rx_queue->added_count;
-
-		if ((~count & 1) && (efx->rx_dma_len <= EFX_RX_HALF_PAGE)) {
-			/* Use the second half of the page */
-			get_page(page);
+		do {
+			index = rx_queue->added_count & rx_queue->ptr_mask;
+			rx_buf = efx_rx_buffer(rx_queue, index);
+			rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN;
+			rx_buf->page = page;
+			rx_buf->page_offset = page_offset + EFX_PAGE_IP_ALIGN;
+			rx_buf->len = efx->rx_dma_len;
 			rx_buf->flags = 0;
-			dma_addr += (PAGE_SIZE >> 1);
-			page_offset += (PAGE_SIZE >> 1);
-			++count;
-			goto split;
-		}
+			++rx_queue->added_count;
+			get_page(page);
+			dma_addr += efx->rx_page_buf_step;
+			page_offset += efx->rx_page_buf_step;
+		} while (page_offset + efx->rx_page_buf_step <= PAGE_SIZE);
 
 		rx_buf->flags = EFX_RX_BUF_LAST_IN_PAGE;
-	}
+	} while (++count < efx->rx_pages_per_batch);
 
 	return 0;
 }
@@ -307,7 +308,8 @@ static void efx_recycle_rx_buffers(struct efx_channel *channel,
  */
 void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
 {
-	unsigned fill_level;
+	struct efx_nic *efx = rx_queue->efx;
+	unsigned int fill_level, batch_size;
 	int space, rc = 0;
 
 	/* Calculate current fill level, and exit if we don't need to fill */
@@ -322,8 +324,9 @@ void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
 			rx_queue->min_fill = fill_level;
 	}
 
+	batch_size = efx->rx_pages_per_batch * efx->rx_bufs_per_page;
 	space = rx_queue->max_fill - fill_level;
-	EFX_BUG_ON_PARANOID(space < EFX_RX_BATCH);
+	EFX_BUG_ON_PARANOID(space < batch_size);
 
 	netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
 		   "RX queue %d fast-filling descriptor ring from"
@@ -340,7 +343,7 @@ void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
 				efx_schedule_slow_fill(rx_queue);
 			goto out;
 		}
-	} while ((space -= EFX_RX_BATCH) >= EFX_RX_BATCH);
+	} while ((space -= batch_size) >= batch_size);
 
 	netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
 		   "RX queue %d fast-filled descriptor ring "
@@ -708,7 +711,8 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
 
 	/* Initialise limit fields */
 	max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM;
-	max_trigger = max_fill - EFX_RX_BATCH;
+	max_trigger =
+		max_fill - efx->rx_pages_per_batch * efx->rx_bufs_per_page;
 	if (rx_refill_threshold != 0) {
 		trigger = max_fill * min(rx_refill_threshold, 100U) / 100U;
 		if (trigger > max_trigger)
-- 
cgit v0.10.2