Diffstat (limited to 'drivers/net/sfc/rx.c')
-rw-r--r-- | drivers/net/sfc/rx.c | 393
1 file changed, 186 insertions, 207 deletions
diff --git a/drivers/net/sfc/rx.c b/drivers/net/sfc/rx.c
index e308818..9fb698e 100644
--- a/drivers/net/sfc/rx.c
+++ b/drivers/net/sfc/rx.c
@@ -25,6 +25,9 @@
 /* Number of RX descriptors pushed at once. */
 #define EFX_RX_BATCH 8
 
+/* Maximum size of a buffer sharing a page */
+#define EFX_RX_HALF_PAGE ((PAGE_SIZE >> 1) - sizeof(struct efx_rx_page_state))
+
 /* Size of buffer allocated for skb header area. */
 #define EFX_SKB_HEADERS 64u
 
@@ -98,155 +101,138 @@ static inline unsigned int efx_rx_buf_size(struct efx_nic *efx)
 	return PAGE_SIZE << efx->rx_buffer_order;
 }
 
-
 /**
- * efx_init_rx_buffer_skb - create new RX buffer using skb-based allocation
+ * efx_init_rx_buffers_skb - create EFX_RX_BATCH skb-based RX buffers
  *
  * @rx_queue:		Efx RX queue
- * @rx_buf:		RX buffer structure to populate
  *
- * This allocates memory for a new receive buffer, maps it for DMA,
- * and populates a struct efx_rx_buffer with the relevant
- * information. Return a negative error code or 0 on success.
+ * This allocates EFX_RX_BATCH skbs, maps them for DMA, and populates a
+ * struct efx_rx_buffer for each one. Return a negative error code or 0
+ * on success. May fail having only inserted fewer than EFX_RX_BATCH
+ * buffers.
  */
-static int efx_init_rx_buffer_skb(struct efx_rx_queue *rx_queue,
-				  struct efx_rx_buffer *rx_buf)
+static int efx_init_rx_buffers_skb(struct efx_rx_queue *rx_queue)
 {
 	struct efx_nic *efx = rx_queue->efx;
 	struct net_device *net_dev = efx->net_dev;
+	struct efx_rx_buffer *rx_buf;
 	int skb_len = efx->rx_buffer_len;
+	unsigned index, count;
 
-	rx_buf->skb = netdev_alloc_skb(net_dev, skb_len);
-	if (unlikely(!rx_buf->skb))
-		return -ENOMEM;
+	for (count = 0; count < EFX_RX_BATCH; ++count) {
+		index = rx_queue->added_count & EFX_RXQ_MASK;
+		rx_buf = efx_rx_buffer(rx_queue, index);
 
-	/* Adjust the SKB for padding and checksum */
-	skb_reserve(rx_buf->skb, NET_IP_ALIGN);
-	rx_buf->len = skb_len - NET_IP_ALIGN;
-	rx_buf->data = (char *)rx_buf->skb->data;
-	rx_buf->skb->ip_summed = CHECKSUM_UNNECESSARY;
+		rx_buf->skb = netdev_alloc_skb(net_dev, skb_len);
+		if (unlikely(!rx_buf->skb))
+			return -ENOMEM;
+		rx_buf->page = NULL;
 
-	rx_buf->dma_addr = pci_map_single(efx->pci_dev,
-					  rx_buf->data, rx_buf->len,
-					  PCI_DMA_FROMDEVICE);
+		/* Adjust the SKB for padding and checksum */
+		skb_reserve(rx_buf->skb, NET_IP_ALIGN);
+		rx_buf->len = skb_len - NET_IP_ALIGN;
+		rx_buf->data = (char *)rx_buf->skb->data;
+		rx_buf->skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		rx_buf->dma_addr = pci_map_single(efx->pci_dev,
+						  rx_buf->data, rx_buf->len,
+						  PCI_DMA_FROMDEVICE);
+		if (unlikely(pci_dma_mapping_error(efx->pci_dev,
+						   rx_buf->dma_addr))) {
+			dev_kfree_skb_any(rx_buf->skb);
+			rx_buf->skb = NULL;
+			return -EIO;
+		}
 
-	if (unlikely(pci_dma_mapping_error(efx->pci_dev, rx_buf->dma_addr))) {
-		dev_kfree_skb_any(rx_buf->skb);
-		rx_buf->skb = NULL;
-		return -EIO;
+		++rx_queue->added_count;
+		++rx_queue->alloc_skb_count;
 	}
 
 	return 0;
 }
 
 /**
- * efx_init_rx_buffer_page - create new RX buffer using page-based allocation
+ * efx_init_rx_buffers_page - create EFX_RX_BATCH page-based RX buffers
  *
  * @rx_queue:		Efx RX queue
- * @rx_buf:		RX buffer structure to populate
  *
- * This allocates memory for a new receive buffer, maps it for DMA,
- * and populates a struct efx_rx_buffer with the relevant
- * information. Return a negative error code or 0 on success.
+ * This allocates memory for EFX_RX_BATCH receive buffers, maps them for DMA,
+ * and populates struct efx_rx_buffers for each one. Return a negative error
+ * code or 0 on success. If a single page can be split between two buffers,
+ * then the page will either be inserted fully, or not at all.
  */
-static int efx_init_rx_buffer_page(struct efx_rx_queue *rx_queue,
-				   struct efx_rx_buffer *rx_buf)
+static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue)
 {
 	struct efx_nic *efx = rx_queue->efx;
-	int bytes, space, offset;
-
-	bytes = efx->rx_buffer_len - EFX_PAGE_IP_ALIGN;
-
-	/* If there is space left in the previously allocated page,
-	 * then use it. Otherwise allocate a new one */
-	rx_buf->page = rx_queue->buf_page;
-	if (rx_buf->page == NULL) {
-		dma_addr_t dma_addr;
-
-		rx_buf->page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC,
-					   efx->rx_buffer_order);
-		if (unlikely(rx_buf->page == NULL))
+	struct efx_rx_buffer *rx_buf;
+	struct page *page;
+	void *page_addr;
+	struct efx_rx_page_state *state;
+	dma_addr_t dma_addr;
+	unsigned index, count;
+
+	/* We can split a page between two buffers */
+	BUILD_BUG_ON(EFX_RX_BATCH & 1);
+
+	for (count = 0; count < EFX_RX_BATCH; ++count) {
+		page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC,
+				   efx->rx_buffer_order);
+		if (unlikely(page == NULL))
 			return -ENOMEM;
-
-		dma_addr = pci_map_page(efx->pci_dev, rx_buf->page,
-					0, efx_rx_buf_size(efx),
+		dma_addr = pci_map_page(efx->pci_dev, page, 0,
					efx_rx_buf_size(efx),
 					PCI_DMA_FROMDEVICE);
-
 		if (unlikely(pci_dma_mapping_error(efx->pci_dev, dma_addr))) {
-			__free_pages(rx_buf->page, efx->rx_buffer_order);
-			rx_buf->page = NULL;
+			__free_pages(page, efx->rx_buffer_order);
 			return -EIO;
 		}
-
-		rx_queue->buf_page = rx_buf->page;
-		rx_queue->buf_dma_addr = dma_addr;
-		rx_queue->buf_data = (page_address(rx_buf->page) +
-				      EFX_PAGE_IP_ALIGN);
-	}
-
-	rx_buf->len = bytes;
-	rx_buf->data = rx_queue->buf_data;
-	offset = efx_rx_buf_offset(rx_buf);
-	rx_buf->dma_addr = rx_queue->buf_dma_addr + offset;
-
-	/* Try to pack multiple buffers per page */
-	if (efx->rx_buffer_order == 0) {
-		/* The next buffer starts on the next 512 byte boundary */
-		rx_queue->buf_data += ((bytes + 0x1ff) & ~0x1ff);
-		offset += ((bytes + 0x1ff) & ~0x1ff);
-
-		space = efx_rx_buf_size(efx) - offset;
-		if (space >= bytes) {
-			/* Refs dropped on kernel releasing each skb */
-			get_page(rx_queue->buf_page);
-			goto out;
+		page_addr = page_address(page);
+		state = page_addr;
+		state->refcnt = 0;
+		state->dma_addr = dma_addr;
+
+		page_addr += sizeof(struct efx_rx_page_state);
+		dma_addr += sizeof(struct efx_rx_page_state);
+
+	split:
+		index = rx_queue->added_count & EFX_RXQ_MASK;
+		rx_buf = efx_rx_buffer(rx_queue, index);
+		rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN;
+		rx_buf->skb = NULL;
+		rx_buf->page = page;
+		rx_buf->data = page_addr + EFX_PAGE_IP_ALIGN;
+		rx_buf->len = efx->rx_buffer_len - EFX_PAGE_IP_ALIGN;
+		++rx_queue->added_count;
+		++rx_queue->alloc_page_count;
+		++state->refcnt;
+
+		if ((~count & 1) && (efx->rx_buffer_len <= EFX_RX_HALF_PAGE)) {
+			/* Use the second half of the page */
+			get_page(page);
+			dma_addr += (PAGE_SIZE >> 1);
+			page_addr += (PAGE_SIZE >> 1);
+			++count;
+			goto split;
 		}
 	}
 
-	/* This is the final RX buffer for this page, so mark it for
-	 * unmapping */
-	rx_queue->buf_page = NULL;
-	rx_buf->unmap_addr = rx_queue->buf_dma_addr;
-
- out:
 	return 0;
 }
 
-/* This allocates memory for a new receive buffer, maps it for DMA,
- * and populates a struct efx_rx_buffer with the relevant
- * information.
- */
-static int efx_init_rx_buffer(struct efx_rx_queue *rx_queue,
-			      struct efx_rx_buffer *new_rx_buf)
-{
-	int rc = 0;
-
-	if (rx_queue->channel->rx_alloc_push_pages) {
-		new_rx_buf->skb = NULL;
-		rc = efx_init_rx_buffer_page(rx_queue, new_rx_buf);
-		rx_queue->alloc_page_count++;
-	} else {
-		new_rx_buf->page = NULL;
-		rc = efx_init_rx_buffer_skb(rx_queue, new_rx_buf);
-		rx_queue->alloc_skb_count++;
-	}
-
-	if (unlikely(rc < 0))
-		EFX_LOG_RL(rx_queue->efx, "%s RXQ[%d] =%d\n", __func__,
-			   rx_queue->queue, rc);
-	return rc;
-}
-
 static void efx_unmap_rx_buffer(struct efx_nic *efx,
 				struct efx_rx_buffer *rx_buf)
 {
 	if (rx_buf->page) {
+		struct efx_rx_page_state *state;
+
 		EFX_BUG_ON_PARANOID(rx_buf->skb);
-		if (rx_buf->unmap_addr) {
-			pci_unmap_page(efx->pci_dev, rx_buf->unmap_addr,
+
+		state = page_address(rx_buf->page);
+		if (--state->refcnt == 0) {
+			pci_unmap_page(efx->pci_dev,
+				       state->dma_addr,
 				       efx_rx_buf_size(efx),
 				       PCI_DMA_FROMDEVICE);
-			rx_buf->unmap_addr = 0;
 		}
 	} else if (likely(rx_buf->skb)) {
 		pci_unmap_single(efx->pci_dev, rx_buf->dma_addr,
@@ -273,31 +259,84 @@ static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
 		efx_free_rx_buffer(rx_queue->efx, rx_buf);
 }
 
+/* Attempt to resurrect the other receive buffer that used to share this page,
+ * which had previously been passed up to the kernel and freed. */
+static void efx_resurrect_rx_buffer(struct efx_rx_queue *rx_queue,
+				    struct efx_rx_buffer *rx_buf)
+{
+	struct efx_rx_page_state *state = page_address(rx_buf->page);
+	struct efx_rx_buffer *new_buf;
+	unsigned fill_level, index;
+
+	/* +1 because efx_rx_packet() incremented removed_count. +1 because
+	 * we'd like to insert an additional descriptor whilst leaving
+	 * EFX_RXD_HEAD_ROOM for the non-recycle path */
+	fill_level = (rx_queue->added_count - rx_queue->removed_count + 2);
+	if (unlikely(fill_level >= EFX_RXQ_SIZE - EFX_RXD_HEAD_ROOM)) {
+		/* We could place "state" on a list, and drain the list in
+		 * efx_fast_push_rx_descriptors(). For now, this will do. */
+		return;
+	}
+
+	++state->refcnt;
+	get_page(rx_buf->page);
+
+	index = rx_queue->added_count & EFX_RXQ_MASK;
+	new_buf = efx_rx_buffer(rx_queue, index);
+	new_buf->dma_addr = rx_buf->dma_addr ^ (PAGE_SIZE >> 1);
+	new_buf->skb = NULL;
+	new_buf->page = rx_buf->page;
+	new_buf->data = (void *)
+		((__force unsigned long)rx_buf->data ^ (PAGE_SIZE >> 1));
+	new_buf->len = rx_buf->len;
+	++rx_queue->added_count;
+}
+
+/* Recycle the given rx buffer directly back into the rx_queue. There is
+ * always room to add this buffer, because we've just popped a buffer. */
+static void efx_recycle_rx_buffer(struct efx_channel *channel,
+				  struct efx_rx_buffer *rx_buf)
+{
+	struct efx_nic *efx = channel->efx;
+	struct efx_rx_queue *rx_queue = &efx->rx_queue[channel->channel];
+	struct efx_rx_buffer *new_buf;
+	unsigned index;
+
+	if (rx_buf->page != NULL && efx->rx_buffer_len <= EFX_RX_HALF_PAGE &&
+	    page_count(rx_buf->page) == 1)
+		efx_resurrect_rx_buffer(rx_queue, rx_buf);
+
+	index = rx_queue->added_count & EFX_RXQ_MASK;
+	new_buf = efx_rx_buffer(rx_queue, index);
+
+	memcpy(new_buf, rx_buf, sizeof(*new_buf));
+	rx_buf->page = NULL;
+	rx_buf->skb = NULL;
+	++rx_queue->added_count;
+}
+
 /**
  * efx_fast_push_rx_descriptors - push new RX descriptors quickly
  * @rx_queue:		RX descriptor queue
- * @retry:		Recheck the fill level
  *
  * This will aim to fill the RX descriptor queue up to
  * @rx_queue->@fast_fill_limit. If there is insufficient atomic
- * memory to do so, the caller should retry.
+ * memory to do so, a slow fill will be scheduled.
+ *
+ * The caller must provide serialisation (none is used here). In practice,
+ * this means this function must run from the NAPI handler, or be called
+ * when NAPI is disabled.
  */
-static int __efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue,
-					  int retry)
+void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
 {
-	struct efx_rx_buffer *rx_buf;
-	unsigned fill_level, index;
-	int i, space, rc = 0;
+	struct efx_channel *channel = rx_queue->channel;
+	unsigned fill_level;
+	int space, rc = 0;
 
-	/* Calculate current fill level. Do this outside the lock,
-	 * because most of the time we'll end up not wanting to do the
-	 * fill anyway.
-	 */
+	/* Calculate current fill level, and exit if we don't need to fill */
 	fill_level = (rx_queue->added_count - rx_queue->removed_count);
 	EFX_BUG_ON_PARANOID(fill_level > EFX_RXQ_SIZE);
-
-	/* Don't fill if we don't need to */
 	if (fill_level >= rx_queue->fast_fill_trigger)
-		return 0;
+		goto out;
 
 	/* Record minimum fill level */
 	if (unlikely(fill_level < rx_queue->min_fill)) {
@@ -305,34 +344,25 @@ static int __efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue,
 		rx_queue->min_fill = fill_level;
 	}
 
-	/* Acquire RX add lock. If this lock is contended, then a fast
-	 * fill must already be in progress (e.g. in the refill
-	 * tasklet), so we don't need to do anything
-	 */
-	if (!spin_trylock_bh(&rx_queue->add_lock))
-		return -1;
-
- retry:
-	/* Recalculate current fill level now that we have the lock */
-	fill_level = (rx_queue->added_count - rx_queue->removed_count);
-	EFX_BUG_ON_PARANOID(fill_level > EFX_RXQ_SIZE);
 	space = rx_queue->fast_fill_limit - fill_level;
 	if (space < EFX_RX_BATCH)
-		goto out_unlock;
+		goto out;
 
 	EFX_TRACE(rx_queue->efx, "RX queue %d fast-filling descriptor ring from"
 		  " level %d to level %d using %s allocation\n",
 		  rx_queue->queue, fill_level, rx_queue->fast_fill_limit,
-		  rx_queue->channel->rx_alloc_push_pages ? "page" : "skb");
+		  channel->rx_alloc_push_pages ? "page" : "skb");
 
 	do {
-		for (i = 0; i < EFX_RX_BATCH; ++i) {
-			index = rx_queue->added_count & EFX_RXQ_MASK;
-			rx_buf = efx_rx_buffer(rx_queue, index);
-			rc = efx_init_rx_buffer(rx_queue, rx_buf);
-			if (unlikely(rc))
-				goto out;
-			++rx_queue->added_count;
+		if (channel->rx_alloc_push_pages)
+			rc = efx_init_rx_buffers_page(rx_queue);
+		else
+			rc = efx_init_rx_buffers_skb(rx_queue);
+		if (unlikely(rc)) {
+			/* Ensure that we don't leave the rx queue empty */
+			if (rx_queue->added_count == rx_queue->removed_count)
+				efx_schedule_slow_fill(rx_queue);
+			goto out;
 		}
 	} while ((space -= EFX_RX_BATCH) >= EFX_RX_BATCH);
@@ -341,63 +371,18 @@
 		  rx_queue->added_count - rx_queue->removed_count);
 
  out:
-	/* Send write pointer to card. */
-	efx_nic_notify_rx_desc(rx_queue);
-
-	/* If the fast fill is running inside from the refill tasklet, then
-	 * for SMP systems it may be running on a different CPU to
-	 * RX event processing, which means that the fill level may now be
-	 * out of date. */
-	if (unlikely(retry && (rc == 0)))
-		goto retry;
-
- out_unlock:
-	spin_unlock_bh(&rx_queue->add_lock);
-
-	return rc;
-}
-
-/**
- * efx_fast_push_rx_descriptors - push new RX descriptors quickly
- * @rx_queue:		RX descriptor queue
- *
- * This will aim to fill the RX descriptor queue up to
- * @rx_queue->@fast_fill_limit. If there is insufficient memory to do so,
- * it will schedule a work item to immediately continue the fast fill
- */
-void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
-{
-	int rc;
-
-	rc = __efx_fast_push_rx_descriptors(rx_queue, 0);
-	if (unlikely(rc)) {
-		/* Schedule the work item to run immediately. The hope is
-		 * that work is immediately pending to free some memory
-		 * (e.g. an RX event or TX completion)
-		 */
-		efx_schedule_slow_fill(rx_queue, 0);
-	}
+	if (rx_queue->notified_count != rx_queue->added_count)
+		efx_nic_notify_rx_desc(rx_queue);
 }
 
-void efx_rx_work(struct work_struct *data)
+void efx_rx_slow_fill(unsigned long context)
 {
-	struct efx_rx_queue *rx_queue;
-	int rc;
-
-	rx_queue = container_of(data, struct efx_rx_queue, work.work);
-
-	if (unlikely(!rx_queue->channel->enabled))
-		return;
-
-	EFX_TRACE(rx_queue->efx, "RX queue %d worker thread executing on CPU "
-		  "%d\n", rx_queue->queue, raw_smp_processor_id());
+	struct efx_rx_queue *rx_queue = (struct efx_rx_queue *)context;
+	struct efx_channel *channel = rx_queue->channel;
 
+	/* Post an event to cause NAPI to run and refill the queue */
+	efx_nic_generate_fill_event(channel);
 	++rx_queue->slow_fill_count;
-	/* Push new RX descriptors, allowing at least 1 jiffy for
-	 * the kernel to free some more memory. */
-	rc = __efx_fast_push_rx_descriptors(rx_queue, 1);
-	if (rc)
-		efx_schedule_slow_fill(rx_queue, 1);
 }
 
 static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
@@ -498,6 +483,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 		   unsigned int len, bool checksummed, bool discard)
 {
 	struct efx_nic *efx = rx_queue->efx;
+	struct efx_channel *channel = rx_queue->channel;
 	struct efx_rx_buffer *rx_buf;
 	bool leak_packet = false;
 
@@ -525,12 +511,13 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 	/* Discard packet, if instructed to do so */
 	if (unlikely(discard)) {
 		if (unlikely(leak_packet))
-			rx_queue->channel->n_skbuff_leaks++;
+			channel->n_skbuff_leaks++;
 		else
-			/* We haven't called efx_unmap_rx_buffer yet,
-			 * so fini the entire rx_buffer here */
-			efx_fini_rx_buffer(rx_queue, rx_buf);
-		return;
+			efx_recycle_rx_buffer(channel, rx_buf);
+
+		/* Don't hold off the previous receive */
+		rx_buf = NULL;
+		goto out;
 	}
 
 	/* Release card resources - assumes all RX buffers consumed in-order
@@ -547,6 +534,7 @@
 	 * prefetched into cache.
 	 */
 	rx_buf->len = len;
+out:
 	if (rx_queue->channel->rx_pkt)
 		__efx_rx_packet(rx_queue->channel,
 				rx_queue->channel->rx_pkt,
@@ -682,6 +670,7 @@ void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
 
 	EFX_LOG(rx_queue->efx, "shutting down RX queue %d\n", rx_queue->queue);
 
+	del_timer_sync(&rx_queue->slow_fill);
 	efx_nic_fini_rx(rx_queue);
 
 	/* Release RX buffers NB start at index 0 not current HW ptr */
@@ -691,16 +680,6 @@ void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
 			efx_fini_rx_buffer(rx_queue, rx_buf);
 		}
 	}
-
-	/* For a page that is part-way through splitting into RX buffers */
-	if (rx_queue->buf_page != NULL) {
-		pci_unmap_page(rx_queue->efx->pci_dev, rx_queue->buf_dma_addr,
-			       efx_rx_buf_size(rx_queue->efx),
-			       PCI_DMA_FROMDEVICE);
-		__free_pages(rx_queue->buf_page,
-			     rx_queue->efx->rx_buffer_order);
-		rx_queue->buf_page = NULL;
-	}
 }
 
 void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
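
The heart of this change is the two-buffers-per-page scheme: efx_init_rx_buffers_page() keeps a small struct efx_rx_page_state (refcount plus the page's DMA address) at the start of each page and, when rx_buffer_len fits in EFX_RX_HALF_PAGE, hands out both halves of the page as separate RX buffers; efx_resurrect_rx_buffer() later finds a buffer's partner by XOR-ing its address with PAGE_SIZE >> 1. The following standalone userspace sketch is illustrative only, not driver code: the page_state struct, the buf0/buf1 names, the fixed 4096-byte page size and the fake DMA address are all stand-ins chosen for the demo.

/* split_demo.c - illustrative sketch; build with: cc -Wall -o split_demo split_demo.c */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_PAGE_SIZE 4096UL	/* assume 4 KiB pages for the demo */

/* Stand-in for struct efx_rx_page_state: shared refcount + DMA address */
struct page_state {
	unsigned int refcnt;
	uint64_t dma_addr;
};

int main(void)
{
	/* Stand-in for alloc_pages(): one page-aligned page */
	char *page = aligned_alloc(DEMO_PAGE_SIZE, DEMO_PAGE_SIZE);
	struct page_state *state = (struct page_state *)page;
	/* Usable payload per half once the shared state is accounted for,
	 * i.e. the demo's equivalent of EFX_RX_HALF_PAGE */
	unsigned long half_payload = (DEMO_PAGE_SIZE >> 1) - sizeof(*state);

	if (page == NULL)
		return 1;

	state->refcnt = 2;		/* both halves handed out */
	state->dma_addr = 0x10000000;	/* pretend pci_map_page() result */

	/* First buffer starts just past the state header; the second sits at
	 * the same offset in the other half, so partner = addr ^ (PAGE_SIZE/2) */
	char *buf0 = page + sizeof(*state);
	char *buf1 = (char *)((uintptr_t)buf0 ^ (DEMO_PAGE_SIZE >> 1));

	printf("payload per half: %lu bytes\n", half_payload);
	printf("buf0 at page offset %lu, buf1 at page offset %lu\n",
	       (unsigned long)(buf0 - page), (unsigned long)(buf1 - page));
	printf("buf1's partner is buf0 again: %s\n",
	       (char *)((uintptr_t)buf1 ^ (DEMO_PAGE_SIZE >> 1)) == buf0 ?
	       "yes" : "no");

	free(page);
	return 0;
}

This also shows why EFX_RX_HALF_PAGE subtracts sizeof(struct efx_rx_page_state): each half of the page gives up that many bytes so the two buffers sit at mirrored offsets, which is what lets the recycle path recover a partner buffer with a single XOR instead of tracking it separately.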