From 5b057c6b1a25d57edf2b4d1e956e50936480a9ff Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 23 Jun 2006 02:06:41 -0700 Subject: [NET]: Avoid allocating skb in skb_pad First of all it is unnecessary to allocate a new skb in skb_pad since the existing one is not shared. More importantly, our hard_start_xmit interface does not allow a new skb to be allocated since that breaks requeueing. This patch uses pskb_expand_head to expand the existing skb and linearize it if needed. Actually, someone should sift through every instance of skb_pad on a non-linear skb as they do not fit the reasons why this was originally created. Incidentally, this fixes a minor bug when the skb is cloned (tcpdump, TCP, etc.). As it is skb_pad will simply write over a cloned skb. Because of the position of the write it is unlikely to cause problems but still it's best if we don't do it. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c index 1b1cb00..157eda5 100644 --- a/drivers/net/3c527.c +++ b/drivers/net/3c527.c @@ -1031,8 +1031,7 @@ static int mc32_send_packet(struct sk_buff *skb, struct net_device *dev) return 1; } - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) { + if (skb_padto(skb, ETH_ZLEN)) { netif_wake_queue(dev); return 0; } diff --git a/drivers/net/82596.c b/drivers/net/82596.c index da0c878..8a9f7d6 100644 --- a/drivers/net/82596.c +++ b/drivers/net/82596.c @@ -1070,8 +1070,7 @@ static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->len, (unsigned int)skb->data)); if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/a2065.c b/drivers/net/a2065.c index 79bb56b..71165ac 100644 --- a/drivers/net/a2065.c +++ b/drivers/net/a2065.c @@ -573,8 +573,7 @@ static int lance_start_xmit (struct sk_buff *skb, struct net_device *dev) if (len < ETH_ZLEN) { len = ETH_ZLEN; - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; } diff --git a/drivers/net/ariadne.c b/drivers/net/ariadne.c index d1b6b1f..a9bb7a4 100644 --- a/drivers/net/ariadne.c +++ b/drivers/net/ariadne.c @@ -607,8 +607,7 @@ static int ariadne_start_xmit(struct sk_buff *skb, struct net_device *dev) /* FIXME: is the 79C960 new enough to do its own padding right ? 
*/ if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; len = ETH_ZLEN; } diff --git a/drivers/net/arm/ether1.c b/drivers/net/arm/ether1.c index 36475eb..312955d 100644 --- a/drivers/net/arm/ether1.c +++ b/drivers/net/arm/ether1.c @@ -700,8 +700,7 @@ ether1_sendpacket (struct sk_buff *skb, struct net_device *dev) } if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) goto out; } diff --git a/drivers/net/arm/ether3.c b/drivers/net/arm/ether3.c index f1d5b10..0810741 100644 --- a/drivers/net/arm/ether3.c +++ b/drivers/net/arm/ether3.c @@ -518,8 +518,7 @@ ether3_sendpacket(struct sk_buff *skb, struct net_device *dev) length = (length + 1) & ~1; if (length != skb->len) { - skb = skb_padto(skb, length); - if (skb == NULL) + if (skb_padto(skb, length)) goto out; } diff --git a/drivers/net/atarilance.c b/drivers/net/atarilance.c index 442b2cb..91783a8 100644 --- a/drivers/net/atarilance.c +++ b/drivers/net/atarilance.c @@ -804,8 +804,7 @@ static int lance_start_xmit( struct sk_buff *skb, struct net_device *dev ) ++len; if (len > skb->len) { - skb = skb_padto(skb, len); - if (skb == NULL) + if (skb_padto(skb, len)) return 0; } diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c index 39f36aa..565a54f 100644 --- a/drivers/net/cassini.c +++ b/drivers/net/cassini.c @@ -2915,8 +2915,7 @@ static int cas_start_xmit(struct sk_buff *skb, struct net_device *dev) */ static int ring; - skb = skb_padto(skb, cp->min_frame_size); - if (!skb) + if (skb_padto(skb, cp->min_frame_size)) return 0; /* XXX: we need some higher-level QoS hooks to steer packets to diff --git a/drivers/net/declance.c b/drivers/net/declance.c index f130bda..d3d958e 100644 --- a/drivers/net/declance.c +++ b/drivers/net/declance.c @@ -885,8 +885,7 @@ static int lance_start_xmit(struct sk_buff *skb, struct net_device *dev) len = skblen; if (len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; len = ETH_ZLEN; } diff --git a/drivers/net/depca.c b/drivers/net/depca.c index 0941d40..e946c43d 100644 --- a/drivers/net/depca.c +++ b/drivers/net/depca.c @@ -938,11 +938,8 @@ static int depca_start_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->len < 1) goto out; - if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - goto out; - } + if (skb_padto(skb, ETH_ZLEN)) + goto out; netif_stop_queue(dev); diff --git a/drivers/net/eepro.c b/drivers/net/eepro.c index a806dfe..e70f172 100644 --- a/drivers/net/eepro.c +++ b/drivers/net/eepro.c @@ -1154,8 +1154,7 @@ static int eepro_send_packet(struct sk_buff *skb, struct net_device *dev) printk(KERN_DEBUG "%s: entering eepro_send_packet routine.\n", dev->name); if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/eexpress.c b/drivers/net/eexpress.c index 82bd356..a74b207 100644 --- a/drivers/net/eexpress.c +++ b/drivers/net/eexpress.c @@ -677,8 +677,7 @@ static int eexp_xmit(struct sk_buff *buf, struct net_device *dev) #endif if (buf->len < ETH_ZLEN) { - buf = skb_padto(buf, ETH_ZLEN); - if (buf == NULL) + if (skb_padto(buf, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/epic100.c b/drivers/net/epic100.c index 8d680ce..724d7dc 100644 --- a/drivers/net/epic100.c +++ b/drivers/net/epic100.c @@ -1027,11 +1027,8 @@ static int epic_start_xmit(struct sk_buff 
*skb, struct net_device *dev) u32 ctrl_word; unsigned long flags; - if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - return 0; - } + if (skb_padto(skb, ETH_ZLEN)) + return 0; /* Caution: the write order is important here, set the field with the "ownership" bit last. */ diff --git a/drivers/net/eth16i.c b/drivers/net/eth16i.c index b67545b..4bf76f8 100644 --- a/drivers/net/eth16i.c +++ b/drivers/net/eth16i.c @@ -1064,8 +1064,7 @@ static int eth16i_tx(struct sk_buff *skb, struct net_device *dev) unsigned long flags; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c index 247c8ca..dd1dc32 100644 --- a/drivers/net/hp100.c +++ b/drivers/net/hp100.c @@ -1487,11 +1487,8 @@ static int hp100_start_xmit_bm(struct sk_buff *skb, struct net_device *dev) if (skb->len <= 0) return 0; - if (skb->len < ETH_ZLEN && lp->chip == HP100_CHIPID_SHASTA) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - return 0; - } + if (lp->chip == HP100_CHIPID_SHASTA && skb_padto(skb, ETH_ZLEN)) + return 0; /* Get Tx ring tail pointer */ if (lp->txrtail->next == lp->txrhead) { diff --git a/drivers/net/lance.c b/drivers/net/lance.c index bb5ad47..c1c3452 100644 --- a/drivers/net/lance.c +++ b/drivers/net/lance.c @@ -968,8 +968,7 @@ static int lance_start_xmit(struct sk_buff *skb, struct net_device *dev) /* The old LANCE chips doesn't automatically pad buffers to min. size. */ if (chip_table[lp->chip_version].flags & LANCE_MUST_PAD) { if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) goto out; lp->tx_ring[entry].length = -ETH_ZLEN; } diff --git a/drivers/net/lasi_82596.c b/drivers/net/lasi_82596.c index 957888d..1ab0944 100644 --- a/drivers/net/lasi_82596.c +++ b/drivers/net/lasi_82596.c @@ -1083,8 +1083,7 @@ static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->len, skb->data)); if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/lp486e.c b/drivers/net/lp486e.c index 94d5ea1..bf3f343 100644 --- a/drivers/net/lp486e.c +++ b/drivers/net/lp486e.c @@ -877,8 +877,7 @@ static int i596_start_xmit (struct sk_buff *skb, struct net_device *dev) { length = skb->len; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 5a74f63..b983e1e 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -1939,8 +1939,7 @@ again: /* pad frames to at least ETH_ZLEN bytes */ if (unlikely(skb->len < ETH_ZLEN)) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) { + if (skb_padto(skb, ETH_ZLEN)) { /* The packet is gone, so we must * return 0 */ mgp->stats.tx_dropped += 1; diff --git a/drivers/net/pcmcia/fmvj18x_cs.c b/drivers/net/pcmcia/fmvj18x_cs.c index 09b1176..ea93b8f1 100644 --- a/drivers/net/pcmcia/fmvj18x_cs.c +++ b/drivers/net/pcmcia/fmvj18x_cs.c @@ -831,8 +831,7 @@ static int fjn_start_xmit(struct sk_buff *skb, struct net_device *dev) if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/pcmcia/xirc2ps_cs.c b/drivers/net/pcmcia/xirc2ps_cs.c index 
e80d1e3..9bae77c 100644 --- a/drivers/net/pcmcia/xirc2ps_cs.c +++ b/drivers/net/pcmcia/xirc2ps_cs.c @@ -1374,8 +1374,7 @@ do_start_xmit(struct sk_buff *skb, struct net_device *dev) */ if (pktlen < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; pktlen = ETH_ZLEN; } diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 9945cc6..985afe0 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -2222,8 +2222,7 @@ static int rtl8169_start_xmit(struct sk_buff *skb, struct net_device *dev) len = skb->len; if (unlikely(len < ETH_ZLEN)) { - skb = skb_padto(skb, ETH_ZLEN); - if (!skb) + if (skb_padto(skb, ETH_ZLEN)) goto err_update_stats; len = ETH_ZLEN; } diff --git a/drivers/net/seeq8005.c b/drivers/net/seeq8005.c index bcef03f..efd0f23 100644 --- a/drivers/net/seeq8005.c +++ b/drivers/net/seeq8005.c @@ -396,8 +396,7 @@ static int seeq8005_send_packet(struct sk_buff *skb, struct net_device *dev) unsigned char *buf; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/sis190.c b/drivers/net/sis190.c index 31dd3f0..df39f34 100644 --- a/drivers/net/sis190.c +++ b/drivers/net/sis190.c @@ -1156,8 +1156,7 @@ static int sis190_start_xmit(struct sk_buff *skb, struct net_device *dev) dma_addr_t mapping; if (unlikely(skb->len < ETH_ZLEN)) { - skb = skb_padto(skb, ETH_ZLEN); - if (!skb) { + if (skb_padto(skb, ETH_ZLEN)) { tp->stats.tx_dropped++; goto out; } diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c index 38a26df..f3efbd1 100644 --- a/drivers/net/sk98lin/skge.c +++ b/drivers/net/sk98lin/skge.c @@ -1525,7 +1525,7 @@ struct sk_buff *pMessage) /* pointer to send-message */ ** This is to resolve faulty padding by the HW with 0xaa bytes. 
*/ if (BytesSend < C_LEN_ETHERNET_MINSIZE) { - if ((pMessage = skb_padto(pMessage, C_LEN_ETHERNET_MINSIZE)) == NULL) { + if (skb_padto(pMessage, C_LEN_ETHERNET_MINSIZE)) { spin_unlock_irqrestore(&pTxPort->TxDesRingLock, Flags); return 0; } diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 536dd1c..19a4a16 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -2310,8 +2310,7 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev) u64 map; unsigned long flags; - skb = skb_padto(skb, ETH_ZLEN); - if (!skb) + if (skb_padto(skb, ETH_ZLEN)) return NETDEV_TX_OK; if (!spin_trylock_irqsave(&skge->tx_lock, flags)) diff --git a/drivers/net/smc9194.c b/drivers/net/smc9194.c index 6cf16f3..8b0321f 100644 --- a/drivers/net/smc9194.c +++ b/drivers/net/smc9194.c @@ -523,8 +523,7 @@ static int smc_wait_to_send_packet( struct sk_buff * skb, struct net_device * de length = skb->len; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) { + if (skb_padto(skb, ETH_ZLEN)) { netif_wake_queue(dev); return 0; } diff --git a/drivers/net/sonic.c b/drivers/net/sonic.c index 90b818a..cab0dd9 100644 --- a/drivers/net/sonic.c +++ b/drivers/net/sonic.c @@ -231,8 +231,7 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev) length = skb->len; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c index 9b7805b..c158eed 100644 --- a/drivers/net/starfire.c +++ b/drivers/net/starfire.c @@ -1349,8 +1349,7 @@ static int start_tx(struct sk_buff *skb, struct net_device *dev) #if defined(ZEROCOPY) && defined(HAS_BROKEN_FIRMWARE) if (skb->ip_summed == CHECKSUM_HW) { - skb = skb_padto(skb, (skb->len + PADDING_MASK) & ~PADDING_MASK); - if (skb == NULL) + if (skb_padto(skb, (skb->len + PADDING_MASK) & ~PADDING_MASK)) return NETDEV_TX_OK; } #endif /* ZEROCOPY && HAS_BROKEN_FIRMWARE */ diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c index fdc2103..c80a4f1 100644 --- a/drivers/net/via-rhine.c +++ b/drivers/net/via-rhine.c @@ -1284,11 +1284,8 @@ static int rhine_start_tx(struct sk_buff *skb, struct net_device *dev) /* Calculate the next Tx descriptor entry. */ entry = rp->cur_tx % TX_RING_SIZE; - if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - return 0; - } + if (skb_padto(skb, ETH_ZLEN)) + return 0; rp->tx_skbuff[entry] = skb; diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index 879eb42..a915fe6 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -924,8 +924,7 @@ static int ray_dev_start_xmit(struct sk_buff *skb, struct net_device *dev) if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c index f7724eb..561250f 100644 --- a/drivers/net/wireless/wavelan_cs.c +++ b/drivers/net/wireless/wavelan_cs.c @@ -3194,11 +3194,8 @@ wavelan_packet_xmit(struct sk_buff * skb, * and we don't have the Ethernet specific requirement of beeing * able to detect collisions, therefore in theory we don't really * need to pad. 
Jean II */ - if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - return 0; - } + if (skb_padto(skb, ETH_ZLEN)) + return 0; wv_packet_write(dev, skb->data, skb->len); diff --git a/drivers/net/yellowfin.c b/drivers/net/yellowfin.c index fd0f43b..ecec8e5 100644 --- a/drivers/net/yellowfin.c +++ b/drivers/net/yellowfin.c @@ -862,13 +862,11 @@ static int yellowfin_start_xmit(struct sk_buff *skb, struct net_device *dev) /* Fix GX chipset errata. */ if (cacheline_end > 24 || cacheline_end == 0) { len = skb->len + 32 - cacheline_end + 1; - if (len != skb->len) - skb = skb_padto(skb, len); - } - if (skb == NULL) { - yp->tx_skbuff[entry] = NULL; - netif_wake_queue(dev); - return 0; + if (skb_padto(skb, len)) { + yp->tx_skbuff[entry] = NULL; + netif_wake_queue(dev); + return 0; + } } } yp->tx_skbuff[entry] = skb; diff --git a/drivers/net/znet.c b/drivers/net/znet.c index 3ac047b..a7c089d 100644 --- a/drivers/net/znet.c +++ b/drivers/net/znet.c @@ -544,8 +544,7 @@ static int znet_send_packet(struct sk_buff *skb, struct net_device *dev) printk(KERN_DEBUG "%s: ZNet_send_packet.\n", dev->name); if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 66f8819..f8c7eb7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -345,7 +345,7 @@ extern struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, extern struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t priority); -extern struct sk_buff * skb_pad(struct sk_buff *skb, int pad); +extern int skb_pad(struct sk_buff *skb, int pad); #define dev_kfree_skb(a) kfree_skb(a) extern void skb_over_panic(struct sk_buff *skb, int len, void *here); @@ -1122,16 +1122,15 @@ static inline int skb_cow(struct sk_buff *skb, unsigned int headroom) * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it - * is untouched. Returns the buffer, which may be a replacement - * for the original, or NULL for out of memory - in which case - * the original buffer is still freed. + * is untouched. Otherwise it is extended. Returns zero on + * success. The skb is freed on error. */ -static inline struct sk_buff *skb_padto(struct sk_buff *skb, unsigned int len) +static inline int skb_padto(struct sk_buff *skb, unsigned int len) { unsigned int size = skb->len; if (likely(size >= len)) - return skb; + return 0; return skb_pad(skb, len-size); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bb7210f..fe63d4e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -781,24 +781,40 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * - * May return NULL in out of memory cases. + * May return error in out of memory cases. The skb is freed on error. */ -struct sk_buff *skb_pad(struct sk_buff *skb, int pad) +int skb_pad(struct sk_buff *skb, int pad) { - struct sk_buff *nskb; + int err; + int ntail; /* If the skbuff is non linear tailroom is always zero.. 
*/ - if (skb_tailroom(skb) >= pad) { + if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { memset(skb->data+skb->len, 0, pad); - return skb; + return 0; } - - nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC); + + ntail = skb->data_len + pad - (skb->end - skb->tail); + if (likely(skb_cloned(skb) || ntail > 0)) { + err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); + if (unlikely(err)) + goto free_skb; + } + + /* FIXME: The use of this function with non-linear skb's really needs + * to be audited. + */ + err = skb_linearize(skb); + if (unlikely(err)) + goto free_skb; + + memset(skb->data + skb->len, 0, pad); + return 0; + +free_skb: kfree_skb(skb); - if (nskb) - memset(nskb->data+nskb->len, 0, pad); - return nskb; + return err; } /* Trims skb to length len. It can change skb pointers. -- cgit v0.10.2 From 102128e3a27821bdcbacb10f4f2bba253f587ba4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stelmach?= Date: Thu, 22 Jun 2006 01:37:19 -0700 Subject: [IPV6]: Fix source address selection. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two additional labels (RFC 3484, sec. 10.3) for IPv6 addreses are defined to make a distinction between global unicast addresses and Unique Local Addresses (fc00::/7, RFC 4193) and Teredo (2001::/32, RFC 4380). It is necessary to avoid attempts of connection that would either fail (eg. fec0:: to 2001:feed::) or be sub-optimal (2001:0:: to 2001:feed::). Signed-off-by: Ɓukasz Stelmach Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c2c26fa..6b361fc 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -862,6 +862,8 @@ static int inline ipv6_saddr_label(const struct in6_addr *addr, int type) * 2002::/16 2 * ::/96 3 * ::ffff:0:0/96 4 + * fc00::/7 5 + * 2001::/32 6 */ if (type & IPV6_ADDR_LOOPBACK) return 0; @@ -869,8 +871,12 @@ static int inline ipv6_saddr_label(const struct in6_addr *addr, int type) return 3; else if (type & IPV6_ADDR_MAPPED) return 4; + else if (addr->s6_addr32[0] == htonl(0x20010000)) + return 6; else if (addr->s6_addr16[0] == htons(0x2002)) return 2; + else if ((addr->s6_addr[0] & 0xfe) == 0xfc) + return 5; return 1; } -- cgit v0.10.2 From 5e2707fa3aed8c24075087cbaea2628725adbe55 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Thu, 22 Jun 2006 01:41:18 -0700 Subject: [IPV6] ADDRCONF: Fix default source address selection without CONFIG_IPV6_PRIVACY We need to update hiscore.rule even if we don't enable CONFIG_IPV6_PRIVACY, because we have more less significant rule; longest match. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6b361fc..4da6645 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1075,6 +1075,9 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY) continue; } +#else + if (hiscore.rule < 7) + hiscore.rule++; #endif /* Rule 8: Use longest matching prefix */ if (hiscore.rule < 8) { -- cgit v0.10.2 From d4828d85d188dc70ed172802e798d3978bb6e29e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 02:28:18 -0700 Subject: [NET]: Prevent transmission after dev_deactivate The dev_deactivate function has bit-rotted since the introduction of lockless drivers. In particular, the spin_unlock_wait call at the end has no effect on the xmit routine of lockless drivers. 
With a little bit of work, we can make it much more useful by providing the guarantee that when it returns, no more calls to the xmit routine of the underlying driver will be made. The idea is simple. There are two entry points in to the xmit routine. The first comes from dev_queue_xmit. That one is easily stopped by using synchronize_rcu. This works because we set the qdisc to noop_qdisc before the synchronize_rcu call. That in turn causes all subsequent packets sent to dev_queue_xmit to be dropped. The synchronize_rcu call also ensures all outstanding calls leave their critical section. The other entry point is from qdisc_run. Since we now have a bit that indicates whether it's running, all we have to do is to wait until the bit is off. I've removed the loop to wait for __LINK_STATE_SCHED to clear. This is useless because netif_wake_queue can cause it to be set again. It is also harmless because we've disarmed qdisc_run. I've also removed the spin_unlock_wait on xmit_lock because its only purpose of making sure that all outstanding xmit_lock holders have exited is also given by dev_watchdog_down. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/net/core/dev.c b/net/core/dev.c index ab39fe1..29e3888 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1295,7 +1295,7 @@ int dev_queue_xmit(struct sk_buff *skb) /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ - local_bh_disable(); + rcu_read_lock_bh(); /* Updates of qdisc are serialized by queue_lock. * The struct Qdisc which is pointed to by qdisc is now a @@ -1369,13 +1369,13 @@ int dev_queue_xmit(struct sk_buff *skb) } rc = -ENETDOWN; - local_bh_enable(); + rcu_read_unlock_bh(); out_kfree_skb: kfree_skb(skb); return rc; out: - local_bh_enable(); + rcu_read_unlock_bh(); return rc; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index d7aca8e..7aad012 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -181,9 +181,13 @@ requeue: void __qdisc_run(struct net_device *dev) { + if (unlikely(dev->qdisc == &noop_qdisc)) + goto out; + while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev)) /* NOTHING */; +out: clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state); } @@ -583,10 +587,12 @@ void dev_deactivate(struct net_device *dev) dev_watchdog_down(dev); - while (test_bit(__LINK_STATE_SCHED, &dev->state)) - yield(); + /* Wait for outstanding dev_queue_xmit calls. */ + synchronize_rcu(); - spin_unlock_wait(&dev->_xmit_lock); + /* Wait for outstanding qdisc_run calls. */ + while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state)) + yield(); } void dev_init_scheduler(struct net_device *dev) -- cgit v0.10.2 From 7967168cefdbc63bf332d6b1548eca7cd65ebbcc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 02:40:14 -0700 Subject: [NET]: Merge TSO/UFO fields in sk_buff Having separate fields in sk_buff for TSO/UFO (tso_size/ufo_size) is not going to scale if we add any more segmentation methods (e.g., DCCP). So let's merge them. They were used to tell the protocol of a packet. This function has been subsumed by the new gso_type field. This is essentially a set of netdev feature bits (shifted by 16 bits) that are required to process a specific skb. As such it's easy to tell whether a given device can process a GSO skb: you just have to and the gso_type field and the netdev's features field. I've made gso_type a conjunction. The idea is that you have a base type (e.g., SKB_GSO_TCPV4) that can be modified further to support new features. 
For example, if we add a hardware TSO type that supports ECN, they would declare NETIF_F_TSO | NETIF_F_TSO_ECN. All TSO packets with CWR set would have a gso_type of SKB_GSO_TCPV4 | SKB_GSO_TCPV4_ECN while all other TSO packets would be SKB_GSO_TCPV4. This means that only the CWR packets need to be emulated in software. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c index a26077a..0cdc830 100644 --- a/drivers/net/8139cp.c +++ b/drivers/net/8139cp.c @@ -797,7 +797,7 @@ static int cp_start_xmit (struct sk_buff *skb, struct net_device *dev) entry = cp->tx_head; eor = (entry == (CP_TX_RING_SIZE - 1)) ? RingEnd : 0; if (dev->features & NETIF_F_TSO) - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (skb_shinfo(skb)->nr_frags == 0) { struct cp_desc *txd = &cp->tx_ring[entry]; diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 702d546..7635736 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -1640,7 +1640,7 @@ bnx2_tx_int(struct bnx2 *bp) skb = tx_buf->skb; #ifdef BCM_TSO /* partial BD completions possible with TSO packets */ - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { u16 last_idx, last_ring_idx; last_idx = sw_cons + @@ -4428,7 +4428,7 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) (TX_BD_FLAGS_VLAN_TAG | (vlan_tx_tag_get(skb) << 16)); } #ifdef BCM_TSO - if ((mss = skb_shinfo(skb)->tso_size) && + if ((mss = skb_shinfo(skb)->gso_size) && (skb->len > (bp->dev->mtu + ETH_HLEN))) { u32 tcp_opt_len, ip_tcp_len; diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c index 4391bf4..53efff6 100644 --- a/drivers/net/chelsio/sge.c +++ b/drivers/net/chelsio/sge.c @@ -1418,7 +1418,7 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev) struct cpl_tx_pkt *cpl; #ifdef NETIF_F_TSO - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { int eth_type; struct cpl_tx_pkt_lso *hdr; @@ -1433,7 +1433,7 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev) hdr->ip_hdr_words = skb->nh.iph->ihl; hdr->tcp_hdr_words = skb->h.th->doff; hdr->eth_type_mss = htons(MK_ETH_TYPE_MSS(eth_type, - skb_shinfo(skb)->tso_size)); + skb_shinfo(skb)->gso_size)); hdr->len = htonl(skb->len - sizeof(*hdr)); cpl = (struct cpl_tx_pkt *)hdr; sge->stats.tx_lso_pkts++; diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index a373ccb..32b7d44 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -2394,7 +2394,7 @@ e1000_tso(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, uint8_t ipcss, ipcso, tucss, tucso, hdr_len; int err; - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { if (skb_header_cloned(skb)) { err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); if (err) @@ -2402,7 +2402,7 @@ e1000_tso(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, } hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2)); - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (skb->protocol == htons(ETH_P_IP)) { skb->nh.iph->tot_len = 0; skb->nh.iph->check = 0; @@ -2519,7 +2519,7 @@ e1000_tx_map(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, * tso gets written back prematurely before the data is fully * DMA'd to the controller */ if (!skb->data_len && tx_ring->last_tx_tso && - !skb_shinfo(skb)->tso_size) { + !skb_shinfo(skb)->gso_size) { tx_ring->last_tx_tso = 0; size -= 4; } @@ -2757,7 +2757,7 @@ e1000_xmit_frame(struct sk_buff *skb, 
struct net_device *netdev) } #ifdef NETIF_F_TSO - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; /* The controller does a simple calculation to * make sure there is enough room in the FIFO before * initiating the DMA for each buffer. The calc is: @@ -2807,7 +2807,7 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) #ifdef NETIF_F_TSO /* Controller Erratum workaround */ if (!skb->data_len && tx_ring->last_tx_tso && - !skb_shinfo(skb)->tso_size) + !skb_shinfo(skb)->gso_size) count++; #endif diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index 191383d..21be4fa 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -1495,8 +1495,8 @@ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev) np->tx_skbuff[nr] = skb; #ifdef NETIF_F_TSO - if (skb_shinfo(skb)->tso_size) - tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->tso_size << NV_TX2_TSO_SHIFT); + if (skb_shinfo(skb)->gso_size) + tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->gso_size << NV_TX2_TSO_SHIFT); else #endif tx_flags_extra = (skb->ip_summed == CHECKSUM_HW ? (NV_TX2_CHECKSUM_L3|NV_TX2_CHECKSUM_L4) : 0); diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c index 57006fb..8bb32f9 100644 --- a/drivers/net/ixgb/ixgb_main.c +++ b/drivers/net/ixgb/ixgb_main.c @@ -1173,7 +1173,7 @@ ixgb_tso(struct ixgb_adapter *adapter, struct sk_buff *skb) uint16_t ipcse, tucse, mss; int err; - if(likely(skb_shinfo(skb)->tso_size)) { + if(likely(skb_shinfo(skb)->gso_size)) { if (skb_header_cloned(skb)) { err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); if (err) @@ -1181,7 +1181,7 @@ ixgb_tso(struct ixgb_adapter *adapter, struct sk_buff *skb) } hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2)); - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; skb->nh.iph->tot_len = 0; skb->nh.iph->check = 0; skb->h.th->check = ~csum_tcpudp_magic(skb->nh.iph->saddr, diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index b79d6e8..43fef7d 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -74,7 +74,7 @@ static void emulate_large_send_offload(struct sk_buff *skb) struct iphdr *iph = skb->nh.iph; struct tcphdr *th = (struct tcphdr*)(skb->nh.raw + (iph->ihl * 4)); unsigned int doffset = (iph->ihl + th->doff) * 4; - unsigned int mtu = skb_shinfo(skb)->tso_size + doffset; + unsigned int mtu = skb_shinfo(skb)->gso_size + doffset; unsigned int offset = 0; u32 seq = ntohl(th->seq); u16 id = ntohs(iph->id); @@ -139,7 +139,7 @@ static int loopback_xmit(struct sk_buff *skb, struct net_device *dev) #endif #ifdef LOOPBACK_TSO - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { BUG_ON(skb->protocol != htons(ETH_P_IP)); BUG_ON(skb->nh.iph->protocol != IPPROTO_TCP); diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index b983e1e..dbdf189 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -1879,7 +1879,7 @@ again: #ifdef NETIF_F_TSO if (skb->len > (dev->mtu + ETH_HLEN)) { - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (mss != 0) max_segments = MYRI10GE_MAX_SEND_DESC_TSO; } @@ -2112,7 +2112,7 @@ abort_linearize: } idx = (idx + 1) & tx->mask; } while (idx != last_idx); - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { printk(KERN_ERR "myri10ge: %s: TSO but wanted to linearize?!?!?\n", mgp->dev->name); diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 985afe0..12d1cb2 100644 --- a/drivers/net/r8169.c 
+++ b/drivers/net/r8169.c @@ -2172,7 +2172,7 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb, static inline u32 rtl8169_tso_csum(struct sk_buff *skb, struct net_device *dev) { if (dev->features & NETIF_F_TSO) { - u32 mss = skb_shinfo(skb)->tso_size; + u32 mss = skb_shinfo(skb)->gso_size; if (mss) return LargeSend | ((mss & MSSMask) << MSSShift); diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c index 11daed4..3defe5d 100644 --- a/drivers/net/s2io.c +++ b/drivers/net/s2io.c @@ -3959,8 +3959,8 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) txdp->Control_1 = 0; txdp->Control_2 = 0; #ifdef NETIF_F_TSO - mss = skb_shinfo(skb)->tso_size; - if (mss) { + mss = skb_shinfo(skb)->gso_size; + if (skb_shinfo(skb)->gso_type == SKB_GSO_TCPV4) { txdp->Control_1 |= TXD_TCP_LSO_EN; txdp->Control_1 |= TXD_TCP_LSO_MSS(mss); } @@ -3980,10 +3980,10 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) } frg_len = skb->len - skb->data_len; - if (skb_shinfo(skb)->ufo_size) { + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) { int ufo_size; - ufo_size = skb_shinfo(skb)->ufo_size; + ufo_size = skb_shinfo(skb)->gso_size; ufo_size &= ~7; txdp->Control_1 |= TXD_UFO_EN; txdp->Control_1 |= TXD_UFO_MSS(ufo_size); @@ -4009,7 +4009,7 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) txdp->Host_Control = (unsigned long) skb; txdp->Control_1 |= TXD_BUFFER0_SIZE(frg_len); - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) txdp->Control_1 |= TXD_UFO_EN; frg_cnt = skb_shinfo(skb)->nr_frags; @@ -4024,12 +4024,12 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) (sp->pdev, frag->page, frag->page_offset, frag->size, PCI_DMA_TODEVICE); txdp->Control_1 = TXD_BUFFER0_SIZE(frag->size); - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) txdp->Control_1 |= TXD_UFO_EN; } txdp->Control_1 |= TXD_GATHER_CODE_LAST; - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) frg_cnt++; /* as Txd0 was used for inband header */ tx_fifo = mac_control->tx_FIFO_start[queue]; @@ -4043,7 +4043,7 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) if (mss) val64 |= TX_FIFO_SPECIAL_FUNC; #endif - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) val64 |= TX_FIFO_SPECIAL_FUNC; writeq(val64, &tx_fifo->List_Control); diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index fba1e4d4..d357787 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -1160,7 +1160,7 @@ static unsigned tx_le_req(const struct sk_buff *skb) count = sizeof(dma_addr_t) / sizeof(u32); count += skb_shinfo(skb)->nr_frags * count; - if (skb_shinfo(skb)->tso_size) + if (skb_shinfo(skb)->gso_size) ++count; if (skb->ip_summed == CHECKSUM_HW) @@ -1232,7 +1232,7 @@ static int sky2_xmit_frame(struct sk_buff *skb, struct net_device *dev) } /* Check for TCP Segmentation Offload */ - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (mss != 0) { /* just drop the packet if non-linear expansion fails */ if (skb_header_cloned(skb) && diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index b2ddd45..e3e380f 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -3780,7 +3780,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) #if TG3_TSO_SUPPORT != 0 mss = 0; if (skb->len > (tp->dev->mtu + ETH_HLEN) && - (mss = skb_shinfo(skb)->tso_size) != 0) { + (mss = skb_shinfo(skb)->gso_size) != 0) { int 
tcp_opt_len, ip_tcp_len; if (skb_header_cloned(skb) && @@ -3905,7 +3905,7 @@ static int tg3_start_xmit_dma_bug(struct sk_buff *skb, struct net_device *dev) #if TG3_TSO_SUPPORT != 0 mss = 0; if (skb->len > (tp->dev->mtu + ETH_HLEN) && - (mss = skb_shinfo(skb)->tso_size) != 0) { + (mss = skb_shinfo(skb)->gso_size) != 0) { int tcp_opt_len, ip_tcp_len; if (skb_header_cloned(skb) && diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c index d9258d4..e49e8b5 100644 --- a/drivers/net/typhoon.c +++ b/drivers/net/typhoon.c @@ -340,7 +340,7 @@ enum state_values { #endif #if defined(NETIF_F_TSO) -#define skb_tso_size(x) (skb_shinfo(x)->tso_size) +#define skb_tso_size(x) (skb_shinfo(x)->gso_size) #define TSO_NUM_DESCRIPTORS 2 #define TSO_OFFLOAD_ON TYPHOON_OFFLOAD_TCP_SEGMENT #else diff --git a/drivers/s390/net/qeth_eddp.c b/drivers/s390/net/qeth_eddp.c index 0bab60a..38aad83 100644 --- a/drivers/s390/net/qeth_eddp.c +++ b/drivers/s390/net/qeth_eddp.c @@ -420,7 +420,7 @@ __qeth_eddp_fill_context_tcp(struct qeth_eddp_context *ctx, } tcph = eddp->skb->h.th; while (eddp->skb_offset < eddp->skb->len) { - data_len = min((int)skb_shinfo(eddp->skb)->tso_size, + data_len = min((int)skb_shinfo(eddp->skb)->gso_size, (int)(eddp->skb->len - eddp->skb_offset)); /* prepare qdio hdr */ if (eddp->qh.hdr.l2.id == QETH_HEADER_TYPE_LAYER2){ @@ -515,20 +515,20 @@ qeth_eddp_calc_num_pages(struct qeth_eddp_context *ctx, struct sk_buff *skb, QETH_DBF_TEXT(trace, 5, "eddpcanp"); /* can we put multiple skbs in one page? */ - skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->tso_size + hdr_len); + skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->gso_size + hdr_len); if (skbs_per_page > 1){ - ctx->num_pages = (skb_shinfo(skb)->tso_segs + 1) / + ctx->num_pages = (skb_shinfo(skb)->gso_segs + 1) / skbs_per_page + 1; ctx->elements_per_skb = 1; } else { /* no -> how many elements per skb? */ - ctx->elements_per_skb = (skb_shinfo(skb)->tso_size + hdr_len + + ctx->elements_per_skb = (skb_shinfo(skb)->gso_size + hdr_len + PAGE_SIZE) >> PAGE_SHIFT; ctx->num_pages = ctx->elements_per_skb * - (skb_shinfo(skb)->tso_segs + 1); + (skb_shinfo(skb)->gso_segs + 1); } ctx->num_elements = ctx->elements_per_skb * - (skb_shinfo(skb)->tso_segs + 1); + (skb_shinfo(skb)->gso_segs + 1); } static inline struct qeth_eddp_context * diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c index 9e671a4..56009d7 100644 --- a/drivers/s390/net/qeth_main.c +++ b/drivers/s390/net/qeth_main.c @@ -4417,7 +4417,7 @@ qeth_send_packet(struct qeth_card *card, struct sk_buff *skb) struct qeth_eddp_context *ctx = NULL; int tx_bytes = skb->len; unsigned short nr_frags = skb_shinfo(skb)->nr_frags; - unsigned short tso_size = skb_shinfo(skb)->tso_size; + unsigned short tso_size = skb_shinfo(skb)->gso_size; int rc; QETH_DBF_TEXT(trace, 6, "sendpkt"); @@ -4453,7 +4453,7 @@ qeth_send_packet(struct qeth_card *card, struct sk_buff *skb) queue = card->qdio.out_qs [qeth_get_priority_queue(card, skb, ipv, cast_type)]; - if (skb_shinfo(skb)->tso_size) + if (skb_shinfo(skb)->gso_size) large_send = card->options.large_send; /*are we able to do TSO ? 
If so ,prepare and send it from here */ diff --git a/drivers/s390/net/qeth_tso.h b/drivers/s390/net/qeth_tso.h index 24ef40c..593f298 100644 --- a/drivers/s390/net/qeth_tso.h +++ b/drivers/s390/net/qeth_tso.h @@ -51,7 +51,7 @@ qeth_tso_fill_header(struct qeth_card *card, struct sk_buff *skb) hdr->ext.hdr_version = 1; hdr->ext.hdr_len = 28; /*insert non-fix values */ - hdr->ext.mss = skb_shinfo(skb)->tso_size; + hdr->ext.mss = skb_shinfo(skb)->gso_size; hdr->ext.dg_hdr_len = (__u16)(iph->ihl*4 + tcph->doff*4); hdr->ext.payload_len = (__u16)(skb->len - hdr->ext.dg_hdr_len - sizeof(struct qeth_hdr_tso)); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cead6be..fa56713 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -308,9 +308,12 @@ struct net_device #define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */ #define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */ #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */ -#define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */ #define NETIF_F_LLTX 4096 /* LockLess TX */ -#define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/ + + /* Segmentation offload features */ +#define NETIF_F_GSO_SHIFT 16 +#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT) +#define NETIF_F_UFO (SKB_GSO_UDPV4 << NETIF_F_GSO_SHIFT) #define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) #define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM) @@ -979,6 +982,13 @@ extern void dev_seq_stop(struct seq_file *seq, void *v); extern void linkwatch_run_queue(void); +static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) +{ + int feature = skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT; + return skb_shinfo(skb)->gso_size && + (dev->features & feature) != feature; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f8c7eb7..97b0d2d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -134,9 +134,10 @@ struct skb_frag_struct { struct skb_shared_info { atomic_t dataref; unsigned short nr_frags; - unsigned short tso_size; - unsigned short tso_segs; - unsigned short ufo_size; + unsigned short gso_size; + /* Warning: this field is not always filled in (UFO)! */ + unsigned short gso_segs; + unsigned short gso_type; unsigned int ip6_frag_id; struct sk_buff *frag_list; skb_frag_t frags[MAX_SKB_FRAGS]; @@ -168,6 +169,11 @@ enum { SKB_FCLONE_CLONE, }; +enum { + SKB_GSO_TCPV4 = 1 << 0, + SKB_GSO_UDPV4 = 1 << 1, +}; + /** * struct sk_buff - socket buffer * @next: Next buffer in list diff --git a/include/net/tcp.h b/include/net/tcp.h index 5f4eb5c..b197a9e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -569,13 +569,13 @@ struct tcp_skb_cb { */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { - return skb_shinfo(skb)->tso_segs; + return skb_shinfo(skb)->gso_segs; } /* This is valid iff tcp_skb_pcount() > 1. 
*/ static inline int tcp_skb_mss(const struct sk_buff *skb) { - return skb_shinfo(skb)->tso_size; + return skb_shinfo(skb)->gso_size; } static inline void tcp_dec_pcount_approx(__u32 *count, diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 0dca027..8be9f21 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -34,8 +34,8 @@ static inline unsigned packet_length(const struct sk_buff *skb) int br_dev_queue_push_xmit(struct sk_buff *skb) { - /* drop mtu oversized packets except tso */ - if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->tso_size) + /* drop mtu oversized packets except gso */ + if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->gso_size) kfree_skb(skb); else { #ifdef CONFIG_BRIDGE_NETFILTER diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 3e41f9d..8298a51 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -761,7 +761,7 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP) && skb->len > skb->dev->mtu && - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + !skb_shinfo(skb)->gso_size) return ip_fragment(skb, br_dev_queue_push_xmit); else return br_dev_queue_push_xmit(skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fe63d4e..368d985 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -172,9 +172,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo = skb_shinfo(skb); atomic_set(&shinfo->dataref, 1); shinfo->nr_frags = 0; - shinfo->tso_size = 0; - shinfo->tso_segs = 0; - shinfo->ufo_size = 0; + shinfo->gso_size = 0; + shinfo->gso_segs = 0; + shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; shinfo->frag_list = NULL; @@ -238,8 +238,9 @@ struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, atomic_set(&(skb_shinfo(skb)->dataref), 1); skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->tso_size = 0; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_segs = 0; + skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->frag_list = NULL; out: return skb; @@ -528,8 +529,9 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif skb_copy_secmark(new, old); atomic_set(&new->users, 1); - skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; - skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; + skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; + skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; + skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } /** diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8538aac..7624fd1 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -210,8 +210,7 @@ static inline int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > dst_mtu(skb->dst) && - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size) return ip_fragment(skb, ip_finish_output2); else return ip_finish_output2(skb); @@ -362,7 +361,7 @@ packet_routed: } ip_select_ident_more(iph, &rt->u.dst, sk, - (skb_shinfo(skb)->tso_segs ?: 1) - 1); + (skb_shinfo(skb)->gso_segs ?: 1) - 1); /* Add an IP checksum. 
*/ ip_send_check(iph); @@ -744,7 +743,8 @@ static inline int ip_ufo_append_data(struct sock *sk, (length - transhdrlen)); if (!err) { /* specify the length of each IP datagram fragment*/ - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; __skb_queue_tail(&sk->sk_write_queue, skb); return 0; @@ -1087,14 +1087,16 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, inet->cork.length += size; if ((sk->sk_protocol == IPPROTO_UDP) && - (rt->u.dst.dev->features & NETIF_F_UFO)) - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + (rt->u.dst.dev->features & NETIF_F_UFO)) { + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; + } while (size > 0) { int i; - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_size) len = size; else { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 74998f2..062dd1a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -571,7 +571,7 @@ new_segment: skb->ip_summed = CHECKSUM_HW; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_segs = 0; if (!copied) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; @@ -818,7 +818,7 @@ new_segment: tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_segs = 0; from += copy; copied += copy; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e08245b..94fe5b1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1073,7 +1073,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ else pkt_len = (end_seq - TCP_SKB_CB(skb)->seq); - if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size)) + if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size)) break; pcount = tcp_skb_pcount(skb); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 07bb5a2..bdd71db 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -515,15 +515,17 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned /* Avoid the costly divide in the normal * non-TSO case. 
*/ - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; } else { unsigned int factor; factor = skb->len + (mss_now - 1); factor /= mss_now; - skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = mss_now; + skb_shinfo(skb)->gso_segs = factor; + skb_shinfo(skb)->gso_size = mss_now; + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; } } @@ -914,7 +916,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int if (!tso_segs || (tso_segs > 1 && - skb_shinfo(skb)->tso_size != mss_now)) { + tcp_skb_mss(skb) != mss_now)) { tcp_set_skb_tso_segs(sk, skb, mss_now); tso_segs = tcp_skb_pcount(skb); } @@ -1724,8 +1726,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; } @@ -1930,8 +1933,9 @@ void tcp_send_fin(struct sock *sk) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ TCP_SKB_CB(skb)->seq = tp->write_seq; @@ -1963,8 +1967,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* Send it off. */ TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); @@ -2047,8 +2052,9 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ @@ -2152,8 +2158,9 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; TCP_ECN_send_syn(sk, tp, buff); TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->tso_segs = 1; - skb_shinfo(buff)->tso_size = 0; + skb_shinfo(buff)->gso_segs = 1; + skb_shinfo(buff)->gso_size = 0; + skb_shinfo(buff)->gso_type = 0; buff->csum = 0; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; @@ -2257,8 +2264,9 @@ void tcp_send_ack(struct sock *sk) buff->csum = 0; TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->tso_segs = 1; - skb_shinfo(buff)->tso_size = 0; + skb_shinfo(buff)->gso_segs = 1; + skb_shinfo(buff)->gso_size = 0; + skb_shinfo(buff)->gso_type = 0; /* Send it off, this clears delayed acks for us. 
*/ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); @@ -2293,8 +2301,9 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) skb->csum = 0; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = urgent; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d29620f..abb94de 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -148,7 +148,7 @@ static int ip6_output2(struct sk_buff *skb) int ip6_output(struct sk_buff *skb) { - if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) || + if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size) || dst_allfrag(skb->dst)) return ip6_fragment(skb, ip6_output2); else @@ -833,8 +833,9 @@ static inline int ip6_ufo_append_data(struct sock *sk, struct frag_hdr fhdr; /* specify the length of each IP datagram fragment*/ - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) - - sizeof(struct frag_hdr); + skb_shinfo(skb)->gso_size = mtu - fragheaderlen - + sizeof(struct frag_hdr); + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; ipv6_select_ident(skb, &fhdr); skb_shinfo(skb)->ip6_frag_id = fhdr.identification; __skb_queue_tail(&sk->sk_write_queue, skb); -- cgit v0.10.2 From f6a78bfcb141f963187464bac838d46a81c3882a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 02:57:17 -0700 Subject: [NET]: Add generic segmentation offload This patch adds the infrastructure for generic segmentation offload. The idea is to tap into the potential savings of TSO without hardware support by postponing the allocation of segmented skb's until just before the entry point into the NIC driver. The same structure can be used to support software IPv6 TSO, as well as UFO and segmentation offload for other relevant protocols, e.g., DCCP. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fa56713..b4eae18 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -405,6 +405,9 @@ struct net_device struct list_head qdisc_list; unsigned long tx_queue_len; /* Max frames per queue allowed */ + /* Partially transmitted GSO packet. 
*/ + struct sk_buff *gso_skb; + /* ingress path synchronizer */ spinlock_t ingress_lock; struct Qdisc *qdisc_ingress; @@ -539,6 +542,7 @@ struct packet_type { struct net_device *, struct packet_type *, struct net_device *); + struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg); void *af_packet_priv; struct list_head list; }; @@ -689,7 +693,8 @@ extern int dev_change_name(struct net_device *, char *); extern int dev_set_mtu(struct net_device *, int); extern int dev_set_mac_address(struct net_device *, struct sockaddr *); -extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); +extern int dev_hard_start_xmit(struct sk_buff *skb, + struct net_device *dev); extern void dev_init(void); @@ -963,6 +968,7 @@ extern int netdev_max_backlog; extern int weight_p; extern int netdev_set_master(struct net_device *dev, struct net_device *master); extern int skb_checksum_help(struct sk_buff *skb, int inward); +extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg); #ifdef CONFIG_BUG extern void netdev_rx_csum_fault(struct net_device *dev); #else diff --git a/net/core/dev.c b/net/core/dev.c index 29e3888..d293e0f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -116,6 +116,7 @@ #include #include #include +#include /* * The list of packet types we will receive (as opposed to discard) @@ -1048,7 +1049,7 @@ static inline void net_timestamp(struct sk_buff *skb) * taps currently in use. */ -void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; @@ -1186,6 +1187,40 @@ out: return ret; } +/** + * skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @sg: whether scatter-gather is supported on the target. + * + * This function segments the given skb and returns a list of segments. + */ +struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_type *ptype; + int type = skb->protocol; + + BUG_ON(skb_shinfo(skb)->frag_list); + BUG_ON(skb->ip_summed != CHECKSUM_HW); + + skb->mac.raw = skb->data; + skb->mac_len = skb->nh.raw - skb->data; + __skb_pull(skb, skb->mac_len); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) { + if (ptype->type == type && !ptype->dev && ptype->gso_segment) { + segs = ptype->gso_segment(skb, sg); + break; + } + } + rcu_read_unlock(); + + return segs; +} + +EXPORT_SYMBOL(skb_gso_segment); + /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG void netdev_rx_csum_fault(struct net_device *dev) @@ -1222,6 +1257,86 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) #define illegal_highdma(dev, skb) (0) #endif +struct dev_gso_cb { + void (*destructor)(struct sk_buff *skb); +}; + +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) + +static void dev_gso_skb_destructor(struct sk_buff *skb) +{ + struct dev_gso_cb *cb; + + do { + struct sk_buff *nskb = skb->next; + + skb->next = nskb->next; + nskb->next = NULL; + kfree_skb(nskb); + } while (skb->next); + + cb = DEV_GSO_CB(skb); + if (cb->destructor) + cb->destructor(skb); +} + +/** + * dev_gso_segment - Perform emulated hardware segmentation on skb. + * @skb: buffer to segment + * + * This function segments the given skb and stores the list of segments + * in skb->next. 
+ */ +static int dev_gso_segment(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct sk_buff *segs; + + segs = skb_gso_segment(skb, dev->features & NETIF_F_SG && + !illegal_highdma(dev, skb)); + if (unlikely(IS_ERR(segs))) + return PTR_ERR(segs); + + skb->next = segs; + DEV_GSO_CB(skb)->destructor = skb->destructor; + skb->destructor = dev_gso_skb_destructor; + + return 0; +} + +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + if (likely(!skb->next)) { + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); + + if (!netif_needs_gso(dev, skb)) + return dev->hard_start_xmit(skb, dev); + + if (unlikely(dev_gso_segment(skb))) + goto out_kfree_skb; + } + + do { + struct sk_buff *nskb = skb->next; + int rc; + + skb->next = nskb->next; + nskb->next = NULL; + rc = dev->hard_start_xmit(nskb, dev); + if (unlikely(rc)) { + skb->next = nskb; + return rc; + } + } while (skb->next); + + skb->destructor = DEV_GSO_CB(skb)->destructor; + +out_kfree_skb: + kfree_skb(skb); + return 0; +} + #define HARD_TX_LOCK(dev, cpu) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ netif_tx_lock(dev); \ @@ -1266,6 +1381,10 @@ int dev_queue_xmit(struct sk_buff *skb) struct Qdisc *q; int rc = -ENOMEM; + /* GSO will handle the following emulations directly. */ + if (netif_needs_gso(dev, skb)) + goto gso; + if (skb_shinfo(skb)->frag_list && !(dev->features & NETIF_F_FRAGLIST) && __skb_linearize(skb)) @@ -1290,6 +1409,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (skb_checksum_help(skb, 0)) goto out_kfree_skb; +gso: spin_lock_prefetch(&dev->queue_lock); /* Disable soft irqs for various locks below. Also @@ -1346,11 +1466,8 @@ int dev_queue_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, cpu); if (!netif_queue_stopped(dev)) { - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); - rc = 0; - if (!dev->hard_start_xmit(skb, dev)) { + if (!dev_hard_start_xmit(skb, dev)) { HARD_TX_UNLOCK(dev); goto out; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7aad012..74d4a1d 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -96,8 +96,11 @@ static inline int qdisc_restart(struct net_device *dev) struct sk_buff *skb; /* Dequeue packet */ - if ((skb = q->dequeue(q)) != NULL) { + if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) { unsigned nolock = (dev->features & NETIF_F_LLTX); + + dev->gso_skb = NULL; + /* * When the driver has LLTX set it does its own locking * in start_xmit. No need to add additional overhead by @@ -134,10 +137,8 @@ static inline int qdisc_restart(struct net_device *dev) if (!netif_queue_stopped(dev)) { int ret; - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); - ret = dev->hard_start_xmit(skb, dev); + ret = dev_hard_start_xmit(skb, dev); if (ret == NETDEV_TX_OK) { if (!nolock) { netif_tx_unlock(dev); @@ -171,7 +172,10 @@ static inline int qdisc_restart(struct net_device *dev) */ requeue: - q->ops->requeue(skb, q); + if (skb->next) + dev->gso_skb = skb; + else + q->ops->requeue(skb, q); netif_schedule(dev); return 1; } @@ -593,6 +597,11 @@ void dev_deactivate(struct net_device *dev) /* Wait for outstanding qdisc_run calls. */ while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state)) yield(); + + if (dev->gso_skb) { + kfree_skb(dev->gso_skb); + dev->gso_skb = NULL; + } } void dev_init_scheduler(struct net_device *dev) -- cgit v0.10.2 From f4c50d990dcf11a296679dc05de3873783236711 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 03:02:40 -0700 Subject: [NET]: Add software TSOv4 This patch adds the GSO implementation for IPv4 TCP. 
From f4c50d990dcf11a296679dc05de3873783236711 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Thu, 22 Jun 2006 03:02:40 -0700
Subject: [NET]: Add software TSOv4

This patch adds the GSO implementation for IPv4 TCP.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 97b0d2d..a45bba9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1297,6 +1297,7 @@ extern void	       skb_split(struct sk_buff *skb,
 				 struct sk_buff *skb1, const u32 len);

 extern void	       skb_release_data(struct sk_buff *skb);
+extern struct sk_buff *skb_segment(struct sk_buff *skb, int sg);

 static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
 				       int len, void *buffer)
diff --git a/include/net/protocol.h b/include/net/protocol.h
index bcaee39..3b6dc15 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -36,6 +36,7 @@
 struct net_protocol {
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
+	struct sk_buff	       *(*gso_segment)(struct sk_buff *skb, int sg);
 	int			no_policy;
 };
diff --git a/include/net/tcp.h b/include/net/tcp.h
index b197a9e..ca3d38d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1086,6 +1086,8 @@ extern struct request_sock_ops tcp_request_sock_ops;

 extern int tcp_v4_destroy_sock(struct sock *sk);

+extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg);
+
 #ifdef CONFIG_PROC_FS
 extern int tcp4_proc_init(void);
 extern void tcp4_proc_exit(void);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 368d985..8e5044b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1842,6 +1842,132 @@ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)

 EXPORT_SYMBOL_GPL(skb_pull_rcsum);

+/**
+ *	skb_segment - Perform protocol segmentation on skb.
+ *	@skb: buffer to segment
+ *	@sg: whether scatter-gather can be used for generated segments
+ *
+ *	This function performs segmentation on the given skb.  It returns
+ *	a pointer to the first of the generated segments, chained through
+ *	->next, or an ERR_PTR() if an error is encountered.
+ */
+struct sk_buff *skb_segment(struct sk_buff *skb, int sg)
+{
+	struct sk_buff *segs = NULL;
+	struct sk_buff *tail = NULL;
+	unsigned int mss = skb_shinfo(skb)->gso_size;
+	unsigned int doffset = skb->data - skb->mac.raw;
+	unsigned int offset = doffset;
+	unsigned int headroom;
+	unsigned int len;
+	int nfrags = skb_shinfo(skb)->nr_frags;
+	int err = -ENOMEM;
+	int i = 0;
+	int pos;
+
+	__skb_push(skb, doffset);
+	headroom = skb_headroom(skb);
+	pos = skb_headlen(skb);
+
+	do {
+		struct sk_buff *nskb;
+		skb_frag_t *frag;
+		int hsize, nsize;
+		int k;
+		int size;
+
+		len = skb->len - offset;
+		if (len > mss)
+			len = mss;
+
+		hsize = skb_headlen(skb) - offset;
+		if (hsize < 0)
+			hsize = 0;
+		nsize = hsize + doffset;
+		if (nsize > len + doffset || !sg)
+			nsize = len + doffset;
+
+		nskb = alloc_skb(nsize + headroom, GFP_ATOMIC);
+		if (unlikely(!nskb))
+			goto err;
+
+		if (segs)
+			tail->next = nskb;
+		else
+			segs = nskb;
+		tail = nskb;
+
+		nskb->dev = skb->dev;
+		nskb->priority = skb->priority;
+		nskb->protocol = skb->protocol;
+		nskb->dst = dst_clone(skb->dst);
+		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+		nskb->pkt_type = skb->pkt_type;
+		nskb->mac_len = skb->mac_len;
+
+		skb_reserve(nskb, headroom);
+		nskb->mac.raw = nskb->data;
+		nskb->nh.raw = nskb->data + skb->mac_len;
+		nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw);
+		memcpy(skb_put(nskb, doffset), skb->data, doffset);
+
+		if (!sg) {
+			nskb->csum = skb_copy_and_csum_bits(skb, offset,
+							    skb_put(nskb, len),
+							    len, 0);
+			continue;
+		}
+
+		frag = skb_shinfo(nskb)->frags;
+		k = 0;
+
+		nskb->ip_summed = CHECKSUM_HW;
+		nskb->csum = skb->csum;
+		memcpy(skb_put(nskb, hsize), skb->data + offset, hsize);
+
+		while (pos < offset + len) {
+			BUG_ON(i >= nfrags);
+
+			*frag = skb_shinfo(skb)->frags[i];
+			get_page(frag->page);
+			size = frag->size;
+
+			if (pos < offset) {
+				frag->page_offset += offset - pos;
+				frag->size -= offset - pos;
+			}
+
+			k++;
+
+			if (pos + size <= offset + len) {
+				i++;
+				pos += size;
+			} else {
+				frag->size -= pos + size - (offset + len);
+				break;
+			}
+
+			frag++;
+		}
+
+		skb_shinfo(nskb)->nr_frags = k;
+		nskb->data_len = len - hsize;
+		nskb->len += nskb->data_len;
+		nskb->truesize += nskb->data_len;
+	} while ((offset += len) < skb->len);
+
+	return segs;
+
+err:
+	while ((skb = segs)) {
+		segs = skb->next;
+		kfree_skb(skb);
+	}
+	return ERR_PTR(err);
+}
+
+EXPORT_SYMBOL_GPL(skb_segment);
+
 void __init skb_init(void)
 {
 	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0a27745..461216b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -68,6 +68,7 @@
  */

 #include
+#include
 #include
 #include
 #include
@@ -1096,6 +1097,54 @@ int inet_sk_rebuild_header(struct sock *sk)

 EXPORT_SYMBOL(inet_sk_rebuild_header);

+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int sg)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct iphdr *iph;
+	struct net_protocol *ops;
+	int proto;
+	int ihl;
+	int id;
+
+	if (!pskb_may_pull(skb, sizeof(*iph)))
+		goto out;
+
+	iph = skb->nh.iph;
+	ihl = iph->ihl * 4;
+	if (ihl < sizeof(*iph))
+		goto out;
+
+	if (!pskb_may_pull(skb, ihl))
+		goto out;
+
+	skb->h.raw = __skb_pull(skb, ihl);
+	iph = skb->nh.iph;
+	id = ntohs(iph->id);
+	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	segs = ERR_PTR(-EPROTONOSUPPORT);
+
+	rcu_read_lock();
+	ops = rcu_dereference(inet_protos[proto]);
+	if (ops && ops->gso_segment)
+		segs = ops->gso_segment(skb, sg);
+	rcu_read_unlock();
+
+	if (IS_ERR(segs))
+		goto out;
+
+	skb = segs;
+	do {
+		iph = skb->nh.iph;
+		iph->id = htons(id++);
+		iph->tot_len = htons(skb->len - skb->mac_len);
+		iph->check = 0;
+		iph->check = ip_fast_csum(skb->nh.raw, iph->ihl);
+	} while ((skb = skb->next));
+
+out:
+	return segs;
+}
+
 #ifdef CONFIG_IP_MULTICAST
 static struct net_protocol igmp_protocol = {
 	.handler =	igmp_rcv,
@@ -1105,6 +1154,7 @@
 static struct net_protocol tcp_protocol = {
 	.handler =	tcp_v4_rcv,
 	.err_handler =	tcp_v4_err,
+	.gso_segment =	tcp_tso_segment,
 	.no_policy =	1,
 };
@@ -1150,6 +1200,7 @@ static int ipv4_proc_init(void);
 static struct packet_type ip_packet_type = {
 	.type = __constant_htons(ETH_P_IP),
 	.func = ip_rcv,
+	.gso_segment = inet_gso_segment,
 };

 static int __init inet_init(void)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 062dd1a..0e029c4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -258,6 +258,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -2144,6 +2145,67 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
 EXPORT_SYMBOL(compat_tcp_getsockopt);
 #endif

+struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct tcphdr *th;
+	unsigned thlen;
+	unsigned int seq;
+	unsigned int delta;
+	unsigned int oldlen;
+	unsigned int len;
+
+	if (!pskb_may_pull(skb, sizeof(*th)))
+		goto out;
+
+	th = skb->h.th;
+	thlen = th->doff * 4;
+	if (thlen < sizeof(*th))
+		goto out;
+
+	if (!pskb_may_pull(skb, thlen))
+		goto out;
+
+	oldlen = ~htonl(skb->len);
+	__skb_pull(skb, thlen);
+
+	segs = skb_segment(skb, sg);
+	if (IS_ERR(segs))
+		goto out;
+
+	len = skb_shinfo(skb)->gso_size;
+	delta = csum_add(oldlen, htonl(thlen + len));
+
+	skb = segs;
+	th = skb->h.th;
+	seq = ntohl(th->seq);
+
+	do {
+		th->fin = th->psh = 0;
+
+		if (skb->ip_summed == CHECKSUM_NONE) {
+			th->check = csum_fold(csum_partial(
+				skb->h.raw, thlen, csum_add(skb->csum, delta)));
+		}
+
+		seq += len;
+		skb = skb->next;
+		th = skb->h.th;
+
+		th->seq = htonl(seq);
+		th->cwr = 0;
+	} while (skb->next);
+
+	if (skb->ip_summed == CHECKSUM_NONE) {
+		delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw));
+		th->check = csum_fold(csum_partial(
+			skb->h.raw, thlen, csum_add(skb->csum, delta)));
+	}
+
+out:
+	return segs;
+}
+
 extern void __skb_cb_too_small_for_tcp(int, int);
 extern struct tcp_congestion_ops tcp_reno;
--
cgit v0.10.2
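A note on the checksum arithmetic in tcp_tso_segment() above, since it is easy to misread: at this point skb->csum holds the TCP pseudo-header sum computed over the *old* total length, so each generated segment only needs the length difference folded in using one's-complement arithmetic. A standalone sketch of the same fix-up (the helper name is invented; written against the csum_add() of this kernel era, with htonl() coming from the usual kernel headers):

    #include <net/checksum.h>   /* csum_add() */

    /* ~old_length plus the new per-segment length (TCP header + mss)
     * yields the delta that tcp_tso_segment() adds to each segment's
     * skb->csum before folding, without walking the payload again.
     */
    static unsigned int tso_csum_delta(unsigned int old_len,
                                       unsigned int thlen,
                                       unsigned int mss)
    {
        unsigned int oldlen = ~htonl(old_len);

        return csum_add(oldlen, htonl(thlen + mss));
    }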
From 37c3185a02d4b85fbe134bf5204535405dd2c957 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Thu, 22 Jun 2006 03:07:29 -0700
Subject: [NET]: Added GSO toggle

This patch adds a generic segmentation offload toggle that can be
turned on/off for each net device. For now only TCPv4 is supported.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index cf2abec..c6310ae 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -411,6 +411,8 @@ struct ethtool_ops {
 #define ETHTOOL_GPERMADDR	0x00000020 /* Get permanent hardware address */
 #define ETHTOOL_GUFO		0x00000021 /* Get UFO enable (ethtool_value) */
 #define ETHTOOL_SUFO		0x00000022 /* Set UFO enable (ethtool_value) */
+#define ETHTOOL_GGSO		0x00000023 /* Get GSO enable (ethtool_value) */
+#define ETHTOOL_SGSO		0x00000024 /* Set GSO enable (ethtool_value) */

 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b4eae18..bc747e5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -308,6 +308,7 @@ struct net_device
 #define NETIF_F_HW_VLAN_RX	256	/* Receive VLAN hw acceleration */
 #define NETIF_F_HW_VLAN_FILTER	512	/* Receive filtering on VLAN */
 #define NETIF_F_VLAN_CHALLENGED	1024	/* Device cannot handle VLAN packets */
+#define NETIF_F_GSO		2048	/* Enable software GSO. */
 #define NETIF_F_LLTX		4096	/* LockLess TX */

 /* Segmentation offload features */
diff --git a/include/net/sock.h b/include/net/sock.h
index d10dfec..a897f05 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1030,9 +1030,13 @@ static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
 	__sk_dst_set(sk, dst);
 	sk->sk_route_caps = dst->dev->features;
+	if (sk->sk_route_caps & NETIF_F_GSO)
+		sk->sk_route_caps |= NETIF_F_TSO;
 	if (sk->sk_route_caps & NETIF_F_TSO) {
 		if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len)
 			sk->sk_route_caps &= ~NETIF_F_TSO;
+		else
+			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
 	}
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index fdec773..07956ec 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -376,15 +376,20 @@ void br_features_recompute(struct net_bridge *br)
 	features = br->feature_mask & ~NETIF_F_ALL_CSUM;

 	list_for_each_entry(p, &br->port_list, list) {
-		if (checksum & NETIF_F_NO_CSUM &&
-		    !(p->dev->features & NETIF_F_NO_CSUM))
+		unsigned long feature = p->dev->features;
+
+		if (checksum & NETIF_F_NO_CSUM && !(feature & NETIF_F_NO_CSUM))
 			checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
-		if (checksum & NETIF_F_HW_CSUM &&
-		    !(p->dev->features & NETIF_F_HW_CSUM))
+		if (checksum & NETIF_F_HW_CSUM && !(feature & NETIF_F_HW_CSUM))
 			checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
-		if (!(p->dev->features & NETIF_F_IP_CSUM))
+		if (!(feature & NETIF_F_IP_CSUM))
 			checksum = 0;
-		features &= p->dev->features;
+
+		if (feature & NETIF_F_GSO)
+			feature |= NETIF_F_TSO;
+		feature |= NETIF_F_GSO;
+
+		features &= feature;
 	}

 	br->dev->features = features | checksum | NETIF_F_LLTX;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 33ce7ed..27ce168 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -614,6 +614,29 @@ static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr)
 	return dev->ethtool_ops->set_ufo(dev, edata.data);
 }

+static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata = { ETHTOOL_GGSO };
+
+	edata.data = dev->features & NETIF_F_GSO;
+	if (copy_to_user(useraddr, &edata, sizeof(edata)))
+		return -EFAULT;
+	return 0;
+}
+
+static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)))
+		return -EFAULT;
+	if (edata.data)
+		dev->features |= NETIF_F_GSO;
+	else
+		dev->features &= ~NETIF_F_GSO;
+	return 0;
+}
+
 static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
 {
 	struct ethtool_test test;
@@ -905,6 +928,12 @@ int dev_ethtool(struct ifreq *ifr)
 	case ETHTOOL_SUFO:
 		rc = ethtool_set_ufo(dev, useraddr);
 		break;
+	case ETHTOOL_GGSO:
+		rc = ethtool_get_gso(dev, useraddr);
+		break;
+	case ETHTOOL_SGSO:
+		rc = ethtool_set_gso(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
--
cgit v0.10.2
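The two new ioctls are an ordinary ethtool_value get/set pair, so user space needs no library support to drive them. A minimal user-space sketch (the interface name is illustrative; later ethtool binaries expose the same operation as a command-line toggle):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <unistd.h>
    #include <linux/ethtool.h>
    #include <linux/if.h>
    #include <linux/sockios.h>

    /* Toggle GSO on an interface via the new ETHTOOL_SGSO command. */
    static int set_gso(const char *ifname, int on)
    {
        struct ethtool_value eval = { .cmd = ETHTOOL_SGSO, .data = on };
        struct ifreq ifr;
        int rc, fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
            return -1;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
        ifr.ifr_data = (char *)&eval;

        rc = ioctl(fd, SIOCETHTOOL, &ifr);
        close(fd);
        return rc;
    }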
From 09b8f7a93efd4b2c4ef391e2fbf076f28c6d36d6 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Thu, 22 Jun 2006 03:08:03 -0700
Subject: [IPSEC]: Handle GSO packets

This patch segments GSO packets received by the IPsec stack. This can
happen when a NIC driver injects GSO packets into the stack which are
then forwarded to another host. The primary application of this is
going to be Xen, where its backend driver may inject GSO packets into
dom0.

Of course this can also be used by other virtualisation schemes such
as VMware or UML, since the tap device could be modified to inject GSO
packets received through splice.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller

diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index ac9d91d..193363e 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -9,6 +9,8 @@
  */

 #include
+#include
+#include
 #include
 #include
 #include
@@ -97,16 +99,10 @@ error_nolock:
 	goto out_exit;
 }

-static int xfrm4_output_finish(struct sk_buff *skb)
+static int xfrm4_output_finish2(struct sk_buff *skb)
 {
 	int err;

-#ifdef CONFIG_NETFILTER
-	if (!skb->dst->xfrm) {
-		IPCB(skb)->flags |= IPSKB_REROUTED;
-		return dst_output(skb);
-	}
-#endif
-
 	while (likely((err = xfrm4_output_one(skb)) == 0)) {
 		nf_reset(skb);
@@ -119,7 +115,7 @@ static int xfrm4_output_finish(struct sk_buff *skb)
 			return dst_output(skb);

 		err = nf_hook(PF_INET, NF_IP_POST_ROUTING, &skb, NULL,
-			      skb->dst->dev, xfrm4_output_finish);
+			      skb->dst->dev, xfrm4_output_finish2);
 		if (unlikely(err != 1))
 			break;
 	}
@@ -127,6 +123,48 @@
 	return err;
 }

+static int xfrm4_output_finish(struct sk_buff *skb)
+{
+	struct sk_buff *segs;
+
+#ifdef CONFIG_NETFILTER
+	if (!skb->dst->xfrm) {
+		IPCB(skb)->flags |= IPSKB_REROUTED;
+		return dst_output(skb);
+	}
+#endif
+
+	if (!skb_shinfo(skb)->gso_size)
+		return xfrm4_output_finish2(skb);
+
+	skb->protocol = htons(ETH_P_IP);
+	segs = skb_gso_segment(skb, 0);
+	kfree_skb(skb);
+	if (unlikely(IS_ERR(segs)))
+		return PTR_ERR(segs);
+
+	do {
+		struct sk_buff *nskb = segs->next;
+		int err;
+
+		segs->next = NULL;
+		err = xfrm4_output_finish2(segs);
+
+		if (unlikely(err)) {
+			while ((segs = nskb)) {
+				nskb = segs->next;
+				segs->next = NULL;
+				kfree_skb(segs);
+			}
+			return err;
+		}
+
+		segs = nskb;
+	} while (segs);
+
+	return 0;
+}
+
 int xfrm4_output(struct sk_buff *skb)
 {
 	return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev,
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 16e8425..48fccb1 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -94,7 +94,7 @@ error_nolock:
 	goto out_exit;
 }

-static int xfrm6_output_finish(struct sk_buff *skb)
+static int xfrm6_output_finish2(struct sk_buff *skb)
 {
 	int err;

@@ -110,7 +110,7 @@ static int xfrm6_output_finish(struct sk_buff *skb)
 			return dst_output(skb);

 		err = nf_hook(PF_INET6, NF_IP6_POST_ROUTING, &skb, NULL,
-			      skb->dst->dev, xfrm6_output_finish);
+			      skb->dst->dev, xfrm6_output_finish2);
 		if (unlikely(err != 1))
 			break;
 	}
@@ -118,6 +118,41 @@ static int xfrm6_output_finish(struct sk_buff *skb)
 	return err;
 }

+static int xfrm6_output_finish(struct sk_buff *skb)
+{
+	struct sk_buff *segs;
+
+	if (!skb_shinfo(skb)->gso_size)
+		return xfrm6_output_finish2(skb);
+
+	skb->protocol = htons(ETH_P_IP);
+	segs = skb_gso_segment(skb, 0);
+	kfree_skb(skb);
+	if (unlikely(IS_ERR(segs)))
+		return PTR_ERR(segs);
+
+	do {
+		struct sk_buff *nskb = segs->next;
+		int err;
+
+		segs->next = NULL;
+		err = xfrm6_output_finish2(segs);
+
+		if (unlikely(err)) {
+			while ((segs = nskb)) {
+				nskb = segs->next;
+				segs->next = NULL;
+				kfree_skb(segs);
+			}
+			return err;
+		}
+
+		segs = nskb;
+	} while (segs);
+
+	return 0;
+}
+
 int xfrm6_output(struct sk_buff *skb)
 {
 	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dst->dev,
--
cgit v0.10.2
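The IPv4 and IPv6 hunks above are the same pattern written twice: if a GSO packet reaches a layer that must operate on wire-sized packets, segment it first, then run the original output path once per segment, freeing whatever remains on error. Factored out as a sketch (the helper is hypothetical; the two xfrm variants inline this logic rather than sharing it):

    #include <linux/err.h>
    #include <linux/netdevice.h>
    #include <linux/skbuff.h>

    /* Generic form of the xfrm GSO fallback: segment, then invoke the
     * real per-packet output function on each wire-sized segment.
     */
    static int gso_output_sketch(struct sk_buff *skb,
                                 int (*output)(struct sk_buff *))
    {
        struct sk_buff *segs;

        if (!skb_shinfo(skb)->gso_size)
            return output(skb);

        segs = skb_gso_segment(skb, 0);
        kfree_skb(skb);     /* the un-segmented original is done with */
        if (IS_ERR(segs))
            return PTR_ERR(segs);

        while (segs) {
            struct sk_buff *nskb = segs->next;
            int err;

            segs->next = NULL;
            err = output(segs);
            if (err) {
                /* free the segments the output path never saw */
                while ((segs = nskb) != NULL) {
                    nskb = segs->next;
                    segs->next = NULL;
                    kfree_skb(segs);
                }
                return err;
            }
            segs = nskb;
        }
        return 0;
    }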
From c8a553ad7f0bf943047943a758cf07017819cb3c Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Thu, 22 Jun 2006 14:28:09 -0700
Subject: [TCP]: Move inclusion of <linux/dmaengine.h> to correct place in <linux/tcp.h>

The new <linux/dmaengine.h> header shouldn't be included from the
!__KERNEL__ portion of tcp.h.

Signed-off-by: David Woodhouse
Signed-off-by: David S. Miller

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 420a689..8ebf497 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -18,7 +18,6 @@
 #define _LINUX_TCP_H

 #include
-#include <linux/dmaengine.h>
 #include

 struct tcphdr {
@@ -161,6 +160,7 @@ struct tcp_info
 #ifdef __KERNEL__

 #include
+#include <linux/dmaengine.h>
 #include
 #include
 #include
--
cgit v0.10.2
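The fix above is purely structural: headers exported to user space must keep kernel-only includes inside the __KERNEL__ section. A schematic of the layout the patch restores (contents elided; the moved header is <linux/dmaengine.h> as reconstructed in the subject line):

    /* include/linux/tcp.h -- schematic layout only */
    #ifndef _LINUX_TCP_H
    #define _LINUX_TCP_H

    #include <linux/types.h>            /* safe for user space */

    struct tcphdr {
        /* ... user-visible ABI ... */
    };

    #ifdef __KERNEL__
    #include <linux/dmaengine.h>        /* kernel-only: what the patch moves */
    /* ... kernel-internal declarations ... */
    #endif

    #endif /* _LINUX_TCP_H */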
From f4b8ea7849544114e9d3d682df4d400180854677 Mon Sep 17 00:00:00 2001
From: Randy Dunlap
Date: Thu, 22 Jun 2006 16:00:11 -0700
Subject: [NET]: fix net-core kernel-doc

Warning(/var/linsrc/linux-2617-g4//include/linux/skbuff.h:304): No description found for parameter 'dma_cookie'
Warning(/var/linsrc/linux-2617-g4//include/net/sock.h:1274): No description found for parameter 'copied_early'
Warning(/var/linsrc/linux-2617-g4//net/core/dev.c:3309): No description found for parameter 'chan'
Warning(/var/linsrc/linux-2617-g4//net/core/dev.c:3309): No description found for parameter 'event'

Signed-off-by: Randy Dunlap
Signed-off-by: David S. Miller

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a45bba9..16eef03 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -215,6 +215,8 @@ enum {
 *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
 *	@tc_index: Traffic control index
 *	@tc_verd: traffic control verdict
+*	@dma_cookie: a cookie to one of several possible DMA operations
+*		done by skb DMA functions
 *	@secmark: security marking
 */
diff --git a/include/net/sock.h b/include/net/sock.h
index a897f05..2d8d6ad 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1269,6 +1269,7 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
 * sk_eat_skb - Release a skb if it is no longer needed
 * @sk: socket to eat this skb from
 * @skb: socket buffer to eat
+ * @copied_early: flag indicating whether DMA operations copied this data early
 *
 * This routine must be called with interrupts disabled or with the socket
 * locked so that the sk_buff queue operation is ok.
diff --git a/net/core/dev.c b/net/core/dev.c
index d293e0f..9b8f0f2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3418,8 +3418,8 @@ static void net_dma_rebalance(void)
 /**
 * netdev_dma_event - event callback for the net_dma_client
 * @client: should always be net_dma_client
- * @chan:
- * @event:
+ * @chan: DMA channel for the event
+ * @event: event type
 */
 static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
 	enum dma_event event)
--
cgit v0.10.2

From ca6bb5d7ab22ac79f608fe6cbc6b12de6a5a19f0 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Thu, 22 Jun 2006 16:07:52 -0700
Subject: [NET]: Require CAP_NET_ADMIN to create tuntap devices.

The tuntap driver allows an admin to create persistent devices and
assign ownership of them to individual users. Unfortunately, relaxing
the permissions on the /dev/net/tun device node so that they can
actually use those devices will _also_ allow those users to create
arbitrary new devices of their own. This patch corrects that, and
adjusts the recommended permissions for the device node accordingly.

Signed-off-by: David Woodhouse
Signed-off-by: David S. Miller

diff --git a/Documentation/networking/tuntap.txt b/Documentation/networking/tuntap.txt
index 76750fb..839cbb7 100644
--- a/Documentation/networking/tuntap.txt
+++ b/Documentation/networking/tuntap.txt
@@ -39,10 +39,13 @@ Copyright (C) 1999-2000 Maxim Krasnyansky
     mknod /dev/net/tun c 10 200

   Set permissions:
-     e.g. chmod 0700 /dev/net/tun
-     if you want the device only accessible by root. Giving regular users the
-     right to assign network devices is NOT a good idea. Users could assign
-     bogus network interfaces to trick firewalls or administrators.
+     e.g. chmod 0666 /dev/net/tun
+     There's no harm in allowing the device to be accessible by non-root users,
+     since CAP_NET_ADMIN is required for creating network devices or for
+     connecting to network devices which aren't owned by the user in question.
+     If you want to create persistent devices and give ownership of them to
+     unprivileged users, then you need the /dev/net/tun device to be usable by
+     those users.

 Driver module autoloading
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a1ed2d9..6c62d5c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -490,6 +490,9 @@ static int tun_set_iff(struct file *file, struct ifreq *ifr)

 		err = -EINVAL;

+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
 		/* Set dev type */
 		if (ifr->ifr_flags & IFF_TUN) {
 			/* TUN device */
--
cgit v0.10.2
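To see what the new capable(CAP_NET_ADMIN) check protects, consider the workflow the commit message describes: a privileged helper creates a persistent device once and assigns it to a user, who may then attach to it through a world-accessible /dev/net/tun without gaining the right to create further devices. A minimal sketch of such a helper (device name and owner are illustrative; error handling abbreviated):

    #include <fcntl.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/types.h>
    #include <unistd.h>
    #include <linux/if.h>
    #include <linux/if_tun.h>

    /* Run with CAP_NET_ADMIN: create a persistent tap device and hand
     * ownership to an unprivileged uid.
     */
    static int make_persistent_tap(const char *name, uid_t owner)
    {
        struct ifreq ifr;
        int fd = open("/dev/net/tun", O_RDWR);

        if (fd < 0)
            return -1;

        memset(&ifr, 0, sizeof(ifr));
        ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
        strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);

        if (ioctl(fd, TUNSETIFF, &ifr) < 0 ||      /* now needs CAP_NET_ADMIN */
            ioctl(fd, TUNSETOWNER, owner) < 0 ||
            ioctl(fd, TUNSETPERSIST, 1) < 0) {
            close(fd);
            return -1;
        }

        close(fd);
        return 0;
    }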