diff options
62 files changed, 1390 insertions, 539 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 47b6c79..e20c17a 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -551,6 +551,20 @@ tcp_thin_dupack - BOOLEAN Documentation/networking/tcp-thin.txt Default: 0 +tcp_limit_output_bytes - INTEGER + Controls TCP Small Queue limit per tcp socket. + TCP bulk sender tends to increase packets in flight until it + gets losses notifications. With SNDBUF autotuning, this can + result in a large amount of packets queued in qdisc/device + on the local machine, hurting latency of other flows, for + typical pfifo_fast qdiscs. + tcp_limit_output_bytes limits the number of bytes on qdisc + or device to reduce artificial RTT/cwnd and reduce bufferbloat. + Note: For GSO/TSO enabled flows, we try to have at least two + packets in flight. Reducing tcp_limit_output_bytes might also + reduce the size of individual GSO packet (64KB being the max) + Default: 131072 + UDP variables: udp_mem - vector of 3 INTEGERs: min, pressure, max diff --git a/arch/m68k/include/asm/mcfne.h b/arch/m68k/include/asm/mcf8390.h index bf638be..a72a208 100644 --- a/arch/m68k/include/asm/mcfne.h +++ b/arch/m68k/include/asm/mcf8390.h @@ -1,7 +1,7 @@ /****************************************************************************/ /* - * mcfne.h -- NE2000 in ColdFire eval boards. + * mcf8390.h -- NS8390 support for ColdFire eval boards. * * (C) Copyright 1999-2000, Greg Ungerer (gerg@snapgear.com) * (C) Copyright 2000, Lineo (www.lineo.com) @@ -14,8 +14,8 @@ */ /****************************************************************************/ -#ifndef mcfne_h -#define mcfne_h +#ifndef mcf8390_h +#define mcf8390_h /****************************************************************************/ @@ -37,6 +37,7 @@ #if defined(CONFIG_ARN5206) #define NE2000_ADDR 0x40000300 #define NE2000_ODDOFFSET 0x00010000 +#define NE2000_ADDRSIZE 0x00020000 #define NE2000_IRQ_VECTOR 0xf0 #define NE2000_IRQ_PRIORITY 2 #define NE2000_IRQ_LEVEL 4 @@ -46,6 +47,7 @@ #if defined(CONFIG_M5206eC3) #define NE2000_ADDR 0x40000300 #define NE2000_ODDOFFSET 0x00010000 +#define NE2000_ADDRSIZE 0x00020000 #define NE2000_IRQ_VECTOR 0x1c #define NE2000_IRQ_PRIORITY 2 #define NE2000_IRQ_LEVEL 4 @@ -54,6 +56,7 @@ #if defined(CONFIG_M5206e) && defined(CONFIG_NETtel) #define NE2000_ADDR 0x30000300 +#define NE2000_ADDRSIZE 0x00001000 #define NE2000_IRQ_VECTOR 25 #define NE2000_IRQ_PRIORITY 1 #define NE2000_IRQ_LEVEL 3 @@ -63,6 +66,7 @@ #if defined(CONFIG_M5307C3) #define NE2000_ADDR 0x40000300 #define NE2000_ODDOFFSET 0x00010000 +#define NE2000_ADDRSIZE 0x00020000 #define NE2000_IRQ_VECTOR 0x1b #define NE2000_BYTE volatile unsigned short #endif @@ -70,6 +74,7 @@ #if defined(CONFIG_M5272) && defined(CONFIG_NETtel) #define NE2000_ADDR 0x30600300 #define NE2000_ODDOFFSET 0x00008000 +#define NE2000_ADDRSIZE 0x00010000 #define NE2000_IRQ_VECTOR 67 #undef BSWAP #define BSWAP(w) (w) @@ -82,6 +87,7 @@ #define NE2000_ADDR0 0x30600300 #define NE2000_ADDR1 0x30800300 #define NE2000_ODDOFFSET 0x00008000 +#define NE2000_ADDRSIZE 0x00010000 #define NE2000_IRQ_VECTOR0 27 #define NE2000_IRQ_VECTOR1 29 #undef BSWAP @@ -94,6 +100,7 @@ #if defined(CONFIG_M5307) && defined(CONFIG_SECUREEDGEMP3) #define NE2000_ADDR 0x30600300 #define NE2000_ODDOFFSET 0x00008000 +#define NE2000_ADDRSIZE 0x00010000 #define NE2000_IRQ_VECTOR 27 #undef BSWAP #define BSWAP(w) (w) @@ -105,6 +112,7 @@ #if defined(CONFIG_ARN5307) #define NE2000_ADDR 0xfe600300 #define NE2000_ODDOFFSET 0x00010000 +#define NE2000_ADDRSIZE 0x00020000 #define NE2000_IRQ_VECTOR 0x1b #define NE2000_IRQ_PRIORITY 2 #define NE2000_IRQ_LEVEL 3 @@ -114,129 +122,10 @@ #if defined(CONFIG_M5407C3) #define NE2000_ADDR 0x40000300 #define NE2000_ODDOFFSET 0x00010000 +#define NE2000_ADDRSIZE 0x00020000 #define NE2000_IRQ_VECTOR 0x1b #define NE2000_BYTE volatile unsigned short #endif /****************************************************************************/ - -/* - * Side-band address space for odd address requires re-mapping - * many of the standard ISA access functions. - */ -#ifdef NE2000_ODDOFFSET - -#undef outb -#undef outb_p -#undef inb -#undef inb_p -#undef outsb -#undef outsw -#undef insb -#undef insw - -#define outb ne2000_outb -#define inb ne2000_inb -#define outb_p ne2000_outb -#define inb_p ne2000_inb -#define outsb ne2000_outsb -#define outsw ne2000_outsw -#define insb ne2000_insb -#define insw ne2000_insw - - -#ifndef COLDFIRE_NE2000_FUNCS - -void ne2000_outb(unsigned int val, unsigned int addr); -int ne2000_inb(unsigned int addr); -void ne2000_insb(unsigned int addr, void *vbuf, int unsigned long len); -void ne2000_insw(unsigned int addr, void *vbuf, unsigned long len); -void ne2000_outsb(unsigned int addr, void *vbuf, unsigned long len); -void ne2000_outsw(unsigned int addr, void *vbuf, unsigned long len); - -#else - -/* - * This macro converts a conventional register address into the - * real memory pointer of the mapped NE2000 device. - * On most NE2000 implementations on ColdFire boards the chip is - * mapped in kinda funny, due to its ISA heritage. - */ -#define NE2000_PTR(addr) ((addr&0x1)?(NE2000_ODDOFFSET+addr-1):(addr)) -#define NE2000_DATA_PTR(addr) (addr) - - -void ne2000_outb(unsigned int val, unsigned int addr) -{ - NE2000_BYTE *rp; - - rp = (NE2000_BYTE *) NE2000_PTR(addr); - *rp = RSWAP(val); -} - -int ne2000_inb(unsigned int addr) -{ - NE2000_BYTE *rp, val; - - rp = (NE2000_BYTE *) NE2000_PTR(addr); - val = *rp; - return((int) ((NE2000_BYTE) RSWAP(val))); -} - -void ne2000_insb(unsigned int addr, void *vbuf, int unsigned long len) -{ - NE2000_BYTE *rp, val; - unsigned char *buf; - - buf = (unsigned char *) vbuf; - rp = (NE2000_BYTE *) NE2000_DATA_PTR(addr); - for (; (len > 0); len--) { - val = *rp; - *buf++ = RSWAP(val); - } -} - -void ne2000_insw(unsigned int addr, void *vbuf, unsigned long len) -{ - volatile unsigned short *rp; - unsigned short w, *buf; - - buf = (unsigned short *) vbuf; - rp = (volatile unsigned short *) NE2000_DATA_PTR(addr); - for (; (len > 0); len--) { - w = *rp; - *buf++ = BSWAP(w); - } -} - -void ne2000_outsb(unsigned int addr, const void *vbuf, unsigned long len) -{ - NE2000_BYTE *rp, val; - unsigned char *buf; - - buf = (unsigned char *) vbuf; - rp = (NE2000_BYTE *) NE2000_DATA_PTR(addr); - for (; (len > 0); len--) { - val = *buf++; - *rp = RSWAP(val); - } -} - -void ne2000_outsw(unsigned int addr, const void *vbuf, unsigned long len) -{ - volatile unsigned short *rp; - unsigned short w, *buf; - - buf = (unsigned short *) vbuf; - rp = (volatile unsigned short *) NE2000_DATA_PTR(addr); - for (; (len > 0); len--) { - w = *buf++; - *rp = BSWAP(w); - } -} - -#endif /* COLDFIRE_NE2000_FUNCS */ -#endif /* NE2000_OFFOFFSET */ - -/****************************************************************************/ -#endif /* mcfne_h */ +#endif /* mcf8390_h */ diff --git a/drivers/ieee802154/at86rf230.c b/drivers/ieee802154/at86rf230.c index 902e38b..5d30940 100644 --- a/drivers/ieee802154/at86rf230.c +++ b/drivers/ieee802154/at86rf230.c @@ -543,6 +543,13 @@ at86rf230_xmit(struct ieee802154_dev *dev, struct sk_buff *skb) int rc; unsigned long flags; + spin_lock(&lp->lock); + if (lp->irq_disabled) { + spin_unlock(&lp->lock); + return -EBUSY; + } + spin_unlock(&lp->lock); + might_sleep(); rc = at86rf230_state(dev, STATE_FORCE_TX_ON); @@ -592,12 +599,8 @@ static int at86rf230_rx(struct at86rf230_local *lp) if (!skb) return -ENOMEM; - if (at86rf230_write_subreg(lp, SR_RX_PDT_DIS, 1) || - at86rf230_read_fbuf(lp, skb_put(skb, len), &len, &lqi) || - at86rf230_write_subreg(lp, SR_RX_SAFE_MODE, 1) || - at86rf230_write_subreg(lp, SR_RX_PDT_DIS, 0)) { + if (at86rf230_read_fbuf(lp, skb_put(skb, len), &len, &lqi)) goto err; - } if (len < 2) goto err; @@ -633,7 +636,6 @@ static void at86rf230_irqwork(struct work_struct *work) int rc; unsigned long flags; - spin_lock_irqsave(&lp->lock, flags); rc = at86rf230_read_subreg(lp, RG_IRQ_STATUS, 0xff, 0, &val); status |= val; @@ -643,31 +645,33 @@ static void at86rf230_irqwork(struct work_struct *work) status &= ~IRQ_TRX_UR; /* FIXME: possibly handle ???*/ if (status & IRQ_TRX_END) { + spin_lock_irqsave(&lp->lock, flags); status &= ~IRQ_TRX_END; if (lp->is_tx) { lp->is_tx = 0; + spin_unlock_irqrestore(&lp->lock, flags); complete(&lp->tx_complete); } else { + spin_unlock_irqrestore(&lp->lock, flags); at86rf230_rx(lp); } } - if (lp->irq_disabled) { - lp->irq_disabled = 0; - enable_irq(lp->spi->irq); - } + spin_lock_irqsave(&lp->lock, flags); + lp->irq_disabled = 0; spin_unlock_irqrestore(&lp->lock, flags); + + enable_irq(lp->spi->irq); } static irqreturn_t at86rf230_isr(int irq, void *data) { struct at86rf230_local *lp = data; + disable_irq_nosync(irq); + spin_lock(&lp->lock); - if (!lp->irq_disabled) { - disable_irq_nosync(irq); - lp->irq_disabled = 1; - } + lp->irq_disabled = 1; spin_unlock(&lp->lock); schedule_work(&lp->irqwork); diff --git a/drivers/net/ethernet/8390/Kconfig b/drivers/net/ethernet/8390/Kconfig index 2e53867..e1219e0 100644 --- a/drivers/net/ethernet/8390/Kconfig +++ b/drivers/net/ethernet/8390/Kconfig @@ -162,6 +162,20 @@ config MAC8390 and read the Ethernet-HOWTO, available from <http://www.tldp.org/docs.html#howto>. +config MCF8390 + tristate "ColdFire NS8390 based Ethernet support" + depends on COLDFIRE + select CRC32 + ---help--- + This driver is for Ethernet devices using an NS8390-compatible + chipset on many common ColdFire CPU based boards. Many of the older + Freescale dev boards use this, and some other common boards like + some SnapGear routers do as well. + + If you have one of these boards and want to use the network interface + on them then choose Y. To compile this driver as a module, choose M + here, the module will be called mcf8390. + config NE2000 tristate "NE2000/NE1000 support" depends on (ISA || (Q40 && m) || M32R || MACH_TX49XX) diff --git a/drivers/net/ethernet/8390/Makefile b/drivers/net/ethernet/8390/Makefile index d13790b..f43038b 100644 --- a/drivers/net/ethernet/8390/Makefile +++ b/drivers/net/ethernet/8390/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_HPLAN_PLUS) += hp-plus.o 8390p.o obj-$(CONFIG_HPLAN) += hp.o 8390p.o obj-$(CONFIG_HYDRA) += hydra.o 8390.o obj-$(CONFIG_LNE390) += lne390.o 8390.o +obj-$(CONFIG_MCF8390) += mcf8390.o 8390.o obj-$(CONFIG_NE2000) += ne.o 8390p.o obj-$(CONFIG_NE2_MCA) += ne2.o 8390p.o obj-$(CONFIG_NE2K_PCI) += ne2k-pci.o 8390.o diff --git a/drivers/net/ethernet/8390/mcf8390.c b/drivers/net/ethernet/8390/mcf8390.c new file mode 100644 index 0000000..230efd6 --- /dev/null +++ b/drivers/net/ethernet/8390/mcf8390.c @@ -0,0 +1,480 @@ +/* + * Support for ColdFire CPU based boards using a NS8390 Ethernet device. + * + * Derived from the many other 8390 drivers. + * + * (C) Copyright 2012, Greg Ungerer <gerg@uclinux.org> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/jiffies.h> +#include <linux/io.h> +#include <asm/mcf8390.h> + +static const char version[] = + "mcf8390.c: (15-06-2012) Greg Ungerer <gerg@uclinux.org>"; + +#define NE_CMD 0x00 +#define NE_DATAPORT 0x10 /* NatSemi-defined port window offset */ +#define NE_RESET 0x1f /* Issue a read to reset ,a write to clear */ +#define NE_EN0_ISR 0x07 +#define NE_EN0_DCFG 0x0e +#define NE_EN0_RSARLO 0x08 +#define NE_EN0_RSARHI 0x09 +#define NE_EN0_RCNTLO 0x0a +#define NE_EN0_RXCR 0x0c +#define NE_EN0_TXCR 0x0d +#define NE_EN0_RCNTHI 0x0b +#define NE_EN0_IMR 0x0f + +#define NESM_START_PG 0x40 /* First page of TX buffer */ +#define NESM_STOP_PG 0x80 /* Last page +1 of RX ring */ + +#ifdef NE2000_ODDOFFSET +/* + * A lot of the ColdFire boards use a separate address region for odd offset + * register addresses. The following functions convert and map as required. + * Note that the data port accesses are treated a little differently, and + * always accessed via the insX/outsX functions. + */ +static inline u32 NE_PTR(u32 addr) +{ + if (addr & 1) + return addr - 1 + NE2000_ODDOFFSET; + return addr; +} + +static inline u32 NE_DATA_PTR(u32 addr) +{ + return addr; +} + +void ei_outb(u32 val, u32 addr) +{ + NE2000_BYTE *rp; + + rp = (NE2000_BYTE *) NE_PTR(addr); + *rp = RSWAP(val); +} + +#define ei_inb ei_inb +u8 ei_inb(u32 addr) +{ + NE2000_BYTE *rp, val; + + rp = (NE2000_BYTE *) NE_PTR(addr); + val = *rp; + return (u8) (RSWAP(val) & 0xff); +} + +void ei_insb(u32 addr, void *vbuf, int len) +{ + NE2000_BYTE *rp, val; + u8 *buf; + + buf = (u8 *) vbuf; + rp = (NE2000_BYTE *) NE_DATA_PTR(addr); + for (; (len > 0); len--) { + val = *rp; + *buf++ = RSWAP(val); + } +} + +void ei_insw(u32 addr, void *vbuf, int len) +{ + volatile u16 *rp; + u16 w, *buf; + + buf = (u16 *) vbuf; + rp = (volatile u16 *) NE_DATA_PTR(addr); + for (; (len > 0); len--) { + w = *rp; + *buf++ = BSWAP(w); + } +} + +void ei_outsb(u32 addr, const void *vbuf, int len) +{ + NE2000_BYTE *rp, val; + u8 *buf; + + buf = (u8 *) vbuf; + rp = (NE2000_BYTE *) NE_DATA_PTR(addr); + for (; (len > 0); len--) { + val = *buf++; + *rp = RSWAP(val); + } +} + +void ei_outsw(u32 addr, const void *vbuf, int len) +{ + volatile u16 *rp; + u16 w, *buf; + + buf = (u16 *) vbuf; + rp = (volatile u16 *) NE_DATA_PTR(addr); + for (; (len > 0); len--) { + w = *buf++; + *rp = BSWAP(w); + } +} + +#else /* !NE2000_ODDOFFSET */ + +#define ei_inb inb +#define ei_outb outb +#define ei_insb insb +#define ei_insw insw +#define ei_outsb outsb +#define ei_outsw outsw + +#endif /* !NE2000_ODDOFFSET */ + +#define ei_inb_p ei_inb +#define ei_outb_p ei_outb + +#include "lib8390.c" + +/* + * Hard reset the card. This used to pause for the same period that a + * 8390 reset command required, but that shouldn't be necessary. + */ +static void mcf8390_reset_8390(struct net_device *dev) +{ + unsigned long reset_start_time = jiffies; + u32 addr = dev->base_addr; + + if (ei_debug > 1) + netdev_dbg(dev, "resetting the 8390 t=%ld...\n", jiffies); + + ei_outb(ei_inb(addr + NE_RESET), addr + NE_RESET); + + ei_status.txing = 0; + ei_status.dmaing = 0; + + /* This check _should_not_ be necessary, omit eventually. */ + while ((ei_inb(addr + NE_EN0_ISR) & ENISR_RESET) == 0) { + if (time_after(jiffies, reset_start_time + 2 * HZ / 100)) { + netdev_warn(dev, "%s: did not complete\n", __func__); + break; + } + } + + ei_outb(ENISR_RESET, addr + NE_EN0_ISR); +} + +/* + * This *shouldn't* happen. + * If it does, it's the last thing you'll see + */ +static void mcf8390_dmaing_err(const char *func, struct net_device *dev, + struct ei_device *ei_local) +{ + netdev_err(dev, "%s: DMAing conflict [DMAstat:%d][irqlock:%d]\n", + func, ei_local->dmaing, ei_local->irqlock); +} + +/* + * Grab the 8390 specific header. Similar to the block_input routine, but + * we don't need to be concerned with ring wrap as the header will be at + * the start of a page, so we optimize accordingly. + */ +static void mcf8390_get_8390_hdr(struct net_device *dev, + struct e8390_pkt_hdr *hdr, int ring_page) +{ + struct ei_device *ei_local = netdev_priv(dev); + u32 addr = dev->base_addr; + + if (ei_local->dmaing) { + mcf8390_dmaing_err(__func__, dev, ei_local); + return; + } + + ei_local->dmaing |= 0x01; + ei_outb(E8390_NODMA + E8390_PAGE0 + E8390_START, addr + NE_CMD); + ei_outb(ENISR_RDC, addr + NE_EN0_ISR); + ei_outb(sizeof(struct e8390_pkt_hdr), addr + NE_EN0_RCNTLO); + ei_outb(0, addr + NE_EN0_RCNTHI); + ei_outb(0, addr + NE_EN0_RSARLO); /* On page boundary */ + ei_outb(ring_page, addr + NE_EN0_RSARHI); + ei_outb(E8390_RREAD + E8390_START, addr + NE_CMD); + + ei_insw(addr + NE_DATAPORT, hdr, sizeof(struct e8390_pkt_hdr) >> 1); + + outb(ENISR_RDC, addr + NE_EN0_ISR); /* Ack intr */ + ei_local->dmaing &= ~0x01; + + hdr->count = cpu_to_le16(hdr->count); +} + +/* + * Block input and output, similar to the Crynwr packet driver. + * If you are porting to a new ethercard, look at the packet driver source + * for hints. The NEx000 doesn't share the on-board packet memory -- + * you have to put the packet out through the "remote DMA" dataport + * using z_writeb. + */ +static void mcf8390_block_input(struct net_device *dev, int count, + struct sk_buff *skb, int ring_offset) +{ + struct ei_device *ei_local = netdev_priv(dev); + u32 addr = dev->base_addr; + char *buf = skb->data; + + if (ei_local->dmaing) { + mcf8390_dmaing_err(__func__, dev, ei_local); + return; + } + + ei_local->dmaing |= 0x01; + ei_outb(E8390_NODMA + E8390_PAGE0 + E8390_START, addr + NE_CMD); + ei_outb(ENISR_RDC, addr + NE_EN0_ISR); + ei_outb(count & 0xff, addr + NE_EN0_RCNTLO); + ei_outb(count >> 8, addr + NE_EN0_RCNTHI); + ei_outb(ring_offset & 0xff, addr + NE_EN0_RSARLO); + ei_outb(ring_offset >> 8, addr + NE_EN0_RSARHI); + ei_outb(E8390_RREAD + E8390_START, addr + NE_CMD); + + ei_insw(addr + NE_DATAPORT, buf, count >> 1); + if (count & 1) + buf[count - 1] = ei_inb(addr + NE_DATAPORT); + + ei_outb(ENISR_RDC, addr + NE_EN0_ISR); /* Ack intr */ + ei_local->dmaing &= ~0x01; +} + +static void mcf8390_block_output(struct net_device *dev, int count, + const unsigned char *buf, + const int start_page) +{ + struct ei_device *ei_local = netdev_priv(dev); + u32 addr = dev->base_addr; + unsigned long dma_start; + + /* Make sure we transfer all bytes if 16bit IO writes */ + if (count & 0x1) + count++; + + if (ei_local->dmaing) { + mcf8390_dmaing_err(__func__, dev, ei_local); + return; + } + + ei_local->dmaing |= 0x01; + /* We should already be in page 0, but to be safe... */ + ei_outb(E8390_PAGE0 + E8390_START + E8390_NODMA, addr + NE_CMD); + + ei_outb(ENISR_RDC, addr + NE_EN0_ISR); + + /* Now the normal output. */ + ei_outb(count & 0xff, addr + NE_EN0_RCNTLO); + ei_outb(count >> 8, addr + NE_EN0_RCNTHI); + ei_outb(0x00, addr + NE_EN0_RSARLO); + ei_outb(start_page, addr + NE_EN0_RSARHI); + ei_outb(E8390_RWRITE + E8390_START, addr + NE_CMD); + + ei_outsw(addr + NE_DATAPORT, buf, count >> 1); + + dma_start = jiffies; + while ((ei_inb(addr + NE_EN0_ISR) & ENISR_RDC) == 0) { + if (time_after(jiffies, dma_start + 2 * HZ / 100)) { /* 20ms */ + netdev_err(dev, "timeout waiting for Tx RDC\n"); + mcf8390_reset_8390(dev); + __NS8390_init(dev, 1); + break; + } + } + + ei_outb(ENISR_RDC, addr + NE_EN0_ISR); /* Ack intr */ + ei_local->dmaing &= ~0x01; +} + +static const struct net_device_ops mcf8390_netdev_ops = { + .ndo_open = __ei_open, + .ndo_stop = __ei_close, + .ndo_start_xmit = __ei_start_xmit, + .ndo_tx_timeout = __ei_tx_timeout, + .ndo_get_stats = __ei_get_stats, + .ndo_set_rx_mode = __ei_set_multicast_list, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_mac_address = eth_mac_addr, + .ndo_change_mtu = eth_change_mtu, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = __ei_poll, +#endif +}; + +static int mcf8390_init(struct net_device *dev) +{ + static u32 offsets[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + }; + struct ei_device *ei_local = netdev_priv(dev); + unsigned char SA_prom[32]; + u32 addr = dev->base_addr; + int start_page, stop_page; + int i, ret; + + mcf8390_reset_8390(dev); + + /* + * Read the 16 bytes of station address PROM. + * We must first initialize registers, + * similar to NS8390_init(eifdev, 0). + * We can't reliably read the SAPROM address without this. + * (I learned the hard way!). + */ + { + static const struct { + u32 value; + u32 offset; + } program_seq[] = { + {E8390_NODMA + E8390_PAGE0 + E8390_STOP, NE_CMD}, + /* Select page 0 */ + {0x48, NE_EN0_DCFG}, /* 0x48: Set byte-wide access */ + {0x00, NE_EN0_RCNTLO}, /* Clear the count regs */ + {0x00, NE_EN0_RCNTHI}, + {0x00, NE_EN0_IMR}, /* Mask completion irq */ + {0xFF, NE_EN0_ISR}, + {E8390_RXOFF, NE_EN0_RXCR}, /* 0x20 Set to monitor */ + {E8390_TXOFF, NE_EN0_TXCR}, /* 0x02 and loopback mode */ + {32, NE_EN0_RCNTLO}, + {0x00, NE_EN0_RCNTHI}, + {0x00, NE_EN0_RSARLO}, /* DMA starting at 0x0000 */ + {0x00, NE_EN0_RSARHI}, + {E8390_RREAD + E8390_START, NE_CMD}, + }; + for (i = 0; i < ARRAY_SIZE(program_seq); i++) { + ei_outb(program_seq[i].value, + addr + program_seq[i].offset); + } + } + + for (i = 0; i < 16; i++) { + SA_prom[i] = ei_inb(addr + NE_DATAPORT); + ei_inb(addr + NE_DATAPORT); + } + + /* We must set the 8390 for word mode. */ + ei_outb(0x49, addr + NE_EN0_DCFG); + start_page = NESM_START_PG; + stop_page = NESM_STOP_PG; + + /* Install the Interrupt handler */ + ret = request_irq(dev->irq, __ei_interrupt, 0, dev->name, dev); + if (ret) + return ret; + + for (i = 0; i < ETH_ALEN; i++) + dev->dev_addr[i] = SA_prom[i]; + + netdev_dbg(dev, "Found ethernet address: %pM\n", dev->dev_addr); + + ei_local->name = "mcf8390"; + ei_local->tx_start_page = start_page; + ei_local->stop_page = stop_page; + ei_local->word16 = 1; + ei_local->rx_start_page = start_page + TX_PAGES; + ei_local->reset_8390 = mcf8390_reset_8390; + ei_local->block_input = mcf8390_block_input; + ei_local->block_output = mcf8390_block_output; + ei_local->get_8390_hdr = mcf8390_get_8390_hdr; + ei_local->reg_offset = offsets; + + dev->netdev_ops = &mcf8390_netdev_ops; + __NS8390_init(dev, 0); + ret = register_netdev(dev); + if (ret) { + free_irq(dev->irq, dev); + return ret; + } + + netdev_info(dev, "addr=0x%08x irq=%d, Ethernet Address %pM\n", + addr, dev->irq, dev->dev_addr); + return 0; +} + +static int mcf8390_probe(struct platform_device *pdev) +{ + struct net_device *dev; + struct ei_device *ei_local; + struct resource *mem, *irq; + resource_size_t msize; + int ret; + + irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (irq == NULL) { + dev_err(&pdev->dev, "no IRQ specified?\n"); + return -ENXIO; + } + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (mem == NULL) { + dev_err(&pdev->dev, "no memory address specified?\n"); + return -ENXIO; + } + msize = resource_size(mem); + if (!request_mem_region(mem->start, msize, pdev->name)) + return -EBUSY; + + dev = ____alloc_ei_netdev(0); + if (dev == NULL) { + release_mem_region(mem->start, msize); + return -ENOMEM; + } + + SET_NETDEV_DEV(dev, &pdev->dev); + platform_set_drvdata(pdev, dev); + ei_local = netdev_priv(dev); + + dev->irq = irq->start; + dev->base_addr = mem->start; + + ret = mcf8390_init(dev); + if (ret) { + release_mem_region(mem->start, msize); + free_netdev(dev); + return ret; + } + return 0; +} + +static int mcf8390_remove(struct platform_device *pdev) +{ + struct net_device *dev = platform_get_drvdata(pdev); + struct resource *mem; + + unregister_netdev(dev); + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (mem) + release_mem_region(mem->start, resource_size(mem)); + free_netdev(dev); + return 0; +} + +static struct platform_driver mcf8390_drv = { + .driver = { + .name = "mcf8390", + .owner = THIS_MODULE, + }, + .probe = mcf8390_probe, + .remove = mcf8390_remove, +}; + +module_platform_driver(mcf8390_drv); + +MODULE_DESCRIPTION("MCF8390 ColdFire NS8390 driver"); +MODULE_AUTHOR("Greg Ungerer <gerg@uclinux.org>"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:mcf8390"); diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c index b2c1b676..bc165f4 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c @@ -1437,8 +1437,6 @@ netxen_handle_linkevent(struct netxen_adapter *adapter, nx_fw_msg_t *msg) netdev->name, cable_len); } - netxen_advert_link_change(adapter, link_status); - /* update link parameters */ if (duplex == LINKEVENT_FULL_DUPLEX) adapter->link_duplex = DUPLEX_FULL; @@ -1447,6 +1445,8 @@ netxen_handle_linkevent(struct netxen_adapter *adapter, nx_fw_msg_t *msg) adapter->module_type = module; adapter->link_autoneg = autoneg; adapter->link_speed = link_speed; + + netxen_advert_link_change(adapter, link_status); } static void diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index 05ecf14..bd7cbaa 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -589,7 +589,8 @@ smsc95xx_ethtool_getregs(struct net_device *netdev, struct ethtool_regs *regs, void *buf) { struct usbnet *dev = netdev_priv(netdev); - unsigned int i, j, retval; + unsigned int i, j; + int retval; u32 *data = buf; retval = smsc95xx_read_reg(dev, ID_REV, ®s->version); diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 8260ef7..bc6c8fd 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -410,6 +410,22 @@ struct tcp6_sock { extern int inet6_sk_rebuild_header(struct sock *sk); +struct inet6_timewait_sock { + struct in6_addr tw_v6_daddr; + struct in6_addr tw_v6_rcv_saddr; +}; + +struct tcp6_timewait_sock { + struct tcp_timewait_sock tcp6tw_tcp; + struct inet6_timewait_sock tcp6tw_inet6; +}; + +static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk) +{ + return (struct inet6_timewait_sock *)(((u8 *)sk) + + inet_twsk(sk)->tw_ipv6_offset); +} + #if IS_ENABLED(CONFIG_IPV6) static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) { @@ -459,28 +475,12 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only) #define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk)) -struct inet6_timewait_sock { - struct in6_addr tw_v6_daddr; - struct in6_addr tw_v6_rcv_saddr; -}; - -struct tcp6_timewait_sock { - struct tcp_timewait_sock tcp6tw_tcp; - struct inet6_timewait_sock tcp6tw_inet6; -}; - static inline u16 inet6_tw_offset(const struct proto *prot) { return prot->twsk_prot->twsk_obj_size - sizeof(struct inet6_timewait_sock); } -static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk) -{ - return (struct inet6_timewait_sock *)(((u8 *)sk) + - inet_twsk(sk)->tw_ipv6_offset); -} - static inline struct in6_addr *__inet6_rcv_saddr(const struct sock *sk) { return likely(sk->sk_state != TCP_TIME_WAIT) ? diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h index 38fbd4b..082eafa 100644 --- a/include/linux/pkt_cls.h +++ b/include/linux/pkt_cls.h @@ -453,7 +453,8 @@ enum { #define TCF_EM_TEXT 5 #define TCF_EM_VLAN 6 #define TCF_EM_CANID 7 -#define TCF_EM_MAX 7 +#define TCF_EM_IPSET 8 +#define TCF_EM_MAX 8 enum { TCF_EM_PROG_TC diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2de9cf4..1888169 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -339,6 +339,9 @@ struct tcp_sock { u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ + struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ + unsigned long tsq_flags; + /* Data for direct copy to user */ struct { struct sk_buff_head prequeue; @@ -494,6 +497,12 @@ struct tcp_sock { struct tcp_cookie_values *cookie_values; }; +enum tsq_flags { + TSQ_THROTTLED, + TSQ_QUEUED, + TSQ_OWNED, /* tcp_tasklet_func() found socket was locked */ +}; + static inline struct tcp_sock *tcp_sk(const struct sock *sk) { return (struct tcp_sock *)sk; diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 4badc86..085931f 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -25,6 +25,7 @@ struct dst_ops { struct dst_entry * (*negative_advice)(struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, u32 mtu); + void (*redirect)(struct dst_entry *dst, struct sk_buff *skb); int (*local_out)(struct sk_buff *skb); struct neighbour * (*neigh_lookup)(const struct dst_entry *dst, struct sk_buff *skb, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 58cb3fc..b6b6f7d 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -133,17 +133,12 @@ extern int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr); -extern void rt6_redirect(const struct in6_addr *dest, - const struct in6_addr *src, - const struct in6_addr *saddr, - struct neighbour *neigh, - u8 *lladdr, - int on_link); - extern void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark); extern void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); +extern void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark); +extern void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk); struct netlink_callback; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d4261d4..f695f39 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -251,6 +251,8 @@ static inline void fl6_sock_release(struct ip6_flowlabel *fl) atomic_dec(&fl->users); } +extern void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); + extern int ip6_ra_control(struct sock *sk, int sel); extern int ipv6_parse_hopopts(struct sk_buff *skb); diff --git a/include/net/ndisc.h b/include/net/ndisc.h index c02b6ad..96a3b5c 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -47,6 +47,8 @@ enum { #include <linux/icmpv6.h> #include <linux/in6.h> #include <linux/types.h> +#include <linux/if_arp.h> +#include <linux/netdevice.h> #include <net/neighbour.h> @@ -80,6 +82,54 @@ struct nd_opt_hdr { __u8 nd_opt_len; } __packed; +/* ND options */ +struct ndisc_options { + struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX]; +#ifdef CONFIG_IPV6_ROUTE_INFO + struct nd_opt_hdr *nd_opts_ri; + struct nd_opt_hdr *nd_opts_ri_end; +#endif + struct nd_opt_hdr *nd_useropts; + struct nd_opt_hdr *nd_useropts_end; +}; + +#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] +#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] +#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] +#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] +#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] +#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] + +#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) + +extern struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, + struct ndisc_options *ndopts); + +/* + * Return the padding between the option length and the start of the + * link addr. Currently only IP-over-InfiniBand needs this, although + * if RFC 3831 IPv6-over-Fibre Channel is ever implemented it may + * also need a pad of 2. + */ +static int ndisc_addr_option_pad(unsigned short type) +{ + switch (type) { + case ARPHRD_INFINIBAND: return 2; + default: return 0; + } +} + +static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, + struct net_device *dev) +{ + u8 *lladdr = (u8 *)(p + 1); + int lladdrlen = p->nd_opt_len << 3; + int prepad = ndisc_addr_option_pad(dev->type); + if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad)) + return NULL; + return lladdr + prepad; +} + static inline u32 ndisc_hashfn(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { const u32 *p32 = pkey; diff --git a/include/net/route.h b/include/net/route.h index 5236236..ace3cb4 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -108,8 +108,6 @@ extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; extern int ip_rt_init(void); -extern void ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw, - __be32 src, struct net_device *dev); extern void rt_cache_flush(struct net *net, int how); extern void rt_cache_flush_batch(struct net *net); extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); @@ -181,6 +179,9 @@ static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 s extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u32 mark, u8 protocol, int flow_flags); extern void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu); +extern void ipv4_redirect(struct sk_buff *skb, struct net *net, + int oif, u32 mark, u8 protocol, int flow_flags); +extern void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk); extern void ip_rt_send_redirect(struct sk_buff *skb); extern unsigned int inet_addr_type(struct net *net, __be32 addr); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index a2ef814..1f2735d 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -162,6 +162,8 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *, void sctp_err_finish(struct sock *, struct sctp_association *); void sctp_icmp_frag_needed(struct sock *, struct sctp_association *, struct sctp_transport *t, __u32 pmtu); +void sctp_icmp_redirect(struct sock *, struct sctp_transport *, + struct sk_buff *); void sctp_icmp_proto_unreachable(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t); diff --git a/include/net/sock.h b/include/net/sock.h index dcb54a0..88de092 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -858,6 +858,8 @@ struct proto { int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); + void (*release_cb)(struct sock *sk); + /* Keeping track of sk's, looking them up, and port selection methods. */ void (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 3618fef..439984b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -253,6 +253,7 @@ extern int sysctl_tcp_cookie_size; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; +extern int sysctl_tcp_limit_output_bytes; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -321,6 +322,8 @@ extern struct proto tcp_prot; extern void tcp_init_mem(struct net *net); +extern void tcp_tasklet_init(void); + extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); @@ -334,6 +337,7 @@ extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size); extern int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +extern void tcp_release_cb(struct sock *sk); extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len); diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index b98d3d7..81f76c4 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -115,6 +115,10 @@ static void fake_update_pmtu(struct dst_entry *dst, u32 mtu) { } +static void fake_redirect(struct dst_entry *dst, struct sk_buff *skb) +{ +} + static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) { return NULL; @@ -136,6 +140,7 @@ static struct dst_ops fake_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), .update_pmtu = fake_update_pmtu, + .redirect = fake_redirect, .cow_metrics = fake_cow_metrics, .neigh_lookup = fake_neigh_lookup, .mtu = fake_mtu, diff --git a/net/core/sock.c b/net/core/sock.c index 929bdcc..24039ac 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2159,6 +2159,10 @@ void release_sock(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); + + if (sk->sk_prot->release_cb) + sk->sk_prot->release_cb(sk); + sk->sk_lock.owned = 0; if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 3eb76b5..129ed8f 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -195,6 +195,14 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk, } /* else let the usual retransmit timer handle it */ } +static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_check(sk, 0); + + if (dst) + dst->ops->redirect(dst, skb); +} + /* * This routine is called by the ICMP module when it gets some sort of error * condition. If err < 0 then the socket should be closed and the error @@ -259,6 +267,9 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) } switch (type) { + case ICMP_REDIRECT: + dccp_do_redirect(skb, sk); + goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ goto out; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 02162cf..090c080 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -130,6 +130,13 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, np = inet6_sk(sk); + if (type == NDISC_REDIRECT) { + struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst) + dst->ops->redirect(dst, skb); + } + if (type == ICMPV6_PKT_TOOBIG) { struct dst_entry *dst = NULL; diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index b5594cc..e9c4e2e 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -118,6 +118,7 @@ static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); static void dn_dst_link_failure(struct sk_buff *); static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu); +static void dn_dst_redirect(struct dst_entry *dst, struct sk_buff *skb); static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); @@ -145,6 +146,7 @@ static struct dst_ops dn_dst_ops = { .negative_advice = dn_dst_negative_advice, .link_failure = dn_dst_link_failure, .update_pmtu = dn_dst_update_pmtu, + .redirect = dn_dst_redirect, .neigh_lookup = dn_dst_neigh_lookup, }; @@ -292,6 +294,10 @@ static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu) } } +static void dn_dst_redirect(struct dst_entry *dst, struct sk_buff *skb) +{ +} + /* * When a route has been marked obsolete. (e.g. routing cache flush) */ diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c index f4070e5..6871ec1 100644 --- a/net/ieee802154/6lowpan.c +++ b/net/ieee802154/6lowpan.c @@ -113,7 +113,6 @@ struct lowpan_dev_record { struct lowpan_fragment { struct sk_buff *skb; /* skb to be assembled */ - spinlock_t lock; /* concurency lock */ u16 length; /* length to be assemled */ u32 bytes_rcv; /* bytes received */ u16 tag; /* current fragment tag */ @@ -123,7 +122,7 @@ struct lowpan_fragment { static unsigned short fragment_tag; static LIST_HEAD(lowpan_fragments); -spinlock_t flist_lock; +static DEFINE_SPINLOCK(flist_lock); static inline struct lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) @@ -637,16 +636,13 @@ static void lowpan_fragment_timer_expired(unsigned long entry_addr) pr_debug("timer expired for frame with tag %d\n", entry->tag); - spin_lock(&flist_lock); list_del(&entry->list); - spin_unlock(&flist_lock); - dev_kfree_skb(entry->skb); kfree(entry); } static struct lowpan_fragment * -lowpan_alloc_new_frame(struct sk_buff *skb, u8 iphc0, u8 len, u8 tag) +lowpan_alloc_new_frame(struct sk_buff *skb, u8 iphc0, u8 len, u16 tag) { struct lowpan_fragment *frame; @@ -661,8 +657,8 @@ lowpan_alloc_new_frame(struct sk_buff *skb, u8 iphc0, u8 len, u8 tag) frame->tag = tag; /* allocate buffer for frame assembling */ - frame->skb = alloc_skb(frame->length + - sizeof(struct ipv6hdr), GFP_ATOMIC); + frame->skb = netdev_alloc_skb_ip_align(skb->dev, frame->length + + sizeof(struct ipv6hdr)); if (!frame->skb) goto skb_err; @@ -727,7 +723,7 @@ lowpan_process_data(struct sk_buff *skb) * check if frame assembling with the same tag is * already in progress */ - spin_lock(&flist_lock); + spin_lock_bh(&flist_lock); list_for_each_entry(frame, &lowpan_fragments, list) if (frame->tag == tag) { @@ -761,9 +757,9 @@ lowpan_process_data(struct sk_buff *skb) if ((frame->bytes_rcv == frame->length) && frame->timer.expires > jiffies) { /* if timer haven't expired - first of all delete it */ - del_timer(&frame->timer); + del_timer_sync(&frame->timer); list_del(&frame->list); - spin_unlock(&flist_lock); + spin_unlock_bh(&flist_lock); dev_kfree_skb(skb); skb = frame->skb; @@ -774,7 +770,7 @@ lowpan_process_data(struct sk_buff *skb) break; } - spin_unlock(&flist_lock); + spin_unlock_bh(&flist_lock); return kfree_skb(skb), 0; } @@ -929,7 +925,7 @@ lowpan_process_data(struct sk_buff *skb) return lowpan_skb_deliver(skb, &hdr); unlock_and_drop: - spin_unlock(&flist_lock); + spin_unlock_bh(&flist_lock); drop: kfree_skb(skb); return -EINVAL; @@ -1186,8 +1182,6 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev, list_add_tail(&entry->list, &lowpan_devices); mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx); - spin_lock_init(&flist_lock); - register_netdevice(dev); return 0; @@ -1198,19 +1192,9 @@ static void lowpan_dellink(struct net_device *dev, struct list_head *head) struct lowpan_dev_info *lowpan_dev = lowpan_dev_info(dev); struct net_device *real_dev = lowpan_dev->real_dev; struct lowpan_dev_record *entry, *tmp; - struct lowpan_fragment *frame, *tframe; ASSERT_RTNL(); - spin_lock(&flist_lock); - list_for_each_entry_safe(frame, tframe, &lowpan_fragments, list) { - del_timer(&frame->timer); - list_del(&frame->list); - dev_kfree_skb(frame->skb); - kfree(frame); - } - spin_unlock(&flist_lock); - mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx); list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) { if (entry->ldev == dev) { @@ -1266,9 +1250,24 @@ out: static void __exit lowpan_cleanup_module(void) { + struct lowpan_fragment *frame, *tframe; + lowpan_netlink_fini(); dev_remove_pack(&lowpan_packet_type); + + /* Now 6lowpan packet_type is removed, so no new fragments are + * expected on RX, therefore that's the time to clean incomplete + * fragments. + */ + spin_lock_bh(&flist_lock); + list_for_each_entry_safe(frame, tframe, &lowpan_fragments, list) { + del_timer_sync(&frame->timer); + list_del(&frame->list); + dev_kfree_skb(frame->skb); + kfree(frame); + } + spin_unlock_bh(&flist_lock); } module_init(lowpan_init_module); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 916d5ec..a0d8392 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -398,17 +398,25 @@ static void ah4_err(struct sk_buff *skb, u32 info) struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || - icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return; + case ICMP_REDIRECT: + break; + default: return; + } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) return; - pr_debug("pmtu discovery on SA AH/%08x/%08x\n", - ntohl(ah->spi), ntohl(iph->daddr)); - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); + else + ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); xfrm_state_put(x); } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 7b95b49..b61e9de 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -484,17 +484,25 @@ static void esp4_err(struct sk_buff *skb, u32 info) struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || - icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return; + case ICMP_REDIRECT: + break; + default: return; + } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); if (!x) return; - NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", - ntohl(esph->spi), ntohl(iph->daddr)); - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); + else + ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); xfrm_state_put(x); } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4a04944..d01aeb4 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -634,18 +634,31 @@ out:; EXPORT_SYMBOL(icmp_send); +static void icmp_socket_deliver(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct net_protocol *ipprot; + int protocol = iph->protocol; + + raw_icmp_error(skb, protocol, info); + + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, info); + rcu_read_unlock(); +} + /* * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. */ static void icmp_unreach(struct sk_buff *skb) { - const struct net_protocol *ipprot; const struct iphdr *iph; struct icmphdr *icmph; struct net *net; u32 info = 0; - int protocol; net = dev_net(skb_dst(skb)->dev); @@ -726,19 +739,7 @@ static void icmp_unreach(struct sk_buff *skb) if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) goto out; - iph = (const struct iphdr *)skb->data; - protocol = iph->protocol; - - /* - * Deliver ICMP message to raw sockets. Pretty useless feature? - */ - raw_icmp_error(skb, protocol, info); - - rcu_read_lock(); - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot && ipprot->err_handler) - ipprot->err_handler(skb, info); - rcu_read_unlock(); + icmp_socket_deliver(skb, info); out: return; @@ -754,46 +755,15 @@ out_err: static void icmp_redirect(struct sk_buff *skb) { - const struct iphdr *iph; - - if (skb->len < sizeof(struct iphdr)) - goto out_err; - - /* - * Get the copied header of the packet that caused the redirect - */ - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto out; - - iph = (const struct iphdr *)skb->data; - - switch (icmp_hdr(skb)->code & 7) { - case ICMP_REDIR_NET: - case ICMP_REDIR_NETTOS: - /* - * As per RFC recommendations now handle it as a host redirect. - */ - case ICMP_REDIR_HOST: - case ICMP_REDIR_HOSTTOS: - ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, - icmp_hdr(skb)->un.gateway, - iph->saddr, skb->dev); - break; + if (skb->len < sizeof(struct iphdr)) { + ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); + return; } - /* Ping wants to see redirects. - * Let's pretend they are errors of sorts... */ - if (iph->protocol == IPPROTO_ICMP && - iph->ihl >= 5 && - pskb_may_pull(skb, (iph->ihl<<2)+8)) { - ping_err(skb, icmp_hdr(skb)->un.gateway); - } + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return; -out: - return; -out_err: - ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); - goto out; + icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); } /* diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 594cec3..0c31235 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -528,6 +528,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info) if (code != ICMP_EXC_TTL) return; break; + + case ICMP_REDIRECT: + break; } rcu_read_lock(); @@ -543,7 +546,11 @@ static void ipgre_err(struct sk_buff *skb, u32 info) t->parms.link, 0, IPPROTO_GRE, 0); goto out; } - + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, + IPPROTO_GRE, 0); + goto out; + } if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) goto out; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index b913754..d3ab47e 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -31,18 +31,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || - icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return; + case ICMP_REDIRECT: + break; + default: return; + } spi = htonl(ntohs(ipch->cpi)); x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) return; - NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", - spi, &iph->daddr); - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); + else + ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); xfrm_state_put(x); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 715338a..c2d0e6d 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -360,6 +360,8 @@ static int ipip_err(struct sk_buff *skb, u32 info) if (code != ICMP_EXC_TTL) return 0; break; + case ICMP_REDIRECT: + break; } err = -ENOENT; @@ -376,6 +378,13 @@ static int ipip_err(struct sk_buff *skb, u32 info) goto out; } + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, + IPPROTO_IPIP, 0); + err = 0; + goto out; + } + if (t->parms.iph.daddr == 0) goto out; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 340fcf2..6232d47 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -387,6 +387,7 @@ void ping_err(struct sk_buff *skb, u32 info) break; case ICMP_REDIRECT: /* See ICMP_SOURCE_QUENCH */ + ipv4_sk_redirect(skb, sk); err = EREMOTEIO; break; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 659ddfb..ff0f071 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -218,6 +218,8 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_sk_update_pmtu(skb, sk, info); + else if (type == ICMP_REDIRECT) + ipv4_sk_redirect(skb, sk); /* Report error on raw socket, if: 1. User requested ip_recverr. diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 95bfa1b..9319bf1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -149,6 +149,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); +static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb); static int rt_garbage_collect(struct dst_ops *ops); static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -179,6 +180,7 @@ static struct dst_ops ipv4_dst_ops = { .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, + .redirect = ip_do_redirect, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, }; @@ -1271,16 +1273,32 @@ static void rt_del(unsigned int hash, struct rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash)); } -/* called in rcu_read_lock() section */ -void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, - __be32 saddr, struct net_device *dev) +static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb) { - int s, i; - struct in_device *in_dev = __in_dev_get_rcu(dev); - __be32 skeys[2] = { saddr, 0 }; - int ikeys[2] = { dev->ifindex, 0 }; + __be32 new_gw = icmp_hdr(skb)->un.gateway; + __be32 old_gw = ip_hdr(skb)->saddr; + struct net_device *dev = skb->dev; + struct in_device *in_dev; + struct neighbour *n; + struct rtable *rt; struct net *net; + switch (icmp_hdr(skb)->code & 7) { + case ICMP_REDIR_NET: + case ICMP_REDIR_NETTOS: + case ICMP_REDIR_HOST: + case ICMP_REDIR_HOSTTOS: + break; + + default: + return; + } + + rt = (struct rtable *) dst; + if (rt->rt_gateway != old_gw) + return; + + in_dev = __in_dev_get_rcu(dev); if (!in_dev) return; @@ -1300,55 +1318,31 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - for (s = 0; s < 2; s++) { - for (i = 0; i < 2; i++) { - unsigned int hash; - struct rtable __rcu **rthp; - struct rtable *rt; - - hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); - - rthp = &rt_hash_table[hash].chain; - - while ((rt = rcu_dereference(*rthp)) != NULL) { - struct neighbour *n; - - rthp = &rt->dst.rt_next; - - if (rt->rt_key_dst != daddr || - rt->rt_key_src != skeys[s] || - rt->rt_oif != ikeys[i] || - rt_is_input_route(rt) || - rt_is_expired(rt) || - !net_eq(dev_net(rt->dst.dev), net) || - rt->dst.error || - rt->dst.dev != dev || - rt->rt_gateway != old_gw) - continue; - - n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); - if (n) { - if (!(n->nud_state & NUD_VALID)) { - neigh_event_send(n, NULL); - } else { - rt->rt_gateway = new_gw; - rt->rt_flags |= RTCF_REDIRECTED; - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); - } - neigh_release(n); - } - } + n = ipv4_neigh_lookup(dst, NULL, &new_gw); + if (n) { + if (!(n->nud_state & NUD_VALID)) { + neigh_event_send(n, NULL); + } else { + rt->rt_gateway = new_gw; + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } + neigh_release(n); } return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev)) + if (IN_DEV_LOG_MARTIANS(in_dev)) { + const struct iphdr *iph = (const struct iphdr *) skb->data; + __be32 daddr = iph->daddr; + __be32 saddr = iph->saddr; + net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" " Advised path = %pI4 -> %pI4\n", &old_gw, dev->name, &new_gw, &saddr, &daddr); + } #endif ; } @@ -1554,6 +1548,34 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); +void ipv4_redirect(struct sk_buff *skb, struct net *net, + int oif, u32 mark, u8 protocol, int flow_flags) +{ + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct flowi4 fl4; + struct rtable *rt; + + flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, + protocol, flow_flags, iph->daddr, iph->saddr, 0, 0); + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) { + ip_do_redirect(&rt->dst, skb); + ip_rt_put(rt); + } +} +EXPORT_SYMBOL_GPL(ipv4_redirect); + +void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + + return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, + sk->sk_mark, + inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + inet_sk_flowi_flags(sk)); +} +EXPORT_SYMBOL_GPL(ipv4_sk_redirect); + static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = (struct rtable *) dst; @@ -2571,6 +2593,10 @@ static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) { } +static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sk_buff *skb) +{ +} + static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old) { @@ -2585,6 +2611,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { .mtu = ipv4_blackhole_mtu, .default_advmss = ipv4_default_advmss, .update_pmtu = ipv4_rt_blackhole_update_pmtu, + .redirect = ipv4_rt_blackhole_redirect, .cow_metrics = ipv4_rt_blackhole_cow_metrics, .neigh_lookup = ipv4_neigh_lookup, }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 12aa0c5..70730f7 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -598,6 +598,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_limit_output_bytes", + .data = &sysctl_tcp_limit_output_bytes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_NET_DMA { .procname = "tcp_dma_copybreak", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d902da9..4252cd8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -376,6 +376,7 @@ void tcp_init_sock(struct sock *sk) skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); + INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; @@ -796,6 +797,10 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, inet_csk(sk)->icsk_ext_hdr_len - tp->tcp_header_len); + /* TSQ : try to have two TSO segments in flight */ + xmit_size_goal = min_t(u32, xmit_size_goal, + sysctl_tcp_limit_output_bytes >> 1); + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); /* We try hard to avoid divides here */ @@ -3574,4 +3579,5 @@ void __init tcp_init(void) tcp_secret_primary = &tcp_secret_one; tcp_secret_retiring = &tcp_secret_two; tcp_secret_secondary = &tcp_secret_two; + tcp_tasklet_init(); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ddefd39..7a0062c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -321,6 +321,14 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) } /* else let the usual retransmit timer handle it */ } +static void do_redirect(struct sk_buff *skb, struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_check(sk, 0); + + if (dst) + dst->ops->redirect(dst, skb); +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -394,6 +402,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } switch (type) { + case ICMP_REDIRECT: + do_redirect(icmp_skb, sk); + goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ goto out; @@ -2588,6 +2599,7 @@ struct proto tcp_prot = { .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, + .release_cb = tcp_release_cb, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 1fd83d3..5a38a2d 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -412,7 +412,7 @@ void tcp_update_metrics(struct sock *sk) max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { val = tcp_metric_get(tm, TCP_METRIC_CWND); - tcp_metric_set(tm, RTAX_CWND, (val + tp->snd_cwnd) >> 1); + tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1); } } else { /* Else slow start did not finish, cwnd is non-sense, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6560886..c66f2ed 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -424,6 +424,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, treq->snt_isn + 1 + tcp_s_data_size(oldtp); tcp_prequeue_init(newtp); + INIT_LIST_HEAD(&newtp->tsq_node); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c465d3e..03854ab 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; */ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; +/* Default TSQ limit of two TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 131072; + /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. @@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) @@ -783,6 +788,140 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb return size; } + +/* TCP SMALL QUEUES (TSQ) + * + * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) + * to reduce RTT and bufferbloat. + * We do this using a special skb destructor (tcp_wfree). + * + * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb + * needs to be reallocated in a driver. + * The invariant being skb->truesize substracted from sk->sk_wmem_alloc + * + * Since transmit from skb destructor is forbidden, we use a tasklet + * to process all sockets that eventually need to send more skbs. + * We use one tasklet per cpu, with its own queue of sockets. + */ +struct tsq_tasklet { + struct tasklet_struct tasklet; + struct list_head head; /* queue of tcp sockets */ +}; +static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); + +/* + * One tasklest per cpu tries to send more skbs. + * We run in tasklet context but need to disable irqs when + * transfering tsq->head because tcp_wfree() might + * interrupt us (non NAPI drivers) + */ +static void tcp_tasklet_func(unsigned long data) +{ + struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; + LIST_HEAD(list); + unsigned long flags; + struct list_head *q, *n; + struct tcp_sock *tp; + struct sock *sk; + + local_irq_save(flags); + list_splice_init(&tsq->head, &list); + local_irq_restore(flags); + + list_for_each_safe(q, n, &list) { + tp = list_entry(q, struct tcp_sock, tsq_node); + list_del(&tp->tsq_node); + + sk = (struct sock *)tp; + bh_lock_sock(sk); + + if (!sock_owned_by_user(sk)) { + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | + TCPF_CLOSING | TCPF_CLOSE_WAIT)) + tcp_write_xmit(sk, + tcp_current_mss(sk), + 0, 0, + GFP_ATOMIC); + } else { + /* defer the work to tcp_release_cb() */ + set_bit(TSQ_OWNED, &tp->tsq_flags); + } + bh_unlock_sock(sk); + + clear_bit(TSQ_QUEUED, &tp->tsq_flags); + sk_free(sk); + } +} + +/** + * tcp_release_cb - tcp release_sock() callback + * @sk: socket + * + * called from release_sock() to perform protocol dependent + * actions before socket release. + */ +void tcp_release_cb(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) { + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | + TCPF_CLOSING | TCPF_CLOSE_WAIT)) + tcp_write_xmit(sk, + tcp_current_mss(sk), + 0, 0, + GFP_ATOMIC); + } +} +EXPORT_SYMBOL(tcp_release_cb); + +void __init tcp_tasklet_init(void) +{ + int i; + + for_each_possible_cpu(i) { + struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); + + INIT_LIST_HEAD(&tsq->head); + tasklet_init(&tsq->tasklet, + tcp_tasklet_func, + (unsigned long)tsq); + } +} + +/* + * Write buffer destructor automatically called from kfree_skb. + * We cant xmit new skbs from this context, as we might already + * hold qdisc lock. + */ +void tcp_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct tcp_sock *tp = tcp_sk(sk); + + if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && + !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { + unsigned long flags; + struct tsq_tasklet *tsq; + + /* Keep a ref on socket. + * This last ref will be released in tcp_tasklet_func() + */ + atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); + + /* queue this socket to tasklet queue */ + local_irq_save(flags); + tsq = &__get_cpu_var(tsq_tasklet); + list_add(&tp->tsq_node, &tsq->head); + tasklet_schedule(&tsq->tasklet); + local_irq_restore(flags); + } else { + sock_wfree(skb); + } +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -844,7 +983,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); - skb_set_owner_w(skb, sk); + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ? + tcp_wfree : sock_wfree; + atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. */ th = tcp_hdr(skb); @@ -1780,6 +1924,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, while ((skb = tcp_send_head(sk))) { unsigned int limit; + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); @@ -1800,6 +1945,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } + /* TSQ : sk_wmem_alloc accounts skb truesize, + * including skb overhead. But thats OK. + */ + if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) { + set_bit(TSQ_THROTTLED, &tp->tsq_flags); + break; + } limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ee37d47..b4c3582 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -630,6 +630,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) err = icmp_err_convert[code].errno; } break; + case ICMP_REDIRECT: + ipv4_sk_redirect(skb, sk); + break; } /* diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 87d3fcc..737131c 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -202,6 +202,14 @@ static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) path->ops->update_pmtu(path, mtu); } +static void xfrm4_redirect(struct dst_entry *dst, struct sk_buff *skb) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->redirect(path, skb); +} + static void xfrm4_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; @@ -225,6 +233,7 @@ static struct dst_ops xfrm4_dst_ops = { .protocol = cpu_to_be16(ETH_P_IP), .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, + .redirect = xfrm4_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 49d4d26..7e61395 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -613,16 +613,18 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct xfrm_state *x; if (type != ICMPV6_DEST_UNREACH && - type != ICMPV6_PKT_TOOBIG) + type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) return; x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6); if (!x) return; - NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/%pI6\n", - ntohl(ah->spi), &iph->daddr); - ip6_update_pmtu(skb, net, info, 0, 0); + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, 0, 0); + else + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 89a615b..6dc7fd3 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -434,16 +434,19 @@ static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct xfrm_state *x; if (type != ICMPV6_DEST_UNREACH && - type != ICMPV6_PKT_TOOBIG) + type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) return; x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6); if (!x) return; - pr_debug("pmtu discovery on SA ESP/%08x/%pI6\n", - ntohl(esph->spi), &iph->daddr); - ip6_update_pmtu(skb, net, info, 0, 0); + + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, 0, 0); + else + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a113f7d..24d69db 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -598,7 +598,7 @@ out: icmpv6_xmit_unlock(sk); } -static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) +void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { const struct inet6_protocol *ipprot; int inner_offset; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 6af3fcf..61d1065 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -550,6 +550,9 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; + case NDISC_REDIRECT: + rel_type = ICMP_REDIRECT; + rel_code = ICMP_REDIR_HOST; default: return 0; } @@ -608,6 +611,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), rel_info); } + if (rel_type == ICMP_REDIRECT) + skb_dst(skb2)->ops->redirect(skb_dst(skb2), skb2); icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 9283238..7af5aee 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -64,7 +64,9 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, (struct ip_comp_hdr *)(skb->data + offset); struct xfrm_state *x; - if (type != ICMPV6_DEST_UNREACH && type != ICMPV6_PKT_TOOBIG) + if (type != ICMPV6_DEST_UNREACH && + type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) return; spi = htonl(ntohs(ipcomph->cpi)); @@ -73,9 +75,10 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!x) return; - pr_debug("pmtu discovery on SA IPCOMP/%08x/%pI6\n", - spi, &iph->daddr); - ip6_update_pmtu(skb, net, info, 0, 0); + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, 0, 0); + else + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0fddd57..ff36194 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -143,40 +143,6 @@ struct neigh_table nd_tbl = { .gc_thresh3 = 1024, }; -/* ND options */ -struct ndisc_options { - struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX]; -#ifdef CONFIG_IPV6_ROUTE_INFO - struct nd_opt_hdr *nd_opts_ri; - struct nd_opt_hdr *nd_opts_ri_end; -#endif - struct nd_opt_hdr *nd_useropts; - struct nd_opt_hdr *nd_useropts_end; -}; - -#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] -#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] -#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] -#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] -#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] -#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] - -#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) - -/* - * Return the padding between the option length and the start of the - * link addr. Currently only IP-over-InfiniBand needs this, although - * if RFC 3831 IPv6-over-Fibre Channel is ever implemented it may - * also need a pad of 2. - */ -static int ndisc_addr_option_pad(unsigned short type) -{ - switch (type) { - case ARPHRD_INFINIBAND: return 2; - default: return 0; - } -} - static inline int ndisc_opt_addr_space(struct net_device *dev) { return NDISC_OPT_SPACE(dev->addr_len + ndisc_addr_option_pad(dev->type)); @@ -233,8 +199,8 @@ static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, return cur <= end && ndisc_is_useropt(cur) ? cur : NULL; } -static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, - struct ndisc_options *ndopts) +struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, + struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; @@ -297,17 +263,6 @@ static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, return ndopts; } -static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, - struct net_device *dev) -{ - u8 *lladdr = (u8 *)(p + 1); - int lladdrlen = p->nd_opt_len << 3; - int prepad = ndisc_addr_option_pad(dev->type); - if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad)) - return NULL; - return lladdr + prepad; -} - int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir) { switch (dev->type) { @@ -1379,16 +1334,6 @@ out: static void ndisc_redirect_rcv(struct sk_buff *skb) { - struct inet6_dev *in6_dev; - struct icmp6hdr *icmph; - const struct in6_addr *dest; - const struct in6_addr *target; /* new first hop to destination */ - struct neighbour *neigh; - int on_link = 0; - struct ndisc_options ndopts; - int optlen; - u8 *lladdr = NULL; - #ifdef CONFIG_IPV6_NDISC_NODETYPE switch (skb->ndisc_nodetype) { case NDISC_NODETYPE_HOST: @@ -1405,65 +1350,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) return; } - optlen = skb->tail - skb->transport_header; - optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); - - if (optlen < 0) { - ND_PRINTK(2, warn, "Redirect: packet too short\n"); - return; - } - - icmph = icmp6_hdr(skb); - target = (const struct in6_addr *) (icmph + 1); - dest = target + 1; - - if (ipv6_addr_is_multicast(dest)) { - ND_PRINTK(2, warn, - "Redirect: destination address is multicast\n"); - return; - } - - if (ipv6_addr_equal(dest, target)) { - on_link = 1; - } else if (ipv6_addr_type(target) != - (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK(2, warn, - "Redirect: target address is not link-local unicast\n"); - return; - } - - in6_dev = __in6_dev_get(skb->dev); - if (!in6_dev) - return; - if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) - return; - - /* RFC2461 8.1: - * The IP source address of the Redirect MUST be the same as the current - * first-hop router for the specified ICMP Destination Address. - */ - - if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) { - ND_PRINTK(2, warn, "Redirect: invalid ND options\n"); - return; - } - if (ndopts.nd_opts_tgt_lladdr) { - lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, - skb->dev); - if (!lladdr) { - ND_PRINTK(2, warn, - "Redirect: invalid link-layer address length\n"); - return; - } - } - - neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1); - if (neigh) { - rt6_redirect(dest, &ipv6_hdr(skb)->daddr, - &ipv6_hdr(skb)->saddr, neigh, lladdr, - on_link); - neigh_release(neigh); - } + icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); } void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index b5c1dcb..ef0579d 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -332,6 +332,8 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, ip6_sk_update_pmtu(skb, sk, info); harderr = (np->pmtudisc == IPV6_PMTUDISC_DO); } + if (type == NDISC_REDIRECT) + ip6_sk_redirect(skb, sk); if (np->recverr) { u8 *payload = skb->data; if (!inet->hdrincl) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 563f12c..3151aab 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -79,6 +79,7 @@ static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu); +static void rt6_do_redirect(struct dst_entry *dst, struct sk_buff *skb); #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_add_route_info(struct net *net, @@ -174,6 +175,7 @@ static struct dst_ops ip6_dst_ops_template = { .negative_advice = ip6_negative_advice, .link_failure = ip6_link_failure, .update_pmtu = ip6_rt_update_pmtu, + .redirect = rt6_do_redirect, .local_out = __ip6_local_out, .neigh_lookup = ip6_neigh_lookup, }; @@ -189,6 +191,10 @@ static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) { } +static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sk_buff *skb) +{ +} + static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old) { @@ -203,6 +209,7 @@ static struct dst_ops ip6_dst_blackhole_ops = { .mtu = ip6_blackhole_mtu, .default_advmss = ip6_default_advmss, .update_pmtu = ip6_rt_blackhole_update_pmtu, + .redirect = ip6_rt_blackhole_redirect, .cow_metrics = ip6_rt_blackhole_cow_metrics, .neigh_lookup = ip6_neigh_lookup, }; @@ -1112,6 +1119,33 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); +void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) +{ + const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = oif; + fl6.flowi6_mark = mark; + fl6.flowi6_flags = 0; + fl6.daddr = iph->daddr; + fl6.saddr = iph->saddr; + fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK; + + dst = ip6_route_output(net, NULL, &fl6); + if (!dst->error) + rt6_do_redirect(dst, skb); + dst_release(dst); +} +EXPORT_SYMBOL_GPL(ip6_redirect); + +void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) +{ + ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark); +} +EXPORT_SYMBOL_GPL(ip6_sk_redirect); + static unsigned int ip6_default_advmss(const struct dst_entry *dst) { struct net_device *dev = dst->dev; @@ -1604,108 +1638,94 @@ static int ip6_route_del(struct fib6_config *cfg) return err; } -/* - * Handle redirects - */ -struct ip6rd_flowi { - struct flowi6 fl6; - struct in6_addr gateway; -}; - -static struct rt6_info *__ip6_route_redirect(struct net *net, - struct fib6_table *table, - struct flowi6 *fl6, - int flags) +static void rt6_do_redirect(struct dst_entry *dst, struct sk_buff *skb) { - struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *rt; - struct fib6_node *fn; + struct net *net = dev_net(skb->dev); + struct netevent_redirect netevent; + struct rt6_info *rt, *nrt = NULL; + const struct in6_addr *target; + struct ndisc_options ndopts; + const struct in6_addr *dest; + struct neighbour *old_neigh; + struct inet6_dev *in6_dev; + struct neighbour *neigh; + struct icmp6hdr *icmph; + int optlen, on_link; + u8 *lladdr; - /* - * Get the "current" route for this destination and - * check if the redirect has come from approriate router. - * - * RFC 2461 specifies that redirects should only be - * accepted if they come from the nexthop to the target. - * Due to the way the routes are chosen, this notion - * is a bit fuzzy and one might need to check all possible - * routes. - */ + optlen = skb->tail - skb->transport_header; + optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); - read_lock_bh(&table->tb6_lock); - fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); -restart: - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { - /* - * Current route is on-link; redirect is always invalid. - * - * Seems, previous statement is not true. It could - * be node, which looks for us as on-link (f.e. proxy ndisc) - * But then router serving it might decide, that we should - * know truth 8)8) --ANK (980726). - */ - if (rt6_check_expired(rt)) - continue; - if (!(rt->rt6i_flags & RTF_GATEWAY)) - continue; - if (fl6->flowi6_oif != rt->dst.dev->ifindex) - continue; - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) - continue; - break; + if (optlen < 0) { + net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); + return; } - if (!rt) - rt = net->ipv6.ip6_null_entry; - BACKTRACK(net, &fl6->saddr); -out: - dst_hold(&rt->dst); + icmph = icmp6_hdr(skb); + target = (const struct in6_addr *) (icmph + 1); + dest = target + 1; - read_unlock_bh(&table->tb6_lock); - - return rt; -}; - -static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest, - const struct in6_addr *src, - const struct in6_addr *gateway, - struct net_device *dev) -{ - int flags = RT6_LOOKUP_F_HAS_SADDR; - struct net *net = dev_net(dev); - struct ip6rd_flowi rdfl = { - .fl6 = { - .flowi6_oif = dev->ifindex, - .daddr = *dest, - .saddr = *src, - }, - }; + if (ipv6_addr_is_multicast(dest)) { + net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); + return; + } - rdfl.gateway = *gateway; + on_link = 0; + if (ipv6_addr_equal(dest, target)) { + on_link = 1; + } else if (ipv6_addr_type(target) != + (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { + net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); + return; + } - if (rt6_need_strict(dest)) - flags |= RT6_LOOKUP_F_IFACE; + in6_dev = __in6_dev_get(skb->dev); + if (!in6_dev) + return; + if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) + return; - return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6, - flags, __ip6_route_redirect); -} + /* RFC2461 8.1: + * The IP source address of the Redirect MUST be the same as the current + * first-hop router for the specified ICMP Destination Address. + */ -void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src, - const struct in6_addr *saddr, - struct neighbour *neigh, u8 *lladdr, int on_link) -{ - struct rt6_info *rt, *nrt = NULL; - struct netevent_redirect netevent; - struct net *net = dev_net(neigh->dev); - struct neighbour *old_neigh; + if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) { + net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); + return; + } - rt = ip6_route_redirect(dest, src, saddr, neigh->dev); + lladdr = NULL; + if (ndopts.nd_opts_tgt_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, + skb->dev); + if (!lladdr) { + net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); + return; + } + } + rt = (struct rt6_info *) dst; if (rt == net->ipv6.ip6_null_entry) { net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); - goto out; + return; } + /* Redirect received -> path was valid. + * Look, redirects are sent only in response to data packets, + * so that this nexthop apparently is reachable. --ANK + */ + dst_confirm(&rt->dst); + + neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1); + if (!neigh) + return; + + /* Duplicate redirect: silently ignore. */ + old_neigh = rt->n; + if (neigh == old_neigh) + goto out; + /* * We have finally decided to accept it. */ @@ -1717,18 +1737,6 @@ void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src, NEIGH_UPDATE_F_ISROUTER)) ); - /* - * Redirect received -> path was valid. - * Look, redirects are sent only in response to data packets, - * so that this nexthop apparently is reachable. --ANK - */ - dst_confirm(&rt->dst); - - /* Duplicate redirect: silently ignore. */ - old_neigh = rt->n; - if (neigh == old_neigh) - goto out; - nrt = ip6_rt_copy(rt, dest); if (!nrt) goto out; @@ -1751,12 +1759,12 @@ void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src, call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); if (rt->rt6i_flags & RTF_CACHE) { + rt = (struct rt6_info *) dst_clone(&rt->dst); ip6_del_rt(rt); - return; } out: - dst_release(&rt->dst); + neigh_release(neigh); } /* diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 49aea94..fbf1622 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -539,6 +539,8 @@ static int ipip6_err(struct sk_buff *skb, u32 info) if (code != ICMP_EXC_TTL) return 0; break; + case ICMP_REDIRECT: + break; } err = -ENOENT; @@ -557,6 +559,12 @@ static int ipip6_err(struct sk_buff *skb, u32 info) err = 0; goto out; } + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, + IPPROTO_IPV6, 0); + err = 0; + goto out; + } if (t->parms.iph.daddr == 0) goto out; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 61175cb..3071f37 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -363,6 +363,13 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, np = inet6_sk(sk); + if (type == NDISC_REDIRECT) { + struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst) + dst->ops->redirect(dst,skb); + } + if (type == ICMPV6_PKT_TOOBIG) { struct dst_entry *dst; @@ -1970,6 +1977,7 @@ struct proto tcpv6_prot = { .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, + .release_cb = tcp_release_cb, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1ecd102..99d0077 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -483,6 +483,8 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == ICMPV6_PKT_TOOBIG) ip6_sk_update_pmtu(skb, sk, info); + if (type == NDISC_REDIRECT) + ip6_sk_redirect(skb, sk); np = inet6_sk(sk); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index bb02038..f5a9cb8 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -215,6 +215,14 @@ static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) path->ops->update_pmtu(path, mtu); } +static void xfrm6_redirect(struct dst_entry *dst, struct sk_buff *skb) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->redirect(path, skb); +} + static void xfrm6_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; @@ -261,6 +269,7 @@ static struct dst_ops xfrm6_dst_ops = { .protocol = cpu_to_be16(ETH_P_IPV6), .gc = xfrm6_garbage_collect, .update_pmtu = xfrm6_update_pmtu, + .redirect = xfrm6_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, diff --git a/net/mac802154/mac802154.h b/net/mac802154/mac802154.h index 6967864..a4dcaf1 100644 --- a/net/mac802154/mac802154.h +++ b/net/mac802154/mac802154.h @@ -109,6 +109,7 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb, /* MIB callbacks */ void mac802154_dev_set_short_addr(struct net_device *dev, u16 val); +u16 mac802154_dev_get_short_addr(const struct net_device *dev); void mac802154_dev_set_ieee_addr(struct net_device *dev); u16 mac802154_dev_get_pan_id(const struct net_device *dev); void mac802154_dev_set_pan_id(struct net_device *dev, u16 val); diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c index 7f5403e..d8d2770 100644 --- a/net/mac802154/mac_cmd.c +++ b/net/mac802154/mac_cmd.c @@ -55,7 +55,7 @@ static int mac802154_mlme_start_req(struct net_device *dev, return 0; } -struct wpan_phy *mac802154_get_phy(const struct net_device *dev) +static struct wpan_phy *mac802154_get_phy(const struct net_device *dev) { struct mac802154_sub_if_data *priv = netdev_priv(dev); @@ -71,4 +71,6 @@ struct ieee802154_reduced_mlme_ops mac802154_mlme_reduced = { struct ieee802154_mlme_ops mac802154_mlme_wpan = { .get_phy = mac802154_get_phy, .start_req = mac802154_mlme_start_req, + .get_pan_id = mac802154_dev_get_pan_id, + .get_short_addr = mac802154_dev_get_short_addr, }; diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c index 380829d..f47781a 100644 --- a/net/mac802154/mib.c +++ b/net/mac802154/mib.c @@ -39,7 +39,7 @@ struct hw_addr_filt_notify_work { unsigned long changed; }; -struct mac802154_priv *mac802154_slave_get_priv(struct net_device *dev) +static struct mac802154_priv *mac802154_slave_get_priv(struct net_device *dev) { struct mac802154_sub_if_data *priv = netdev_priv(dev); @@ -100,6 +100,20 @@ void mac802154_dev_set_short_addr(struct net_device *dev, u16 val) } } +u16 mac802154_dev_get_short_addr(const struct net_device *dev) +{ + struct mac802154_sub_if_data *priv = netdev_priv(dev); + u16 ret; + + BUG_ON(dev->type != ARPHRD_IEEE802154); + + spin_lock_bh(&priv->mib_lock); + ret = priv->short_addr; + spin_unlock_bh(&priv->mib_lock); + + return ret; +} + void mac802154_dev_set_ieee_addr(struct net_device *dev) { struct mac802154_sub_if_data *priv = netdev_priv(dev); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 4a5d2bd..62fb51f 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -517,6 +517,16 @@ config NET_EMATCH_CANID To compile this code as a module, choose M here: the module will be called em_canid. +config NET_EMATCH_IPSET + tristate "IPset" + depends on NET_EMATCH && IP_SET + ---help--- + Say Y here if you want to be able to classify packets based on + ipset membership. + + To compile this code as a module, choose M here: the + module will be called em_ipset. + config NET_CLS_ACT bool "Actions" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index bcada75..978cbf0 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -56,3 +56,4 @@ obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o obj-$(CONFIG_NET_EMATCH_META) += em_meta.o obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o +obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c new file mode 100644 index 0000000..3130320 --- /dev/null +++ b/net/sched/em_ipset.c @@ -0,0 +1,135 @@ +/* + * net/sched/em_ipset.c ipset ematch + * + * Copyright (c) 2012 Florian Westphal <fw@strlen.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/gfp.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/netfilter/xt_set.h> +#include <linux/ipv6.h> +#include <net/ip.h> +#include <net/pkt_cls.h> + +static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, + struct tcf_ematch *em) +{ + struct xt_set_info *set = data; + ip_set_id_t index; + + if (data_len != sizeof(*set)) + return -EINVAL; + + index = ip_set_nfnl_get_byindex(set->index); + if (index == IPSET_INVALID_ID) + return -ENOENT; + + em->datalen = sizeof(*set); + em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL); + if (em->data) + return 0; + + ip_set_nfnl_put(index); + return -ENOMEM; +} + +static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em) +{ + const struct xt_set_info *set = (const void *) em->data; + if (set) { + ip_set_nfnl_put(set->index); + kfree((void *) em->data); + } +} + +static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct ip_set_adt_opt opt; + struct xt_action_param acpar; + const struct xt_set_info *set = (const void *) em->data; + struct net_device *dev, *indev = NULL; + int ret, network_offset; + + switch (skb->protocol) { + case htons(ETH_P_IP): + acpar.family = NFPROTO_IPV4; + if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + return 0; + acpar.thoff = ip_hdrlen(skb); + break; + case htons(ETH_P_IPV6): + acpar.family = NFPROTO_IPV6; + if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + /* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */ + acpar.thoff = sizeof(struct ipv6hdr); + break; + default: + return 0; + } + + acpar.hooknum = 0; + + opt.family = acpar.family; + opt.dim = set->dim; + opt.flags = set->flags; + opt.cmdflags = 0; + opt.timeout = ~0u; + + network_offset = skb_network_offset(skb); + skb_pull(skb, network_offset); + + dev = skb->dev; + + rcu_read_lock(); + + if (dev && skb->skb_iif) + indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif); + + acpar.in = indev ? indev : dev; + acpar.out = dev; + + ret = ip_set_test(set->index, skb, &acpar, &opt); + + rcu_read_unlock(); + + skb_push(skb, network_offset); + return ret; +} + +static struct tcf_ematch_ops em_ipset_ops = { + .kind = TCF_EM_IPSET, + .change = em_ipset_change, + .destroy = em_ipset_destroy, + .match = em_ipset_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_ipset_ops.link) +}; + +static int __init init_em_ipset(void) +{ + return tcf_em_register(&em_ipset_ops); +} + +static void __exit exit_em_ipset(void) +{ + tcf_em_unregister(&em_ipset_ops); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("TC extended match for IP sets"); + +module_init(init_em_ipset); +module_exit(exit_em_ipset); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPSET); diff --git a/net/sctp/input.c b/net/sctp/input.c index 80564fe..f050d45 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -423,6 +423,18 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } +void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t, + struct sk_buff *skb) +{ + struct dst_entry *dst; + + if (!t) + return; + dst = sctp_transport_dst_check(t); + if (dst) + dst->ops->redirect(dst, skb); +} + /* * SCTP Implementer's Guide, 2.37 ICMP handling procedures * @@ -628,6 +640,10 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) err = EHOSTUNREACH; break; + case ICMP_REDIRECT: + sctp_icmp_redirect(sk, transport, skb); + err = 0; + break; default: goto out_unlock; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 91f4791..ed7139e 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -185,6 +185,9 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out_unlock; } break; + case NDISC_REDIRECT: + sctp_icmp_redirect(sk, transport, skb); + break; default: break; } |