From 22ba0317c81ba263172baaefd2cb38de78c4598f Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 28 Apr 2008 18:38:49 +0300 Subject: udf: fs/udf/partition.c:udf_get_pblock() mustn't be inline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes the following build error with UML and gcc 4.3: <-- snip --> ... CC fs/udf/partition.o /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/udf/partition.c: In function ‘udf_get_pblock_virt15’: /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/udf/partition.c:32: sorry, unimplemented: inlining failed in call to ‘udf_get_pblock’: function body not available /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/udf/partition.c:102: sorry, unimplemented: called from here make[3]: *** [fs/udf/partition.o] Error 1 <-- snip --> Signed-off-by: Adrian Bunk Signed-off-by: Jan Kara diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 63610f0..96dfd20 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -27,8 +27,8 @@ #include #include -inline uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, - uint16_t partition, uint32_t offset) +uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; -- cgit v0.10.2 From fa3959f457109cc7d082b86ea6daae927982815b Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Thu, 24 Apr 2008 01:27:02 +0200 Subject: mv643xx_eth: get rid of static variables, allow multiple instances Move mv643xx_eth's static state (ethernet register block base address and MII management interface spinlock) into a struct hanging off the shared platform device. This is necessary to support chips that contain multiple mv643xx_eth silicon blocks. 
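As an illustration only (not part of the patch; device and variable names below are invented), board code now wires a port to the shared block roughly like this:

	#include <linux/platform_device.h>
	#include <linux/mv643xx_eth.h>

	/* hypothetical shared device covering the common register window */
	static struct platform_device example_eth_shared = {
		.name	= MV643XX_ETH_SHARED_NAME,
		.id	= 0,
		/* an IORESOURCE_MEM resource for the shared registers goes here */
	};

	/* each port's platform data must now point at the shared device */
	static struct mv643xx_eth_platform_data example_eth0_pd = {
		.shared		= &example_eth_shared,
		.port_number	= 0,
	};

The shared device is registered before the per-port devices (as the orion5x change below does), so that platform_get_drvdata() on ->shared returns valid state by the time a port probes.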
Signed-off-by: Lennert Buytenhek Acked-by: Nicolas Pitre Signed-off-by: Dale Farnsworth diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index 439c778..2bedf71 100644 --- a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -223,7 +223,9 @@ static struct platform_device orion5x_eth = { void __init orion5x_eth_init(struct mv643xx_eth_platform_data *eth_data) { + eth_data->shared = &orion5x_eth_shared; orion5x_eth.dev.platform_data = eth_data; + platform_device_register(&orion5x_eth_shared); platform_device_register(&orion5x_eth); } diff --git a/arch/powerpc/platforms/chrp/pegasos_eth.c b/arch/powerpc/platforms/chrp/pegasos_eth.c index 5bcc58d..130ff72 100644 --- a/arch/powerpc/platforms/chrp/pegasos_eth.c +++ b/arch/powerpc/platforms/chrp/pegasos_eth.c @@ -58,7 +58,9 @@ static struct resource mv643xx_eth0_resources[] = { static struct mv643xx_eth_platform_data eth0_pd = { + .shared = &mv643xx_eth_shared_device, .port_number = 0, + .tx_sram_addr = PEGASOS2_SRAM_BASE_ETH0, .tx_sram_size = PEGASOS2_SRAM_TXRING_SIZE, .tx_queue_size = PEGASOS2_SRAM_TXRING_SIZE/16, @@ -88,7 +90,9 @@ static struct resource mv643xx_eth1_resources[] = { }; static struct mv643xx_eth_platform_data eth1_pd = { + .shared = &mv643xx_eth_shared_device, .port_number = 1, + .tx_sram_addr = PEGASOS2_SRAM_BASE_ETH1, .tx_sram_size = PEGASOS2_SRAM_TXRING_SIZE, .tx_queue_size = PEGASOS2_SRAM_TXRING_SIZE/16, diff --git a/arch/powerpc/sysdev/mv64x60_dev.c b/arch/powerpc/sysdev/mv64x60_dev.c index 41af122..a132e0d 100644 --- a/arch/powerpc/sysdev/mv64x60_dev.c +++ b/arch/powerpc/sysdev/mv64x60_dev.c @@ -239,6 +239,8 @@ static int __init mv64x60_eth_device_setup(struct device_node *np, int id, memset(&pdata, 0, sizeof(pdata)); + pdata.shared = shared_pdev; + prop = of_get_property(np, "reg", NULL); if (!prop) return -ENODEV; diff --git a/arch/ppc/syslib/mv64x60.c b/arch/ppc/syslib/mv64x60.c index 90fe904..418f305 100644 --- a/arch/ppc/syslib/mv64x60.c +++ b/arch/ppc/syslib/mv64x60.c @@ -341,6 +341,7 @@ static struct resource mv64x60_eth0_resources[] = { }; static struct mv643xx_eth_platform_data eth0_pd = { + .shared = &mv64x60_eth_shared_device; .port_number = 0, }; @@ -366,6 +367,7 @@ static struct resource mv64x60_eth1_resources[] = { }; static struct mv643xx_eth_platform_data eth1_pd = { + .shared = &mv64x60_eth_shared_device; .port_number = 1, }; @@ -391,6 +393,7 @@ static struct resource mv64x60_eth2_resources[] = { }; static struct mv643xx_eth_platform_data eth2_pd = { + .shared = &mv64x60_eth_shared_device; .port_number = 2, }; diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index 381b36e..eebf0d2 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -507,7 +507,15 @@ struct mv643xx_mib_counters { u32 late_collision; }; +struct mv643xx_shared_private { + void __iomem *eth_base; + + /* used to protect SMI_REG, which is shared across ports */ + spinlock_t phy_lock; +}; + struct mv643xx_private { + struct mv643xx_shared_private *shared; int port_num; /* User Ethernet port number */ u32 rx_sram_addr; /* Base address of rx sram area */ @@ -614,19 +622,14 @@ static const struct ethtool_ops mv643xx_ethtool_ops; static char mv643xx_driver_name[] = "mv643xx_eth"; static char mv643xx_driver_version[] = "1.0"; -static void __iomem *mv643xx_eth_base; - -/* used to protect SMI_REG, which is shared across ports */ -static DEFINE_SPINLOCK(mv643xx_eth_phy_lock); - static inline u32 rdl(struct mv643xx_private *mp, int offset) { - return 
readl(mv643xx_eth_base + offset); + return readl(mp->shared->eth_base + offset); } static inline void wrl(struct mv643xx_private *mp, int offset, u32 data) { - writel(data, mv643xx_eth_base + offset); + writel(data, mp->shared->eth_base + offset); } /* @@ -1827,6 +1830,11 @@ static int mv643xx_eth_probe(struct platform_device *pdev) return -ENODEV; } + if (pd->shared == NULL) { + printk(KERN_ERR "No mv643xx_eth_platform_data->shared\n"); + return -ENODEV; + } + dev = alloc_etherdev(sizeof(struct mv643xx_private)); if (!dev) return -ENOMEM; @@ -1877,6 +1885,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev) spin_lock_init(&mp->lock); + mp->shared = platform_get_drvdata(pd->shared); port_num = mp->port_num = pd->port_number; /* set default config values */ @@ -1986,27 +1995,46 @@ static int mv643xx_eth_remove(struct platform_device *pdev) static int mv643xx_eth_shared_probe(struct platform_device *pdev) { static int mv643xx_version_printed = 0; + struct mv643xx_shared_private *msp; struct resource *res; + int ret; if (!mv643xx_version_printed++) printk(KERN_NOTICE "MV-643xx 10/100/1000 Ethernet Driver\n"); + ret = -EINVAL; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (res == NULL) - return -ENODEV; + goto out; - mv643xx_eth_base = ioremap(res->start, res->end - res->start + 1); - if (mv643xx_eth_base == NULL) - return -ENOMEM; + ret = -ENOMEM; + msp = kmalloc(sizeof(*msp), GFP_KERNEL); + if (msp == NULL) + goto out; + memset(msp, 0, sizeof(*msp)); + + msp->eth_base = ioremap(res->start, res->end - res->start + 1); + if (msp->eth_base == NULL) + goto out_free; + + spin_lock_init(&msp->phy_lock); + + platform_set_drvdata(pdev, msp); return 0; +out_free: + kfree(msp); +out: + return ret; } static int mv643xx_eth_shared_remove(struct platform_device *pdev) { - iounmap(mv643xx_eth_base); - mv643xx_eth_base = NULL; + struct mv643xx_shared_private *msp = platform_get_drvdata(pdev); + + iounmap(msp->eth_base); + kfree(msp); return 0; } @@ -2911,7 +2939,7 @@ static void eth_port_read_smi_reg(struct mv643xx_private *mp, int i; /* the SMI register is a shared resource */ - spin_lock_irqsave(&mv643xx_eth_phy_lock, flags); + spin_lock_irqsave(&mp->shared->phy_lock, flags); /* wait for the SMI register to become available */ for (i = 0; rdl(mp, SMI_REG) & ETH_SMI_BUSY; i++) { @@ -2936,7 +2964,7 @@ static void eth_port_read_smi_reg(struct mv643xx_private *mp, *value = rdl(mp, SMI_REG) & 0xffff; out: - spin_unlock_irqrestore(&mv643xx_eth_phy_lock, flags); + spin_unlock_irqrestore(&mp->shared->phy_lock, flags); } /* @@ -2969,7 +2997,7 @@ static void eth_port_write_smi_reg(struct mv643xx_private *mp, phy_addr = ethernet_phy_get(mp); /* the SMI register is a shared resource */ - spin_lock_irqsave(&mv643xx_eth_phy_lock, flags); + spin_lock_irqsave(&mp->shared->phy_lock, flags); /* wait for the SMI register to become available */ for (i = 0; rdl(mp, SMI_REG) & ETH_SMI_BUSY; i++) { @@ -2983,7 +3011,7 @@ static void eth_port_write_smi_reg(struct mv643xx_private *mp, wrl(mp, SMI_REG, (phy_addr << 16) | (phy_reg << 21) | ETH_SMI_OPCODE_WRITE | (value & 0xffff)); out: - spin_unlock_irqrestore(&mv643xx_eth_phy_lock, flags); + spin_unlock_irqrestore(&mp->shared->phy_lock, flags); } /* diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 30e11aa..2d59855 100644 --- a/include/linux/mv643xx_eth.h +++ b/include/linux/mv643xx_eth.h @@ -1,6 +1,7 @@ /* * MV-643XX ethernet platform device data definition file. 
*/ + #ifndef __LINUX_MV643XX_ETH_H #define __LINUX_MV643XX_ETH_H @@ -13,7 +14,9 @@ #define MV643XX_ETH_BASE_ADDR_ENABLE_REG 0x2290 struct mv643xx_eth_platform_data { + struct platform_device *shared; int port_number; + u16 force_phy_addr; /* force override if phy_addr == 0 */ u16 phy_addr; -- cgit v0.10.2 From f2ce825d2a89b30af14fa577298fecaab7bc9504 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Thu, 24 Apr 2008 01:27:17 +0200 Subject: mv643xx_eth: mbus decode window support Make it possible to pass mbus_dram_target_info to the mv643xx_eth driver via the platform data, and make the mv643xx_eth driver program the window registers based on this data if it is passed in. Signed-off-by: Lennert Buytenhek Reviewed-by: Tzachi Perelstein Acked-by: Russell King Signed-off-by: Dale Farnsworth diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index eebf0d2..aabf1c6 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -91,6 +91,11 @@ */ #define PHY_ADDR_REG 0x0000 #define SMI_REG 0x0004 +#define WINDOW_BASE(i) (0x0200 + ((i) << 3)) +#define WINDOW_SIZE(i) (0x0204 + ((i) << 3)) +#define WINDOW_REMAP_HIGH(i) (0x0280 + ((i) << 2)) +#define WINDOW_BAR_ENABLE 0x0290 +#define WINDOW_PROTECT(i) (0x0294 + ((i) << 4)) /* * Per-port registers. @@ -512,6 +517,8 @@ struct mv643xx_shared_private { /* used to protect SMI_REG, which is shared across ports */ spinlock_t phy_lock; + + u32 win_protect; }; struct mv643xx_private { @@ -1888,6 +1895,9 @@ static int mv643xx_eth_probe(struct platform_device *pdev) mp->shared = platform_get_drvdata(pd->shared); port_num = mp->port_num = pd->port_number; + if (mp->shared->win_protect) + wrl(mp, WINDOW_PROTECT(port_num), mp->shared->win_protect); + /* set default config values */ eth_port_uc_addr_get(mp, dev->dev_addr); mp->rx_ring_size = PORT_DEFAULT_RECEIVE_QUEUE_SIZE; @@ -1992,9 +2002,44 @@ static int mv643xx_eth_remove(struct platform_device *pdev) return 0; } +static void mv643xx_eth_conf_mbus_windows(struct mv643xx_shared_private *msp, + struct mbus_dram_target_info *dram) +{ + void __iomem *base = msp->eth_base; + u32 win_enable; + u32 win_protect; + int i; + + for (i = 0; i < 6; i++) { + writel(0, base + WINDOW_BASE(i)); + writel(0, base + WINDOW_SIZE(i)); + if (i < 4) + writel(0, base + WINDOW_REMAP_HIGH(i)); + } + + win_enable = 0x3f; + win_protect = 0; + + for (i = 0; i < dram->num_cs; i++) { + struct mbus_dram_window *cs = dram->cs + i; + + writel((cs->base & 0xffff0000) | + (cs->mbus_attr << 8) | + dram->mbus_dram_target_id, base + WINDOW_BASE(i)); + writel((cs->size - 1) & 0xffff0000, base + WINDOW_SIZE(i)); + + win_enable &= ~(1 << i); + win_protect |= 3 << (2 * i); + } + + writel(win_enable, base + WINDOW_BAR_ENABLE); + msp->win_protect = win_protect; +} + static int mv643xx_eth_shared_probe(struct platform_device *pdev) { static int mv643xx_version_printed = 0; + struct mv643xx_eth_shared_platform_data *pd = pdev->dev.platform_data; struct mv643xx_shared_private *msp; struct resource *res; int ret; @@ -2021,6 +2066,12 @@ static int mv643xx_eth_shared_probe(struct platform_device *pdev) platform_set_drvdata(pdev, msp); + /* + * (Re-)program MBUS remapping windows if we are asked to. 
+ */ + if (pd != NULL && pd->dram != NULL) + mv643xx_eth_conf_mbus_windows(msp, pd->dram); + return 0; out_free: diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 2d59855..4801b02 100644 --- a/include/linux/mv643xx_eth.h +++ b/include/linux/mv643xx_eth.h @@ -5,6 +5,8 @@ #ifndef __LINUX_MV643XX_ETH_H #define __LINUX_MV643XX_ETH_H +#include + #define MV643XX_ETH_SHARED_NAME "mv643xx_eth_shared" #define MV643XX_ETH_NAME "mv643xx_eth" #define MV643XX_ETH_SHARED_REGS 0x2000 @@ -13,6 +15,10 @@ #define MV643XX_ETH_SIZE_REG_4 0x2224 #define MV643XX_ETH_BASE_ADDR_ENABLE_REG 0x2290 +struct mv643xx_eth_shared_platform_data { + struct mbus_dram_target_info *dram; +}; + struct mv643xx_eth_platform_data { struct platform_device *shared; int port_number; -- cgit v0.10.2 From c416a41f99be190e1f558cb06f70ddd560ce8b4b Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Thu, 24 Apr 2008 01:27:32 +0200 Subject: mv643xx_eth: configurable t_clk Make t_clk configurable via platform device data (with the current hardcoded value, 133 MHz, being the default), as it varies across different chip families. Signed-off-by: Lennert Buytenhek Acked-by: Nicolas Pitre Signed-off-by: Dale Farnsworth diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index aabf1c6..8bd41e4 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -519,6 +519,8 @@ struct mv643xx_shared_private { spinlock_t phy_lock; u32 win_protect; + + unsigned int t_clk; }; struct mv643xx_private { @@ -1129,7 +1131,6 @@ static irqreturn_t mv643xx_eth_int_handler(int irq, void *dev_id) * * INPUT: * struct mv643xx_private *mp Ethernet port - * unsigned int t_clk t_clk of the MV-643xx chip in HZ units * unsigned int delay Delay in usec * * OUTPUT: @@ -1140,10 +1141,10 @@ static irqreturn_t mv643xx_eth_int_handler(int irq, void *dev_id) * */ static unsigned int eth_port_set_rx_coal(struct mv643xx_private *mp, - unsigned int t_clk, unsigned int delay) + unsigned int delay) { unsigned int port_num = mp->port_num; - unsigned int coal = ((t_clk / 1000000) * delay) / 64; + unsigned int coal = ((mp->shared->t_clk / 1000000) * delay) / 64; /* Set RX Coalescing mechanism */ wrl(mp, SDMA_CONFIG_REG(port_num), @@ -1168,7 +1169,6 @@ static unsigned int eth_port_set_rx_coal(struct mv643xx_private *mp, * * INPUT: * struct mv643xx_private *mp Ethernet port - * unsigned int t_clk t_clk of the MV-643xx chip in HZ units * unsigned int delay Delay in uSeconds * * OUTPUT: @@ -1179,9 +1179,9 @@ static unsigned int eth_port_set_rx_coal(struct mv643xx_private *mp, * */ static unsigned int eth_port_set_tx_coal(struct mv643xx_private *mp, - unsigned int t_clk, unsigned int delay) + unsigned int delay) { - unsigned int coal = ((t_clk / 1000000) * delay) / 64; + unsigned int coal = ((mp->shared->t_clk / 1000000) * delay) / 64; /* Set TX Coalescing mechanism */ wrl(mp, TX_FIFO_URGENT_THRESHOLD_REG(mp->port_num), coal << 4); @@ -1423,11 +1423,11 @@ static int mv643xx_eth_open(struct net_device *dev) #ifdef MV643XX_COAL mp->rx_int_coal = - eth_port_set_rx_coal(mp, 133000000, MV643XX_RX_COAL); + eth_port_set_rx_coal(mp, MV643XX_RX_COAL); #endif mp->tx_int_coal = - eth_port_set_tx_coal(mp, 133000000, MV643XX_TX_COAL); + eth_port_set_tx_coal(mp, MV643XX_TX_COAL); /* Unmask phy and link status changes interrupts */ wrl(mp, INTERRUPT_EXTEND_MASK_REG(port_num), ETH_INT_UNMASK_ALL_EXT); @@ -2063,6 +2063,7 @@ static int mv643xx_eth_shared_probe(struct platform_device *pdev) goto out_free; spin_lock_init(&msp->phy_lock); + msp->t_clk = 
(pd != NULL && pd->t_clk != 0) ? pd->t_clk : 133000000; platform_set_drvdata(pdev, msp); diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 4801b02..9f3a603 100644 --- a/include/linux/mv643xx_eth.h +++ b/include/linux/mv643xx_eth.h @@ -17,6 +17,7 @@ struct mv643xx_eth_shared_platform_data { struct mbus_dram_target_info *dram; + unsigned int t_clk; }; struct mv643xx_eth_platform_data { -- cgit v0.10.2 From 240e4419e0cfcba737883b637ec2bdcc071ea03d Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Thu, 24 Apr 2008 01:27:44 +0200 Subject: mv643xx_eth: shorten shared platform driver name Change the MV643XX_ETH_SHARED_NAME platform driver name to something shorter than 19 characters, so that we can register multiple (otherwise we end up with sysfs conflicts since all instances will map to "mv643xx_eth_shared." as there is a 20-char sysfs file name limit.) Signed-off-by: Lennert Buytenhek Acked-by: Nicolas Pitre Signed-off-by: Dale Farnsworth diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 9f3a603..66dc957 100644 --- a/include/linux/mv643xx_eth.h +++ b/include/linux/mv643xx_eth.h @@ -7,8 +7,8 @@ #include -#define MV643XX_ETH_SHARED_NAME "mv643xx_eth_shared" -#define MV643XX_ETH_NAME "mv643xx_eth" +#define MV643XX_ETH_SHARED_NAME "mv643xx_eth" +#define MV643XX_ETH_NAME "mv643xx_eth_port" #define MV643XX_ETH_SHARED_REGS 0x2000 #define MV643XX_ETH_SHARED_REGS_SIZE 0x2000 #define MV643XX_ETH_BAR_4 0x2220 -- cgit v0.10.2 From ce4e2e4558903ef92edf1ab4e09b0b338a09fd61 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Thu, 24 Apr 2008 01:29:59 +0200 Subject: mv643xx_eth: inter-mv643xx SMI port sharing There exist chips with up to four mv643xx_eth silicon blocks but only one external SMI (MII management) interface -- the SMI logic of the first block is shared by all the blocks. Handle this by allowing a per-port override of which mv643xx_eth_shared's SMI registers (and spinlock) to use. 
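To illustrate (names invented for this sketch, not taken from any board file), platform data for a port living on a secondary block would look roughly like:

	#include <linux/platform_device.h>
	#include <linux/mv643xx_eth.h>

	static struct platform_device example_eth0_shared;	/* block with the working SMI */
	static struct platform_device example_eth1_shared;	/* this port's own block */

	static struct mv643xx_eth_platform_data example_port_pd = {
		.shared		= &example_eth1_shared,
		.shared_smi	= &example_eth0_shared,	/* borrow block 0's SMI regs + lock */
		.port_number	= 0,
	};

When ->shared_smi is left NULL, the port falls back to its own ->shared block, so existing single-block platforms need no changes.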
Signed-off-by: Lennert Buytenhek Acked-by: Nicolas Pitre Signed-off-by: Dale Farnsworth diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index 8bd41e4..b7915cd 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -527,6 +527,8 @@ struct mv643xx_private { struct mv643xx_shared_private *shared; int port_num; /* User Ethernet port number */ + struct mv643xx_shared_private *shared_smi; + u32 rx_sram_addr; /* Base address of rx sram area */ u32 rx_sram_size; /* Size of rx sram area */ u32 tx_sram_addr; /* Base address of tx sram area */ @@ -1898,6 +1900,10 @@ static int mv643xx_eth_probe(struct platform_device *pdev) if (mp->shared->win_protect) wrl(mp, WINDOW_PROTECT(port_num), mp->shared->win_protect); + mp->shared_smi = mp->shared; + if (pd->shared_smi != NULL) + mp->shared_smi = platform_get_drvdata(pd->shared_smi); + /* set default config values */ eth_port_uc_addr_get(mp, dev->dev_addr); mp->rx_ring_size = PORT_DEFAULT_RECEIVE_QUEUE_SIZE; @@ -2986,15 +2992,16 @@ static void eth_port_reset(struct mv643xx_private *mp) static void eth_port_read_smi_reg(struct mv643xx_private *mp, unsigned int phy_reg, unsigned int *value) { + void __iomem *smi_reg = mp->shared_smi->eth_base + SMI_REG; int phy_addr = ethernet_phy_get(mp); unsigned long flags; int i; /* the SMI register is a shared resource */ - spin_lock_irqsave(&mp->shared->phy_lock, flags); + spin_lock_irqsave(&mp->shared_smi->phy_lock, flags); /* wait for the SMI register to become available */ - for (i = 0; rdl(mp, SMI_REG) & ETH_SMI_BUSY; i++) { + for (i = 0; readl(smi_reg) & ETH_SMI_BUSY; i++) { if (i == PHY_WAIT_ITERATIONS) { printk("%s: PHY busy timeout\n", mp->dev->name); goto out; @@ -3002,11 +3009,11 @@ static void eth_port_read_smi_reg(struct mv643xx_private *mp, udelay(PHY_WAIT_MICRO_SECONDS); } - wrl(mp, SMI_REG, - (phy_addr << 16) | (phy_reg << 21) | ETH_SMI_OPCODE_READ); + writel((phy_addr << 16) | (phy_reg << 21) | ETH_SMI_OPCODE_READ, + smi_reg); /* now wait for the data to be valid */ - for (i = 0; !(rdl(mp, SMI_REG) & ETH_SMI_READ_VALID); i++) { + for (i = 0; !(readl(smi_reg) & ETH_SMI_READ_VALID); i++) { if (i == PHY_WAIT_ITERATIONS) { printk("%s: PHY read timeout\n", mp->dev->name); goto out; @@ -3014,9 +3021,9 @@ static void eth_port_read_smi_reg(struct mv643xx_private *mp, udelay(PHY_WAIT_MICRO_SECONDS); } - *value = rdl(mp, SMI_REG) & 0xffff; + *value = readl(smi_reg) & 0xffff; out: - spin_unlock_irqrestore(&mp->shared->phy_lock, flags); + spin_unlock_irqrestore(&mp->shared_smi->phy_lock, flags); } /* @@ -3042,17 +3049,16 @@ out: static void eth_port_write_smi_reg(struct mv643xx_private *mp, unsigned int phy_reg, unsigned int value) { - int phy_addr; - int i; + void __iomem *smi_reg = mp->shared_smi->eth_base + SMI_REG; + int phy_addr = ethernet_phy_get(mp); unsigned long flags; - - phy_addr = ethernet_phy_get(mp); + int i; /* the SMI register is a shared resource */ - spin_lock_irqsave(&mp->shared->phy_lock, flags); + spin_lock_irqsave(&mp->shared_smi->phy_lock, flags); /* wait for the SMI register to become available */ - for (i = 0; rdl(mp, SMI_REG) & ETH_SMI_BUSY; i++) { + for (i = 0; readl(smi_reg) & ETH_SMI_BUSY; i++) { if (i == PHY_WAIT_ITERATIONS) { printk("%s: PHY busy timeout\n", mp->dev->name); goto out; @@ -3060,10 +3066,10 @@ static void eth_port_write_smi_reg(struct mv643xx_private *mp, udelay(PHY_WAIT_MICRO_SECONDS); } - wrl(mp, SMI_REG, (phy_addr << 16) | (phy_reg << 21) | - ETH_SMI_OPCODE_WRITE | (value & 0xffff)); + writel((phy_addr << 16) | (phy_reg << 21) | + 
ETH_SMI_OPCODE_WRITE | (value & 0xffff), smi_reg); out: - spin_unlock_irqrestore(&mp->shared->phy_lock, flags); + spin_unlock_irqrestore(&mp->shared_smi->phy_lock, flags); } /* diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 66dc957..a15cdd4 100644 --- a/include/linux/mv643xx_eth.h +++ b/include/linux/mv643xx_eth.h @@ -24,6 +24,8 @@ struct mv643xx_eth_platform_data { struct platform_device *shared; int port_number; + struct platform_device *shared_smi; + u16 force_phy_addr; /* force override if phy_addr == 0 */ u16 phy_addr; -- cgit v0.10.2 From 98db6f193c93e9b4729215af2c9101210e11d26c Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Tue, 29 Apr 2008 22:38:48 +0200 Subject: x86: fix section mismatch in pci_scan_bus Fix following section mismatch warning: WARNING: vmlinux.o(.text+0x275616): Section mismatch in reference from the function pci_scan_bus() to the function .devinit.text:pci_scan_bus_parented() The warning was seen with a CONFIG_DEBUG_SECTION_MISMATCH=y build. The inline function pci_scan_bus refer to functions annotated __devinit - so annotate it __devinit too. This revealed a few x86 specific functions that were only used from __init or __devinit context. So annotate these __devinit and the warning was killed. The added include in pci.h was not strictly required but added to avoid being dependent on indirect includes. Signed-off-by: Sam Ravnborg Signed-off-by: Jesse Barnes diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 2a4d751..88b5416 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -489,7 +489,7 @@ void pcibios_disable_device (struct pci_dev *dev) pcibios_disable_irq(dev); } -struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) +struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) { struct pci_bus *bus = NULL; struct pci_sysdata *sd; @@ -512,7 +512,7 @@ struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) return bus; } -struct pci_bus *pci_scan_bus_with_sysdata(int busno) +struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno) { return pci_scan_bus_on_node(busno, &pci_root_ops, -1); } diff --git a/include/linux/pci.h b/include/linux/pci.h index 96acd0d..a59517b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -44,6 +44,7 @@ #include #include +#include #include #include #include @@ -474,7 +475,7 @@ extern struct pci_bus *pci_find_bus(int domain, int busnr); void pci_bus_add_devices(struct pci_bus *bus); struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus, struct pci_ops *ops, void *sysdata); -static inline struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops, +static inline struct pci_bus * __devinit pci_scan_bus(int bus, struct pci_ops *ops, void *sysdata) { struct pci_bus *root_bus; -- cgit v0.10.2 From 70b9f7dc1435412ca2b89b13a8353bd9915a7189 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 28 Apr 2008 16:27:23 -0700 Subject: x86/pci: remove flag in pci_cfg_space_size_ext so let pci_cfg_space_size call it directly without flag. 
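A hypothetical caller (the function name below is illustrative; the real user is the fam10h fixup in the diff) now simply reads:

	#include <linux/pci.h>

	static void example_cfg_size_fixup(struct pci_dev *dev)
	{
		/* before this patch: pci_cfg_space_size_ext(dev, 0) */
		dev->cfg_size = pci_cfg_space_size_ext(dev);
	}

pci_cfg_space_size() keeps the PCI Express / PCI-X capability checks and then falls through to pci_cfg_space_size_ext(), which only probes offset 256.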
Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index b60b2ab..ff3a6a3 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -502,7 +502,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015, */ static void fam10h_pci_cfg_space_size(struct pci_dev *dev) { - dev->cfg_size = pci_cfg_space_size_ext(dev, 0); + dev->cfg_size = pci_cfg_space_size_ext(dev); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, fam10h_pci_cfg_space_size); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 4a55bf3..3706ce7 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -842,13 +842,25 @@ static void set_pcie_port_type(struct pci_dev *pdev) * reading the dword at 0x100 which must either be 0 or a valid extended * capability header. */ -int pci_cfg_space_size_ext(struct pci_dev *dev, unsigned check_exp_pcix) +int pci_cfg_space_size_ext(struct pci_dev *dev) { - int pos; u32 status; - if (!check_exp_pcix) - goto skip; + if (pci_read_config_dword(dev, 256, &status) != PCIBIOS_SUCCESSFUL) + goto fail; + if (status == 0xffffffff) + goto fail; + + return PCI_CFG_SPACE_EXP_SIZE; + + fail: + return PCI_CFG_SPACE_SIZE; +} + +int pci_cfg_space_size(struct pci_dev *dev) +{ + int pos; + u32 status; pos = pci_find_capability(dev, PCI_CAP_ID_EXP); if (!pos) { @@ -861,23 +873,12 @@ int pci_cfg_space_size_ext(struct pci_dev *dev, unsigned check_exp_pcix) goto fail; } - skip: - if (pci_read_config_dword(dev, 256, &status) != PCIBIOS_SUCCESSFUL) - goto fail; - if (status == 0xffffffff) - goto fail; - - return PCI_CFG_SPACE_EXP_SIZE; + return pci_cfg_space_size_ext(dev); fail: return PCI_CFG_SPACE_SIZE; } -int pci_cfg_space_size(struct pci_dev *dev) -{ - return pci_cfg_space_size_ext(dev, 1); -} - static void pci_release_bus_bridge_dev(struct device *dev) { kfree(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index a59517b..509159b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -667,7 +667,7 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *), void *userdata); -int pci_cfg_space_size_ext(struct pci_dev *dev, unsigned check_exp_pcix); +int pci_cfg_space_size_ext(struct pci_dev *dev); int pci_cfg_space_size(struct pci_dev *dev); unsigned char pci_bus_max_busnr(struct pci_bus *bus); -- cgit v0.10.2 From 3ae15e1623b9d32eb410c2a23d90e47b16e6acd0 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 30 Apr 2008 19:52:55 -0700 Subject: IB/mlx4: Fix off-by-one errors in calls to mlx4_ib_free_cq_buf() When I merged bbf8eed1 ("IB/mlx4: Add support for resizing CQs") I changed things around so that mlx4_ib_alloc_cq_buf() and mlx4_ib_free_cq_buf() were used everywhere they could be. However, I screwed up the number of entries passed into mlx4_ib_alloc_cq_buf() in a couple places -- the function bumps the number of entries internally, so the caller shouldn't add 1 as well. Passing a too-big value for the number of entries to mlx4_ib_free_cq_buf() can cause the cleanup to go off the end of an array and corrupt allocator state in interesting ways. 
Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 2f199c5..4521319 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -246,7 +246,7 @@ err_mtt: if (context) ib_umem_release(cq->umem); else - mlx4_ib_free_cq_buf(dev, &cq->buf, entries); + mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); err_db: if (!context) @@ -434,7 +434,7 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq) mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db); ib_umem_release(mcq->umem); } else { - mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe + 1); + mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe); mlx4_db_free(dev->dev, &mcq->db); } -- cgit v0.10.2 From 57ce41d1d18279cc90223f3deadca70c7de1cfca Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 30 Apr 2008 20:02:45 -0700 Subject: IB/ipoib: Fix transmit queue stalling forever Commit f56bcd80 ("IPoIB: Use separate CQ for UD send completions") introduced a bug where the transmit queue could get stopped and never woken up. The problem is that send completions are only polled at the end of the xmit function, so if the send queue fills up and the xmit path stops the queue, then there is no way for send completions to ever get polled, and so the transmit queue stays stopped forever. Fix this by arming the send CQ just before posting the last send request that fills the send queue. Then, when the completion event handler is called, drain the send CQ. Since it is possible that not enough send completions are in the CQ, verify that the the net queue has been woken up after draining the send CQ, and if not arm a timer and drain again at the timer function. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 9044f88..ca126fc 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -334,6 +334,7 @@ struct ipoib_dev_priv { #endif int hca_caps; struct ipoib_ethtool_st ethtool; + struct timer_list poll_timer; }; struct ipoib_ah { @@ -404,6 +405,7 @@ extern struct workqueue_struct *ipoib_workqueue; int ipoib_poll(struct napi_struct *napi, int budget); void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr); +void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr); struct ipoib_ah *ipoib_create_ah(struct net_device *dev, struct ib_pd *pd, struct ib_ah_attr *attr); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 97b815c..f429bce 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -461,6 +461,26 @@ void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) netif_rx_schedule(dev, &priv->napi); } +static void drain_tx_cq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned long flags; + + spin_lock_irqsave(&priv->tx_lock, flags); + while (poll_tx(priv)) + ; /* nothing */ + + if (netif_queue_stopped(dev)) + mod_timer(&priv->poll_timer, jiffies + 1); + + spin_unlock_irqrestore(&priv->tx_lock, flags); +} + +void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr) +{ + drain_tx_cq((struct net_device *)dev_ptr); +} + static inline int post_send(struct ipoib_dev_priv *priv, unsigned int wr_id, struct ib_ah *address, u32 qpn, @@ -555,12 +575,22 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, else priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + if (++priv->tx_outstanding == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring full, 
stopping kernel net queue\n"); + if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) + ipoib_warn(priv, "request notify on send CQ failed\n"); + netif_stop_queue(dev); + } + if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn, tx_req, phead, hlen))) { ipoib_warn(priv, "post_send failed\n"); ++dev->stats.tx_errors; + --priv->tx_outstanding; ipoib_dma_unmap_tx(priv->ca, tx_req); dev_kfree_skb_any(skb); + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); } else { dev->trans_start = jiffies; @@ -568,14 +598,11 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, ++priv->tx_head; skb_orphan(skb); - if (++priv->tx_outstanding == ipoib_sendq_size) { - ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); - netif_stop_queue(dev); - } } if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) - poll_tx(priv); + while (poll_tx(priv)) + ; /* nothing */ } static void __ipoib_reap_ah(struct net_device *dev) @@ -609,6 +636,11 @@ void ipoib_reap_ah(struct work_struct *work) round_jiffies_relative(HZ)); } +static void ipoib_ib_tx_timer_func(unsigned long ctx) +{ + drain_tx_cq((struct net_device *)ctx); +} + int ipoib_ib_dev_open(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -645,6 +677,10 @@ int ipoib_ib_dev_open(struct net_device *dev) queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, round_jiffies_relative(HZ)); + init_timer(&priv->poll_timer); + priv->poll_timer.function = ipoib_ib_tx_timer_func; + priv->poll_timer.data = (unsigned long)dev; + set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); return 0; @@ -810,6 +846,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush) ipoib_dbg(priv, "All sends and receives done.\n"); timeout: + del_timer_sync(&priv->poll_timer); qp_attr.qp_state = IB_QPS_RESET; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) ipoib_warn(priv, "Failed to modify QP to RESET state\n"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index c1e7ece..8766d29 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -187,7 +187,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) goto out_free_mr; } - priv->send_cq = ib_create_cq(priv->ca, NULL, NULL, dev, ipoib_sendq_size, 0); + priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL, + dev, ipoib_sendq_size, 0); if (IS_ERR(priv->send_cq)) { printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name); goto out_free_recv_cq; -- cgit v0.10.2 From c8286944b802c5ce4063ec3c334b38c6757a9434 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 2 May 2008 11:17:41 -0500 Subject: RDMA/cxgb3: QP flush fixes - Flush the QP only after the HW disables the connection. Currently we flush the QP when transitioning to CLOSING. This exposes a race condition where the HW can complete a RECV WR, for instance, -and- the SW can flush that same WR. - Only call CQ event handlers on flush IFF we actually flushed something. 
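In simplified form (locking and the hw-CQ flush omitted; the helper name here is invented, see the diff for the real __flush_qp() change), the second fix amounts to:

	static void example_flush_rq(struct iwch_qp *qhp, struct iwch_cq *rchp, int count)
	{
		int flushed;

		flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count);
		if (flushed)	/* only notify when CQEs were actually inserted */
			(*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
	}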
Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index ed2ee4b..5fd8506 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -359,9 +359,10 @@ static void insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq) cq->sw_wptr++; } -void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) +int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) { u32 ptr; + int flushed = 0; PDBG("%s wq %p cq %p\n", __func__, wq, cq); @@ -369,8 +370,11 @@ void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) PDBG("%s rq_rptr %u rq_wptr %u skip count %u\n", __func__, wq->rq_rptr, wq->rq_wptr, count); ptr = wq->rq_rptr + count; - while (ptr++ != wq->rq_wptr) + while (ptr++ != wq->rq_wptr) { insert_recv_cqe(wq, cq); + flushed++; + } + return flushed; } static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq, @@ -394,9 +398,10 @@ static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq, cq->sw_wptr++; } -void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) +int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) { __u32 ptr; + int flushed = 0; struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2); ptr = wq->sq_rptr + count; @@ -405,7 +410,9 @@ void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) insert_sq_cqe(wq, cq, sqp); sqp++; ptr++; + flushed++; } + return flushed; } /* diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h index 2bcff7f..69ab08e 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.h +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -173,8 +173,8 @@ u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp); void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid); int __init cxio_hal_init(void); void __exit cxio_hal_exit(void); -void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); -void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); +int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); +int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count); void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count); void cxio_flush_hw_cq(struct t3_cq *cq); diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index 9b4be88..79dbe5b 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -655,6 +655,7 @@ static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag) { struct iwch_cq *rchp, *schp; int count; + int flushed; rchp = get_chp(qhp->rhp, qhp->attr.rcq); schp = get_chp(qhp->rhp, qhp->attr.scq); @@ -669,20 +670,22 @@ static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag) spin_lock(&qhp->lock); cxio_flush_hw_cq(&rchp->cq); cxio_count_rcqes(&rchp->cq, &qhp->wq, &count); - cxio_flush_rq(&qhp->wq, &rchp->cq, count); + flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&rchp->lock, *flag); - (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + if (flushed) + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); /* locking heirarchy: cq lock first, then qp lock. 
*/ spin_lock_irqsave(&schp->lock, *flag); spin_lock(&qhp->lock); cxio_flush_hw_cq(&schp->cq); cxio_count_scqes(&schp->cq, &qhp->wq, &count); - cxio_flush_sq(&qhp->wq, &schp->cq, count); + flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&schp->lock, *flag); - (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); + if (flushed) + (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); /* deref */ if (atomic_dec_and_test(&qhp->refcnt)) @@ -880,7 +883,6 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, ep = qhp->ep; get_ep(&ep->com); } - flush_qp(qhp, &flag); break; case IWCH_QP_STATE_TERMINATE: qhp->attr.state = IWCH_QP_STATE_TERMINATE; @@ -911,6 +913,7 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, } switch (attrs->next_state) { case IWCH_QP_STATE_IDLE: + flush_qp(qhp, &flag); qhp->attr.state = IWCH_QP_STATE_IDLE; qhp->attr.llp_stream_handle = NULL; put_ep(&qhp->ep->com); -- cgit v0.10.2 From c4d49776e8f5bf2d900d2b6d4855c1670a535ac5 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 2 May 2008 10:57:09 -0700 Subject: RDMA/cxgb3: Silently ignore close reply after abort. Remove bad BUG_ON() that can trigger in correct operation from close_con_rpl(). It is possible to get a close_rpl message on a dead connection. The sequence is: - host refs ep for close exchange - host posts close_req - hw posts PEER_ABORT from incoming RST - host marks ep DEAD - host posts ABORT_RPL and releases ep resources - hw posts CLOSE_RPL - host derefs ep and ep freed. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index d44a6df..32e290e 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -1650,8 +1650,8 @@ static int close_con_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) release = 1; break; case ABORTING: - break; case DEAD: + break; default: BUG_ON(1); break; -- cgit v0.10.2 From 77a8d5741f3ee2c79554382179cca7b5893d6ae9 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 2 May 2008 10:57:09 -0700 Subject: RDMA/cxgb3: Bump up the MPA connection setup timeout. Testing on large clusters shows its way too short at 10 secs. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 32e290e..c325c44 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -67,10 +67,10 @@ int peer2peer = 0; module_param(peer2peer, int, 0644); MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)"); -static int ep_timeout_secs = 10; +static int ep_timeout_secs = 60; module_param(ep_timeout_secs, int, 0644); MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " - "in seconds (default=10)"); + "in seconds (default=60)"); static int mpa_rev = 1; module_param(mpa_rev, int, 0644); -- cgit v0.10.2 From 4346f65426cbceb64794b468e4af6f5632d58c5e Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 30 Apr 2008 23:04:37 +0200 Subject: hrtimer: remove duplicate helper function The helper function hrtimer_callback_running() is used in kernel/hrtimer.c as well as in the updated net/can/bcm.c which now supports hrtimers. Moving the helper function to hrtimer.h removes the duplicate definition in the C-files. 
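For example, a user such as net/can/bcm.c can now do the following instead of open-coding the HRTIMER_STATE_CALLBACK test (the caller name is hypothetical):

	#include <linux/hrtimer.h>

	static int example_timer_is_running(struct hrtimer *timer)
	{
		/* true while the hrtimer's callback function is executing */
		return hrtimer_callback_running(timer);
	}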
Signed-off-by: Oliver Hartkopp Cc: David Miller Signed-off-by: Thomas Gleixner diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 31a4d65..6d93dce 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -316,6 +316,15 @@ static inline int hrtimer_is_queued(struct hrtimer *timer) (HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_PENDING); } +/* + * Helper function to check, whether the timer is running the callback + * function + */ +static inline int hrtimer_callback_running(struct hrtimer *timer) +{ + return timer->state & HRTIMER_STATE_CALLBACK; +} + /* Forward a hrtimer so it expires after now: */ extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9af1d6a..421be5f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -154,15 +154,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) } /* - * Helper function to check, whether the timer is running the callback - * function - */ -static inline int hrtimer_callback_running(struct hrtimer *timer) -{ - return timer->state & HRTIMER_STATE_CALLBACK; -} - -/* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ diff --git a/net/can/bcm.c b/net/can/bcm.c index 74fd2d3..d9a3a9d 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -412,12 +412,6 @@ static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data) bcm_send_to_user(op, &head, data, 1); } -/* TODO: move to linux/hrtimer.h */ -static inline int hrtimer_callback_running(struct hrtimer *timer) -{ - return timer->state & HRTIMER_STATE_CALLBACK; -} - /* * bcm_rx_update_and_send - process a detected relevant receive content change * 1. update the last received data -- cgit v0.10.2 From 4359a023a8c3b247b348c310bf510b23f3c1ab64 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 2 May 2008 12:49:40 +0200 Subject: clocksource: Fix permissions for available_clocksource File permissions for /sys/devices/system/clocksource/clocksource0/available_clocksource are 600 which allows write access. But this is in fact a read only file. So change permissions to 400. Signed-off-by: Heiko Carstens Cc: John Stultz Cc: Andrew Morton Signed-off-by: Thomas Gleixner diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 73961f3..83221ed 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -474,7 +474,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf) static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, sysfs_override_clocksource); -static SYSDEV_ATTR(available_clocksource, 0600, +static SYSDEV_ATTR(available_clocksource, 0400, sysfs_show_available_clocksources, NULL); static struct sysdev_class clocksource_sysclass = { -- cgit v0.10.2 From 4f95f81a48623982879f4fa80c641933444afd18 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 3 May 2008 14:23:14 +0200 Subject: clocksource: allow read access to available/current_clocksource There is no harm, when users can read the info and we ask often enough during debugging for this kind of information. 
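With this change current_clocksource becomes 0644 (root-writable, world-readable) and available_clocksource 0444, matching the usual convention that an "available" list is purely informational. For reference, a read-only sysdev attribute just omits the write bits and passes NULL as the store callback, e.g. (names invented):

	#include <linux/kernel.h>
	#include <linux/sysdev.h>

	static ssize_t example_show(struct sys_device *dev, char *buf)
	{
		return sprintf(buf, "read-only information\n");
	}

	/* 0444: world-readable, never writable */
	static SYSDEV_ATTR(example_info, 0444, example_show, NULL);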
Signed-off-by: Heiko Carstens Cc: Andrew Morton Cc: John Stultz Signed-off-by: Thomas Gleixner diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 83221ed..dadde53 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -471,10 +471,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf) /* * Sysfs setup bits: */ -static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, +static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, sysfs_override_clocksource); -static SYSDEV_ATTR(available_clocksource, 0400, +static SYSDEV_ATTR(available_clocksource, 0444, sysfs_show_available_clocksources, NULL); static struct sysdev_class clocksource_sysclass = { -- cgit v0.10.2 From d35c7b0e54a596c5a8134d75999b7f391a9c6550 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 3 May 2008 15:10:37 -0400 Subject: unified (weak) sys_pipe implementation This replaces the duplicated arch-specific versions of "sys_pipe()" with one unified implementation. This removes almost 250 lines of duplicated code. It's marked __weak, so that *if* an architecture wants to override the default implementation it can do so by simply having its own replacement version, since many architectures use alternate calling conventions for the 'pipe()' system call for legacy reasons (ie traditional UNIX implementations often return the two file descriptors in registers) I still haven't changed the cris version even though Linus says the BKL isn't needed. The arch maintainer can easily do it if there are really no obstacles. Signed-off-by: Ulrich Drepper Signed-off-by: Linus Torvalds diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c index 9bd1870..0128687 100644 --- a/arch/arm/kernel/sys_arm.c +++ b/arch/arm/kernel/sys_arm.c @@ -34,23 +34,6 @@ extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr); -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way unix traditionally does this, though. - */ -asmlinkage int sys_pipe(unsigned long __user *fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - /* common code for old and new mmaps */ inline long do_mmap2( unsigned long addr, unsigned long len, diff --git a/arch/avr32/kernel/sys_avr32.c b/arch/avr32/kernel/sys_avr32.c index 8deb600..8e8911e 100644 --- a/arch/avr32/kernel/sys_avr32.c +++ b/arch/avr32/kernel/sys_avr32.c @@ -14,19 +14,6 @@ #include #include -asmlinkage int sys_pipe(unsigned long __user *filedes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(filedes, fd, sizeof(fd))) - error = -EFAULT; - } - return error; -} - asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, off_t offset) diff --git a/arch/blackfin/kernel/sys_bfin.c b/arch/blackfin/kernel/sys_bfin.c index efb7b25..fce49d7 100644 --- a/arch/blackfin/kernel/sys_bfin.c +++ b/arch/blackfin/kernel/sys_bfin.c @@ -45,23 +45,6 @@ #include #include -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way unix traditionally does this, though. 
- */ -asmlinkage int sys_pipe(unsigned long __user *fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2 * sizeof(int))) - error = -EFAULT; - } - return error; -} - /* common code for old and new mmaps */ static inline long do_mmap2(unsigned long addr, unsigned long len, diff --git a/arch/frv/kernel/sys_frv.c b/arch/frv/kernel/sys_frv.c index 04c6b16..49b2cf2 100644 --- a/arch/frv/kernel/sys_frv.c +++ b/arch/frv/kernel/sys_frv.c @@ -28,23 +28,6 @@ #include #include -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way unix traditionally does this, though. - */ -asmlinkage long sys_pipe(unsigned long __user * fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) diff --git a/arch/h8300/kernel/sys_h8300.c b/arch/h8300/kernel/sys_h8300.c index 00608be6..2745656 100644 --- a/arch/h8300/kernel/sys_h8300.c +++ b/arch/h8300/kernel/sys_h8300.c @@ -27,23 +27,6 @@ #include #include -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way unix traditionally does this, though. - */ -asmlinkage int sys_pipe(unsigned long * fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - /* common code for old and new mmaps */ static inline long do_mmap2( unsigned long addr, unsigned long len, diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index e892f17..7f54efa 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -30,23 +30,6 @@ #include #include -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way unix traditionally does this, though. - */ -asmlinkage int sys_pipe(unsigned long __user * fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - /* common code for old and new mmaps */ static inline long do_mmap2( unsigned long addr, unsigned long len, diff --git a/arch/m68knommu/kernel/sys_m68k.c b/arch/m68knommu/kernel/sys_m68k.c index 65f7a95..7002816 100644 --- a/arch/m68knommu/kernel/sys_m68k.c +++ b/arch/m68knommu/kernel/sys_m68k.c @@ -28,23 +28,6 @@ #include #include -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way unix traditionally does this, though. - */ -asmlinkage int sys_pipe(unsigned long * fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - /* common code for old and new mmaps */ static inline long do_mmap2( unsigned long addr, unsigned long len, diff --git a/arch/mn10300/kernel/sys_mn10300.c b/arch/mn10300/kernel/sys_mn10300.c index 5f17a1e..bca5a84 100644 --- a/arch/mn10300/kernel/sys_mn10300.c +++ b/arch/mn10300/kernel/sys_mn10300.c @@ -29,23 +29,6 @@ #define MIN_MAP_ADDR PAGE_SIZE /* minimum fixed mmap address */ /* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way Unix traditionally does this, though. 
- */ -asmlinkage long sys_pipe(unsigned long __user *fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2 * sizeof(int))) - error = -EFAULT; - } - return error; -} - -/* * memory mapping syscall */ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index 4f58921..71b3195 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -33,19 +33,6 @@ #include #include -int sys_pipe(int __user *fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - static unsigned long get_unshared_area(unsigned long addr, unsigned long len) { struct vm_area_struct *vma; diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index e722a4e..4fe69ca 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -136,23 +136,6 @@ int sys_ipc(uint call, int first, unsigned long second, long third, return ret; } -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way unix traditionally does this, though. - */ -int sys_pipe(int __user *fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - static inline unsigned long do_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off, int shift) diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c index 988d0d6..5fdb799 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/sys_s390.c @@ -32,23 +32,6 @@ #include #include "entry.h" -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way Unix traditionally does this, though. - */ -asmlinkage long sys_pipe(unsigned long __user *fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - /* common code for old and new mmaps */ static inline long do_mmap2( unsigned long addr, unsigned long len, diff --git a/arch/sh/kernel/sys_sh64.c b/arch/sh/kernel/sys_sh64.c index 578004d..91fb844 100644 --- a/arch/sh/kernel/sys_sh64.c +++ b/arch/sh/kernel/sys_sh64.c @@ -31,23 +31,6 @@ #include /* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way Unix traditionally does this, though. - */ -asmlinkage int sys_pipe(unsigned long * fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - -/* * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c index 9cffc62..128ee85 100644 --- a/arch/um/kernel/syscall.c +++ b/arch/um/kernel/syscall.c @@ -73,23 +73,6 @@ long old_mmap(unsigned long addr, unsigned long len, out: return err; } -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way unix traditionally does this, though. 
- */ -long sys_pipe(unsigned long __user * fildes) -{ - int fd[2]; - long error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, sizeof(fd))) - error = -EFAULT; - } - return error; -} - long sys_uname(struct old_utsname __user * name) { diff --git a/arch/v850/kernel/syscalls.c b/arch/v850/kernel/syscalls.c index 003db9c..1a83daf 100644 --- a/arch/v850/kernel/syscalls.c +++ b/arch/v850/kernel/syscalls.c @@ -132,23 +132,6 @@ sys_ipc (uint call, int first, int second, int third, void *ptr, long fifth) return ret; } -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way unix traditionally does this, though. - */ -int sys_pipe (int *fildes) -{ - int fd[2]; - int error; - - error = do_pipe (fd); - if (!error) { - if (copy_to_user (fildes, fd, 2*sizeof (int))) - error = -EFAULT; - } - return error; -} - static inline unsigned long do_mmap2 (unsigned long addr, size_t len, unsigned long prot, unsigned long flags, diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index a86d26f..d2ab52c 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -22,23 +22,6 @@ #include #include -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way Unix traditionally does this, though. - */ -asmlinkage int sys_pipe(unsigned long __user * fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index bd802a5..3b360ef 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -17,23 +17,6 @@ #include #include -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way Unix traditionally does this, though. - */ -asmlinkage long sys_pipe(int __user *fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off) { diff --git a/fs/pipe.c b/fs/pipe.c index f73492b..3499f9f 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1076,6 +1076,23 @@ int do_pipe(int *fd) } /* + * sys_pipe() is the normal C calling standard for creating + * a pipe. It's not the way Unix traditionally does this, though. + */ +asmlinkage long __weak sys_pipe(int __user *fildes) +{ + int fd[2]; + int error; + + error = do_pipe(fd); + if (!error) { + if (copy_to_user(fildes, fd, sizeof(fd))) + error = -EFAULT; + } + return error; +} + +/* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need * any operations on the root directory. 
However, we need a non-trivial diff --git a/include/asm-powerpc/syscalls.h b/include/asm-powerpc/syscalls.h index b3ca41f..2b8a458 100644 --- a/include/asm-powerpc/syscalls.h +++ b/include/asm-powerpc/syscalls.h @@ -30,7 +30,7 @@ asmlinkage int sys_fork(unsigned long p1, unsigned long p2, asmlinkage int sys_vfork(unsigned long p1, unsigned long p2, unsigned long p3, unsigned long p4, unsigned long p5, unsigned long p6, struct pt_regs *regs); -asmlinkage int sys_pipe(int __user *fildes); +asmlinkage long sys_pipe(int __user *fildes); asmlinkage long sys_rt_sigaction(int sig, const struct sigaction __user *act, struct sigaction __user *oact, size_t sigsetsize); -- cgit v0.10.2 From 269f21344b23e552c21c9e2d7ca258479dcd7a0a Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 3 May 2008 15:28:45 -0400 Subject: tiny mq_open optimization A very small cleanup for mq_open. We do not have to call set_close_on_exit if we create the file descriptor right away with the flag set. We have a function for this now. The resulting code is smaller and a tiny bit faster. Signed-off-by: Ulrich Drepper Signed-off-by: Linus Torvalds diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 94fd3b0..b3b69fd 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -673,7 +673,7 @@ asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode, if (IS_ERR(name = getname(u_name))) return PTR_ERR(name); - fd = get_unused_fd(); + fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) goto out_putname; @@ -709,7 +709,6 @@ asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode, goto out_putfd; } - set_close_on_exec(fd, 1); fd_install(fd, filp); goto out_upsem; -- cgit v0.10.2 From 3ba08b00e0d8413d79be9cab8ec085ceb6ae6fd6 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Sat, 3 May 2008 20:46:29 -0700 Subject: sch_htb: remove from event queue in htb_parent_to_leaf() There is lack of removing a class from the event queue while changing from parent to leaf which can cause corruption of this rb tree. This patch fixes a bug introduced by my patch: "sch_htb: turn intermediate classes into leaves" commit: 160d5e10f87b1dc88fd9b84b31b1718e0fd76398. Many thanks to Jan 'yanek' Bortl for finding a way to reproduce this rare bug and narrowing the test case, which made possible proper diagnosing. This patch is recommended for all kernels starting from 2.6.20. Reported-and-tested-by: Jan 'yanek' Bortl Signed-off-by: Jarek Poplawski Signed-off-by: David S. 
Miller diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 66148cc..5bc1ed4 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1197,12 +1197,16 @@ static inline int htb_parent_last_child(struct htb_class *cl) return 1; } -static void htb_parent_to_leaf(struct htb_class *cl, struct Qdisc *new_q) +static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, + struct Qdisc *new_q) { struct htb_class *parent = cl->parent; BUG_TRAP(!cl->level && cl->un.leaf.q && !cl->prio_activity); + if (parent->cmode != HTB_CAN_SEND) + htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level); + parent->level = 0; memset(&parent->un.inner, 0, sizeof(parent->un.inner)); INIT_LIST_HEAD(&parent->un.leaf.drop_list); @@ -1300,7 +1304,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) htb_deactivate(q, cl); if (last_child) - htb_parent_to_leaf(cl, new_q); + htb_parent_to_leaf(q, cl, new_q); if (--cl->refcnt == 0) htb_destroy_class(sch, cl); -- cgit v0.10.2 From 5b81d689e57a85b1ff60edfb65e5b8a12d28ecee Mon Sep 17 00:00:00 2001 From: Robert Reif Date: Sat, 3 May 2008 20:55:27 -0700 Subject: sparc: tcx.c remove unnecessary function From: Robert Reif Replaced tcx_init_one with tcx_probe. Fixed some checkpatch problems. Signed-off-by: David S. Miller diff --git a/drivers/video/tcx.c b/drivers/video/tcx.c index a717743..44e8c27 100644 --- a/drivers/video/tcx.c +++ b/drivers/video/tcx.c @@ -84,7 +84,7 @@ struct tcx_tec { struct tcx_thc { u32 thc_rev; - u32 thc_pad0[511]; + u32 thc_pad0[511]; u32 thc_hs; /* hsync timing */ u32 thc_hsdvs; u32 thc_hd; @@ -126,10 +126,10 @@ struct tcx_par { }; /* Reset control plane so that WID is 8-bit plane. */ -static void __tcx_set_control_plane (struct tcx_par *par) +static void __tcx_set_control_plane(struct tcx_par *par) { u32 __iomem *p, *pend; - + if (par->lowdepth) return; @@ -143,8 +143,8 @@ static void __tcx_set_control_plane (struct tcx_par *par) sbus_writel(tmp, p); } } - -static void tcx_reset (struct fb_info *info) + +static void tcx_reset(struct fb_info *info) { struct tcx_par *par = (struct tcx_par *) info->par; unsigned long flags; @@ -365,7 +365,8 @@ static void tcx_unmap_regs(struct of_device *op, struct fb_info *info, info->screen_base, par->fbsize); } -static int __devinit tcx_init_one(struct of_device *op) +static int __devinit tcx_probe(struct of_device *op, + const struct of_device_id *match) { struct device_node *dp = op->node; struct fb_info *info; @@ -488,13 +489,6 @@ out_err: return err; } -static int __devinit tcx_probe(struct of_device *dev, const struct of_device_id *match) -{ - struct of_device *op = to_of_device(&dev->dev); - - return tcx_init_one(op); -} - static int __devexit tcx_remove(struct of_device *op) { struct fb_info *info = dev_get_drvdata(&op->dev); -- cgit v0.10.2 From c8005785102e5b67ecf213f06a3d6c001f6f8cb4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 3 May 2008 20:56:42 -0700 Subject: net: Fix useless comment reference loop. include/linux/skbuff.h says: /* These elements must be at the end, see alloc_skb() for details. */ net/core/skbuff.c says: * See comment in sk_buff definition, just before the 'tail' member This patch contains my guess as to the actual reason rather than a dead comment reference loop. Signed-off-by: Johannes Berg Signed-off-by: David S. 
Miller diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4fe605f..5c459f2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -200,7 +200,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, goto nodata; /* - * See comment in sk_buff definition, just before the 'tail' member + * Only clear those fields we need to clear, not those that we will + * actually initialise below. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! */ memset(skb, 0, offsetof(struct sk_buff, tail)); skb->truesize = size + sizeof(struct sk_buff); -- cgit v0.10.2 From 81d6ec6b36bdf30e283ab98e7646571484401dbb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 3 May 2008 21:00:55 -0700 Subject: Revert "[SPARC64]: Wrap SMP IPIs with irq_enter()/irq_exit()." This reverts commit 2664ef44cf5053d2b7dff01cecac70bc601a5f68. Ingo moved around where the softlockup dependency sits so this change is no longer necessary. Signed-off-by: David S. Miller diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 3aba476..0d6403a 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -865,21 +865,14 @@ void smp_call_function_client(int irq, struct pt_regs *regs) void *info = call_data->info; clear_softint(1 << irq); - - irq_enter(); - - if (!call_data->wait) { - /* let initiator proceed after getting data */ - atomic_inc(&call_data->finished); - } - - func(info); - - irq_exit(); - if (call_data->wait) { /* let initiator proceed only after completion */ + func(info); atomic_inc(&call_data->finished); + } else { + /* let initiator proceed after getting data */ + atomic_inc(&call_data->finished); + func(info); } } @@ -1041,9 +1034,7 @@ void smp_receive_signal(int cpu) void smp_receive_signal_client(int irq, struct pt_regs *regs) { - irq_enter(); clear_softint(1 << irq); - irq_exit(); } void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs) @@ -1051,8 +1042,6 @@ void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs) struct mm_struct *mm; unsigned long flags; - irq_enter(); - clear_softint(1 << irq); /* See if we need to allocate a new TLB context because @@ -1072,8 +1061,6 @@ void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs) load_secondary_context(mm); __flush_tlb_mm(CTX_HWBITS(mm->context), SECONDARY_CONTEXT); - - irq_exit(); } void smp_new_mmu_context_version(void) @@ -1239,8 +1226,6 @@ void smp_penguin_jailcell(int irq, struct pt_regs *regs) { clear_softint(1 << irq); - irq_enter(); - preempt_disable(); __asm__ __volatile__("flushw"); @@ -1253,8 +1238,6 @@ void smp_penguin_jailcell(int irq, struct pt_regs *regs) prom_world(0); preempt_enable(); - - irq_exit(); } /* /proc/profile writes can call this, don't __init it please. 
*/ -- cgit v0.10.2 From 41fef0ee7b8f3fe3f3dd2ddc9b170f3d88bce595 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sat, 3 May 2008 21:03:01 -0700 Subject: xfrm: convert empty xfrm_audit_* macros to functions it removes these warnings when CONFIG_AUDITSYSCALL is unset: net/xfrm/xfrm_user.c: In function 'xfrm_add_sa': net/xfrm/xfrm_user.c:412: warning: unused variable 'sid' net/xfrm/xfrm_user.c:411: warning: unused variable 'sessionid' net/xfrm/xfrm_user.c:410: warning: unused variable 'loginuid' net/xfrm/xfrm_user.c: In function 'xfrm_del_sa': net/xfrm/xfrm_user.c:485: warning: unused variable 'sid' net/xfrm/xfrm_user.c:484: warning: unused variable 'sessionid' net/xfrm/xfrm_user.c:483: warning: unused variable 'loginuid' net/xfrm/xfrm_user.c: In function 'xfrm_add_policy': net/xfrm/xfrm_user.c:1132: warning: unused variable 'sid' net/xfrm/xfrm_user.c:1131: warning: unused variable 'sessionid' net/xfrm/xfrm_user.c:1130: warning: unused variable 'loginuid' net/xfrm/xfrm_user.c: In function 'xfrm_get_policy': net/xfrm/xfrm_user.c:1382: warning: unused variable 'sid' net/xfrm/xfrm_user.c:1381: warning: unused variable 'sessionid' net/xfrm/xfrm_user.c:1380: warning: unused variable 'loginuid' net/xfrm/xfrm_user.c: In function 'xfrm_add_pol_expire': net/xfrm/xfrm_user.c:1620: warning: unused variable 'sid' net/xfrm/xfrm_user.c:1619: warning: unused variable 'sessionid' net/xfrm/xfrm_user.c:1618: warning: unused variable 'loginuid' net/xfrm/xfrm_user.c: In function 'xfrm_add_sa_expire': net/xfrm/xfrm_user.c:1658: warning: unused variable 'sid' net/xfrm/xfrm_user.c:1657: warning: unused variable 'sessionid' net/xfrm/xfrm_user.c:1656: warning: unused variable 'loginuid' Signed-off-by: Marcin Slusarz Signed-off-by: David S. Miller diff --git a/include/net/xfrm.h b/include/net/xfrm.h index d1350bc..2933d747 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -648,14 +648,46 @@ extern void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, extern void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb, u8 proto); #else -#define xfrm_audit_policy_add(x, r, a, se, s) do { ; } while (0) -#define xfrm_audit_policy_delete(x, r, a, se, s) do { ; } while (0) -#define xfrm_audit_state_add(x, r, a, se, s) do { ; } while (0) -#define xfrm_audit_state_delete(x, r, a, se, s) do { ; } while (0) -#define xfrm_audit_state_replay_overflow(x, s) do { ; } while (0) -#define xfrm_audit_state_notfound_simple(s, f) do { ; } while (0) -#define xfrm_audit_state_notfound(s, f, sp, sq) do { ; } while (0) -#define xfrm_audit_state_icvfail(x, s, p) do { ; } while (0) + +static inline void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, + u32 auid, u32 ses, u32 secid) +{ +} + +static inline void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, + u32 auid, u32 ses, u32 secid) +{ +} + +static inline void xfrm_audit_state_add(struct xfrm_state *x, int result, + u32 auid, u32 ses, u32 secid) +{ +} + +static inline void xfrm_audit_state_delete(struct xfrm_state *x, int result, + u32 auid, u32 ses, u32 secid) +{ +} + +static inline void xfrm_audit_state_replay_overflow(struct xfrm_state *x, + struct sk_buff *skb) +{ +} + +static inline void xfrm_audit_state_notfound_simple(struct sk_buff *skb, + u16 family) +{ +} + +static inline void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, + __be32 net_spi, __be32 net_seq) +{ +} + +static inline void xfrm_audit_state_icvfail(struct xfrm_state *x, + struct sk_buff *skb, u8 proto) +{ +} #endif /* CONFIG_AUDITSYSCALL */ static 
inline void xfrm_pol_hold(struct xfrm_policy *policy) -- cgit v0.10.2 From e544ff00da4b53069dbca3debbfb02d455f72467 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 3 May 2008 21:10:58 -0700 Subject: lapbeth: Release ->ethdev when unregistering device. Otherwise it leaks forever. Based upon a report by Roland Signed-off-by: David S. Miller diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c index b5860b9..24fd613 100644 --- a/drivers/net/wan/lapbether.c +++ b/drivers/net/wan/lapbether.c @@ -459,6 +459,7 @@ static void __exit lapbeth_cleanup_driver(void) list_for_each_safe(entry, tmp, &lapbeth_devices) { lapbeth = list_entry(entry, struct lapbethdev, node); + dev_put(lapbeth->ethdev); unregister_netdevice(lapbeth->axdev); } rtnl_unlock(); -- cgit v0.10.2 From 59f7137a1369c25308672def38f3b126d0c7575a Mon Sep 17 00:00:00 2001 From: Robert Reif Date: Sat, 3 May 2008 21:12:00 -0700 Subject: sparc video: make blank use proper constant Make blank functions use proper constant for unblanking. Signed-off-by: Robert Reif Signed-off-by: David S. Miller diff --git a/drivers/video/bw2.c b/drivers/video/bw2.c index 275d9da..79f85dc 100644 --- a/drivers/video/bw2.c +++ b/drivers/video/bw2.c @@ -329,7 +329,7 @@ static int __devinit bw2_probe(struct of_device *op, const struct of_device_id * if (!info->screen_base) goto out_unmap_regs; - bw2_blank(0, info); + bw2_blank(FB_BLANK_UNBLANK, info); bw2_init_fix(info, linebytes); diff --git a/drivers/video/cg3.c b/drivers/video/cg3.c index 010ea53..e31e26a 100644 --- a/drivers/video/cg3.c +++ b/drivers/video/cg3.c @@ -398,7 +398,7 @@ static int __devinit cg3_probe(struct of_device *op, if (!info->screen_base) goto out_unmap_regs; - cg3_blank(0, info); + cg3_blank(FB_BLANK_UNBLANK, info); if (!of_find_property(dp, "width", NULL)) { err = cg3_do_default_mode(par); diff --git a/drivers/video/cg6.c b/drivers/video/cg6.c index fc90db6..8000bcc 100644 --- a/drivers/video/cg6.c +++ b/drivers/video/cg6.c @@ -767,7 +767,7 @@ static int __devinit cg6_probe(struct of_device *op, cg6_bt_init(par); cg6_chip_init(info); - cg6_blank(0, info); + cg6_blank(FB_BLANK_UNBLANK, info); if (fb_alloc_cmap(&info->cmap, 256, 0)) goto out_unmap_regs; diff --git a/drivers/video/ffb.c b/drivers/video/ffb.c index 93dca3e..0f42a69 100644 --- a/drivers/video/ffb.c +++ b/drivers/video/ffb.c @@ -987,7 +987,7 @@ static int __devinit ffb_probe(struct of_device *op, * chosen console, it will have video outputs off in * the DAC. 
*/ - ffb_blank(0, info); + ffb_blank(FB_BLANK_UNBLANK, info); if (fb_alloc_cmap(&info->cmap, 256, 0)) goto out_unmap_dac; diff --git a/drivers/video/leo.c b/drivers/video/leo.c index f3160fc..fb12992 100644 --- a/drivers/video/leo.c +++ b/drivers/video/leo.c @@ -601,7 +601,7 @@ static int __devinit leo_probe(struct of_device *op, const struct of_device_id * leo_init_wids(info); leo_init_hw(info); - leo_blank(0, info); + leo_blank(FB_BLANK_UNBLANK, info); if (fb_alloc_cmap(&info->cmap, 256, 0)) goto out_unmap_regs; diff --git a/drivers/video/p9100.c b/drivers/video/p9100.c index c95874f..676ffb0 100644 --- a/drivers/video/p9100.c +++ b/drivers/video/p9100.c @@ -295,7 +295,7 @@ static int __devinit p9100_probe(struct of_device *op, const struct of_device_id if (!info->screen_base) goto out_unmap_regs; - p9100_blank(0, info); + p9100_blank(FB_BLANK_UNBLANK, info); if (fb_alloc_cmap(&info->cmap, 256, 0)) goto out_unmap_screen; -- cgit v0.10.2 From ac551828993eecb8499ef9cc3c828fceb49bcf7a Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 2 May 2008 20:37:21 +0200 Subject: modpost: i2c aliases need no trailing wildcard Not all device types need a wildcard at the end of their module aliases. In particular, for i2c module aliases, the trailing wildcard is not only unneeded, it could also cause the wrong driver to be loaded. As I2C devices have no IDs, i2c module aliases are simple, arbitrary device names. For example: $ /sbin/modinfo lm90 filename: /lib/modules/2.6.25-git18/kernel/drivers/hwmon/lm90.ko author: Jean Delvare description: LM90/ADM1032 driver license: GPL vermagic: 2.6.25-git18 mod_unload depends: hwmon alias: i2c:lm90* alias: i2c:adm1032* alias: i2c:lm99* alias: i2c:lm86* alias: i2c:max6657* alias: i2c:adt7461* alias: i2c:max6680* $ This would cause trouble if one I2C chip name matches the beginning of another I2C chip name and both chips are supported by different drivers. For example, an i2c device named lm9042 would cause the lm90 driver to be loaded, while it doesn't support that device. This case has yet to be seen in practice, but still, I'd like to fix it now. The cleanest fix is to remove the trailing wildcard from i2c module aliases. Here's a patch doing this. Not all device type aliases need a trailing wildcard, in particular the i2c aliases don't. Don't add a wildcard by default in do_table(), instead let each device type handler add it if needed. I have tested types acpi, dmi, eisa, i2c, ide, ieee1394, input, pci, pcmcia, platform, pnp, scsi, serio, ssb and usb. Other types (ccw, of, vio, parisc, sdio and virtio) are untested. 
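To make the failure mode concrete: modalias resolution behaves like a shell-style glob, so it can be modelled in userspace with fnmatch(3). The device name below is hypothetical and fnmatch only stands in for modprobe's own matching; this is an illustration, not code from the patch.

#include <fnmatch.h>
#include <stdio.h>

int main(void)
{
	const char *modalias = "i2c:lm9042";	/* hypothetical device name */

	/* old-style alias with trailing wildcard: the wrong driver matches */
	printf("i2c:lm90*  -> %s\n",
	       fnmatch("i2c:lm90*", modalias, 0) == 0 ? "match" : "no match");

	/* alias without the wildcard: only the exact name matches */
	printf("i2c:lm90   -> %s\n",
	       fnmatch("i2c:lm90", modalias, 0) == 0 ? "match" : "no match");

	return 0;
}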
Signed-off-by: Jean Delvare Acked-by: Jochen Friedrich Signed-off-by: Sam Ravnborg diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index e04c4218..cea4a79 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -51,6 +51,15 @@ do { \ sprintf(str + strlen(str), "*"); \ } while(0) +/* Always end in a wildcard, for future extension */ +static inline void add_wildcard(char *str) +{ + int len = strlen(str); + + if (str[len - 1] != '*') + strcat(str + len, "*"); +} + unsigned int cross_build = 0; /** * Check that sizeof(device_id type) are consistent with size of section @@ -133,9 +142,7 @@ static void do_usb_entry(struct usb_device_id *id, id->match_flags&USB_DEVICE_ID_MATCH_INT_PROTOCOL, id->bInterfaceProtocol); - /* Always end in a wildcard, for future extension */ - if (alias[strlen(alias)-1] != '*') - strcat(alias, "*"); + add_wildcard(alias); buf_printf(&mod->dev_table_buf, "MODULE_ALIAS(\"%s\");\n", alias); } @@ -219,6 +226,7 @@ static int do_ieee1394_entry(const char *filename, ADD(alias, "ver", id->match_flags & IEEE1394_MATCH_VERSION, id->version); + add_wildcard(alias); return 1; } @@ -261,6 +269,7 @@ static int do_pci_entry(const char *filename, ADD(alias, "bc", baseclass_mask == 0xFF, baseclass); ADD(alias, "sc", subclass_mask == 0xFF, subclass); ADD(alias, "i", interface_mask == 0xFF, interface); + add_wildcard(alias); return 1; } @@ -283,6 +292,7 @@ static int do_ccw_entry(const char *filename, id->dev_type); ADD(alias, "dm", id->match_flags&CCW_DEVICE_ID_MATCH_DEVICE_MODEL, id->dev_model); + add_wildcard(alias); return 1; } @@ -290,7 +300,7 @@ static int do_ccw_entry(const char *filename, static int do_ap_entry(const char *filename, struct ap_device_id *id, char *alias) { - sprintf(alias, "ap:t%02X", id->dev_type); + sprintf(alias, "ap:t%02X*", id->dev_type); return 1; } @@ -309,6 +319,7 @@ static int do_serio_entry(const char *filename, ADD(alias, "id", id->id != SERIO_ANY, id->id); ADD(alias, "ex", id->extra != SERIO_ANY, id->extra); + add_wildcard(alias); return 1; } @@ -316,7 +327,7 @@ static int do_serio_entry(const char *filename, static int do_acpi_entry(const char *filename, struct acpi_device_id *id, char *alias) { - sprintf(alias, "acpi*:%s:", id->id); + sprintf(alias, "acpi*:%s:*", id->id); return 1; } @@ -324,7 +335,7 @@ static int do_acpi_entry(const char *filename, static int do_pnp_entry(const char *filename, struct pnp_device_id *id, char *alias) { - sprintf(alias, "pnp:d%s", id->id); + sprintf(alias, "pnp:d%s*", id->id); return 1; } @@ -409,6 +420,7 @@ static int do_pcmcia_entry(const char *filename, ADD(alias, "pc", id->match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID3, id->prod_id_hash[2]); ADD(alias, "pd", id->match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID4, id->prod_id_hash[3]); + add_wildcard(alias); return 1; } @@ -432,6 +444,7 @@ static int do_of_entry (const char *filename, struct of_device_id *of, char *ali if (isspace (*tmp)) *tmp = '_'; + add_wildcard(alias); return 1; } @@ -448,6 +461,7 @@ static int do_vio_entry(const char *filename, struct vio_device_id *vio, if (isspace (*tmp)) *tmp = '_'; + add_wildcard(alias); return 1; } @@ -511,6 +525,8 @@ static int do_eisa_entry(const char *filename, struct eisa_device_id *eisa, { if (eisa->sig[0]) sprintf(alias, EISA_DEVICE_MODALIAS_FMT "*", eisa->sig); + else + strcat(alias, "*"); return 1; } @@ -529,6 +545,7 @@ static int do_parisc_entry(const char *filename, struct parisc_device_id *id, ADD(alias, "rev", id->hversion_rev != PA_HVERSION_REV_ANY_ID, id->hversion_rev); ADD(alias, "sv", 
id->sversion != PA_SVERSION_ANY_ID, id->sversion); + add_wildcard(alias); return 1; } @@ -544,6 +561,7 @@ static int do_sdio_entry(const char *filename, ADD(alias, "c", id->class != (__u8)SDIO_ANY_ID, id->class); ADD(alias, "v", id->vendor != (__u16)SDIO_ANY_ID, id->vendor); ADD(alias, "d", id->device != (__u16)SDIO_ANY_ID, id->device); + add_wildcard(alias); return 1; } @@ -559,6 +577,7 @@ static int do_ssb_entry(const char *filename, ADD(alias, "v", id->vendor != SSB_ANY_VENDOR, id->vendor); ADD(alias, "id", id->coreid != SSB_ANY_ID, id->coreid); ADD(alias, "rev", id->revision != SSB_ANY_REV, id->revision); + add_wildcard(alias); return 1; } @@ -573,6 +592,7 @@ static int do_virtio_entry(const char *filename, struct virtio_device_id *id, ADD(alias, "d", 1, id->device); ADD(alias, "v", id->vendor != VIRTIO_DEV_ANY_ID, id->vendor); + add_wildcard(alias); return 1; } @@ -612,9 +632,6 @@ static void do_table(void *symval, unsigned long size, for (i = 0; i < size; i += id_size) { if (do_entry(mod->name, symval+i, alias)) { - /* Always end in a wildcard, for future extension */ - if (alias[strlen(alias)-1] != '*') - strcat(alias, "*"); buf_printf(&mod->dev_table_buf, "MODULE_ALIAS(\"%s\");\n", alias); } -- cgit v0.10.2 From be0c007ac64f880a946995d6d1fc654acc81484d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 4 May 2008 01:34:31 -0700 Subject: niu: Fix probing regression for maramba on-board chips. Changeset 7f7c4072ea552f97a0898331322f71986a97299c ("niu: Determine the # of ports from the card's VPD data") caused maramba on-board NIU ports to stop probing properly. The old code had a fallback that would use a num_ports value of 4 if all the probing methods failed, but that was removed. This restores the fallback of 4 ports, to get things working again. Bump driver version and release date. Signed-off-by: David S. Miller diff --git a/drivers/net/niu.c b/drivers/net/niu.c index 4009c4c..57cfd72 100644 --- a/drivers/net/niu.c +++ b/drivers/net/niu.c @@ -1,6 +1,6 @@ /* niu.c: Neptune ethernet driver. * - * Copyright (C) 2007 David S. Miller (davem@davemloft.net) + * Copyright (C) 2007, 2008 David S. Miller (davem@davemloft.net) */ #include @@ -33,8 +33,8 @@ #define DRV_MODULE_NAME "niu" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "0.8" -#define DRV_MODULE_RELDATE "April 24, 2008" +#define DRV_MODULE_VERSION "0.9" +#define DRV_MODULE_RELDATE "May 4, 2008" static char version[] __devinitdata = DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; @@ -7264,8 +7264,11 @@ static int __devinit niu_get_and_validate_port(struct niu *np) parent->num_ports = nr64(ESPC_NUM_PORTS_MACS) & ESPC_NUM_PORTS_MACS_VAL; + /* All of the current probing methods fail on + * Maramba on-board parts. 
+ */ if (!parent->num_ports) - return -ENODEV; + parent->num_ports = 4; } } } -- cgit v0.10.2 From d56f546db97795dca5aa575b00b0e9886895ac87 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 25 Apr 2008 10:13:16 +0800 Subject: KVM: VMX: EPT Feature Detection Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8e5d664..d93250d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -42,6 +42,9 @@ module_param(enable_vpid, bool, 0); static int flexpriority_enabled = 1; module_param(flexpriority_enabled, bool, 0); +static int enable_ept; +module_param(enable_ept, bool, 0); + struct vmcs { u32 revision_id; u32 abort; @@ -107,6 +110,11 @@ static struct vmcs_config { u32 vmentry_ctrl; } vmcs_config; +struct vmx_capability { + u32 ept; + u32 vpid; +} vmx_capability; + #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ .selector = GUEST_##seg##_SELECTOR, \ @@ -214,6 +222,32 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); } +static inline int cpu_has_vmx_invept_individual_addr(void) +{ + return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT)); +} + +static inline int cpu_has_vmx_invept_context(void) +{ + return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT)); +} + +static inline int cpu_has_vmx_invept_global(void) +{ + return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT)); +} + +static inline int cpu_has_vmx_ept(void) +{ + return (vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_EPT); +} + +static inline int vm_need_ept(void) +{ + return (cpu_has_vmx_ept() && enable_ept); +} + static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { return ((cpu_has_vmx_virtualize_apic_accesses()) && @@ -985,7 +1019,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) { u32 vmx_msr_low, vmx_msr_high; - u32 min, opt; + u32 min, opt, min2, opt2; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; @@ -1003,6 +1037,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | #endif + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING; @@ -1018,11 +1054,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) ~CPU_BASED_CR8_STORE_EXITING; #endif if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { - min = 0; - opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + min2 = 0; + opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_ENABLE_VPID; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, + SECONDARY_EXEC_ENABLE_VPID | + SECONDARY_EXEC_ENABLE_EPT; + if (adjust_vmx_controls(min2, opt2, + MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) return -EIO; } @@ -1031,6 +1069,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { + /* CR3 accesses don't need to cause VM Exits when EPT enabled */ + min &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, + &_cpu_based_exec_control) < 0) + return -EIO; + 
rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, + vmx_capability.ept, vmx_capability.vpid); + } min = 0; #ifdef CONFIG_X86_64 @@ -1638,6 +1686,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) CPU_BASED_CR8_LOAD_EXITING; #endif } + if (!vm_need_ept()) + exec_control |= CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_CR3_LOAD_EXITING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); if (cpu_has_secondary_exec_ctrls()) { @@ -1647,6 +1698,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; + if (!vm_need_ept()) + exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 5dff460..5f7fdc9 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -35,6 +35,8 @@ #define CPU_BASED_MWAIT_EXITING 0x00000400 #define CPU_BASED_RDPMC_EXITING 0x00000800 #define CPU_BASED_RDTSC_EXITING 0x00001000 +#define CPU_BASED_CR3_LOAD_EXITING 0x00008000 +#define CPU_BASED_CR3_STORE_EXITING 0x00010000 #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 @@ -49,6 +51,7 @@ * Definitions of Secondary Processor-Based VM-Execution Controls. */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 +#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 @@ -100,10 +103,22 @@ enum vmcs_field { VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, APIC_ACCESS_ADDR = 0x00002014, APIC_ACCESS_ADDR_HIGH = 0x00002015, + EPT_POINTER = 0x0000201a, + EPT_POINTER_HIGH = 0x0000201b, + GUEST_PHYSICAL_ADDRESS = 0x00002400, + GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, VMCS_LINK_POINTER_HIGH = 0x00002801, GUEST_IA32_DEBUGCTL = 0x00002802, GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + GUEST_PDPTR0 = 0x0000280a, + GUEST_PDPTR0_HIGH = 0x0000280b, + GUEST_PDPTR1 = 0x0000280c, + GUEST_PDPTR1_HIGH = 0x0000280d, + GUEST_PDPTR2 = 0x0000280e, + GUEST_PDPTR2_HIGH = 0x0000280f, + GUEST_PDPTR3 = 0x00002810, + GUEST_PDPTR3_HIGH = 0x00002811, PIN_BASED_VM_EXEC_CONTROL = 0x00004000, CPU_BASED_VM_EXEC_CONTROL = 0x00004002, EXCEPTION_BITMAP = 0x00004004, @@ -226,6 +241,8 @@ enum vmcs_field { #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EPT_VIOLATION 48 +#define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 /* @@ -316,6 +333,7 @@ enum vmcs_field { #define MSR_IA32_VMX_CR4_FIXED1 0x489 #define MSR_IA32_VMX_VMCS_ENUM 0x48a #define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b +#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c #define MSR_IA32_FEATURE_CONTROL 0x3a #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 @@ -327,4 +345,11 @@ enum vmcs_field { #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 #define VMX_VPID_EXTENT_ALL_CONTEXT 2 +#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 +#define VMX_EPT_EXTENT_CONTEXT 1 +#define VMX_EPT_EXTENT_GLOBAL 2 +#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) +#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) +#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) + #endif -- cgit v0.10.2 From 8c6d6adc6b87daa364ee9deb2e966021d37a7622 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 25 Apr 2008 10:17:08 +0800 Subject: KVM: MMU: Move some definitions to a header file Move some definitions to mmu.h in order to allow building common table entries between EPT and non-EPT. 
Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2ad6f54..bcfaf7e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -79,36 +79,6 @@ static int dbg = 1; } #endif -#define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) -#define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) - -#define PT_WRITABLE_SHIFT 1 - -#define PT_PRESENT_MASK (1ULL << 0) -#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) -#define PT_USER_MASK (1ULL << 2) -#define PT_PWT_MASK (1ULL << 3) -#define PT_PCD_MASK (1ULL << 4) -#define PT_ACCESSED_MASK (1ULL << 5) -#define PT_DIRTY_MASK (1ULL << 6) -#define PT_PAGE_SIZE_MASK (1ULL << 7) -#define PT_PAT_MASK (1ULL << 7) -#define PT_GLOBAL_MASK (1ULL << 8) -#define PT64_NX_SHIFT 63 -#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) - -#define PT_PAT_SHIFT 7 -#define PT_DIR_PAT_SHIFT 12 -#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) - -#define PT32_DIR_PSE36_SIZE 4 -#define PT32_DIR_PSE36_SHIFT 13 -#define PT32_DIR_PSE36_MASK \ - (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) - - #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 @@ -154,10 +124,6 @@ static int dbg = 1; #define PFERR_USER_MASK (1U << 2) #define PFERR_FETCH_MASK (1U << 4) -#define PT64_ROOT_LEVEL 4 -#define PT32_ROOT_LEVEL 2 -#define PT32E_ROOT_LEVEL 3 - #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e64e9f5..a4fcb78 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -9,6 +9,39 @@ #define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL #endif +#define PT64_PT_BITS 9 +#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) +#define PT32_PT_BITS 10 +#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) + +#define PT_WRITABLE_SHIFT 1 + +#define PT_PRESENT_MASK (1ULL << 0) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define PT_PWT_MASK (1ULL << 3) +#define PT_PCD_MASK (1ULL << 4) +#define PT_ACCESSED_MASK (1ULL << 5) +#define PT_DIRTY_MASK (1ULL << 6) +#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAT_MASK (1ULL << 7) +#define PT_GLOBAL_MASK (1ULL << 8) +#define PT64_NX_SHIFT 63 +#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) + +#define PT_PAT_SHIFT 7 +#define PT_DIR_PAT_SHIFT 12 +#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) + +#define PT32_DIR_PSE36_SIZE 4 +#define PT32_DIR_PSE36_SHIFT 13 +#define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) + +#define PT64_ROOT_LEVEL 4 +#define PT32_ROOT_LEVEL 2 +#define PT32E_ROOT_LEVEL 3 + static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) -- cgit v0.10.2 From 67253af52e9133fb4cfbf7a2448a2d3524d1fa6c Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 25 Apr 2008 10:20:22 +0800 Subject: KVM: Add kvm_x86_ops get_tdp_level() The function get_tdp_level() provided the number of tdp level for EPT and NPT rather than the NPT specific macro. 
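The shape of the change, reduced to a standalone sketch (the struct and function names here are illustrative; the return values are the ones the hunks below use): instead of a build-time TDP_ROOT_LEVEL constant, the common MMU asks the active vendor backend for its paging depth through an ops-table callback.

struct tdp_backend_ops {
	int (*get_tdp_level)(void);	/* paging depth used for TDP */
};

static int npt_level(void)
{
#ifdef CONFIG_X86_64
	return 4;			/* PT64_ROOT_LEVEL */
#else
	return 3;			/* PT32E_ROOT_LEVEL */
#endif
}

static int ept_level(void)
{
	return 4;			/* VMX_EPT_DEFAULT_GAW + 1 */
}

static const struct tdp_backend_ops svm_backend = { .get_tdp_level = npt_level };
static const struct tdp_backend_ops vmx_backend = { .get_tdp_level = ept_level };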
Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bcfaf7e..20fb3c8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1343,7 +1343,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, - largepage, gfn, pfn, TDP_ROOT_LEVEL); + largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -1450,7 +1450,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->page_fault = tdp_page_fault; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; - context->shadow_root_level = TDP_ROOT_LEVEL; + context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; if (!is_paging(vcpu)) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index a4fcb78..1730757 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -3,12 +3,6 @@ #include -#ifdef CONFIG_X86_64 -#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL -#else -#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL -#endif - #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) #define PT32_PT_BITS 10 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 89e0be2..ab22615 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1863,6 +1863,15 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } +static int get_npt_level(void) +{ +#ifdef CONFIG_X86_64 + return PT64_ROOT_LEVEL; +#else + return PT32E_ROOT_LEVEL; +#endif +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -1920,6 +1929,7 @@ static struct kvm_x86_ops svm_x86_ops = { .inject_pending_vectors = do_interrupt_requests, .set_tss_addr = svm_set_tss_addr, + .get_tdp_level = get_npt_level, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d93250d..98e4f2b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2788,6 +2788,11 @@ static void __init vmx_check_processor_compat(void *rtn) } } +static int get_ept_level(void) +{ + return VMX_EPT_DEFAULT_GAW + 1; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -2844,6 +2849,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .inject_pending_vectors = do_interrupt_requests, .set_tss_addr = vmx_set_tss_addr, + .get_tdp_level = get_ept_level, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 5f7fdc9..093b085 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -351,5 +351,6 @@ enum vmcs_field { #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_EPT_DEFAULT_GAW 3 #endif diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 9d963cd..897a1be 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -422,6 +422,7 @@ struct kvm_x86_ops { struct kvm_run *run); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); + int (*get_tdp_level)(void); }; extern struct kvm_x86_ops *kvm_x86_ops; -- cgit v0.10.2 From 7b52345e2c4c7333bf7eba8034ffc4683fa63c91 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 25 Apr 2008 21:13:50 +0800 Subject: KVM: MMU: Add EPT support Enable kvm_set_spte() to generate EPT entries. 
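The one-line summary hides the mechanism: the spte-building masks become runtime-configurable, and the two configurations this enables both appear in this series. Shown side by side (both call sites are quoted from the patches, not new code), the point of the abstraction is clearer:

	/* legacy shadow paging, arch/x86/kvm/x86.c (this patch): */
	kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
			      PT_DIRTY_MASK, PT64_NX_MASK, 0);

	/* EPT, arch/x86/kvm/vmx.c (enabled later in the series): */
	kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
			      VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
	kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
			      VMX_EPT_FAKE_DIRTY_MASK, 0ull,
			      VMX_EPT_EXECUTABLE_MASK);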
Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 20fb3c8..c28a36b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -152,6 +152,12 @@ static struct kmem_cache *mmu_page_header_cache; static u64 __read_mostly shadow_trap_nonpresent_pte; static u64 __read_mostly shadow_notrap_nonpresent_pte; +static u64 __read_mostly shadow_base_present_pte; +static u64 __read_mostly shadow_nx_mask; +static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +static u64 __read_mostly shadow_user_mask; +static u64 __read_mostly shadow_accessed_mask; +static u64 __read_mostly shadow_dirty_mask; void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) { @@ -160,6 +166,23 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) } EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); +void kvm_mmu_set_base_ptes(u64 base_pte) +{ + shadow_base_present_pte = base_pte; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); + +void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, + u64 dirty_mask, u64 nx_mask, u64 x_mask) +{ + shadow_user_mask = user_mask; + shadow_accessed_mask = accessed_mask; + shadow_dirty_mask = dirty_mask; + shadow_nx_mask = nx_mask; + shadow_x_mask = x_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); + static int is_write_protection(struct kvm_vcpu *vcpu) { return vcpu->arch.cr0 & X86_CR0_WP; @@ -198,7 +221,7 @@ static int is_writeble_pte(unsigned long pte) static int is_dirty_pte(unsigned long pte) { - return pte & PT_DIRTY_MASK; + return pte & shadow_dirty_mask; } static int is_rmap_pte(u64 pte) @@ -513,7 +536,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) return; sp = page_header(__pa(spte)); pfn = spte_to_pfn(*spte); - if (*spte & PT_ACCESSED_MASK) + if (*spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (is_writeble_pte(*spte)) kvm_release_pfn_dirty(pfn); @@ -1039,17 +1062,17 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, * whether the guest actually used the pte (in order to detect * demand paging). 
*/ - spte = PT_PRESENT_MASK | PT_DIRTY_MASK; + spte = shadow_base_present_pte | shadow_dirty_mask; if (!speculative) pte_access |= PT_ACCESSED_MASK; if (!dirty) pte_access &= ~ACC_WRITE_MASK; - if (!(pte_access & ACC_EXEC_MASK)) - spte |= PT64_NX_MASK; - - spte |= PT_PRESENT_MASK; + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_x_mask; + else + spte |= shadow_nx_mask; if (pte_access & ACC_USER_MASK) - spte |= PT_USER_MASK; + spte |= shadow_user_mask; if (largepage) spte |= PT_PAGE_SIZE_MASK; @@ -1155,7 +1178,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, } table[index] = __pa(new_table->spt) | PT_PRESENT_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; + | PT_WRITABLE_MASK | shadow_user_mask; } table_addr = table[index] & PT64_BASE_ADDR_MASK; } @@ -1599,7 +1622,7 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) { u64 *spte = vcpu->arch.last_pte_updated; - return !!(spte && (*spte & PT_ACCESSED_MASK)); + return !!(spte && (*spte & shadow_accessed_mask)); } static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ce5563..0735efb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2417,6 +2417,9 @@ int kvm_arch_init(void *opaque) kvm_x86_ops = ops; kvm_mmu_set_nonpresent_ptes(0ull, 0ull); + kvm_mmu_set_base_ptes(PT_PRESENT_MASK); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, + PT_DIRTY_MASK, PT64_NX_MASK, 0); return 0; out: diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 897a1be..d1dedda 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -434,6 +434,9 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); +void kvm_mmu_set_base_ptes(u64 base_pte); +void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, + u64 dirty_mask, u64 nx_mask, u64 x_mask); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -- cgit v0.10.2 From 1ac593c97eb229da44819f66fea47975537c1177 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 25 Apr 2008 21:44:42 +0800 Subject: KVM: MMU: Remove #ifdef CONFIG_X86_64 to support 4 level EPT Currently EPT level is 4 for both pae and x86_64. The patch remove the #ifdef for alloc root_hpa and free root_hpa to support EPT. 
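The #ifdef can go because, unlike the host page tables, the EPT walk length is not tied to the host's paging mode: it is encoded in the EPT pointer itself. The series builds that pointer as below (simplified from construct_eptp() in a later patch; GAW 3 means a 4-level walk whether the host runs PAE or long mode):

	eptp = VMX_EPT_DEFAULT_MT |
	       (VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT) |
	       (root_hpa & PAGE_MASK);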
Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c28a36b..3dbedf16 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1233,7 +1233,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; spin_lock(&vcpu->kvm->mmu_lock); -#ifdef CONFIG_X86_64 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -1245,7 +1244,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); return; } -#endif for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -1271,7 +1269,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; -#ifdef CONFIG_X86_64 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -1286,7 +1283,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = root; return; } -#endif metaphysical = !is_paging(vcpu); if (tdp_enabled) metaphysical = 1; -- cgit v0.10.2 From 0d15029895051904e31925ec63525cc3a637f7de Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 25 Apr 2008 21:44:50 +0800 Subject: KVM: Export necessary function for EPT Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e89338e..f7ba099 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -522,6 +522,7 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) return bad_hva(); return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); } +EXPORT_SYMBOL_GPL(gfn_to_hva); /* * Requires current->mm->mmap_sem to be held -- cgit v0.10.2 From b7ebfb0509692cd923e31650f81ed4d79c9a3e59 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 25 Apr 2008 21:44:52 +0800 Subject: KVM: VMX: Prepare an identity page table for EPT in real mode [aliguory: plug leak] Signed-off-by: Sheng Yang Signed-off-by: Anthony Liguori Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 98e4f2b..de5f615 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -87,7 +87,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static int init_rmode_tss(struct kvm *kvm); +static int init_rmode(struct kvm *kvm); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1304,7 +1304,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); kvm_mmu_reset_context(vcpu); - init_rmode_tss(vcpu->kvm); + init_rmode(vcpu->kvm); } #ifdef CONFIG_X86_64 @@ -1578,6 +1578,41 @@ out: return ret; } +static int init_rmode_identity_map(struct kvm *kvm) +{ + int i, r, ret; + pfn_t identity_map_pfn; + u32 tmp; + + if (!vm_need_ept()) + return 1; + if (unlikely(!kvm->arch.ept_identity_pagetable)) { + printk(KERN_ERR "EPT: identity-mapping pagetable " + "haven't been allocated!\n"); + return 0; + } + if (likely(kvm->arch.ept_identity_pagetable_done)) + return 1; + ret = 0; + identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT; + r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); + if (r < 0) + goto out; + /* Set up identity-mapping pagetable for EPT in real mode */ + for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); + r = kvm_write_guest_page(kvm, identity_map_pfn, + &tmp, i * sizeof(tmp), 
sizeof(tmp)); + if (r < 0) + goto out; + } + kvm->arch.ept_identity_pagetable_done = true; + ret = 1; +out: + return ret; +} + static void seg_setup(int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -1612,6 +1647,31 @@ out: return r; } +static int alloc_identity_pagetable(struct kvm *kvm) +{ + struct kvm_userspace_memory_region kvm_userspace_mem; + int r = 0; + + down_write(&kvm->slots_lock); + if (kvm->arch.ept_identity_pagetable) + goto out; + kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; + kvm_userspace_mem.flags = 0; + kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + kvm_userspace_mem.memory_size = PAGE_SIZE; + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; + + down_read(¤t->mm->mmap_sem); + kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, + VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); + up_read(¤t->mm->mmap_sem); +out: + up_write(&kvm->slots_lock); + return r; +} + static void allocate_vpid(struct vcpu_vmx *vmx) { int vpid; @@ -1775,6 +1835,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) return 0; } +static int init_rmode(struct kvm *kvm) +{ + if (!init_rmode_tss(kvm)) + return 0; + if (!init_rmode_identity_map(kvm)) + return 0; + return 1; +} + static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1782,7 +1851,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) int ret; down_read(&vcpu->kvm->slots_lock); - if (!init_rmode_tss(vmx->vcpu.kvm)) { + if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; } @@ -2759,6 +2828,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (alloc_apic_access_page(kvm) != 0) goto free_vmcs; + if (vm_need_ept()) + if (alloc_identity_pagetable(kvm) != 0) + goto free_vmcs; + return &vmx->vcpu; free_vmcs: diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 093b085..f97eccc 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -340,6 +340,7 @@ enum vmcs_field { #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 +#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 #define VMX_NR_VPIDS (1 << 16) #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 @@ -353,4 +354,6 @@ enum vmcs_field { #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) #define VMX_EPT_DEFAULT_GAW 3 +#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0735efb..1842a86 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3909,6 +3909,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_free_physmem(kvm); if (kvm->arch.apic_access_page) put_page(kvm->arch.apic_access_page); + if (kvm->arch.ept_identity_pagetable) + put_page(kvm->arch.ept_identity_pagetable); kfree(kvm); } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index d1dedda..e24afaa 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -314,6 +314,9 @@ struct kvm_arch{ struct page *apic_access_page; gpa_t wall_clock; + + struct page *ept_identity_pagetable; + bool ept_identity_pagetable_done; }; struct kvm_vm_stat { -- cgit v0.10.2 From 1439442c7b257b47a83aea4daed8fbf4a32cdff9 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 28 Apr 2008 12:24:45 +0800 Subject: KVM: VMX: Enable EPT feature for KVM Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3dbedf16..d9344be 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1177,8 +1177,9 @@ static 
int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return -ENOMEM; } - table[index] = __pa(new_table->spt) | PT_PRESENT_MASK - | PT_WRITABLE_MASK | shadow_user_mask; + table[index] = __pa(new_table->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask; } table_addr = table[index] & PT64_BASE_ADDR_MASK; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index de5f615..bfe4db1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -42,7 +42,7 @@ module_param(enable_vpid, bool, 0); static int flexpriority_enabled = 1; module_param(flexpriority_enabled, bool, 0); -static int enable_ept; +static int enable_ept = 1; module_param(enable_ept, bool, 0); struct vmcs { @@ -284,6 +284,18 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) : : "a"(&operand), "c"(ext) : "cc", "memory"); } +static inline void __invept(int ext, u64 eptp, gpa_t gpa) +{ + struct { + u64 eptp, gpa; + } operand = {eptp, gpa}; + + asm volatile (ASM_VMX_INVEPT + /* CF==1 or ZF==1 --> rc = -1 */ + "; ja 1f ; ud2 ; 1:\n" + : : "a" (&operand), "c" (ext) : "cc", "memory"); +} + static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -335,6 +347,33 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); } +static inline void ept_sync_global(void) +{ + if (cpu_has_vmx_invept_global()) + __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); +} + +static inline void ept_sync_context(u64 eptp) +{ + if (vm_need_ept()) { + if (cpu_has_vmx_invept_context()) + __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + else + ept_sync_global(); + } +} + +static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) +{ + if (vm_need_ept()) { + if (cpu_has_vmx_invept_individual_addr()) + __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, + eptp, gpa); + else + ept_sync_context(eptp); + } +} + static unsigned long vmcs_readl(unsigned long field) { unsigned long value; @@ -422,6 +461,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb |= 1u << 1; if (vcpu->arch.rmode.active) eb = ~0; + if (vm_need_ept()) + eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -1352,8 +1393,64 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; } +static void ept_load_pdptrs(struct kvm_vcpu *vcpu) +{ + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { + printk(KERN_ERR "EPT: Fail to load pdptrs!\n"); + return; + } + vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); + } +} + +static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); + +static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, + unsigned long cr0, + struct kvm_vcpu *vcpu) +{ + if (!(cr0 & X86_CR0_PG)) { + /* From paging/starting to nonpaging */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_config.cpu_based_exec_ctrl | + (CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING)); + vcpu->arch.cr0 = cr0; + vmx_set_cr4(vcpu, vcpu->arch.cr4); + *hw_cr0 |= X86_CR0_PE | X86_CR0_PG; + *hw_cr0 &= ~X86_CR0_WP; + } else if (!is_paging(vcpu)) { + /* From nonpaging to paging */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_config.cpu_based_exec_ctrl & + ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING)); + vcpu->arch.cr0 = cr0; 
+ vmx_set_cr4(vcpu, vcpu->arch.cr4); + if (!(vcpu->arch.cr0 & X86_CR0_WP)) + *hw_cr0 &= ~X86_CR0_WP; + } +} + +static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, + struct kvm_vcpu *vcpu) +{ + if (!is_paging(vcpu)) { + *hw_cr4 &= ~X86_CR4_PAE; + *hw_cr4 |= X86_CR4_PSE; + } else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) + *hw_cr4 &= ~X86_CR4_PAE; +} + static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | + KVM_VM_CR0_ALWAYS_ON; + vmx_fpu_deactivate(vcpu); if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) @@ -1371,29 +1468,61 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif + if (vm_need_ept()) + ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, - (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); + vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) vmx_fpu_activate(vcpu); } +static u64 construct_eptp(unsigned long root_hpa) +{ + u64 eptp; + + /* TODO write the value reading from MSR */ + eptp = VMX_EPT_DEFAULT_MT | + VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; + eptp |= (root_hpa & PAGE_MASK); + + return eptp; +} + static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + unsigned long guest_cr3; + u64 eptp; + + guest_cr3 = cr3; + if (vm_need_ept()) { + eptp = construct_eptp(cr3); + vmcs_write64(EPT_POINTER, eptp); + ept_sync_context(eptp); + ept_load_pdptrs(vcpu); + guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : + VMX_EPT_IDENTITY_PAGETABLE_ADDR; + } + vmx_flush_tlb(vcpu); - vmcs_writel(GUEST_CR3, cr3); + vmcs_writel(GUEST_CR3, guest_cr3); if (vcpu->arch.cr0 & X86_CR0_PE) vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - vmcs_writel(CR4_READ_SHADOW, cr4); - vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ? - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); + unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ? 
+ KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + vcpu->arch.cr4 = cr4; + if (vm_need_ept()) + ept_update_paging_mode_cr4(&hw_cr4, vcpu); + + vmcs_writel(CR4_READ_SHADOW, cr4); + vmcs_writel(GUEST_CR4, hw_cr4); } static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -2116,6 +2245,9 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { + /* EPT won't cause page fault directly */ + if (vm_need_ept()) + BUG(); cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 >> 32), handler); @@ -2445,6 +2577,64 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return kvm_task_switch(vcpu, tss_selector, reason); } +static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification; + enum emulation_result er; + gpa_t gpa; + unsigned long hva; + int gla_validity; + int r; + + exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + + if (exit_qualification & (1 << 6)) { + printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); + return -ENOTSUPP; + } + + gla_validity = (exit_qualification >> 7) & 0x3; + if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { + printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); + printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", + (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), + (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); + printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", + (long unsigned int)exit_qualification); + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_run->hw.hardware_exit_reason = 0; + return -ENOTSUPP; + } + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!kvm_is_error_hva(hva)) { + r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); + if (r < 0) { + printk(KERN_ERR "EPT: Not enough memory!\n"); + return -ENOMEM; + } + return 1; + } else { + /* must be MMIO */ + er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + + if (er == EMULATE_FAIL) { + printk(KERN_ERR + "EPT: Fail to handle EPT violation vmexit!er is %d\n", + er); + printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", + (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), + (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); + printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", + (long unsigned int)exit_qualification); + return -ENOTSUPP; + } else if (er == EMULATE_DO_MMIO) + return 0; + } + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -2468,6 +2658,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, }; static const int kvm_vmx_max_exit_handlers = @@ -2486,6 +2677,13 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); + /* Access CR3 don't cause VMExit in paging mode, so we need + * to sync with guest real CR3. 
*/ + if (vm_need_ept() && is_paging(vcpu)) { + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + ept_load_pdptrs(vcpu); + } + if (unlikely(vmx->fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason @@ -2494,7 +2692,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - exit_reason != EXIT_REASON_EXCEPTION_NMI) + (exit_reason != EXIT_REASON_EXCEPTION_NMI && + exit_reason != EXIT_REASON_EPT_VIOLATION)) printk(KERN_WARNING "%s: unexpected, valid vectoring info and " "exit reason is 0x%x\n", __func__, exit_reason); if (exit_reason < kvm_vmx_max_exit_handlers @@ -2796,6 +2995,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return ERR_PTR(-ENOMEM); allocate_vpid(vmx); + if (id == 0 && vm_need_ept()) { + kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | + VMX_EPT_WRITABLE_MASK | + VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); + kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK, + VMX_EPT_FAKE_DIRTY_MASK, 0ull, + VMX_EPT_EXECUTABLE_MASK); + kvm_enable_tdp(); + } err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) @@ -2975,9 +3183,14 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); + if (cpu_has_vmx_ept()) + bypass_guest_pf = 0; + if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); + ept_sync_global(); + return 0; out2: diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index f97eccc..79d94c6 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -353,6 +353,15 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) #define VMX_EPT_DEFAULT_GAW 3 +#define VMX_EPT_MAX_GAW 0x4 +#define VMX_EPT_MT_EPTE_SHIFT 3 +#define VMX_EPT_GAW_EPTP_SHIFT 3 +#define VMX_EPT_DEFAULT_MT 0x6ull +#define VMX_EPT_READABLE_MASK 0x1ull +#define VMX_EPT_WRITABLE_MASK 0x2ull +#define VMX_EPT_EXECUTABLE_MASK 0x4ull +#define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62) +#define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index e24afaa..4baa9c9 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -651,6 +651,7 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) #define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" #define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" #define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" +#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" #define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" #define MSR_IA32_TIME_STAMP_COUNTER 0x010 -- cgit v0.10.2 From 3fe913e7c550a869e250d04c34410f7a6e263f7c Mon Sep 17 00:00:00 2001 From: Izik Eidus Date: Mon, 28 Apr 2008 18:23:52 +0300 Subject: KVM: x86: task switch: fix wrong bit setting for the busy flag The busy bit is bit 1 of the type field, not bit 8. 
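As a quick illustration (a minimal stand-alone sketch, not part of the patch): the 4-bit type of a 32-bit TSS descriptor is 1001b (0x9) when available and 1011b (0xb) when busy, so the busy flag really is bit 1 of the type field and is toggled with (1 << 1), not (1 << 8).

	#include <stdio.h>

	#define TSS_TYPE_AVAIL_32	0x9u		/* 32-bit TSS, available: 1001b */
	#define TSS_TYPE_BUSY_FLAG	(1u << 1)	/* busy is bit 1 of the type field */

	static unsigned int tss_set_busy(unsigned int type)
	{
		return type | TSS_TYPE_BUSY_FLAG;	/* 0x9 -> 0xb */
	}

	static unsigned int tss_clear_busy(unsigned int type)
	{
		return type & ~TSS_TYPE_BUSY_FLAG;	/* 0xb -> 0x9 */
	}

	int main(void)
	{
		printf("busy TSS type:      0x%x\n", tss_set_busy(TSS_TYPE_AVAIL_32));
		printf("available TSS type: 0x%x\n", tss_clear_busy(0xbu));
		return 0;
	}
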
Signed-off-by: Izik Eidus Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1842a86..017daa2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3484,7 +3484,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { - cseg_desc.type &= ~(1 << 8); //clear the B flag + cseg_desc.type &= ~(1 << 1); //clear the B flag save_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc); } @@ -3510,7 +3510,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } if (reason != TASK_SWITCH_IRET) { - nseg_desc.type |= (1 << 8); + nseg_desc.type |= (1 << 1); save_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc); } -- cgit v0.10.2 From 45c5eb67da5a668abe79c23a7e64dbc87a600f90 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Fri, 25 Apr 2008 17:55:49 -0500 Subject: KVM: ppc: Handle guest idle by emulating MSR[WE] writes This reduces host CPU usage when the guest is idle. However, the guest must set MSR[WE] in its idle loop, which Linux did not do until 2.6.26. Signed-off-by: Hollis Blanchard Signed-off-by: Jerone Young Signed-off-by: Avi Kivity diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c index 6d9884a..b3db6f4 100644 --- a/arch/powerpc/kvm/booke_guest.c +++ b/arch/powerpc/kvm/booke_guest.c @@ -49,6 +49,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "inst_emu", VCPU_STAT(emulated_inst_exits) }, { "dec", VCPU_STAT(dec_exits) }, { "ext_intr", VCPU_STAT(ext_intr_exits) }, + { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { NULL } }; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index bad40bd..777e0f3 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -36,13 +36,12 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - /* XXX implement me */ - return 0; + return !!(v->arch.pending_exceptions); } int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return 1; + return !(v->arch.msr & MSR_WE); } @@ -214,6 +213,11 @@ static void kvmppc_decrementer_func(unsigned long data) struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_DECREMENTER); + + if (waitqueue_active(&vcpu->wq)) { + wake_up_interruptible(&vcpu->wq); + vcpu->stat.halt_wakeup++; + } } int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) @@ -339,6 +343,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) int r; sigset_t sigsaved; + vcpu_load(vcpu); + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -363,12 +369,20 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); + vcpu_put(vcpu); + return r; } int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_EXTERNAL); + + if (waitqueue_active(&vcpu->wq)) { + wake_up_interruptible(&vcpu->wq); + vcpu->stat.halt_wakeup++; + } + return 0; } diff --git a/include/asm-powerpc/kvm_host.h b/include/asm-powerpc/kvm_host.h index 04ffbb8..81a69d7 100644 --- a/include/asm-powerpc/kvm_host.h +++ b/include/asm-powerpc/kvm_host.h @@ -59,6 +59,7 @@ struct kvm_vcpu_stat { u32 emulated_inst_exits; u32 dec_exits; u32 ext_intr_exits; + u32 halt_wakeup; }; struct tlbe { diff --git a/include/asm-powerpc/kvm_ppc.h b/include/asm-powerpc/kvm_ppc.h index 7ac8203..b35a7e3 100644 --- 
a/include/asm-powerpc/kvm_ppc.h +++ b/include/asm-powerpc/kvm_ppc.h @@ -77,12 +77,17 @@ static inline void kvmppc_clear_exception(struct kvm_vcpu *vcpu, int exception) clear_bit(priority, &vcpu->arch.pending_exceptions); } +/* Helper function for "full" MSR writes. No need to call this if only EE is + * changing. */ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) { if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR)) kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR); vcpu->arch.msr = new_msr; + + if (vcpu->arch.msr & MSR_WE) + kvm_vcpu_block(vcpu); } #endif /* __POWERPC_KVM_PPC_H__ */ -- cgit v0.10.2 From de368dceb33c3c068dbde1407aff75cd8e126f04 Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Tue, 29 Apr 2008 18:18:23 +0200 Subject: KVM: ppc: deliver INTERRUPT_FP_UNAVAIL to the guest This patch adds the delivery of INTERRUPT_FP_UNAVAIL exceptions to the guest. It's needed if a guest uses ppc binaries using the Floating point instructions. Signed-off-by: Christian Ehrhardt Acked-by: Hollis Blanchard Signed-off-by: Avi Kivity diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c index b3db6f4..712d89a 100644 --- a/arch/powerpc/kvm/booke_guest.c +++ b/arch/powerpc/kvm/booke_guest.c @@ -339,6 +339,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } break; + case BOOKE_INTERRUPT_FP_UNAVAIL: + kvmppc_queue_exception(vcpu, exit_nr); + r = RESUME_GUEST; + break; + case BOOKE_INTERRUPT_DATA_STORAGE: vcpu->arch.dear = vcpu->arch.fault_dear; vcpu->arch.esr = vcpu->arch.fault_esr; -- cgit v0.10.2 From dc7457ea52f88539dc72925360e6068d5c938a0f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 30 Apr 2008 16:13:36 +0300 Subject: KVM: x86 emulator: disable writeback on lmsw The recent changes allowing memory operands with lmsw and smsw left lmsw with writeback enabled. Since lmsw has no oridinary destination operand, the dst pointer was not initialized, resulting in an oops. Close the hole by disabling writeback for lmsw. Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 2ca0838..f2a696d 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1761,6 +1761,7 @@ twobyte_insn: case 6: /* lmsw */ realmode_lmsw(ctxt->vcpu, (u16)c->src.val, &ctxt->eflags); + c->dst.type = OP_NONE; break; case 7: /* invlpg*/ emulate_invlpg(ctxt->vcpu, memop); -- cgit v0.10.2 From ece15babfa514e06118f62f4df2c757d6209f4f0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 30 Apr 2008 13:23:54 -0300 Subject: KVM: PIT: support mode 4 The in-kernel PIT emulation ignores pending timers if operating under mode 4, which for example DragonFlyBSD uses (and Plan9 too, apparently). Mode 4 seems to be similar to one-shot mode, other than the fact that it starts counting after the next CLK pulse once programmed, while mode 1 starts counting immediately, so add a FIXME to enhance precision. Fixes sourceforge bug 1952988. 
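The shape of the resulting dispatch, condensed into a self-contained sketch; start_oneshot_timer(), start_periodic_timer() and delete_timer() are hypothetical stand-ins for the driver's timer helpers, not real kernel functions.

	#include <stdio.h>

	/* hypothetical stand-ins for the driver's timer helpers */
	static void start_oneshot_timer(unsigned int count)  { printf("one-shot, count=%u\n", count); }
	static void start_periodic_timer(unsigned int count) { printf("periodic, count=%u\n", count); }
	static void delete_timer(void)                        { printf("timer stopped\n"); }

	static void pit_load_count_sketch(int mode, unsigned int count)
	{
		switch (mode) {
		case 1:		/* hardware re-triggerable one-shot, counts immediately */
			/* FIXME: mode 4 only starts counting on the next CLK pulse */
		case 4:		/* software triggered strobe: also behaves as a one-shot */
			start_oneshot_timer(count);
			break;
		case 2:		/* rate generator: periodic */
			start_periodic_timer(count);
			break;
		default:	/* remaining modes: no pending-timer emulation */
			delete_timer();
		}
	}

	int main(void)
	{
		pit_load_count_sketch(4, 11932);	/* ~10 ms at 1.193182 MHz */
		return 0;
	}
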
Signed-off-by: Marcelo Tosatti Acked-by: Sheng Yang Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4c943ea..3324d90 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -288,6 +288,8 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) * mode 1 is one shot, mode 2 is period, otherwise del timer */ switch (ps->channels[0].mode) { case 1: + /* FIXME: enhance mode 4 precision */ + case 4: create_pit_timer(&ps->pit_timer, val, 0); break; case 2: -- cgit v0.10.2 From b4f14abd95cd8d42f08438f1c4ec3eafe41054ee Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 30 Apr 2008 17:59:04 +0200 Subject: KVM: Avoid spurious execeptions after setting registers Clear pending exceptions when setting new register values. This avoids spurious exceptions after restoring a vcpu state or after reset-on-triple-fault. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 017daa2..bc224bb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3022,6 +3022,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvm_x86_ops->decache_regs(vcpu); + vcpu->arch.exception.pending = false; + vcpu_put(vcpu); return 0; -- cgit v0.10.2 From bc1a34f1bf354fabc03e3f465620c80e510d0e8f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 1 May 2008 18:43:33 +0200 Subject: KVM: avoid fx_init() schedule in atomic This make sure not to schedule in atomic during fx_init. I also changed the name of fpu_init to fx_finit to avoid duplicating the name with fpu_init that is already used in the kernel, this makes grep simpler if nothing else. Signed-off-by: Andrea Arcangeli Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bc224bb..21338bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3703,10 +3703,19 @@ void fx_init(struct kvm_vcpu *vcpu) { unsigned after_mxcsr_mask; + /* + * Touch the fpu the first time in non atomic context as if + * this is the first fpu instruction the exception handler + * will fire before the instruction returns and it'll have to + * allocate ram with GFP_KERNEL. + */ + if (!used_math()) + fx_save(&vcpu->arch.host_fx_image); + /* Initialize guest FPU by resetting ours and saving into guest's */ preempt_disable(); fx_save(&vcpu->arch.host_fx_image); - fpu_init(); + fx_finit(); fx_save(&vcpu->arch.guest_fx_image); fx_restore(&vcpu->arch.host_fx_image); preempt_enable(); diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 4baa9c9..1d8cd01 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -627,7 +627,7 @@ static inline void fx_restore(struct i387_fxsave_struct *image) asm("fxrstor (%0)":: "r" (image)); } -static inline void fpu_init(void) +static inline void fx_finit(void) { asm("finit"); } -- cgit v0.10.2 From 93df766322ba1db2801e4b826985a4932dd75866 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 2 May 2008 13:23:10 +0300 Subject: KVM: MMU: Allow more than PAGES_PER_HPAGE write protections per large page nonpae guests can call rmap_write_protect twice per page (for page tables) or four times per page (for page directories), triggering a bogus warning. Remove the warning. 
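The "twice"/"four times" figures fall out of the span mismatch between 2-level (nonpae) guest paging and the 3-level PAE shadow: each shadow page write-protects the same guest frame once, so the per-large-page count can legitimately exceed KVM_PAGES_PER_HPAGE. A back-of-the-envelope check (editorial sketch, numbers only):

	#include <stdio.h>

	/* address span covered by one paging-structure page, in MB */
	#define NONPAE_PT_SPAN_MB	4u	/* 1024 ptes x 4K  */
	#define PAE_PT_SPAN_MB		2u	/* 512 ptes  x 4K  */
	#define NONPAE_PGD_SPAN_MB	4096u	/* 1024 pdes x 4M  */
	#define PAE_PGD_SPAN_MB		1024u	/* 512 pdes  x 2M  */

	int main(void)
	{
		/* shadow pages needed per guest page-table page */
		printf("shadow PTs per guest PT:   %u\n",
		       NONPAE_PT_SPAN_MB / PAE_PT_SPAN_MB);		/* 2 */
		printf("shadow PGDs per guest PGD: %u\n",
		       NONPAE_PGD_SPAN_MB / PAE_PGD_SPAN_MB);		/* 4 */
		return 0;
	}
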
Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d9344be..36c5406 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -376,7 +376,6 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); *write_count += 1; - WARN_ON(*write_count > KVM_PAGES_PER_HPAGE); } static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) -- cgit v0.10.2 From b8ba5f10c5956d2b297766fda8f4f5ab8ad1e2cc Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Wed, 30 Apr 2008 12:39:05 -0300 Subject: x86: KVM geust: make setup_secondary_clock definition dependent on local apic Since the pv_apic_ops are only present if CONFIG_X86_LOCAL_APIC is compiled in, kvmclock failed to build without this option. This patch fixes this. Signed-off-by: Glauber Costa Signed-off-by: Avi Kivity diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ddee040..4bc1be5 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -133,6 +133,7 @@ static int kvm_register_clock(void) return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); } +#ifdef CONFIG_X86_LOCAL_APIC static void kvm_setup_secondary_clock(void) { /* @@ -143,6 +144,7 @@ static void kvm_setup_secondary_clock(void) /* ok, done with our trickery, call native */ setup_secondary_APIC_clock(); } +#endif /* * After the clock is registered, the host will keep writing to the @@ -177,7 +179,9 @@ void __init kvmclock_init(void) pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; +#ifdef CONFIG_X86_LOCAL_APIC pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; +#endif machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; -- cgit v0.10.2 From 1024c5f4be4fc5b00337464fb8a442bebf15df68 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Sun, 4 May 2008 17:03:41 +0200 Subject: ide: IDE_HFLAG_SERIALIZE_DMA bugfix Patch re-ordering could be harmful: commit 1fd1890594bd355a4217f5658a34763e77decee3 Author: Bartlomiej Zolnierkiewicz Date: Sat Apr 26 22:25:24 2008 +0200 ide: add IDE_HFLAG_SERIALIZE_DMA host flag ... is buggy because ->init_dma method / ide_hwif_setup_dma() is called before IDE_HFLAG_SERIALIZE_DMA host flag is checked. Fix it by checking IDE_HFLAG_SERIALIZE[_DMA] after DMA initialization. 
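Reduced to a sketch, the bug is a classic check-before-initialize ordering problem: the flag test depends on hwif->dma_base, which is only valid once DMA setup has run. The helper names below are illustrative stand-ins, not the driver's real API.

	#include <stdio.h>

	struct hwif { unsigned long dma_base; };

	#define HFLAG_SERIALIZE_DMA	(1u << 0)	/* illustrative flag value */

	/* stand-in for ->init_dma() / ide_hwif_setup_dma() */
	static void init_dma(struct hwif *hwif)
	{
		hwif->dma_base = 0xd000;		/* pretend DMA registers were found */
	}

	int main(void)
	{
		struct hwif hwif = { 0 };
		unsigned int host_flags = HFLAG_SERIALIZE_DMA;

		/* broken ordering: dma_base is still 0, so the test never fires */
		if ((host_flags & HFLAG_SERIALIZE_DMA) && hwif.dma_base)
			printf("serialize (early check)\n");	/* not reached */

		init_dma(&hwif);

		/* fixed ordering: test only after DMA setup has filled in dma_base */
		if ((host_flags & HFLAG_SERIALIZE_DMA) && hwif.dma_base)
			printf("serialize (late check)\n");	/* reached */

		return 0;
	}
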
Signed-off-by: Bartlomiej Zolnierkiewicz diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 591deda..34b0d4f 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1355,12 +1355,6 @@ static void ide_init_port(ide_hwif_t *hwif, unsigned int port, if (hwif->chipset != ide_dtc2278 || hwif->channel == 0) hwif->port_ops = d->port_ops; - if ((d->host_flags & IDE_HFLAG_SERIALIZE) || - ((d->host_flags & IDE_HFLAG_SERIALIZE_DMA) && hwif->dma_base)) { - if (hwif->mate) - hwif->mate->serialized = hwif->serialized = 1; - } - hwif->swdma_mask = d->swdma_mask; hwif->mwdma_mask = d->mwdma_mask; hwif->ultra_mask = d->udma_mask; @@ -1382,6 +1376,12 @@ static void ide_init_port(ide_hwif_t *hwif, unsigned int port, hwif->dma_ops = d->dma_ops; } + if ((d->host_flags & IDE_HFLAG_SERIALIZE) || + ((d->host_flags & IDE_HFLAG_SERIALIZE_DMA) && hwif->dma_base)) { + if (hwif->mate) + hwif->mate->serialized = hwif->serialized = 1; + } + if (d->host_flags & IDE_HFLAG_RQSIZE_256) hwif->rqsize = 256; -- cgit v0.10.2 From 48b83d2425d7781bb625b1c37b5f2a8963b6e23b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 May 2008 21:24:30 +0200 Subject: x86: undo visws/numaq build changes arch/x86/pci/Makefile_32 has a nasty detail. VISWS and NUMAQ build override the generic pci-y rules. This needs a proper cleanup, but that needs more thoughts. Undo commit 895d30935ebe05f192e844792668bf8d19deaae7 x86: numaq fix do not override the existing pci-y rule when adding visws or numaq rules. There is also a stupid init function ordering problem vs. acpi.o Add comments to the Makefile to avoid tripping over this again. Remove the srat stub code in discontig_32.c to allow a proper NUMAQ build. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 1837885..914ccf9 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -476,29 +476,3 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif - -#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT -/* - * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA - * - * These stub functions are needed to compile 32-bit NUMA when SRAT is - * not set. There are functions in srat_64.c for parsing this table - * and it may be possible to make them common functions. - */ -void acpi_numa_slit_init (struct acpi_table_slit *slit) -{ - printk(KERN_INFO "ACPI: No support for parsing SLIT table\n"); -} - -void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa) -{ -} - -void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma) -{ -} - -void acpi_numa_arch_fixup(void) -{ -} -#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */ diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32 index 7fa5198..89ec35d 100644 --- a/arch/x86/pci/Makefile_32 +++ b/arch/x86/pci/Makefile_32 @@ -6,11 +6,19 @@ obj-$(CONFIG_PCI_DIRECT) += direct.o obj-$(CONFIG_PCI_OLPC) += olpc.o pci-y := fixup.o + +# Do not change the ordering here. There is a nasty init function +# ordering dependency which breaks when you move acpi.o below +# legacy/irq.o pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o -pci-$(CONFIG_X86_VISWS) += visws.o fixup.o -pci-$(CONFIG_X86_NUMAQ) += numa.o irq.o +# Careful: VISWS and NUMAQ overrule the pci-y above. The colons are +# therefor correct. This needs a proper fix by distangling the code. 
+pci-$(CONFIG_X86_VISWS) := visws.o fixup.o +pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o + +# Necessary for NUMAQ as well pci-$(CONFIG_NUMA) += mp_bus_to_node.o obj-y += $(pci-y) common.o early.o -- cgit v0.10.2 From 22eecde2f9034764a3fd095eecfa3adfb8ec9a98 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 1 May 2008 12:06:54 +0200 Subject: uml: fix gcc problem this is what caused gcc 4.3 to throw an internal error when OPTIMIZE_INLINING was enabled ... Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/um/Makefile b/arch/um/Makefile index dbeab15..01b97c1 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -77,7 +77,10 @@ include $(srctree)/$(ARCH_DIR)/Makefile-os-$(OS) KERNEL_DEFINES = $(strip -Derrno=kernel_errno -Dsigprocmask=kernel_sigprocmask \ -Dmktime=kernel_mktime $(ARCH_KERNEL_DEFINES)) KBUILD_CFLAGS += $(KERNEL_DEFINES) -KBUILD_CFLAGS += $(call cc-option,-fno-unit-at-a-time,) +# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use +# a lot more stack due to the lack of sharing of stacklots: +KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ + echo $(call cc-option,-fno-unit-at-a-time); fi ;) PHONY += linux -- cgit v0.10.2 From 4c6214c75a5aca5417156a47cd890b128c5f0637 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Thu, 1 May 2008 11:31:07 +0200 Subject: kbuild, suspend, x86: fix rebuild of wakeup.bin In kernel/acpi/realmode/Makefile use the 'always' variable to say that wakeup.bin should always be made. In acpi/Makefile we then do not need to specify the requested target and we avoid the message from make: `arch/x86/kernel/acpi/realmode/wakeup.bin' is up to date. Add wakeup.lds to list af targets to avoid rebuilding wakeup.bin - from Roland McGrath. Signed-off-by: Sam Ravnborg Cc: Rafael J. Wysocki Cc: Pavel Machek Cc: H. Peter Anvin Cc: Roland McGrath Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 7335959..fd5ca97 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -10,5 +10,5 @@ endif $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin $(obj)/realmode/wakeup.bin: FORCE - $(Q)$(MAKE) $(build)=$(obj)/realmode $@ + $(Q)$(MAKE) $(build)=$(obj)/realmode diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 0929008..1c31cc0 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile @@ -6,7 +6,8 @@ # for more details. # -targets := wakeup.bin wakeup.elf +always := wakeup.bin +targets := wakeup.elf wakeup.lds wakeup-y += wakeup.o wakemain.o video-mode.o copy.o @@ -48,7 +49,7 @@ LDFLAGS_wakeup.elf := -T CPPFLAGS_wakeup.lds += -P -C -$(obj)/wakeup.elf: $(src)/wakeup.lds $(WAKEUP_OBJS) FORCE +$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE $(call if_changed,ld) OBJCOPYFLAGS_wakeup.bin := -O binary -- cgit v0.10.2 From dbe55f4797712f86691a0ee0b5f508693c7310fe Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 22 Apr 2008 01:50:26 +0300 Subject: x86: make start_secondary() static start_secondary() needlessly became global. Signed-off-by: Adrian Bunk Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 84241a2..b78e4d4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -299,7 +299,7 @@ static void __cpuinit smp_callin(void) /* * Activate a secondary processor. 
*/ -void __cpuinit start_secondary(void *unused) +static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too -- cgit v0.10.2 From c5562faeaacf19e81a78ee37cc6b96ab1f3e68e4 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 22 Apr 2008 00:31:37 +0300 Subject: x86: make additional_cpus static This patch makes the needlessly global additional_cpus static. Signed-off-by: Adrian Bunk Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b78e4d4..6b087ab 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1306,7 +1306,7 @@ static void remove_siblinginfo(int cpu) cpu_clear(cpu, cpu_sibling_setup_map); } -int additional_cpus __initdata = -1; +static int additional_cpus __initdata = -1; static __init int setup_additional_cpus(char *s) { -- cgit v0.10.2 From e37ee42caadab46cec277546099fa2a6207fff0b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 May 2008 22:01:31 +0200 Subject: x86: es7000 build fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3e2c54d..404683b 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -794,6 +794,11 @@ void __init find_smp_config(void) ACPI-based MP Configuration -------------------------------------------------------------------------- */ +/* + * Keep this outside and initialized to 0, for !CONFIG_ACPI builds: + */ +int es7000_plat; + #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC @@ -909,8 +914,6 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) MP_intsrc_info(&intsrc); } -int es7000_plat; - void __init mp_config_acpi_legacy_irqs(void) { struct mpc_config_intsrc intsrc; -- cgit v0.10.2 From 163ea310b68bdde89b1ac633fbf8c0db290d3f86 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 3 May 2008 22:39:42 +0200 Subject: x86: remove dell reboot dmi quirk board name match http://bugzilla.kernel.org/show_bug.cgi?id=10547 Newer Dell OptiPlex 745s hang before rebooting after 'sudo reboot'. A patch for some versions of the OptiPlex was proposed here -- http://lkml.org/lkml/2007/6/5/59 -- and is included in 2.6.23 and later kernels, according to http://lxr.linux.no/linux+v2.6.23/arch/i386/kernel/reboot.c . However, the DMI_BOARD_NAME ("0WF810") is too restrictive. Newer OptiPlex machines have a DMI_BOARD_NAME of "0RF703". I therefore suggest adding another clause to reboot.c, similar to the one in the original patch, but matching a DMI_BOARD_NAME of "0RF703". On further inspection, it seems that there are other DMI_BOARD_NAMEs for this same machine. They seem to change from time to time, which means that the current code is fragile. Moreover, using bios reboot should not break non-SFF OptiPlex 745s, and so a reasonable fix is to simply drop the match on DMI_BOARD_NAME. 
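DMI quirk matching is substring-based per field, so dropping the board-name field simply widens the quirk to every OptiPlex 745 board revision while still requiring the vendor and product strings to match. A reduced user-space model of that matching (editorial sketch, not kernel code):

	#include <stdio.h>
	#include <string.h>

	struct quirk { const char *vendor, *product, *board; };

	/* every non-NULL field must be a substring of the firmware string */
	static int quirk_matches(const struct quirk *q, const char *vendor,
				 const char *product, const char *board)
	{
		if (q->vendor  && !strstr(vendor,  q->vendor))  return 0;
		if (q->product && !strstr(product, q->product)) return 0;
		if (q->board   && !strstr(board,   q->board))   return 0;
		return 1;
	}

	int main(void)
	{
		/* after the patch: no board-name constraint */
		struct quirk optiplex745 = { "Dell Inc.", "OptiPlex 745", NULL };

		printf("%d\n", quirk_matches(&optiplex745, "Dell Inc.",
					     "OptiPlex 745", "0WF810"));	/* 1 */
		printf("%d\n", quirk_matches(&optiplex745, "Dell Inc.",
					     "OptiPlex 745", "0RF703"));	/* 1 */
		printf("%d\n", quirk_matches(&optiplex745, "Dell Inc.",
					     "OptiPlex 755", "0RF703"));	/* 0 */
		return 0;
	}
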
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 07c6d42..f6be7d5 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -149,7 +149,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), - DMI_MATCH(DMI_BOARD_NAME, "0WF810"), }, }, { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/ -- cgit v0.10.2 From ecb783eae1372d69a53d406e1bdba8284e4bafcc Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 3 May 2008 14:18:01 +0400 Subject: x86: vdso ELF handling - use SELFMAG instead of numeric constant Signed-off-by: Cyrill Gorcunov Cc: akpm@linux-foundation.org Cc: hpa@zytor.com Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 4dceeb1..cf058fe 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -162,7 +162,7 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) Elf32_Shdr *shdr; int i; - BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 || !elf_check_arch_ia32(ehdr) || ehdr->e_type != ET_DYN); -- cgit v0.10.2 From 8bd1796dedd50abd7553afbe6113bd97cc88390f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 3 May 2008 14:18:03 +0400 Subject: x86: relocs ELF handling - use SELFMAG instead of numeric constant Signed-off-by: Cyrill Gorcunov Cc: akpm@linux-foundation.org Cc: hpa@zytor.com Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index d01ea42..edaadea 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -191,7 +191,7 @@ static void read_ehdr(FILE *fp) die("Cannot read ELF header: %s\n", strerror(errno)); } - if (memcmp(ehdr.e_ident, ELFMAG, 4) != 0) { + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) { die("No ELF magic\n"); } if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) { -- cgit v0.10.2 From afaafe50ee15c59010f19273ebfb6c44f0962d7c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 2 May 2008 21:14:20 +1000 Subject: x86: fix up bootparam.h for userspace inclusion commit 8b664aa66e824a0ddf4ec56d41fa0cf7bb374de6 (x86, boot: add linked list of struct setup_data) put a new struct in bootparam.h, but didn't use the userspace-safe types. Signed-off-by: Rusty Russell Cc: Huang Ying Acked-by: H. Peter Anvin Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h index e865990..f62f473 100644 --- a/include/asm-x86/bootparam.h +++ b/include/asm-x86/bootparam.h @@ -14,10 +14,10 @@ /* extensible setup data list node */ struct setup_data { - u64 next; - u32 type; - u32 len; - u8 data[0]; + __u64 next; + __u32 type; + __u32 len; + __u8 data[0]; }; struct setup_header { -- cgit v0.10.2 From 7b04fa014c11e6415da8b5a7999dbd201abad53c Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 2 May 2008 13:32:32 -0700 Subject: x86: video/fbdev.c: add MODULE_LICENSE Add the missing MODULE_LICENSE("GPL"). 
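Without the tag, loading the module marks the kernel as tainted and GPL-only exported symbols are unavailable to it. The tag conventionally sits with the other MODULE_* metadata at the end of the file; a minimal module skeleton for orientation (illustrative names, not from the tree):

	#include <linux/module.h>
	#include <linux/kernel.h>
	#include <linux/init.h>

	static int __init example_init(void)
	{
		printk(KERN_INFO "example: loaded\n");
		return 0;
	}

	static void __exit example_exit(void)
	{
		printk(KERN_INFO "example: unloaded\n");
	}

	module_init(example_init);
	module_exit(example_exit);

	MODULE_LICENSE("GPL");
	MODULE_AUTHOR("Example Author");		/* illustrative */
	MODULE_DESCRIPTION("Minimal module skeleton");
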
Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c index 4db42bf..6952768 100644 --- a/arch/x86/video/fbdev.c +++ b/arch/x86/video/fbdev.c @@ -1,5 +1,4 @@ /* - * * Copyright (C) 2007 Antonino Daplas * * This file is subject to the terms and conditions of the GNU General Public @@ -29,3 +28,4 @@ int fb_is_primary_device(struct fb_info *info) return retval; } EXPORT_SYMBOL(fb_is_primary_device); +MODULE_LICENSE("GPL"); -- cgit v0.10.2 From e26a28d190304d910ee49b81cbfe6d9241f56e86 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 3 May 2008 23:49:59 +0200 Subject: x86: olpc build fix CONFIG_OLPC needs to depend on MGEODE_LX Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c3f8809..845ea2b2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1661,6 +1661,7 @@ config GEODE_MFGPT_TIMER config OLPC bool "One Laptop Per Child support" + depends on MGEODE_LX default n help Add support for detecting the unique features of the OLPC -- cgit v0.10.2 From 62179849b40aded9e727cca5006627a1c4d6446e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 2 May 2008 13:32:35 -0700 Subject: x86: fix setup printk format warning Fix x86 setup printk format warming: next-20080430/arch/x86/kernel/setup.c:172: warning: format '%lu' expects type 'long unsigned int', but argument 2 has type 'ssize_t' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c0c68c1..cc6f5eb 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -95,7 +95,7 @@ void __init setup_per_cpu_areas(void) /* Copy section for each CPU (we discard the original) */ size = PERCPU_ENOUGH_ROOM; - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); for_each_possible_cpu(i) { -- cgit v0.10.2 From c4143a83031aef7ba87a62cf654d6d8fb4d8e76e Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 4 May 2008 21:03:20 +0200 Subject: kconfig: fix MAC OS X warnings in menuconfig Signed-off-by: Sam Ravnborg Acked-by: Timur Tabi diff --git a/scripts/kconfig/lkc.h b/scripts/kconfig/lkc.h index 4bc68f2..96521cb 100644 --- a/scripts/kconfig/lkc.h +++ b/scripts/kconfig/lkc.h @@ -11,9 +11,9 @@ #ifndef KBUILD_NO_NLS # include #else -# define gettext(Msgid) ((const char *) (Msgid)) -# define textdomain(Domainname) ((const char *) (Domainname)) -# define bindtextdomain(Domainname, Dirname) ((const char *) (Dirname)) +static inline const char *gettext(const char *txt) { return txt; } +static inline void textdomain(const char *domainname) {} +static inline void bindtextdomain(const char *name, const char *dir) {} #endif #ifdef __cplusplus diff --git a/scripts/kconfig/mconf.c b/scripts/kconfig/mconf.c index 734cf4f..6841e95 100644 --- a/scripts/kconfig/mconf.c +++ b/scripts/kconfig/mconf.c @@ -773,7 +773,7 @@ static void conf_string(struct menu *menu) while (1) { int res; - char *heading; + const char *heading; switch (sym_get_type(menu->sym)) { case S_INT: @@ -925,3 +925,4 @@ int main(int ac, char **av) return 0; } + -- cgit v0.10.2 From 2cb1e1257fb4d4d52c97e763ab262c2295aea4a8 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 4 May 2008 02:15:35 +0300 Subject: kconfig-language.txt: remove bogus hint For the use case the hint 
describe a simple dependency is enough. Signed-off-by: Adrian Bunk Acked-by: Randy Dunlap diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index 00b950d..c412c24 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt @@ -377,27 +377,3 @@ config FOO limits FOO to module (=m) or disabled (=n). - -Build limited by a third config symbol which may be =y or =m -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A common idiom that we see (and sometimes have problems with) is this: - -When option C in B (module or subsystem) uses interfaces from A (module -or subsystem), and both A and B are tristate (could be =y or =m if they -were independent of each other, but they aren't), then we need to limit -C such that it cannot be built statically if A is built as a loadable -module. (C already depends on B, so there is no dependency issue to -take care of here.) - -If A is linked statically into the kernel image, C can be built -statically or as loadable module(s). However, if A is built as loadable -module(s), then C must be restricted to loadable module(s) also. This -can be expressed in kconfig language as: - -config C - depends on A = y || A = B - -or for real examples, use this command in a kernel tree: - -$ find . -name Kconfig\* | xargs grep -ns "depends on.*=.*||.*=" | grep -v orig - -- cgit v0.10.2 From 36bbfe2f097d5e09e8e9c83f55264bd538a0ebe1 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 3 May 2008 23:51:03 +0300 Subject: fix asm-alpha/types.h breakage This patch fixes the following compile error on alpha caused by commit 3726c23df8e4d95b6f2b335dfa90e3f4850a8a00 (alpha: types: use for the alpha architecture): <-- snip --> ... CC arch/alpha/kernel/asm-offsets.s In file included from include2/asm/topology.h:6, from /home/bunk/linux/kernel-2.6/git/linux-2.6/include/linux/topology.h:34, from /home/bunk/linux/kernel-2.6/git/linux-2.6/include/linux/mmzone.h:683, from /home/bunk/linux/kernel-2.6/git/linux-2.6/include/linux/gfp.h:4, from /home/bunk/linux/kernel-2.6/git/linux-2.6/include/linux/slab.h:12, from /home/bunk/linux/kernel-2.6/git/linux-2.6/include/linux/percpu.h:5, from /home/bunk/linux/kernel-2.6/git/linux-2.6/include/linux/rcupdate.h:39, from /home/bunk/linux/kernel-2.6/git/linux-2.6/include/linux/pid.h:4, from /home/bunk/linux/kernel-2.6/git/linux-2.6/include/linux/sched.h:74, from /home/bunk/linux/kernel-2.6/git/linux-2.6/arch/alpha/kernel/asm-offsets.c:9: include2/asm/machvec.h:44: error: expected declaration specifiers or '...' before 'dma_addr_t' include2/asm/machvec.h:44: error: expected declaration specifiers or '...' before 'dma_addr_t' In file included from /home/bunk/linux/kernel-2.6/git/linux-2.6/arch/alpha/kernel/asm-offsets.c:12: include2/asm/io.h:94: warning: type defaults to 'int' in declaration of 'dma_addr_t' include2/asm/io.h:94: warning: variable 'dma_addr_t' declared 'inline' include2/asm/io.h:94: error: expected ',' or ';' before 'isa_page_to_bus' make[2]: *** [arch/alpha/kernel/asm-offsets.s] Error 1 <-- snip --> Cc: Richard Henderson Cc: Ivan Kokshaysky Signed-off-by: H. 
Peter Anvin diff --git a/include/asm-alpha/types.h b/include/asm-alpha/types.h index a9e34ca4..c154135 100644 --- a/include/asm-alpha/types.h +++ b/include/asm-alpha/types.h @@ -23,5 +23,11 @@ typedef unsigned int umode_t; #define BITS_PER_LONG 64 +#ifndef __ASSEMBLY__ + +typedef u64 dma_addr_t; +typedef u64 dma64_addr_t; + +#endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ALPHA_TYPES_H */ -- cgit v0.10.2 From 2961b423037da60a8cb230963ee0d8c04473d73b Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 3 May 2008 22:26:17 +0300 Subject: fix asm-mips/types.h syntax error This patch fixes the following compile error caused by commit 23cf11ddb5099f8c7f7cb3eb154bff0faf31cae9 (mips: types: use for the mips architecture): <-- snip --> ... CC kernel/bounds.s In file included from /home/bunk/linux/kernel-2.6/git/linux-2.6/include/linux/types.h:12, from /home/bunk/linux/kernel-2.6/git/linux-2.6/include/linux/page-flags.h:8, from /home/bunk/linux/kernel-2.6/git/linux-2.6/kernel/bounds.c:9: include2/asm/types.h:56:2: error: #endif without #if make[2]: *** [kernel/bounds.s] Error 1 <-- snip --> Signed-off-by: Adrian Bunk Cc: Ralf Baechle Signed-off-by: H. Peter Anvin diff --git a/include/asm-mips/types.h b/include/asm-mips/types.h index 7a2ee4f..bcbb8d6 100644 --- a/include/asm-mips/types.h +++ b/include/asm-mips/types.h @@ -19,8 +19,6 @@ typedef unsigned short umode_t; -#endif - #endif /* __ASSEMBLY__ */ /* -- cgit v0.10.2 From 826e4506a0acb6487910a5ebafe839f708a00e1c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 4 May 2008 17:04:16 -0700 Subject: Make forced module loading optional The kernel module loader used to be much too happy to allow loading of modules for the wrong kernel version by default. For example, if you had MODVERSIONS enabled, but tried to load a module with no version info, it would happily load it and taint the kernel - whether it was likely to actually work or not! Generally, such forced module loading should be considered a really really bad idea, so make it conditional on a new config option (MODULE_FORCE_LOAD), and make it default to off. If somebody really wants to force module loads, that's their problem, but we should not encourage it. Especially as it happened to me by mistake (ie regular unversioned Fedora modules getting loaded) causing lots of strange behavior. Signed-off-by: Linus Torvalds diff --git a/init/Kconfig b/init/Kconfig index 6a44def..f0e62e5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -825,6 +825,15 @@ menuconfig MODULES If unsure, say Y. +config MODULE_FORCE_LOAD + bool "Forced module loading" + depends on MODULES + default n + help + This option allows loading of modules even if that would set the + 'F' (forced) taint, due to lack of version info. Which is + usually a really bad idea. 
+ config MODULE_UNLOAD bool "Module unloading" depends on MODULES diff --git a/kernel/module.c b/kernel/module.c index 8674a39..8e4528c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -890,6 +890,19 @@ static struct module_attribute *modinfo_attrs[] = { static const char vermagic[] = VERMAGIC_STRING; +static int try_to_force_load(struct module *mod, const char *symname) +{ +#ifdef CONFIG_MODULE_FORCE_LOAD + if (!(tainted & TAINT_FORCED_MODULE)) + printk("%s: no version for \"%s\" found: kernel tainted.\n", + mod->name, symname); + add_taint_module(mod, TAINT_FORCED_MODULE); + return 0; +#else + return -ENOEXEC; +#endif +} + #ifdef CONFIG_MODVERSIONS static int check_version(Elf_Shdr *sechdrs, unsigned int versindex, @@ -914,18 +927,18 @@ static int check_version(Elf_Shdr *sechdrs, if (versions[i].crc == *crc) return 1; - printk("%s: disagrees about version of symbol %s\n", - mod->name, symname); DEBUGP("Found checksum %lX vs module %lX\n", *crc, versions[i].crc); - return 0; + goto bad_version; } - /* Not in module's version table. OK, but that taints the kernel. */ - if (!(tainted & TAINT_FORCED_MODULE)) - printk("%s: no version for \"%s\" found: kernel tainted.\n", - mod->name, symname); - add_taint_module(mod, TAINT_FORCED_MODULE); - return 1; + + if (!try_to_force_load(mod, symname)) + return 1; + +bad_version: + printk("%s: disagrees about version of symbol %s\n", + mod->name, symname); + return 0; } static inline int check_modstruct_version(Elf_Shdr *sechdrs, @@ -1853,9 +1866,9 @@ static struct module *load_module(void __user *umod, modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { - add_taint_module(mod, TAINT_FORCED_MODULE); - printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", - mod->name); + err = try_to_force_load(mod, "magic"); + if (err) + goto free_hdr; } else if (!same_magic(modmagic, vermagic)) { printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", mod->name, modmagic, vermagic); @@ -2006,9 +2019,10 @@ static struct module *load_module(void __user *umod, (mod->num_gpl_future_syms && !gplfuturecrcindex) || (mod->num_unused_syms && !unusedcrcindex) || (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { - printk(KERN_WARNING "%s: No versions for exported symbols." - " Tainting kernel.\n", mod->name); - add_taint_module(mod, TAINT_FORCED_MODULE); + printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); + err = try_to_force_load(mod, "nocrc"); + if (err) + goto cleanup; } #endif markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); -- cgit v0.10.2 From e73b65f1db7e3baa3db43951476b7d2d2381ba35 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 4 May 2008 09:29:43 +0200 Subject: sysfs: build fix x86.git testing found the following build failure on v2.6.26-rc1: In file included from include/linux/kobject.h:22, from include/linux/module.h:17, from include/linux/crypto.h:22, from arch/x86/kernel/asm-offsets_32.c:8, from arch/x86/kernel/asm-offsets.c:3: include/linux/sysfs.h:201: error: redefinition of 'sysfs_update_group' include/linux/sysfs.h:195: error: previous definition of 'sysfs_update_group' was here make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1 make: *** [prepare0] Error 2 with the following config: http://redhat.com/~mingo/misc/config-Sun_May__4_07_09_30_CEST_2008.bad the reason for the build failure is the duplicate definition of the sysfs_update_group() inline function in include/linux/sysfs.h. 
The duplication was a merge error: it was added via -mm by commit v2.6.25-7262-g2850699, "sysfs: sysfs_update_group stub for CONFIG_SYSFS=n" a day before v2.6.26-rc1, but a day before that the same commit was already merged upstream via the sysfs tree, with commit v2.6.25-7211-g1cbfb7a. Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 27bad59..7858eac 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -196,12 +196,6 @@ static inline int sysfs_update_group(struct kobject *kobj, return 0; } -static inline int sysfs_update_group(struct kobject *kobj, - const struct attribute_group *grp) -{ - return 0; -} - static inline void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { -- cgit v0.10.2 From eb28062f131b0a1da32b2554fd819af5221040de Mon Sep 17 00:00:00 2001 From: Bryan Wu Date: Sun, 4 May 2008 23:12:55 +0800 Subject: task_nommu: fix compile failing bug because of spilt file.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CC fs/proc/task_nommu.o fs/proc/task_nommu.c: In function ‘task_mem’: fs/proc/task_nommu.c:55: error: dereferencing pointer to incomplete type make[2]: *** [fs/proc/task_nommu.o] Error 1 make[1]: *** [fs/proc] Error 2 make: *** [fs] Error 2 Signed-off-by: Bryan Wu Signed-off-by: Linus Torvalds diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 4b733f1..4b4f9cc 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -1,6 +1,7 @@ #include #include +#include #include #include #include -- cgit v0.10.2 From c37aa90b0458d87342e0bb083f6bf7d113220d09 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Sun, 4 May 2008 17:57:29 -0700 Subject: bridge: Net device leak in br_add_bridge(). In case the register_netdevice() call fails the device is leaked, since the out: label is just rtnl_unlock()+return. Free the device. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 77a981a..bff0f5b 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -280,8 +280,10 @@ int br_add_bridge(const char *name) } ret = register_netdevice(dev); - if (ret) + if (ret) { + free_netdev(dev); goto out; + } ret = br_sysfs_addbr(dev); if (ret) -- cgit v0.10.2 From e340a90e6e07bba6e6b3fc39dd5fa76f95579d7c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Sun, 4 May 2008 17:58:07 -0700 Subject: bridge: Consolidate error paths in br_add_bridge(). This actually had to be merged with the patch #1, but I decided not to mix two changes in one patch. There are already two calls to free_netdev() in there, so merge them into one. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index bff0f5b..c2397f5 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -273,17 +273,13 @@ int br_add_bridge(const char *name) rtnl_lock(); if (strchr(dev->name, '%')) { ret = dev_alloc_name(dev, dev->name); - if (ret < 0) { - free_netdev(dev); - goto out; - } + if (ret < 0) + goto out_free; } ret = register_netdevice(dev); - if (ret) { - free_netdev(dev); - goto out; - } + if (ret) + goto out_free; ret = br_sysfs_addbr(dev); if (ret) @@ -291,6 +287,10 @@ int br_add_bridge(const char *name) out: rtnl_unlock(); return ret; + +out_free: + free_netdev(dev); + goto out; } int br_del_bridge(const char *name) -- cgit v0.10.2 From 339a7c41c913035bf58579f6e47b4ba29da83795 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Sun, 4 May 2008 17:59:30 -0700 Subject: mac80211: Do not free net device after it is unregistered. The error path in ieee80211_register_hw() may call the unregister_netdev() and right after it - the free_netdev(), which is wrong, since the unregister releases the device itself. So the proposed fix is to NULL the local->mdev after unregister is done and check this before calling free_netdev(). I checked - no code uses the local->mdev after unregister in this error path (but even if some did this would be a BUG). Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 9ad4e36..915afad 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1766,6 +1766,7 @@ fail_wep: fail_rate: ieee80211_debugfs_remove_netdev(IEEE80211_DEV_TO_SUB_IF(local->mdev)); unregister_netdevice(local->mdev); + local->mdev = NULL; fail_dev: rtnl_unlock(); sta_info_stop(local); @@ -1773,8 +1774,10 @@ fail_sta_info: debugfs_hw_del(local); destroy_workqueue(local->hw.workqueue); fail_workqueue: - ieee80211_if_free(local->mdev); - local->mdev = NULL; + if (local->mdev != NULL) { + ieee80211_if_free(local->mdev); + local->mdev = NULL; + } fail_mdev_alloc: wiphy_unregister(local->hw.wiphy); return result; -- cgit v0.10.2 From 65e4113684e50cee75357ce10dc9026b0929e4e9 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Sun, 4 May 2008 18:00:05 -0700 Subject: atm: Do not free already unregistered net device. Both br2684_push and br2684_exit do so, but unregister_netdev() releases the device itself. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 1b22806..3a74ff8 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -350,7 +350,6 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) list_del(&brdev->br2684_devs); read_unlock(&devs_lock); unregister_netdev(net_dev); - free_netdev(net_dev); } return; } @@ -771,7 +770,6 @@ static void __exit br2684_exit(void) list_del(&brdev->br2684_devs); unregister_netdev(net_dev); - free_netdev(net_dev); } } -- cgit v0.10.2 From 1e0ba0060ffcee2e766ec3159196235b1a2a0ff3 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Sun, 4 May 2008 18:00:36 -0700 Subject: atm: Bad locking on br2684_devs modifications. The list_del happens under read-locked devs_lock. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 3a74ff8..13858e2 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -346,9 +346,9 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) /* skb==NULL means VCC is being destroyed */ br2684_close_vcc(brvcc); if (list_empty(&brdev->brvccs)) { - read_lock(&devs_lock); + write_lock_irq(&devs_lock); list_del(&brdev->br2684_devs); - read_unlock(&devs_lock); + write_unlock_irq(&devs_lock); unregister_netdev(net_dev); } return; -- cgit v0.10.2 From 0bbeafd0118fc3ae54990064760c889d41dc21d6 Mon Sep 17 00:00:00 2001 From: Satoru SATOH Date: Sun, 4 May 2008 22:12:43 -0700 Subject: ip: Make use of the inline function dst_metric_locked() Signed-off-by: Satoru SATOH Signed-off-by: David S. Miller diff --git a/include/net/ip.h b/include/net/ip.h index 6d7bcd5..3b40bc2 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -210,7 +210,7 @@ int ip_dont_fragment(struct sock *sk, struct dst_entry *dst) { return (inet_sk(sk)->pmtudisc == IP_PMTUDISC_DO || (inet_sk(sk)->pmtudisc == IP_PMTUDISC_WANT && - !(dst_metric(dst, RTAX_LOCK)&(1<u.dst.metrics)); if (fi->fib_mtu == 0) { rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; - if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) && + if (dst_metric_locked(&rt->u.dst, RTAX_MTU) && rt->rt_gateway != rt->rt_dst && rt->u.dst.dev->mtu > 576) rt->u.dst.metrics[RTAX_MTU-1] = 576; -- cgit v0.10.2 From 5ffc02a158997b1eb91ade8d02bcf521ff79a218 Mon Sep 17 00:00:00 2001 From: Satoru SATOH Date: Sun, 4 May 2008 22:14:42 -0700 Subject: ip: Use inline function dst_metric() instead of direct access to dst->metric[] There are functions to refer to the value of dst->metric[THE_METRIC-1] directly without use of a inline function "dst_metric" defined in net/dst.h. The following patch changes them to use the inline function consistently. Signed-off-by: Satoru SATOH Signed-off-by: David S. 
Miller diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 2f665a5..f50e88b 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -235,14 +235,14 @@ static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu) else min_mtu -= 21; - if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= min_mtu) { + if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= min_mtu) { if (!(dst_metric_locked(dst, RTAX_MTU))) { dst->metrics[RTAX_MTU-1] = mtu; dst_set_expires(dst, dn_rt_mtu_expires); } if (!(dst_metric_locked(dst, RTAX_ADVMSS))) { u32 mss = mtu - DN_MAX_NSP_DATA_HEADER; - if (dst->metrics[RTAX_ADVMSS-1] > mss) + if (dst_metric(dst, RTAX_ADVMSS) > mss) dst->metrics[RTAX_ADVMSS-1] = mss; } } @@ -805,12 +805,12 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res) rt->u.dst.neighbour = n; } - if (rt->u.dst.metrics[RTAX_MTU-1] == 0 || - rt->u.dst.metrics[RTAX_MTU-1] > rt->u.dst.dev->mtu) + if (dst_metric(&rt->u.dst, RTAX_MTU) == 0 || + dst_metric(&rt->u.dst, RTAX_MTU) > rt->u.dst.dev->mtu) rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->u.dst)); - if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0 || - rt->u.dst.metrics[RTAX_ADVMSS-1] > mss) + if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0 || + dst_metric(&rt->u.dst, RTAX_ADVMSS) > mss) rt->u.dst.metrics[RTAX_ADVMSS-1] = mss; return 0; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9084055..92f90ae 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1468,14 +1468,14 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, /* BSD 4.2 compatibility hack :-( */ if (mtu == 0 && - old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] && + old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) && old_mtu >= 68 + (iph->ihl << 2)) old_mtu -= iph->ihl << 2; mtu = guess_mtu(old_mtu); } - if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) { - if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { + if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) { + if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) { dst_confirm(&rth->u.dst); if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; @@ -1497,7 +1497,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) { - if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 && + if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 && !(dst_metric_locked(dst, RTAX_MTU))) { if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; @@ -1624,14 +1624,14 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) } else rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; - if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) + if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0) rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; - if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU) + if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU) rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; - if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0) + if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0) rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, ip_rt_min_advmss); - if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40) + if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40) rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; #ifdef CONFIG_NET_CLS_ROUTE diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eda4f4a..8ac15a6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -605,7 +606,7 @@ static u32 tcp_rto_min(struct 
sock *sk) u32 rto_min = TCP_RTO_MIN; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) - rto_min = dst->metrics[RTAX_RTO_MIN - 1]; + rto_min = dst_metric(dst, RTAX_RTO_MIN); return rto_min; } @@ -769,7 +770,7 @@ void tcp_update_metrics(struct sock *sk) dst->metrics[RTAX_RTTVAR - 1] = m; else dst->metrics[RTAX_RTTVAR-1] -= - (dst->metrics[RTAX_RTTVAR-1] - m)>>2; + (dst_metric(dst, RTAX_RTTVAR) - m)>>2; } if (tp->snd_ssthresh >= 0xFFFF) { @@ -788,21 +789,21 @@ void tcp_update_metrics(struct sock *sk) dst->metrics[RTAX_SSTHRESH-1] = max(tp->snd_cwnd >> 1, tp->snd_ssthresh); if (!dst_metric_locked(dst, RTAX_CWND)) - dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1; + dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1; } else { /* Else slow start did not finish, cwnd is non-sense, ssthresh may be also invalid. */ if (!dst_metric_locked(dst, RTAX_CWND)) - dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1; - if (dst->metrics[RTAX_SSTHRESH-1] && + dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1; + if (dst_metric(dst, RTAX_SSTHRESH) && !dst_metric_locked(dst, RTAX_SSTHRESH) && - tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1]) + tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; } if (!dst_metric_locked(dst, RTAX_REORDERING)) { - if (dst->metrics[RTAX_REORDERING-1] < tp->reordering && + if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && tp->reordering != sysctl_tcp_reordering) dst->metrics[RTAX_REORDERING-1] = tp->reordering; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a493ad9..12bba08 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1243,11 +1243,11 @@ install_route: } } - if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) + if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0) rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1; - if (!rt->u.dst.metrics[RTAX_MTU-1]) + if (!dst_metric(&rt->u.dst, RTAX_MTU)) rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev); - if (!rt->u.dst.metrics[RTAX_ADVMSS-1]) + if (!dst_metric(&rt->u.dst, RTAX_ADVMSS)) rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst)); rt->u.dst.dev = dev; rt->rt6i_idev = idev; -- cgit v0.10.2 From 0885cb5653ff82c8d322df1b8a95843dc5f5486b Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Sat, 3 May 2008 06:34:01 +1000 Subject: [POWERPC] macintosh: therm_pm72: driver_lock semaphore to mutex Signed-off-by: Daniel Walker Signed-off-by: Andrew Morton Acked-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c index 1e0a69a..ddfb426 100644 --- a/drivers/macintosh/therm_pm72.c +++ b/drivers/macintosh/therm_pm72.c @@ -122,6 +122,7 @@ #include #include #include +#include #include #include #include @@ -169,7 +170,7 @@ static int rackmac; static s32 dimm_output_clamp; static int fcu_rpm_shift; static int fcu_tickle_ticks; -static DECLARE_MUTEX(driver_lock); +static DEFINE_MUTEX(driver_lock); /* * We have 3 types of CPU PID control. 
One is "split" old style control @@ -729,9 +730,9 @@ static void fetch_cpu_pumps_minmax(void) static ssize_t show_##name(struct device *dev, struct device_attribute *attr, char *buf) \ { \ ssize_t r; \ - down(&driver_lock); \ + mutex_lock(&driver_lock); \ r = sprintf(buf, "%d.%03d", FIX32TOPRINT(data)); \ - up(&driver_lock); \ + mutex_unlock(&driver_lock); \ return r; \ } #define BUILD_SHOW_FUNC_INT(name, data) \ @@ -1803,11 +1804,11 @@ static int main_control_loop(void *x) { DBG("main_control_loop started\n"); - down(&driver_lock); + mutex_lock(&driver_lock); if (start_fcu() < 0) { printk(KERN_ERR "kfand: failed to start FCU\n"); - up(&driver_lock); + mutex_unlock(&driver_lock); goto out; } @@ -1822,14 +1823,14 @@ static int main_control_loop(void *x) fcu_tickle_ticks = FCU_TICKLE_TICKS; - up(&driver_lock); + mutex_unlock(&driver_lock); while (state == state_attached) { unsigned long elapsed, start; start = jiffies; - down(&driver_lock); + mutex_lock(&driver_lock); /* Tickle the FCU just in case */ if (--fcu_tickle_ticks < 0) { @@ -1861,7 +1862,7 @@ static int main_control_loop(void *x) do_monitor_slots(&slots_state); else do_monitor_drives(&drives_state); - up(&driver_lock); + mutex_unlock(&driver_lock); if (critical_state == 1) { printk(KERN_WARNING "Temperature control detected a critical condition\n"); @@ -2019,13 +2020,13 @@ static void detach_fcu(void) */ static int therm_pm72_attach(struct i2c_adapter *adapter) { - down(&driver_lock); + mutex_lock(&driver_lock); /* Check state */ if (state == state_detached) state = state_attaching; if (state != state_attaching) { - up(&driver_lock); + mutex_unlock(&driver_lock); return 0; } @@ -2054,7 +2055,7 @@ static int therm_pm72_attach(struct i2c_adapter *adapter) state = state_attached; start_control_loops(); } - up(&driver_lock); + mutex_unlock(&driver_lock); return 0; } @@ -2065,16 +2066,16 @@ static int therm_pm72_attach(struct i2c_adapter *adapter) */ static int therm_pm72_detach(struct i2c_adapter *adapter) { - down(&driver_lock); + mutex_lock(&driver_lock); if (state != state_detached) state = state_detaching; /* Stop control loops if any */ DBG("stopping control loops\n"); - up(&driver_lock); + mutex_unlock(&driver_lock); stop_control_loops(); - down(&driver_lock); + mutex_lock(&driver_lock); if (u3_0 != NULL && !strcmp(adapter->name, "u3 0")) { DBG("lost U3-0, disposing control loops\n"); @@ -2090,7 +2091,7 @@ static int therm_pm72_detach(struct i2c_adapter *adapter) if (u3_0 == NULL && u3_1 == NULL) state = state_detached; - up(&driver_lock); + mutex_unlock(&driver_lock); return 0; } -- cgit v0.10.2 From 56783c5e4dd32ca370ad0bdf3a9c6c1aaee94726 Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Sat, 3 May 2008 06:34:02 +1000 Subject: [POWERPC] macintosh: windfarm_smu_sat: semaphore to mutex Signed-off-by: Daniel Walker Signed-off-by: Andrew Morton Acked-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras diff --git a/drivers/macintosh/windfarm_smu_sat.c b/drivers/macintosh/windfarm_smu_sat.c index 797918d..7f2be4b 100644 --- a/drivers/macintosh/windfarm_smu_sat.c +++ b/drivers/macintosh/windfarm_smu_sat.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -36,7 +36,7 @@ struct wf_sat { int nr; atomic_t refcnt; - struct semaphore mutex; + struct mutex mutex; unsigned long last_read; /* jiffies when cache last updated */ u8 cache[16]; struct i2c_client i2c; @@ -163,7 +163,7 @@ static int wf_sat_get(struct wf_sensor *sr, s32 *value) if (sat->i2c.adapter == NULL) return -ENODEV; - 
down(&sat->mutex); + mutex_lock(&sat->mutex); if (time_after(jiffies, (sat->last_read + MAX_AGE))) { err = wf_sat_read_cache(sat); if (err) @@ -182,7 +182,7 @@ static int wf_sat_get(struct wf_sensor *sr, s32 *value) err = 0; fail: - up(&sat->mutex); + mutex_unlock(&sat->mutex); return err; } @@ -233,7 +233,7 @@ static void wf_sat_create(struct i2c_adapter *adapter, struct device_node *dev) sat->nr = -1; sat->node = of_node_get(dev); atomic_set(&sat->refcnt, 0); - init_MUTEX(&sat->mutex); + mutex_init(&sat->mutex); sat->i2c.addr = (addr >> 1) & 0x7f; sat->i2c.adapter = adapter; sat->i2c.driver = &wf_sat_driver; -- cgit v0.10.2 From af3ce514ade2fd0e18c5d078d138a6c1137a33df Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Sat, 3 May 2008 06:34:03 +1000 Subject: [POWERPC] macintosh: ADB driver: adb_handler_sem semaphore to mutex Signed-off-by: Daniel Walker Signed-off-by: Andrew Morton Acked-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index 2097820..b8b9e44 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_PPC @@ -102,7 +102,7 @@ static struct adb_handler { } adb_handler[16]; /* - * The adb_handler_sem mutex protects all accesses to the original_address + * The adb_handler_mutex mutex protects all accesses to the original_address * and handler_id fields of adb_handler[i] for all i, and changes to the * handler field. * Accesses to the handler field are protected by the adb_handler_lock @@ -110,7 +110,7 @@ static struct adb_handler { * time adb_unregister returns, we know that the old handler isn't being * called. */ -static DECLARE_MUTEX(adb_handler_sem); +static DEFINE_MUTEX(adb_handler_mutex); static DEFINE_RWLOCK(adb_handler_lock); #if 0 @@ -355,7 +355,7 @@ do_adb_reset_bus(void) msleep(500); } - down(&adb_handler_sem); + mutex_lock(&adb_handler_mutex); write_lock_irq(&adb_handler_lock); memset(adb_handler, 0, sizeof(adb_handler)); write_unlock_irq(&adb_handler_lock); @@ -376,7 +376,7 @@ do_adb_reset_bus(void) if (adb_controller->autopoll) adb_controller->autopoll(autopoll_devs); } - up(&adb_handler_sem); + mutex_unlock(&adb_handler_mutex); blocking_notifier_call_chain(&adb_client_list, ADB_MSG_POST_RESET, NULL); @@ -454,7 +454,7 @@ adb_register(int default_id, int handler_id, struct adb_ids *ids, { int i; - down(&adb_handler_sem); + mutex_lock(&adb_handler_mutex); ids->nids = 0; for (i = 1; i < 16; i++) { if ((adb_handler[i].original_address == default_id) && @@ -472,7 +472,7 @@ adb_register(int default_id, int handler_id, struct adb_ids *ids, ids->id[ids->nids++] = i; } } - up(&adb_handler_sem); + mutex_unlock(&adb_handler_mutex); return ids->nids; } @@ -481,7 +481,7 @@ adb_unregister(int index) { int ret = -ENODEV; - down(&adb_handler_sem); + mutex_lock(&adb_handler_mutex); write_lock_irq(&adb_handler_lock); if (adb_handler[index].handler) { while(adb_handler[index].busy) { @@ -493,7 +493,7 @@ adb_unregister(int index) adb_handler[index].handler = NULL; } write_unlock_irq(&adb_handler_lock); - up(&adb_handler_sem); + mutex_unlock(&adb_handler_mutex); return ret; } @@ -557,19 +557,19 @@ adb_try_handler_change(int address, int new_id) { int ret; - down(&adb_handler_sem); + mutex_lock(&adb_handler_mutex); ret = try_handler_change(address, new_id); - up(&adb_handler_sem); + mutex_unlock(&adb_handler_mutex); return ret; } int adb_get_infos(int address, int *original_address, int *handler_id) { - 
down(&adb_handler_sem); + mutex_lock(&adb_handler_mutex); *original_address = adb_handler[address].original_address; *handler_id = adb_handler[address].handler_id; - up(&adb_handler_sem); + mutex_unlock(&adb_handler_mutex); return (*original_address != 0); } @@ -628,10 +628,10 @@ do_adb_query(struct adb_request *req) case ADB_QUERY_GETDEVINFO: if (req->nbytes < 3) break; - down(&adb_handler_sem); + mutex_lock(&adb_handler_mutex); req->reply[0] = adb_handler[req->data[2]].original_address; req->reply[1] = adb_handler[req->data[2]].handler_id; - up(&adb_handler_sem); + mutex_unlock(&adb_handler_mutex); req->complete = 1; req->reply_len = 2; adb_write_done(req); -- cgit v0.10.2 From b41e5fffe8b81fc939067d8c1c195cc79115d5a3 Mon Sep 17 00:00:00 2001 From: Emil Medve Date: Sat, 3 May 2008 06:34:04 +1000 Subject: [POWERPC] devres: Add devm_ioremap_prot() We provide an ioremap_flags, so this provides a corresponding devm_ioremap_prot. The slight name difference is at Ben Herrenschmidt's request as he plans on changing ioremap_flags to ioremap_prot in the future. Signed-off-by: Emil Medve Signed-off-by: Kumar Gala Acked-by: Tejun Heo Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 4bb023f..f1d2cdc 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_SMP) += locks.o endif obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o +obj-$(CONFIG_HAS_IOMEM) += devres.o diff --git a/arch/powerpc/lib/devres.c b/arch/powerpc/lib/devres.c new file mode 100644 index 0000000..292115d --- /dev/null +++ b/arch/powerpc/lib/devres.c @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2008 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include /* devres_*(), devm_ioremap_release() */ +#include /* ioremap_flags() */ +#include /* EXPORT_SYMBOL() */ + +/** + * devm_ioremap_prot - Managed ioremap_flags() + * @dev: Generic device to remap IO address for + * @offset: BUS offset to map + * @size: Size of map + * @flags: Page flags + * + * Managed ioremap_prot(). Map is automatically unmapped on driver + * detach. 
+ */ +void __iomem *devm_ioremap_prot(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void __iomem **ptr, *addr; + + ptr = devres_alloc(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + addr = ioremap_flags(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else + devres_free(ptr); + + return addr; +} +EXPORT_SYMBOL(devm_ioremap_prot); diff --git a/include/asm-powerpc/io.h b/include/asm-powerpc/io.h index afae069..e0062d7 100644 --- a/include/asm-powerpc/io.h +++ b/include/asm-powerpc/io.h @@ -2,7 +2,7 @@ #define _ASM_POWERPC_IO_H #ifdef __KERNEL__ -/* +/* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -18,6 +18,9 @@ extern int check_legacy_ioport(unsigned long base_port); #define _PNPWRP 0xa79 #define PNPBIOS_BASE 0xf000 +#include +#include + #include #include #include @@ -744,6 +747,9 @@ static inline void * bus_to_virt(unsigned long address) #define clrsetbits_8(addr, clear, set) clrsetbits(8, addr, clear, set) +void __iomem *devm_ioremap_prot(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags); + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_IO_H */ diff --git a/include/linux/io.h b/include/linux/io.h index 3a03a36..6c7f0ba 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -65,5 +65,6 @@ void __iomem *devm_ioremap_nocache(struct device *dev, resource_size_t offset, void devm_iounmap(struct device *dev, void __iomem *addr); int check_signature(const volatile void __iomem *io_addr, const unsigned char *signature, int length); +void devm_ioremap_release(struct device *dev, void *res); #endif /* _LINUX_IO_H */ diff --git a/lib/devres.c b/lib/devres.c index 26c87c4..72c8909 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -2,7 +2,7 @@ #include #include -static void devm_ioremap_release(struct device *dev, void *res) +void devm_ioremap_release(struct device *dev, void *res) { iounmap(*(void __iomem **)res); } -- cgit v0.10.2 From 9185ef6787f1c8f1c06aa0cb3c7746fb4f101f50 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Sat, 3 May 2008 06:34:05 +1000 Subject: [POWERPC] Assign PDE->data before gluing PDE into /proc tree Simply replace proc_create and further data assigned with proc_create_data. No need to check for data!=NULL after that. Signed-off-by: Denis V. Lunev Cc: Alexey Dobriyan Cc: Eric W. 
Biederman Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c index bec3803..417eca7 100644 --- a/arch/powerpc/platforms/pseries/scanlog.c +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -55,11 +55,6 @@ static ssize_t scanlog_read(struct file *file, char __user *buf, dp = PDE(inode); data = (unsigned int *)dp->data; - if (!data) { - printk(KERN_ERR "scanlog: read failed no data\n"); - return -EIO; - } - if (count > RTAS_DATA_BUF_SIZE) count = RTAS_DATA_BUF_SIZE; @@ -146,11 +141,6 @@ static int scanlog_open(struct inode * inode, struct file * file) struct proc_dir_entry *dp = PDE(inode); unsigned int *data = (unsigned int *)dp->data; - if (!data) { - printk(KERN_ERR "scanlog: open failed no data\n"); - return -EIO; - } - if (data[0] != 0) { /* This imperfect test stops a second copy of the * data (or a reset while data is being copied) @@ -168,10 +158,6 @@ static int scanlog_release(struct inode * inode, struct file * file) struct proc_dir_entry *dp = PDE(inode); unsigned int *data = (unsigned int *)dp->data; - if (!data) { - printk(KERN_ERR "scanlog: release failed no data\n"); - return -EIO; - } data[0] = 0; return 0; @@ -200,12 +186,11 @@ static int __init scanlog_init(void) if (!data) goto err; - ent = proc_create("ppc64/rtas/scan-log-dump", S_IRUSR, NULL, - &scanlog_fops); + ent = proc_create_data("ppc64/rtas/scan-log-dump", S_IRUSR, NULL, + &scanlog_fops, data); if (!ent) goto err; - ent->data = data; proc_ppc64_scan_log_dump = ent; return 0; -- cgit v0.10.2 From fa1b1cff3d06550d23ef540c4f97ca83c021b473 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 5 May 2008 00:22:35 -0700 Subject: net_cls_act: Make act_simple use of netlink policy. Convert to netlink helpers by using netlink policy validation. As a side effect fixes a leak. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 64b2d13..269ab51 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -6,7 +6,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
* - * Authors: Jamal Hadi Salim (2005) + * Authors: Jamal Hadi Salim (2005-8) * */ @@ -34,6 +34,7 @@ static struct tcf_hashinfo simp_hash_info = { .lock = &simp_lock, }; +#define SIMP_MAX_DATA 32 static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) { struct tcf_defact *d = a->priv; @@ -69,23 +70,24 @@ static int tcf_simp_release(struct tcf_defact *d, int bind) return ret; } -static int alloc_defdata(struct tcf_defact *d, u32 datalen, void *defdata) +static int alloc_defdata(struct tcf_defact *d, char *defdata) { - d->tcfd_defdata = kmemdup(defdata, datalen, GFP_KERNEL); + d->tcfd_defdata = kstrndup(defdata, SIMP_MAX_DATA, GFP_KERNEL); if (unlikely(!d->tcfd_defdata)) return -ENOMEM; - d->tcfd_datalen = datalen; + return 0; } -static int realloc_defdata(struct tcf_defact *d, u32 datalen, void *defdata) +static int realloc_defdata(struct tcf_defact *d, char *defdata) { kfree(d->tcfd_defdata); - return alloc_defdata(d, datalen, defdata); + return alloc_defdata(d, defdata); } static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { [TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) }, + [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA }, }; static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, @@ -95,28 +97,24 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, struct tc_defact *parm; struct tcf_defact *d; struct tcf_common *pc; - void *defdata; - u32 datalen = 0; + char *defdata; int ret = 0, err; if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_DEF_MAX, nla, NULL); + err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy); if (err < 0) return err; if (tb[TCA_DEF_PARMS] == NULL) return -EINVAL; - parm = nla_data(tb[TCA_DEF_PARMS]); - defdata = nla_data(tb[TCA_DEF_DATA]); - if (defdata == NULL) + if (tb[TCA_DEF_DATA] == NULL) return -EINVAL; - datalen = nla_len(tb[TCA_DEF_DATA]); - if (datalen == 0) - return -EINVAL; + parm = nla_data(tb[TCA_DEF_PARMS]); + defdata = nla_data(tb[TCA_DEF_DATA]); pc = tcf_hash_check(parm->index, a, bind, &simp_hash_info); if (!pc) { @@ -126,7 +124,7 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, return -ENOMEM; d = to_defact(pc); - ret = alloc_defdata(d, datalen, defdata); + ret = alloc_defdata(d, defdata); if (ret < 0) { kfree(pc); return ret; @@ -138,7 +136,7 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, tcf_simp_release(d, bind); return -EEXIST; } - realloc_defdata(d, datalen, defdata); + realloc_defdata(d, defdata); } spin_lock_bh(&d->tcf_lock); @@ -172,7 +170,7 @@ static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, opt.bindcnt = d->tcf_bindcnt - bind; opt.action = d->tcf_action; NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt); - NLA_PUT(skb, TCA_DEF_DATA, d->tcfd_datalen, d->tcfd_defdata); + NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata); t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); -- cgit v0.10.2 From 0a6db7dc49b91875ff015934df3e85b8785f2294 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 5 May 2008 00:33:58 -0700 Subject: sbus: Fix bpp driver build. Using the variable name 'dev_name' in the top-level namespace is a bad idea. This conflicts with linux/device.h's inline function of the same name. Signed-off-by: David S. 
Miller diff --git a/drivers/sbus/char/bpp.c b/drivers/sbus/char/bpp.c index 4fab0c2..b87037e 100644 --- a/drivers/sbus/char/bpp.c +++ b/drivers/sbus/char/bpp.c @@ -41,7 +41,7 @@ #define BPP_DELAY 100 static const unsigned BPP_MAJOR = LP_MAJOR; -static const char* dev_name = "bpp"; +static const char *bpp_dev_name = "bpp"; /* When switching from compatibility to a mode where I can read, try the following mode first. */ -- cgit v0.10.2 From c17f888f8fc2e47e2b4a51424f8ccf564ae87576 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 5 May 2008 01:04:06 -0700 Subject: irda: fix !PNP support in drivers/net/irda/nsc-ircc.c x86.git testing found the following build failure in latest -git: drivers/built-in.o: In function `nsc_ircc_pnp_probe': nsc-ircc.c:(.text+0xdf1b6): undefined reference to `pnp_get_resource' nsc-ircc.c:(.text+0xdf1d4): undefined reference to `pnp_get_resource' nsc-ircc.c:(.text+0xdf1ee): undefined reference to `pnp_get_resource' nsc-ircc.c:(.text+0xdf237): undefined reference to `pnp_get_resource' nsc-ircc.c:(.text+0xdf24c): undefined reference to `pnp_get_resource' drivers/built-in.o:nsc-ircc.c:(.text+0xdf266): more undefined references to `pnp_get_resource' follow make: *** [.tmp_vmlinux1] Error 1 triggered via this config: http://redhat.com/~mingo/misc/config-Sat_May__3_20_53_13_CEST_2008.bad while generally most users will have PNP enabled, drivers can support non-PNP build mode too - and most drivers implement it. That is typically done by providing a dummy pnp_driver structure that will not probe anything. The fallback routines in the driver will handle this dumber mode of operation too. This patch implements that. I have not tested whether this actually works on real hardware so take care. It does resolve the build bug. [ Another solution that is used by a few drivers is to exclude the driver in the Kconfig if PNP is disabled, via "depends on PNP", but this would limit the availability of the driver needlessly. ] Signed-off-by: Ingo Molnar Signed-off-by: David S. 
Miller diff --git a/drivers/net/irda/nsc-ircc.c b/drivers/net/irda/nsc-ircc.c index a873d2b..a7714da 100644 --- a/drivers/net/irda/nsc-ircc.c +++ b/drivers/net/irda/nsc-ircc.c @@ -100,7 +100,9 @@ static int nsc_ircc_probe_39x(nsc_chip_t *chip, chipio_t *info); static int nsc_ircc_init_108(nsc_chip_t *chip, chipio_t *info); static int nsc_ircc_init_338(nsc_chip_t *chip, chipio_t *info); static int nsc_ircc_init_39x(nsc_chip_t *chip, chipio_t *info); +#ifdef CONFIG_PNP static int nsc_ircc_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *id); +#endif /* These are the known NSC chips */ static nsc_chip_t chips[] = { @@ -156,9 +158,11 @@ static const struct pnp_device_id nsc_ircc_pnp_table[] = { MODULE_DEVICE_TABLE(pnp, nsc_ircc_pnp_table); static struct pnp_driver nsc_ircc_pnp_driver = { +#ifdef CONFIG_PNP .name = "nsc-ircc", .id_table = nsc_ircc_pnp_table, .probe = nsc_ircc_pnp_probe, +#endif }; /* Some prototypes */ @@ -916,6 +920,7 @@ static int nsc_ircc_probe_39x(nsc_chip_t *chip, chipio_t *info) return 0; } +#ifdef CONFIG_PNP /* PNP probing */ static int nsc_ircc_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *id) { @@ -952,6 +957,7 @@ static int nsc_ircc_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *i return 0; } +#endif /* * Function nsc_ircc_setup (info) -- cgit v0.10.2 From 7a1aa309f21ea2f6c31f364341e4027ecf4e79bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 5 May 2008 01:06:54 -0700 Subject: irda: fix !PNP support for drivers/net/irda/smsc-ircc2.c x86.git testing found this build bug on v2.6.26-rc1: ERROR: "pnp_get_resource" [drivers/net/irda/smsc-ircc2.ko] undefined! make[1]: *** [__modpost] Error 1 make: *** [modules] Error 2 the driver did not anticipate the case of !CONFIG_PNP which is rare but still possible. Instead of restricting the driver to PNP-only in the Kconfig space, add the (trivial) dummy struct pnp_driver - this is that other drivers use in the !PNP case too. The driver itself can in theory be initialized on !PNP too in certain cases, via smsc_ircc_legacy_probe(). Patch only minimally build tested, i dont have this hardware. Signed-off-by: Ingo Molnar Signed-off-by: David S. 
Miller diff --git a/drivers/net/irda/smsc-ircc2.c b/drivers/net/irda/smsc-ircc2.c index 1f26da7..cfe0194 100644 --- a/drivers/net/irda/smsc-ircc2.c +++ b/drivers/net/irda/smsc-ircc2.c @@ -376,6 +376,7 @@ MODULE_DEVICE_TABLE(pnp, smsc_ircc_pnp_table); static int pnp_driver_registered; +#ifdef CONFIG_PNP static int __init smsc_ircc_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) { @@ -402,7 +403,9 @@ static struct pnp_driver smsc_ircc_pnp_driver = { .id_table = smsc_ircc_pnp_table, .probe = smsc_ircc_pnp_probe, }; - +#else /* CONFIG_PNP */ +static struct pnp_driver smsc_ircc_pnp_driver; +#endif /******************************************************************************* * -- cgit v0.10.2 From 983e0972cea450fe5725d1ba11c78adfd5d7ad3a Mon Sep 17 00:00:00 2001 From: Johann Felix Soden Date: Fri, 2 May 2008 09:54:31 +0200 Subject: [ALSA] pcsp: Fix build with CONFIG_PM=n sound/drivers/pcsp/pcsp.c: In function 'pcsp_suspend': sound/drivers/pcsp/pcsp.c:201: error: implicit declaration of function 'snd_pcm_suspend_all' Signed-off-by: Johann Felix Soden CC: Stas Sergeev Signed-off-by: Takashi Iwai diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c index 5920351..54a1f90 100644 --- a/sound/drivers/pcsp/pcsp.c +++ b/sound/drivers/pcsp/pcsp.c @@ -194,6 +194,7 @@ static void pcsp_stop_beep(struct snd_pcsp *chip) spin_unlock_irq(&chip->substream_lock); } +#ifdef CONFIG_PM static int pcsp_suspend(struct platform_device *dev, pm_message_t state) { struct snd_pcsp *chip = platform_get_drvdata(dev); @@ -201,6 +202,9 @@ static int pcsp_suspend(struct platform_device *dev, pm_message_t state) snd_pcm_suspend_all(chip->pcm); return 0; } +#else +#define pcsp_suspend NULL +#endif /* CONFIG_PM */ static void pcsp_shutdown(struct platform_device *dev) { -- cgit v0.10.2 From 7bd3c0f73c9c5b47fd1ca49757c436e73f4cd55b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 2 May 2008 12:28:02 +0200 Subject: [ALSA] hda - Support IDT 92HD206 codec Added the support for IDT 92HD206 codec chip. It's compatible with STAC927x. Signed-off-by: Takashi Iwai diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index b3a15d6..393f7fd 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -4289,6 +4289,8 @@ struct hda_codec_preset snd_hda_preset_sigmatel[] = { { .id = 0x83847635, .name = "STAC9250D", .patch = patch_stac925x }, { .id = 0x83847636, .name = "STAC9251", .patch = patch_stac925x }, { .id = 0x83847637, .name = "STAC9250D", .patch = patch_stac925x }, + { .id = 0x83847645, .name = "92HD206X", .patch = patch_stac927x }, + { .id = 0x83847646, .name = "92HD206D", .patch = patch_stac927x }, /* The following does not take into account .id=0x83847661 when subsys = * 104D0C00 which is STAC9225s. Because of this, some SZ Notebooks are * currently not fully supported. -- cgit v0.10.2 From 20686c24377246d9eb57782551b25ff19df09873 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 2 May 2008 12:31:51 +0200 Subject: [ALSA] fm801 - Fix kconfig dependency mess of fm801-tea575x FM801-tea575x tuner has a reverse selection to V4L1 and this causes nasty dependency problems. The patch simplifies the dependency with a normal "depends on VIDEO_V4L1". This decreases the usability but fixes bugs, yeah. If any better feature like "requires" is introduced to kbuild in future, we'll be able to switch it... 
Signed-off-by: Takashi Iwai diff --git a/sound/pci/Kconfig b/sound/pci/Kconfig index 581debf..7e47421 100644 --- a/sound/pci/Kconfig +++ b/sound/pci/Kconfig @@ -515,19 +515,16 @@ config SND_FM801 config SND_FM801_TEA575X_BOOL bool "ForteMedia FM801 + TEA5757 tuner" depends on SND_FM801 + depends on VIDEO_V4L1=y || VIDEO_V4L1=SND_FM801 help Say Y here to include support for soundcards based on the ForteMedia FM801 chip with a TEA5757 tuner connected to GPIO1-3 pins (Media Forte SF256-PCS-02) into the snd-fm801 driver. - This will enable support for the old V4L1 API. - config SND_FM801_TEA575X tristate depends on SND_FM801_TEA575X_BOOL default SND_FM801 - select VIDEO_V4L1 - select VIDEO_DEV config SND_HDA_INTEL tristate "Intel HD Audio" -- cgit v0.10.2 From 564c5bead424fa798dfbd5fe382b4e0b7ea483fb Mon Sep 17 00:00:00 2001 From: Jacek Luczak Date: Sat, 3 May 2008 18:41:23 +0200 Subject: [ALSA] Revert migration to alc_set_pin_output() in alc861_auto_set_output_and_unmute() Change done by: commit f6c7e5461e9046445d50c5c7a9a4587824239623 [ALSA] hda-codec - Fix auto-configuration of Realtek codecs broke sound on ALC861 Analog. Signed-off-by: Jacek Luczak Signed-off-by: Takashi Iwai diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d9783a4..6d4df45 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -11902,7 +11902,10 @@ static void alc861_auto_set_output_and_unmute(struct hda_codec *codec, hda_nid_t nid, int pin_type, int dac_idx) { - alc_set_pin_output(codec, nid, pin_type); + snd_hda_codec_write(codec, nid, 0, AC_VERB_SET_PIN_WIDGET_CONTROL, + pin_type); + snd_hda_codec_write(codec, dac_idx, 0, AC_VERB_SET_AMP_GAIN_MUTE, + AMP_OUT_UNMUTE); } static void alc861_auto_init_multi_out(struct hda_codec *codec) -- cgit v0.10.2 From 2e75d050e42d1c61e820f9a35078a2f69e02cc3e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sat, 3 May 2008 18:46:56 +0200 Subject: [ALSA] ac97 - Add a workaround for broken quirk for VT1617A codec On boards with VT1617A codec, the sound disappears suddenly. This looks like a problem with HPE-bit control that is supposed to be set in patch_vt1617a(). However, on such problematic hardwares, the bit is actually reset mysteriously. The patch adds a workaround for the wrong quirk. Signed-off-by: Takashi Iwai diff --git a/sound/pci/ac97/ac97_patch.c b/sound/pci/ac97/ac97_patch.c index 39198e5..2da8981 100644 --- a/sound/pci/ac97/ac97_patch.c +++ b/sound/pci/ac97/ac97_patch.c @@ -3446,6 +3446,7 @@ static const struct snd_kcontrol_new snd_ac97_controls_vt1617a[] = { int patch_vt1617a(struct snd_ac97 * ac97) { int err = 0; + int val; /* we choose to not fail out at this point, but we tell the caller when we return */ @@ -3456,7 +3457,13 @@ int patch_vt1617a(struct snd_ac97 * ac97) /* bring analog power consumption to normal by turning off the * headphone amplifier, like WinXP driver for EPIA SP */ - snd_ac97_write_cache(ac97, 0x5c, 0x20); + /* We need to check the bit before writing it. + * On some (many?) hardwares, setting bit actually clears it! 
+ */ + val = snd_ac97_read(ac97, 0x5c); + if (!(val & 0x20)) + snd_ac97_write_cache(ac97, 0x5c, 0x20); + ac97->ext_id |= AC97_EI_SPDIF; /* force the detection of spdif */ ac97->rates[AC97_RATES_SPDIF] = SNDRV_PCM_RATE_44100 | SNDRV_PCM_RATE_48000; ac97->build_ops = &patch_vt1616_ops; -- cgit v0.10.2 From d6426171bab3403cdcd5613d5549f20b0ab0967c Mon Sep 17 00:00:00 2001 From: Davide Rizzo Date: Mon, 5 May 2008 14:56:07 +0200 Subject: [ALSA] soc - fix s3c2410 PCM breakage S3C2410 pcm doesn't work. s3c2410_dma_request() now returns the channel number and not 0 if OK. Signed-off-by: Davide Rizzo Acked-by: Liam Girdwood Signed-off-by: Takashi Iwai diff --git a/sound/soc/s3c24xx/s3c24xx-pcm.c b/sound/soc/s3c24xx/s3c24xx-pcm.c index 6c70a81..7806ae6 100644 --- a/sound/soc/s3c24xx/s3c24xx-pcm.c +++ b/sound/soc/s3c24xx/s3c24xx-pcm.c @@ -171,7 +171,7 @@ static int s3c24xx_pcm_hw_params(struct snd_pcm_substream *substream, ret = s3c2410_dma_request(prtd->params->channel, prtd->params->client, NULL); - if (ret) { + if (ret < 0) { DBG(KERN_ERR "failed to get dma channel\n"); return ret; } -- cgit v0.10.2 From 2c36eecfb6471c457994647771d1405502ad5fde Mon Sep 17 00:00:00 2001 From: Davide Rizzo Date: Mon, 5 May 2008 14:59:39 +0200 Subject: [ALSA] soc - fix S3C2410 i2s programming error S3C2410 i2s driver currently manages only i2s protocol (and not left justified one) and slave mode. With this small patch, other modes are possible. Signed-off-by: Davide Rizzo Acked-by: Liam Girdwood Signed-off-by: Takashi Iwai diff --git a/sound/soc/s3c24xx/s3c24xx-i2s.c b/sound/soc/s3c24xx/s3c24xx-i2s.c index 4ebcd6a..1ed6afd 100644 --- a/sound/soc/s3c24xx/s3c24xx-i2s.c +++ b/sound/soc/s3c24xx/s3c24xx-i2s.c @@ -224,6 +224,7 @@ static int s3c24xx_i2s_set_fmt(struct snd_soc_cpu_dai *cpu_dai, iismod |= S3C2410_IISMOD_SLAVE; break; case SND_SOC_DAIFMT_CBS_CFS: + iismod &= ~S3C2410_IISMOD_SLAVE; break; default: return -EINVAL; @@ -234,6 +235,7 @@ static int s3c24xx_i2s_set_fmt(struct snd_soc_cpu_dai *cpu_dai, iismod |= S3C2410_IISMOD_MSB; break; case SND_SOC_DAIFMT_I2S: + iismod &= ~S3C2410_IISMOD_MSB; break; default: return -EINVAL; -- cgit v0.10.2 From f92509371ec06227a7e29778f395776d31b0deab Mon Sep 17 00:00:00 2001 From: "grzegorz.chwesewicz@chilan.com" Date: Thu, 24 Apr 2008 16:57:22 -0500 Subject: kgdb: minor documentation fixes Two minor fixes to the kgdb documentation. Signed-off-by: Grzegorz Chwesewicz, Chilan Signed-off-by: Jason Wessel diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl index 97618be..028a844 100644 --- a/Documentation/DocBook/kgdb.tmpl +++ b/Documentation/DocBook/kgdb.tmpl @@ -72,7 +72,7 @@ kgdb is a source level debugger for linux kernel. It is used along with gdb to debug a linux kernel. The expectation is that gdb can be used to "break in" to the kernel to inspect memory, variables - and look through a cal stack information similar to what an + and look through call stack information similar to what an application developer would use gdb for. It is possible to place breakpoints in kernel code and perform some limited execution stepping. @@ -93,8 +93,10 @@ Compiling a kernel - To enable CONFIG_KGDB, look under the "Kernel debugging" - and then select "KGDB: kernel debugging with remote gdb". + To enable CONFIG_KGDB you should first turn on + "Prompt for development and/or incomplete code/drivers" + (CONFIG_EXPERIMENTAL) in "General setup", then under the + "Kernel debugging" select "KGDB: kernel debugging with remote gdb". 
Next you should choose one of more I/O drivers to interconnect debugging -- cgit v0.10.2 From 001fddf5fdcfe2c08ac9c4e5ca80c5e5698363bb Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Thu, 24 Apr 2008 16:57:23 -0500 Subject: kgdb: trivial sparse fixes in kgdb test-suite Shadowed variable and integer as NULL pointer fixes: drivers/misc/kgdbts.c:877:6: warning: symbol 'sys_open_test' shadows an earlier one drivers/misc/kgdbts.c:537:27: originally declared here drivers/misc/kgdbts.c:378:22: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:386:22: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:468:30: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:472:15: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:502:30: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:506:30: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:509:30: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:523:20: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:527:20: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:530:15: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:541:21: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:545:21: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:548:15: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:559:30: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:563:15: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:573:16: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:574:19: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:578:15: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:588:16: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:589:19: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:593:15: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:602:16: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:604:15: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:925:3: warning: Using plain integer as NULL pointer drivers/misc/kgdbts.c:938:3: warning: Using plain integer as NULL pointer Signed-off-by: Harvey Harrison Signed-off-by: Jason Wessel diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c index 30a1af8..879fcbd 100644 --- a/drivers/misc/kgdbts.c +++ b/drivers/misc/kgdbts.c @@ -375,7 +375,7 @@ static void emul_sstep_get(char *arg) break; case 1: /* set breakpoint */ - break_helper("Z0", 0, sstep_addr); + break_helper("Z0", NULL, sstep_addr); break; case 2: /* Continue */ @@ -383,7 +383,7 @@ static void emul_sstep_get(char *arg) break; case 3: /* Clear breakpoint */ - break_helper("z0", 0, sstep_addr); + break_helper("z0", NULL, sstep_addr); break; default: eprintk("kgdbts: ERROR failed sstep get emulation\n"); @@ -465,11 +465,11 @@ static struct test_struct sw_breakpoint_test[] = { { "?", "S0*" }, /* Clear break points */ { "kgdbts_break_test", "OK", sw_break, }, /* set sw breakpoint */ { "c", "T0*", }, /* Continue */ - { "g", "kgdbts_break_test", 0, check_and_rewind_pc }, + { "g", "kgdbts_break_test", NULL, check_and_rewind_pc }, { "write", "OK", write_regs }, { "kgdbts_break_test", "OK", sw_rem_break }, /*remove breakpoint */ { "D", "OK" }, /* Detach */ - { "D", "OK", 0, got_break }, /* If the test worked we made it here */ + { "D", "OK", NULL, got_break }, /* On 
success we made it here */ { "", "" }, }; @@ -499,14 +499,14 @@ static struct test_struct singlestep_break_test[] = { { "?", "S0*" }, /* Clear break points */ { "kgdbts_break_test", "OK", sw_break, }, /* set sw breakpoint */ { "c", "T0*", }, /* Continue */ - { "g", "kgdbts_break_test", 0, check_and_rewind_pc }, + { "g", "kgdbts_break_test", NULL, check_and_rewind_pc }, { "write", "OK", write_regs }, /* Write registers */ { "kgdbts_break_test", "OK", sw_rem_break }, /*remove breakpoint */ { "s", "T0*", emul_sstep_get, emul_sstep_put }, /* Single step */ - { "g", "kgdbts_break_test", 0, check_single_step }, + { "g", "kgdbts_break_test", NULL, check_single_step }, { "kgdbts_break_test", "OK", sw_break, }, /* set sw breakpoint */ { "c", "T0*", }, /* Continue */ - { "g", "kgdbts_break_test", 0, check_and_rewind_pc }, + { "g", "kgdbts_break_test", NULL, check_and_rewind_pc }, { "write", "OK", write_regs }, /* Write registers */ { "D", "OK" }, /* Remove all breakpoints and continues */ { "", "" }, @@ -520,14 +520,14 @@ static struct test_struct do_fork_test[] = { { "?", "S0*" }, /* Clear break points */ { "do_fork", "OK", sw_break, }, /* set sw breakpoint */ { "c", "T0*", }, /* Continue */ - { "g", "do_fork", 0, check_and_rewind_pc }, /* check location */ + { "g", "do_fork", NULL, check_and_rewind_pc }, /* check location */ { "write", "OK", write_regs }, /* Write registers */ { "do_fork", "OK", sw_rem_break }, /*remove breakpoint */ { "s", "T0*", emul_sstep_get, emul_sstep_put }, /* Single step */ - { "g", "do_fork", 0, check_single_step }, + { "g", "do_fork", NULL, check_single_step }, { "do_fork", "OK", sw_break, }, /* set sw breakpoint */ { "7", "T0*", skip_back_repeat_test }, /* Loop based on repeat_test */ - { "D", "OK", 0, final_ack_set }, /* detach and unregister I/O */ + { "D", "OK", NULL, final_ack_set }, /* detach and unregister I/O */ { "", "" }, }; @@ -538,14 +538,14 @@ static struct test_struct sys_open_test[] = { { "?", "S0*" }, /* Clear break points */ { "sys_open", "OK", sw_break, }, /* set sw breakpoint */ { "c", "T0*", }, /* Continue */ - { "g", "sys_open", 0, check_and_rewind_pc }, /* check location */ + { "g", "sys_open", NULL, check_and_rewind_pc }, /* check location */ { "write", "OK", write_regs }, /* Write registers */ { "sys_open", "OK", sw_rem_break }, /*remove breakpoint */ { "s", "T0*", emul_sstep_get, emul_sstep_put }, /* Single step */ - { "g", "sys_open", 0, check_single_step }, + { "g", "sys_open", NULL, check_single_step }, { "sys_open", "OK", sw_break, }, /* set sw breakpoint */ { "7", "T0*", skip_back_repeat_test }, /* Loop based on repeat_test */ - { "D", "OK", 0, final_ack_set }, /* detach and unregister I/O */ + { "D", "OK", NULL, final_ack_set }, /* detach and unregister I/O */ { "", "" }, }; @@ -556,11 +556,11 @@ static struct test_struct hw_breakpoint_test[] = { { "?", "S0*" }, /* Clear break points */ { "kgdbts_break_test", "OK", hw_break, }, /* set hw breakpoint */ { "c", "T0*", }, /* Continue */ - { "g", "kgdbts_break_test", 0, check_and_rewind_pc }, + { "g", "kgdbts_break_test", NULL, check_and_rewind_pc }, { "write", "OK", write_regs }, { "kgdbts_break_test", "OK", hw_rem_break }, /*remove breakpoint */ { "D", "OK" }, /* Detach */ - { "D", "OK", 0, got_break }, /* If the test worked we made it here */ + { "D", "OK", NULL, got_break }, /* On success we made it here */ { "", "" }, }; @@ -570,12 +570,12 @@ static struct test_struct hw_breakpoint_test[] = { static struct test_struct hw_write_break_test[] = { { "?", "S0*" }, /* Clear break points */ { 
"hw_break_val", "OK", hw_write_break, }, /* set hw breakpoint */ - { "c", "T0*", 0, got_break }, /* Continue */ - { "g", "silent", 0, check_and_rewind_pc }, + { "c", "T0*", NULL, got_break }, /* Continue */ + { "g", "silent", NULL, check_and_rewind_pc }, { "write", "OK", write_regs }, { "hw_break_val", "OK", hw_rem_write_break }, /*remove breakpoint */ { "D", "OK" }, /* Detach */ - { "D", "OK", 0, got_break }, /* If the test worked we made it here */ + { "D", "OK", NULL, got_break }, /* On success we made it here */ { "", "" }, }; @@ -585,12 +585,12 @@ static struct test_struct hw_write_break_test[] = { static struct test_struct hw_access_break_test[] = { { "?", "S0*" }, /* Clear break points */ { "hw_break_val", "OK", hw_access_break, }, /* set hw breakpoint */ - { "c", "T0*", 0, got_break }, /* Continue */ - { "g", "silent", 0, check_and_rewind_pc }, + { "c", "T0*", NULL, got_break }, /* Continue */ + { "g", "silent", NULL, check_and_rewind_pc }, { "write", "OK", write_regs }, { "hw_break_val", "OK", hw_rem_access_break }, /*remove breakpoint */ { "D", "OK" }, /* Detach */ - { "D", "OK", 0, got_break }, /* If the test worked we made it here */ + { "D", "OK", NULL, got_break }, /* On success we made it here */ { "", "" }, }; @@ -599,9 +599,9 @@ static struct test_struct hw_access_break_test[] = { */ static struct test_struct nmi_sleep_test[] = { { "?", "S0*" }, /* Clear break points */ - { "c", "T0*", 0, got_break }, /* Continue */ + { "c", "T0*", NULL, got_break }, /* Continue */ { "D", "OK" }, /* Detach */ - { "D", "OK", 0, got_break }, /* If the test worked we made it here */ + { "D", "OK", NULL, got_break }, /* On success we made it here */ { "", "" }, }; @@ -874,15 +874,15 @@ static void kgdbts_run_tests(void) { char *ptr; int fork_test = 0; - int sys_open_test = 0; + int do_sys_open_test = 0; int nmi_sleep = 0; ptr = strstr(config, "F"); if (ptr) - fork_test = simple_strtol(ptr+1, NULL, 10); + fork_test = simple_strtol(ptr + 1, NULL, 10); ptr = strstr(config, "S"); if (ptr) - sys_open_test = simple_strtol(ptr+1, NULL, 10); + do_sys_open_test = simple_strtol(ptr + 1, NULL, 10); ptr = strstr(config, "N"); if (ptr) nmi_sleep = simple_strtol(ptr+1, NULL, 10); @@ -922,7 +922,7 @@ static void kgdbts_run_tests(void) repeat_test = fork_test; printk(KERN_INFO "kgdbts:RUN do_fork for %i breakpoints\n", repeat_test); - kthread_run(kgdbts_unreg_thread, 0, "kgdbts_unreg"); + kthread_run(kgdbts_unreg_thread, NULL, "kgdbts_unreg"); run_do_fork_test(); return; } @@ -931,11 +931,11 @@ static void kgdbts_run_tests(void) * executed because a kernel thread will be spawned at the very * end to unregister the debug hooks. */ - if (sys_open_test) { - repeat_test = sys_open_test; + if (do_sys_open_test) { + repeat_test = do_sys_open_test; printk(KERN_INFO "kgdbts:RUN sys_open for %i breakpoints\n", repeat_test); - kthread_run(kgdbts_unreg_thread, 0, "kgdbts_unreg"); + kthread_run(kgdbts_unreg_thread, NULL, "kgdbts_unreg"); run_sys_open_test(); return; } -- cgit v0.10.2 From 7cfcd985d36031459cc64e3843ea36a4d801097d Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 24 Apr 2008 16:57:23 -0500 Subject: kgdb: 1000 loops for the single step test in kgdbts The single step test is not terribly costly and it should be able to pass at 1000 loops successfully in under 1 second. A non-kgdb timing regression was found using this test, but it did not occur frequently because by default the test was only executed a single time. 
This patch changes the default for the single step test to 1000 iterations and allows for individual configuration of the single step test to further exercise the kgdb subsystem when needed. Signed-off-by: Jason Wessel diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c index 879fcbd..fa39410 100644 --- a/drivers/misc/kgdbts.c +++ b/drivers/misc/kgdbts.c @@ -47,6 +47,7 @@ * to test the HW NMI watchdog * F## = Break at do_fork for ## iterations * S## = Break at sys_open for ## iterations + * I## = Run the single step test ## iterations * * NOTE: that the do_fork and sys_open tests are mutually exclusive. * @@ -875,7 +876,9 @@ static void kgdbts_run_tests(void) char *ptr; int fork_test = 0; int do_sys_open_test = 0; + int sstep_test = 1000; int nmi_sleep = 0; + int i; ptr = strstr(config, "F"); if (ptr) @@ -886,6 +889,9 @@ static void kgdbts_run_tests(void) ptr = strstr(config, "N"); if (ptr) nmi_sleep = simple_strtol(ptr+1, NULL, 10); + ptr = strstr(config, "I"); + if (ptr) + sstep_test = simple_strtol(ptr+1, NULL, 10); /* required internal KGDB tests */ v1printk("kgdbts:RUN plant and detach test\n"); @@ -894,8 +900,13 @@ static void kgdbts_run_tests(void) run_breakpoint_test(0); v1printk("kgdbts:RUN bad memory access test\n"); run_bad_read_test(); - v1printk("kgdbts:RUN singlestep breakpoint test\n"); - run_singlestep_break_test(); + v1printk("kgdbts:RUN singlestep test %i iterations\n", sstep_test); + for (i = 0; i < sstep_test; i++) { + run_singlestep_break_test(); + if (i % 100 == 0) + v1printk("kgdbts:RUN singlestep [%i/%i]\n", + i, sstep_test); + } /* ===Optional tests=== */ -- cgit v0.10.2 From 688b744d8bc84dc5cc646e97509113dc5e8818ed Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Thu, 24 Apr 2008 16:57:23 -0500 Subject: kgdb: fix signedness mixmatches, add statics, add declaration to header Noticed by sparse: arch/x86/kernel/kgdb.c:556:15: warning: symbol 'kgdb_arch_pc' was not declared. Should it be static? kernel/kgdb.c:149:8: warning: symbol 'kgdb_do_roundup' was not declared. Should it be static? kernel/kgdb.c:193:22: warning: symbol 'kgdb_arch_pc' was not declared. Should it be static? kernel/kgdb.c:712:5: warning: symbol 'remove_all_break' was not declared. Should it be static? 
Related to kgdb_hex2long: arch/x86/kernel/kgdb.c:371:28: warning: incorrect type in argument 2 (different signedness) arch/x86/kernel/kgdb.c:371:28: expected long *long_val arch/x86/kernel/kgdb.c:371:28: got unsigned long * kernel/kgdb.c:469:27: warning: incorrect type in argument 2 (different signedness) kernel/kgdb.c:469:27: expected long *long_val kernel/kgdb.c:469:27: got unsigned long * kernel/kgdb.c:470:27: warning: incorrect type in argument 2 (different signedness) kernel/kgdb.c:470:27: expected long *long_val kernel/kgdb.c:470:27: got unsigned long * kernel/kgdb.c:894:27: warning: incorrect type in argument 2 (different signedness) kernel/kgdb.c:894:27: expected long *long_val kernel/kgdb.c:894:27: got unsigned long * kernel/kgdb.c:895:27: warning: incorrect type in argument 2 (different signedness) kernel/kgdb.c:895:27: expected long *long_val kernel/kgdb.c:895:27: got unsigned long * kernel/kgdb.c:1127:28: warning: incorrect type in argument 2 (different signedness) kernel/kgdb.c:1127:28: expected long *long_val kernel/kgdb.c:1127:28: got unsigned long * kernel/kgdb.c:1132:25: warning: incorrect type in argument 2 (different signedness) kernel/kgdb.c:1132:25: expected long *long_val kernel/kgdb.c:1132:25: got unsigned long * Signed-off-by: Harvey Harrison Signed-off-by: Jason Wessel diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 9757b1a..6adcc29 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -261,10 +261,12 @@ struct kgdb_io { extern struct kgdb_arch arch_kgdb_ops; +extern unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs); + extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops); extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops); -extern int kgdb_hex2long(char **ptr, long *long_val); +extern int kgdb_hex2long(char **ptr, unsigned long *long_val); extern int kgdb_mem2hex(char *mem, char *buf, int count); extern int kgdb_hex2mem(char *buf, char *mem, int count); diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 1bd0ec1..39e31a0 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -61,7 +61,7 @@ struct kgdb_state { int err_code; int cpu; int pass_exception; - long threadid; + unsigned long threadid; long kgdb_usethreadid; struct pt_regs *linux_regs; }; @@ -146,7 +146,7 @@ atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); * the other CPUs might interfere with your debugging context, so * use this with care: */ -int kgdb_do_roundup = 1; +static int kgdb_do_roundup = 1; static int __init opt_nokgdbroundup(char *str) { @@ -438,7 +438,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count) * While we find nice hex chars, build a long_val. * Return number of chars processed. 
*/ -int kgdb_hex2long(char **ptr, long *long_val) +int kgdb_hex2long(char **ptr, unsigned long *long_val) { int hex_val; int num = 0; @@ -709,7 +709,7 @@ int kgdb_isremovedbreak(unsigned long addr) return 0; } -int remove_all_break(void) +static int remove_all_break(void) { unsigned long addr; int error; -- cgit v0.10.2 From e024cbd257efc2788b7d21b9353e966267485c87 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Fri, 25 Apr 2008 18:35:29 +0200 Subject: kgdb: kconfig fix xconfig/menuconfig element Kconfig.kgdb: fix menuconfig element Signed-off-by: Jan Engelhardt Signed-off-by: Jason Wessel diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb index f2e01ac..a5d4b1d 100644 --- a/lib/Kconfig.kgdb +++ b/lib/Kconfig.kgdb @@ -1,4 +1,10 @@ +config HAVE_ARCH_KGDB_SHADOW_INFO + bool + +config HAVE_ARCH_KGDB + bool + menuconfig KGDB bool "KGDB: kernel debugging with remote gdb" select FRAME_POINTER @@ -10,15 +16,10 @@ menuconfig KGDB at http://kgdb.sourceforge.net as well as in DocBook form in Documentation/DocBook/. If unsure, say N. -config HAVE_ARCH_KGDB_SHADOW_INFO - bool - -config HAVE_ARCH_KGDB - bool +if KGDB config KGDB_SERIAL_CONSOLE tristate "KGDB: use kgdb over the serial console" - depends on KGDB select CONSOLE_POLL select MAGIC_SYSRQ default y @@ -28,7 +29,6 @@ config KGDB_SERIAL_CONSOLE config KGDB_TESTS bool "KGDB: internal test suite" - depends on KGDB default n help This is a kgdb I/O module specifically designed to test @@ -56,3 +56,5 @@ config KGDB_TESTS_BOOT_STRING boot. See the drivers/misc/kgdbts.c for detailed information about other strings you could use beyond the default of V1F100. + +endif # KGDB -- cgit v0.10.2 From 82af7aca56c67061420d618cc5a30f0fd4106b80 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Fri, 25 Jan 2008 10:40:46 +0100 Subject: Removal of FUTEX_FD Since FUTEX_FD was scheduled for removal in June 2007 lets remove it. Google Code search found no users for it and NGPT was abandoned in 2003 according to IBM. futex.h is left untouched to make sure the id does not get reassigned. Since queue_me() has no users left it is commented out to avoid a warning, i didnt remove it completely since it is part of the internal api (matching unqueue_me()) Signed-off-by: Eric Sesterhenn Signed-off-by: Rusty Russell (removed rest) Acked-by: Thomas Gleixner Signed-off-by: Linus Torvalds diff --git a/kernel/futex.c b/kernel/futex.c index 98092c9..449def8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -104,10 +104,6 @@ struct futex_q { /* Key which the futex is hashed on: */ union futex_key key; - /* For fd, sigio sent using these: */ - int fd; - struct file *filp; - /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; struct task_struct *task; @@ -126,9 +122,6 @@ struct futex_hash_bucket { static struct futex_hash_bucket futex_queues[1<mmap_sem, when futex is shared */ @@ -610,8 +603,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, static void wake_futex(struct futex_q *q) { plist_del(&q->list, &q->list.plist); - if (q->filp) - send_sigio(&q->filp->f_owner, q->fd, POLL_IN); /* * The lock in wake_up_all() is a crucial memory barrier after the * plist_del() and also before assigning to q->lock_ptr. @@ -988,14 +979,10 @@ out: } /* The key must be already stored in q->key. 
*/ -static inline struct futex_hash_bucket * -queue_lock(struct futex_q *q, int fd, struct file *filp) +static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - q->fd = fd; - q->filp = filp; - init_waitqueue_head(&q->waiters); get_futex_key_refs(&q->key); @@ -1006,7 +993,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) return hb; } -static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; @@ -1041,15 +1028,6 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) * exactly once. They are called with the hashed spinlock held. */ -/* The key must be already stored in q->key. */ -static void queue_me(struct futex_q *q, int fd, struct file *filp) -{ - struct futex_hash_bucket *hb; - - hb = queue_lock(q, fd, filp); - __queue_me(q, hb); -} - /* Return 1 if we were still queued (ie. 0 means we were woken) */ static int unqueue_me(struct futex_q *q) { @@ -1194,7 +1172,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (unlikely(ret != 0)) goto out_release_sem; - hb = queue_lock(&q, -1, NULL); + hb = queue_lock(&q); /* * Access the page AFTER the futex is queued. @@ -1238,7 +1216,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, goto out_unlock_release_sem; /* Only actually queue if *uaddr contained val. */ - __queue_me(&q, hb); + queue_me(&q, hb); /* * Now the futex is queued and we have checked the data, we @@ -1386,7 +1364,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, goto out_release_sem; retry_unlocked: - hb = queue_lock(&q, -1, NULL); + hb = queue_lock(&q); retry_locked: ret = lock_taken = 0; @@ -1499,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, /* * Only actually queue now that the atomic ops are done: */ - __queue_me(&q, hb); + queue_me(&q, hb); /* * Now the futex is queued and we have checked the data, we @@ -1746,121 +1724,6 @@ pi_faulted: return ret; } -static int futex_close(struct inode *inode, struct file *filp) -{ - struct futex_q *q = filp->private_data; - - unqueue_me(q); - kfree(q); - - return 0; -} - -/* This is one-shot: once it's gone off you need a new fd */ -static unsigned int futex_poll(struct file *filp, - struct poll_table_struct *wait) -{ - struct futex_q *q = filp->private_data; - int ret = 0; - - poll_wait(filp, &q->waiters, wait); - - /* - * plist_node_empty() is safe here without any lock. - * q->lock_ptr != 0 is not safe, because of ordering against wakeup. - */ - if (plist_node_empty(&q->list)) - ret = POLLIN | POLLRDNORM; - - return ret; -} - -static const struct file_operations futex_fops = { - .release = futex_close, - .poll = futex_poll, -}; - -/* - * Signal allows caller to avoid the race which would occur if they - * set the sigio stuff up afterwards. 
- */ -static int futex_fd(u32 __user *uaddr, int signal) -{ - struct futex_q *q; - struct file *filp; - int ret, err; - struct rw_semaphore *fshared; - static unsigned long printk_interval; - - if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { - printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " - "will be removed from the kernel in June 2007\n", - current->comm); - } - - ret = -EINVAL; - if (!valid_signal(signal)) - goto out; - - ret = get_unused_fd(); - if (ret < 0) - goto out; - filp = get_empty_filp(); - if (!filp) { - put_unused_fd(ret); - ret = -ENFILE; - goto out; - } - filp->f_op = &futex_fops; - filp->f_path.mnt = mntget(futex_mnt); - filp->f_path.dentry = dget(futex_mnt->mnt_root); - filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; - - if (signal) { - err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); - if (err < 0) { - goto error; - } - filp->f_owner.signum = signal; - } - - q = kmalloc(sizeof(*q), GFP_KERNEL); - if (!q) { - err = -ENOMEM; - goto error; - } - q->pi_state = NULL; - - fshared = ¤t->mm->mmap_sem; - down_read(fshared); - err = get_futex_key(uaddr, fshared, &q->key); - - if (unlikely(err != 0)) { - up_read(fshared); - kfree(q); - goto error; - } - - /* - * queue_me() must be called before releasing mmap_sem, because - * key->shared.inode needs to be referenced while holding it. - */ - filp->private_data = q; - - queue_me(q, ret, filp); - up_read(fshared); - - /* Now we map fd to filp, so userspace can access it */ - fd_install(ret, filp); -out: - return ret; -error: - put_unused_fd(ret); - put_filp(filp); - ret = err; - goto out; -} - /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. @@ -2092,10 +1955,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAKE_BITSET: ret = futex_wake(uaddr, fshared, val, val3); break; - case FUTEX_FD: - /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ - ret = futex_fd(uaddr, val); - break; case FUTEX_REQUEUE: ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); break; @@ -2156,19 +2015,6 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -static int futexfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) -{ - return get_sb_pseudo(fs_type, "futex", NULL, FUTEXFS_SUPER_MAGIC, mnt); -} - -static struct file_system_type futex_fs_type = { - .name = "futexfs", - .get_sb = futexfs_get_sb, - .kill_sb = kill_anon_super, -}; - static int __init futex_init(void) { u32 curval; @@ -2193,16 +2039,6 @@ static int __init futex_init(void) spin_lock_init(&futex_queues[i].lock); } - i = register_filesystem(&futex_fs_type); - if (i) - return i; - - futex_mnt = kern_mount(&futex_fs_type); - if (IS_ERR(futex_mnt)) { - unregister_filesystem(&futex_fs_type); - return PTR_ERR(futex_mnt); - } - return 0; } __initcall(futex_init); -- cgit v0.10.2 From 13a6ddb08e58a1bd344da7898c4e2f13bdf18c2f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 27 Mar 2008 01:31:18 -0700 Subject: x86/pci: add pci=skip_isa_align command lines. so we don't align the io port start address for pci cards. also move out dmi check out acpi.c, because it has nothing to do with acpi. it could spare some calling when we have several peer root buses. 
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Jesse Barnes diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 3ce193f..15d8216 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1518,6 +1518,8 @@ and is between 256 and 4096 characters. It is defined in the file This is normally done in pci_enable_device(), so this option is a temporary workaround for broken drivers that don't call it. + skip_isa_align [X86] do not align io start addr, so can + handle more pci cards firmware [ARM] Do not re-enumerate the bus but instead just use the configuration from the bootloader. This is currently used on diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 1a9c0c6..d95de2f 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -6,45 +6,6 @@ #include #include "pci.h" -static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d) -{ - pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; - printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident); - return 0; -} - -static struct dmi_system_id acpi_pciprobe_dmi_table[] __devinitdata = { -/* - * Systems where PCI IO resource ISA alignment can be skipped - * when the ISA enable bit in the bridge control is not set - */ - { - .callback = can_skip_ioresource_align, - .ident = "IBM System x3800", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "IBM"), - DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), - }, - }, - { - .callback = can_skip_ioresource_align, - .ident = "IBM System x3850", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "IBM"), - DMI_MATCH(DMI_PRODUCT_NAME, "x3850"), - }, - }, - { - .callback = can_skip_ioresource_align, - .ident = "IBM System x3950", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "IBM"), - DMI_MATCH(DMI_PRODUCT_NAME, "x3950"), - }, - }, - {} -}; - struct pci_root_info { char *name; unsigned int res_num; @@ -196,8 +157,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do int pxm; #endif - dmi_check_system(acpi_pciprobe_dmi_table); - if (domain && !pci_domains_supported) { printk(KERN_WARNING "PCI: Multiple domains not supported " "(dom %d, bus %d)\n", domain, busnum); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 88b5416..a6d2779 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -90,6 +90,50 @@ static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) rom_r->start = rom_r->end = rom_r->flags = 0; } +static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d) +{ + pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; + printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident); + return 0; +} + +static struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitdata = { +/* + * Systems where PCI IO resource ISA alignment can be skipped + * when the ISA enable bit in the bridge control is not set + */ + { + .callback = can_skip_ioresource_align, + .ident = "IBM System x3800", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), + }, + }, + { + .callback = can_skip_ioresource_align, + .ident = "IBM System x3850", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "x3850"), + }, + }, + { + .callback = can_skip_ioresource_align, + .ident = "IBM System x3950", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "x3950"), + }, + }, + {} +}; + +void __init dmi_check_skip_isa_align(void) +{ + 
dmi_check_system(can_skip_pciprobe_dmi_table); +} + /* * Called after each bus is probed, but before its children * are examined. @@ -462,6 +506,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "routeirq")) { pci_routeirq = 1; return NULL; + } else if (!strcmp(str, "skip_isa_align")) { + pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; + return NULL; } return str; } diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index dd30c60..b394b2a 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -33,6 +33,8 @@ static __init int pci_access_init(void) printk(KERN_ERR "PCI: Fatal: No config space access function found\n"); + dmi_check_skip_isa_align(); + return 0; } arch_initcall(pci_access_init); diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index c58805a..1019820 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -38,6 +38,8 @@ enum pci_bf_sort_state { pci_dmi_bf, }; +extern void __init dmi_check_skip_isa_align(void); + /* pci-i386.c */ extern unsigned int pcibios_max_latency; -- cgit v0.10.2 From 0df18ff366853cdf31e5238764ec5c63e6b5a398 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 14 Apr 2008 15:40:37 -0700 Subject: x86 PCI: call dmi_check_pciprobe() this change: | commit 08f1c192c3c32797068bfe97738babb3295bbf42 | Author: Muli Ben-Yehuda | Date: Sun Jul 22 00:23:39 2007 +0300 | | x86-64: introduce struct pci_sysdata to facilitate sharing of ->sysdata | | This patch introduces struct pci_sysdata to x86 and x86-64, and | converts the existing two users (NUMA, Calgary) to use it. | | This lays the groundwork for having other users of sysdata, such as | the PCI domains work. | | The Calgary bits are tested, the NUMA bits just look ok. replaces pcibios_scan_root with pci_scan_bus_parented... but in pcibios_scan_root we have a DMI check: dmi_check_system(pciprobe_dmi_table); when when have several peer root buses this could be called multiple times (which is bad), so move that call to pci_access_init(). Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Jesse Barnes diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index a6d2779..bfa72a9 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -362,13 +362,16 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { {} }; +void __init dmi_check_pciprobe(void) +{ + dmi_check_system(pciprobe_dmi_table); +} + struct pci_bus * __devinit pcibios_scan_root(int busnum) { struct pci_bus *bus = NULL; struct pci_sysdata *sd; - dmi_check_system(pciprobe_dmi_table); - while ((bus = pci_find_next_bus(bus)) != NULL) { if (bus->number == busnum) { /* Already scanned */ diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index b394b2a..e70b9c5 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -33,6 +33,8 @@ static __init int pci_access_init(void) printk(KERN_ERR "PCI: Fatal: No config space access function found\n"); + dmi_check_pciprobe(); + dmi_check_skip_isa_align(); return 0; diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index 1019820..f3972b1 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -38,6 +38,7 @@ enum pci_bf_sort_state { pci_dmi_bf, }; +extern void __init dmi_check_pciprobe(void); extern void __init dmi_check_skip_isa_align(void); /* pci-i386.c */ -- cgit v0.10.2 From 8376005ea471762e7a5957d5b9e788121c0ba726 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 5 May 2008 12:32:39 -0700 Subject: sparc64: use compat_sys_utimes instead of home-grown local copy. Noticed by Christoph Hellwig. 
Signed-off-by: David S. Miller diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c index 161ce47..1aa4288 100644 --- a/arch/sparc64/kernel/sys_sparc32.c +++ b/arch/sparc64/kernel/sys_sparc32.c @@ -236,13 +236,6 @@ asmlinkage long sys32_getegid16(void) /* 32-bit timeval and related flotsam. */ -static long get_tv32(struct timeval *o, struct compat_timeval __user *i) -{ - return (!access_ok(VERIFY_READ, i, sizeof(*i)) || - (__get_user(o->tv_sec, &i->tv_sec) | - __get_user(o->tv_usec, &i->tv_usec))); -} - static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) { return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || @@ -757,30 +750,6 @@ asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); } -asmlinkage long sys32_utimes(char __user *filename, - struct compat_timeval __user *tvs) -{ - struct timespec tv[2]; - - if (tvs) { - struct timeval ktvs[2]; - if (get_tv32(&ktvs[0], tvs) || - get_tv32(&ktvs[1], 1+tvs)) - return -EFAULT; - - if (ktvs[0].tv_usec < 0 || ktvs[0].tv_usec >= 1000000 || - ktvs[1].tv_usec < 0 || ktvs[1].tv_usec >= 1000000) - return -EINVAL; - - tv[0].tv_sec = ktvs[0].tv_sec; - tv[0].tv_nsec = 1000 * ktvs[0].tv_usec; - tv[1].tv_sec = ktvs[1].tv_sec; - tv[1].tv_nsec = 1000 * ktvs[1].tv_usec; - } - - return do_utimes(AT_FDCWD, filename, tvs ? tv : NULL, 0); -} - /* These are here just in case some old sparc32 binary calls it. */ asmlinkage long sys32_pause(void) { diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S index a4fef2b..8b5282d 100644 --- a/arch/sparc64/kernel/systbls.S +++ b/arch/sparc64/kernel/systbls.S @@ -45,7 +45,7 @@ sys_call_table32: /*120*/ .word compat_sys_readv, compat_sys_writev, sys32_settimeofday, sys32_fchown16, sys_fchmod .word sys_nis_syscall, sys32_setreuid16, sys32_setregid16, sys_rename, sys_truncate /*130*/ .word sys_ftruncate, sys_flock, compat_sys_lstat64, sys_nis_syscall, sys_nis_syscall - .word sys_nis_syscall, sys32_mkdir, sys_rmdir, sys32_utimes, compat_sys_stat64 + .word sys_nis_syscall, sys32_mkdir, sys_rmdir, compat_sys_utimes, compat_sys_stat64 /*140*/ .word sys32_sendfile64, sys_nis_syscall, sys32_futex, sys_gettid, compat_sys_getrlimit .word compat_sys_setrlimit, sys_pivot_root, sys32_prctl, sys_pciconfig_read, sys_pciconfig_write /*150*/ .word sys_nis_syscall, sys_inotify_init, sys_inotify_add_watch, sys_poll, sys_getdents64 -- cgit v0.10.2 From ab1a852128d6f0677999eecbf6d04bf9f6fe9a9a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 5 May 2008 21:15:19 +0200 Subject: m68k: Fix falconide `data_adr' typo commit 9567b349f7e7dd7e2483db99ee8e4a6fe0caca38 Author: Bartlomiej Zolnierkiewicz Date: Mon Apr 28 23:44:36 2008 +0200 ide: merge ->atapi_*put_bytes and ->ata_*put_data methods introduced a typo (`data_adr' instead of `data_addr'), leading to a compile failure. 
Signed-off-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds diff --git a/drivers/ide/legacy/falconide.c b/drivers/ide/legacy/falconide.c index 83555ca..9e449a0 100644 --- a/drivers/ide/legacy/falconide.c +++ b/drivers/ide/legacy/falconide.c @@ -61,7 +61,7 @@ static void falconide_output_data(ide_drive_t *drive, struct request *rq, unsigned long data_addr = drive->hwif->io_ports.data_addr; if (drive->media == ide_disk && rq && rq->cmd_type == REQ_TYPE_FS) - return outsw(data_adr, buf, (len + 1) / 2); + return outsw(data_addr, buf, (len + 1) / 2); outsw_swapw(data_addr, buf, (len + 1) / 2); } -- cgit v0.10.2 From 63a59fa7a74fccff64dbf7d9230bd9d91bddead4 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 5 May 2008 21:15:48 +0200 Subject: m68k: serial167 missing return value in cy_put_char() commit a5b08c66194fba02a865b397579b7204688bcb1e Author: Alan Cox Date: Wed Apr 30 00:54:05 2008 -0700 serial167: switch to int put_char method missed one case when adding return values. Signed-off-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds diff --git a/drivers/char/serial167.c b/drivers/char/serial167.c index fd2db07..3b23270 100644 --- a/drivers/char/serial167.c +++ b/drivers/char/serial167.c @@ -1073,7 +1073,7 @@ static int cy_put_char(struct tty_struct *tty, unsigned char ch) return 0; if (!info->xmit_buf) - return; + return 0; local_irq_save(flags); if (info->xmit_cnt >= PAGE_SIZE - 1) { -- cgit v0.10.2 From 4933d07531711e399d8d578036aa9fc1be2f9b20 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 5 May 2008 21:16:13 +0200 Subject: m68k: drivers/input/serio/hp_sdc.c needs drivers/input/serio/hp_sdc.c: In function 'hp_sdc_take': drivers/input/serio/hp_sdc.c:198: error: implicit declaration of function 'up' Signed-off-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c index 02b3ad8..edfedd9 100644 --- a/drivers/input/serio/hp_sdc.c +++ b/drivers/input/serio/hp_sdc.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include -- cgit v0.10.2 From 3b17f136bf32984eb0faeb116bcd44ffe3503782 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 5 May 2008 21:25:48 +0200 Subject: m68k: Handle 68040 bus faults Fix 68040 bus fault handling, so the standard kernel exception handling can be used for i/o probing. Contrary to normal access faults there is nothing to fix, but at least we have to disable writebacks to avoid recursive faults. Signed-off-by: Roman Zippel Signed-off-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index fd4858e..75b8340 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -468,15 +468,26 @@ static inline void access_error040(struct frame *fp) * (if do_page_fault didn't fix the mapping, * the writeback won't do good) */ +disable_wb: #ifdef DEBUG printk(".. disabling wb2\n"); #endif if (fp->un.fmt7.wb2a == fp->un.fmt7.faddr) fp->un.fmt7.wb2s &= ~WBV_040; + if (fp->un.fmt7.wb3a == fp->un.fmt7.faddr) + fp->un.fmt7.wb3s &= ~WBV_040; } - } else if (send_fault_sig(&fp->ptregs) > 0) { - printk("68040 access error, ssw=%x\n", ssw); - trap_c(fp); + } else { + /* In case of a bus error we either kill the process or expect + * the kernel to catch the fault, which then is also responsible + * for cleaning up the mess. 
+ */ + current->thread.signo = SIGBUS; + current->thread.faddr = fp->un.fmt7.faddr; + if (send_fault_sig(&fp->ptregs) >= 0) + printk("68040 bus error (ssw=%x, faddr=%lx)\n", ssw, + fp->un.fmt7.faddr); + goto disable_wb; } do_040writebacks(fp); -- cgit v0.10.2 From b6d9d267f0d68104df910fca89149803aec82426 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Mon, 5 May 2008 21:26:15 +0200 Subject: m68k: remove old mac_esp cruft Remove the rest of the old mac_esp driver. Also ditch the rest of the machw mechanism, it needs to be replaced by a fake openfirmware tree. Signed-off-by: Finn Thain Signed-off-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a3c3544..b2dd3dc 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1094,9 +1094,6 @@ and is between 256 and 4096 characters. It is defined in the file mac5380= [HW,SCSI] Format: ,,,, - mac53c9x= [HW,SCSI] Format: - ,,,,,,, - machvec= [IA64] Force the use of a particular machine-vector (machvec) in a generic kernel. Example: machvec=hpzx1_swiotlb diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index 735a49b..ad3e3ba 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -48,9 +48,6 @@ struct mac_booter_data mac_bi_data; int mac_bisize = sizeof mac_bi_data; -struct mac_hw_present mac_hw_present; -EXPORT_SYMBOL(mac_hw_present); - /* New m68k bootinfo stuff and videobase */ extern int m68k_num_memory; @@ -817,27 +814,6 @@ void __init mac_identify(void) m68k_ramdisk.addr, m68k_ramdisk.size); #endif - /* - * TODO: set the various fields in macintosh_config->hw_present here! - */ - switch (macintosh_config->scsi_type) { - case MAC_SCSI_OLD: - MACHW_SET(MAC_SCSI_80); - break; - case MAC_SCSI_QUADRA: - case MAC_SCSI_QUADRA2: - case MAC_SCSI_QUADRA3: - MACHW_SET(MAC_SCSI_96); - if ((macintosh_config->ident == MAC_MODEL_Q900) || - (macintosh_config->ident == MAC_MODEL_Q950)) - MACHW_SET(MAC_SCSI_96_2); - break; - default: - printk(KERN_WARNING "config.c: wtf: unknown scsi, using 53c80\n"); - MACHW_SET(MAC_SCSI_80); - break; - } - iop_init(); via_init(); oss_init(); diff --git a/include/asm-m68k/machw.h b/include/asm-m68k/machw.h index d2e0e25..3562499 100644 --- a/include/asm-m68k/machw.h +++ b/include/asm-m68k/machw.h @@ -66,36 +66,6 @@ struct MAC_SCC # define mac_scc ((*(volatile struct SCC*)MAC_SCC_BAS)) #endif -/* hardware stuff */ - -#define MACHW_DECLARE(name) unsigned name : 1 -#define MACHW_SET(name) (mac_hw_present.name = 1) -#define MACHW_PRESENT(name) (mac_hw_present.name) - -struct mac_hw_present { - /* video hardware */ - /* sound hardware */ - /* disk storage interfaces */ - MACHW_DECLARE(MAC_SCSI_80); /* Directly mapped NCR5380 */ - MACHW_DECLARE(MAC_SCSI_96); /* 53c9[46] */ - MACHW_DECLARE(MAC_SCSI_96_2); /* 2nd 53c9[46] Q900 and Q950 */ - MACHW_DECLARE(IDE); /* IDE Interface */ - /* other I/O hardware */ - MACHW_DECLARE(SCC); /* Serial Communications Contr. */ - /* DMA */ - MACHW_DECLARE(SCSI_DMA); /* DMA for the NCR5380 */ - /* real time clocks */ - MACHW_DECLARE(RTC_CLK); /* clock chip */ - /* supporting hardware */ - MACHW_DECLARE(VIA1); /* Versatile Interface Ad. 1 */ - MACHW_DECLARE(VIA2); /* Versatile Interface Ad. 2 */ - MACHW_DECLARE(RBV); /* Versatile Interface Ad. 
2+ */ - /* NUBUS */ - MACHW_DECLARE(NUBUS); /* NUBUS */ -}; - -extern struct mac_hw_present mac_hw_present; - #endif /* __ASSEMBLY__ */ #endif /* linux/machw.h */ -- cgit v0.10.2 From 48fc8de9cd093b8c9e2cfa339421862bae3a6cad Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 5 May 2008 21:27:21 +0200 Subject: CONFIG_SCSI_MAC_ESP needs CONFIG_SCSI_SPI_ATTRS The new mac_esp scsi driver needs CONFIG_SCSI_SPI_ATTRS, just like all other drivers using the new esp_scsi core. Signed-off-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 46d7e40..81ccbd7 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -1679,6 +1679,7 @@ config MAC_SCSI config SCSI_MAC_ESP tristate "Macintosh NCR53c9[46] SCSI" depends on MAC && SCSI + select SCSI_SPI_ATTRS help This is the NCR 53c9x SCSI controller found on most of the 68040 based Macintoshes. -- cgit v0.10.2 From abdefbdbd5c683ddcb1dd0d3dd414d02f078a5da Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 5 May 2008 12:38:58 -0700 Subject: sparc64: remove online_page() The identical online_page() implementations from all architectures got moved to mm/memory_hotplug.c - except for the sparc64 one that even was dead code due to MEMORY_HOTPLUG not being available there. Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index 4cad0b3..db8e7fb 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -2362,16 +2362,3 @@ void __flush_tlb_all(void) __asm__ __volatile__("wrpr %0, 0, %%pstate" : : "r" (pstate)); } - -#ifdef CONFIG_MEMORY_HOTPLUG - -void online_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalram_pages++; - num_physpages++; -} - -#endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit v0.10.2 From 19443178fbfbf40db15c86012fc37df1a44ab857 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Mon, 5 May 2008 13:50:24 -0700 Subject: dccp: return -EINVAL on invalid feature length dccp_feat_change() validates length and on error is returning 1. This happens to work since call chain is checking for 0 == success, but this is returned to userspace, so make it a real error value. Signed-off-by: Chris Wright Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 4a4f6ce..933a0ec 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -32,7 +32,7 @@ int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, if (len > 3) { DCCP_WARN("invalid length %d\n", len); - return 1; + return -EINVAL; } /* XXX add further sanity checks */ -- cgit v0.10.2 From a992241de614dd2b7c97a9ba64e28c0e563f19bf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2008 23:56:17 +0200 Subject: sched: fix normalized sleeper Normalized sleeper uses calc_delta*() which requires that the rq load is already updated, so move account_entity_enqueue() before place_entity() Tested-by: Frans Pop Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 89fa32b..1295ddc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -682,6 +682,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) * Update run-time statistics of the 'current'. 
*/ update_curr(cfs_rq); + account_entity_enqueue(cfs_rq, se); if (wakeup) { place_entity(cfs_rq, se, 0); @@ -692,7 +693,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); - account_entity_enqueue(cfs_rq, se); } static void update_avg(u64 *avg, u64 sample) -- cgit v0.10.2 From e05510d01ad1565e5e086a939261084d67ba2b10 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2008 23:56:17 +0200 Subject: sched: optimize calc_delta_mine() Joel noticed that the !lw->inv_weight contition isn't unlikely anymore so remove the unlikely annotation. Also, remove the two div64_u64() inv_weight calculations, which makes them rely on the calc_delta_mine() path as well. Signed-off-by: Peter Zijlstra CC: Joel Schopp Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 34bcc5b..00c1ba7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1438,8 +1438,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; - if (unlikely(!lw->inv_weight)) - lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); + if (!lw->inv_weight) + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1); tmp = (u64)delta_exec * weight; /* @@ -8025,7 +8025,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->my_q = cfs_rq; se->load.weight = tg->shares; - se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight); + se->load.inv_weight = 0; se->parent = parent; } #endif @@ -8692,7 +8692,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares) dequeue_entity(cfs_rq, se, 0); se->load.weight = shares; - se->load.inv_weight = div64_u64((1ULL<<32), shares); + se->load.inv_weight = 0; if (on_rq) enqueue_entity(cfs_rq, se, 0); -- cgit v0.10.2 From 2abdad0a4cd8f9413f778cc998e0ee7d60b28417 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 25 Apr 2008 10:53:13 -0700 Subject: sched: make rt_sched_class, idle_sched_class static The C files are included directly in sched.c, so they are effectively static. Signed-off-by: Harvey Harrison Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 2bcafa3..3a4f92d 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -99,7 +99,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, /* * Simple, special scheduling class for the per-CPU idle tasks: */ -const struct sched_class idle_sched_class = { +static const struct sched_class idle_sched_class = { /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index c2730a5..dcd6495 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1309,7 +1309,7 @@ static void set_curr_task_rt(struct rq *rq) p->se.exec_start = rq->clock; } -const struct sched_class rt_sched_class = { +static const struct sched_class rt_sched_class = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, -- cgit v0.10.2 From 733a0771df46af942b8355cd8bb15780106b4353 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 28 Apr 2008 14:05:18 +0200 Subject: sched: remove old sched doc Fabio Checconi noticed that Documentation/scheduler/sched-design.txt was a stale copy of the old scheduler. Remove it. 
Reported-by: Fabio Checconi Signed-off-by: Ingo Molnar diff --git a/Documentation/scheduler/sched-design.txt b/Documentation/scheduler/sched-design.txt deleted file mode 100644 index 1605bf0..0000000 --- a/Documentation/scheduler/sched-design.txt +++ /dev/null @@ -1,165 +0,0 @@ - Goals, Design and Implementation of the - new ultra-scalable O(1) scheduler - - - This is an edited version of an email Ingo Molnar sent to - lkml on 4 Jan 2002. It describes the goals, design, and - implementation of Ingo's new ultra-scalable O(1) scheduler. - Last Updated: 18 April 2002. - - -Goal -==== - -The main goal of the new scheduler is to keep all the good things we know -and love about the current Linux scheduler: - - - good interactive performance even during high load: if the user - types or clicks then the system must react instantly and must execute - the user tasks smoothly, even during considerable background load. - - - good scheduling/wakeup performance with 1-2 runnable processes. - - - fairness: no process should stay without any timeslice for any - unreasonable amount of time. No process should get an unjustly high - amount of CPU time. - - - priorities: less important tasks can be started with lower priority, - more important tasks with higher priority. - - - SMP efficiency: no CPU should stay idle if there is work to do. - - - SMP affinity: processes which run on one CPU should stay affine to - that CPU. Processes should not bounce between CPUs too frequently. - - - plus additional scheduler features: RT scheduling, CPU binding. - -and the goal is also to add a few new things: - - - fully O(1) scheduling. Are you tired of the recalculation loop - blowing the L1 cache away every now and then? Do you think the goodness - loop is taking a bit too long to finish if there are lots of runnable - processes? This new scheduler takes no prisoners: wakeup(), schedule(), - the timer interrupt are all O(1) algorithms. There is no recalculation - loop. There is no goodness loop either. - - - 'perfect' SMP scalability. With the new scheduler there is no 'big' - runqueue_lock anymore - it's all per-CPU runqueues and locks - two - tasks on two separate CPUs can wake up, schedule and context-switch - completely in parallel, without any interlocking. All - scheduling-relevant data is structured for maximum scalability. - - - better SMP affinity. The old scheduler has a particular weakness that - causes the random bouncing of tasks between CPUs if/when higher - priority/interactive tasks, this was observed and reported by many - people. The reason is that the timeslice recalculation loop first needs - every currently running task to consume its timeslice. But when this - happens on eg. an 8-way system, then this property starves an - increasing number of CPUs from executing any process. Once the last - task that has a timeslice left has finished using up that timeslice, - the recalculation loop is triggered and other CPUs can start executing - tasks again - after having idled around for a number of timer ticks. - The more CPUs, the worse this effect. - - Furthermore, this same effect causes the bouncing effect as well: - whenever there is such a 'timeslice squeeze' of the global runqueue, - idle processors start executing tasks which are not affine to that CPU. - (because the affine tasks have finished off their timeslices already.) - - The new scheduler solves this problem by distributing timeslices on a - per-CPU basis, without having any global synchronization or - recalculation. - - - batch scheduling. 
A significant proportion of computing-intensive tasks - benefit from batch-scheduling, where timeslices are long and processes - are roundrobin scheduled. The new scheduler does such batch-scheduling - of the lowest priority tasks - so nice +19 jobs will get - 'batch-scheduled' automatically. With this scheduler, nice +19 jobs are - in essence SCHED_IDLE, from an interactiveness point of view. - - - handle extreme loads more smoothly, without breakdown and scheduling - storms. - - - O(1) RT scheduling. For those RT folks who are paranoid about the - O(nr_running) property of the goodness loop and the recalculation loop. - - - run fork()ed children before the parent. Andrea has pointed out the - advantages of this a few months ago, but patches for this feature - do not work with the old scheduler as well as they should, - because idle processes often steal the new child before the fork()ing - CPU gets to execute it. - - -Design -====== - -The core of the new scheduler contains the following mechanisms: - - - *two* priority-ordered 'priority arrays' per CPU. There is an 'active' - array and an 'expired' array. The active array contains all tasks that - are affine to this CPU and have timeslices left. The expired array - contains all tasks which have used up their timeslices - but this array - is kept sorted as well. The active and expired array is not accessed - directly, it's accessed through two pointers in the per-CPU runqueue - structure. If all active tasks are used up then we 'switch' the two - pointers and from now on the ready-to-go (former-) expired array is the - active array - and the empty active array serves as the new collector - for expired tasks. - - - there is a 64-bit bitmap cache for array indices. Finding the highest - priority task is thus a matter of two x86 BSFL bit-search instructions. - -the split-array solution enables us to have an arbitrary number of active -and expired tasks, and the recalculation of timeslices can be done -immediately when the timeslice expires. Because the arrays are always -access through the pointers in the runqueue, switching the two arrays can -be done very quickly. - -this is a hybride priority-list approach coupled with roundrobin -scheduling and the array-switch method of distributing timeslices. - - - there is a per-task 'load estimator'. - -one of the toughest things to get right is good interactive feel during -heavy system load. While playing with various scheduler variants i found -that the best interactive feel is achieved not by 'boosting' interactive -tasks, but by 'punishing' tasks that want to use more CPU time than there -is available. This method is also much easier to do in an O(1) fashion. - -to establish the actual 'load' the task contributes to the system, a -complex-looking but pretty accurate method is used: there is a 4-entry -'history' ringbuffer of the task's activities during the last 4 seconds. -This ringbuffer is operated without much overhead. The entries tell the -scheduler a pretty accurate load-history of the task: has it used up more -CPU time or less during the past N seconds. [the size '4' and the interval -of 4x 1 seconds was found by lots of experimentation - this part is -flexible and can be changed in both directions.] - -the penalty a task gets for generating more load than the CPU can handle -is a priority decrease - there is a maximum amount to this penalty -relative to their static priority, so even fully CPU-bound tasks will -observe each other's priorities, and will share the CPU accordingly. 
- -the SMP load-balancer can be extended/switched with additional parallel -computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs -can be supported easily by changing the load-balancer. Right now it's -tuned for my SMP systems. - -i skipped the prev->mm == next->mm advantage - no workload i know of shows -any sensitivity to this. It can be added back by sacrificing O(1) -schedule() [the current and one-lower priority list can be searched for a -that->mm == current->mm condition], but costs a fair number of cycles -during a number of important workloads, so i wanted to avoid this as much -as possible. - -- the SMP idle-task startup code was still racy and the new scheduler -triggered this. So i streamlined the idle-setup code a bit. We do not call -into schedule() before all processors have started up fully and all idle -threads are in place. - -- the patch also cleans up a number of aspects of sched.c - moves code -into other areas of the kernel where it's appropriate, and simplifies -certain code paths and data constructs. As a result, the new scheduler's -code is smaller than the old one. - - Ingo -- cgit v0.10.2 From d478c2cfaa2476f8b6876f9eb4d8fddcfa986479 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 26 Apr 2008 11:30:34 -0700 Subject: sched: add debug checks to idle functions Cc: Venkatesh Pallipadi Cc: "Justin Mattock" Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 00c1ba7..ed3caf2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1124,6 +1124,7 @@ void sched_clock_idle_sleep_event(void) { struct rq *rq = cpu_rq(smp_processor_id()); + WARN_ON(!irqs_disabled()); spin_lock(&rq->lock); __update_rq_clock(rq); spin_unlock(&rq->lock); @@ -1139,6 +1140,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) struct rq *rq = cpu_rq(smp_processor_id()); u64 now = sched_clock(); + WARN_ON(!irqs_disabled()); rq->idle_clock += delta_ns; /* * Override the previous timestamp and ignore all -- cgit v0.10.2 From 983ed7a66bcec9dc307d89dc7af47cdf209e56af Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Thu, 24 Apr 2008 18:17:55 -0700 Subject: sched: add statics, don't return void expressions Noticed by sparse: kernel/sched.c:760:20: warning: symbol 'sched_feat_names' was not declared. Should it be static? kernel/sched.c:767:5: warning: symbol 'sched_feat_open' was not declared. Should it be static? 
kernel/sched_fair.c:845:3: warning: returning void-valued expression kernel/sched.c:4386:3: warning: returning void-valued expression Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index ed3caf2..d941ddc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -757,14 +757,14 @@ const_debug unsigned int sysctl_sched_features = #define SCHED_FEAT(name, enabled) \ #name , -__read_mostly char *sched_feat_names[] = { +static __read_mostly char *sched_feat_names[] = { #include "sched_features.h" NULL }; #undef SCHED_FEAT -int sched_feat_open(struct inode *inode, struct file *filp) +static int sched_feat_open(struct inode *inode, struct file *filp) { filp->private_data = inode->i_private; return 0; @@ -4341,8 +4341,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset, struct rq *rq = this_rq(); cputime64_t tmp; - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) - return account_guest_time(p, cputime); + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime); + return; + } p->stime = cputime_add(p->stime, cputime); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1295ddc..e8e5ad2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -841,8 +841,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * queued ticks are scheduled to match the slice, so don't bother * validating it and just reschedule. */ - if (queued) - return resched_task(rq_of(cfs_rq)->curr); + if (queued) { + resched_task(rq_of(cfs_rq)->curr); + return; + } /* * don't let the period tick interfere with the hrtick preemption */ -- cgit v0.10.2 From 8ae121ac8666b0421aa20fd80d4597ec66fa54bc Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 23 Apr 2008 07:13:29 -0400 Subject: sched: fix RT task-wakeup logic Dmitry Adamushko pointed out a logic error in task_wake_up_rt() where we will always evaluate to "true". You can find the thread here: http://lkml.org/lkml/2008/4/22/296 In reality, we only want to try to push tasks away when a wake up request is not going to preempt the current task. So lets fix it. Note: We introduce test_tsk_need_resched() instead of open-coding the flag check so that the merge-conflict with -rt should help remind us that we may need to support NEEDS_RESCHED_DELAYED in the future, too. 
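The helper is just a wrapper around a flag test; a minimal userspace sketch of the pattern (bit position and names are assumptions for this example, not the kernel definitions):

/*
 * Sketch of the pattern: wrap the "does this task need to reschedule?"
 * flag test in one inline helper instead of open-coding the bit test at
 * every call site.  TIF_NEED_RESCHED's value here is assumed.
 */
#include <stdio.h>

#define TIF_NEED_RESCHED 3   /* assumed bit position for this example */

struct task {
        unsigned long flags;
};

static inline int test_tsk_need_resched_sketch(const struct task *t)
{
        return (t->flags >> TIF_NEED_RESCHED) & 1;
}

int main(void)
{
        struct task curr = { .flags = 1UL << TIF_NEED_RESCHED };

        /* Wake-up path: only push tasks away if no reschedule is pending. */
        if (!test_tsk_need_resched_sketch(&curr))
                printf("no resched pending: try to push tasks\n");
        else
                printf("resched pending: skip the push\n");
        return 0;
}
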
Signed-off-by: Gregory Haskins CC: Dmitry Adamushko CC: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 03c2380..698b5a4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1977,6 +1977,11 @@ static inline void clear_tsk_need_resched(struct task_struct *tsk) clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } +static inline int test_tsk_need_resched(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); +} + static inline int signal_pending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); @@ -1991,7 +1996,7 @@ static inline int fatal_signal_pending(struct task_struct *p) static inline int need_resched(void) { - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); + return unlikely(test_tsk_need_resched(current)); } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index dcd6495..060e87b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1098,11 +1098,14 @@ static void post_schedule_rt(struct rq *rq) } } - +/* + * If we are not running and we are not going to reschedule soon, we should + * try to push tasks away now + */ static void task_wake_up_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && - (p->prio >= rq->rt.highest_prio) && + !test_tsk_need_resched(rq->curr) && rq->rt.overloaded) push_rt_tasks(rq); } -- cgit v0.10.2 From 104f64549c961a797ff5f7c59946a7caa335c5b0 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 28 Apr 2008 12:40:01 -0400 Subject: sched: fix SCHED_FAIR wake-idle logic error We currently use an optimization to skip the overhead of wake-idle processing if more than one task is assigned to a run-queue. The assumption is that the system must already be load-balanced or we wouldnt be overloaded to begin with. The problem is that we are looking at rq->nr_running, which may include RT tasks in addition to CFS tasks. Since the presence of RT tasks really has no bearing on the balance status of CFS tasks, this throws the calculation off. This patch changes the logic to only consider the number of CFS tasks when making the decision to optimze the wake-idle. Signed-off-by: Gregory Haskins CC: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e8e5ad2..1d5f35b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1009,7 +1009,7 @@ static int wake_idle(int cpu, struct task_struct *p) * sibling runqueue info. This will avoid the checks and cache miss * penalities associated with that. */ - if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) + if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) return cpu; for_each_domain(cpu, sd) { -- cgit v0.10.2 From b328ca182f01c2a04b85e0ee8a410720b104fbcc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Apr 2008 10:02:46 +0200 Subject: sched: fix hrtick_start_fair and CPU-Hotplug Gautham R Shenoy reported: > While running the usual CPU-Hotplug stress tests on linux-2.6.25, > I noticed the following in the console logs. > > This is a wee bit difficult to reproduce. In the past 10 runs I hit this > only once. > > ------------[ cut here ]------------ > > WARNING: at kernel/sched.c:962 hrtick+0x2e/0x65() > > Just wondering if we are doing a good job at handling the cancellation > of any per-cpu scheduler timers during CPU-Hotplug. This looks like its indeed not cancelled at all and migrates the it to another cpu. Fix it via a proper hotplug notifier mechanism. 
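The notifier's shape, sketched in plain userspace C with simplified event names and return codes (assumptions for the example, not the kernel hotplug API):

/*
 * Sketch: one callback keyed on hotplug events that blocks hrtick
 * operations while a CPU is going down and unblocks them when it comes
 * back (or the offline attempt fails).
 */
#include <stdio.h>

enum hp_event { CPU_DOWN_PREPARE, CPU_DEAD, CPU_ONLINE, CPU_DOWN_FAILED };

static int hrtick_blocked[4];          /* per-cpu "HRTICK_BLOCK" state */

static int hotplug_hrtick_sketch(enum hp_event ev, int cpu)
{
        switch (ev) {
        case CPU_DOWN_PREPARE:
        case CPU_DEAD:
                hrtick_blocked[cpu] = 1;   /* stop programming the hrtimer */
                return 0;
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
                hrtick_blocked[cpu] = 0;   /* safe to use the hrtimer again */
                return 0;
        }
        return -1;
}

int main(void)
{
        hotplug_hrtick_sketch(CPU_DOWN_PREPARE, 1);
        printf("cpu1 blocked=%d\n", hrtick_blocked[1]);
        hotplug_hrtick_sketch(CPU_ONLINE, 1);
        printf("cpu1 blocked=%d\n", hrtick_blocked[1]);
        return 0;
}

Blocking before the CPU goes away and unblocking on CPU_DOWN_FAILED/CPU_ONLINE is what keeps a pending per-cpu timer from surviving the offline and firing on another CPU.
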
Reported-by: Gautham R Shenoy Signed-off-by: Peter Zijlstra Cc: stable@kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index d941ddc..bee9cbe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1191,6 +1191,7 @@ static inline void resched_rq(struct rq *rq) enum { HRTICK_SET, /* re-programm hrtick_timer */ HRTICK_RESET, /* not a new slice */ + HRTICK_BLOCK, /* stop hrtick operations */ }; /* @@ -1202,6 +1203,8 @@ static inline int hrtick_enabled(struct rq *rq) { if (!sched_feat(HRTICK)) return 0; + if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) + return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } @@ -1284,7 +1287,63 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } -static inline void init_rq_hrtick(struct rq *rq) +static void hotplug_hrtick_disable(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + rq->hrtick_flags = 0; + __set_bit(HRTICK_BLOCK, &rq->hrtick_flags); + spin_unlock_irqrestore(&rq->lock, flags); + + hrtick_clear(rq); +} + +static void hotplug_hrtick_enable(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + hotplug_hrtick_disable(cpu); + return NOTIFY_OK; + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hotplug_hrtick_enable(cpu); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static void init_hrtick(void) +{ + hotcpu_notifier(hotplug_hrtick, 0); +} + +static void init_rq_hrtick(struct rq *rq) { rq->hrtick_flags = 0; hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -1321,6 +1380,10 @@ static inline void init_rq_hrtick(struct rq *rq) void hrtick_resched(void) { } + +static inline void init_hrtick(void) +{ +} #endif /* @@ -7943,6 +8006,7 @@ void __init sched_init_smp(void) put_online_cpus(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + init_hrtick(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) -- cgit v0.10.2 From 673a90a1e05c8127886f7659d1a457169378371f Mon Sep 17 00:00:00 2001 From: David Simner Date: Tue, 29 Apr 2008 10:08:59 +0100 Subject: sched: fix sched_info_switch not being called according to documentation http://bugzilla.kernel.org/show_bug.cgi?id=10545 sched_stats.h says that __sched_info_switch is "called when prev != next" in the comment. sched.c should therefore do that. 
Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index bee9cbe..3ac3d7a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4662,9 +4662,9 @@ need_resched_nonpreemptible: prev->sched_class->put_prev_task(rq, prev); next = pick_next_task(rq, prev); - sched_info_switch(prev, next); - if (likely(prev != next)) { + sched_info_switch(prev, next); + rq->nr_switches++; rq->curr = next; ++*switch_count; -- cgit v0.10.2 From d7dcdc11cfa6a8860a29b09f985467b89224699d Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 29 Apr 2008 12:23:09 +0200 Subject: sched: fix debugging Revert debugging commit 7ba2e74ab5a0518bc953042952dd165724bc70c9. print_cfs_rq_tasks() can induce live-lock if a task is dequeued during list traversal. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1d5f35b..d99e01f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1613,30 +1613,6 @@ static const struct sched_class fair_sched_class = { }; #ifdef CONFIG_SCHED_DEBUG -static void -print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth) -{ - struct sched_entity *se; - - if (!cfs_rq) - return; - - list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) { - int i; - - for (i = depth; i; i--) - seq_puts(m, " "); - - seq_printf(m, "%lu %s %lu\n", - se->load.weight, - entity_is_task(se) ? "T" : "G", - calc_delta_weight(SCHED_LOAD_SCALE, se) - ); - if (!entity_is_task(se)) - print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1); - } -} - static void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; @@ -1644,9 +1620,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu) rcu_read_lock(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); - - seq_printf(m, "\nWeight tree:\n"); - print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1); rcu_read_unlock(); } #endif -- cgit v0.10.2 From 690229a0912ca2fef8b542fe4d8b73acfcdc6e24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Apr 2008 09:31:35 +0200 Subject: sched: make clock sync tunable by architecture code make time_sync_thresh tunable to architecture code. Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 698b5a4..54c9ca2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -158,6 +158,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) } #endif +extern unsigned long long time_sync_thresh; + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). diff --git a/kernel/sched.c b/kernel/sched.c index 3ac3d7a..8f433fe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -899,7 +899,7 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -static const unsigned long long time_sync_thresh = 100000; +unsigned long long time_sync_thresh = 100000; static DEFINE_PER_CPU(unsigned long long, time_offset); static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); -- cgit v0.10.2 From 712555ee4f873515612f89554ad1a3fda5fa887e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 28 Apr 2008 11:33:07 +0200 Subject: sched: fix missing locking in sched_domains code Concurrent calls to detach_destroy_domains and arch_init_sched_domains were prevented by the old scheduler subsystem cpu hotplug mutex. When this got converted to get_online_cpus() the locking got broken. Unlike before now several processes can concurrently enter the critical sections that were protected by the old lock. 
So use the already present doms_cur_mutex to protect these sections again. Cc: Gautham R Shenoy Cc: Paul Jackson Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 8f433fe..561b3b3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -242,6 +242,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) } #endif +/* + * sched_domains_mutex serializes calls to arch_init_sched_domains, + * detach_destroy_domains and partition_sched_domains. + */ +static DEFINE_MUTEX(sched_domains_mutex); + #ifdef CONFIG_GROUP_SCHED #include @@ -308,9 +314,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; */ static DEFINE_SPINLOCK(task_group_lock); -/* doms_cur_mutex serializes access to doms_cur[] array */ -static DEFINE_MUTEX(doms_cur_mutex); - #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) @@ -358,21 +361,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif } -static inline void lock_doms_cur(void) -{ - mutex_lock(&doms_cur_mutex); -} - -static inline void unlock_doms_cur(void) -{ - mutex_unlock(&doms_cur_mutex); -} - #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline void lock_doms_cur(void) { } -static inline void unlock_doms_cur(void) { } #endif /* CONFIG_GROUP_SCHED */ @@ -7822,7 +7813,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, { int i, j; - lock_doms_cur(); + mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); @@ -7871,7 +7862,7 @@ match2: register_sched_domain_sysctl(); - unlock_doms_cur(); + mutex_unlock(&sched_domains_mutex); } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -7880,8 +7871,10 @@ int arch_reinit_sched_domains(void) int err; get_online_cpus(); + mutex_lock(&sched_domains_mutex); detach_destroy_domains(&cpu_online_map); err = arch_init_sched_domains(&cpu_online_map); + mutex_unlock(&sched_domains_mutex); put_online_cpus(); return err; @@ -7999,10 +7992,12 @@ void __init sched_init_smp(void) BUG_ON(sched_group_nodes_bycpu == NULL); #endif get_online_cpus(); + mutex_lock(&sched_domains_mutex); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); + mutex_unlock(&sched_domains_mutex); put_online_cpus(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); -- cgit v0.10.2 From cb4ad1ffc7c0d8ea7dc8cd8ba303d83551716d46 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 28 Apr 2008 12:54:56 +0800 Subject: sched: fair-group: fix a Div0 error of the fair group scheduler When I echoed 0 into the "cpu.shares" file, a Div0 error occured. We found it is caused by the following calling. sched_group_set_shares(tg, shares) set_se_shares(tg->se[i], shares/nr_cpu_ids) __set_se_shares(se, shares) div64_64((1ULL<<32), shares) When the echoed value was less than the number of processores, the result of the sentence "shares/nr_cpu_ids" was 0, and then the system called div64() to divide the result, the Div0 error occured. It is unnecessary that the shares value is divided by nr_cpu_ids, I think. Because in the function __update_group_shares_cpu() and init_tg_cfs_entry(), the shares value isn't divided by nr_cpu_ids when setting shares of the sched entity. This patch fixes this bug. 
And echoing ULONG_MAX value into cpu.shares also causes Div0 error, so we set a macro MAX_SHARES to limit the max value of shares. Signed-off-by: Miao Xie Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index 561b3b3..f98f75f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -321,7 +321,13 @@ static DEFINE_SPINLOCK(task_group_lock); # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif +/* + * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ #define MIN_SHARES 2 +#define MAX_SHARES (ULONG_MAX - 1) static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif @@ -1804,6 +1810,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, if (shares < MIN_SHARES) shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; __set_se_shares(tg->se[tcpu], shares); } @@ -8785,13 +8793,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (!tg->se[0]) return -EINVAL; - /* - * A weight of 0 or 1 can cause arithmetics problems. - * (The default weight is 1024 - so there's no practical - * limitation from this.) - */ if (shares < MIN_SHARES) shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; mutex_lock(&shares_mutex); if (tg->shares == shares) @@ -8816,7 +8821,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * force a rebalance */ cfs_rq_set_shares(tg->cfs_rq[i], 0); - set_se_shares(tg->se[i], shares/nr_cpu_ids); + set_se_shares(tg->se[i], shares); } /* -- cgit v0.10.2 From dfbf4a1bc319f0f9a31e39b2da1fa5c55e85af89 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Apr 2008 09:24:06 +0200 Subject: sched: fix cpu clock David Miller pointed it out that nothing in cpu_clock() sets prev_cpu_time. This caused __sync_cpu_clock() to be called all the time - against the intention of this code. The result was that in practice we hit a global spinlock every time cpu_clock() is called - which - even though cpu_clock() is used for tracing and debugging, is suboptimal. While at it, also: - move the irq disabling to the outest layer, this should make cpu_clock() warp-free when called with irqs enabled. - use long long instead of cycles_t - for platforms where cycles_t is 32-bit. 
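The prev_cpu_time part follows a simple "resync only once the delta exceeds a threshold, then remember the synced value" pattern; a userspace sketch under assumed names and an assumed threshold:

/*
 * Sketch of the core idea: remember the last value that went through the
 * expensive global sync, and only take that slow path again once the
 * clock has advanced past a threshold.
 */
#include <stdio.h>

#define SYNC_THRESH 100000ULL               /* assumed threshold, ns */

static unsigned long long prev_cpu_time;    /* last synced value */
static unsigned long long sync_calls;       /* how often the slow path ran */

static unsigned long long sync_clock_sketch(unsigned long long t)
{
        sync_calls++;                       /* stands in for the global lock */
        return t;
}

static unsigned long long cpu_clock_sketch(unsigned long long raw)
{
        if (raw - prev_cpu_time > SYNC_THRESH) {
                raw = sync_clock_sketch(raw);
                prev_cpu_time = raw;        /* the missing update the fix adds */
        }
        return raw;
}

int main(void)
{
        for (unsigned long long t = 0; t < 1000000; t += 50000)
                cpu_clock_sketch(t);
        printf("slow syncs taken: %llu\n", sync_calls);
        return 0;
}

Without the prev_cpu_time update inside the branch, the slow path is taken on every call, which is exactly the behaviour being fixed.
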
Reported-by: David Miller Signed-off-by: Ingo Molnar diff --git a/kernel/sched.c b/kernel/sched.c index f98f75f..9457106b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -910,11 +910,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); static DEFINE_SPINLOCK(time_sync_lock); static unsigned long long prev_global_time; -static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) +static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) { - unsigned long flags; - - spin_lock_irqsave(&time_sync_lock, flags); + /* + * We want this inlined, to not get tracer function calls + * in this critical section: + */ + spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); + __raw_spin_lock(&time_sync_lock.raw_lock); if (time < prev_global_time) { per_cpu(time_offset, cpu) += prev_global_time - time; @@ -923,7 +926,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) prev_global_time = time; } - spin_unlock_irqrestore(&time_sync_lock, flags); + __raw_spin_unlock(&time_sync_lock.raw_lock); + spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); return time; } @@ -931,7 +935,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) static unsigned long long __cpu_clock(int cpu) { unsigned long long now; - unsigned long flags; struct rq *rq; /* @@ -941,11 +944,9 @@ static unsigned long long __cpu_clock(int cpu) if (unlikely(!scheduler_running)) return 0; - local_irq_save(flags); rq = cpu_rq(cpu); update_rq_clock(rq); now = rq->clock; - local_irq_restore(flags); return now; } @@ -957,13 +958,18 @@ static unsigned long long __cpu_clock(int cpu) unsigned long long cpu_clock(int cpu) { unsigned long long prev_cpu_time, time, delta_time; + unsigned long flags; + local_irq_save(flags); prev_cpu_time = per_cpu(prev_cpu_time, cpu); time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); delta_time = time-prev_cpu_time; - if (unlikely(delta_time > time_sync_thresh)) + if (unlikely(delta_time > time_sync_thresh)) { time = __sync_cpu_clock(time, cpu); + per_cpu(prev_cpu_time, cpu) = time; + } + local_irq_restore(flags); return time; } -- cgit v0.10.2 From a5574cf65b5f03ce9ade3918764fe22e5e2371e3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 5 May 2008 23:19:50 +0200 Subject: sched, x86: add HAVE_UNSTABLE_SCHED_CLOCK add the HAVE_UNSTABLE_SCHED_CLOCK, for architectures to select. the next change utilizes it. Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 845ea2b2..bbcafaa 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -18,6 +18,7 @@ config X86_64 ### Arch settings config X86 def_bool y + select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES diff --git a/init/Kconfig b/init/Kconfig index f0e62e5..fa42e6b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -316,6 +316,12 @@ config CPUSETS Say N if unsure. +# +# Architectures with an unreliable sched_clock() should select this: +# +config HAVE_UNSTABLE_SCHED_CLOCK + bool + config GROUP_SCHED bool "Group CPU scheduler" default y -- cgit v0.10.2 From 3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 3 May 2008 18:29:28 +0200 Subject: sched: add optional support for CONFIG_HAVE_UNSTABLE_SCHED_CLOCK this replaces the rq->clock stuff (and possibly cpu_clock()). 
- architectures that have an 'imperfect' hardware clock can set CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - the 'jiffie' window might be superfulous when we update tick_gtod before the __update_sched_clock() call in sched_clock_tick() - cpu_clock() might be implemented as: sched_clock_cpu(smp_processor_id()) if the accuracy proves good enough - how far can TSC drift in a single jiffie when considering the filtering and idle hooks? [ mingo@elte.hu: various fixes and cleanups ] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 54c9ca2..0c35b03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1553,6 +1553,35 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) extern unsigned long long sched_clock(void); +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline void sched_clock_init(void) +{ +} + +static inline u64 sched_clock_cpu(int cpu) +{ + return sched_clock(); +} + +static inline void sched_clock_tick(void) +{ +} + +static inline void sched_clock_idle_sleep_event(void) +{ +} + +static inline void sched_clock_idle_wakeup_event(u64 delta_ns) +{ +} +#else +extern void sched_clock_init(void); +extern u64 sched_clock_cpu(int cpu); +extern void sched_clock_tick(void); +extern void sched_clock_idle_sleep_event(void); +extern void sched_clock_idle_wakeup_event(u64 delta_ns); +#endif + /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): diff --git a/init/main.c b/init/main.c index a87d4ca5..ddada7a 100644 --- a/init/main.c +++ b/init/main.c @@ -602,6 +602,7 @@ asmlinkage void __init start_kernel(void) softirq_init(); timekeeping_init(); time_init(); + sched_clock_init(); profile_init(); if (!irqs_disabled()) printk("start_kernel(): bug: interrupts were enabled early\n"); diff --git a/kernel/Makefile b/kernel/Makefile index 188c432..1c9938a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o pm_qos_params.o + notifier.o ksysfs.o pm_qos_params.o sched_clock.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/sched.c b/kernel/sched.c index 9457106b..58fb8af 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,16 +75,6 @@ #include /* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) -{ - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); -} - -/* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. 
@@ -557,13 +547,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - u64 clock, prev_clock_raw; - s64 clock_max_delta; - - unsigned int clock_warps, clock_overflows, clock_underflows; - u64 idle_clock; - unsigned int clock_deep_idle_events; - u64 tick_timestamp; + u64 clock; atomic_t nr_iowait; @@ -628,82 +612,6 @@ static inline int cpu_of(struct rq *rq) #endif } -#ifdef CONFIG_NO_HZ -static inline bool nohz_on(int cpu) -{ - return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE; -} - -static inline u64 max_skipped_ticks(struct rq *rq) -{ - return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1; -} - -static inline void update_last_tick_seen(struct rq *rq) -{ - rq->last_tick_seen = jiffies; -} -#else -static inline u64 max_skipped_ticks(struct rq *rq) -{ - return 1; -} - -static inline void update_last_tick_seen(struct rq *rq) -{ -} -#endif - -/* - * Update the per-runqueue clock, as finegrained as the platform can give - * us, but without assuming monotonicity, etc.: - */ -static void __update_rq_clock(struct rq *rq) -{ - u64 prev_raw = rq->prev_clock_raw; - u64 now = sched_clock(); - s64 delta = now - prev_raw; - u64 clock = rq->clock; - -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -#endif - /* - * Protect against sched_clock() occasionally going backwards: - */ - if (unlikely(delta < 0)) { - clock++; - rq->clock_warps++; - } else { - /* - * Catch too large forward jumps too: - */ - u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC; - u64 max_time = rq->tick_timestamp + max_jump; - - if (unlikely(clock + delta > max_time)) { - if (clock < max_time) - clock = max_time; - else - clock++; - rq->clock_overflows++; - } else { - if (unlikely(delta > rq->clock_max_delta)) - rq->clock_max_delta = delta; - clock += delta; - } - } - - rq->prev_clock_raw = now; - rq->clock = clock; -} - -static void update_rq_clock(struct rq *rq) -{ - if (likely(smp_processor_id() == cpu_of(rq))) - __update_rq_clock(rq); -} - /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. 
@@ -719,6 +627,11 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +static inline void update_rq_clock(struct rq *rq) +{ + rq->clock = sched_clock_cpu(cpu_of(rq)); +} + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -935,7 +848,6 @@ static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) static unsigned long long __cpu_clock(int cpu) { unsigned long long now; - struct rq *rq; /* * Only call sched_clock() if the scheduler has already been @@ -944,9 +856,7 @@ static unsigned long long __cpu_clock(int cpu) if (unlikely(!scheduler_running)) return 0; - rq = cpu_rq(cpu); - update_rq_clock(rq); - now = rq->clock; + now = sched_clock_cpu(cpu); return now; } @@ -1120,45 +1030,6 @@ static struct rq *this_rq_lock(void) return rq; } -/* - * We are going deep-idle (irqs are disabled): - */ -void sched_clock_idle_sleep_event(void) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - - WARN_ON(!irqs_disabled()); - spin_lock(&rq->lock); - __update_rq_clock(rq); - spin_unlock(&rq->lock); - rq->clock_deep_idle_events++; -} -EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); - -/* - * We just idled delta nanoseconds (called with irqs disabled): - */ -void sched_clock_idle_wakeup_event(u64 delta_ns) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - u64 now = sched_clock(); - - WARN_ON(!irqs_disabled()); - rq->idle_clock += delta_ns; - /* - * Override the previous timestamp and ignore all - * sched_clock() deltas that occured while we idled, - * and use the PM-provided delta_ns to advance the - * rq clock: - */ - spin_lock(&rq->lock); - rq->prev_clock_raw = now; - rq->clock += delta_ns; - spin_unlock(&rq->lock); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); - static void __resched_task(struct task_struct *p, int tif_bit); static inline void resched_task(struct task_struct *p) @@ -1283,7 +1154,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); spin_lock(&rq->lock); - __update_rq_clock(rq); + update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); spin_unlock(&rq->lock); @@ -4476,19 +4347,11 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; - u64 next_tick = rq->tick_timestamp + TICK_NSEC; + + sched_clock_tick(); spin_lock(&rq->lock); - __update_rq_clock(rq); - /* - * Let rq->clock advance by at least TICK_NSEC: - */ - if (unlikely(rq->clock < next_tick)) { - rq->clock = next_tick; - rq->clock_underflows++; - } - rq->tick_timestamp = rq->clock; - update_last_tick_seen(rq); + update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); @@ -4642,7 +4505,7 @@ need_resched_nonpreemptible: * Do the rq-clock update outside the rq lock: */ local_irq_disable(); - __update_rq_clock(rq); + update_rq_clock(rq); spin_lock(&rq->lock); clear_tsk_need_resched(prev); @@ -8226,8 +8089,6 @@ void __init sched_init(void) spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->clock = 1; - update_last_tick_seen(rq); init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -8371,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep); static void normalize_task(struct rq *rq, struct task_struct *p) { int on_rq; + update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) @@ -8402,7 +8264,6 @@ void normalize_rt_tasks(void) 
p->se.sleep_start = 0; p->se.block_start = 0; #endif - task_rq(p)->clock = 0; if (!rt_task(p)) { /* diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c new file mode 100644 index 0000000..9c597e3 --- /dev/null +++ b/kernel/sched_clock.c @@ -0,0 +1,236 @@ +/* + * sched_clock for unstable cpu clocks + * + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra + * + * Based on code by: + * Ingo Molnar + * Guillaume Chazarain + * + * Create a semi stable clock from a mixture of other events, including: + * - gtod + * - jiffies + * - sched_clock() + * - explicit idle events + * + * We use gtod as base and the unstable clock deltas. The deltas are filtered, + * making it monotonic and keeping it within an expected window. This window + * is set up using jiffies. + * + * Furthermore, explicit sleep and wakeup hooks allow us to account for time + * that is otherwise invisible (TSC gets stopped). + * + * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat + * consistent between cpus (never more than 1 jiffies difference). + */ +#include +#include +#include +#include +#include + + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + +struct sched_clock_data { + /* + * Raw spinlock - this is a special case: this might be called + * from within instrumentation code so we dont want to do any + * instrumentation ourselves. + */ + raw_spinlock_t lock; + + unsigned long prev_jiffies; + u64 prev_raw; + u64 tick_raw; + u64 tick_gtod; + u64 clock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + +static inline struct sched_clock_data *this_scd(void) +{ + return &__get_cpu_var(sched_clock_data); +} + +static inline struct sched_clock_data *cpu_sdc(int cpu) +{ + return &per_cpu(sched_clock_data, cpu); +} + +void sched_clock_init(void) +{ + u64 ktime_now = ktime_to_ns(ktime_get()); + u64 now = 0; + int cpu; + + for_each_possible_cpu(cpu) { + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + scd->prev_jiffies = jiffies; + scd->prev_raw = now; + scd->tick_raw = now; + scd->tick_gtod = ktime_now; + scd->clock = ktime_now; + } +} + +/* + * update the percpu scd from the raw @now value + * + * - filter out backward motion + * - use jiffies to generate a min,max window to clip the raw values + */ +static void __update_sched_clock(struct sched_clock_data *scd, u64 now) +{ + unsigned long now_jiffies = jiffies; + long delta_jiffies = now_jiffies - scd->prev_jiffies; + u64 clock = scd->clock; + u64 min_clock, max_clock; + s64 delta = now - scd->prev_raw; + + WARN_ON_ONCE(!irqs_disabled()); + min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; + + if (unlikely(delta < 0)) { + clock++; + goto out; + } + + max_clock = min_clock + TICK_NSEC; + + if (unlikely(clock + delta > max_clock)) { + if (clock < max_clock) + clock = max_clock; + else + clock++; + } else { + clock += delta; + } + + out: + if (unlikely(clock < min_clock)) + clock = min_clock; + + scd->prev_raw = now; + scd->prev_jiffies = now_jiffies; + scd->clock = clock; +} + +static void lock_double_clock(struct sched_clock_data *data1, + struct sched_clock_data *data2) +{ + if (data1 < data2) { + __raw_spin_lock(&data1->lock); + __raw_spin_lock(&data2->lock); + } else { + __raw_spin_lock(&data2->lock); + __raw_spin_lock(&data1->lock); + } +} + +u64 sched_clock_cpu(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + u64 now, clock; + + WARN_ON_ONCE(!irqs_disabled()); + now = sched_clock(); + + if (cpu != raw_smp_processor_id()) { + /* + * in order 
to update a remote cpu's clock based on our + * unstable raw time rebase it against: + * tick_raw (offset between raw counters) + * tick_gotd (tick offset between cpus) + */ + struct sched_clock_data *my_scd = this_scd(); + + lock_double_clock(scd, my_scd); + + now -= my_scd->tick_raw; + now += scd->tick_raw; + + now -= my_scd->tick_gtod; + now += scd->tick_gtod; + + __raw_spin_unlock(&my_scd->lock); + } else { + __raw_spin_lock(&scd->lock); + } + + __update_sched_clock(scd, now); + clock = scd->clock; + + __raw_spin_unlock(&scd->lock); + + return clock; +} + +void sched_clock_tick(void) +{ + struct sched_clock_data *scd = this_scd(); + u64 now, now_gtod; + + WARN_ON_ONCE(!irqs_disabled()); + + now = sched_clock(); + now_gtod = ktime_to_ns(ktime_get()); + + __raw_spin_lock(&scd->lock); + __update_sched_clock(scd, now); + /* + * update tick_gtod after __update_sched_clock() because that will + * already observe 1 new jiffy; adding a new tick_gtod to that would + * increase the clock 2 jiffies. + */ + scd->tick_raw = now; + scd->tick_gtod = now_gtod; + __raw_spin_unlock(&scd->lock); +} + +/* + * We are going deep-idle (irqs are disabled): + */ +void sched_clock_idle_sleep_event(void) +{ + sched_clock_cpu(smp_processor_id()); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + struct sched_clock_data *scd = this_scd(); + u64 now = sched_clock(); + + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + __raw_spin_lock(&scd->lock); + scd->prev_raw = now; + scd->clock += delta_ns; + __raw_spin_unlock(&scd->lock); + + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); + +#endif + +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); +} diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 6b4a125..5f06118 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu) PN(next_balance); P(curr->pid); PN(clock); - PN(idle_clock); - PN(prev_clock_raw); - P(clock_warps); - P(clock_overflows); - P(clock_underflows); - P(clock_deep_idle_events); - PN(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d99e01f..c863663 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -959,7 +959,7 @@ static void yield_task_fair(struct rq *rq) return; if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - __update_rq_clock(rq); + update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ -- cgit v0.10.2 From aac6abca858386438d9a7233c3471d2ecfa2f704 Mon Sep 17 00:00:00 2001 From: Parag Warudkar Date: Sat, 3 May 2008 20:42:34 -0400 Subject: sched: default to n for GROUP_SCHED and FAIR_GROUP_SCHED GROUP_SCHED is confirmed to cause unacceptable latencies, see: http://lkml.org/lkml/2008/5/2/370. Mark it EXPERIMENTAL and default to no for now. 
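The Kconfig change below only flips defaults; group scheduling itself stays available for users who still want it. As a rough sketch of opting back in via .config (symbol names are the ones touched in the diff below; the exact fragment is illustrative, not part of this patch), a tree of this vintage would set:

    CONFIG_EXPERIMENTAL=y
    CONFIG_GROUP_SCHED=y
    CONFIG_FAIR_GROUP_SCHED=y
    # CONFIG_RT_GROUP_SCHED is not set

Leaving the new defaults untouched simply builds a kernel without task-group CPU bandwidth control.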
Signed-off-by: Parag Warudkar Signed-off-by: Ingo Molnar diff --git a/init/Kconfig b/init/Kconfig index fa42e6b..4c33316 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -324,7 +324,8 @@ config HAVE_UNSTABLE_SCHED_CLOCK config GROUP_SCHED bool "Group CPU scheduler" - default y + depends on EXPERIMENTAL + default n help This feature lets CPU scheduler recognize task groups and control CPU bandwidth allocation to such task groups. @@ -332,7 +333,7 @@ config GROUP_SCHED config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on GROUP_SCHED - default y + default GROUP_SCHED config RT_GROUP_SCHED bool "Group scheduling for SCHED_RR/FIFO" -- cgit v0.10.2 From 17aa7e034416e3080bc57a786d09ba0a4a044561 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 5 May 2008 13:54:19 +1000 Subject: dev_name introduction fall out fix Commit 06916639e2fed9ee475efef2747a1b7429f8fe76 ("driver-core: add dev_name() to help transition away from using bus_id") added a static inline dev_name() and used it in dev_printk. Unfortunately, drivers/edac/edac_core.h defines a macro called dev_name(). Rename the latter. Diagnosis by Tony Breeds and Michael Ellerman. Signed-off-by: Stephen Rothwell Acked-by: Doug Thompson Signed-off-by: Linus Torvalds diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h index a9aa845..b27b13c 100644 --- a/drivers/edac/edac_core.h +++ b/drivers/edac/edac_core.h @@ -97,7 +97,7 @@ extern int edac_debug_level; #define PCI_VEND_DEV(vend, dev) PCI_VENDOR_ID_ ## vend, \ PCI_DEVICE_ID_ ## vend ## _ ## dev -#define dev_name(dev) (dev)->dev_name +#define edac_dev_name(dev) (dev)->dev_name /* memory devices */ enum dev_type { diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index 63372fa..5fcd3d8 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -333,7 +333,7 @@ static int add_edac_dev_to_global_list(struct edac_device_ctl_info *edac_dev) fail0: edac_printk(KERN_WARNING, EDAC_MC, "%s (%s) %s %s already assigned %d\n", - rover->dev->bus_id, dev_name(rover), + rover->dev->bus_id, edac_dev_name(rover), rover->mod_name, rover->ctl_name, rover->dev_idx); return 1; @@ -538,7 +538,7 @@ int edac_device_add_device(struct edac_device_ctl_info *edac_dev) "'%s': DEV '%s' (%s)\n", edac_dev->mod_name, edac_dev->ctl_name, - dev_name(edac_dev), + edac_dev_name(edac_dev), edac_op_state_to_string(edac_dev->op_state)); mutex_unlock(&device_ctls_mutex); @@ -599,7 +599,7 @@ struct edac_device_ctl_info *edac_device_del_device(struct device *dev) edac_printk(KERN_INFO, EDAC_MC, "Removed device %d for %s %s: DEV %s\n", edac_dev->dev_idx, - edac_dev->mod_name, edac_dev->ctl_name, dev_name(edac_dev)); + edac_dev->mod_name, edac_dev->ctl_name, edac_dev_name(edac_dev)); return edac_dev; } diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index a4cf164..d110392 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -402,7 +402,7 @@ static int add_mc_to_global_list(struct mem_ctl_info *mci) fail0: edac_printk(KERN_WARNING, EDAC_MC, "%s (%s) %s %s already assigned %d\n", p->dev->bus_id, - dev_name(mci), p->mod_name, p->ctl_name, p->mc_idx); + edac_dev_name(mci), p->mod_name, p->ctl_name, p->mc_idx); return 1; fail1: @@ -517,7 +517,7 @@ int edac_mc_add_mc(struct mem_ctl_info *mci) /* Report action taken */ edac_mc_printk(mci, KERN_INFO, "Giving out device to '%s' '%s':" - " DEV %s\n", mci->mod_name, mci->ctl_name, dev_name(mci)); + " DEV %s\n", mci->mod_name, mci->ctl_name, edac_dev_name(mci)); mutex_unlock(&mem_ctls_mutex); return 
0; @@ -565,7 +565,7 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev) edac_printk(KERN_INFO, EDAC_MC, "Removed device %d for %s %s: DEV %s\n", mci->mc_idx, - mci->mod_name, mci->ctl_name, dev_name(mci)); + mci->mod_name, mci->ctl_name, edac_dev_name(mci)); return mci; } diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index 9b24340..22ec9d5 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -150,7 +150,7 @@ static int add_edac_pci_to_global_list(struct edac_pci_ctl_info *pci) fail0: edac_printk(KERN_WARNING, EDAC_PCI, "%s (%s) %s %s already assigned %d\n", - rover->dev->bus_id, dev_name(rover), + rover->dev->bus_id, edac_dev_name(rover), rover->mod_name, rover->ctl_name, rover->pci_idx); return 1; @@ -360,7 +360,7 @@ int edac_pci_add_device(struct edac_pci_ctl_info *pci, int edac_idx) " DEV '%s' (%s)\n", pci->mod_name, pci->ctl_name, - dev_name(pci), edac_op_state_to_string(pci->op_state)); + edac_dev_name(pci), edac_op_state_to_string(pci->op_state)); mutex_unlock(&edac_pci_ctls_mutex); return 0; @@ -415,7 +415,7 @@ struct edac_pci_ctl_info *edac_pci_del_device(struct device *dev) edac_printk(KERN_INFO, EDAC_PCI, "Removed device %d for %s %s: DEV %s\n", - pci->pci_idx, pci->mod_name, pci->ctl_name, dev_name(pci)); + pci->pci_idx, pci->mod_name, pci->ctl_name, edac_dev_name(pci)); return pci; } -- cgit v0.10.2 From cf04690885972eaba830ee761de545a6956197e6 Mon Sep 17 00:00:00 2001 From: Stefan Roscher Date: Mon, 5 May 2008 15:51:49 -0700 Subject: IB/ehca: Fix function return types Also remove duplicate assignment of local_ca_ack_delay and change min_t check for local_ca_ack_delay to u8 instead of int. Signed-off-by: Stefan Roscher Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c index 2515cbd..bc3b37d 100644 --- a/drivers/infiniband/hw/ehca/ehca_hca.c +++ b/drivers/infiniband/hw/ehca/ehca_hca.c @@ -101,7 +101,6 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props) props->max_ee = limit_uint(rblock->max_rd_ee_context); props->max_rdd = limit_uint(rblock->max_rd_domain); props->max_fmr = limit_uint(rblock->max_mr); - props->local_ca_ack_delay = limit_uint(rblock->local_ca_ack_delay); props->max_qp_rd_atom = limit_uint(rblock->max_rr_qp); props->max_ee_rd_atom = limit_uint(rblock->max_rr_ee_context); props->max_res_rd_atom = limit_uint(rblock->max_rr_hca); @@ -115,7 +114,7 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props) } props->max_pkeys = 16; - props->local_ca_ack_delay = limit_uint(rblock->local_ca_ack_delay); + props->local_ca_ack_delay = min_t(u8, rblock->local_ca_ack_delay, 255); props->max_raw_ipv6_qp = limit_uint(rblock->max_raw_ipv6_qp); props->max_raw_ethy_qp = limit_uint(rblock->max_raw_ethy_qp); props->max_mcast_grp = limit_uint(rblock->max_mcast_grp); @@ -136,7 +135,7 @@ query_device1: return ret; } -static int map_mtu(struct ehca_shca *shca, u32 fw_mtu) +static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu) { switch (fw_mtu) { case 0x1: @@ -156,7 +155,7 @@ static int map_mtu(struct ehca_shca *shca, u32 fw_mtu) } } -static int map_number_of_vls(struct ehca_shca *shca, u32 vl_cap) +static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap) { switch (vl_cap) { case 0x1: -- cgit v0.10.2 From c5057ddccbcb4bf363af628d7963a7475f4114a7 Mon Sep 17 00:00:00 2001 From: Oren Duer Date: Mon, 5 May 2008 15:56:52 -0700 Subject: mlx4_core: Support creation of FMRs with pages smaller than 4K Don't hard code a test 
against a minimum page shift of 12, since the device may support smaller pages. Test against the actual smallest page size from the device capabilities. Signed-off-by: Oren Duer Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index cb46446..03a9abc 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -551,7 +551,7 @@ int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, u64 mtt_seg; int err = -ENOMEM; - if (page_shift < 12 || page_shift >= 32) + if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32) return -EINVAL; /* All MTTs must fit in the same page */ -- cgit v0.10.2 From 5f6b1ea41b46bc63f667f9b30d939b49734c20b0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 6 May 2008 00:00:16 -0700 Subject: Revert "atm: Do not free already unregistered net device." This reverts commit 65e4113684e50cee75357ce10dc9026b0929e4e9. Unlike the other cases Pavel fixed, this case did not setup a netdev->destructor of free_netdev, therefore this change was not correct. Signed-off-by: David S. Miller diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 13858e2..9d52ebf 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -350,6 +350,7 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) list_del(&brdev->br2684_devs); write_unlock_irq(&devs_lock); unregister_netdev(net_dev); + free_netdev(net_dev); } return; } @@ -770,6 +771,7 @@ static void __exit br2684_exit(void) list_del(&brdev->br2684_devs); unregister_netdev(net_dev); + free_netdev(net_dev); } } -- cgit v0.10.2 From 1da5ea1a8bf4ddb82831528223c853821cb1c9ab Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 6 May 2008 00:04:47 -0700 Subject: iwlwifi: make IWLWIFI a tristate IWLWIFI should be a tristate so that if IWLCORE and/or IWL3945 are m and none of them is y kbuild doesn't create an empty drivers/net/wireless/built-in.o This patch also removes the pointless "default n". Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller diff --git a/drivers/net/wireless/iwlwifi/Kconfig b/drivers/net/wireless/iwlwifi/Kconfig index d5b7a76..62fb89d 100644 --- a/drivers/net/wireless/iwlwifi/Kconfig +++ b/drivers/net/wireless/iwlwifi/Kconfig @@ -1,6 +1,5 @@ config IWLWIFI - bool - default n + tristate config IWLCORE tristate "Intel Wireless Wifi Core" -- cgit v0.10.2 From 9d1045ad68fcccfaf1393cc463ab6357693e8d1d Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Tue, 6 May 2008 00:10:24 -0700 Subject: net_cls_act: act_simple dont ignore realloc code reallocation of the policy data was being ignored. It could fail. Simplify so that there is no need for reallocating. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 269ab51..1d421d0 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -79,10 +79,14 @@ static int alloc_defdata(struct tcf_defact *d, char *defdata) return 0; } -static int realloc_defdata(struct tcf_defact *d, char *defdata) +static void reset_policy(struct tcf_defact *d, char *defdata, + struct tc_defact *p) { - kfree(d->tcfd_defdata); - return alloc_defdata(d, defdata); + spin_lock_bh(&d->tcf_lock); + d->tcf_action = p->action; + memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); + strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + spin_unlock_bh(&d->tcf_lock); } static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { @@ -129,6 +133,7 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, kfree(pc); return ret; } + d->tcf_action = parm->action; ret = ACT_P_CREATED; } else { d = to_defact(pc); @@ -136,13 +141,9 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, tcf_simp_release(d, bind); return -EEXIST; } - realloc_defdata(d, defdata); + reset_policy(d, defdata, parm); } - spin_lock_bh(&d->tcf_lock); - d->tcf_action = parm->action; - spin_unlock_bh(&d->tcf_lock); - if (ret == ACT_P_CREATED) tcf_hash_insert(pc, &simp_hash_info); return ret; -- cgit v0.10.2 From 78ab88f04f44bed566d51dce0c7cbfeff6449a06 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 1 May 2008 23:41:41 +0900 Subject: libata: improve post-reset device ready test Some controllers (jmb and inic162x) use 0x77 and 0x7f to indicate that the device isn't ready yet. It looks like they use 0xff if device presence is detected but connection isn't established. 0x77 or 0x7f after connection is established and use the value from signature FIS after receiving it. This patch implements ata_check_ready(), which takes TF status value and determines whether the port is ready or not considering the above and other conditions, and use it in @check_ready() functions. This is safe as both 0x77 and 0x7f aren't valid ready status value even though they have BSY bit cleared. This fixes hot plug detection failures which can be triggered with certain drives if they aren't already spun up when the data connector is hot plugged. Tested on sil, sil24, ahci (jmb/ich), piix and inic162x combined with eight drives from all major vendors. 
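To make the status-byte convention above easier to follow in isolation, here is a minimal standalone sketch of the same decision table (an illustration only; the authoritative helper this patch actually adds is the ata_check_ready() inline in include/linux/libata.h shown in the diff below, and ATA_BUSY/ENODEV come from the usual kernel headers):

    /* sketch: classify a Task File status byte seen after reset */
    static inline int ready_status_sketch(u8 status)
    {
    	/* some controllers report 0x77 or 0x7f while the device
    	 * is still in an intermediate not-ready state
    	 */
    	if (status == 0x77 || status == 0x7f)
    		return 0;	/* not ready yet, keep polling */

    	/* 0xff means no device, or connection not yet established */
    	if (status == 0xff)
    		return -ENODEV;

    	/* otherwise the port is ready once BSY is clear */
    	return !(status & ATA_BUSY);
    }

Callers such as ahci_check_ready() and ata_sff_check_ready() in the diff below just return this classification to the reset wait loop, which keeps polling on 0 and fails fast on -ENODEV.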
Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 8cace9a..97f83fb 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1267,9 +1267,7 @@ static int ahci_check_ready(struct ata_link *link) void __iomem *port_mmio = ahci_port_base(link->ap); u8 status = readl(port_mmio + PORT_TFDATA) & 0xFF; - if (!(status & ATA_BUSY)) - return 1; - return 0; + return ata_check_ready(status); } static int ahci_softreset(struct ata_link *link, unsigned int *class, diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 2ec65a8..3c2d228 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -314,11 +314,7 @@ static int ata_sff_check_ready(struct ata_link *link) { u8 status = link->ap->ops->sff_check_status(link->ap); - if (!(status & ATA_BUSY)) - return 1; - if (status == 0xff) - return -ENODEV; - return 0; + return ata_check_ready(status); } /** diff --git a/include/linux/libata.h b/include/linux/libata.h index d1dfe87..95e6159b 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1381,6 +1381,21 @@ static inline struct ata_port *ata_shost_to_port(struct Scsi_Host *host) return *(struct ata_port **)&host->hostdata[0]; } +static inline int ata_check_ready(u8 status) +{ + /* Some controllers report 0x77 or 0x7f during intermediate + * not-ready stages. + */ + if (status == 0x77 || status == 0x7f) + return 0; + + /* 0xff indicates either no device or device not ready */ + if (status == 0xff) + return -ENODEV; + + return !(status & ATA_BUSY); +} + /************************************************************************** * PMP - drivers/ata/libata-pmp.c -- cgit v0.10.2 From cb6716c879ecf49e2af344926c6a476821812061 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 1 May 2008 10:03:08 +0900 Subject: ata_piix: verify SIDPR access before enabling it On certain configurations (certain macbooks), even though all the conditions for SIDPR access described in the datasheet are met, actually reading those registers just returns 0 and have no effect on write. Verify SIDPR is actually working before enabling it. This is reported by Ryan Roth in bz#10512. Signed-off-by: Tejun Heo Cc: Ryan Roth Signed-off-by: Jeff Garzik diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c index ea2c764..a9027b8 100644 --- a/drivers/ata/ata_piix.c +++ b/drivers/ata/ata_piix.c @@ -1348,6 +1348,8 @@ static void __devinit piix_init_sidpr(struct ata_host *host) { struct pci_dev *pdev = to_pci_dev(host->dev); struct piix_host_priv *hpriv = host->private_data; + struct ata_device *dev0 = &host->ports[0]->link.device[0]; + u32 scontrol; int i; /* check for availability */ @@ -1366,6 +1368,29 @@ static void __devinit piix_init_sidpr(struct ata_host *host) return; hpriv->sidpr = pcim_iomap_table(pdev)[PIIX_SIDPR_BAR]; + + /* SCR access via SIDPR doesn't work on some configurations. + * Give it a test drive by inhibiting power save modes which + * we'll do anyway. + */ + scontrol = piix_sidpr_read(dev0, SCR_CONTROL); + + /* if IPM is already 3, SCR access is probably working. Don't + * un-inhibit power save modes as BIOS might have inhibited + * them for a reason. 
+ */ + if ((scontrol & 0xf00) != 0x300) { + scontrol |= 0x300; + piix_sidpr_write(dev0, SCR_CONTROL, scontrol); + scontrol = piix_sidpr_read(dev0, SCR_CONTROL); + + if ((scontrol & 0xf00) != 0x300) { + dev_printk(KERN_INFO, host->dev, "SCR access via " + "SIDPR is available but doesn't work\n"); + return; + } + } + host->ports[0]->ops = &piix_sidpr_sata_ops; host->ports[1]->ops = &piix_sidpr_sata_ops; } -- cgit v0.10.2 From 07ab85de4d960b6f39395e51c1853485ad120de5 Mon Sep 17 00:00:00 2001 From: Alek Du Date: Tue, 6 May 2008 21:31:41 +0800 Subject: libata: Add Intel SCH PATA driver This patch adds Intel SCH chipsets (AF82US15W, AF82US15L, AF82UL11L) PATA controller support. Signed-off-by: Alek Du Signed-off-by: Jeff Garzik diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig index 1c11df9..1a59f30 100644 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@ -697,6 +697,15 @@ config PATA_SCC If unsure, say N. +config PATA_SCH + tristate "Intel SCH PATA support" + depends on PCI + help + This option enables support for Intel SCH PATA on the Intel + SCH (US15W, US15L, UL11L) series host controllers. + + If unsure, say N. + config PATA_BF54X tristate "Blackfin 54x ATAPI support" depends on BF542 || BF548 || BF549 diff --git a/drivers/ata/Makefile b/drivers/ata/Makefile index b693d82..674965f 100644 --- a/drivers/ata/Makefile +++ b/drivers/ata/Makefile @@ -67,6 +67,7 @@ obj-$(CONFIG_PATA_SIS) += pata_sis.o obj-$(CONFIG_PATA_TRIFLEX) += pata_triflex.o obj-$(CONFIG_PATA_IXP4XX_CF) += pata_ixp4xx_cf.o obj-$(CONFIG_PATA_SCC) += pata_scc.o +obj-$(CONFIG_PATA_SCH) += pata_sch.o obj-$(CONFIG_PATA_BF54X) += pata_bf54x.o obj-$(CONFIG_PATA_PLATFORM) += pata_platform.o obj-$(CONFIG_PATA_OF_PLATFORM) += pata_of_platform.o diff --git a/drivers/ata/pata_sch.c b/drivers/ata/pata_sch.c new file mode 100644 index 0000000..c8cc027 --- /dev/null +++ b/drivers/ata/pata_sch.c @@ -0,0 +1,206 @@ +/* + * pata_sch.c - Intel SCH PATA controllers + * + * Copyright (c) 2008 Alek Du + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +/* + * Supports: + * Intel SCH (AF82US15W, AF82US15L, AF82UL11L) chipsets -- see spec at: + * http://download.intel.com/design/chipsets/embedded/datashts/319537.pdf + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRV_NAME "pata_sch" +#define DRV_VERSION "0.2" + +/* see SCH datasheet page 351 */ +enum { + D0TIM = 0x80, /* Device 0 Timing Register */ + D1TIM = 0x84, /* Device 1 Timing Register */ + PM = 0x07, /* PIO Mode Bit Mask */ + MDM = (0x03 << 8), /* Multi-word DMA Mode Bit Mask */ + UDM = (0x07 << 16), /* Ultra DMA Mode Bit Mask */ + PPE = (1 << 30), /* Prefetch/Post Enable */ + USD = (1 << 31), /* Use Synchronous DMA */ +}; + +static int sch_init_one(struct pci_dev *pdev, + const struct pci_device_id *ent); +static void sch_set_piomode(struct ata_port *ap, struct ata_device *adev); +static void sch_set_dmamode(struct ata_port *ap, struct ata_device *adev); + +static const struct pci_device_id sch_pci_tbl[] = { + /* Intel SCH PATA Controller */ + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_SCH_IDE), 0 }, + { } /* terminate list */ +}; + +static struct pci_driver sch_pci_driver = { + .name = DRV_NAME, + .id_table = sch_pci_tbl, + .probe = sch_init_one, + .remove = ata_pci_remove_one, +#ifdef CONFIG_PM + .suspend = ata_pci_device_suspend, + .resume = ata_pci_device_resume, +#endif +}; + +static struct scsi_host_template sch_sht = { + ATA_BMDMA_SHT(DRV_NAME), +}; + +static struct ata_port_operations sch_pata_ops = { + .inherits = &ata_bmdma_port_ops, + .cable_detect = ata_cable_unknown, + .set_piomode = sch_set_piomode, + .set_dmamode = sch_set_dmamode, +}; + +static struct ata_port_info sch_port_info = { + .flags = 0, + .pio_mask = ATA_PIO4, /* pio0-4 */ + .mwdma_mask = ATA_MWDMA2, /* mwdma0-2 */ + .udma_mask = ATA_UDMA5, /* udma0-5 */ + .port_ops = &sch_pata_ops, +}; + +MODULE_AUTHOR("Alek Du "); +MODULE_DESCRIPTION("SCSI low-level driver for Intel SCH PATA controllers"); +MODULE_LICENSE("GPL"); +MODULE_DEVICE_TABLE(pci, sch_pci_tbl); +MODULE_VERSION(DRV_VERSION); + +/** + * sch_set_piomode - Initialize host controller PATA PIO timings + * @ap: Port whose timings we are configuring + * @adev: ATA device + * + * Set PIO mode for device, in host controller PCI config space. + * + * LOCKING: + * None (inherited from caller). + */ + +static void sch_set_piomode(struct ata_port *ap, struct ata_device *adev) +{ + unsigned int pio = adev->pio_mode - XFER_PIO_0; + struct pci_dev *dev = to_pci_dev(ap->host->dev); + unsigned int port = adev->devno ? D1TIM : D0TIM; + unsigned int data; + + pci_read_config_dword(dev, port, &data); + /* see SCH datasheet page 351 */ + /* set PIO mode */ + data &= ~(PM | PPE); + data |= pio; + /* enable PPE for block device */ + if (adev->class == ATA_DEV_ATA) + data |= PPE; + pci_write_config_dword(dev, port, data); +} + +/** + * sch_set_dmamode - Initialize host controller PATA DMA timings + * @ap: Port whose timings we are configuring + * @adev: ATA device + * + * Set MW/UDMA mode for device, in host controller PCI config space. + * + * LOCKING: + * None (inherited from caller). + */ + +static void sch_set_dmamode(struct ata_port *ap, struct ata_device *adev) +{ + unsigned int dma_mode = adev->dma_mode; + struct pci_dev *dev = to_pci_dev(ap->host->dev); + unsigned int port = adev->devno ? 
D1TIM : D0TIM; + unsigned int data; + + pci_read_config_dword(dev, port, &data); + /* see SCH datasheet page 351 */ + if (dma_mode >= XFER_UDMA_0) { + /* enable Synchronous DMA mode */ + data |= USD; + data &= ~UDM; + data |= (dma_mode - XFER_UDMA_0) << 16; + } else { /* must be MWDMA mode, since we masked SWDMA already */ + data &= ~(USD | MDM); + data |= (dma_mode - XFER_MW_DMA_0) << 8; + } + pci_write_config_dword(dev, port, data); +} + +/** + * sch_init_one - Register SCH ATA PCI device with kernel services + * @pdev: PCI device to register + * @ent: Entry in sch_pci_tbl matching with @pdev + * + * LOCKING: + * Inherited from PCI layer (may sleep). + * + * RETURNS: + * Zero on success, or -ERRNO value. + */ + +static int __devinit sch_init_one(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + static int printed_version; + const struct ata_port_info *ppi[] = { &sch_port_info, NULL }; + struct ata_host *host; + int rc; + + if (!printed_version++) + dev_printk(KERN_DEBUG, &pdev->dev, + "version " DRV_VERSION "\n"); + + /* enable device and prepare host */ + rc = pcim_enable_device(pdev); + if (rc) + return rc; + rc = ata_pci_sff_prepare_host(pdev, ppi, &host); + if (rc) + return rc; + pci_set_master(pdev); + return ata_pci_sff_activate_host(host, ata_sff_interrupt, &sch_sht); +} + +static int __init sch_init(void) +{ + return pci_register_driver(&sch_pci_driver); +} + +static void __exit sch_exit(void) +{ + pci_unregister_driver(&sch_pci_driver); +} + +module_init(sch_init); +module_exit(sch_exit); -- cgit v0.10.2 From 8e7decdb8b132ee970a2636931b7653dec6af472 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 02:07:51 -0400 Subject: sata_mv more cosmetic changes More cosmetic changes; no code changes. -- try and improve consistency of naming. -- add missing _OFS to tails of register offset definitions. -- rename mv_setup_ifctl() to mv_setup_ifcfg(), since that's what it really does. 
-- remove/move some dead comments Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 842b1a15..4eabb73 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -91,9 +91,9 @@ enum { MV_IRQ_COAL_TIME_THRESHOLD = (MV_IRQ_COAL_REG_BASE + 0xd0), MV_SATAHC0_REG_BASE = 0x20000, - MV_FLASH_CTL = 0x1046c, - MV_GPIO_PORT_CTL = 0x104f0, - MV_RESET_CFG = 0x180d8, + MV_FLASH_CTL_OFS = 0x1046c, + MV_GPIO_PORT_CTL_OFS = 0x104f0, + MV_RESET_CFG_OFS = 0x180d8, MV_PCI_REG_SZ = MV_MAJOR_REG_AREA_SZ, MV_SATAHC_REG_SZ = MV_MAJOR_REG_AREA_SZ, @@ -147,18 +147,21 @@ enum { /* PCI interface registers */ PCI_COMMAND_OFS = 0xc00, + PCI_COMMAND_MRDTRIG = (1 << 7), /* PCI Master Read Trigger */ PCI_MAIN_CMD_STS_OFS = 0xd30, STOP_PCI_MASTER = (1 << 2), PCI_MASTER_EMPTY = (1 << 3), GLOB_SFT_RST = (1 << 4), - MV_PCI_MODE = 0xd00, + MV_PCI_MODE_OFS = 0xd00, + MV_PCI_MODE_MASK = 0x30, + MV_PCI_EXP_ROM_BAR_CTL = 0xd2c, MV_PCI_DISC_TIMER = 0xd04, MV_PCI_MSI_TRIGGER = 0xc38, MV_PCI_SERR_MASK = 0xc28, - MV_PCI_XBAR_TMOUT = 0x1d04, + MV_PCI_XBAR_TMOUT_OFS = 0x1d04, MV_PCI_ERR_LOW_ADDRESS = 0x1d40, MV_PCI_ERR_HIGH_ADDRESS = 0x1d44, MV_PCI_ERR_ATTRIBUTE = 0x1d48, @@ -225,16 +228,18 @@ enum { PHY_MODE4 = 0x314, PHY_MODE2 = 0x330, SATA_IFCTL_OFS = 0x344, + SATA_TESTCTL_OFS = 0x348, SATA_IFSTAT_OFS = 0x34c, VENDOR_UNIQUE_FIS_OFS = 0x35c, - FIS_CFG_OFS = 0x360, - FIS_CFG_SINGLE_SYNC = (1 << 16), /* SYNC on DMA activation */ + FISCFG_OFS = 0x360, + FISCFG_WAIT_DEV_ERR = (1 << 8), /* wait for host on DevErr */ + FISCFG_SINGLE_SYNC = (1 << 16), /* SYNC on DMA activation */ MV5_PHY_MODE = 0x74, - MV5_LT_MODE = 0x30, - MV5_PHY_CTL = 0x0C, - SATA_INTERFACE_CFG = 0x050, + MV5_LTMODE_OFS = 0x30, + MV5_PHY_CTL_OFS = 0x0C, + SATA_INTERFACE_CFG_OFS = 0x050, MV_M2_PREAMP_MASK = 0x7e0, @@ -332,10 +337,16 @@ enum { EDMA_CMD_OFS = 0x28, /* EDMA command register */ EDMA_EN = (1 << 0), /* enable EDMA */ EDMA_DS = (1 << 1), /* disable EDMA; self-negated */ - ATA_RST = (1 << 2), /* reset trans/link/phy */ + EDMA_RESET = (1 << 2), /* reset eng/trans/link/phy */ + + EDMA_STATUS_OFS = 0x30, /* EDMA engine status */ + EDMA_STATUS_CACHE_EMPTY = (1 << 6), /* GenIIe command cache empty */ + EDMA_STATUS_IDLE = (1 << 7), /* GenIIe EDMA enabled/idle */ - EDMA_IORDY_TMOUT = 0x34, - EDMA_ARB_CFG = 0x38, + EDMA_IORDY_TMOUT_OFS = 0x34, + EDMA_ARB_CFG_OFS = 0x38, + + EDMA_HALTCOND_OFS = 0x60, /* GenIIe halt conditions */ GEN_II_NCQ_MAX_SECTORS = 256, /* max sects/io on Gen2 w/NCQ */ @@ -359,6 +370,7 @@ enum { #define IS_GEN_I(hpriv) ((hpriv)->hp_flags & MV_HP_GEN_I) #define IS_GEN_II(hpriv) ((hpriv)->hp_flags & MV_HP_GEN_II) #define IS_GEN_IIE(hpriv) ((hpriv)->hp_flags & MV_HP_GEN_IIE) +#define IS_PCIE(hpriv) ((hpriv)->hp_flags & MV_HP_PCIE) #define HAS_PCI(host) (!((host)->ports[0]->flags & MV_FLAG_SOC)) #define WINDOW_CTRL(i) (0x20030 + ((i) << 4)) @@ -1059,22 +1071,22 @@ static void mv6_dev_config(struct ata_device *adev) static void mv_config_fbs(void __iomem *port_mmio, int enable_fbs) { - u32 old_fcfg, new_fcfg, old_ltmode, new_ltmode; + u32 old_fiscfg, new_fiscfg, old_ltmode, new_ltmode; /* * Various bit settings required for operation * in FIS-based switching (fbs) mode on GenIIe: */ - old_fcfg = readl(port_mmio + FIS_CFG_OFS); + old_fiscfg = readl(port_mmio + FISCFG_OFS); old_ltmode = readl(port_mmio + LTMODE_OFS); if (enable_fbs) { - new_fcfg = old_fcfg | FIS_CFG_SINGLE_SYNC; + new_fiscfg = old_fiscfg | FISCFG_SINGLE_SYNC; new_ltmode = old_ltmode | LTMODE_BIT8; } else { /* disable 
fbs */ - new_fcfg = old_fcfg & ~FIS_CFG_SINGLE_SYNC; + new_fiscfg = old_fiscfg & ~FISCFG_SINGLE_SYNC; new_ltmode = old_ltmode & ~LTMODE_BIT8; } - if (new_fcfg != old_fcfg) - writelfl(new_fcfg, port_mmio + FIS_CFG_OFS); + if (new_fiscfg != old_fiscfg) + writelfl(new_fiscfg, port_mmio + FISCFG_OFS); if (new_ltmode != old_ltmode) writelfl(new_ltmode, port_mmio + LTMODE_OFS); } @@ -1894,7 +1906,7 @@ static void mv5_reset_bus(struct ata_host *host, void __iomem *mmio) static void mv5_reset_flash(struct mv_host_priv *hpriv, void __iomem *mmio) { - writel(0x0fcfffff, mmio + MV_FLASH_CTL); + writel(0x0fcfffff, mmio + MV_FLASH_CTL_OFS); } static void mv5_read_preamp(struct mv_host_priv *hpriv, int idx, @@ -1913,7 +1925,7 @@ static void mv5_enable_leds(struct mv_host_priv *hpriv, void __iomem *mmio) { u32 tmp; - writel(0, mmio + MV_GPIO_PORT_CTL); + writel(0, mmio + MV_GPIO_PORT_CTL_OFS); /* FIXME: handle MV_HP_ERRATA_50XXB2 errata */ @@ -1931,14 +1943,14 @@ static void mv5_phy_errata(struct mv_host_priv *hpriv, void __iomem *mmio, int fix_apm_sq = (hpriv->hp_flags & MV_HP_ERRATA_50XXB0); if (fix_apm_sq) { - tmp = readl(phy_mmio + MV5_LT_MODE); + tmp = readl(phy_mmio + MV5_LTMODE_OFS); tmp |= (1 << 19); - writel(tmp, phy_mmio + MV5_LT_MODE); + writel(tmp, phy_mmio + MV5_LTMODE_OFS); - tmp = readl(phy_mmio + MV5_PHY_CTL); + tmp = readl(phy_mmio + MV5_PHY_CTL_OFS); tmp &= ~0x3; tmp |= 0x1; - writel(tmp, phy_mmio + MV5_PHY_CTL); + writel(tmp, phy_mmio + MV5_PHY_CTL_OFS); } tmp = readl(phy_mmio + MV5_PHY_MODE); @@ -1956,11 +1968,6 @@ static void mv5_reset_hc_port(struct mv_host_priv *hpriv, void __iomem *mmio, { void __iomem *port_mmio = mv_port_base(mmio, port); - /* - * The datasheet warns against setting ATA_RST when EDMA is active - * (but doesn't say what the problem might be). So we first try - * to disable the EDMA engine before doing the ATA_RST operation. 
- */ mv_reset_channel(hpriv, mmio, port); ZERO(0x028); /* command */ @@ -1975,7 +1982,7 @@ static void mv5_reset_hc_port(struct mv_host_priv *hpriv, void __iomem *mmio, ZERO(0x024); /* respq outp */ ZERO(0x020); /* respq inp */ ZERO(0x02c); /* test control */ - writel(0xbc, port_mmio + EDMA_IORDY_TMOUT); + writel(0xbc, port_mmio + EDMA_IORDY_TMOUT_OFS); } #undef ZERO @@ -2021,13 +2028,13 @@ static void mv_reset_pci_bus(struct ata_host *host, void __iomem *mmio) struct mv_host_priv *hpriv = host->private_data; u32 tmp; - tmp = readl(mmio + MV_PCI_MODE); + tmp = readl(mmio + MV_PCI_MODE_OFS); tmp &= 0xff00ffff; - writel(tmp, mmio + MV_PCI_MODE); + writel(tmp, mmio + MV_PCI_MODE_OFS); ZERO(MV_PCI_DISC_TIMER); ZERO(MV_PCI_MSI_TRIGGER); - writel(0x000100ff, mmio + MV_PCI_XBAR_TMOUT); + writel(0x000100ff, mmio + MV_PCI_XBAR_TMOUT_OFS); ZERO(PCI_HC_MAIN_IRQ_MASK_OFS); ZERO(MV_PCI_SERR_MASK); ZERO(hpriv->irq_cause_ofs); @@ -2045,10 +2052,10 @@ static void mv6_reset_flash(struct mv_host_priv *hpriv, void __iomem *mmio) mv5_reset_flash(hpriv, mmio); - tmp = readl(mmio + MV_GPIO_PORT_CTL); + tmp = readl(mmio + MV_GPIO_PORT_CTL_OFS); tmp &= 0x3; tmp |= (1 << 5) | (1 << 6); - writel(tmp, mmio + MV_GPIO_PORT_CTL); + writel(tmp, mmio + MV_GPIO_PORT_CTL_OFS); } /** @@ -2121,7 +2128,7 @@ static void mv6_read_preamp(struct mv_host_priv *hpriv, int idx, void __iomem *port_mmio; u32 tmp; - tmp = readl(mmio + MV_RESET_CFG); + tmp = readl(mmio + MV_RESET_CFG_OFS); if ((tmp & (1 << 0)) == 0) { hpriv->signal[idx].amps = 0x7 << 8; hpriv->signal[idx].pre = 0x1 << 5; @@ -2137,7 +2144,7 @@ static void mv6_read_preamp(struct mv_host_priv *hpriv, int idx, static void mv6_enable_leds(struct mv_host_priv *hpriv, void __iomem *mmio) { - writel(0x00000060, mmio + MV_GPIO_PORT_CTL); + writel(0x00000060, mmio + MV_GPIO_PORT_CTL_OFS); } static void mv6_phy_errata(struct mv_host_priv *hpriv, void __iomem *mmio, @@ -2235,11 +2242,6 @@ static void mv_soc_reset_hc_port(struct mv_host_priv *hpriv, { void __iomem *port_mmio = mv_port_base(mmio, port); - /* - * The datasheet warns against setting ATA_RST when EDMA is active - * (but doesn't say what the problem might be). So we first try - * to disable the EDMA engine before doing the ATA_RST operation. - */ mv_reset_channel(hpriv, mmio, port); ZERO(0x028); /* command */ @@ -2254,7 +2256,7 @@ static void mv_soc_reset_hc_port(struct mv_host_priv *hpriv, ZERO(0x024); /* respq outp */ ZERO(0x020); /* respq inp */ ZERO(0x02c); /* test control */ - writel(0xbc, port_mmio + EDMA_IORDY_TMOUT); + writel(0xbc, port_mmio + EDMA_IORDY_TMOUT_OFS); } #undef ZERO @@ -2297,38 +2299,39 @@ static void mv_soc_reset_bus(struct ata_host *host, void __iomem *mmio) return; } -static void mv_setup_ifctl(void __iomem *port_mmio, int want_gen2i) +static void mv_setup_ifcfg(void __iomem *port_mmio, int want_gen2i) { - u32 ifctl = readl(port_mmio + SATA_INTERFACE_CFG); + u32 ifcfg = readl(port_mmio + SATA_INTERFACE_CFG_OFS); - ifctl = (ifctl & 0xf7f) | 0x9b1000; /* from chip spec */ + ifcfg = (ifcfg & 0xf7f) | 0x9b1000; /* from chip spec */ if (want_gen2i) - ifctl |= (1 << 7); /* enable gen2i speed */ - writelfl(ifctl, port_mmio + SATA_INTERFACE_CFG); + ifcfg |= (1 << 7); /* enable gen2i speed */ + writelfl(ifcfg, port_mmio + SATA_INTERFACE_CFG_OFS); } -/* - * Caller must ensure that EDMA is not active, - * by first doing mv_stop_edma() where needed. 
- */ static void mv_reset_channel(struct mv_host_priv *hpriv, void __iomem *mmio, unsigned int port_no) { void __iomem *port_mmio = mv_port_base(mmio, port_no); + /* + * The datasheet warns against setting EDMA_RESET when EDMA is active + * (but doesn't say what the problem might be). So we first try + * to disable the EDMA engine before doing the EDMA_RESET operation. + */ mv_stop_edma_engine(port_mmio); - writelfl(ATA_RST, port_mmio + EDMA_CMD_OFS); + writelfl(EDMA_RESET, port_mmio + EDMA_CMD_OFS); if (!IS_GEN_I(hpriv)) { - /* Enable 3.0gb/s link speed */ - mv_setup_ifctl(port_mmio, 1); + /* Enable 3.0gb/s link speed: this survives EDMA_RESET */ + mv_setup_ifcfg(port_mmio, 1); } /* - * Strobing ATA_RST here causes a hard reset of the SATA transport, + * Strobing EDMA_RESET here causes a hard reset of the SATA transport, * link, and physical layers. It resets all SATA interface registers * (except for SATA_INTERFACE_CFG), and issues a COMRESET to the dev. */ - writelfl(ATA_RST, port_mmio + EDMA_CMD_OFS); + writelfl(EDMA_RESET, port_mmio + EDMA_CMD_OFS); udelay(25); /* allow reset propagation */ writelfl(0, port_mmio + EDMA_CMD_OFS); @@ -2392,7 +2395,7 @@ static int mv_hardreset(struct ata_link *link, unsigned int *class, sata_scr_read(link, SCR_STATUS, &sstatus); if (!IS_GEN_I(hpriv) && ++attempts >= 5 && sstatus == 0x121) { /* Force 1.5gb/s link speed and try again */ - mv_setup_ifctl(mv_ap_base(ap), 0); + mv_setup_ifcfg(mv_ap_base(ap), 0); if (time_after(jiffies + HZ, deadline)) extra = HZ; /* only extend it once, max */ } @@ -2590,6 +2593,7 @@ static int mv_chip_id(struct ata_host *host, unsigned int board_idx) " and avoid the final two gigabytes on" " all RocketRAID BIOS initialized drives.\n"); } + /* drop through */ case chip_6042: hpriv->ops = &mv6xxx_ops; hp_flags |= MV_HP_GEN_IIE; -- cgit v0.10.2 From 616d4a98ad8749ebe17a8fcac67df65c321350ac Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 02:08:32 -0400 Subject: sata_mv pci features Some of the GenIIe EDMA optimizations should not be used for non-PCI (SOC) devices, and nor for certain configurations of conventional PCI (non PCI-X, PCIe) buses. Logic taken/simplified from that in the Marvell proprietary driver. Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 4eabb73..10ef968 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -361,6 +361,7 @@ enum { MV_HP_GEN_II = (1 << 7), /* Generation II: 60xx */ MV_HP_GEN_IIE = (1 << 8), /* Generation IIE: 6042/7042 */ MV_HP_PCIE = (1 << 9), /* PCIe bus/regs: 7042 */ + MV_HP_CUT_THROUGH = (1 << 10), /* can use EDMA cut-through */ /* Port private flags (pp_flags) */ MV_PP_FLAG_EDMA_EN = (1 << 0), /* is EDMA engine enabled? 
*/ @@ -1110,8 +1111,10 @@ static void mv_edma_cfg(struct ata_port *ap, int want_ncq) else if (IS_GEN_IIE(hpriv)) { cfg |= (1 << 23); /* do not mask PM field in rx'd FIS */ cfg |= (1 << 22); /* enab 4-entry host queue cache */ - cfg |= (1 << 18); /* enab early completion */ - cfg |= (1 << 17); /* enab cut-through (dis stor&forwrd) */ + if (HAS_PCI(ap->host)) + cfg |= (1 << 18); /* enab early completion */ + if (hpriv->hp_flags & MV_HP_CUT_THROUGH) + cfg |= (1 << 17); /* enab cut-thru (dis stor&forwrd) */ if (want_ncq && sata_pmp_attached(ap)) { cfg |= EDMA_CFG_EDMA_FBS; /* FIS-based switching */ @@ -2496,6 +2499,34 @@ static void mv_port_init(struct ata_ioports *port, void __iomem *port_mmio) readl(port_mmio + EDMA_ERR_IRQ_MASK_OFS)); } +static unsigned int mv_in_pcix_mode(struct ata_host *host) +{ + struct mv_host_priv *hpriv = host->private_data; + void __iomem *mmio = hpriv->base; + u32 reg; + + if (!HAS_PCI(host) || !IS_PCIE(hpriv)) + return 0; /* not PCI-X capable */ + reg = readl(mmio + MV_PCI_MODE_OFS); + if ((reg & MV_PCI_MODE_MASK) == 0) + return 0; /* conventional PCI mode */ + return 1; /* chip is in PCI-X mode */ +} + +static int mv_pci_cut_through_okay(struct ata_host *host) +{ + struct mv_host_priv *hpriv = host->private_data; + void __iomem *mmio = hpriv->base; + u32 reg; + + if (!mv_in_pcix_mode(host)) { + reg = readl(mmio + PCI_COMMAND_OFS); + if (reg & PCI_COMMAND_MRDTRIG) + return 0; /* not okay */ + } + return 1; /* okay */ +} + static int mv_chip_id(struct ata_host *host, unsigned int board_idx) { struct pci_dev *pdev = to_pci_dev(host->dev); @@ -2563,7 +2594,7 @@ static int mv_chip_id(struct ata_host *host, unsigned int board_idx) break; case chip_7042: - hp_flags |= MV_HP_PCIE; + hp_flags |= MV_HP_PCIE | MV_HP_CUT_THROUGH; if (pdev->vendor == PCI_VENDOR_ID_TTI && (pdev->device == 0x2300 || pdev->device == 0x2310)) { @@ -2597,6 +2628,8 @@ static int mv_chip_id(struct ata_host *host, unsigned int board_idx) case chip_6042: hpriv->ops = &mv6xxx_ops; hp_flags |= MV_HP_GEN_IIE; + if (board_idx == chip_6042 && mv_pci_cut_through_okay(host)) + hp_flags |= MV_HP_CUT_THROUGH; switch (pdev->revision) { case 0x0: -- cgit v0.10.2 From 9b2c4e0bae854fb5e88c9cacc0dacf21631c5cb0 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 02:09:14 -0400 Subject: sata_mv wait for empty+idle When performing EH, it is recommended to wait for the EDMA engine to empty out requests-in-progress before disabling EDMA. Introduce code to poll the EDMA_STATUS register for idle/empty bits before disabling EDMA. For non-EH operation, this will normally exit without delay, other than the register read. A later series of patches may focus on eliminating this and various other register reads (when possible) throughout the driver, but for now we're focussing on solid reliablity. Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 10ef968..6929962 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -888,6 +888,25 @@ static void mv_start_dma(struct ata_port *ap, void __iomem *port_mmio, } } +static void mv_wait_for_edma_empty_idle(struct ata_port *ap) +{ + void __iomem *port_mmio = mv_ap_base(ap); + const u32 empty_idle = (EDMA_STATUS_CACHE_EMPTY | EDMA_STATUS_IDLE); + const int per_loop = 5, timeout = (15 * 1000 / per_loop); + int i; + + /* + * Wait for the EDMA engine to finish transactions in progress. 
+ */ + for (i = 0; i < timeout; ++i) { + u32 edma_stat = readl(port_mmio + EDMA_STATUS_OFS); + if ((edma_stat & empty_idle) == empty_idle) + break; + udelay(per_loop); + } + /* ata_port_printk(ap, KERN_INFO, "%s: %u+ usecs\n", __func__, i); */ +} + /** * mv_stop_edma_engine - Disable eDMA engine * @port_mmio: io base address @@ -920,6 +939,7 @@ static int mv_stop_edma(struct ata_port *ap) if (!(pp->pp_flags & MV_PP_FLAG_EDMA_EN)) return 0; pp->pp_flags &= ~MV_PP_FLAG_EDMA_EN; + mv_wait_for_edma_empty_idle(ap); if (mv_stop_edma_engine(port_mmio)) { ata_port_printk(ap, KERN_ERR, "Unable to stop eDMA\n"); return -EIO; -- cgit v0.10.2 From 3e4a139107e497a741c26f8a377a10f214d63ec1 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 02:10:02 -0400 Subject: sata_mv new mv_qc_defer method The EDMA engine cannot tolerate a mix of NCQ/non-NCQ commands, and cannot be used for PIO at all. So we need to prevent libata from trying to feed us such mixtures. Introduce mv_qc_defer() for this purpose, and use it for all chip versions. Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 6929962..0545a49 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -492,6 +492,7 @@ static int mv5_scr_read(struct ata_port *ap, unsigned int sc_reg_in, u32 *val); static int mv5_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val); static int mv_port_start(struct ata_port *ap); static void mv_port_stop(struct ata_port *ap); +static int mv_qc_defer(struct ata_queued_cmd *qc); static void mv_qc_prep(struct ata_queued_cmd *qc); static void mv_qc_prep_iie(struct ata_queued_cmd *qc); static unsigned int mv_qc_issue(struct ata_queued_cmd *qc); @@ -561,6 +562,7 @@ static struct scsi_host_template mv6_sht = { static struct ata_port_operations mv5_ops = { .inherits = &ata_sff_port_ops, + .qc_defer = mv_qc_defer, .qc_prep = mv_qc_prep, .qc_issue = mv_qc_issue, @@ -579,7 +581,6 @@ static struct ata_port_operations mv5_ops = { static struct ata_port_operations mv6_ops = { .inherits = &mv5_ops, - .qc_defer = sata_pmp_qc_defer_cmd_switch, .dev_config = mv6_dev_config, .scr_read = mv_scr_read, .scr_write = mv_scr_write, @@ -592,7 +593,6 @@ static struct ata_port_operations mv6_ops = { static struct ata_port_operations mv_iie_ops = { .inherits = &mv6_ops, - .qc_defer = ata_std_qc_defer, /* FIS-based switching */ .dev_config = ATA_OP_NULL, .qc_prep = mv_qc_prep_iie, }; @@ -1090,6 +1090,45 @@ static void mv6_dev_config(struct ata_device *adev) } } +static int mv_qc_defer(struct ata_queued_cmd *qc) +{ + struct ata_link *link = qc->dev->link; + struct ata_port *ap = link->ap; + struct mv_port_priv *pp = ap->private_data; + + /* + * If the port is completely idle, then allow the new qc. + */ + if (ap->nr_active_links == 0) + return 0; + + if (pp->pp_flags & MV_PP_FLAG_EDMA_EN) { + /* + * The port is operating in host queuing mode (EDMA). + * It can accomodate a new qc if the qc protocol + * is compatible with the current host queue mode. + */ + if (pp->pp_flags & MV_PP_FLAG_NCQ_EN) { + /* + * The host queue (EDMA) is in NCQ mode. + * If the new qc is also an NCQ command, + * then allow the new qc. + */ + if (qc->tf.protocol == ATA_PROT_NCQ) + return 0; + } else { + /* + * The host queue (EDMA) is in non-NCQ, DMA mode. + * If the new qc is also a non-NCQ, DMA command, + * then allow the new qc. 
+ */ + if (qc->tf.protocol == ATA_PROT_DMA) + return 0; + } + } + return ATA_DEFER_PORT; +} + static void mv_config_fbs(void __iomem *port_mmio, int enable_fbs) { u32 old_fiscfg, new_fiscfg, old_ltmode, new_ltmode; -- cgit v0.10.2 From dd2890f60f8e15f14c8eb132779b2f15c49d1203 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 02:10:56 -0400 Subject: sata_mv errata workaround for sata25 part 1 Part 1 of workaround for errata "sata#25" for the 60x1 series (the second half of this errata workaround is still in development. Bit22 of the GPIO port has to be set "on" when in NCQ mode. Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 0545a49..fbccf21 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -1151,6 +1151,21 @@ static void mv_config_fbs(void __iomem *port_mmio, int enable_fbs) writelfl(new_ltmode, port_mmio + LTMODE_OFS); } +static void mv_60x1_errata_sata25(struct ata_port *ap, int want_ncq) +{ + struct mv_host_priv *hpriv = ap->host->private_data; + u32 old, new; + + /* workaround for 88SX60x1 FEr SATA#25 (part 1) */ + old = readl(hpriv->base + MV_GPIO_PORT_CTL_OFS); + if (want_ncq) + new = old | (1 << 22); + else + new = old & ~(1 << 22); + if (new != old) + writel(new, hpriv->base + MV_GPIO_PORT_CTL_OFS); +} + static void mv_edma_cfg(struct ata_port *ap, int want_ncq) { u32 cfg; @@ -1164,10 +1179,11 @@ static void mv_edma_cfg(struct ata_port *ap, int want_ncq) if (IS_GEN_I(hpriv)) cfg |= (1 << 8); /* enab config burst size mask */ - else if (IS_GEN_II(hpriv)) + else if (IS_GEN_II(hpriv)) { cfg |= EDMA_CFG_RD_BRST_EXT | EDMA_CFG_WR_BUFF_LEN; + mv_60x1_errata_sata25(ap, want_ncq); - else if (IS_GEN_IIE(hpriv)) { + } else if (IS_GEN_IIE(hpriv)) { cfg |= (1 << 23); /* do not mask PM field in rx'd FIS */ cfg |= (1 << 22); /* enab 4-entry host queue cache */ if (HAS_PCI(ap->host)) -- cgit v0.10.2 From 00f42eabb204c68fa64ef72de834e74aca15c81f Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 02:11:45 -0400 Subject: sata_mv rearrange mv_config_fbs Rearrange mv_config_fbs() to more closely follow the (corrected) datasheet recommendations for NCQ and FIS-based switching (FBS). Also, maintain a port flag to let us know when FBS is enabled. We will make more use of that flag later in this patch series. Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index fbccf21..e4d411c 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -366,6 +366,7 @@ enum { /* Port private flags (pp_flags) */ MV_PP_FLAG_EDMA_EN = (1 << 0), /* is EDMA engine enabled? */ MV_PP_FLAG_NCQ_EN = (1 << 1), /* is EDMA set up for NCQ? */ + MV_PP_FLAG_FBS_EN = (1 << 2), /* is EDMA set up for FBS? 
*/ }; #define IS_GEN_I(hpriv) ((hpriv)->hp_flags & MV_HP_GEN_I) @@ -1129,26 +1130,31 @@ static int mv_qc_defer(struct ata_queued_cmd *qc) return ATA_DEFER_PORT; } -static void mv_config_fbs(void __iomem *port_mmio, int enable_fbs) +static void mv_config_fbs(void __iomem *port_mmio, int want_ncq, int want_fbs) { - u32 old_fiscfg, new_fiscfg, old_ltmode, new_ltmode; - /* - * Various bit settings required for operation - * in FIS-based switching (fbs) mode on GenIIe: - */ - old_fiscfg = readl(port_mmio + FISCFG_OFS); - old_ltmode = readl(port_mmio + LTMODE_OFS); - if (enable_fbs) { - new_fiscfg = old_fiscfg | FISCFG_SINGLE_SYNC; - new_ltmode = old_ltmode | LTMODE_BIT8; - } else { /* disable fbs */ - new_fiscfg = old_fiscfg & ~FISCFG_SINGLE_SYNC; - new_ltmode = old_ltmode & ~LTMODE_BIT8; + u32 new_fiscfg, old_fiscfg; + u32 new_ltmode, old_ltmode; + u32 new_haltcond, old_haltcond; + + old_fiscfg = readl(port_mmio + FISCFG_OFS); + old_ltmode = readl(port_mmio + LTMODE_OFS); + old_haltcond = readl(port_mmio + EDMA_HALTCOND_OFS); + + new_fiscfg = old_fiscfg & ~(FISCFG_SINGLE_SYNC | FISCFG_WAIT_DEV_ERR); + new_ltmode = old_ltmode & ~LTMODE_BIT8; + new_haltcond = old_haltcond | EDMA_ERR_DEV; + + if (want_fbs) { + new_fiscfg = old_fiscfg | FISCFG_SINGLE_SYNC; + new_ltmode = old_ltmode | LTMODE_BIT8; } + if (new_fiscfg != old_fiscfg) writelfl(new_fiscfg, port_mmio + FISCFG_OFS); if (new_ltmode != old_ltmode) writelfl(new_ltmode, port_mmio + LTMODE_OFS); + if (new_haltcond != old_haltcond) + writelfl(new_haltcond, port_mmio + EDMA_HALTCOND_OFS); } static void mv_60x1_errata_sata25(struct ata_port *ap, int want_ncq) @@ -1175,6 +1181,7 @@ static void mv_edma_cfg(struct ata_port *ap, int want_ncq) /* set up non-NCQ EDMA configuration */ cfg = EDMA_CFG_Q_DEPTH; /* always 0x1f for *all* chips */ + pp->pp_flags &= ~MV_PP_FLAG_FBS_EN; if (IS_GEN_I(hpriv)) cfg |= (1 << 8); /* enab config burst size mask */ @@ -1184,19 +1191,30 @@ static void mv_edma_cfg(struct ata_port *ap, int want_ncq) mv_60x1_errata_sata25(ap, want_ncq); } else if (IS_GEN_IIE(hpriv)) { + int want_fbs = sata_pmp_attached(ap); + /* + * Possible future enhancement: + * + * The chip can use FBS with non-NCQ, if we allow it, + * But first we need to have the error handling in place + * for this mode (datasheet section 7.3.15.4.2.3). + * So disallow non-NCQ FBS for now. + */ + want_fbs &= want_ncq; + + mv_config_fbs(port_mmio, want_ncq, want_fbs); + + if (want_fbs) { + pp->pp_flags |= MV_PP_FLAG_FBS_EN; + cfg |= EDMA_CFG_EDMA_FBS; /* FIS-based switching */ + } + cfg |= (1 << 23); /* do not mask PM field in rx'd FIS */ cfg |= (1 << 22); /* enab 4-entry host queue cache */ if (HAS_PCI(ap->host)) cfg |= (1 << 18); /* enab early completion */ if (hpriv->hp_flags & MV_HP_CUT_THROUGH) cfg |= (1 << 17); /* enab cut-thru (dis stor&forwrd) */ - - if (want_ncq && sata_pmp_attached(ap)) { - cfg |= EDMA_CFG_EDMA_FBS; /* FIS-based switching */ - mv_config_fbs(port_mmio, 1); - } else { - mv_config_fbs(port_mmio, 0); - } } if (want_ncq) { -- cgit v0.10.2 From 37b9046a3e433a0b0c39ad1e81ec187d5be800ba Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 02:12:34 -0400 Subject: sata_mv NCQ and SError fixes for mv_err_intr Sigh. Undo some earlier changes to mv_port_intr(), so that we now read/clear SError again in all cases. Arrange the top of the function to be as close as possible to what we need for a later update (in this series) for ERR_DEV handling. 
Fix things so that libata-eh can attempt a READ_LOG_EXT_10H in response to a failed NCQ command, by just doing a local mv_eh_freeze() rather than ata_port_freeze(). This will now fully handle NCQ errors much of the time, but more fixes are needed for FBS/PMP, and for certain chip errata. Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index e4d411c..d995e0e 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -1627,7 +1627,7 @@ static void mv_unexpected_intr(struct ata_port *ap) * LOCKING: * Inherited from caller. */ -static void mv_err_intr(struct ata_port *ap, struct ata_queued_cmd *qc) +static void mv_err_intr(struct ata_port *ap) { void __iomem *port_mmio = mv_ap_base(ap); u32 edma_err_cause, eh_freeze_mask, serr = 0; @@ -1635,24 +1635,33 @@ static void mv_err_intr(struct ata_port *ap, struct ata_queued_cmd *qc) struct mv_host_priv *hpriv = ap->host->private_data; unsigned int action = 0, err_mask = 0; struct ata_eh_info *ehi = &ap->link.eh_info; - - ata_ehi_clear_desc(ehi); + struct ata_queued_cmd *qc; + int abort = 0; /* - * Read and clear the err_cause bits. This won't actually - * clear for some errors (eg. SError), but we will be doing - * a hard reset in those cases regardless, which *will* clear it. + * Read and clear the SError and err_cause bits. */ + sata_scr_read(&ap->link, SCR_ERROR, &serr); + sata_scr_write_flush(&ap->link, SCR_ERROR, serr); + edma_err_cause = readl(port_mmio + EDMA_ERR_IRQ_CAUSE_OFS); writelfl(~edma_err_cause, port_mmio + EDMA_ERR_IRQ_CAUSE_OFS); - ata_ehi_push_desc(ehi, "edma_err_cause=%08x", edma_err_cause); + ata_port_printk(ap, KERN_INFO, "%s: err_cause=%08x pp_flags=0x%x\n", + __func__, edma_err_cause, pp->pp_flags); + qc = mv_get_active_qc(ap); + ata_ehi_clear_desc(ehi); + ata_ehi_push_desc(ehi, "edma_err_cause=%08x pp_flags=%08x", + edma_err_cause, pp->pp_flags); /* * All generations share these EDMA error cause bits: */ - if (edma_err_cause & EDMA_ERR_DEV) + if (edma_err_cause & EDMA_ERR_DEV) { err_mask |= AC_ERR_DEV; + action |= ATA_EH_RESET; + ata_ehi_push_desc(ehi, "dev error"); + } if (edma_err_cause & (EDMA_ERR_D_PAR | EDMA_ERR_PRD_PAR | EDMA_ERR_CRQB_PAR | EDMA_ERR_CRPB_PAR | EDMA_ERR_INTRL_PAR)) { @@ -1684,13 +1693,6 @@ static void mv_err_intr(struct ata_port *ap, struct ata_queued_cmd *qc) ata_ehi_push_desc(ehi, "EDMA self-disable"); } if (edma_err_cause & EDMA_ERR_SERR) { - /* - * Ensure that we read our own SCR, not a pmp link SCR: - */ - ap->ops->scr_read(ap, SCR_ERROR, &serr); - /* - * Don't clear SError here; leave it for libata-eh: - */ ata_ehi_push_desc(ehi, "SError=%08x", serr); err_mask |= AC_ERR_ATA_BUS; action |= ATA_EH_RESET; @@ -1710,10 +1712,29 @@ static void mv_err_intr(struct ata_port *ap, struct ata_queued_cmd *qc) else ehi->err_mask |= err_mask; - if (edma_err_cause & eh_freeze_mask) + if (err_mask == AC_ERR_DEV) { + /* + * Cannot do ata_port_freeze() here, + * because it would kill PIO access, + * which is needed for further diagnosis. 
+ */ + mv_eh_freeze(ap); + abort = 1; + } else if (edma_err_cause & eh_freeze_mask) { + /* + * Note to self: ata_port_freeze() calls ata_port_abort() + */ ata_port_freeze(ap); - else - ata_port_abort(ap); + } else { + abort = 1; + } + + if (abort) { + if (qc) + ata_link_abort(qc->dev->link); + else + ata_port_abort(ap); + } } static void mv_process_crpb_response(struct ata_port *ap, @@ -1740,8 +1761,9 @@ static void mv_process_crpb_response(struct ata_port *ap, } } ata_status = edma_status >> CRPB_FLAG_STATUS_SHIFT; - qc->err_mask |= ac_err_mask(ata_status); - ata_qc_complete(qc); + if (!ac_err_mask(ata_status)) + ata_qc_complete(qc); + /* else: leave it for mv_err_intr() */ } else { ata_port_printk(ap, KERN_ERR, "%s: no qc for tag=%d\n", __func__, tag); @@ -1845,7 +1867,7 @@ static int mv_host_intr(struct ata_host *host, u32 main_irq_cause) * Handle chip-reported errors, or continue on to handle PIO. */ if (unlikely(port_cause & ERR_IRQ)) { - mv_err_intr(ap, mv_get_active_qc(ap)); + mv_err_intr(ap); } else if (hc_irq_cause & (DEV_IRQ << hardport)) { if (!(pp->pp_flags & MV_PP_FLAG_EDMA_EN)) { struct ata_queued_cmd *qc = mv_get_active_qc(ap); -- cgit v0.10.2 From eabd5eb1cb59bfb162e7aa23007248f2bb480816 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 02:13:27 -0400 Subject: sata_mv fix mv_host_intr bug for hc_irq_cause Remove the unwanted reads of hc_irq_cause from mv_host_intr(), thereby removing a bug whereby we were not always reading it when needed.. Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index d995e0e..31e42de 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -1818,48 +1818,61 @@ static void mv_process_crpb_entries(struct ata_port *ap, struct mv_port_priv *pp static int mv_host_intr(struct ata_host *host, u32 main_irq_cause) { struct mv_host_priv *hpriv = host->private_data; - void __iomem *mmio = hpriv->base, *hc_mmio = NULL; - u32 hc_irq_cause = 0; + void __iomem *mmio = hpriv->base, *hc_mmio; unsigned int handled = 0, port; for (port = 0; port < hpriv->n_ports; port++) { struct ata_port *ap = host->ports[port]; struct mv_port_priv *pp; - unsigned int shift, hardport, port_cause; - /* - * When we move to the second hc, flag our cached - * copies of hc_mmio (and hc_irq_cause) as invalid again. - */ - if (port == MV_PORTS_PER_HC) - hc_mmio = NULL; - /* - * Do nothing if port is not interrupting or is disabled: - */ + unsigned int p, shift, hardport, port_cause; + MV_PORT_TO_SHIFT_AND_HARDPORT(port, shift, hardport); - port_cause = (main_irq_cause >> shift) & (DONE_IRQ | ERR_IRQ); - if (!port_cause || !ap || (ap->flags & ATA_FLAG_DISABLED)) - continue; /* - * Each hc within the host has its own hc_irq_cause register. - * We defer reading it until we know we need it, right now: - * - * FIXME later: we don't really need to read this register - * (some logic changes required below if we go that way), - * because it doesn't tell us anything new. But we do need - * to write to it, outside the top of this loop, - * to reset the interrupt triggers for next time. + * Each hc within the host has its own hc_irq_cause register, + * where the interrupting ports bits get ack'd. */ - if (!hc_mmio) { + if (hardport == 0) { /* first port on this hc ? 
*/ + u32 hc_cause = (main_irq_cause >> shift) & HC0_IRQ_PEND; + u32 port_mask, ack_irqs; + /* + * Skip this entire hc if nothing pending for any ports + */ + if (!hc_cause) { + port += MV_PORTS_PER_HC - 1; + continue; + } + /* + * We don't need/want to read the hc_irq_cause register, + * because doing so hurts performance, and + * main_irq_cause already gives us everything we need. + * + * But we do have to *write* to the hc_irq_cause to ack + * the ports that we are handling this time through. + * + * This requires that we create a bitmap for those + * ports which interrupted us, and use that bitmap + * to ack (only) those ports via hc_irq_cause. + */ + ack_irqs = 0; + for (p = 0; p < MV_PORTS_PER_HC; ++p) { + if ((port + p) >= hpriv->n_ports) + break; + port_mask = (DONE_IRQ | ERR_IRQ) << (p * 2); + if (hc_cause & port_mask) + ack_irqs |= (DMA_IRQ | DEV_IRQ) << p; + } hc_mmio = mv_hc_base_from_port(mmio, port); - hc_irq_cause = readl(hc_mmio + HC_IRQ_CAUSE_OFS); - writelfl(~hc_irq_cause, hc_mmio + HC_IRQ_CAUSE_OFS); + writelfl(~ack_irqs, hc_mmio + HC_IRQ_CAUSE_OFS); handled = 1; } + port_cause = (main_irq_cause >> shift) & (DONE_IRQ | ERR_IRQ); + if (!port_cause) + continue; /* * Process completed CRPB response(s) before other events. */ pp = ap->private_data; - if (hc_irq_cause & (DMA_IRQ << hardport)) { + if (port_cause & DONE_IRQ) { if (pp->pp_flags & MV_PP_FLAG_EDMA_EN) mv_process_crpb_entries(ap, pp); } @@ -1868,15 +1881,15 @@ static int mv_host_intr(struct ata_host *host, u32 main_irq_cause) */ if (unlikely(port_cause & ERR_IRQ)) { mv_err_intr(ap); - } else if (hc_irq_cause & (DEV_IRQ << hardport)) { + } else { if (!(pp->pp_flags & MV_PP_FLAG_EDMA_EN)) { struct ata_queued_cmd *qc = mv_get_active_qc(ap); if (qc) { ata_sff_host_intr(ap, qc); continue; } + mv_unexpected_intr(ap); } - mv_unexpected_intr(ap); } } return handled; -- cgit v0.10.2 From a90103298fd5ccd9a9df6d47bde9a3f371707037 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 02:14:02 -0400 Subject: sata_mv new mv_port_intr function Separate out the inner loop body of mv_host_intr() into it's own function called mv_port_intr(). This should help maintainabilty. Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 31e42de..803578e 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -1591,25 +1591,22 @@ static struct ata_queued_cmd *mv_get_active_qc(struct ata_port *ap) return qc; } -static void mv_unexpected_intr(struct ata_port *ap) +static void mv_unexpected_intr(struct ata_port *ap, int edma_was_enabled) { - struct mv_port_priv *pp = ap->private_data; struct ata_eh_info *ehi = &ap->link.eh_info; - char *when = ""; + char *when = "idle"; - /* - * We got a device interrupt from something that - * was supposed to be using EDMA or polling. 
- */ ata_ehi_clear_desc(ehi); - if (pp->pp_flags & MV_PP_FLAG_EDMA_EN) { - when = " while EDMA enabled"; + if (!ap || (ap->flags & ATA_FLAG_DISABLED)) { + when = "disabled"; + } else if (edma_was_enabled) { + when = "EDMA enabled"; } else { struct ata_queued_cmd *qc = ata_qc_from_tag(ap, ap->link.active_tag); if (qc && (qc->tf.flags & ATA_TFLAG_POLLING)) - when = " while polling"; + when = "polling"; } - ata_ehi_push_desc(ehi, "unexpected device interrupt%s", when); + ata_ehi_push_desc(ehi, "unexpected device interrupt while %s", when); ehi->err_mask |= AC_ERR_OTHER; ehi->action |= ATA_EH_RESET; ata_port_freeze(ap); @@ -1807,6 +1804,42 @@ static void mv_process_crpb_entries(struct ata_port *ap, struct mv_port_priv *pp port_mmio + EDMA_RSP_Q_OUT_PTR_OFS); } +static void mv_port_intr(struct ata_port *ap, u32 port_cause) +{ + struct mv_port_priv *pp; + int edma_was_enabled; + + if (!ap || (ap->flags & ATA_FLAG_DISABLED)) { + mv_unexpected_intr(ap, 0); + return; + } + /* + * Grab a snapshot of the EDMA_EN flag setting, + * so that we have a consistent view for this port, + * even if something we call of our routines changes it. + */ + pp = ap->private_data; + edma_was_enabled = (pp->pp_flags & MV_PP_FLAG_EDMA_EN); + /* + * Process completed CRPB response(s) before other events. + */ + if (edma_was_enabled && (port_cause & DONE_IRQ)) { + mv_process_crpb_entries(ap, pp); + } + /* + * Handle chip-reported errors, or continue on to handle PIO. + */ + if (unlikely(port_cause & ERR_IRQ)) { + mv_err_intr(ap); + } else if (!edma_was_enabled) { + struct ata_queued_cmd *qc = mv_get_active_qc(ap); + if (qc) + ata_sff_host_intr(ap, qc); + else + mv_unexpected_intr(ap, edma_was_enabled); + } +} + /** * mv_host_intr - Handle all interrupts on the given host controller * @host: host specific structure @@ -1823,7 +1856,6 @@ static int mv_host_intr(struct ata_host *host, u32 main_irq_cause) for (port = 0; port < hpriv->n_ports; port++) { struct ata_port *ap = host->ports[port]; - struct mv_port_priv *pp; unsigned int p, shift, hardport, port_cause; MV_PORT_TO_SHIFT_AND_HARDPORT(port, shift, hardport); @@ -1865,32 +1897,12 @@ static int mv_host_intr(struct ata_host *host, u32 main_irq_cause) writelfl(~ack_irqs, hc_mmio + HC_IRQ_CAUSE_OFS); handled = 1; } - port_cause = (main_irq_cause >> shift) & (DONE_IRQ | ERR_IRQ); - if (!port_cause) - continue; - /* - * Process completed CRPB response(s) before other events. - */ - pp = ap->private_data; - if (port_cause & DONE_IRQ) { - if (pp->pp_flags & MV_PP_FLAG_EDMA_EN) - mv_process_crpb_entries(ap, pp); - } /* - * Handle chip-reported errors, or continue on to handle PIO. + * Handle interrupts signalled for this port: */ - if (unlikely(port_cause & ERR_IRQ)) { - mv_err_intr(ap); - } else { - if (!(pp->pp_flags & MV_PP_FLAG_EDMA_EN)) { - struct ata_queued_cmd *qc = mv_get_active_qc(ap); - if (qc) { - ata_sff_host_intr(ap, qc); - continue; - } - mv_unexpected_intr(ap); - } - } + port_cause = (main_irq_cause >> shift) & (DONE_IRQ | ERR_IRQ); + if (port_cause) + mv_port_intr(ap, port_cause); } return handled; } -- cgit v0.10.2 From 10acf3b0d3b46c6ef5d6f0722f72ad9b743ea848 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 02:14:53 -0400 Subject: libata: export ata_eh_analyze_ncq_error Export ata_eh_analyze_ncq_error() for subsequent use by sata_mv, as suggested by Tejun. 
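As a rough illustration (not part of this patch), a controller-specific PMP error handler could use the exported helper along these lines. SATA_PMP_MAX_PORTS, ap->pmp_link, ata_eh_analyze_ncq_error() and sata_pmp_error_handler() are existing libata names; link_saw_ncq_dev_err() is a hypothetical driver-specific test used only for the sketch:

	static void example_pmp_error_handler(struct ata_port *ap)
	{
		unsigned int pmp;

		for (pmp = 0; pmp < SATA_PMP_MAX_PORTS; pmp++) {
			struct ata_link *link = &ap->pmp_link[pmp];

			/* hypothetical per-link check for an NCQ device error */
			if (link_saw_ncq_dev_err(link))
				ata_eh_analyze_ncq_error(link);
		}
		sata_pmp_error_handler(ap);
	}

The next patch in this series performs exactly this kind of per-link analysis from sata_mv's mv_pmp_error_handler().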
Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 3bc4885..927b692 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -6292,6 +6292,7 @@ EXPORT_SYMBOL_GPL(ata_eh_freeze_port); EXPORT_SYMBOL_GPL(ata_eh_thaw_port); EXPORT_SYMBOL_GPL(ata_eh_qc_complete); EXPORT_SYMBOL_GPL(ata_eh_qc_retry); +EXPORT_SYMBOL_GPL(ata_eh_analyze_ncq_error); EXPORT_SYMBOL_GPL(ata_do_eh); EXPORT_SYMBOL_GPL(ata_std_error_handler); diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 61dcd00..62e0331 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -1357,7 +1357,7 @@ static void ata_eh_analyze_serror(struct ata_link *link) * LOCKING: * Kernel thread context (may sleep). */ -static void ata_eh_analyze_ncq_error(struct ata_link *link) +void ata_eh_analyze_ncq_error(struct ata_link *link) { struct ata_port *ap = link->ap; struct ata_eh_context *ehc = &link->eh_context; diff --git a/include/linux/libata.h b/include/linux/libata.h index 95e6159b..7e206da 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1039,6 +1039,7 @@ extern void ata_eh_thaw_port(struct ata_port *ap); extern void ata_eh_qc_complete(struct ata_queued_cmd *qc); extern void ata_eh_qc_retry(struct ata_queued_cmd *qc); +extern void ata_eh_analyze_ncq_error(struct ata_link *link); extern void ata_do_eh(struct ata_port *ap, ata_prereset_fn_t prereset, ata_reset_fn_t softreset, ata_reset_fn_t hardreset, -- cgit v0.10.2 From 29d187bb1e30682e228ce461c487d78d945c3e4f Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 02:15:37 -0400 Subject: sata_mv delayed eh handling Introduce a new "delayed error handling" mechanism in sata_mv, to enable us to eventually deal with multiple simultaneous NCQ failures on a single host link when a PM is present. This involves a port flag (MV_PP_FLAG_DELAYED_EH) to prevent new commands being queued, and a pmp bitmap to indicate which pmp links had NCQ errors. The new mv_pmp_error_handler() uses those values to invoke ata_eh_analyze_ncq_error() on each failed link, prior to freezing the port and passing control to sata_pmp_error_handler(). This is based upon a strategy suggested by Tejun. For now, we just implement the delayed mechanism. The next patch in this series will add the multiple-NCQ EH code to take advantage of it. Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 803578e..1991eb2 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -367,6 +367,7 @@ enum { MV_PP_FLAG_EDMA_EN = (1 << 0), /* is EDMA engine enabled? */ MV_PP_FLAG_NCQ_EN = (1 << 1), /* is EDMA set up for NCQ? */ MV_PP_FLAG_FBS_EN = (1 << 2), /* is EDMA set up for FBS? 
*/ + MV_PP_FLAG_DELAYED_EH = (1 << 3), /* delayed dev err handling */ }; #define IS_GEN_I(hpriv) ((hpriv)->hp_flags & MV_HP_GEN_I) @@ -447,6 +448,7 @@ struct mv_port_priv { unsigned int resp_idx; u32 pp_flags; + unsigned int delayed_eh_pmp_map; }; struct mv_port_signal { @@ -542,6 +544,7 @@ static int mv_pmp_hardreset(struct ata_link *link, unsigned int *class, unsigned long deadline); static int mv_softreset(struct ata_link *link, unsigned int *class, unsigned long deadline); +static void mv_pmp_error_handler(struct ata_port *ap); /* .sg_tablesize is (MV_MAX_SG_CT / 2) in the structures below * because we have to allow room for worst case splitting of @@ -589,7 +592,7 @@ static struct ata_port_operations mv6_ops = { .pmp_hardreset = mv_pmp_hardreset, .pmp_softreset = mv_softreset, .softreset = mv_softreset, - .error_handler = sata_pmp_error_handler, + .error_handler = mv_pmp_error_handler, }; static struct ata_port_operations mv_iie_ops = { @@ -1098,6 +1101,12 @@ static int mv_qc_defer(struct ata_queued_cmd *qc) struct mv_port_priv *pp = ap->private_data; /* + * Don't allow new commands if we're in a delayed EH state + * for NCQ and/or FIS-based switching. + */ + if (pp->pp_flags & MV_PP_FLAG_DELAYED_EH) + return ATA_DEFER_PORT; + /* * If the port is completely idle, then allow the new qc. */ if (ap->nr_active_links == 0) @@ -1591,6 +1600,33 @@ static struct ata_queued_cmd *mv_get_active_qc(struct ata_port *ap) return qc; } +static void mv_pmp_error_handler(struct ata_port *ap) +{ + unsigned int pmp, pmp_map; + struct mv_port_priv *pp = ap->private_data; + + if (pp->pp_flags & MV_PP_FLAG_DELAYED_EH) { + /* + * Perform NCQ error analysis on failed PMPs + * before we freeze the port entirely. + * + * The failed PMPs are marked earlier by mv_pmp_eh_prep(). + */ + pmp_map = pp->delayed_eh_pmp_map; + pp->pp_flags &= ~MV_PP_FLAG_DELAYED_EH; + for (pmp = 0; pmp_map != 0; pmp++) { + unsigned int this_pmp = (1 << pmp); + if (pmp_map & this_pmp) { + struct ata_link *link = &ap->pmp_link[pmp]; + pmp_map &= ~this_pmp; + ata_eh_analyze_ncq_error(link); + } + } + ata_port_freeze(ap); + } + sata_pmp_error_handler(ap); +} + static void mv_unexpected_intr(struct ata_port *ap, int edma_was_enabled) { struct ata_eh_info *ehi = &ap->link.eh_info; -- cgit v0.10.2 From 4c299ca3649ccf666819e7d4a27a68c39fa174f1 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 02:16:20 -0400 Subject: sata_mv NCQ-EH for FIS-based switching Convert sata_mv's EH for FIS-based switching (FBS) over to the sequence recommended by Marvell. This enables us to catch/analyze multiple failed links on a port-multiplier when using NCQ. To do this, we clear the ERR_DEV bit in the EDMA Halt-Conditions register, so that the EDMA engine doesn't self-disable on the first NCQ error. Our EH code sets the MV_PP_FLAG_DELAYED_EH flag to prevent new commands being queued while we await completion of all outstanding NCQ commands on all links of the failed PM. The SATA Test Control register tells us which links have failed, so we must only wait for any other active links to finish up before we stop the EDMA and run the .error_handler afterward. The patch also includes skeleton code for handling of non-NCQ FBS operation. This is more for documentation purposes right now, as that mode is not yet enabled in sata_mv. 
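Roughly, the draining decision described above reduces to comparing the failed-link bitmap against the number of links that still have commands outstanding. A sketch using names from this series (port_mmio and ap are the usual sata_mv port handles; this is not a drop-in hunk):

	/* one bit per PM link that reported a device error, taken from
	 * the upper half of the SATA Test Control register */
	pmp_map = readl(port_mmio + SATA_TESTCTL_OFS) >> 16;
	failed_links = hweight16(pmp_map);

	if (ap->nr_active_links <= failed_links) {
		/* all remaining active commands belong to failed links:
		 * drain the response queue, stop EDMA, freeze for EH */
	}

The real implementation below accumulates the bitmap in pp->delayed_eh_pmp_map, so repeated interrupts extend the same delayed-EH window rather than starting a new one.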
Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 1991eb2..b948dc8 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -545,6 +545,8 @@ static int mv_pmp_hardreset(struct ata_link *link, unsigned int *class, static int mv_softreset(struct ata_link *link, unsigned int *class, unsigned long deadline); static void mv_pmp_error_handler(struct ata_port *ap); +static void mv_process_crpb_entries(struct ata_port *ap, + struct mv_port_priv *pp); /* .sg_tablesize is (MV_MAX_SG_CT / 2) in the structures below * because we have to allow room for worst case splitting of @@ -1156,6 +1158,10 @@ static void mv_config_fbs(void __iomem *port_mmio, int want_ncq, int want_fbs) if (want_fbs) { new_fiscfg = old_fiscfg | FISCFG_SINGLE_SYNC; new_ltmode = old_ltmode | LTMODE_BIT8; + if (want_ncq) + new_haltcond &= ~EDMA_ERR_DEV; + else + new_fiscfg |= FISCFG_WAIT_DEV_ERR; } if (new_fiscfg != old_fiscfg) @@ -1627,6 +1633,154 @@ static void mv_pmp_error_handler(struct ata_port *ap) sata_pmp_error_handler(ap); } +static unsigned int mv_get_err_pmp_map(struct ata_port *ap) +{ + void __iomem *port_mmio = mv_ap_base(ap); + + return readl(port_mmio + SATA_TESTCTL_OFS) >> 16; +} + +static int mv_count_pmp_links(unsigned int pmp_map) +{ + unsigned int link_count = 0; + + while (pmp_map) { + link_count += (pmp_map & 1); + pmp_map >>= 1; + } + return link_count; +} + +static void mv_pmp_eh_prep(struct ata_port *ap, unsigned int pmp_map) +{ + struct ata_eh_info *ehi; + unsigned int pmp; + + /* + * Initialize EH info for PMPs which saw device errors + */ + ehi = &ap->link.eh_info; + for (pmp = 0; pmp_map != 0; pmp++) { + unsigned int this_pmp = (1 << pmp); + if (pmp_map & this_pmp) { + struct ata_link *link = &ap->pmp_link[pmp]; + + pmp_map &= ~this_pmp; + ehi = &link->eh_info; + ata_ehi_clear_desc(ehi); + ata_ehi_push_desc(ehi, "dev err"); + ehi->err_mask |= AC_ERR_DEV; + ehi->action |= ATA_EH_RESET; + ata_link_abort(link); + } + } +} + +static int mv_handle_fbs_ncq_dev_err(struct ata_port *ap) +{ + struct mv_port_priv *pp = ap->private_data; + int failed_links; + unsigned int old_map, new_map; + + /* + * Device error during FBS+NCQ operation: + * + * Set a port flag to prevent further I/O being enqueued. + * Leave the EDMA running to drain outstanding commands from this port. + * Perform the post-mortem/EH only when all responses are complete. + * Follow recovery sequence from 6042/7042 datasheet (7.3.15.4.2.2). + */ + if (!(pp->pp_flags & MV_PP_FLAG_DELAYED_EH)) { + pp->pp_flags |= MV_PP_FLAG_DELAYED_EH; + pp->delayed_eh_pmp_map = 0; + } + old_map = pp->delayed_eh_pmp_map; + new_map = old_map | mv_get_err_pmp_map(ap); + + if (old_map != new_map) { + pp->delayed_eh_pmp_map = new_map; + mv_pmp_eh_prep(ap, new_map & ~old_map); + } + failed_links = mv_count_pmp_links(new_map); + + ata_port_printk(ap, KERN_INFO, "%s: pmp_map=%04x qc_map=%04x " + "failed_links=%d nr_active_links=%d\n", + __func__, pp->delayed_eh_pmp_map, + ap->qc_active, failed_links, + ap->nr_active_links); + + if (ap->nr_active_links <= failed_links) { + mv_process_crpb_entries(ap, pp); + mv_stop_edma(ap); + mv_eh_freeze(ap); + ata_port_printk(ap, KERN_INFO, "%s: done\n", __func__); + return 1; /* handled */ + } + ata_port_printk(ap, KERN_INFO, "%s: waiting\n", __func__); + return 1; /* handled */ +} + +static int mv_handle_fbs_non_ncq_dev_err(struct ata_port *ap) +{ + /* + * Possible future enhancement: + * + * FBS+non-NCQ operation is not yet implemented. 
+ * See related notes in mv_edma_cfg(). + * + * Device error during FBS+non-NCQ operation: + * + * We need to snapshot the shadow registers for each failed command. + * Follow recovery sequence from 6042/7042 datasheet (7.3.15.4.2.3). + */ + return 0; /* not handled */ +} + +static int mv_handle_dev_err(struct ata_port *ap, u32 edma_err_cause) +{ + struct mv_port_priv *pp = ap->private_data; + + if (!(pp->pp_flags & MV_PP_FLAG_EDMA_EN)) + return 0; /* EDMA was not active: not handled */ + if (!(pp->pp_flags & MV_PP_FLAG_FBS_EN)) + return 0; /* FBS was not active: not handled */ + + if (!(edma_err_cause & EDMA_ERR_DEV)) + return 0; /* non DEV error: not handled */ + edma_err_cause &= ~EDMA_ERR_IRQ_TRANSIENT; + if (edma_err_cause & ~(EDMA_ERR_DEV | EDMA_ERR_SELF_DIS)) + return 0; /* other problems: not handled */ + + if (pp->pp_flags & MV_PP_FLAG_NCQ_EN) { + /* + * EDMA should NOT have self-disabled for this case. + * If it did, then something is wrong elsewhere, + * and we cannot handle it here. + */ + if (edma_err_cause & EDMA_ERR_SELF_DIS) { + ata_port_printk(ap, KERN_WARNING, + "%s: err_cause=0x%x pp_flags=0x%x\n", + __func__, edma_err_cause, pp->pp_flags); + return 0; /* not handled */ + } + return mv_handle_fbs_ncq_dev_err(ap); + } else { + /* + * EDMA should have self-disabled for this case. + * If it did not, then something is wrong elsewhere, + * and we cannot handle it here. + */ + if (!(edma_err_cause & EDMA_ERR_SELF_DIS)) { + ata_port_printk(ap, KERN_WARNING, + "%s: err_cause=0x%x pp_flags=0x%x\n", + __func__, edma_err_cause, pp->pp_flags); + return 0; /* not handled */ + } + return mv_handle_fbs_non_ncq_dev_err(ap); + } + return 0; /* not handled */ +} + static void mv_unexpected_intr(struct ata_port *ap, int edma_was_enabled) { struct ata_eh_info *ehi = &ap->link.eh_info; @@ -1683,6 +1837,15 @@ static void mv_err_intr(struct ata_port *ap) ata_port_printk(ap, KERN_INFO, "%s: err_cause=%08x pp_flags=0x%x\n", __func__, edma_err_cause, pp->pp_flags); + if (edma_err_cause & EDMA_ERR_DEV) { + /* + * Device errors during FIS-based switching operation + * require special handling. + */ + if (mv_handle_dev_err(ap, edma_err_cause)) + return; + } + qc = mv_get_active_qc(ap); ata_ehi_clear_desc(ehi); ata_ehi_push_desc(ehi, "edma_err_cause=%08x pp_flags=%08x", @@ -1861,6 +2024,8 @@ static void mv_port_intr(struct ata_port *ap, u32 port_cause) */ if (edma_was_enabled && (port_cause & DONE_IRQ)) { mv_process_crpb_entries(ap, pp); + if (pp->pp_flags & MV_PP_FLAG_DELAYED_EH) + mv_handle_fbs_ncq_dev_err(ap); } /* * Handle chip-reported errors, or continue on to handle PIO. -- cgit v0.10.2 From c46938ccfe35a58a0873715ee4c26fc9eb8d87b3 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 2 May 2008 14:02:28 -0400 Subject: sata_mv use hweight16() for bit counting (V2) Some tidying as suggested by Grant Grundler. Nuke local bit-counting function from sata_mv in favour of using hweight16(). Also add a short explanation for the 15msec timeout used when waiting for empty/idle. Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index b948dc8..bb73b22 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include @@ -903,6 +904,10 @@ static void mv_wait_for_edma_empty_idle(struct ata_port *ap) /* * Wait for the EDMA engine to finish transactions in progress. 
+ * No idea what a good "timeout" value might be, but measurements + * indicate that it often requires hundreds of microseconds + * with two drives in-use. So we use the 15msec value above + * as a rough guess at what even more drives might require. */ for (i = 0; i < timeout; ++i) { u32 edma_stat = readl(port_mmio + EDMA_STATUS_OFS); @@ -1640,17 +1645,6 @@ static unsigned int mv_get_err_pmp_map(struct ata_port *ap) return readl(port_mmio + SATA_TESTCTL_OFS) >> 16; } -static int mv_count_pmp_links(unsigned int pmp_map) -{ - unsigned int link_count = 0; - - while (pmp_map) { - link_count += (pmp_map & 1); - pmp_map >>= 1; - } - return link_count; -} - static void mv_pmp_eh_prep(struct ata_port *ap, unsigned int pmp_map) { struct ata_eh_info *ehi; @@ -1701,7 +1695,7 @@ static int mv_handle_fbs_ncq_dev_err(struct ata_port *ap) pp->delayed_eh_pmp_map = new_map; mv_pmp_eh_prep(ap, new_map & ~old_map); } - failed_links = mv_count_pmp_links(new_map); + failed_links = hweight16(new_map); ata_port_printk(ap, KERN_INFO, "%s: pmp_map=%04x qc_map=%04x " "failed_links=%d nr_active_links=%d\n", -- cgit v0.10.2 From 36f674d9a65264d3826ca7300bed441e22a624b2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 30 Apr 2008 16:35:08 +0900 Subject: sata_inic162x: misc clean ups * use larger indents for structure member definitions * kill unused variable @addr in inic_scr_write() * kill unnecessary flushes in inic_freeze/thaw() * kill buggy explicit kfree() on devres managed port private data This is in preparation of further inic162x updates. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c index d27bb9a..1f5d17e 100644 --- a/drivers/ata/sata_inic162x.c +++ b/drivers/ata/sata_inic162x.c @@ -99,13 +99,13 @@ enum { }; struct inic_host_priv { - u16 cached_hctl; + u16 cached_hctl; }; struct inic_port_priv { - u8 dfl_prdctl; - u8 cached_prdctl; - u8 cached_pirq_mask; + u8 dfl_prdctl; + u8 cached_prdctl; + u8 cached_pirq_mask; }; static struct scsi_host_template inic_sht = { @@ -185,12 +185,10 @@ static int inic_scr_read(struct ata_port *ap, unsigned sc_reg, u32 *val) static int inic_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val) { void __iomem *scr_addr = ap->ioaddr.scr_addr; - void __iomem *addr; if (unlikely(sc_reg >= ARRAY_SIZE(scr_map))) return -EINVAL; - addr = scr_addr + scr_map[sc_reg] * 4; writel(val, scr_addr + scr_map[sc_reg] * 4); return 0; } @@ -367,8 +365,6 @@ static void inic_freeze(struct ata_port *ap) ap->ops->sff_check_status(ap); writeb(0xff, port_base + PORT_IRQ_STAT); - - readb(port_base + PORT_IRQ_STAT); /* flush */ } static void inic_thaw(struct ata_port *ap) @@ -379,8 +375,6 @@ static void inic_thaw(struct ata_port *ap) writeb(0xff, port_base + PORT_IRQ_STAT); __inic_set_pirq_mask(ap, PIRQ_MASK_OTHER); - - readb(port_base + PORT_IRQ_STAT); /* flush */ } /* @@ -506,10 +500,8 @@ static int inic_port_start(struct ata_port *ap) /* Alloc resources */ rc = ata_port_start(ap); - if (rc) { - kfree(pp); + if (rc) return rc; - } init_port(ap); -- cgit v0.10.2 From b0dd9b8ef985291a8b40118c5f33b7935e273dcb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 30 Apr 2008 16:35:09 +0900 Subject: sata_inic162x: add / update constants * add a bunch of constants, most are from the datasheet, a few undocumented ones are from initio's modified driver * HCTL_PWRDWN is bit 12 not 13 This is in preparation of further inic162x updates. 
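The kfree() point deserves a note: the port private data is allocated with devm_kzalloc(), so devres frees it automatically when the device is torn down, and an explicit kfree() on an error path would free it a second time. A minimal sketch of the corrected pattern (mirroring the hunk below):

	pp = devm_kzalloc(ap->host->dev, sizeof(*pp), GFP_KERNEL);
	if (!pp)
		return -ENOMEM;
	ap->private_data = pp;

	rc = ata_port_start(ap);
	if (rc)
		return rc;	/* no kfree(pp) here: devres owns the allocation */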
Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c index 1f5d17e..1b10455 100644 --- a/drivers/ata/sata_inic162x.c +++ b/drivers/ata/sata_inic162x.c @@ -35,6 +35,7 @@ enum { NR_PORTS = 2, + HOST_ACTRL = 0x08, HOST_CTL = 0x7c, HOST_STAT = 0x7e, HOST_IRQ_STAT = 0xbc, @@ -43,22 +44,37 @@ enum { PORT_SIZE = 0x40, /* registers for ATA TF operation */ - PORT_TF = 0x00, - PORT_ALT_STAT = 0x08, + PORT_TF_DATA = 0x00, + PORT_TF_FEATURE = 0x01, + PORT_TF_NSECT = 0x02, + PORT_TF_LBAL = 0x03, + PORT_TF_LBAM = 0x04, + PORT_TF_LBAH = 0x05, + PORT_TF_DEVICE = 0x06, + PORT_TF_COMMAND = 0x07, + PORT_TF_ALT_STAT = 0x08, PORT_IRQ_STAT = 0x09, PORT_IRQ_MASK = 0x0a, PORT_PRD_CTL = 0x0b, PORT_PRD_ADDR = 0x0c, PORT_PRD_XFERLEN = 0x10, + PORT_CPB_CPBLAR = 0x18, + PORT_CPB_PTQFIFO = 0x1c, /* IDMA register */ PORT_IDMA_CTL = 0x14, + PORT_IDMA_STAT = 0x16, + + PORT_RPQ_FIFO = 0x1e, + PORT_RPQ_CNT = 0x1f, PORT_SCR = 0x20, /* HOST_CTL bits */ HCTL_IRQOFF = (1 << 8), /* global IRQ off */ - HCTL_PWRDWN = (1 << 13), /* power down PHYs */ + HCTL_FTHD0 = (1 << 10), /* fifo threshold 0 */ + HCTL_FTHD1 = (1 << 11), /* fifo threshold 1*/ + HCTL_PWRDWN = (1 << 12), /* power down PHYs */ HCTL_SOFTRST = (1 << 13), /* global reset (no phy reset) */ HCTL_RPGSEL = (1 << 15), /* register page select */ @@ -96,6 +112,43 @@ enum { IDMA_CTL_RST_IDMA = (1 << 5), /* reset IDMA machinary */ IDMA_CTL_GO = (1 << 7), /* IDMA mode go */ IDMA_CTL_ATA_NIEN = (1 << 8), /* ATA IRQ disable */ + + /* PORT_IDMA_STAT bits */ + IDMA_STAT_PERR = (1 << 0), /* PCI ERROR MODE */ + IDMA_STAT_CPBERR = (1 << 1), /* ADMA CPB error */ + IDMA_STAT_LGCY = (1 << 3), /* ADMA legacy */ + IDMA_STAT_UIRQ = (1 << 4), /* ADMA unsolicited irq */ + IDMA_STAT_STPD = (1 << 5), /* ADMA stopped */ + IDMA_STAT_PSD = (1 << 6), /* ADMA pause */ + IDMA_STAT_DONE = (1 << 7), /* ADMA done */ + + IDMA_STAT_ERR = IDMA_STAT_PERR | IDMA_STAT_CPBERR, + + /* CPB Control Flags*/ + CPB_CTL_VALID = (1 << 0), /* CPB valid */ + CPB_CTL_QUEUED = (1 << 1), /* queued command */ + CPB_CTL_DATA = (1 << 2), /* data, rsvd in datasheet */ + CPB_CTL_IEN = (1 << 3), /* PCI interrupt enable */ + CPB_CTL_DEVDIR = (1 << 4), /* device direction control */ + + /* CPB Response Flags */ + CPB_RESP_DONE = (1 << 0), /* ATA command complete */ + CPB_RESP_REL = (1 << 1), /* ATA release */ + CPB_RESP_IGNORED = (1 << 2), /* CPB ignored */ + CPB_RESP_ATA_ERR = (1 << 3), /* ATA command error */ + CPB_RESP_SPURIOUS = (1 << 4), /* ATA spurious interrupt error */ + CPB_RESP_UNDERFLOW = (1 << 5), /* APRD deficiency length error */ + CPB_RESP_OVERFLOW = (1 << 6), /* APRD exccess length error */ + CPB_RESP_CPB_ERR = (1 << 7), /* CPB error flag */ + + /* PRD Control Flags */ + PRD_DRAIN = (1 << 1), /* ignore data excess */ + PRD_CDB = (1 << 2), /* atapi packet command pointer */ + PRD_DIRECT_INTR = (1 << 3), /* direct interrupt */ + PRD_DMA = (1 << 4), /* data transfer method */ + PRD_WRITE = (1 << 5), /* data dir, rsvd in datasheet */ + PRD_IOM = (1 << 6), /* io/memory transfer */ + PRD_END = (1 << 7), /* APRD chain end */ }; struct inic_host_priv { -- cgit v0.10.2 From 364fac0e56b9bd379330ef9e39d3761f0b491e2c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 1 May 2008 23:55:58 +0900 Subject: sata_inic162x: update TF read handling inic162x can't reliably read back TF or at least we don't know how to do it yet. The only values which seem reliable are status and error. This patch updates access to TF. 
* implement inic_tf_read() which reads the TF area in mmio area * implement custom inic_qc_fill_rtf() which only returns true if status indicates device error. it'll be returning bogus addresses for device errors but it'll be able to report why it failed at least. * implement custom inic_check_ready() and use ata_wait_after_reset() instead of the SFF version. * use inic_tf_read() for classification. This is not perfect but it fixes hotplug detection failure and at least makes the driver report 0's instead of random garbages while reporting valid status and error for device errors. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c index 1b10455..97267ab 100644 --- a/drivers/ata/sata_inic162x.c +++ b/drivers/ata/sata_inic162x.c @@ -410,6 +410,41 @@ static unsigned int inic_qc_issue(struct ata_queued_cmd *qc) return ata_sff_qc_issue(qc); } +static void inic_tf_read(struct ata_port *ap, struct ata_taskfile *tf) +{ + void __iomem *port_base = inic_port_base(ap); + + tf->feature = readb(port_base + PORT_TF_FEATURE); + tf->nsect = readb(port_base + PORT_TF_NSECT); + tf->lbal = readb(port_base + PORT_TF_LBAL); + tf->lbam = readb(port_base + PORT_TF_LBAM); + tf->lbah = readb(port_base + PORT_TF_LBAH); + tf->device = readb(port_base + PORT_TF_DEVICE); + tf->command = readb(port_base + PORT_TF_COMMAND); +} + +static bool inic_qc_fill_rtf(struct ata_queued_cmd *qc) +{ + struct ata_taskfile *rtf = &qc->result_tf; + struct ata_taskfile tf; + + /* FIXME: Except for status and error, result TF access + * doesn't work. I tried reading from BAR0/2, CPB and BAR5. + * None works regardless of which command interface is used. + * For now return true iff status indicates device error. + * This means that we're reporting bogus sector for RW + * failures. Eeekk.... + */ + inic_tf_read(qc->ap, &tf); + + if (!(tf.command & ATA_ERR)) + return false; + + rtf->command = tf.command; + rtf->feature = tf.feature; + return true; +} + static void inic_freeze(struct ata_port *ap) { void __iomem *port_base = inic_port_base(ap); @@ -430,6 +465,13 @@ static void inic_thaw(struct ata_port *ap) __inic_set_pirq_mask(ap, PIRQ_MASK_OTHER); } +static int inic_check_ready(struct ata_link *link) +{ + void __iomem *port_base = inic_port_base(link->ap); + + return ata_check_ready(readb(port_base + PORT_TF_COMMAND)); +} + /* * SRST and SControl hardreset don't give valid signature on this * controller. Only controller specific hardreset mechanism works. 
@@ -465,7 +507,7 @@ static int inic_hardreset(struct ata_link *link, unsigned int *class, struct ata_taskfile tf; /* wait for link to become ready */ - rc = ata_sff_wait_after_reset(link, 1, deadline); + rc = ata_wait_after_reset(link, deadline, inic_check_ready); /* link occupied, -ENODEV too is an error */ if (rc) { ata_link_printk(link, KERN_WARNING, "device not ready " @@ -473,7 +515,7 @@ static int inic_hardreset(struct ata_link *link, unsigned int *class, return rc; } - ata_sff_tf_read(ap, &tf); + inic_tf_read(ap, &tf); *class = ata_dev_classify(&tf); } @@ -569,6 +611,7 @@ static struct ata_port_operations inic_port_ops = { .bmdma_stop = inic_bmdma_stop, .bmdma_status = inic_bmdma_status, .qc_issue = inic_qc_issue, + .qc_fill_rtf = inic_qc_fill_rtf, .freeze = inic_freeze, .thaw = inic_thaw, -- cgit v0.10.2 From 3ad400a92e9c7d2f7caa6c6f811dad9b7d3f333c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 30 Apr 2008 16:35:11 +0900 Subject: sata_inic162x: use IDMA for ATA_PROT_DMA The modified driver on initio site has enough clue on how to use IDMA. Use IDMA for ATA_PROT_DMA. * LBA48 now works as long as it uses DMA (LBA48 devices still aren't allowed as it can destroy data if PIO is used for any reason). * No need to mask IRQs for read DMAs as IDMA_DONE is properly raised after transfer to memory is actually completed. There will be some spurious interrupts but host_intr will handle it correctly and manipulating port IRQ mask interacts badly with the other port for some reason, so command type dependent port IRQ masking is not used anymore. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c index 97267ab..db57f34 100644 --- a/drivers/ata/sata_inic162x.c +++ b/drivers/ata/sata_inic162x.c @@ -35,6 +35,10 @@ enum { NR_PORTS = 2, + IDMA_CPB_TBL_SIZE = 4 * 32, + + INIC_DMA_BOUNDARY = 0xffffff, + HOST_ACTRL = 0x08, HOST_CTL = 0x7c, HOST_STAT = 0x7e, @@ -151,11 +155,57 @@ enum { PRD_END = (1 << 7), /* APRD chain end */ }; +/* Comman Parameter Block */ +struct inic_cpb { + u8 resp_flags; /* Response Flags */ + u8 error; /* ATA Error */ + u8 status; /* ATA Status */ + u8 ctl_flags; /* Control Flags */ + __le32 len; /* Total Transfer Length */ + __le32 prd; /* First PRD pointer */ + u8 rsvd[4]; + /* 16 bytes */ + u8 feature; /* ATA Feature */ + u8 hob_feature; /* ATA Ex. Feature */ + u8 device; /* ATA Device/Head */ + u8 mirctl; /* Mirror Control */ + u8 nsect; /* ATA Sector Count */ + u8 hob_nsect; /* ATA Ex. Sector Count */ + u8 lbal; /* ATA Sector Number */ + u8 hob_lbal; /* ATA Ex. Sector Number */ + u8 lbam; /* ATA Cylinder Low */ + u8 hob_lbam; /* ATA Ex. Cylinder Low */ + u8 lbah; /* ATA Cylinder High */ + u8 hob_lbah; /* ATA Ex. 
Cylinder High */ + u8 command; /* ATA Command */ + u8 ctl; /* ATA Control */ + u8 slave_error; /* Slave ATA Error */ + u8 slave_status; /* Slave ATA Status */ + /* 32 bytes */ +} __packed; + +/* Physical Region Descriptor */ +struct inic_prd { + __le32 mad; /* Physical Memory Address */ + __le16 len; /* Transfer Length */ + u8 rsvd; + u8 flags; /* Control Flags */ +} __packed; + +struct inic_pkt { + struct inic_cpb cpb; + struct inic_prd prd[LIBATA_MAX_PRD]; +} __packed; + struct inic_host_priv { u16 cached_hctl; }; struct inic_port_priv { + struct inic_pkt *pkt; + dma_addr_t pkt_dma; + u32 *cpb_tbl; + dma_addr_t cpb_tbl_dma; u8 dfl_prdctl; u8 cached_prdctl; u8 cached_pirq_mask; @@ -163,6 +213,7 @@ struct inic_port_priv { static struct scsi_host_template inic_sht = { ATA_BMDMA_SHT(DRV_NAME), + .dma_boundary = INIC_DMA_BOUNDARY, }; static const int scr_map[] = { @@ -303,42 +354,112 @@ static u8 inic_bmdma_status(struct ata_port *ap) return ATA_DMA_INTR; } -static void inic_host_intr(struct ata_port *ap) +static void inic_stop_idma(struct ata_port *ap) { void __iomem *port_base = inic_port_base(ap); + + readb(port_base + PORT_RPQ_FIFO); + readb(port_base + PORT_RPQ_CNT); + writew(0, port_base + PORT_IDMA_CTL); +} + +static void inic_host_err_intr(struct ata_port *ap, u8 irq_stat, u16 idma_stat) +{ struct ata_eh_info *ehi = &ap->link.eh_info; + struct inic_port_priv *pp = ap->private_data; + struct inic_cpb *cpb = &pp->pkt->cpb; + bool freeze = false; + + ata_ehi_clear_desc(ehi); + ata_ehi_push_desc(ehi, "irq_stat=0x%x idma_stat=0x%x", + irq_stat, idma_stat); + + inic_stop_idma(ap); + + if (irq_stat & (PIRQ_OFFLINE | PIRQ_ONLINE)) { + ata_ehi_push_desc(ehi, "hotplug"); + ata_ehi_hotplugged(ehi); + freeze = true; + } + + if (idma_stat & IDMA_STAT_PERR) { + ata_ehi_push_desc(ehi, "PCI error"); + freeze = true; + } + + if (idma_stat & IDMA_STAT_CPBERR) { + ata_ehi_push_desc(ehi, "CPB error"); + + if (cpb->resp_flags & CPB_RESP_IGNORED) { + __ata_ehi_push_desc(ehi, " ignored"); + ehi->err_mask |= AC_ERR_INVALID; + freeze = true; + } + + if (cpb->resp_flags & CPB_RESP_ATA_ERR) + ehi->err_mask |= AC_ERR_DEV; + + if (cpb->resp_flags & CPB_RESP_SPURIOUS) { + __ata_ehi_push_desc(ehi, " spurious-intr"); + ehi->err_mask |= AC_ERR_HSM; + freeze = true; + } + + if (cpb->resp_flags & + (CPB_RESP_UNDERFLOW | CPB_RESP_OVERFLOW)) { + __ata_ehi_push_desc(ehi, " data-over/underflow"); + ehi->err_mask |= AC_ERR_HSM; + freeze = true; + } + } + + if (freeze) + ata_port_freeze(ap); + else + ata_port_abort(ap); +} + +static void inic_host_intr(struct ata_port *ap) +{ + void __iomem *port_base = inic_port_base(ap); + struct ata_queued_cmd *qc = ata_qc_from_tag(ap, ap->link.active_tag); u8 irq_stat; + u16 idma_stat; - /* fetch and clear irq */ + /* read and clear IRQ status */ irq_stat = readb(port_base + PORT_IRQ_STAT); writeb(irq_stat, port_base + PORT_IRQ_STAT); + idma_stat = readw(port_base + PORT_IDMA_STAT); + + if (unlikely((irq_stat & PIRQ_ERR) || (idma_stat & IDMA_STAT_ERR))) + inic_host_err_intr(ap, irq_stat, idma_stat); + + if (unlikely(!qc || (qc->tf.flags & ATA_TFLAG_POLLING))) { + ap->ops->sff_check_status(ap); /* clear ATA interrupt */ + goto spurious; + } + + if (qc->tf.protocol == ATA_PROT_DMA) { + if (likely(idma_stat & IDMA_STAT_DONE)) { + inic_stop_idma(ap); - if (likely(!(irq_stat & PIRQ_ERR))) { - struct ata_queued_cmd *qc = - ata_qc_from_tag(ap, ap->link.active_tag); + /* Depending on circumstances, device error + * isn't reported by IDMA, check it explicitly. 
+ */ + if (unlikely(readb(port_base + PORT_TF_COMMAND) & + (ATA_DF | ATA_ERR))) + qc->err_mask |= AC_ERR_DEV; - if (unlikely(!qc || (qc->tf.flags & ATA_TFLAG_POLLING))) { - ap->ops->sff_check_status(ap); /* clear ATA interrupt */ + ata_qc_complete(qc); return; } - + } else { if (likely(ata_sff_host_intr(ap, qc))) return; - - ap->ops->sff_check_status(ap); /* clear ATA interrupt */ - ata_port_printk(ap, KERN_WARNING, "unhandled " - "interrupt, irq_stat=%x\n", irq_stat); - return; } - /* error */ - ata_ehi_push_desc(ehi, "irq_stat=0x%x", irq_stat); - - if (irq_stat & (PIRQ_OFFLINE | PIRQ_ONLINE)) { - ata_ehi_hotplugged(ehi); - ata_port_freeze(ap); - } else - ata_port_abort(ap); + spurious: + ap->ops->sff_check_status(ap); /* clear ATA interrupt */ } static irqreturn_t inic_interrupt(int irq, void *dev_instance) @@ -378,22 +499,83 @@ static irqreturn_t inic_interrupt(int irq, void *dev_instance) return IRQ_RETVAL(handled); } +static void inic_fill_sg(struct inic_prd *prd, struct ata_queued_cmd *qc) +{ + struct scatterlist *sg; + unsigned int si; + u8 flags = PRD_DMA; + + if (qc->tf.flags & ATA_TFLAG_WRITE) + flags |= PRD_WRITE; + + for_each_sg(qc->sg, sg, qc->n_elem, si) { + prd->mad = cpu_to_le32(sg_dma_address(sg)); + prd->len = cpu_to_le16(sg_dma_len(sg)); + prd->flags = flags; + prd++; + } + + WARN_ON(!si); + prd[-1].flags |= PRD_END; +} + +static void inic_qc_prep(struct ata_queued_cmd *qc) +{ + struct inic_port_priv *pp = qc->ap->private_data; + struct inic_pkt *pkt = pp->pkt; + struct inic_cpb *cpb = &pkt->cpb; + struct inic_prd *prd = pkt->prd; + + VPRINTK("ENTER\n"); + + if (qc->tf.protocol != ATA_PROT_DMA) + return; + + /* prepare packet, based on initio driver */ + memset(pkt, 0, sizeof(struct inic_pkt)); + + cpb->ctl_flags = CPB_CTL_VALID | CPB_CTL_IEN | CPB_CTL_DATA; + + cpb->len = cpu_to_le32(qc->nbytes); + cpb->prd = cpu_to_le32(pp->pkt_dma + offsetof(struct inic_pkt, prd)); + + cpb->device = qc->tf.device; + cpb->feature = qc->tf.feature; + cpb->nsect = qc->tf.nsect; + cpb->lbal = qc->tf.lbal; + cpb->lbam = qc->tf.lbam; + cpb->lbah = qc->tf.lbah; + + if (qc->tf.flags & ATA_TFLAG_LBA48) { + cpb->hob_feature = qc->tf.hob_feature; + cpb->hob_nsect = qc->tf.hob_nsect; + cpb->hob_lbal = qc->tf.hob_lbal; + cpb->hob_lbam = qc->tf.hob_lbam; + cpb->hob_lbah = qc->tf.hob_lbah; + } + + cpb->command = qc->tf.command; + /* don't load ctl - dunno why. it's like that in the initio driver */ + + /* setup sg table */ + inic_fill_sg(prd, qc); + + pp->cpb_tbl[0] = pp->pkt_dma; +} + static unsigned int inic_qc_issue(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; + void __iomem *port_base = inic_port_base(ap); - /* ATA IRQ doesn't wait for DMA transfer completion and vice - * versa. Mask IRQ selectively to detect command completion. - * Without it, ATA DMA read command can cause data corruption. - * - * Something similar might be needed for ATAPI writes. I - * tried a lot of combinations but couldn't find the solution. - */ - if (qc->tf.protocol == ATA_PROT_DMA && - !(qc->tf.flags & ATA_TFLAG_WRITE)) - inic_set_pirq_mask(ap, PIRQ_MASK_DMA_READ); - else - inic_set_pirq_mask(ap, PIRQ_MASK_OTHER); + if (qc->tf.protocol == ATA_PROT_DMA) { + /* fire up the ADMA engine */ + writew(HCTL_FTHD0, port_base + HOST_CTL); + writew(IDMA_CTL_GO, port_base + PORT_IDMA_CTL); + writeb(0, port_base + PORT_CPB_PTQFIFO); + + return 0; + } /* Issuing a command to yet uninitialized port locks up the * controller. 
Most of the time, this happens for the first @@ -564,9 +746,15 @@ static void inic_dev_config(struct ata_device *dev) static void init_port(struct ata_port *ap) { void __iomem *port_base = inic_port_base(ap); + struct inic_port_priv *pp = ap->private_data; - /* Setup PRD address */ + /* clear packet and CPB table */ + memset(pp->pkt, 0, sizeof(struct inic_pkt)); + memset(pp->cpb_tbl, 0, IDMA_CPB_TBL_SIZE); + + /* setup PRD and CPB lookup table addresses */ writel(ap->prd_dma, port_base + PORT_PRD_ADDR); + writel(pp->cpb_tbl_dma, port_base + PORT_CPB_CPBLAR); } static int inic_port_resume(struct ata_port *ap) @@ -578,12 +766,13 @@ static int inic_port_resume(struct ata_port *ap) static int inic_port_start(struct ata_port *ap) { void __iomem *port_base = inic_port_base(ap); + struct device *dev = ap->host->dev; struct inic_port_priv *pp; u8 tmp; int rc; /* alloc and initialize private data */ - pp = devm_kzalloc(ap->host->dev, sizeof(*pp), GFP_KERNEL); + pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL); if (!pp) return -ENOMEM; ap->private_data = pp; @@ -598,6 +787,16 @@ static int inic_port_start(struct ata_port *ap) if (rc) return rc; + pp->pkt = dmam_alloc_coherent(dev, sizeof(struct inic_pkt), + &pp->pkt_dma, GFP_KERNEL); + if (!pp->pkt) + return -ENOMEM; + + pp->cpb_tbl = dmam_alloc_coherent(dev, IDMA_CPB_TBL_SIZE, + &pp->cpb_tbl_dma, GFP_KERNEL); + if (!pp->cpb_tbl) + return -ENOMEM; + init_port(ap); return 0; @@ -610,6 +809,7 @@ static struct ata_port_operations inic_port_ops = { .bmdma_start = inic_bmdma_start, .bmdma_stop = inic_bmdma_stop, .bmdma_status = inic_bmdma_status, + .qc_prep = inic_qc_prep, .qc_issue = inic_qc_issue, .qc_fill_rtf = inic_qc_fill_rtf, -- cgit v0.10.2 From ab5b0235c4e819c9bc45fa62c99f9fe49e73e701 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 30 Apr 2008 16:35:12 +0900 Subject: sata_inic162x: kill now unused bmdma related stuff sata_inic162x doesn't use BMDMA anymore. Kill bmdma related stuff. * prdctl manipulation * port IRQ mask manipulation * inherit ATA_BASE_SHT instead of ATA_BMDMA_SHT * BMDMA methods Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c index db57f34..3ca0ee9 100644 --- a/drivers/ata/sata_inic162x.c +++ b/drivers/ata/sata_inic162x.c @@ -101,9 +101,7 @@ enum { PIRQ_PENDING = (1 << 7), /* port IRQ pending (STAT only) */ PIRQ_ERR = PIRQ_OFFLINE | PIRQ_ONLINE | PIRQ_FATAL, - - PIRQ_MASK_DMA_READ = PIRQ_REPLY | PIRQ_ATA, - PIRQ_MASK_OTHER = PIRQ_REPLY | PIRQ_COMPLETE, + PIRQ_MASK_DEFAULT = PIRQ_REPLY, PIRQ_MASK_FREEZE = 0xff, /* PORT_PRD_CTL bits */ @@ -206,13 +204,11 @@ struct inic_port_priv { dma_addr_t pkt_dma; u32 *cpb_tbl; dma_addr_t cpb_tbl_dma; - u8 dfl_prdctl; - u8 cached_prdctl; - u8 cached_pirq_mask; }; static struct scsi_host_template inic_sht = { - ATA_BMDMA_SHT(DRV_NAME), + ATA_BASE_SHT(DRV_NAME), + .sg_tablesize = LIBATA_MAX_PRD, /* maybe it can be larger? 
*/ .dma_boundary = INIC_DMA_BOUNDARY, }; @@ -227,23 +223,6 @@ static void __iomem *inic_port_base(struct ata_port *ap) return ap->host->iomap[MMIO_BAR] + ap->port_no * PORT_SIZE; } -static void __inic_set_pirq_mask(struct ata_port *ap, u8 mask) -{ - void __iomem *port_base = inic_port_base(ap); - struct inic_port_priv *pp = ap->private_data; - - writeb(mask, port_base + PORT_IRQ_MASK); - pp->cached_pirq_mask = mask; -} - -static void inic_set_pirq_mask(struct ata_port *ap, u8 mask) -{ - struct inic_port_priv *pp = ap->private_data; - - if (pp->cached_pirq_mask != mask) - __inic_set_pirq_mask(ap, mask); -} - static void inic_reset_port(void __iomem *port_base) { void __iomem *idma_ctl = port_base + PORT_IDMA_CTL; @@ -297,63 +276,6 @@ static int inic_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val) return 0; } -/* - * In TF mode, inic162x is very similar to SFF device. TF registers - * function the same. DMA engine behaves similary using the same PRD - * format as BMDMA but different command register, interrupt and event - * notification methods are used. The following inic_bmdma_*() - * functions do the impedance matching. - */ -static void inic_bmdma_setup(struct ata_queued_cmd *qc) -{ - struct ata_port *ap = qc->ap; - struct inic_port_priv *pp = ap->private_data; - void __iomem *port_base = inic_port_base(ap); - int rw = qc->tf.flags & ATA_TFLAG_WRITE; - - /* make sure device sees PRD table writes */ - wmb(); - - /* load transfer length */ - writel(qc->nbytes, port_base + PORT_PRD_XFERLEN); - - /* turn on DMA and specify data direction */ - pp->cached_prdctl = pp->dfl_prdctl | PRD_CTL_DMAEN; - if (!rw) - pp->cached_prdctl |= PRD_CTL_WR; - writeb(pp->cached_prdctl, port_base + PORT_PRD_CTL); - - /* issue r/w command */ - ap->ops->sff_exec_command(ap, &qc->tf); -} - -static void inic_bmdma_start(struct ata_queued_cmd *qc) -{ - struct ata_port *ap = qc->ap; - struct inic_port_priv *pp = ap->private_data; - void __iomem *port_base = inic_port_base(ap); - - /* start host DMA transaction */ - pp->cached_prdctl |= PRD_CTL_START; - writeb(pp->cached_prdctl, port_base + PORT_PRD_CTL); -} - -static void inic_bmdma_stop(struct ata_queued_cmd *qc) -{ - struct ata_port *ap = qc->ap; - struct inic_port_priv *pp = ap->private_data; - void __iomem *port_base = inic_port_base(ap); - - /* stop DMA engine */ - writeb(pp->dfl_prdctl, port_base + PORT_PRD_CTL); -} - -static u8 inic_bmdma_status(struct ata_port *ap) -{ - /* event is already verified by the interrupt handler */ - return ATA_DMA_INTR; -} - static void inic_stop_idma(struct ata_port *ap) { void __iomem *port_base = inic_port_base(ap); @@ -631,8 +553,7 @@ static void inic_freeze(struct ata_port *ap) { void __iomem *port_base = inic_port_base(ap); - __inic_set_pirq_mask(ap, PIRQ_MASK_FREEZE); - + writeb(PIRQ_MASK_FREEZE, port_base + PORT_IRQ_MASK); ap->ops->sff_check_status(ap); writeb(0xff, port_base + PORT_IRQ_STAT); } @@ -643,8 +564,7 @@ static void inic_thaw(struct ata_port *ap) ap->ops->sff_check_status(ap); writeb(0xff, port_base + PORT_IRQ_STAT); - - __inic_set_pirq_mask(ap, PIRQ_MASK_OTHER); + writeb(PIRQ_MASK_DEFAULT, port_base + PORT_IRQ_MASK); } static int inic_check_ready(struct ata_link *link) @@ -707,7 +627,6 @@ static int inic_hardreset(struct ata_link *link, unsigned int *class, static void inic_error_handler(struct ata_port *ap) { void __iomem *port_base = inic_port_base(ap); - struct inic_port_priv *pp = ap->private_data; unsigned long flags; /* reset PIO HSM and stop DMA engine */ @@ -715,7 +634,6 @@ static void 
inic_error_handler(struct ata_port *ap) spin_lock_irqsave(ap->lock, flags); ap->hsm_task_state = HSM_ST_IDLE; - writeb(pp->dfl_prdctl, port_base + PORT_PRD_CTL); spin_unlock_irqrestore(ap->lock, flags); /* PIO and DMA engines have been stopped, perform recovery */ @@ -765,10 +683,8 @@ static int inic_port_resume(struct ata_port *ap) static int inic_port_start(struct ata_port *ap) { - void __iomem *port_base = inic_port_base(ap); struct device *dev = ap->host->dev; struct inic_port_priv *pp; - u8 tmp; int rc; /* alloc and initialize private data */ @@ -777,11 +693,6 @@ static int inic_port_start(struct ata_port *ap) return -ENOMEM; ap->private_data = pp; - /* default PRD_CTL value, DMAEN, WR and START off */ - tmp = readb(port_base + PORT_PRD_CTL); - tmp &= ~(PRD_CTL_DMAEN | PRD_CTL_WR | PRD_CTL_START); - pp->dfl_prdctl = tmp; - /* Alloc resources */ rc = ata_port_start(ap); if (rc) @@ -805,10 +716,6 @@ static int inic_port_start(struct ata_port *ap) static struct ata_port_operations inic_port_ops = { .inherits = &ata_sff_port_ops, - .bmdma_setup = inic_bmdma_setup, - .bmdma_start = inic_bmdma_start, - .bmdma_stop = inic_bmdma_stop, - .bmdma_status = inic_bmdma_status, .qc_prep = inic_qc_prep, .qc_issue = inic_qc_issue, .qc_fill_rtf = inic_qc_fill_rtf, -- cgit v0.10.2 From 049e8e04986bde66df9648d88d0960ab4cbd6992 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 30 Apr 2008 16:35:13 +0900 Subject: sata_inic162x: use IDMA for non DMA ATA commands Use IDMA for PIO and non-data commands. This allows sata_inic162x to safely drive LBA48 devices. Kill inic_dev_config() which contains code to reject LBA48 devices. With this change, status checking in inic_qc_issue() to avoid hard lock up after hotplug can go away too. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c index 3ca0ee9..579154c 100644 --- a/drivers/ata/sata_inic162x.c +++ b/drivers/ata/sata_inic162x.c @@ -356,12 +356,12 @@ static void inic_host_intr(struct ata_port *ap) if (unlikely((irq_stat & PIRQ_ERR) || (idma_stat & IDMA_STAT_ERR))) inic_host_err_intr(ap, irq_stat, idma_stat); - if (unlikely(!qc || (qc->tf.flags & ATA_TFLAG_POLLING))) { + if (unlikely(!qc)) { ap->ops->sff_check_status(ap); /* clear ATA interrupt */ goto spurious; } - if (qc->tf.protocol == ATA_PROT_DMA) { + if (!ata_is_atapi(qc->tf.protocol)) { if (likely(idma_stat & IDMA_STAT_DONE)) { inic_stop_idma(ap); @@ -425,11 +425,14 @@ static void inic_fill_sg(struct inic_prd *prd, struct ata_queued_cmd *qc) { struct scatterlist *sg; unsigned int si; - u8 flags = PRD_DMA; + u8 flags = 0; if (qc->tf.flags & ATA_TFLAG_WRITE) flags |= PRD_WRITE; + if (ata_is_dma(qc->tf.protocol)) + flags |= PRD_DMA; + for_each_sg(qc->sg, sg, qc->n_elem, si) { prd->mad = cpu_to_le32(sg_dma_address(sg)); prd->len = cpu_to_le16(sg_dma_len(sg)); @@ -447,16 +450,20 @@ static void inic_qc_prep(struct ata_queued_cmd *qc) struct inic_pkt *pkt = pp->pkt; struct inic_cpb *cpb = &pkt->cpb; struct inic_prd *prd = pkt->prd; + bool is_atapi = ata_is_atapi(qc->tf.protocol); + bool is_data = ata_is_data(qc->tf.protocol); VPRINTK("ENTER\n"); - if (qc->tf.protocol != ATA_PROT_DMA) + if (is_atapi) return; /* prepare packet, based on initio driver */ memset(pkt, 0, sizeof(struct inic_pkt)); - cpb->ctl_flags = CPB_CTL_VALID | CPB_CTL_IEN | CPB_CTL_DATA; + cpb->ctl_flags = CPB_CTL_VALID | CPB_CTL_IEN; + if (is_data) + cpb->ctl_flags |= CPB_CTL_DATA; cpb->len = cpu_to_le32(qc->nbytes); cpb->prd = cpu_to_le32(pp->pkt_dma + offsetof(struct 
inic_pkt, prd)); @@ -480,7 +487,8 @@ static void inic_qc_prep(struct ata_queued_cmd *qc) /* don't load ctl - dunno why. it's like that in the initio driver */ /* setup sg table */ - inic_fill_sg(prd, qc); + if (is_data) + inic_fill_sg(prd, qc); pp->cpb_tbl[0] = pp->pkt_dma; } @@ -490,7 +498,7 @@ static unsigned int inic_qc_issue(struct ata_queued_cmd *qc) struct ata_port *ap = qc->ap; void __iomem *port_base = inic_port_base(ap); - if (qc->tf.protocol == ATA_PROT_DMA) { + if (!ata_is_atapi(qc->tf.protocol)) { /* fire up the ADMA engine */ writew(HCTL_FTHD0, port_base + HOST_CTL); writew(IDMA_CTL_GO, port_base + PORT_IDMA_CTL); @@ -499,18 +507,6 @@ static unsigned int inic_qc_issue(struct ata_queued_cmd *qc) return 0; } - /* Issuing a command to yet uninitialized port locks up the - * controller. Most of the time, this happens for the first - * command after reset which are ATA and ATAPI IDENTIFYs. - * Fast fail if stat is 0x7f or 0xff for those commands. - */ - if (unlikely(qc->tf.command == ATA_CMD_ID_ATA || - qc->tf.command == ATA_CMD_ID_ATAPI)) { - u8 stat = ap->ops->sff_check_status(ap); - if (stat == 0x7f || stat == 0xff) - return AC_ERR_HSM; - } - return ata_sff_qc_issue(qc); } @@ -647,20 +643,6 @@ static void inic_post_internal_cmd(struct ata_queued_cmd *qc) inic_reset_port(inic_port_base(qc->ap)); } -static void inic_dev_config(struct ata_device *dev) -{ - /* inic can only handle upto LBA28 max sectors */ - if (dev->max_sectors > ATA_MAX_SECTORS) - dev->max_sectors = ATA_MAX_SECTORS; - - if (dev->n_sectors >= 1 << 28) { - ata_dev_printk(dev, KERN_ERR, - "ERROR: This driver doesn't support LBA48 yet and may cause\n" - " data corruption on such devices. Disabling.\n"); - ata_dev_disable(dev); - } -} - static void init_port(struct ata_port *ap) { void __iomem *port_base = inic_port_base(ap); @@ -726,7 +708,6 @@ static struct ata_port_operations inic_port_ops = { .hardreset = inic_hardreset, .error_handler = inic_error_handler, .post_internal_cmd = inic_post_internal_cmd, - .dev_config = inic_dev_config, .scr_read = inic_scr_read, .scr_write = inic_scr_write, -- cgit v0.10.2 From b3f677e501a494aa1582d4ff35fb3ac6f0a59b08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 30 Apr 2008 16:35:14 +0900 Subject: sata_inic162x: use IDMA for ATAPI commands Use IDMA for ATAPI commands. Write and some misc commands time out when executed using ATAPI_PROT_DMA but ATAPI_PROT_PIO works fine. As PIO is driven by DMA too, it doesn't make any noticeable difference for native SATA devices. inic_check_atapi_dma() is implemented to force PIO for those ATAPI commands. After this change, sata_inic162x issues all commands using IDMA. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c index 579154c..cdae435 100644 --- a/drivers/ata/sata_inic162x.c +++ b/drivers/ata/sata_inic162x.c @@ -192,7 +192,8 @@ struct inic_prd { struct inic_pkt { struct inic_cpb cpb; - struct inic_prd prd[LIBATA_MAX_PRD]; + struct inic_prd prd[LIBATA_MAX_PRD + 1]; /* + 1 for cdb */ + u8 cdb[ATAPI_CDB_LEN]; } __packed; struct inic_host_priv { @@ -361,23 +362,18 @@ static void inic_host_intr(struct ata_port *ap) goto spurious; } - if (!ata_is_atapi(qc->tf.protocol)) { - if (likely(idma_stat & IDMA_STAT_DONE)) { - inic_stop_idma(ap); + if (likely(idma_stat & IDMA_STAT_DONE)) { + inic_stop_idma(ap); - /* Depending on circumstances, device error - * isn't reported by IDMA, check it explicitly. 
- */ - if (unlikely(readb(port_base + PORT_TF_COMMAND) & - (ATA_DF | ATA_ERR))) - qc->err_mask |= AC_ERR_DEV; + /* Depending on circumstances, device error + * isn't reported by IDMA, check it explicitly. + */ + if (unlikely(readb(port_base + PORT_TF_COMMAND) & + (ATA_DF | ATA_ERR))) + qc->err_mask |= AC_ERR_DEV; - ata_qc_complete(qc); - return; - } - } else { - if (likely(ata_sff_host_intr(ap, qc))) - return; + ata_qc_complete(qc); + return; } spurious: @@ -421,6 +417,19 @@ static irqreturn_t inic_interrupt(int irq, void *dev_instance) return IRQ_RETVAL(handled); } +static int inic_check_atapi_dma(struct ata_queued_cmd *qc) +{ + /* For some reason ATAPI_PROT_DMA doesn't work for some + * commands including writes and other misc ops. Use PIO + * protocol instead, which BTW is driven by the DMA engine + * anyway, so it shouldn't make much difference for native + * SATA devices. + */ + if (atapi_cmd_type(qc->cdb[0]) == READ) + return 0; + return 1; +} + static void inic_fill_sg(struct inic_prd *prd, struct ata_queued_cmd *qc) { struct scatterlist *sg; @@ -452,20 +461,21 @@ static void inic_qc_prep(struct ata_queued_cmd *qc) struct inic_prd *prd = pkt->prd; bool is_atapi = ata_is_atapi(qc->tf.protocol); bool is_data = ata_is_data(qc->tf.protocol); + unsigned int cdb_len = 0; VPRINTK("ENTER\n"); if (is_atapi) - return; + cdb_len = qc->dev->cdb_len; /* prepare packet, based on initio driver */ memset(pkt, 0, sizeof(struct inic_pkt)); cpb->ctl_flags = CPB_CTL_VALID | CPB_CTL_IEN; - if (is_data) + if (is_atapi || is_data) cpb->ctl_flags |= CPB_CTL_DATA; - cpb->len = cpu_to_le32(qc->nbytes); + cpb->len = cpu_to_le32(qc->nbytes + cdb_len); cpb->prd = cpu_to_le32(pp->pkt_dma + offsetof(struct inic_pkt, prd)); cpb->device = qc->tf.device; @@ -486,6 +496,18 @@ static void inic_qc_prep(struct ata_queued_cmd *qc) cpb->command = qc->tf.command; /* don't load ctl - dunno why. it's like that in the initio driver */ + /* setup PRD for CDB */ + if (is_atapi) { + memcpy(pkt->cdb, qc->cdb, ATAPI_CDB_LEN); + prd->mad = cpu_to_le32(pp->pkt_dma + + offsetof(struct inic_pkt, cdb)); + prd->len = cpu_to_le16(cdb_len); + prd->flags = PRD_CDB | PRD_WRITE; + if (!is_data) + prd->flags |= PRD_END; + prd++; + } + /* setup sg table */ if (is_data) inic_fill_sg(prd, qc); @@ -498,16 +520,12 @@ static unsigned int inic_qc_issue(struct ata_queued_cmd *qc) struct ata_port *ap = qc->ap; void __iomem *port_base = inic_port_base(ap); - if (!ata_is_atapi(qc->tf.protocol)) { - /* fire up the ADMA engine */ - writew(HCTL_FTHD0, port_base + HOST_CTL); - writew(IDMA_CTL_GO, port_base + PORT_IDMA_CTL); - writeb(0, port_base + PORT_CPB_PTQFIFO); - - return 0; - } + /* fire up the ADMA engine */ + writew(HCTL_FTHD0, port_base + HOST_CTL); + writew(IDMA_CTL_GO, port_base + PORT_IDMA_CTL); + writeb(0, port_base + PORT_CPB_PTQFIFO); - return ata_sff_qc_issue(qc); + return 0; } static void inic_tf_read(struct ata_port *ap, struct ata_taskfile *tf) @@ -698,6 +716,7 @@ static int inic_port_start(struct ata_port *ap) static struct ata_port_operations inic_port_ops = { .inherits = &ata_sff_port_ops, + .check_atapi_dma = inic_check_atapi_dma, .qc_prep = inic_qc_prep, .qc_issue = inic_qc_issue, .qc_fill_rtf = inic_qc_fill_rtf, @@ -717,12 +736,6 @@ static struct ata_port_operations inic_port_ops = { }; static struct ata_port_info inic_port_info = { - /* For some reason, ATAPI_PROT_PIO is broken on this - * controller, and no, PIO_POLLING does't fix it. 
It somehow - * manages to report the wrong ireason and ignoring ireason - * results in machine lock up. Tell libata to always prefer - * DMA. - */ .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA, .pio_mask = 0x1f, /* pio0-4 */ .mwdma_mask = 0x07, /* mwdma0-2 */ -- cgit v0.10.2 From f8b0685a8ea8e3974f8953378ede2111f8d49d22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 30 Apr 2008 16:35:15 +0900 Subject: sata_inic162x: kill now unused SFF related stuff sata_inic162x now doesn't use any SFF features. Remove all SFF related stuff. * Mask unsolicited ATA interrupts. This removes our primary source of spurious interrupts and spurious interrupt handling can be tightened up. There's no need to clear ATA interrupts by reading status register either. * Don't dance with IDMA_CTL_ATA_NIEN and simplify accesses to IDMA_CTL. * Inherit from sata_port_ops instead of ata_sff_port_ops. * Don't initialize or use ioaddr. There's no need to map BAR0-4 anymore. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c index cdae435..55f8e93 100644 --- a/drivers/ata/sata_inic162x.c +++ b/drivers/ata/sata_inic162x.c @@ -101,7 +101,7 @@ enum { PIRQ_PENDING = (1 << 7), /* port IRQ pending (STAT only) */ PIRQ_ERR = PIRQ_OFFLINE | PIRQ_ONLINE | PIRQ_FATAL, - PIRQ_MASK_DEFAULT = PIRQ_REPLY, + PIRQ_MASK_DEFAULT = PIRQ_REPLY | PIRQ_ATA, PIRQ_MASK_FREEZE = 0xff, /* PORT_PRD_CTL bits */ @@ -227,31 +227,26 @@ static void __iomem *inic_port_base(struct ata_port *ap) static void inic_reset_port(void __iomem *port_base) { void __iomem *idma_ctl = port_base + PORT_IDMA_CTL; - u16 ctl; - ctl = readw(idma_ctl); - ctl &= ~(IDMA_CTL_RST_IDMA | IDMA_CTL_ATA_NIEN | IDMA_CTL_GO); + /* stop IDMA engine */ + readw(idma_ctl); /* flush */ + msleep(1); /* mask IRQ and assert reset */ - writew(ctl | IDMA_CTL_RST_IDMA | IDMA_CTL_ATA_NIEN, idma_ctl); + writew(IDMA_CTL_RST_IDMA, idma_ctl); readw(idma_ctl); /* flush */ - - /* give it some time */ msleep(1); /* release reset */ - writew(ctl | IDMA_CTL_ATA_NIEN, idma_ctl); + writew(0, idma_ctl); /* clear irq */ writeb(0xff, port_base + PORT_IRQ_STAT); - - /* reenable ATA IRQ, turn off IDMA mode */ - writew(ctl, idma_ctl); } static int inic_scr_read(struct ata_port *ap, unsigned sc_reg, u32 *val) { - void __iomem *scr_addr = ap->ioaddr.scr_addr; + void __iomem *scr_addr = inic_port_base(ap) + PORT_SCR; void __iomem *addr; if (unlikely(sc_reg >= ARRAY_SIZE(scr_map))) @@ -268,7 +263,7 @@ static int inic_scr_read(struct ata_port *ap, unsigned sc_reg, u32 *val) static int inic_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val) { - void __iomem *scr_addr = ap->ioaddr.scr_addr; + void __iomem *scr_addr = inic_port_base(ap) + PORT_SCR; if (unlikely(sc_reg >= ARRAY_SIZE(scr_map))) return -EINVAL; @@ -357,10 +352,8 @@ static void inic_host_intr(struct ata_port *ap) if (unlikely((irq_stat & PIRQ_ERR) || (idma_stat & IDMA_STAT_ERR))) inic_host_err_intr(ap, irq_stat, idma_stat); - if (unlikely(!qc)) { - ap->ops->sff_check_status(ap); /* clear ATA interrupt */ + if (unlikely(!qc)) goto spurious; - } if (likely(idma_stat & IDMA_STAT_DONE)) { inic_stop_idma(ap); @@ -377,7 +370,9 @@ static void inic_host_intr(struct ata_port *ap) } spurious: - ap->ops->sff_check_status(ap); /* clear ATA interrupt */ + ata_port_printk(ap, KERN_WARNING, "unhandled interrupt: " + "cmd=0x%x irq_stat=0x%x idma_stat=0x%x\n", + qc ? 
qc->tf.command : 0xff, irq_stat, idma_stat); } static irqreturn_t inic_interrupt(int irq, void *dev_instance) @@ -568,7 +563,6 @@ static void inic_freeze(struct ata_port *ap) void __iomem *port_base = inic_port_base(ap); writeb(PIRQ_MASK_FREEZE, port_base + PORT_IRQ_MASK); - ap->ops->sff_check_status(ap); writeb(0xff, port_base + PORT_IRQ_STAT); } @@ -576,7 +570,6 @@ static void inic_thaw(struct ata_port *ap) { void __iomem *port_base = inic_port_base(ap); - ap->ops->sff_check_status(ap); writeb(0xff, port_base + PORT_IRQ_STAT); writeb(PIRQ_MASK_DEFAULT, port_base + PORT_IRQ_MASK); } @@ -599,17 +592,15 @@ static int inic_hardreset(struct ata_link *link, unsigned int *class, void __iomem *port_base = inic_port_base(ap); void __iomem *idma_ctl = port_base + PORT_IDMA_CTL; const unsigned long *timing = sata_ehc_deb_timing(&link->eh_context); - u16 val; int rc; /* hammer it into sane state */ inic_reset_port(port_base); - val = readw(idma_ctl); - writew(val | IDMA_CTL_RST_ATA, idma_ctl); + writew(IDMA_CTL_RST_ATA, idma_ctl); readw(idma_ctl); /* flush */ msleep(1); - writew(val & ~IDMA_CTL_RST_ATA, idma_ctl); + writew(0, idma_ctl); rc = sata_link_resume(link, timing, deadline); if (rc) { @@ -641,16 +632,8 @@ static int inic_hardreset(struct ata_link *link, unsigned int *class, static void inic_error_handler(struct ata_port *ap) { void __iomem *port_base = inic_port_base(ap); - unsigned long flags; - /* reset PIO HSM and stop DMA engine */ inic_reset_port(port_base); - - spin_lock_irqsave(ap->lock, flags); - ap->hsm_task_state = HSM_ST_IDLE; - spin_unlock_irqrestore(ap->lock, flags); - - /* PIO and DMA engines have been stopped, perform recovery */ ata_std_error_handler(ap); } @@ -714,7 +697,7 @@ static int inic_port_start(struct ata_port *ap) } static struct ata_port_operations inic_port_ops = { - .inherits = &ata_sff_port_ops, + .inherits = &sata_port_ops, .check_atapi_dma = inic_check_atapi_dma, .qc_prep = inic_qc_prep, @@ -723,7 +706,6 @@ static struct ata_port_operations inic_port_ops = { .freeze = inic_freeze, .thaw = inic_thaw, - .softreset = ATA_OP_NULL, /* softreset is broken */ .hardreset = inic_hardreset, .error_handler = inic_error_handler, .post_internal_cmd = inic_post_internal_cmd, @@ -832,34 +814,19 @@ static int inic_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (rc) return rc; - rc = pcim_iomap_regions(pdev, 0x3f, DRV_NAME); + rc = pcim_iomap_regions(pdev, 1 << MMIO_BAR, DRV_NAME); if (rc) return rc; host->iomap = iomap = pcim_iomap_table(pdev); + hpriv->cached_hctl = readw(iomap[MMIO_BAR] + HOST_CTL); for (i = 0; i < NR_PORTS; i++) { struct ata_port *ap = host->ports[i]; - struct ata_ioports *port = &ap->ioaddr; - unsigned int offset = i * PORT_SIZE; - - port->cmd_addr = iomap[2 * i]; - port->altstatus_addr = - port->ctl_addr = (void __iomem *) - ((unsigned long)iomap[2 * i + 1] | ATA_PCI_CTL_OFS); - port->scr_addr = iomap[MMIO_BAR] + offset + PORT_SCR; - - ata_sff_std_ports(port); ata_port_pbar_desc(ap, MMIO_BAR, -1, "mmio"); - ata_port_pbar_desc(ap, MMIO_BAR, offset, "port"); - ata_port_desc(ap, "cmd 0x%llx ctl 0x%llx", - (unsigned long long)pci_resource_start(pdev, 2 * i), - (unsigned long long)pci_resource_start(pdev, (2 * i + 1)) | - ATA_PCI_CTL_OFS); + ata_port_pbar_desc(ap, MMIO_BAR, i * PORT_SIZE, "port"); } - hpriv->cached_hctl = readw(iomap[MMIO_BAR] + HOST_CTL); - /* Set dma_mask. This devices doesn't support 64bit addressing. 
*/ rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); if (rc) { -- cgit v0.10.2 From ba66b242b1c3432b44d893c64124522b3bdce71e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 30 Apr 2008 16:35:16 +0900 Subject: sata_inic162x: add cardbus support When attached to cardbus, mmio region is at BAR 1. Other than that, everything else is the same. Add support for it. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c index 55f8e93..8c1f06a 100644 --- a/drivers/ata/sata_inic162x.c +++ b/drivers/ata/sata_inic162x.c @@ -31,7 +31,8 @@ #define DRV_VERSION "0.3" enum { - MMIO_BAR = 5, + MMIO_BAR_PCI = 5, + MMIO_BAR_CARDBUS = 1, NR_PORTS = 2, @@ -197,6 +198,7 @@ struct inic_pkt { } __packed; struct inic_host_priv { + void __iomem *mmio_base; u16 cached_hctl; }; @@ -221,7 +223,9 @@ static const int scr_map[] = { static void __iomem *inic_port_base(struct ata_port *ap) { - return ap->host->iomap[MMIO_BAR] + ap->port_no * PORT_SIZE; + struct inic_host_priv *hpriv = ap->host->private_data; + + return hpriv->mmio_base + ap->port_no * PORT_SIZE; } static void inic_reset_port(void __iomem *port_base) @@ -378,11 +382,11 @@ static void inic_host_intr(struct ata_port *ap) static irqreturn_t inic_interrupt(int irq, void *dev_instance) { struct ata_host *host = dev_instance; - void __iomem *mmio_base = host->iomap[MMIO_BAR]; + struct inic_host_priv *hpriv = host->private_data; u16 host_irq_stat; int i, handled = 0;; - host_irq_stat = readw(mmio_base + HOST_IRQ_STAT); + host_irq_stat = readw(hpriv->mmio_base + HOST_IRQ_STAT); if (unlikely(!(host_irq_stat & HIRQ_GLOBAL))) goto out; @@ -770,7 +774,6 @@ static int inic_pci_device_resume(struct pci_dev *pdev) { struct ata_host *host = dev_get_drvdata(&pdev->dev); struct inic_host_priv *hpriv = host->private_data; - void __iomem *mmio_base = host->iomap[MMIO_BAR]; int rc; rc = ata_pci_device_do_resume(pdev); @@ -778,7 +781,7 @@ static int inic_pci_device_resume(struct pci_dev *pdev) return rc; if (pdev->dev.power.power_state.event == PM_EVENT_SUSPEND) { - rc = init_controller(mmio_base, hpriv->cached_hctl); + rc = init_controller(hpriv->mmio_base, hpriv->cached_hctl); if (rc) return rc; } @@ -796,6 +799,7 @@ static int inic_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) struct ata_host *host; struct inic_host_priv *hpriv; void __iomem * const *iomap; + int mmio_bar; int i, rc; if (!printed_version++) @@ -809,22 +813,30 @@ static int inic_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) host->private_data = hpriv; - /* acquire resources and fill host */ + /* Acquire resources and fill host. Note that PCI and cardbus + * use different BARs. + */ rc = pcim_enable_device(pdev); if (rc) return rc; - rc = pcim_iomap_regions(pdev, 1 << MMIO_BAR, DRV_NAME); + if (pci_resource_flags(pdev, MMIO_BAR_PCI) & IORESOURCE_MEM) + mmio_bar = MMIO_BAR_PCI; + else + mmio_bar = MMIO_BAR_CARDBUS; + + rc = pcim_iomap_regions(pdev, 1 << mmio_bar, DRV_NAME); if (rc) return rc; host->iomap = iomap = pcim_iomap_table(pdev); - hpriv->cached_hctl = readw(iomap[MMIO_BAR] + HOST_CTL); + hpriv->mmio_base = iomap[mmio_bar]; + hpriv->cached_hctl = readw(hpriv->mmio_base + HOST_CTL); for (i = 0; i < NR_PORTS; i++) { struct ata_port *ap = host->ports[i]; - ata_port_pbar_desc(ap, MMIO_BAR, -1, "mmio"); - ata_port_pbar_desc(ap, MMIO_BAR, i * PORT_SIZE, "port"); + ata_port_pbar_desc(ap, mmio_bar, -1, "mmio"); + ata_port_pbar_desc(ap, mmio_bar, i * PORT_SIZE, "port"); } /* Set dma_mask. 
This devices doesn't support 64bit addressing. */ @@ -854,7 +866,7 @@ static int inic_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) return rc; } - rc = init_controller(iomap[MMIO_BAR], hpriv->cached_hctl); + rc = init_controller(hpriv->mmio_base, hpriv->cached_hctl); if (rc) { dev_printk(KERN_ERR, &pdev->dev, "failed to initialize controller\n"); -- cgit v0.10.2 From 22bfc6d5e19b72d50535ce32fd6dee2ce2e75775 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 30 Apr 2008 16:35:17 +0900 Subject: sata_inic162x: update intro comment, up the version and drop EXPERIMENTAL sata_inic162x is now ready for production use. Bump the version, explain what's working and what's not and drop EXPERIMENTAL. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig index 1a59f30..9bf2986 100644 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@ -205,8 +205,8 @@ config SATA_VITESSE If unsure, say N. config SATA_INIC162X - tristate "Initio 162x SATA support (HIGHLY EXPERIMENTAL)" - depends on PCI && EXPERIMENTAL + tristate "Initio 162x SATA support" + depends on PCI help This option enables support for Initio 162x Serial ATA. diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c index 8c1f06a..3ead02f 100644 --- a/drivers/ata/sata_inic162x.c +++ b/drivers/ata/sata_inic162x.c @@ -10,13 +10,33 @@ * right. Documentation is available at initio's website but it only * documents registers (not programming model). * - * - ATA disks work. - * - Hotplug works. - * - ATAPI read works but burning doesn't. This thing is really - * peculiar about ATAPI and I couldn't figure out how ATAPI PIO and - * ATAPI DMA WRITE should be programmed. If you've got a clue, be - * my guest. - * - Both STR and STD work. + * This driver has interesting history. The first version was written + * from the documentation and a 2.4 IDE driver posted on a Taiwan + * company, which didn't use any IDMA features and couldn't handle + * LBA48. The resulting driver couldn't handle LBA48 devices either + * making it pretty useless. + * + * After a while, initio picked the driver up, renamed it to + * sata_initio162x, updated it to use IDMA for ATA DMA commands and + * posted it on their website. It only used ATA_PROT_DMA for IDMA and + * attaching both devices and issuing IDMA and !IDMA commands + * simultaneously broke it due to PIRQ masking interaction but it did + * show how to use the IDMA (ADMA + some initio specific twists) + * engine. + * + * Then, I picked up their changes again and here's the usable driver + * which uses IDMA for everything. Everything works now including + * LBA48, CD/DVD burning, suspend/resume and hotplug. There are some + * issues tho. Result Tf is not resported properly, NCQ isn't + * supported yet and CD/DVD writing works with DMA assisted PIO + * protocol (which, for native SATA devices, shouldn't cause any + * noticeable difference). + * + * Anyways, so, here's finally a working driver for inic162x. Enjoy! + * + * initio: If you guys wanna improve the driver regarding result TF + * access and other stuff, please feel free to contact me. I'll be + * happy to assist. 
*/ #include @@ -28,7 +48,7 @@ #include #define DRV_NAME "sata_inic162x" -#define DRV_VERSION "0.3" +#define DRV_VERSION "0.4" enum { MMIO_BAR_PCI = 5, -- cgit v0.10.2 From 05177f178efe1459d2d0ac05430027ba201889a4 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 2 May 2008 15:13:39 -0700 Subject: pata_atiixp: Don't disable A couple of distributions (Fedora, Ubuntu) were having weird problems with the ATI IXP series PATA controllers being reported as simplex. At the heart of the problem is that both distros ignored the recommendations to load pata_acpi and ata_generic *AFTER* specific host drivers. The underlying cause however is that if you D3 and then D0 an ATI IXP it helpfully throws away some configuration and won't let you rewrite it. Add checks to ata_generic and pata_acpi to pin ATIIXP devices. Possibly the real answer here is to quirk them and pin them, but right now we can't do that before they've been pcim_enable()'d by a driver. I'm indebted to David Gero for this. His bug report not only reported the problem but identified the cause correctly and he had tested the right values to prove what was going on [If you backport this for 2.6.24 you will need to pull in the 2.6.25 removal of the bogus WARN_ON() in pcim_enagle] Signed-off-by: Alan Cox Tested-by: David Gero Signed-off-by: Andrew Morton Signed-off-by: Jeff Garzik diff --git a/drivers/ata/ata_generic.c b/drivers/ata/ata_generic.c index 47aeccd..75a406f 100644 --- a/drivers/ata/ata_generic.c +++ b/drivers/ata/ata_generic.c @@ -152,6 +152,12 @@ static int ata_generic_init_one(struct pci_dev *dev, const struct pci_device_id if (dev->vendor == PCI_VENDOR_ID_AL) ata_pci_bmdma_clear_simplex(dev); + if (dev->vendor == PCI_VENDOR_ID_ATI) { + int rc = pcim_enable_device(dev); + if (rc < 0) + return rc; + pcim_pin_device(dev); + } return ata_pci_sff_init_one(dev, ppi, &generic_sht, NULL); } diff --git a/drivers/ata/pata_acpi.c b/drivers/ata/pata_acpi.c index c5f91e6..fbe6057 100644 --- a/drivers/ata/pata_acpi.c +++ b/drivers/ata/pata_acpi.c @@ -259,6 +259,12 @@ static int pacpi_init_one (struct pci_dev *pdev, const struct pci_device_id *id) .port_ops = &pacpi_ops, }; const struct ata_port_info *ppi[] = { &info, NULL }; + if (pdev->vendor == PCI_VENDOR_ID_ATI) { + int rc = pcim_enable_device(pdev); + if (rc < 0) + return rc; + pcim_pin_device(pdev); + } return ata_pci_sff_init_one(pdev, ppi, &pacpi_sht, NULL); } -- cgit v0.10.2 From 822973ba79fd5a5b711270c2de7196c6b50c6687 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 2 May 2008 17:49:37 -0700 Subject: bonding: Do not call free_netdev for already registered device. If the call to bond_create_sysfs_entry in bond_create fails, the proper rollback is to call unregister_netdevice, not free_netdev. Otherwise - kernel BUG at net/core/dev.c:4057! Checked with artificial failures injected into bond_create_sysfs_entry. Pavel's original patch modified by Jay Vosburgh to move code around for clarity (remove goto-hopping within the unwind block). 
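(Illustrative aside, not part of the patch: the rollback rule this fix relies on, that once register_netdevice() has succeeded only unregister_netdevice() may tear the device down, applies to any create path. Below is a minimal userspace sketch of the same unwind discipline, using hypothetical names register_obj/unregister_obj/free_obj rather than the bonding code.)

/* Each error label undoes only the steps that have already succeeded;
 * after registration, the matching unregister is the only valid teardown. */
#include <stdio.h>
#include <stdlib.h>

struct obj { int registered; };

static struct obj *alloc_obj(void)            { return calloc(1, sizeof(struct obj)); }
static void free_obj(struct obj *o)           { free(o); }  /* only for never-registered objects */
static int  register_obj(struct obj *o)       { o->registered = 1; return 0; }
static void unregister_obj(struct obj *o)     { o->registered = 0; free_obj(o); }
static int  create_sysfs_entry(struct obj *o) { (void)o; return -1; }  /* simulate failure */

static int create(void)
{
	struct obj *o = alloc_obj();
	int err;

	if (!o)
		return -1;

	err = register_obj(o);
	if (err)
		goto err_free;		/* not yet registered: plain free is fine */

	err = create_sysfs_entry(o);
	if (err)
		goto err_unregister;	/* already registered: must unregister, not free */

	return 0;

err_unregister:
	unregister_obj(o);
	return err;
err_free:
	free_obj(o);
	return err;
}

int main(void)
{
	printf("create() returned %d\n", create());
	return 0;
}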
Signed-off-by: Pavel Emelyanov Signed-off-by: Jay Vosburgh Signed-off-by: Jeff Garzik diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 6425603..5509732 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4936,7 +4936,9 @@ int bond_create(char *name, struct bond_params *params, struct bonding **newbond if (res < 0) { rtnl_lock(); down_write(&bonding_rwsem); - goto out_bond; + bond_deinit(bond_dev); + unregister_netdevice(bond_dev); + goto out_rtnl; } return 0; -- cgit v0.10.2 From c4ebc66a1a8e3576322a9f47f0d06ec3c96a08d7 Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Fri, 2 May 2008 17:49:38 -0700 Subject: bonding: fix error unwind in bonding_store_bonds Fixed an error unwind in bonding_store_bonds that didn't release the locks it held, and consolidated unwinds into a common block at the end of the function. Bug reported by Pavel Emelyanov , who provided a different fix. Signed-off-by: Jay Vosburgh Signed-off-by: Jeff Garzik diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index 979c2d0..68c41a0 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -146,29 +146,29 @@ static ssize_t bonding_store_bonds(struct class *cls, const char *buffer, size_t ": Unable remove bond %s due to open references.\n", ifname); res = -EPERM; - goto out; + goto out_unlock; } printk(KERN_INFO DRV_NAME ": %s is being deleted...\n", bond->dev->name); bond_destroy(bond); - up_write(&bonding_rwsem); - rtnl_unlock(); - goto out; + goto out_unlock; } printk(KERN_ERR DRV_NAME ": unable to delete non-existent bond %s\n", ifname); res = -ENODEV; - up_write(&bonding_rwsem); - rtnl_unlock(); - goto out; + goto out_unlock; } err_no_cmd: printk(KERN_ERR DRV_NAME ": no command found in bonding_masters. Use +ifname or -ifname.\n"); - res = -EPERM; + return -EPERM; + +out_unlock: + up_write(&bonding_rwsem); + rtnl_unlock(); /* Always return either count or an error. If you return 0, you'll * get called forever, which is bad. -- cgit v0.10.2 From ae68c39819ddf30549652962768a50edae5eec6f Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 2 May 2008 17:49:39 -0700 Subject: bonding: Deadlock between bonding_store_bonds and bond_destroy_sysfs. The sysfs layer has an internal protection, that ensures, that all the process sitting inside ->sore/->show callback exits before the appropriate entry is unregistered (the calltraces are rather big, but I can provide them if required). On the other hand, bonding takes rtnl_lock in a) the bonding_store_bonds, i.e. in ->store callback, b) module exit before calling the sysfs unregister routines. Thus, the classical AB-BA deadlock may occur. To reproduce run # while :; do modprobe bonding; rmmod bonding; done and # while :; do echo '+bond%d' > /sys/class/net/bonding_masters ; done in parallel. The fix is to move the bond_destroy_sysfs out of the rtnl_lock, but _before_ bond_free_all to make sure no bonding devices exist after module unload. 
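(Illustrative aside, not part of the patch: the AB-BA inversion described above can be sketched in userspace. The two pthread mutexes below are stand-ins for rtnl_lock and the sysfs "a ->store handler is running" protection; all names are hypothetical. The exit path shown is the fixed ordering from the patch, which never holds the locks in the inverted order, so the program terminates.)

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rtnl         = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t sysfs_active = PTHREAD_MUTEX_INITIALIZER;

/* ->store() path: sysfs core holds its protection, the handler then takes rtnl */
static void *store_handler(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&sysfs_active);
	pthread_mutex_lock(&rtnl);
	/* ... create or delete a bonding device ... */
	pthread_mutex_unlock(&rtnl);
	pthread_mutex_unlock(&sysfs_active);
	return NULL;
}

/* fixed module-exit path: finish with sysfs (which waits for any running
 * ->store handler) before taking rtnl, so the order never inverts.
 * The buggy order was lock(rtnl) followed by waiting on sysfs_active. */
static void *module_exit_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&sysfs_active);	/* bond_destroy_sysfs() */
	pthread_mutex_unlock(&sysfs_active);
	pthread_mutex_lock(&rtnl);		/* rtnl_lock(); bond_free_all() */
	pthread_mutex_unlock(&rtnl);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, store_handler, NULL);
	pthread_create(&b, NULL, module_exit_path, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("no deadlock with the corrected ordering");
	return 0;
}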
Signed-off-by: Pavel Emelyanov Acked-by: Jay Vosburgh Signed-off-by: Jeff Garzik diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 5509732..e41b3e5 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4992,9 +4992,10 @@ err: destroy_workqueue(bond->wq); } + bond_destroy_sysfs(); + rtnl_lock(); bond_free_all(); - bond_destroy_sysfs(); rtnl_unlock(); out: return res; @@ -5006,9 +5007,10 @@ static void __exit bonding_exit(void) unregister_netdevice_notifier(&bond_netdev_notifier); unregister_inetaddr_notifier(&bond_inetaddr_notifier); + bond_destroy_sysfs(); + rtnl_lock(); bond_free_all(); - bond_destroy_sysfs(); rtnl_unlock(); } -- cgit v0.10.2 From 569f0c4d909c7f73de634abcdc36344cb72de36a Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Fri, 2 May 2008 18:06:02 -0700 Subject: bonding: fix enslavement error unwinds As part of: commit c2edacf80e155ef54ae4774379d461b60896bc2e Author: Jay Vosburgh Date: Mon Jul 9 10:42:47 2007 -0700 bonding / ipv6: no addrconf for slaves separately from master two steps were rearranged in the enslavement process: netdev_set_master is now before the call to dev_open to open the slave. This patch updates the error cases and unwind process at the end of bond_enslave to match the new order. Without this patch, it is possible for the enslavement to fail, but leave the slave with IFF_SLAVE set in its flags. Signed-off-by: Jay Vosburgh Signed-off-by: Jeff Garzik diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e41b3e5..50a40e4 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1425,13 +1425,13 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) res = netdev_set_master(slave_dev, bond_dev); if (res) { dprintk("Error %d calling netdev_set_master\n", res); - goto err_close; + goto err_restore_mac; } /* open the slave since the application closed it */ res = dev_open(slave_dev); if (res) { dprintk("Openning slave %s failed\n", slave_dev->name); - goto err_restore_mac; + goto err_unset_master; } new_slave->dev = slave_dev; @@ -1444,7 +1444,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) */ res = bond_alb_init_slave(bond, new_slave); if (res) { - goto err_unset_master; + goto err_close; } } @@ -1619,7 +1619,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) res = bond_create_slave_symlinks(bond_dev, slave_dev); if (res) - goto err_unset_master; + goto err_close; printk(KERN_INFO DRV_NAME ": %s: enslaving %s as a%s interface with a%s link.\n", @@ -1631,12 +1631,12 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) return 0; /* Undo stages on error */ -err_unset_master: - netdev_set_master(slave_dev, NULL); - err_close: dev_close(slave_dev); +err_unset_master: + netdev_set_master(slave_dev, NULL); + err_restore_mac: if (!bond->params.fail_over_mac) { memcpy(addr.sa_data, new_slave->perm_hwaddr, ETH_ALEN); -- cgit v0.10.2 From f162b9d58273a9a5747211133c8ccb2de5cf5ff2 Mon Sep 17 00:00:00 2001 From: Andy Fleming Date: Fri, 2 May 2008 13:00:30 -0500 Subject: gianfar: Fix a locking bug in gianfar's sysfs code During sparse cleanup, found a locking bug. Some of the sysfs functions were acquiring a lock, and then returning in the event of an error. We rearrange the code so that the lock is released in error conditions, too. 
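(Illustrative aside, not part of the patch: the gianfar change is the usual "goto out" unlock discipline. A minimal userspace sketch follows, with a pthread mutex standing in for priv->rxlock and hypothetical variable names; the real code uses spin_lock_irqsave/spin_unlock_irqrestore.)

#include <pthread.h>
#include <stdio.h>
#include <sys/types.h>

static pthread_mutex_t rxlock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int rx_stash_size  = 64;
static unsigned int rx_buffer_size = 1536;

static ssize_t set_rx_stash_size(unsigned int length)
{
	ssize_t count = 1;	/* sysfs store handlers return the consumed byte count */

	pthread_mutex_lock(&rxlock);

	if (length > rx_buffer_size)
		goto out;	/* was "return count", which leaked the lock */

	if (length == rx_stash_size)
		goto out;

	rx_stash_size = length;
	/* ... reprogram the stashing registers ... */

out:
	pthread_mutex_unlock(&rxlock);
	return count;
}

int main(void)
{
	printf("%zd\n", set_rx_stash_size(4096));	/* error path: lock still released */
	printf("%zd\n", set_rx_stash_size(128));	/* normal path */
	return 0;
}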
Signed-off-by: Andy Fleming Signed-off-by: Jeff Garzik diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c index 587afe7..6f22f06 100644 --- a/drivers/net/gianfar.c +++ b/drivers/net/gianfar.c @@ -138,6 +138,7 @@ static int gfar_poll(struct napi_struct *napi, int budget); static void gfar_netpoll(struct net_device *dev); #endif int gfar_clean_rx_ring(struct net_device *dev, int rx_work_limit); +static int gfar_clean_tx_ring(struct net_device *dev); static int gfar_process_frame(struct net_device *dev, struct sk_buff *skb, int length); static void gfar_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp); @@ -1141,7 +1142,7 @@ static int gfar_close(struct net_device *dev) } /* Changes the mac address if the controller is not running. */ -int gfar_set_mac_address(struct net_device *dev) +static int gfar_set_mac_address(struct net_device *dev) { gfar_set_mac_for_addr(dev, 0, dev->dev_addr); @@ -1260,7 +1261,7 @@ static void gfar_timeout(struct net_device *dev) } /* Interrupt Handler for Transmit complete */ -int gfar_clean_tx_ring(struct net_device *dev) +static int gfar_clean_tx_ring(struct net_device *dev) { struct txbd8 *bdp; struct gfar_private *priv = netdev_priv(dev); diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h index fd487be..27f37c8 100644 --- a/drivers/net/gianfar.h +++ b/drivers/net/gianfar.h @@ -782,5 +782,8 @@ extern void gfar_halt(struct net_device *dev); extern void gfar_phy_test(struct mii_bus *bus, struct phy_device *phydev, int enable, u32 regnum, u32 read); void gfar_init_sysfs(struct net_device *dev); +int gfar_local_mdio_write(struct gfar_mii __iomem *regs, int mii_id, + int regnum, u16 value); +int gfar_local_mdio_read(struct gfar_mii __iomem *regs, int mii_id, int regnum); #endif /* __GIANFAR_H */ diff --git a/drivers/net/gianfar_sysfs.c b/drivers/net/gianfar_sysfs.c index 230878b..5116f68 100644 --- a/drivers/net/gianfar_sysfs.c +++ b/drivers/net/gianfar_sysfs.c @@ -103,10 +103,10 @@ static ssize_t gfar_set_rx_stash_size(struct device *dev, spin_lock_irqsave(&priv->rxlock, flags); if (length > priv->rx_buffer_size) - return count; + goto out; if (length == priv->rx_stash_size) - return count; + goto out; priv->rx_stash_size = length; @@ -125,6 +125,7 @@ static ssize_t gfar_set_rx_stash_size(struct device *dev, gfar_write(&priv->regs->attr, temp); +out: spin_unlock_irqrestore(&priv->rxlock, flags); return count; @@ -154,10 +155,10 @@ static ssize_t gfar_set_rx_stash_index(struct device *dev, spin_lock_irqsave(&priv->rxlock, flags); if (index > priv->rx_stash_size) - return count; + goto out; if (index == priv->rx_stash_index) - return count; + goto out; priv->rx_stash_index = index; @@ -166,6 +167,7 @@ static ssize_t gfar_set_rx_stash_index(struct device *dev, temp |= ATTRELI_EI(index); gfar_write(&priv->regs->attreli, flags); +out: spin_unlock_irqrestore(&priv->rxlock, flags); return count; -- cgit v0.10.2 From 9b9a8bfc8dfbe09dc57f274e32e8b06151abbad7 Mon Sep 17 00:00:00 2001 From: Andy Fleming Date: Fri, 2 May 2008 13:00:51 -0500 Subject: phylib: Fix some sparse warnings Declared some things static, declared some things in the header. Signed-off-by: Andy Fleming Signed-off-by: Jeff Garzik diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 3c18bb5..45cc291 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -547,7 +547,7 @@ static void phy_force_reduction(struct phy_device *phydev) * Must not be called from interrupt context, or while the * phydev->lock is held. 
*/ -void phy_error(struct phy_device *phydev) +static void phy_error(struct phy_device *phydev) { mutex_lock(&phydev->lock); phydev->state = PHY_HALTED; diff --git a/include/linux/phy.h b/include/linux/phy.h index 02df20f..7224c40 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -412,6 +412,8 @@ int mdiobus_register(struct mii_bus *bus); void mdiobus_unregister(struct mii_bus *bus); void phy_sanitize_settings(struct phy_device *phydev); int phy_stop_interrupts(struct phy_device *phydev); +int phy_enable_interrupts(struct phy_device *phydev); +int phy_disable_interrupts(struct phy_device *phydev); static inline int phy_read_status(struct phy_device *phydev) { return phydev->drv->read_status(phydev); @@ -447,5 +449,8 @@ int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask, int (*run)(struct phy_device *)); int phy_scan_fixups(struct phy_device *phydev); +int __init mdio_bus_init(void); +void mdio_bus_exit(void); + extern struct bus_type mdio_bus_type; #endif /* __PHY_H */ -- cgit v0.10.2 From 6fee40e9b8155a4af904d69765c96c00f975acf5 Mon Sep 17 00:00:00 2001 From: Andy Fleming Date: Fri, 2 May 2008 13:01:23 -0500 Subject: ucc_geth: Fix a bunch of sparse warnings ucc_geth didn't have anything marked as __iomem. It was also inconsistent with its use of in/out accessors (using them sometimes, not using them other times). Cleaning this up cuts the warnings down from hundreds to just over a dozen. Signed-off-by: Andy Fleming Signed-off-by: Jeff Garzik diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c index 281ce3d..b3bbb2d 100644 --- a/drivers/net/ucc_geth.c +++ b/drivers/net/ucc_geth.c @@ -62,7 +62,6 @@ #endif /* UGETH_VERBOSE_DEBUG */ #define UGETH_MSG_DEFAULT (NETIF_MSG_IFUP << 1 ) - 1 -void uec_set_ethtool_ops(struct net_device *netdev); static DEFINE_SPINLOCK(ugeth_lock); @@ -216,7 +215,8 @@ static struct list_head *dequeue(struct list_head *lh) } } -static struct sk_buff *get_new_skb(struct ucc_geth_private *ugeth, u8 *bd) +static struct sk_buff *get_new_skb(struct ucc_geth_private *ugeth, + u8 __iomem *bd) { struct sk_buff *skb = NULL; @@ -236,21 +236,22 @@ static struct sk_buff *get_new_skb(struct ucc_geth_private *ugeth, u8 *bd) skb->dev = ugeth->dev; - out_be32(&((struct qe_bd *)bd)->buf, + out_be32(&((struct qe_bd __iomem *)bd)->buf, dma_map_single(NULL, skb->data, ugeth->ug_info->uf_info.max_rx_buf_length + UCC_GETH_RX_DATA_BUF_ALIGNMENT, DMA_FROM_DEVICE)); - out_be32((u32 *)bd, (R_E | R_I | (in_be32((u32 *)bd) & R_W))); + out_be32((u32 __iomem *)bd, + (R_E | R_I | (in_be32((u32 __iomem*)bd) & R_W))); return skb; } static int rx_bd_buffer_set(struct ucc_geth_private *ugeth, u8 rxQ) { - u8 *bd; + u8 __iomem *bd; u32 bd_status; struct sk_buff *skb; int i; @@ -259,7 +260,7 @@ static int rx_bd_buffer_set(struct ucc_geth_private *ugeth, u8 rxQ) i = 0; do { - bd_status = in_be32((u32*)bd); + bd_status = in_be32((u32 __iomem *)bd); skb = get_new_skb(ugeth, bd); if (!skb) /* If can not allocate data buffer, @@ -277,7 +278,7 @@ static int rx_bd_buffer_set(struct ucc_geth_private *ugeth, u8 rxQ) } static int fill_init_enet_entries(struct ucc_geth_private *ugeth, - volatile u32 *p_start, + u32 *p_start, u8 num_entries, u32 thread_size, u32 thread_alignment, @@ -316,7 +317,7 @@ static int fill_init_enet_entries(struct ucc_geth_private *ugeth, } static int return_init_enet_entries(struct ucc_geth_private *ugeth, - volatile u32 *p_start, + u32 *p_start, u8 num_entries, enum qe_risc_allocation risc, int skip_page_for_first_entry) @@ -326,21 +327,22 @@ static int 
return_init_enet_entries(struct ucc_geth_private *ugeth, int snum; for (i = 0; i < num_entries; i++) { + u32 val = *p_start; + /* Check that this entry was actually valid -- needed in case failed in allocations */ - if ((*p_start & ENET_INIT_PARAM_RISC_MASK) == risc) { + if ((val & ENET_INIT_PARAM_RISC_MASK) == risc) { snum = - (u32) (*p_start & ENET_INIT_PARAM_SNUM_MASK) >> + (u32) (val & ENET_INIT_PARAM_SNUM_MASK) >> ENET_INIT_PARAM_SNUM_SHIFT; qe_put_snum((u8) snum); if (!((i == 0) && skip_page_for_first_entry)) { /* First entry of Rx does not have page */ init_enet_offset = - (in_be32(p_start) & - ENET_INIT_PARAM_PTR_MASK); + (val & ENET_INIT_PARAM_PTR_MASK); qe_muram_free(init_enet_offset); } - *(p_start++) = 0; /* Just for cosmetics */ + *p_start++ = 0; } } @@ -349,7 +351,7 @@ static int return_init_enet_entries(struct ucc_geth_private *ugeth, #ifdef DEBUG static int dump_init_enet_entries(struct ucc_geth_private *ugeth, - volatile u32 *p_start, + u32 __iomem *p_start, u8 num_entries, u32 thread_size, enum qe_risc_allocation risc, @@ -360,11 +362,13 @@ static int dump_init_enet_entries(struct ucc_geth_private *ugeth, int snum; for (i = 0; i < num_entries; i++) { + u32 val = in_be32(p_start); + /* Check that this entry was actually valid -- needed in case failed in allocations */ - if ((*p_start & ENET_INIT_PARAM_RISC_MASK) == risc) { + if ((val & ENET_INIT_PARAM_RISC_MASK) == risc) { snum = - (u32) (*p_start & ENET_INIT_PARAM_SNUM_MASK) >> + (u32) (val & ENET_INIT_PARAM_SNUM_MASK) >> ENET_INIT_PARAM_SNUM_SHIFT; qe_put_snum((u8) snum); if (!((i == 0) && skip_page_for_first_entry)) { @@ -440,7 +444,7 @@ static int hw_add_addr_in_paddr(struct ucc_geth_private *ugeth, static int hw_clear_addr_in_paddr(struct ucc_geth_private *ugeth, u8 paddr_num) { - struct ucc_geth_82xx_address_filtering_pram *p_82xx_addr_filt; + struct ucc_geth_82xx_address_filtering_pram __iomem *p_82xx_addr_filt; if (!(paddr_num < NUM_OF_PADDRS)) { ugeth_warn("%s: Illagel paddr_num.", __FUNCTION__); @@ -448,7 +452,7 @@ static int hw_clear_addr_in_paddr(struct ucc_geth_private *ugeth, u8 paddr_num) } p_82xx_addr_filt = - (struct ucc_geth_82xx_address_filtering_pram *) ugeth->p_rx_glbl_pram-> + (struct ucc_geth_82xx_address_filtering_pram __iomem *) ugeth->p_rx_glbl_pram-> addressfiltering; /* Writing address ff.ff.ff.ff.ff.ff disables address @@ -463,11 +467,11 @@ static int hw_clear_addr_in_paddr(struct ucc_geth_private *ugeth, u8 paddr_num) static void hw_add_addr_in_hash(struct ucc_geth_private *ugeth, u8 *p_enet_addr) { - struct ucc_geth_82xx_address_filtering_pram *p_82xx_addr_filt; + struct ucc_geth_82xx_address_filtering_pram __iomem *p_82xx_addr_filt; u32 cecr_subblock; p_82xx_addr_filt = - (struct ucc_geth_82xx_address_filtering_pram *) ugeth->p_rx_glbl_pram-> + (struct ucc_geth_82xx_address_filtering_pram __iomem *) ugeth->p_rx_glbl_pram-> addressfiltering; cecr_subblock = @@ -487,7 +491,7 @@ static void hw_add_addr_in_hash(struct ucc_geth_private *ugeth, static void magic_packet_detection_enable(struct ucc_geth_private *ugeth) { struct ucc_fast_private *uccf; - struct ucc_geth *ug_regs; + struct ucc_geth __iomem *ug_regs; u32 maccfg2, uccm; uccf = ugeth->uccf; @@ -507,7 +511,7 @@ static void magic_packet_detection_enable(struct ucc_geth_private *ugeth) static void magic_packet_detection_disable(struct ucc_geth_private *ugeth) { struct ucc_fast_private *uccf; - struct ucc_geth *ug_regs; + struct ucc_geth __iomem *ug_regs; u32 maccfg2, uccm; uccf = ugeth->uccf; @@ -538,13 +542,13 @@ static void 
get_statistics(struct ucc_geth_private *ugeth, rx_firmware_statistics, struct ucc_geth_hardware_statistics *hardware_statistics) { - struct ucc_fast *uf_regs; - struct ucc_geth *ug_regs; + struct ucc_fast __iomem *uf_regs; + struct ucc_geth __iomem *ug_regs; struct ucc_geth_tx_firmware_statistics_pram *p_tx_fw_statistics_pram; struct ucc_geth_rx_firmware_statistics_pram *p_rx_fw_statistics_pram; ug_regs = ugeth->ug_regs; - uf_regs = (struct ucc_fast *) ug_regs; + uf_regs = (struct ucc_fast __iomem *) ug_regs; p_tx_fw_statistics_pram = ugeth->p_tx_fw_statistics_pram; p_rx_fw_statistics_pram = ugeth->p_rx_fw_statistics_pram; @@ -1132,9 +1136,9 @@ static void dump_regs(struct ucc_geth_private *ugeth) } #endif /* DEBUG */ -static void init_default_reg_vals(volatile u32 *upsmr_register, - volatile u32 *maccfg1_register, - volatile u32 *maccfg2_register) +static void init_default_reg_vals(u32 __iomem *upsmr_register, + u32 __iomem *maccfg1_register, + u32 __iomem *maccfg2_register) { out_be32(upsmr_register, UCC_GETH_UPSMR_INIT); out_be32(maccfg1_register, UCC_GETH_MACCFG1_INIT); @@ -1148,7 +1152,7 @@ static int init_half_duplex_params(int alt_beb, u8 alt_beb_truncation, u8 max_retransmissions, u8 collision_window, - volatile u32 *hafdup_register) + u32 __iomem *hafdup_register) { u32 value = 0; @@ -1180,7 +1184,7 @@ static int init_inter_frame_gap_params(u8 non_btb_cs_ipg, u8 non_btb_ipg, u8 min_ifg, u8 btb_ipg, - volatile u32 *ipgifg_register) + u32 __iomem *ipgifg_register) { u32 value = 0; @@ -1215,9 +1219,9 @@ int init_flow_control_params(u32 automatic_flow_control_mode, int tx_flow_control_enable, u16 pause_period, u16 extension_field, - volatile u32 *upsmr_register, - volatile u32 *uempr_register, - volatile u32 *maccfg1_register) + u32 __iomem *upsmr_register, + u32 __iomem *uempr_register, + u32 __iomem *maccfg1_register) { u32 value = 0; @@ -1243,8 +1247,8 @@ int init_flow_control_params(u32 automatic_flow_control_mode, static int init_hw_statistics_gathering_mode(int enable_hardware_statistics, int auto_zero_hardware_statistics, - volatile u32 *upsmr_register, - volatile u16 *uescr_register) + u32 __iomem *upsmr_register, + u16 __iomem *uescr_register) { u32 upsmr_value = 0; u16 uescr_value = 0; @@ -1270,12 +1274,12 @@ static int init_hw_statistics_gathering_mode(int enable_hardware_statistics, static int init_firmware_statistics_gathering_mode(int enable_tx_firmware_statistics, int enable_rx_firmware_statistics, - volatile u32 *tx_rmon_base_ptr, + u32 __iomem *tx_rmon_base_ptr, u32 tx_firmware_statistics_structure_address, - volatile u32 *rx_rmon_base_ptr, + u32 __iomem *rx_rmon_base_ptr, u32 rx_firmware_statistics_structure_address, - volatile u16 *temoder_register, - volatile u32 *remoder_register) + u16 __iomem *temoder_register, + u32 __iomem *remoder_register) { /* Note: this function does not check if */ /* the parameters it receives are NULL */ @@ -1307,8 +1311,8 @@ static int init_mac_station_addr_regs(u8 address_byte_0, u8 address_byte_3, u8 address_byte_4, u8 address_byte_5, - volatile u32 *macstnaddr1_register, - volatile u32 *macstnaddr2_register) + u32 __iomem *macstnaddr1_register, + u32 __iomem *macstnaddr2_register) { u32 value = 0; @@ -1344,7 +1348,7 @@ static int init_mac_station_addr_regs(u8 address_byte_0, } static int init_check_frame_length_mode(int length_check, - volatile u32 *maccfg2_register) + u32 __iomem *maccfg2_register) { u32 value = 0; @@ -1360,7 +1364,7 @@ static int init_check_frame_length_mode(int length_check, } static int init_preamble_length(u8 
preamble_length, - volatile u32 *maccfg2_register) + u32 __iomem *maccfg2_register) { u32 value = 0; @@ -1376,7 +1380,7 @@ static int init_preamble_length(u8 preamble_length, static int init_rx_parameters(int reject_broadcast, int receive_short_frames, - int promiscuous, volatile u32 *upsmr_register) + int promiscuous, u32 __iomem *upsmr_register) { u32 value = 0; @@ -1403,7 +1407,7 @@ static int init_rx_parameters(int reject_broadcast, } static int init_max_rx_buff_len(u16 max_rx_buf_len, - volatile u16 *mrblr_register) + u16 __iomem *mrblr_register) { /* max_rx_buf_len value must be a multiple of 128 */ if ((max_rx_buf_len == 0) @@ -1415,8 +1419,8 @@ static int init_max_rx_buff_len(u16 max_rx_buf_len, } static int init_min_frame_len(u16 min_frame_length, - volatile u16 *minflr_register, - volatile u16 *mrblr_register) + u16 __iomem *minflr_register, + u16 __iomem *mrblr_register) { u16 mrblr_value = 0; @@ -1431,8 +1435,8 @@ static int init_min_frame_len(u16 min_frame_length, static int adjust_enet_interface(struct ucc_geth_private *ugeth) { struct ucc_geth_info *ug_info; - struct ucc_geth *ug_regs; - struct ucc_fast *uf_regs; + struct ucc_geth __iomem *ug_regs; + struct ucc_fast __iomem *uf_regs; int ret_val; u32 upsmr, maccfg2, tbiBaseAddress; u16 value; @@ -1517,8 +1521,8 @@ static int adjust_enet_interface(struct ucc_geth_private *ugeth) static void adjust_link(struct net_device *dev) { struct ucc_geth_private *ugeth = netdev_priv(dev); - struct ucc_geth *ug_regs; - struct ucc_fast *uf_regs; + struct ucc_geth __iomem *ug_regs; + struct ucc_fast __iomem *uf_regs; struct phy_device *phydev = ugeth->phydev; unsigned long flags; int new_state = 0; @@ -1678,9 +1682,9 @@ static int ugeth_graceful_stop_rx(struct ucc_geth_private * ugeth) uccf = ugeth->uccf; /* Clear acknowledge bit */ - temp = ugeth->p_rx_glbl_pram->rxgstpack; + temp = in_8(&ugeth->p_rx_glbl_pram->rxgstpack); temp &= ~GRACEFUL_STOP_ACKNOWLEDGE_RX; - ugeth->p_rx_glbl_pram->rxgstpack = temp; + out_8(&ugeth->p_rx_glbl_pram->rxgstpack, temp); /* Keep issuing command and checking acknowledge bit until it is asserted, according to spec */ @@ -1692,7 +1696,7 @@ static int ugeth_graceful_stop_rx(struct ucc_geth_private * ugeth) qe_issue_cmd(QE_GRACEFUL_STOP_RX, cecr_subblock, QE_CR_PROTOCOL_ETHERNET, 0); - temp = ugeth->p_rx_glbl_pram->rxgstpack; + temp = in_8(&ugeth->p_rx_glbl_pram->rxgstpack); } while (!(temp & GRACEFUL_STOP_ACKNOWLEDGE_RX)); uccf->stopped_rx = 1; @@ -1991,19 +1995,20 @@ static int ugeth_82xx_filtering_clear_all_addr_in_hash(struct ucc_geth_private * enum enet_addr_type enet_addr_type) { - struct ucc_geth_82xx_address_filtering_pram *p_82xx_addr_filt; + struct ucc_geth_82xx_address_filtering_pram __iomem *p_82xx_addr_filt; struct ucc_fast_private *uccf; enum comm_dir comm_dir; struct list_head *p_lh; u16 i, num; - u32 *addr_h, *addr_l; + u32 __iomem *addr_h; + u32 __iomem *addr_l; u8 *p_counter; uccf = ugeth->uccf; p_82xx_addr_filt = - (struct ucc_geth_82xx_address_filtering_pram *) ugeth->p_rx_glbl_pram-> - addressfiltering; + (struct ucc_geth_82xx_address_filtering_pram __iomem *) + ugeth->p_rx_glbl_pram->addressfiltering; if (enet_addr_type == ENET_ADDR_TYPE_GROUP) { addr_h = &(p_82xx_addr_filt->gaddr_h); @@ -2079,7 +2084,7 @@ static int ugeth_82xx_filtering_clear_addr_in_paddr(struct ucc_geth_private *uge static void ucc_geth_memclean(struct ucc_geth_private *ugeth) { u16 i, j; - u8 *bd; + u8 __iomem *bd; if (!ugeth) return; @@ -2154,8 +2159,8 @@ static void ucc_geth_memclean(struct ucc_geth_private *ugeth) 
for (j = 0; j < ugeth->ug_info->bdRingLenTx[i]; j++) { if (ugeth->tx_skbuff[i][j]) { dma_unmap_single(NULL, - ((struct qe_bd *)bd)->buf, - (in_be32((u32 *)bd) & + in_be32(&((struct qe_bd __iomem *)bd)->buf), + (in_be32((u32 __iomem *)bd) & BD_LENGTH_MASK), DMA_TO_DEVICE); dev_kfree_skb_any(ugeth->tx_skbuff[i][j]); @@ -2182,7 +2187,7 @@ static void ucc_geth_memclean(struct ucc_geth_private *ugeth) for (j = 0; j < ugeth->ug_info->bdRingLenRx[i]; j++) { if (ugeth->rx_skbuff[i][j]) { dma_unmap_single(NULL, - ((struct qe_bd *)bd)->buf, + in_be32(&((struct qe_bd __iomem *)bd)->buf), ugeth->ug_info-> uf_info.max_rx_buf_length + UCC_GETH_RX_DATA_BUF_ALIGNMENT, @@ -2218,8 +2223,8 @@ static void ucc_geth_set_multi(struct net_device *dev) { struct ucc_geth_private *ugeth; struct dev_mc_list *dmi; - struct ucc_fast *uf_regs; - struct ucc_geth_82xx_address_filtering_pram *p_82xx_addr_filt; + struct ucc_fast __iomem *uf_regs; + struct ucc_geth_82xx_address_filtering_pram __iomem *p_82xx_addr_filt; int i; ugeth = netdev_priv(dev); @@ -2228,14 +2233,14 @@ static void ucc_geth_set_multi(struct net_device *dev) if (dev->flags & IFF_PROMISC) { - uf_regs->upsmr |= UPSMR_PRO; + out_be32(&uf_regs->upsmr, in_be32(&uf_regs->upsmr) | UPSMR_PRO); } else { - uf_regs->upsmr &= ~UPSMR_PRO; + out_be32(&uf_regs->upsmr, in_be32(&uf_regs->upsmr)&~UPSMR_PRO); p_82xx_addr_filt = - (struct ucc_geth_82xx_address_filtering_pram *) ugeth-> + (struct ucc_geth_82xx_address_filtering_pram __iomem *) ugeth-> p_rx_glbl_pram->addressfiltering; if (dev->flags & IFF_ALLMULTI) { @@ -2270,7 +2275,7 @@ static void ucc_geth_set_multi(struct net_device *dev) static void ucc_geth_stop(struct ucc_geth_private *ugeth) { - struct ucc_geth *ug_regs = ugeth->ug_regs; + struct ucc_geth __iomem *ug_regs = ugeth->ug_regs; struct phy_device *phydev = ugeth->phydev; u32 tempval; @@ -2419,20 +2424,20 @@ static int ucc_struct_init(struct ucc_geth_private *ugeth) return -ENOMEM; } - ugeth->ug_regs = (struct ucc_geth *) ioremap(uf_info->regs, sizeof(struct ucc_geth)); + ugeth->ug_regs = (struct ucc_geth __iomem *) ioremap(uf_info->regs, sizeof(struct ucc_geth)); return 0; } static int ucc_geth_startup(struct ucc_geth_private *ugeth) { - struct ucc_geth_82xx_address_filtering_pram *p_82xx_addr_filt; - struct ucc_geth_init_pram *p_init_enet_pram; + struct ucc_geth_82xx_address_filtering_pram __iomem *p_82xx_addr_filt; + struct ucc_geth_init_pram __iomem *p_init_enet_pram; struct ucc_fast_private *uccf; struct ucc_geth_info *ug_info; struct ucc_fast_info *uf_info; - struct ucc_fast *uf_regs; - struct ucc_geth *ug_regs; + struct ucc_fast __iomem *uf_regs; + struct ucc_geth __iomem *ug_regs; int ret_val = -EINVAL; u32 remoder = UCC_GETH_REMODER_INIT; u32 init_enet_pram_offset, cecr_subblock, command, maccfg1; @@ -2440,7 +2445,8 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) u16 temoder = UCC_GETH_TEMODER_INIT; u16 test; u8 function_code = 0; - u8 *bd, *endOfRing; + u8 __iomem *bd; + u8 __iomem *endOfRing; u8 numThreadsRxNumerical, numThreadsTxNumerical; ugeth_vdbg("%s: IN", __FUNCTION__); @@ -2602,11 +2608,11 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) if (UCC_GETH_TX_BD_RING_ALIGNMENT > 4) align = UCC_GETH_TX_BD_RING_ALIGNMENT; ugeth->tx_bd_ring_offset[j] = - kmalloc((u32) (length + align), GFP_KERNEL); + (u32) kmalloc((u32) (length + align), GFP_KERNEL); if (ugeth->tx_bd_ring_offset[j] != 0) ugeth->p_tx_bd_ring[j] = - (void*)((ugeth->tx_bd_ring_offset[j] + + (u8 __iomem *)((ugeth->tx_bd_ring_offset[j] + align) & ~(align - 
1)); } else if (uf_info->bd_mem_part == MEM_PART_MURAM) { ugeth->tx_bd_ring_offset[j] = @@ -2614,7 +2620,7 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) UCC_GETH_TX_BD_RING_ALIGNMENT); if (!IS_ERR_VALUE(ugeth->tx_bd_ring_offset[j])) ugeth->p_tx_bd_ring[j] = - (u8 *) qe_muram_addr(ugeth-> + (u8 __iomem *) qe_muram_addr(ugeth-> tx_bd_ring_offset[j]); } if (!ugeth->p_tx_bd_ring[j]) { @@ -2626,8 +2632,8 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) return -ENOMEM; } /* Zero unused end of bd ring, according to spec */ - memset(ugeth->p_tx_bd_ring[j] + - ug_info->bdRingLenTx[j] * sizeof(struct qe_bd), 0, + memset_io((void __iomem *)(ugeth->p_tx_bd_ring[j] + + ug_info->bdRingLenTx[j] * sizeof(struct qe_bd)), 0, length - ug_info->bdRingLenTx[j] * sizeof(struct qe_bd)); } @@ -2639,10 +2645,10 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) if (UCC_GETH_RX_BD_RING_ALIGNMENT > 4) align = UCC_GETH_RX_BD_RING_ALIGNMENT; ugeth->rx_bd_ring_offset[j] = - kmalloc((u32) (length + align), GFP_KERNEL); + (u32) kmalloc((u32) (length + align), GFP_KERNEL); if (ugeth->rx_bd_ring_offset[j] != 0) ugeth->p_rx_bd_ring[j] = - (void*)((ugeth->rx_bd_ring_offset[j] + + (u8 __iomem *)((ugeth->rx_bd_ring_offset[j] + align) & ~(align - 1)); } else if (uf_info->bd_mem_part == MEM_PART_MURAM) { ugeth->rx_bd_ring_offset[j] = @@ -2650,7 +2656,7 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) UCC_GETH_RX_BD_RING_ALIGNMENT); if (!IS_ERR_VALUE(ugeth->rx_bd_ring_offset[j])) ugeth->p_rx_bd_ring[j] = - (u8 *) qe_muram_addr(ugeth-> + (u8 __iomem *) qe_muram_addr(ugeth-> rx_bd_ring_offset[j]); } if (!ugeth->p_rx_bd_ring[j]) { @@ -2685,14 +2691,14 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) bd = ugeth->confBd[j] = ugeth->txBd[j] = ugeth->p_tx_bd_ring[j]; for (i = 0; i < ug_info->bdRingLenTx[j]; i++) { /* clear bd buffer */ - out_be32(&((struct qe_bd *)bd)->buf, 0); + out_be32(&((struct qe_bd __iomem *)bd)->buf, 0); /* set bd status and length */ - out_be32((u32 *)bd, 0); + out_be32((u32 __iomem *)bd, 0); bd += sizeof(struct qe_bd); } bd -= sizeof(struct qe_bd); /* set bd status and length */ - out_be32((u32 *)bd, T_W); /* for last BD set Wrap bit */ + out_be32((u32 __iomem *)bd, T_W); /* for last BD set Wrap bit */ } /* Init Rx bds */ @@ -2717,14 +2723,14 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) bd = ugeth->rxBd[j] = ugeth->p_rx_bd_ring[j]; for (i = 0; i < ug_info->bdRingLenRx[j]; i++) { /* set bd status and length */ - out_be32((u32 *)bd, R_I); + out_be32((u32 __iomem *)bd, R_I); /* clear bd buffer */ - out_be32(&((struct qe_bd *)bd)->buf, 0); + out_be32(&((struct qe_bd __iomem *)bd)->buf, 0); bd += sizeof(struct qe_bd); } bd -= sizeof(struct qe_bd); /* set bd status and length */ - out_be32((u32 *)bd, R_W); /* for last BD set Wrap bit */ + out_be32((u32 __iomem *)bd, R_W); /* for last BD set Wrap bit */ } /* @@ -2744,10 +2750,10 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) return -ENOMEM; } ugeth->p_tx_glbl_pram = - (struct ucc_geth_tx_global_pram *) qe_muram_addr(ugeth-> + (struct ucc_geth_tx_global_pram __iomem *) qe_muram_addr(ugeth-> tx_glbl_pram_offset); /* Zero out p_tx_glbl_pram */ - memset(ugeth->p_tx_glbl_pram, 0, sizeof(struct ucc_geth_tx_global_pram)); + memset_io((void __iomem *)ugeth->p_tx_glbl_pram, 0, sizeof(struct ucc_geth_tx_global_pram)); /* Fill global PRAM */ @@ -2768,7 +2774,7 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) } ugeth->p_thread_data_tx = - (struct 
ucc_geth_thread_data_tx *) qe_muram_addr(ugeth-> + (struct ucc_geth_thread_data_tx __iomem *) qe_muram_addr(ugeth-> thread_dat_tx_offset); out_be32(&ugeth->p_tx_glbl_pram->tqptr, ugeth->thread_dat_tx_offset); @@ -2779,7 +2785,8 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) /* iphoffset */ for (i = 0; i < TX_IP_OFFSET_ENTRY_MAX; i++) - ugeth->p_tx_glbl_pram->iphoffset[i] = ug_info->iphoffset[i]; + out_8(&ugeth->p_tx_glbl_pram->iphoffset[i], + ug_info->iphoffset[i]); /* SQPTR */ /* Size varies with number of Tx queues */ @@ -2797,7 +2804,7 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) } ugeth->p_send_q_mem_reg = - (struct ucc_geth_send_queue_mem_region *) qe_muram_addr(ugeth-> + (struct ucc_geth_send_queue_mem_region __iomem *) qe_muram_addr(ugeth-> send_q_mem_reg_offset); out_be32(&ugeth->p_tx_glbl_pram->sqptr, ugeth->send_q_mem_reg_offset); @@ -2841,25 +2848,26 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) } ugeth->p_scheduler = - (struct ucc_geth_scheduler *) qe_muram_addr(ugeth-> + (struct ucc_geth_scheduler __iomem *) qe_muram_addr(ugeth-> scheduler_offset); out_be32(&ugeth->p_tx_glbl_pram->schedulerbasepointer, ugeth->scheduler_offset); /* Zero out p_scheduler */ - memset(ugeth->p_scheduler, 0, sizeof(struct ucc_geth_scheduler)); + memset_io((void __iomem *)ugeth->p_scheduler, 0, sizeof(struct ucc_geth_scheduler)); /* Set values in scheduler */ out_be32(&ugeth->p_scheduler->mblinterval, ug_info->mblinterval); out_be16(&ugeth->p_scheduler->nortsrbytetime, ug_info->nortsrbytetime); - ugeth->p_scheduler->fracsiz = ug_info->fracsiz; - ugeth->p_scheduler->strictpriorityq = ug_info->strictpriorityq; - ugeth->p_scheduler->txasap = ug_info->txasap; - ugeth->p_scheduler->extrabw = ug_info->extrabw; + out_8(&ugeth->p_scheduler->fracsiz, ug_info->fracsiz); + out_8(&ugeth->p_scheduler->strictpriorityq, + ug_info->strictpriorityq); + out_8(&ugeth->p_scheduler->txasap, ug_info->txasap); + out_8(&ugeth->p_scheduler->extrabw, ug_info->extrabw); for (i = 0; i < NUM_TX_QUEUES; i++) - ugeth->p_scheduler->weightfactor[i] = - ug_info->weightfactor[i]; + out_8(&ugeth->p_scheduler->weightfactor[i], + ug_info->weightfactor[i]); /* Set pointers to cpucount registers in scheduler */ ugeth->p_cpucount[0] = &(ugeth->p_scheduler->cpucount0); @@ -2890,10 +2898,10 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) return -ENOMEM; } ugeth->p_tx_fw_statistics_pram = - (struct ucc_geth_tx_firmware_statistics_pram *) + (struct ucc_geth_tx_firmware_statistics_pram __iomem *) qe_muram_addr(ugeth->tx_fw_statistics_pram_offset); /* Zero out p_tx_fw_statistics_pram */ - memset(ugeth->p_tx_fw_statistics_pram, + memset_io((void __iomem *)ugeth->p_tx_fw_statistics_pram, 0, sizeof(struct ucc_geth_tx_firmware_statistics_pram)); } @@ -2930,10 +2938,10 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) return -ENOMEM; } ugeth->p_rx_glbl_pram = - (struct ucc_geth_rx_global_pram *) qe_muram_addr(ugeth-> + (struct ucc_geth_rx_global_pram __iomem *) qe_muram_addr(ugeth-> rx_glbl_pram_offset); /* Zero out p_rx_glbl_pram */ - memset(ugeth->p_rx_glbl_pram, 0, sizeof(struct ucc_geth_rx_global_pram)); + memset_io((void __iomem *)ugeth->p_rx_glbl_pram, 0, sizeof(struct ucc_geth_rx_global_pram)); /* Fill global PRAM */ @@ -2953,7 +2961,7 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) } ugeth->p_thread_data_rx = - (struct ucc_geth_thread_data_rx *) qe_muram_addr(ugeth-> + (struct ucc_geth_thread_data_rx __iomem *) qe_muram_addr(ugeth-> 
thread_dat_rx_offset); out_be32(&ugeth->p_rx_glbl_pram->rqptr, ugeth->thread_dat_rx_offset); @@ -2976,10 +2984,10 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) return -ENOMEM; } ugeth->p_rx_fw_statistics_pram = - (struct ucc_geth_rx_firmware_statistics_pram *) + (struct ucc_geth_rx_firmware_statistics_pram __iomem *) qe_muram_addr(ugeth->rx_fw_statistics_pram_offset); /* Zero out p_rx_fw_statistics_pram */ - memset(ugeth->p_rx_fw_statistics_pram, 0, + memset_io((void __iomem *)ugeth->p_rx_fw_statistics_pram, 0, sizeof(struct ucc_geth_rx_firmware_statistics_pram)); } @@ -3000,7 +3008,7 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) } ugeth->p_rx_irq_coalescing_tbl = - (struct ucc_geth_rx_interrupt_coalescing_table *) + (struct ucc_geth_rx_interrupt_coalescing_table __iomem *) qe_muram_addr(ugeth->rx_irq_coalescing_tbl_offset); out_be32(&ugeth->p_rx_glbl_pram->intcoalescingptr, ugeth->rx_irq_coalescing_tbl_offset); @@ -3069,11 +3077,11 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) } ugeth->p_rx_bd_qs_tbl = - (struct ucc_geth_rx_bd_queues_entry *) qe_muram_addr(ugeth-> + (struct ucc_geth_rx_bd_queues_entry __iomem *) qe_muram_addr(ugeth-> rx_bd_qs_tbl_offset); out_be32(&ugeth->p_rx_glbl_pram->rbdqptr, ugeth->rx_bd_qs_tbl_offset); /* Zero out p_rx_bd_qs_tbl */ - memset(ugeth->p_rx_bd_qs_tbl, + memset_io((void __iomem *)ugeth->p_rx_bd_qs_tbl, 0, ug_info->numQueuesRx * (sizeof(struct ucc_geth_rx_bd_queues_entry) + sizeof(struct ucc_geth_rx_prefetched_bds))); @@ -3133,7 +3141,7 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) &ugeth->p_rx_glbl_pram->remoder); /* function code register */ - ugeth->p_rx_glbl_pram->rstate = function_code; + out_8(&ugeth->p_rx_glbl_pram->rstate, function_code); /* initialize extended filtering */ if (ug_info->rxExtendedFiltering) { @@ -3160,7 +3168,7 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) } ugeth->p_exf_glbl_param = - (struct ucc_geth_exf_global_pram *) qe_muram_addr(ugeth-> + (struct ucc_geth_exf_global_pram __iomem *) qe_muram_addr(ugeth-> exf_glbl_param_offset); out_be32(&ugeth->p_rx_glbl_pram->exfGlobalParam, ugeth->exf_glbl_param_offset); @@ -3175,7 +3183,7 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) ugeth_82xx_filtering_clear_addr_in_paddr(ugeth, (u8) j); p_82xx_addr_filt = - (struct ucc_geth_82xx_address_filtering_pram *) ugeth-> + (struct ucc_geth_82xx_address_filtering_pram __iomem *) ugeth-> p_rx_glbl_pram->addressfiltering; ugeth_82xx_filtering_clear_all_addr_in_hash(ugeth, @@ -3307,17 +3315,21 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) return -ENOMEM; } p_init_enet_pram = - (struct ucc_geth_init_pram *) qe_muram_addr(init_enet_pram_offset); + (struct ucc_geth_init_pram __iomem *) qe_muram_addr(init_enet_pram_offset); /* Copy shadow InitEnet command parameter structure into PRAM */ - p_init_enet_pram->resinit1 = ugeth->p_init_enet_param_shadow->resinit1; - p_init_enet_pram->resinit2 = ugeth->p_init_enet_param_shadow->resinit2; - p_init_enet_pram->resinit3 = ugeth->p_init_enet_param_shadow->resinit3; - p_init_enet_pram->resinit4 = ugeth->p_init_enet_param_shadow->resinit4; + out_8(&p_init_enet_pram->resinit1, + ugeth->p_init_enet_param_shadow->resinit1); + out_8(&p_init_enet_pram->resinit2, + ugeth->p_init_enet_param_shadow->resinit2); + out_8(&p_init_enet_pram->resinit3, + ugeth->p_init_enet_param_shadow->resinit3); + out_8(&p_init_enet_pram->resinit4, + ugeth->p_init_enet_param_shadow->resinit4); 
out_be16(&p_init_enet_pram->resinit5, ugeth->p_init_enet_param_shadow->resinit5); - p_init_enet_pram->largestexternallookupkeysize = - ugeth->p_init_enet_param_shadow->largestexternallookupkeysize; + out_8(&p_init_enet_pram->largestexternallookupkeysize, + ugeth->p_init_enet_param_shadow->largestexternallookupkeysize); out_be32(&p_init_enet_pram->rgftgfrxglobal, ugeth->p_init_enet_param_shadow->rgftgfrxglobal); for (i = 0; i < ENET_INIT_PARAM_MAX_ENTRIES_RX; i++) @@ -3371,7 +3383,7 @@ static int ucc_geth_start_xmit(struct sk_buff *skb, struct net_device *dev) #ifdef CONFIG_UGETH_TX_ON_DEMAND struct ucc_fast_private *uccf; #endif - u8 *bd; /* BD pointer */ + u8 __iomem *bd; /* BD pointer */ u32 bd_status; u8 txQ = 0; @@ -3383,7 +3395,7 @@ static int ucc_geth_start_xmit(struct sk_buff *skb, struct net_device *dev) /* Start from the next BD that should be filled */ bd = ugeth->txBd[txQ]; - bd_status = in_be32((u32 *)bd); + bd_status = in_be32((u32 __iomem *)bd); /* Save the skb pointer so we can free it later */ ugeth->tx_skbuff[txQ][ugeth->skb_curtx[txQ]] = skb; @@ -3393,7 +3405,7 @@ static int ucc_geth_start_xmit(struct sk_buff *skb, struct net_device *dev) 1) & TX_RING_MOD_MASK(ugeth->ug_info->bdRingLenTx[txQ]); /* set up the buffer descriptor */ - out_be32(&((struct qe_bd *)bd)->buf, + out_be32(&((struct qe_bd __iomem *)bd)->buf, dma_map_single(NULL, skb->data, skb->len, DMA_TO_DEVICE)); /* printk(KERN_DEBUG"skb->data is 0x%x\n",skb->data); */ @@ -3401,7 +3413,7 @@ static int ucc_geth_start_xmit(struct sk_buff *skb, struct net_device *dev) bd_status = (bd_status & T_W) | T_R | T_I | T_L | skb->len; /* set bd status and length */ - out_be32((u32 *)bd, bd_status); + out_be32((u32 __iomem *)bd, bd_status); dev->trans_start = jiffies; @@ -3441,7 +3453,7 @@ static int ucc_geth_start_xmit(struct sk_buff *skb, struct net_device *dev) static int ucc_geth_rx(struct ucc_geth_private *ugeth, u8 rxQ, int rx_work_limit) { struct sk_buff *skb; - u8 *bd; + u8 __iomem *bd; u16 length, howmany = 0; u32 bd_status; u8 *bdBuffer; @@ -3454,11 +3466,11 @@ static int ucc_geth_rx(struct ucc_geth_private *ugeth, u8 rxQ, int rx_work_limit /* collect received buffers */ bd = ugeth->rxBd[rxQ]; - bd_status = in_be32((u32 *)bd); + bd_status = in_be32((u32 __iomem *)bd); /* while there are received buffers and BD is full (~R_E) */ while (!((bd_status & (R_E)) || (--rx_work_limit < 0))) { - bdBuffer = (u8 *) in_be32(&((struct qe_bd *)bd)->buf); + bdBuffer = (u8 *) in_be32(&((struct qe_bd __iomem *)bd)->buf); length = (u16) ((bd_status & BD_LENGTH_MASK) - 4); skb = ugeth->rx_skbuff[rxQ][ugeth->skb_currx[rxQ]]; @@ -3516,7 +3528,7 @@ static int ucc_geth_rx(struct ucc_geth_private *ugeth, u8 rxQ, int rx_work_limit else bd += sizeof(struct qe_bd); - bd_status = in_be32((u32 *)bd); + bd_status = in_be32((u32 __iomem *)bd); } ugeth->rxBd[rxQ] = bd; @@ -3527,11 +3539,11 @@ static int ucc_geth_tx(struct net_device *dev, u8 txQ) { /* Start from the next BD that should be filled */ struct ucc_geth_private *ugeth = netdev_priv(dev); - u8 *bd; /* BD pointer */ + u8 __iomem *bd; /* BD pointer */ u32 bd_status; bd = ugeth->confBd[txQ]; - bd_status = in_be32((u32 *)bd); + bd_status = in_be32((u32 __iomem *)bd); /* Normal processing. 
*/ while ((bd_status & T_R) == 0) { @@ -3561,7 +3573,7 @@ static int ucc_geth_tx(struct net_device *dev, u8 txQ) bd += sizeof(struct qe_bd); else bd = ugeth->p_tx_bd_ring[txQ]; - bd_status = in_be32((u32 *)bd); + bd_status = in_be32((u32 __iomem *)bd); } ugeth->confBd[txQ] = bd; return 0; diff --git a/drivers/net/ucc_geth.h b/drivers/net/ucc_geth.h index 9f8b758..abc0e22 100644 --- a/drivers/net/ucc_geth.h +++ b/drivers/net/ucc_geth.h @@ -700,8 +700,8 @@ struct ucc_geth_82xx_address_filtering_pram { u32 iaddr_l; /* individual address filter, low */ u32 gaddr_h; /* group address filter, high */ u32 gaddr_l; /* group address filter, low */ - struct ucc_geth_82xx_enet_address taddr; - struct ucc_geth_82xx_enet_address paddr[NUM_OF_PADDRS]; + struct ucc_geth_82xx_enet_address __iomem taddr; + struct ucc_geth_82xx_enet_address __iomem paddr[NUM_OF_PADDRS]; u8 res0[0x40 - 0x38]; } __attribute__ ((packed)); @@ -1186,40 +1186,40 @@ struct ucc_geth_private { struct ucc_fast_private *uccf; struct net_device *dev; struct napi_struct napi; - struct ucc_geth *ug_regs; + struct ucc_geth __iomem *ug_regs; struct ucc_geth_init_pram *p_init_enet_param_shadow; - struct ucc_geth_exf_global_pram *p_exf_glbl_param; + struct ucc_geth_exf_global_pram __iomem *p_exf_glbl_param; u32 exf_glbl_param_offset; - struct ucc_geth_rx_global_pram *p_rx_glbl_pram; + struct ucc_geth_rx_global_pram __iomem *p_rx_glbl_pram; u32 rx_glbl_pram_offset; - struct ucc_geth_tx_global_pram *p_tx_glbl_pram; + struct ucc_geth_tx_global_pram __iomem *p_tx_glbl_pram; u32 tx_glbl_pram_offset; - struct ucc_geth_send_queue_mem_region *p_send_q_mem_reg; + struct ucc_geth_send_queue_mem_region __iomem *p_send_q_mem_reg; u32 send_q_mem_reg_offset; - struct ucc_geth_thread_data_tx *p_thread_data_tx; + struct ucc_geth_thread_data_tx __iomem *p_thread_data_tx; u32 thread_dat_tx_offset; - struct ucc_geth_thread_data_rx *p_thread_data_rx; + struct ucc_geth_thread_data_rx __iomem *p_thread_data_rx; u32 thread_dat_rx_offset; - struct ucc_geth_scheduler *p_scheduler; + struct ucc_geth_scheduler __iomem *p_scheduler; u32 scheduler_offset; - struct ucc_geth_tx_firmware_statistics_pram *p_tx_fw_statistics_pram; + struct ucc_geth_tx_firmware_statistics_pram __iomem *p_tx_fw_statistics_pram; u32 tx_fw_statistics_pram_offset; - struct ucc_geth_rx_firmware_statistics_pram *p_rx_fw_statistics_pram; + struct ucc_geth_rx_firmware_statistics_pram __iomem *p_rx_fw_statistics_pram; u32 rx_fw_statistics_pram_offset; - struct ucc_geth_rx_interrupt_coalescing_table *p_rx_irq_coalescing_tbl; + struct ucc_geth_rx_interrupt_coalescing_table __iomem *p_rx_irq_coalescing_tbl; u32 rx_irq_coalescing_tbl_offset; - struct ucc_geth_rx_bd_queues_entry *p_rx_bd_qs_tbl; + struct ucc_geth_rx_bd_queues_entry __iomem *p_rx_bd_qs_tbl; u32 rx_bd_qs_tbl_offset; - u8 *p_tx_bd_ring[NUM_TX_QUEUES]; + u8 __iomem *p_tx_bd_ring[NUM_TX_QUEUES]; u32 tx_bd_ring_offset[NUM_TX_QUEUES]; - u8 *p_rx_bd_ring[NUM_RX_QUEUES]; + u8 __iomem *p_rx_bd_ring[NUM_RX_QUEUES]; u32 rx_bd_ring_offset[NUM_RX_QUEUES]; - u8 *confBd[NUM_TX_QUEUES]; - u8 *txBd[NUM_TX_QUEUES]; - u8 *rxBd[NUM_RX_QUEUES]; + u8 __iomem *confBd[NUM_TX_QUEUES]; + u8 __iomem *txBd[NUM_TX_QUEUES]; + u8 __iomem *rxBd[NUM_RX_QUEUES]; int badFrame[NUM_RX_QUEUES]; u16 cpucount[NUM_TX_QUEUES]; - volatile u16 *p_cpucount[NUM_TX_QUEUES]; + u16 __iomem *p_cpucount[NUM_TX_QUEUES]; int indAddrRegUsed[NUM_OF_PADDRS]; u8 paddr[NUM_OF_PADDRS][ENET_NUM_OCTETS_PER_ADDRESS]; /* ethernet address */ u8 numGroupAddrInHash; @@ -1251,4 +1251,12 @@ struct 
ucc_geth_private { int oldlink; }; +void uec_set_ethtool_ops(struct net_device *netdev); +int init_flow_control_params(u32 automatic_flow_control_mode, + int rx_flow_control_enable, int tx_flow_control_enable, + u16 pause_period, u16 extension_field, + u32 __iomem *upsmr_register, u32 __iomem *uempr_register, + u32 __iomem *maccfg1_register); + + #endif /* __UCC_GETH_H__ */ diff --git a/drivers/net/ucc_geth_ethtool.c b/drivers/net/ucc_geth_ethtool.c index 9a9622c..299b7f1 100644 --- a/drivers/net/ucc_geth_ethtool.c +++ b/drivers/net/ucc_geth_ethtool.c @@ -108,12 +108,6 @@ static char rx_fw_stat_gstrings[][ETH_GSTRING_LEN] = { #define UEC_TX_FW_STATS_LEN ARRAY_SIZE(tx_fw_stat_gstrings) #define UEC_RX_FW_STATS_LEN ARRAY_SIZE(rx_fw_stat_gstrings) -extern int init_flow_control_params(u32 automatic_flow_control_mode, - int rx_flow_control_enable, - int tx_flow_control_enable, u16 pause_period, - u16 extension_field, volatile u32 *upsmr_register, - volatile u32 *uempr_register, volatile u32 *maccfg1_register); - static int uec_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd) { diff --git a/drivers/net/ucc_geth_mii.c b/drivers/net/ucc_geth_mii.c index 2af4907..9404747 100644 --- a/drivers/net/ucc_geth_mii.c +++ b/drivers/net/ucc_geth_mii.c @@ -104,7 +104,7 @@ int uec_mdio_read(struct mii_bus *bus, int mii_id, int regnum) } /* Reset the MIIM registers, and wait for the bus to free */ -int uec_mdio_reset(struct mii_bus *bus) +static int uec_mdio_reset(struct mii_bus *bus) { struct ucc_mii_mng __iomem *regs = (void __iomem *)bus->priv; unsigned int timeout = PHY_INIT_TIMEOUT; @@ -240,7 +240,7 @@ reg_map_fail: return err; } -int uec_mdio_remove(struct of_device *ofdev) +static int uec_mdio_remove(struct of_device *ofdev) { struct device *device = &ofdev->dev; struct mii_bus *bus = dev_get_drvdata(device); -- cgit v0.10.2 From afd8e39919c913993ac2f9984af8a9ba21c63d27 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 29 Apr 2008 19:53:13 +0400 Subject: uli526x: initialize the hardware prior to requesting interrupts The firmware on MPC8610HPCD boards enables ULI ethernet and leaves it in some funky state before booting Linux. For drivers, it's always good idea to (re)initialize the hardware prior to requesting interrupts. This patch fixes the following oops: Oops: Kernel access of bad area, sig: 11 [#1] MPC86xx HPCD NIP: c0172820 LR: c017287c CTR: 00000000 [...] 
NIP [c0172820] allocate_rx_buffer+0x2c/0xb0 LR [c017287c] allocate_rx_buffer+0x88/0xb0 Call Trace: [df82bdc0] [c017287c] allocate_rx_buffer+0x88/0xb0 (unreliable) [df82bde0] [c0173000] uli526x_interrupt+0xe4/0x49c [df82be20] [c0045418] request_irq+0xf0/0x114 [df82be50] [c01737b0] uli526x_open+0x48/0x160 [df82be70] [c0201184] dev_open+0xb0/0xe8 [df82be80] [c0200104] dev_change_flags+0x90/0x1bc [df82bea0] [c035fab0] ip_auto_config+0x214/0xef4 [df82bf60] [c03421c8] kernel_init+0xc4/0x2ac [df82bff0] [c0010834] kernel_thread+0x44/0x60 Instruction dump: 4e800020 9421ffe0 7c0802a6 bfa10014 7c7e1b78 90010024 80030060 83e30054 2b80002f 419d0078 3fa0c039 48000058 <907f0010> 80630088 2f830000 419e0014 Signed-off-by: Anton Vorontsov Signed-off-by: Jeff Garzik diff --git a/drivers/net/tulip/uli526x.c b/drivers/net/tulip/uli526x.c index a59c1f2..1f077ac 100644 --- a/drivers/net/tulip/uli526x.c +++ b/drivers/net/tulip/uli526x.c @@ -434,10 +434,6 @@ static int uli526x_open(struct net_device *dev) ULI526X_DBUG(0, "uli526x_open", 0); - ret = request_irq(dev->irq, &uli526x_interrupt, IRQF_SHARED, dev->name, dev); - if (ret) - return ret; - /* system variable init */ db->cr6_data = CR6_DEFAULT | uli526x_cr6_user_set; db->tx_packet_cnt = 0; @@ -456,6 +452,10 @@ static int uli526x_open(struct net_device *dev) /* Initialize ULI526X board */ uli526x_init(dev); + ret = request_irq(dev->irq, &uli526x_interrupt, IRQF_SHARED, dev->name, dev); + if (ret) + return ret; + /* Active System Interface */ netif_wake_queue(dev); -- cgit v0.10.2 From e284e5c6601cbb16e48854be26aa57a8fa844e35 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 29 Apr 2008 19:53:18 +0400 Subject: uli526x: fix endianness issues in the setup frame This patch fixes a uli526x driver issue on PowerPC boards: the ULi chip is unable to receive packets. It appears that send_filter_frame prepares the setup frame in an endianness-unsafe manner. On big-endian machines each 16-bit address word must be shifted up by two bytes within its 32-bit setup-frame entry.
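For readers unfamiliar with the trick, here is a minimal sketch of the idea. It is an illustration only: fill_addr_words is a hypothetical helper, not part of the driver, and the real change is the send_filter_frame hunk below.

/*
 * Each setup-frame slot is a 32-bit word, but the chip only consumes the
 * two low-addressed bytes of that word (little-endian layout).  Writing a
 * 16-bit MAC-address word through a 32-bit pointer therefore needs a
 * 16-bit shift on big-endian CPUs so the payload lands in the byte
 * positions the hardware actually reads.
 */
#include <stdint.h>

#ifdef __BIG_ENDIAN
#define FLT_SHIFT 16	/* move the halfword into the low-addressed bytes */
#else
#define FLT_SHIFT 0	/* little-endian hosts already match the layout */
#endif

static void fill_addr_words(uint32_t *suptr, const uint16_t *addr)
{
	/* one MAC address occupies three 16-bit words in the setup frame */
	*suptr++ = addr[0] << FLT_SHIFT;
	*suptr++ = addr[1] << FLT_SHIFT;
	*suptr++ = addr[2] << FLT_SHIFT;
}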
Signed-off-by: Anton Vorontsov Signed-off-by: Jeff Garzik diff --git a/drivers/net/tulip/uli526x.c b/drivers/net/tulip/uli526x.c index 1f077ac..2511ca7 100644 --- a/drivers/net/tulip/uli526x.c +++ b/drivers/net/tulip/uli526x.c @@ -1368,6 +1368,12 @@ static void update_cr6(u32 cr6_data, unsigned long ioaddr) * This setup frame initialize ULI526X address filter mode */ +#ifdef __BIG_ENDIAN +#define FLT_SHIFT 16 +#else +#define FLT_SHIFT 0 +#endif + static void send_filter_frame(struct net_device *dev, int mc_cnt) { struct uli526x_board_info *db = netdev_priv(dev); @@ -1384,27 +1390,27 @@ static void send_filter_frame(struct net_device *dev, int mc_cnt) /* Node address */ addrptr = (u16 *) dev->dev_addr; - *suptr++ = addrptr[0]; - *suptr++ = addrptr[1]; - *suptr++ = addrptr[2]; + *suptr++ = addrptr[0] << FLT_SHIFT; + *suptr++ = addrptr[1] << FLT_SHIFT; + *suptr++ = addrptr[2] << FLT_SHIFT; /* broadcast address */ - *suptr++ = 0xffff; - *suptr++ = 0xffff; - *suptr++ = 0xffff; + *suptr++ = 0xffff << FLT_SHIFT; + *suptr++ = 0xffff << FLT_SHIFT; + *suptr++ = 0xffff << FLT_SHIFT; /* fit the multicast address */ for (mcptr = dev->mc_list, i = 0; i < mc_cnt; i++, mcptr = mcptr->next) { addrptr = (u16 *) mcptr->dmi_addr; - *suptr++ = addrptr[0]; - *suptr++ = addrptr[1]; - *suptr++ = addrptr[2]; + *suptr++ = addrptr[0] << FLT_SHIFT; + *suptr++ = addrptr[1] << FLT_SHIFT; + *suptr++ = addrptr[2] << FLT_SHIFT; } for (; i<14; i++) { - *suptr++ = 0xffff; - *suptr++ = 0xffff; - *suptr++ = 0xffff; + *suptr++ = 0xffff << FLT_SHIFT; + *suptr++ = 0xffff << FLT_SHIFT; + *suptr++ = 0xffff << FLT_SHIFT; } /* prepare the setup frame */ -- cgit v0.10.2 From 97ac8caee238d2a81c23661916f7acd3a22c85fe Mon Sep 17 00:00:00 2001 From: Bruce Allan Date: Tue, 29 Apr 2008 09:16:05 -0700 Subject: e1000e: Add support for BM PHYs on ICH9 This patch adds support for the BM PHY, a new PHY model being used on ICH9-based implementations. This new PHY exposes issues in the ICH9 silicon when receiving jumbo frames large enough to use more than a certain part of the Rx FIFO, and this unfortunately breaks packet split jumbo receives. For this reason we re-introduce (for affected adapters only) the jumbo single-skb receive routine back so that people who do wish to use jumbo frames on these ich9 platforms can do so. Part of this problem has to do with CPU sleep states and to make sure that all the wake up timings are correctly we force them with the recently merged pm_qos infrastructure written by Mark Gross. (See http://lkml.org/lkml/2007/10/4/400). To make code read a bit easier we introduce a _IS_ICH flag so that we don't need to do mac type checks over the code. 
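As a rough sketch of how the pm_qos hooks are used: the call names and the 55 microsecond figure mirror the netdev.c hunks further down, while the header name and the literal "e1000e" identifier are assumptions of this sketch (the driver itself passes e1000e_driver_name).

/*
 * Minimal usage pattern for the 2.6.25-era pm_qos interface: register a
 * latency requirement at module load, tighten it while jumbo frames plus
 * early receive are active, and relax it again afterwards.
 */
#include <linux/pm_qos_params.h>	/* assumed header for the pm_qos_* calls */

static void example_load(void)
{
	/* default requirement: no constraint on CPU wake-up latency */
	pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, "e1000e",
			       PM_QOS_DEFAULT_VALUE);
}

static void example_jumbo_on(void)
{
	/* cap C-state exit latency at ~55 us so early-receive DMA does not
	 * overrun the limited Rx FIFO */
	pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, "e1000e", 55);
}

static void example_jumbo_off(void)
{
	pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, "e1000e",
				  PM_QOS_DEFAULT_VALUE);
}

static void example_unload(void)
{
	pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, "e1000e");
}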
Signed-off-by: Bruce Allan Signed-off-by: Auke Kok Signed-off-by: Jeff Garzik diff --git a/drivers/net/e1000e/defines.h b/drivers/net/e1000e/defines.h index 2a53875..f823b8b 100644 --- a/drivers/net/e1000e/defines.h +++ b/drivers/net/e1000e/defines.h @@ -648,6 +648,8 @@ #define IFE_E_PHY_ID 0x02A80330 #define IFE_PLUS_E_PHY_ID 0x02A80320 #define IFE_C_E_PHY_ID 0x02A80310 +#define BME1000_E_PHY_ID 0x01410CB0 +#define BME1000_E_PHY_ID_R2 0x01410CB1 /* M88E1000 Specific Registers */ #define M88E1000_PHY_SPEC_CTRL 0x10 /* PHY Specific Control Register */ @@ -701,6 +703,14 @@ #define M88EC018_EPSCR_DOWNSHIFT_COUNTER_MASK 0x0E00 #define M88EC018_EPSCR_DOWNSHIFT_COUNTER_5X 0x0800 +/* BME1000 PHY Specific Control Register */ +#define BME1000_PSCR_ENABLE_DOWNSHIFT 0x0800 /* 1 = enable downshift */ + + +#define PHY_PAGE_SHIFT 5 +#define PHY_REG(page, reg) (((page) << PHY_PAGE_SHIFT) | \ + ((reg) & MAX_PHY_REG_ADDRESS)) + /* * Bits... * 15-5: page diff --git a/drivers/net/e1000e/e1000.h b/drivers/net/e1000e/e1000.h index 38bfd0d..d3bc6f8 100644 --- a/drivers/net/e1000e/e1000.h +++ b/drivers/net/e1000e/e1000.h @@ -127,7 +127,7 @@ struct e1000_buffer { /* arrays of page information for packet split */ struct e1000_ps_page *ps_pages; }; - + struct page *page; }; struct e1000_ring { @@ -304,6 +304,7 @@ struct e1000_info { #define FLAG_HAS_CTRLEXT_ON_LOAD (1 << 5) #define FLAG_HAS_SWSM_ON_LOAD (1 << 6) #define FLAG_HAS_JUMBO_FRAMES (1 << 7) +#define FLAG_IS_ICH (1 << 9) #define FLAG_HAS_SMART_POWER_DOWN (1 << 11) #define FLAG_IS_QUAD_PORT_A (1 << 12) #define FLAG_IS_QUAD_PORT (1 << 13) @@ -386,6 +387,7 @@ extern void e1000e_set_kmrn_lock_loss_workaround_ich8lan(struct e1000_hw *hw, bool state); extern void e1000e_igp3_phy_powerdown_workaround_ich8lan(struct e1000_hw *hw); extern void e1000e_gig_downshift_workaround_ich8lan(struct e1000_hw *hw); +extern void e1000e_disable_gig_wol_ich8lan(struct e1000_hw *hw); extern s32 e1000e_check_for_copper_link(struct e1000_hw *hw); extern s32 e1000e_check_for_fiber_link(struct e1000_hw *hw); @@ -443,6 +445,9 @@ extern s32 e1000e_get_phy_info_m88(struct e1000_hw *hw); extern s32 e1000e_read_phy_reg_m88(struct e1000_hw *hw, u32 offset, u16 *data); extern s32 e1000e_write_phy_reg_m88(struct e1000_hw *hw, u32 offset, u16 data); extern enum e1000_phy_type e1000e_get_phy_type_from_id(u32 phy_id); +extern s32 e1000e_determine_phy_address(struct e1000_hw *hw); +extern s32 e1000e_write_phy_reg_bm(struct e1000_hw *hw, u32 offset, u16 data); +extern s32 e1000e_read_phy_reg_bm(struct e1000_hw *hw, u32 offset, u16 *data); extern void e1000e_phy_force_speed_duplex_setup(struct e1000_hw *hw, u16 *phy_ctrl); extern s32 e1000e_write_kmrn_reg(struct e1000_hw *hw, u32 offset, u16 data); extern s32 e1000e_read_kmrn_reg(struct e1000_hw *hw, u32 offset, u16 *data); diff --git a/drivers/net/e1000e/ethtool.c b/drivers/net/e1000e/ethtool.c index ce045ac..2bb6da0 100644 --- a/drivers/net/e1000e/ethtool.c +++ b/drivers/net/e1000e/ethtool.c @@ -803,8 +803,7 @@ static int e1000_reg_test(struct e1000_adapter *adapter, u64 *data) /* restore previous status */ ew32(STATUS, before); - if ((mac->type != e1000_ich8lan) && - (mac->type != e1000_ich9lan)) { + if (!(adapter->flags & FLAG_IS_ICH)) { REG_PATTERN_TEST(E1000_FCAL, 0xFFFFFFFF, 0xFFFFFFFF); REG_PATTERN_TEST(E1000_FCAH, 0x0000FFFF, 0xFFFFFFFF); REG_PATTERN_TEST(E1000_FCT, 0x0000FFFF, 0xFFFFFFFF); @@ -824,15 +823,13 @@ static int e1000_reg_test(struct e1000_adapter *adapter, u64 *data) REG_SET_AND_CHECK(E1000_RCTL, 0xFFFFFFFF, 0x00000000); - 
before = (((mac->type == e1000_ich8lan) || - (mac->type == e1000_ich9lan)) ? 0x06C3B33E : 0x06DFB3FE); + before = ((adapter->flags & FLAG_IS_ICH) ? 0x06C3B33E : 0x06DFB3FE); REG_SET_AND_CHECK(E1000_RCTL, before, 0x003FFFFB); REG_SET_AND_CHECK(E1000_TCTL, 0xFFFFFFFF, 0x00000000); REG_SET_AND_CHECK(E1000_RCTL, before, 0xFFFFFFFF); REG_PATTERN_TEST(E1000_RDBAL, 0xFFFFFFF0, 0xFFFFFFFF); - if ((mac->type != e1000_ich8lan) && - (mac->type != e1000_ich9lan)) + if (!(adapter->flags & FLAG_IS_ICH)) REG_PATTERN_TEST(E1000_TXCW, 0xC000FFFF, 0x0000FFFF); REG_PATTERN_TEST(E1000_TDBAL, 0xFFFFFFF0, 0xFFFFFFFF); REG_PATTERN_TEST(E1000_TIDV, 0x0000FFFF, 0x0000FFFF); @@ -911,9 +908,7 @@ static int e1000_intr_test(struct e1000_adapter *adapter, u64 *data) /* Test each interrupt */ for (i = 0; i < 10; i++) { - - if (((adapter->hw.mac.type == e1000_ich8lan) || - (adapter->hw.mac.type == e1000_ich9lan)) && i == 8) + if ((adapter->flags & FLAG_IS_ICH) && (i == 8)) continue; /* Interrupt to test */ @@ -1184,6 +1179,7 @@ static int e1000_integrated_phy_loopback(struct e1000_adapter *adapter) struct e1000_hw *hw = &adapter->hw; u32 ctrl_reg = 0; u32 stat_reg = 0; + u16 phy_reg = 0; hw->mac.autoneg = 0; @@ -1211,6 +1207,28 @@ static int e1000_integrated_phy_loopback(struct e1000_adapter *adapter) E1000_CTRL_SPD_100 |/* Force Speed to 100 */ E1000_CTRL_FD); /* Force Duplex to FULL */ break; + case e1000_phy_bm: + /* Set Default MAC Interface speed to 1GB */ + e1e_rphy(hw, PHY_REG(2, 21), &phy_reg); + phy_reg &= ~0x0007; + phy_reg |= 0x006; + e1e_wphy(hw, PHY_REG(2, 21), phy_reg); + /* Assert SW reset for above settings to take effect */ + e1000e_commit_phy(hw); + mdelay(1); + /* Force Full Duplex */ + e1e_rphy(hw, PHY_REG(769, 16), &phy_reg); + e1e_wphy(hw, PHY_REG(769, 16), phy_reg | 0x000C); + /* Set Link Up (in force link) */ + e1e_rphy(hw, PHY_REG(776, 16), &phy_reg); + e1e_wphy(hw, PHY_REG(776, 16), phy_reg | 0x0040); + /* Force Link */ + e1e_rphy(hw, PHY_REG(769, 16), &phy_reg); + e1e_wphy(hw, PHY_REG(769, 16), phy_reg | 0x0040); + /* Set Early Link Enable */ + e1e_rphy(hw, PHY_REG(769, 20), &phy_reg); + e1e_wphy(hw, PHY_REG(769, 20), phy_reg | 0x0400); + /* fall through */ default: /* force 1000, set loopback */ e1e_wphy(hw, PHY_CONTROL, 0x4140); @@ -1224,8 +1242,7 @@ static int e1000_integrated_phy_loopback(struct e1000_adapter *adapter) E1000_CTRL_SPD_1000 |/* Force Speed to 1000 */ E1000_CTRL_FD); /* Force Duplex to FULL */ - if ((adapter->hw.mac.type == e1000_ich8lan) || - (adapter->hw.mac.type == e1000_ich9lan)) + if (adapter->flags & FLAG_IS_ICH) ctrl_reg |= E1000_CTRL_SLU; /* Set Link Up */ } diff --git a/drivers/net/e1000e/hw.h b/drivers/net/e1000e/hw.h index a930e6d..74f263a 100644 --- a/drivers/net/e1000e/hw.h +++ b/drivers/net/e1000e/hw.h @@ -216,6 +216,21 @@ enum e1e_registers { #define IGP01E1000_PHY_LINK_HEALTH 0x13 /* PHY Link Health */ #define IGP02E1000_PHY_POWER_MGMT 0x19 /* Power Management */ #define IGP01E1000_PHY_PAGE_SELECT 0x1F /* Page Select */ +#define BM_PHY_PAGE_SELECT 22 /* Page Select for BM */ +#define IGP_PAGE_SHIFT 5 +#define PHY_REG_MASK 0x1F + +#define BM_WUC_PAGE 800 +#define BM_WUC_ADDRESS_OPCODE 0x11 +#define BM_WUC_DATA_OPCODE 0x12 +#define BM_WUC_ENABLE_PAGE 769 +#define BM_WUC_ENABLE_REG 17 +#define BM_WUC_ENABLE_BIT (1 << 2) +#define BM_WUC_HOST_WU_BIT (1 << 4) + +#define BM_WUC PHY_REG(BM_WUC_PAGE, 1) +#define BM_WUFC PHY_REG(BM_WUC_PAGE, 2) +#define BM_WUS PHY_REG(BM_WUC_PAGE, 3) #define IGP01E1000_PHY_PCS_INIT_REG 0x00B4 #define IGP01E1000_PHY_POLARITY_MASK 0x0078 
@@ -331,10 +346,16 @@ enum e1e_registers { #define E1000_DEV_ID_ICH8_IFE_G 0x10C5 #define E1000_DEV_ID_ICH8_IGP_M 0x104D #define E1000_DEV_ID_ICH9_IGP_AMT 0x10BD +#define E1000_DEV_ID_ICH9_IGP_M_AMT 0x10F5 +#define E1000_DEV_ID_ICH9_IGP_M 0x10BF +#define E1000_DEV_ID_ICH9_IGP_M_V 0x10CB #define E1000_DEV_ID_ICH9_IGP_C 0x294C #define E1000_DEV_ID_ICH9_IFE 0x10C0 #define E1000_DEV_ID_ICH9_IFE_GT 0x10C3 #define E1000_DEV_ID_ICH9_IFE_G 0x10C2 +#define E1000_DEV_ID_ICH10_R_BM_LM 0x10CC +#define E1000_DEV_ID_ICH10_R_BM_LF 0x10CD +#define E1000_DEV_ID_ICH10_R_BM_V 0x10CE #define E1000_FUNC_1 1 @@ -378,6 +399,7 @@ enum e1000_phy_type { e1000_phy_gg82563, e1000_phy_igp_3, e1000_phy_ife, + e1000_phy_bm, }; enum e1000_bus_width { diff --git a/drivers/net/e1000e/ich8lan.c b/drivers/net/e1000e/ich8lan.c index 768485d..9e38452 100644 --- a/drivers/net/e1000e/ich8lan.c +++ b/drivers/net/e1000e/ich8lan.c @@ -38,6 +38,12 @@ * 82566DM Gigabit Network Connection * 82566MC Gigabit Network Connection * 82566MM Gigabit Network Connection + * 82567LM Gigabit Network Connection + * 82567LF Gigabit Network Connection + * 82567LM-2 Gigabit Network Connection + * 82567LF-2 Gigabit Network Connection + * 82567V-2 Gigabit Network Connection + * 82562GT-3 10/100 Network Connection */ #include @@ -198,6 +204,19 @@ static s32 e1000_init_phy_params_ich8lan(struct e1000_hw *hw) phy->addr = 1; phy->reset_delay_us = 100; + /* + * We may need to do this twice - once for IGP and if that fails, + * we'll set BM func pointers and try again + */ + ret_val = e1000e_determine_phy_address(hw); + if (ret_val) { + hw->phy.ops.write_phy_reg = e1000e_write_phy_reg_bm; + hw->phy.ops.read_phy_reg = e1000e_read_phy_reg_bm; + ret_val = e1000e_determine_phy_address(hw); + if (ret_val) + return ret_val; + } + phy->id = 0; while ((e1000_phy_unknown == e1000e_get_phy_type_from_id(phy->id)) && (i++ < 100)) { @@ -219,6 +238,13 @@ static s32 e1000_init_phy_params_ich8lan(struct e1000_hw *hw) phy->type = e1000_phy_ife; phy->autoneg_mask = E1000_ALL_NOT_GIG; break; + case BME1000_E_PHY_ID: + phy->type = e1000_phy_bm; + phy->autoneg_mask = AUTONEG_ADVERTISE_SPEED_DEFAULT; + hw->phy.ops.read_phy_reg = e1000e_read_phy_reg_bm; + hw->phy.ops.write_phy_reg = e1000e_write_phy_reg_bm; + hw->phy.ops.commit_phy = e1000e_phy_sw_reset; + break; default: return -E1000_ERR_PHY; break; @@ -664,6 +690,7 @@ static s32 e1000_get_phy_info_ich8lan(struct e1000_hw *hw) return e1000_get_phy_info_ife_ich8lan(hw); break; case e1000_phy_igp_3: + case e1000_phy_bm: return e1000e_get_phy_info_igp(hw); break; default: @@ -728,7 +755,7 @@ static s32 e1000_set_d0_lplu_state_ich8lan(struct e1000_hw *hw, bool active) s32 ret_val = 0; u16 data; - if (phy->type != e1000_phy_igp_3) + if (phy->type == e1000_phy_ife) return ret_val; phy_ctrl = er32(PHY_CTRL); @@ -1918,8 +1945,35 @@ static s32 e1000_setup_copper_link_ich8lan(struct e1000_hw *hw) ret_val = e1000e_copper_link_setup_igp(hw); if (ret_val) return ret_val; + } else if (hw->phy.type == e1000_phy_bm) { + ret_val = e1000e_copper_link_setup_m88(hw); + if (ret_val) + return ret_val; } + if (hw->phy.type == e1000_phy_ife) { + ret_val = e1e_rphy(hw, IFE_PHY_MDIX_CONTROL, ®_data); + if (ret_val) + return ret_val; + + reg_data &= ~IFE_PMC_AUTO_MDIX; + + switch (hw->phy.mdix) { + case 1: + reg_data &= ~IFE_PMC_FORCE_MDIX; + break; + case 2: + reg_data |= IFE_PMC_FORCE_MDIX; + break; + case 0: + default: + reg_data |= IFE_PMC_AUTO_MDIX; + break; + } + ret_val = e1e_wphy(hw, IFE_PHY_MDIX_CONTROL, reg_data); + if (ret_val) + return 
ret_val; + } return e1000e_setup_copper_link(hw); } @@ -2127,6 +2181,31 @@ void e1000e_gig_downshift_workaround_ich8lan(struct e1000_hw *hw) } /** + * e1000e_disable_gig_wol_ich8lan - disable gig during WoL + * @hw: pointer to the HW structure + * + * During S0 to Sx transition, it is possible the link remains at gig + * instead of negotiating to a lower speed. Before going to Sx, set + * 'LPLU Enabled' and 'Gig Disable' to force link speed negotiation + * to a lower speed. + * + * Should only be called for ICH9 devices. + **/ +void e1000e_disable_gig_wol_ich8lan(struct e1000_hw *hw) +{ + u32 phy_ctrl; + + if (hw->mac.type == e1000_ich9lan) { + phy_ctrl = er32(PHY_CTRL); + phy_ctrl |= E1000_PHY_CTRL_D0A_LPLU | + E1000_PHY_CTRL_GBE_DISABLE; + ew32(PHY_CTRL, phy_ctrl); + } + + return; +} + +/** * e1000_cleanup_led_ich8lan - Restore the default LED operation * @hw: pointer to the HW structure * @@ -2247,6 +2326,7 @@ static struct e1000_nvm_operations ich8_nvm_ops = { struct e1000_info e1000_ich8_info = { .mac = e1000_ich8lan, .flags = FLAG_HAS_WOL + | FLAG_IS_ICH | FLAG_RX_CSUM_ENABLED | FLAG_HAS_CTRLEXT_ON_LOAD | FLAG_HAS_AMT @@ -2262,6 +2342,7 @@ struct e1000_info e1000_ich8_info = { struct e1000_info e1000_ich9_info = { .mac = e1000_ich9lan, .flags = FLAG_HAS_JUMBO_FRAMES + | FLAG_IS_ICH | FLAG_HAS_WOL | FLAG_RX_CSUM_ENABLED | FLAG_HAS_CTRLEXT_ON_LOAD diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c index 8991ab8..8cbb40f 100644 --- a/drivers/net/e1000e/netdev.c +++ b/drivers/net/e1000e/netdev.c @@ -43,10 +43,11 @@ #include #include #include +#include #include "e1000.h" -#define DRV_VERSION "0.2.1" +#define DRV_VERSION "0.3.3.3-k2" char e1000e_driver_name[] = "e1000e"; const char e1000e_driver_version[] = DRV_VERSION; @@ -341,6 +342,89 @@ no_buffers: } /** + * e1000_alloc_jumbo_rx_buffers - Replace used jumbo receive buffers + * @adapter: address of board private structure + * @rx_ring: pointer to receive ring structure + * @cleaned_count: number of buffers to allocate this pass + **/ + +static void e1000_alloc_jumbo_rx_buffers(struct e1000_adapter *adapter, + int cleaned_count) +{ + struct net_device *netdev = adapter->netdev; + struct pci_dev *pdev = adapter->pdev; + struct e1000_rx_desc *rx_desc; + struct e1000_ring *rx_ring = adapter->rx_ring; + struct e1000_buffer *buffer_info; + struct sk_buff *skb; + unsigned int i; + unsigned int bufsz = 256 - + 16 /* for skb_reserve */ - + NET_IP_ALIGN; + + i = rx_ring->next_to_use; + buffer_info = &rx_ring->buffer_info[i]; + + while (cleaned_count--) { + skb = buffer_info->skb; + if (skb) { + skb_trim(skb, 0); + goto check_page; + } + + skb = netdev_alloc_skb(netdev, bufsz); + if (unlikely(!skb)) { + /* Better luck next round */ + adapter->alloc_rx_buff_failed++; + break; + } + + /* Make buffer alignment 2 beyond a 16 byte boundary + * this will result in a 16 byte aligned IP header after + * the 14 byte MAC header is removed + */ + skb_reserve(skb, NET_IP_ALIGN); + + buffer_info->skb = skb; +check_page: + /* allocate a new page if necessary */ + if (!buffer_info->page) { + buffer_info->page = alloc_page(GFP_ATOMIC); + if (unlikely(!buffer_info->page)) { + adapter->alloc_rx_buff_failed++; + break; + } + } + + if (!buffer_info->dma) + buffer_info->dma = pci_map_page(pdev, + buffer_info->page, 0, + PAGE_SIZE, + PCI_DMA_FROMDEVICE); + + rx_desc = E1000_RX_DESC(*rx_ring, i); + rx_desc->buffer_addr = cpu_to_le64(buffer_info->dma); + + if (unlikely(++i == rx_ring->count)) + i = 0; + buffer_info = &rx_ring->buffer_info[i]; + } + + 
if (likely(rx_ring->next_to_use != i)) { + rx_ring->next_to_use = i; + if (unlikely(i-- == 0)) + i = (rx_ring->count - 1); + + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. (Only + * applicable for weak-ordered memory model archs, + * such as IA-64). */ + wmb(); + writel(i, adapter->hw.hw_addr + rx_ring->tail); + } +} + +/** * e1000_clean_rx_irq - Send received data up the network stack; legacy * @adapter: board private structure * @@ -783,6 +867,186 @@ next_desc: } /** + * e1000_consume_page - helper function + **/ +static void e1000_consume_page(struct e1000_buffer *bi, struct sk_buff *skb, + u16 length) +{ + bi->page = NULL; + skb->len += length; + skb->data_len += length; + skb->truesize += length; +} + +/** + * e1000_clean_jumbo_rx_irq - Send received data up the network stack; legacy + * @adapter: board private structure + * + * the return value indicates whether actual cleaning was done, there + * is no guarantee that everything was cleaned + **/ + +static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter, + int *work_done, int work_to_do) +{ + struct net_device *netdev = adapter->netdev; + struct pci_dev *pdev = adapter->pdev; + struct e1000_ring *rx_ring = adapter->rx_ring; + struct e1000_rx_desc *rx_desc, *next_rxd; + struct e1000_buffer *buffer_info, *next_buffer; + u32 length; + unsigned int i; + int cleaned_count = 0; + bool cleaned = false; + unsigned int total_rx_bytes=0, total_rx_packets=0; + + i = rx_ring->next_to_clean; + rx_desc = E1000_RX_DESC(*rx_ring, i); + buffer_info = &rx_ring->buffer_info[i]; + + while (rx_desc->status & E1000_RXD_STAT_DD) { + struct sk_buff *skb; + u8 status; + + if (*work_done >= work_to_do) + break; + (*work_done)++; + + status = rx_desc->status; + skb = buffer_info->skb; + buffer_info->skb = NULL; + + ++i; + if (i == rx_ring->count) + i = 0; + next_rxd = E1000_RX_DESC(*rx_ring, i); + prefetch(next_rxd); + + next_buffer = &rx_ring->buffer_info[i]; + + cleaned = true; + cleaned_count++; + pci_unmap_page(pdev, buffer_info->dma, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + buffer_info->dma = 0; + + length = le16_to_cpu(rx_desc->length); + + /* errors is only valid for DD + EOP descriptors */ + if (unlikely((status & E1000_RXD_STAT_EOP) && + (rx_desc->errors & E1000_RXD_ERR_FRAME_ERR_MASK))) { + /* recycle both page and skb */ + buffer_info->skb = skb; + /* an error means any chain goes out the window + * too */ + if (rx_ring->rx_skb_top) + dev_kfree_skb(rx_ring->rx_skb_top); + rx_ring->rx_skb_top = NULL; + goto next_desc; + } + +#define rxtop rx_ring->rx_skb_top + if (!(status & E1000_RXD_STAT_EOP)) { + /* this descriptor is only the beginning (or middle) */ + if (!rxtop) { + /* this is the beginning of a chain */ + rxtop = skb; + skb_fill_page_desc(rxtop, 0, buffer_info->page, + 0, length); + } else { + /* this is the middle of a chain */ + skb_fill_page_desc(rxtop, + skb_shinfo(rxtop)->nr_frags, + buffer_info->page, 0, length); + /* re-use the skb, only consumed the page */ + buffer_info->skb = skb; + } + e1000_consume_page(buffer_info, rxtop, length); + goto next_desc; + } else { + if (rxtop) { + /* end of the chain */ + skb_fill_page_desc(rxtop, + skb_shinfo(rxtop)->nr_frags, + buffer_info->page, 0, length); + /* re-use the current skb, we only consumed the + * page */ + buffer_info->skb = skb; + skb = rxtop; + rxtop = NULL; + e1000_consume_page(buffer_info, skb, length); + } else { + /* no chain, got EOP, this buf is the packet + * copybreak to save the put_page/alloc_page */ + if (length 
<= copybreak && + skb_tailroom(skb) >= length) { + u8 *vaddr; + vaddr = kmap_atomic(buffer_info->page, + KM_SKB_DATA_SOFTIRQ); + memcpy(skb_tail_pointer(skb), vaddr, + length); + kunmap_atomic(vaddr, + KM_SKB_DATA_SOFTIRQ); + /* re-use the page, so don't erase + * buffer_info->page */ + skb_put(skb, length); + } else { + skb_fill_page_desc(skb, 0, + buffer_info->page, 0, + length); + e1000_consume_page(buffer_info, skb, + length); + } + } + } + + /* Receive Checksum Offload XXX recompute due to CRC strip? */ + e1000_rx_checksum(adapter, + (u32)(status) | + ((u32)(rx_desc->errors) << 24), + le16_to_cpu(rx_desc->csum), skb); + + /* probably a little skewed due to removing CRC */ + total_rx_bytes += skb->len; + total_rx_packets++; + + /* eth type trans needs skb->data to point to something */ + if (!pskb_may_pull(skb, ETH_HLEN)) { + ndev_err(netdev, "pskb_may_pull failed.\n"); + dev_kfree_skb(skb); + goto next_desc; + } + + e1000_receive_skb(adapter, netdev, skb, status, + rx_desc->special); + +next_desc: + rx_desc->status = 0; + + /* return some buffers to hardware, one at a time is too slow */ + if (unlikely(cleaned_count >= E1000_RX_BUFFER_WRITE)) { + adapter->alloc_rx_buf(adapter, cleaned_count); + cleaned_count = 0; + } + + /* use prefetched values */ + rx_desc = next_rxd; + buffer_info = next_buffer; + } + rx_ring->next_to_clean = i; + + cleaned_count = e1000_desc_unused(rx_ring); + if (cleaned_count) + adapter->alloc_rx_buf(adapter, cleaned_count); + + adapter->total_rx_bytes += total_rx_bytes; + adapter->total_rx_packets += total_rx_packets; + adapter->net_stats.rx_bytes += total_rx_bytes; + adapter->net_stats.rx_packets += total_rx_packets; + return cleaned; +} + +/** * e1000_clean_rx_ring - Free Rx Buffers per Queue * @adapter: board private structure **/ @@ -802,6 +1066,10 @@ static void e1000_clean_rx_ring(struct e1000_adapter *adapter) pci_unmap_single(pdev, buffer_info->dma, adapter->rx_buffer_len, PCI_DMA_FROMDEVICE); + else if (adapter->clean_rx == e1000_clean_jumbo_rx_irq) + pci_unmap_page(pdev, buffer_info->dma, + PAGE_SIZE, + PCI_DMA_FROMDEVICE); else if (adapter->clean_rx == e1000_clean_rx_irq_ps) pci_unmap_single(pdev, buffer_info->dma, adapter->rx_ps_bsize0, @@ -809,6 +1077,11 @@ static void e1000_clean_rx_ring(struct e1000_adapter *adapter) buffer_info->dma = 0; } + if (buffer_info->page) { + put_page(buffer_info->page); + buffer_info->page = NULL; + } + if (buffer_info->skb) { dev_kfree_skb(buffer_info->skb); buffer_info->skb = NULL; @@ -1755,10 +2028,12 @@ static void e1000_setup_rctl(struct e1000_adapter *adapter) * a lot of memory, since we allocate 3 pages at all times * per packet. 
*/ - adapter->rx_ps_pages = 0; pages = PAGE_USE_COUNT(adapter->netdev->mtu); - if ((pages <= 3) && (PAGE_SIZE <= 16384) && (rctl & E1000_RCTL_LPE)) + if (!(adapter->flags & FLAG_IS_ICH) && (pages <= 3) && + (PAGE_SIZE <= 16384) && (rctl & E1000_RCTL_LPE)) adapter->rx_ps_pages = pages; + else + adapter->rx_ps_pages = 0; if (adapter->rx_ps_pages) { /* Configure extra packet-split registers */ @@ -1819,9 +2094,12 @@ static void e1000_configure_rx(struct e1000_adapter *adapter) sizeof(union e1000_rx_desc_packet_split); adapter->clean_rx = e1000_clean_rx_irq_ps; adapter->alloc_rx_buf = e1000_alloc_rx_buffers_ps; + } else if (adapter->netdev->mtu > ETH_FRAME_LEN + ETH_FCS_LEN) { + rdlen = rx_ring->count * sizeof(struct e1000_rx_desc); + adapter->clean_rx = e1000_clean_jumbo_rx_irq; + adapter->alloc_rx_buf = e1000_alloc_jumbo_rx_buffers; } else { - rdlen = rx_ring->count * - sizeof(struct e1000_rx_desc); + rdlen = rx_ring->count * sizeof(struct e1000_rx_desc); adapter->clean_rx = e1000_clean_rx_irq; adapter->alloc_rx_buf = e1000_alloc_rx_buffers; } @@ -1885,8 +2163,21 @@ static void e1000_configure_rx(struct e1000_adapter *adapter) * units), e.g. using jumbo frames when setting to E1000_ERT_2048 */ if ((adapter->flags & FLAG_HAS_ERT) && - (adapter->netdev->mtu > ETH_DATA_LEN)) - ew32(ERT, E1000_ERT_2048); + (adapter->netdev->mtu > ETH_DATA_LEN)) { + u32 rxdctl = er32(RXDCTL(0)); + ew32(RXDCTL(0), rxdctl | 0x3); + ew32(ERT, E1000_ERT_2048 | (1 << 13)); + /* + * With jumbo frames and early-receive enabled, excessive + * C4->C2 latencies result in dropped transactions. + */ + pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, + e1000e_driver_name, 55); + } else { + pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, + e1000e_driver_name, + PM_QOS_DEFAULT_VALUE); + } /* Enable Receives */ ew32(RCTL, rctl); @@ -2155,6 +2446,14 @@ void e1000e_reset(struct e1000_adapter *adapter) /* Allow time for pending master requests to run */ mac->ops.reset_hw(hw); + + /* + * For parts with AMT enabled, let the firmware know + * that the network interface is in control + */ + if ((adapter->flags & FLAG_HAS_AMT) && e1000e_check_mng_mode(hw)) + e1000_get_hw_control(adapter); + ew32(WUC, 0); if (mac->ops.init_hw(hw)) @@ -3469,6 +3768,8 @@ static int e1000_change_mtu(struct net_device *netdev, int new_mtu) * means we reserve 2 more, this pushes us to allocate from the next * larger slab size. * i.e. 
RXBUFFER_2048 --> size-4096 slab + * However with the new *_jumbo_rx* routines, jumbo receives will use + * fragmented skbs */ if (max_frame <= 256) @@ -3626,6 +3927,9 @@ static int e1000_suspend(struct pci_dev *pdev, pm_message_t state) ew32(CTRL_EXT, ctrl_ext); } + if (adapter->flags & FLAG_IS_ICH) + e1000e_disable_gig_wol_ich8lan(&adapter->hw); + /* Allow time for pending master requests to run */ e1000e_disable_pcie_master(&adapter->hw); @@ -4292,6 +4596,13 @@ static struct pci_device_id e1000_pci_tbl[] = { { PCI_VDEVICE(INTEL, E1000_DEV_ID_ICH9_IFE_GT), board_ich9lan }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_ICH9_IGP_AMT), board_ich9lan }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_ICH9_IGP_C), board_ich9lan }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_ICH9_IGP_M), board_ich9lan }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_ICH9_IGP_M_AMT), board_ich9lan }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_ICH9_IGP_M_V), board_ich9lan }, + + { PCI_VDEVICE(INTEL, E1000_DEV_ID_ICH10_R_BM_LM), board_ich9lan }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_ICH10_R_BM_LF), board_ich9lan }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_ICH10_R_BM_V), board_ich9lan }, { } /* terminate list */ }; @@ -4326,7 +4637,9 @@ static int __init e1000_init_module(void) printk(KERN_INFO "%s: Copyright (c) 1999-2008 Intel Corporation.\n", e1000e_driver_name); ret = pci_register_driver(&e1000_driver); - + pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, e1000e_driver_name, + PM_QOS_DEFAULT_VALUE); + return ret; } module_init(e1000_init_module); @@ -4340,6 +4653,7 @@ module_init(e1000_init_module); static void __exit e1000_exit_module(void) { pci_unregister_driver(&e1000_driver); + pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, e1000e_driver_name); } module_exit(e1000_exit_module); diff --git a/drivers/net/e1000e/phy.c b/drivers/net/e1000e/phy.c index e102332..b133dcf 100644 --- a/drivers/net/e1000e/phy.c +++ b/drivers/net/e1000e/phy.c @@ -34,6 +34,9 @@ static s32 e1000_get_phy_cfg_done(struct e1000_hw *hw); static s32 e1000_phy_force_speed_duplex(struct e1000_hw *hw); static s32 e1000_set_d0_lplu_state(struct e1000_hw *hw, bool active); static s32 e1000_wait_autoneg(struct e1000_hw *hw); +static u32 e1000_get_phy_addr_for_bm_page(u32 page, u32 reg); +static s32 e1000_access_phy_wakeup_reg_bm(struct e1000_hw *hw, u32 offset, + u16 *data, bool read); /* Cable length tables */ static const u16 e1000_m88_cable_length_table[] = @@ -465,6 +468,10 @@ s32 e1000e_copper_link_setup_m88(struct e1000_hw *hw) if (phy->disable_polarity_correction == 1) phy_data |= M88E1000_PSCR_POLARITY_REVERSAL; + /* Enable downshift on BM (disabled by default) */ + if (phy->type == e1000_phy_bm) + phy_data |= BME1000_PSCR_ENABLE_DOWNSHIFT; + ret_val = e1e_wphy(hw, M88E1000_PHY_SPEC_CTRL, phy_data); if (ret_val) return ret_val; @@ -1776,6 +1783,10 @@ enum e1000_phy_type e1000e_get_phy_type_from_id(u32 phy_id) case IFE_C_E_PHY_ID: phy_type = e1000_phy_ife; break; + case BME1000_E_PHY_ID: + case BME1000_E_PHY_ID_R2: + phy_type = e1000_phy_bm; + break; default: phy_type = e1000_phy_unknown; break; @@ -1784,6 +1795,273 @@ enum e1000_phy_type e1000e_get_phy_type_from_id(u32 phy_id) } /** + * e1000e_determine_phy_address - Determines PHY address. + * @hw: pointer to the HW structure + * + * This uses a trial and error method to loop through possible PHY + * addresses. It tests each by reading the PHY ID registers and + * checking for a match. 
+ **/ +s32 e1000e_determine_phy_address(struct e1000_hw *hw) +{ + s32 ret_val = -E1000_ERR_PHY_TYPE; + u32 phy_addr= 0; + u32 i = 0; + enum e1000_phy_type phy_type = e1000_phy_unknown; + + do { + for (phy_addr = 0; phy_addr < 4; phy_addr++) { + hw->phy.addr = phy_addr; + e1000e_get_phy_id(hw); + phy_type = e1000e_get_phy_type_from_id(hw->phy.id); + + /* + * If phy_type is valid, break - we found our + * PHY address + */ + if (phy_type != e1000_phy_unknown) { + ret_val = 0; + break; + } + } + i++; + } while ((ret_val != 0) && (i < 100)); + + return ret_val; +} + +/** + * e1000_get_phy_addr_for_bm_page - Retrieve PHY page address + * @page: page to access + * + * Returns the phy address for the page requested. + **/ +static u32 e1000_get_phy_addr_for_bm_page(u32 page, u32 reg) +{ + u32 phy_addr = 2; + + if ((page >= 768) || (page == 0 && reg == 25) || (reg == 31)) + phy_addr = 1; + + return phy_addr; +} + +/** + * e1000e_write_phy_reg_bm - Write BM PHY register + * @hw: pointer to the HW structure + * @offset: register offset to write to + * @data: data to write at register offset + * + * Acquires semaphore, if necessary, then writes the data to PHY register + * at the offset. Release any acquired semaphores before exiting. + **/ +s32 e1000e_write_phy_reg_bm(struct e1000_hw *hw, u32 offset, u16 data) +{ + s32 ret_val; + u32 page_select = 0; + u32 page = offset >> IGP_PAGE_SHIFT; + u32 page_shift = 0; + + /* Page 800 works differently than the rest so it has its own func */ + if (page == BM_WUC_PAGE) { + ret_val = e1000_access_phy_wakeup_reg_bm(hw, offset, &data, + false); + goto out; + } + + ret_val = hw->phy.ops.acquire_phy(hw); + if (ret_val) + goto out; + + hw->phy.addr = e1000_get_phy_addr_for_bm_page(page, offset); + + if (offset > MAX_PHY_MULTI_PAGE_REG) { + /* + * Page select is register 31 for phy address 1 and 22 for + * phy address 2 and 3. Page select is shifted only for + * phy address 1. + */ + if (hw->phy.addr == 1) { + page_shift = IGP_PAGE_SHIFT; + page_select = IGP01E1000_PHY_PAGE_SELECT; + } else { + page_shift = 0; + page_select = BM_PHY_PAGE_SELECT; + } + + /* Page is shifted left, PHY expects (page x 32) */ + ret_val = e1000e_write_phy_reg_mdic(hw, page_select, + (page << page_shift)); + if (ret_val) { + hw->phy.ops.release_phy(hw); + goto out; + } + } + + ret_val = e1000e_write_phy_reg_mdic(hw, MAX_PHY_REG_ADDRESS & offset, + data); + + hw->phy.ops.release_phy(hw); + +out: + return ret_val; +} + +/** + * e1000e_read_phy_reg_bm - Read BM PHY register + * @hw: pointer to the HW structure + * @offset: register offset to be read + * @data: pointer to the read data + * + * Acquires semaphore, if necessary, then reads the PHY register at offset + * and storing the retrieved information in data. Release any acquired + * semaphores before exiting. + **/ +s32 e1000e_read_phy_reg_bm(struct e1000_hw *hw, u32 offset, u16 *data) +{ + s32 ret_val; + u32 page_select = 0; + u32 page = offset >> IGP_PAGE_SHIFT; + u32 page_shift = 0; + + /* Page 800 works differently than the rest so it has its own func */ + if (page == BM_WUC_PAGE) { + ret_val = e1000_access_phy_wakeup_reg_bm(hw, offset, data, + true); + goto out; + } + + ret_val = hw->phy.ops.acquire_phy(hw); + if (ret_val) + goto out; + + hw->phy.addr = e1000_get_phy_addr_for_bm_page(page, offset); + + if (offset > MAX_PHY_MULTI_PAGE_REG) { + /* + * Page select is register 31 for phy address 1 and 22 for + * phy address 2 and 3. Page select is shifted only for + * phy address 1. 
+ */ + if (hw->phy.addr == 1) { + page_shift = IGP_PAGE_SHIFT; + page_select = IGP01E1000_PHY_PAGE_SELECT; + } else { + page_shift = 0; + page_select = BM_PHY_PAGE_SELECT; + } + + /* Page is shifted left, PHY expects (page x 32) */ + ret_val = e1000e_write_phy_reg_mdic(hw, page_select, + (page << page_shift)); + if (ret_val) { + hw->phy.ops.release_phy(hw); + goto out; + } + } + + ret_val = e1000e_read_phy_reg_mdic(hw, MAX_PHY_REG_ADDRESS & offset, + data); + hw->phy.ops.release_phy(hw); + +out: + return ret_val; +} + +/** + * e1000_access_phy_wakeup_reg_bm - Read BM PHY wakeup register + * @hw: pointer to the HW structure + * @offset: register offset to be read or written + * @data: pointer to the data to read or write + * @read: determines if operation is read or write + * + * Acquires semaphore, if necessary, then reads the PHY register at offset + * and storing the retrieved information in data. Release any acquired + * semaphores before exiting. Note that procedure to read the wakeup + * registers are different. It works as such: + * 1) Set page 769, register 17, bit 2 = 1 + * 2) Set page to 800 for host (801 if we were manageability) + * 3) Write the address using the address opcode (0x11) + * 4) Read or write the data using the data opcode (0x12) + * 5) Restore 769_17.2 to its original value + **/ +static s32 e1000_access_phy_wakeup_reg_bm(struct e1000_hw *hw, u32 offset, + u16 *data, bool read) +{ + s32 ret_val; + u16 reg = ((u16)offset) & PHY_REG_MASK; + u16 phy_reg = 0; + u8 phy_acquired = 1; + + + ret_val = hw->phy.ops.acquire_phy(hw); + if (ret_val) { + phy_acquired = 0; + goto out; + } + + /* All operations in this function are phy address 1 */ + hw->phy.addr = 1; + + /* Set page 769 */ + e1000e_write_phy_reg_mdic(hw, IGP01E1000_PHY_PAGE_SELECT, + (BM_WUC_ENABLE_PAGE << IGP_PAGE_SHIFT)); + + ret_val = e1000e_read_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, &phy_reg); + if (ret_val) + goto out; + + /* First clear bit 4 to avoid a power state change */ + phy_reg &= ~(BM_WUC_HOST_WU_BIT); + ret_val = e1000e_write_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, phy_reg); + if (ret_val) + goto out; + + /* Write bit 2 = 1, and clear bit 4 to 769_17 */ + ret_val = e1000e_write_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, + phy_reg | BM_WUC_ENABLE_BIT); + if (ret_val) + goto out; + + /* Select page 800 */ + ret_val = e1000e_write_phy_reg_mdic(hw, IGP01E1000_PHY_PAGE_SELECT, + (BM_WUC_PAGE << IGP_PAGE_SHIFT)); + + /* Write the page 800 offset value using opcode 0x11 */ + ret_val = e1000e_write_phy_reg_mdic(hw, BM_WUC_ADDRESS_OPCODE, reg); + if (ret_val) + goto out; + + if (read) { + /* Read the page 800 value using opcode 0x12 */ + ret_val = e1000e_read_phy_reg_mdic(hw, BM_WUC_DATA_OPCODE, + data); + } else { + /* Read the page 800 value using opcode 0x12 */ + ret_val = e1000e_write_phy_reg_mdic(hw, BM_WUC_DATA_OPCODE, + *data); + } + + if (ret_val) + goto out; + + /* + * Restore 769_17.2 to its original value + * Set page 769 + */ + e1000e_write_phy_reg_mdic(hw, IGP01E1000_PHY_PAGE_SELECT, + (BM_WUC_ENABLE_PAGE << IGP_PAGE_SHIFT)); + + /* Clear 769_17.2 */ + ret_val = e1000e_write_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, phy_reg); + +out: + if (phy_acquired == 1) + hw->phy.ops.release_phy(hw); + return ret_val; +} + +/** * e1000e_commit_phy - Soft PHY reset * @hw: pointer to the HW structure * -- cgit v0.10.2 From 7ab267d4ecdad3032d6bb31619a2744fc2074b59 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Tue, 6 May 2008 12:16:24 -0400 Subject: fix warning in drivers/net/appletalk/cops.c MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/net/appletalk/cops.c: In function ‘cops_reset’: drivers/net/appletalk/cops.c:507: warning: comparison of distinct pointer types lacks a cast by replacing hand-woven msleep() with call to msleep() Signed-off-by: Jeff Garzik diff --git a/drivers/net/appletalk/cops.c b/drivers/net/appletalk/cops.c index 82e9a5b..a0b4c85 100644 --- a/drivers/net/appletalk/cops.c +++ b/drivers/net/appletalk/cops.c @@ -499,19 +499,13 @@ static void cops_reset(struct net_device *dev, int sleep) { outb(0, ioaddr+DAYNA_RESET); /* Assert the reset port */ inb(ioaddr+DAYNA_RESET); /* Clear the reset */ - if(sleep) - { - long snap=jiffies; - - /* Let card finish initializing, about 1/3 second */ - while (time_before(jiffies, snap + HZ/3)) - schedule(); - } - else - mdelay(333); + if (sleep) + msleep(333); + else + mdelay(333); } + netif_wake_queue(dev); - return; } static void cops_load (struct net_device *dev) -- cgit v0.10.2 From aa807f79dad3d6a8e9b175d66418b0c5be1d5cd8 Mon Sep 17 00:00:00 2001 From: Gunnar Larisch Date: Mon, 5 May 2008 14:01:28 +0200 Subject: 3c980-TX needs EXTRA_PREAMBLE The ethernet card 3c980-TX needs a mdio_sync() to initialize the ethernet properly. This is forced by adding an EXTRA_PREAMBLE to its drv_flags. Without this, the driver did not reconnect after a link loss. Signed-off-by: Gunnar Larisch Acked-by: Steffen Klassert Signed-off-by: Jeff Garzik diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c index 6f8e7d4..e93c378 100644 --- a/drivers/net/3c59x.c +++ b/drivers/net/3c59x.c @@ -319,7 +319,7 @@ static struct vortex_chip_info { {"3c920B-EMB-WNM (ATI Radeon 9100 IGP)", PCI_USES_MASTER, IS_TORNADO|HAS_MII|HAS_HWCKSM, 128, }, {"3c980 Cyclone", - PCI_USES_MASTER, IS_CYCLONE|HAS_HWCKSM, 128, }, + PCI_USES_MASTER, IS_CYCLONE|HAS_HWCKSM|EXTRA_PREAMBLE, 128, }, {"3c980C Python-T", PCI_USES_MASTER, IS_CYCLONE|HAS_NWAY|HAS_HWCKSM, 128, }, -- cgit v0.10.2 From 1daad055bfc928dfc8590664c455960059421151 Mon Sep 17 00:00:00 2001 From: Paulius Zaleckas Date: Mon, 5 May 2008 14:01:29 +0200 Subject: 3c59x: use netstats in net_device structure Use net_device_stats from net_device structure instead of local. Signed-off-by: Paulius Zaleckas Acked-by: Steffen Klassert Signed-off-by: Jeff Garzik diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c index e93c378..2edda8c 100644 --- a/drivers/net/3c59x.c +++ b/drivers/net/3c59x.c @@ -600,7 +600,6 @@ struct vortex_private { struct sk_buff* tx_skbuff[TX_RING_SIZE]; unsigned int cur_rx, cur_tx; /* The next free ring entry */ unsigned int dirty_rx, dirty_tx; /* The ring entries to be free()ed. */ - struct net_device_stats stats; /* Generic stats */ struct vortex_extra_stats xstats; /* NIC-specific extra stats */ struct sk_buff *tx_skb; /* Packet being eaten by bus master ctrl. */ dma_addr_t tx_skb_dma; /* Allocated DMA address for bus master ctrl DMA. 
*/ @@ -1875,7 +1874,7 @@ static void vortex_tx_timeout(struct net_device *dev) issue_and_wait(dev, TxReset); - vp->stats.tx_errors++; + dev->stats.tx_errors++; if (vp->full_bus_master_tx) { printk(KERN_DEBUG "%s: Resetting the Tx ring pointer.\n", dev->name); if (vp->cur_tx - vp->dirty_tx > 0 && ioread32(ioaddr + DownListPtr) == 0) @@ -1887,7 +1886,7 @@ static void vortex_tx_timeout(struct net_device *dev) iowrite8(PKT_BUF_SZ>>8, ioaddr + TxFreeThreshold); iowrite16(DownUnstall, ioaddr + EL3_CMD); } else { - vp->stats.tx_dropped++; + dev->stats.tx_dropped++; netif_wake_queue(dev); } @@ -1928,8 +1927,8 @@ vortex_error(struct net_device *dev, int status) } dump_tx_ring(dev); } - if (tx_status & 0x14) vp->stats.tx_fifo_errors++; - if (tx_status & 0x38) vp->stats.tx_aborted_errors++; + if (tx_status & 0x14) dev->stats.tx_fifo_errors++; + if (tx_status & 0x38) dev->stats.tx_aborted_errors++; if (tx_status & 0x08) vp->xstats.tx_max_collisions++; iowrite8(0, ioaddr + TxStatus); if (tx_status & 0x30) { /* txJabber or txUnderrun */ @@ -2051,8 +2050,8 @@ vortex_start_xmit(struct sk_buff *skb, struct net_device *dev) if (vortex_debug > 2) printk(KERN_DEBUG "%s: Tx error, status %2.2x.\n", dev->name, tx_status); - if (tx_status & 0x04) vp->stats.tx_fifo_errors++; - if (tx_status & 0x38) vp->stats.tx_aborted_errors++; + if (tx_status & 0x04) dev->stats.tx_fifo_errors++; + if (tx_status & 0x38) dev->stats.tx_aborted_errors++; if (tx_status & 0x30) { issue_and_wait(dev, TxReset); } @@ -2350,7 +2349,7 @@ boomerang_interrupt(int irq, void *dev_id) } else { printk(KERN_DEBUG "boomerang_interrupt: no skb!\n"); } - /* vp->stats.tx_packets++; Counted below. */ + /* dev->stats.tx_packets++; Counted below. */ dirty_tx++; } vp->dirty_tx = dirty_tx; @@ -2409,12 +2408,12 @@ static int vortex_rx(struct net_device *dev) unsigned char rx_error = ioread8(ioaddr + RxErrors); if (vortex_debug > 2) printk(KERN_DEBUG " Rx error: status %2.2x.\n", rx_error); - vp->stats.rx_errors++; - if (rx_error & 0x01) vp->stats.rx_over_errors++; - if (rx_error & 0x02) vp->stats.rx_length_errors++; - if (rx_error & 0x04) vp->stats.rx_frame_errors++; - if (rx_error & 0x08) vp->stats.rx_crc_errors++; - if (rx_error & 0x10) vp->stats.rx_length_errors++; + dev->stats.rx_errors++; + if (rx_error & 0x01) dev->stats.rx_over_errors++; + if (rx_error & 0x02) dev->stats.rx_length_errors++; + if (rx_error & 0x04) dev->stats.rx_frame_errors++; + if (rx_error & 0x08) dev->stats.rx_crc_errors++; + if (rx_error & 0x10) dev->stats.rx_length_errors++; } else { /* The packet length: up to 4.5K!. */ int pkt_len = rx_status & 0x1fff; @@ -2446,7 +2445,7 @@ static int vortex_rx(struct net_device *dev) skb->protocol = eth_type_trans(skb, dev); netif_rx(skb); dev->last_rx = jiffies; - vp->stats.rx_packets++; + dev->stats.rx_packets++; /* Wait a limited time to go to next packet. */ for (i = 200; i >= 0; i--) if ( ! 
(ioread16(ioaddr + EL3_STATUS) & CmdInProgress)) @@ -2455,7 +2454,7 @@ static int vortex_rx(struct net_device *dev) } else if (vortex_debug > 0) printk(KERN_NOTICE "%s: No memory to allocate a sk_buff of " "size %d.\n", dev->name, pkt_len); - vp->stats.rx_dropped++; + dev->stats.rx_dropped++; } issue_and_wait(dev, RxDiscard); } @@ -2482,12 +2481,12 @@ boomerang_rx(struct net_device *dev) unsigned char rx_error = rx_status >> 16; if (vortex_debug > 2) printk(KERN_DEBUG " Rx error: status %2.2x.\n", rx_error); - vp->stats.rx_errors++; - if (rx_error & 0x01) vp->stats.rx_over_errors++; - if (rx_error & 0x02) vp->stats.rx_length_errors++; - if (rx_error & 0x04) vp->stats.rx_frame_errors++; - if (rx_error & 0x08) vp->stats.rx_crc_errors++; - if (rx_error & 0x10) vp->stats.rx_length_errors++; + dev->stats.rx_errors++; + if (rx_error & 0x01) dev->stats.rx_over_errors++; + if (rx_error & 0x02) dev->stats.rx_length_errors++; + if (rx_error & 0x04) dev->stats.rx_frame_errors++; + if (rx_error & 0x08) dev->stats.rx_crc_errors++; + if (rx_error & 0x10) dev->stats.rx_length_errors++; } else { /* The packet length: up to 4.5K!. */ int pkt_len = rx_status & 0x1fff; @@ -2529,7 +2528,7 @@ boomerang_rx(struct net_device *dev) } netif_rx(skb); dev->last_rx = jiffies; - vp->stats.rx_packets++; + dev->stats.rx_packets++; } entry = (++vp->cur_rx) % RX_RING_SIZE; } @@ -2591,7 +2590,7 @@ vortex_down(struct net_device *dev, int final_down) del_timer_sync(&vp->rx_oom_timer); del_timer_sync(&vp->timer); - /* Turn off statistics ASAP. We update vp->stats below. */ + /* Turn off statistics ASAP. We update dev->stats below. */ iowrite16(StatsDisable, ioaddr + EL3_CMD); /* Disable the receiver and transmitter. */ @@ -2728,7 +2727,7 @@ static struct net_device_stats *vortex_get_stats(struct net_device *dev) update_stats(ioaddr, dev); spin_unlock_irqrestore (&vp->lock, flags); } - return &vp->stats; + return &dev->stats; } /* Update statistics. @@ -2748,18 +2747,18 @@ static void update_stats(void __iomem *ioaddr, struct net_device *dev) /* Unlike the 3c5x9 we need not turn off stats updates while reading. */ /* Switch to the stats window, and read everything. */ EL3WINDOW(6); - vp->stats.tx_carrier_errors += ioread8(ioaddr + 0); - vp->stats.tx_heartbeat_errors += ioread8(ioaddr + 1); - vp->stats.tx_window_errors += ioread8(ioaddr + 4); - vp->stats.rx_fifo_errors += ioread8(ioaddr + 5); - vp->stats.tx_packets += ioread8(ioaddr + 6); - vp->stats.tx_packets += (ioread8(ioaddr + 9)&0x30) << 4; + dev->stats.tx_carrier_errors += ioread8(ioaddr + 0); + dev->stats.tx_heartbeat_errors += ioread8(ioaddr + 1); + dev->stats.tx_window_errors += ioread8(ioaddr + 4); + dev->stats.rx_fifo_errors += ioread8(ioaddr + 5); + dev->stats.tx_packets += ioread8(ioaddr + 6); + dev->stats.tx_packets += (ioread8(ioaddr + 9)&0x30) << 4; /* Rx packets */ ioread8(ioaddr + 7); /* Must read to clear */ /* Don't bother with register 9, an extension of registers 6&7. If we do use the 6&7 values the atomic update assumption above is invalid. 
*/ - vp->stats.rx_bytes += ioread16(ioaddr + 10); - vp->stats.tx_bytes += ioread16(ioaddr + 12); + dev->stats.rx_bytes += ioread16(ioaddr + 10); + dev->stats.tx_bytes += ioread16(ioaddr + 12); /* Extra stats for get_ethtool_stats() */ vp->xstats.tx_multiple_collisions += ioread8(ioaddr + 2); vp->xstats.tx_single_collisions += ioread8(ioaddr + 3); @@ -2767,14 +2766,14 @@ static void update_stats(void __iomem *ioaddr, struct net_device *dev) EL3WINDOW(4); vp->xstats.rx_bad_ssd += ioread8(ioaddr + 12); - vp->stats.collisions = vp->xstats.tx_multiple_collisions + dev->stats.collisions = vp->xstats.tx_multiple_collisions + vp->xstats.tx_single_collisions + vp->xstats.tx_max_collisions; { u8 up = ioread8(ioaddr + 13); - vp->stats.rx_bytes += (up & 0x0f) << 16; - vp->stats.tx_bytes += (up & 0xf0) << 12; + dev->stats.rx_bytes += (up & 0x0f) << 16; + dev->stats.tx_bytes += (up & 0xf0) << 12; } EL3WINDOW(old_window >> 13); -- cgit v0.10.2 From 46fa06170d59b6b9951d09354829d85090f0d911 Mon Sep 17 00:00:00 2001 From: Bruce Robson Date: Fri, 2 May 2008 13:40:53 -0700 Subject: [netdrvr] eexpress: IPv6 fails - multicast problems Taken from http://bugzilla.kernel.org/show_bug.cgi?id=10577 I was unable to access a computer containing an Intel EtherExpress 16 network card using IPv6. I traced this to failure of neighbour discovery. When I used an "ip -6 neigh add" command, on the computer attempting access, to insert a binding between the IPv6 address of the computer with the Intel EtherExpress 16 network card and the card's ethernet address, I was able to access that computer using IPv6. Neighbour discovery requires working multicast. The driver source file eexpress.c contains an approximately 30 line function eexp_setup_filter used when loading multicast addresses. I found 3 problems in this function: 1) It wrote the number of multicast addresses to the card instead of the number of bytes in the multicast addresses. 2) When loading multiple multicast addresses it loaded the first one provided multiple times instead of loading each one once. 3) The setting of pointer 'data' from 'dmi->dmi_addr' occurred before the test for the error situation of 'dmi' being NULL. Correcting these problems allows the computer with the Intel EtherExpress 16 network card to be found by IPv6 neighbour discovery. p.s.
There is some information on the Intel EtherExpress 16 at http://www.intel.com/support/etherexpress/vintage/sb/cs-013500.htm Datasheet for the Intel 82586 ethernet controller used by the card http://www.datasheetcatalog.com/datasheets_pdf/8/2/5/8/82586.shtml Signed-off-by: Bruce Robson Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Jeff Garzik diff --git a/drivers/net/eexpress.c b/drivers/net/eexpress.c index 2eb82ab..795c594 100644 --- a/drivers/net/eexpress.c +++ b/drivers/net/eexpress.c @@ -202,7 +202,7 @@ static unsigned short start_code[] = { 0x0000,Cmd_MCast, 0x0076, /* link to next command */ #define CONF_NR_MULTICAST 0x44 - 0x0000, /* number of multicast addresses */ + 0x0000, /* number of bytes in multicast address(es) */ #define CONF_MULTICAST 0x46 0x0000, 0x0000, 0x0000, /* some addresses */ 0x0000, 0x0000, 0x0000, @@ -1569,7 +1569,7 @@ static void eexp_hw_init586(struct net_device *dev) static void eexp_setup_filter(struct net_device *dev) { - struct dev_mc_list *dmi = dev->mc_list; + struct dev_mc_list *dmi; unsigned short ioaddr = dev->base_addr; int count = dev->mc_count; int i; @@ -1580,9 +1580,9 @@ static void eexp_setup_filter(struct net_device *dev) } outw(CONF_NR_MULTICAST & ~31, ioaddr+SM_PTR); - outw(count, ioaddr+SHADOW(CONF_NR_MULTICAST)); - for (i = 0; i < count; i++) { - unsigned short *data = (unsigned short *)dmi->dmi_addr; + outw(6*count, ioaddr+SHADOW(CONF_NR_MULTICAST)); + for (i = 0, dmi = dev->mc_list; i < count; i++, dmi = dmi->next) { + unsigned short *data; if (!dmi) { printk(KERN_INFO "%s: too few multicast addresses\n", dev->name); break; @@ -1591,6 +1591,7 @@ static void eexp_setup_filter(struct net_device *dev) printk(KERN_INFO "%s: invalid multicast address length given.\n", dev->name); continue; } + data = (unsigned short *)dmi->dmi_addr; outw((CONF_MULTICAST+(6*i)) & ~31, ioaddr+SM_PTR); outw(data[0], ioaddr+SHADOW(CONF_MULTICAST+(6*i))); outw((CONF_MULTICAST+(6*i)+2) & ~31, ioaddr+SM_PTR); -- cgit v0.10.2 From a86e2cbe263c193a70b2e5c5a0c7e53ed39fc0ad Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 2 May 2008 13:42:41 -0500 Subject: fs_enet: Fix a memory leak in fs_enet_mdio_probe There are more memory leaks in the !PPC_CPM_NEW_BINDING case, but that code will disappear soon along with arch/ppc. Reported by Daniel Marjamki at http://bugzilla.kernel.org/show_bug.cgi?id=10591 Signed-off-by: Scott Wood Signed-off-by: Jeff Garzik diff --git a/drivers/net/fs_enet/mii-fec.c b/drivers/net/fs_enet/mii-fec.c index ba75efc..f0014cf 100644 --- a/drivers/net/fs_enet/mii-fec.c +++ b/drivers/net/fs_enet/mii-fec.c @@ -194,7 +194,7 @@ static int __devinit fs_enet_mdio_probe(struct of_device *ofdev, ret = of_address_to_resource(ofdev->node, 0, &res); if (ret) - return ret; + goto out_res; snprintf(new_bus->id, MII_BUS_ID_SIZE, "%x", res.start); @@ -236,6 +236,7 @@ out_free_irqs: kfree(new_bus->irq); out_unmap_regs: iounmap(fec->fecp); +out_res: out_fec: kfree(fec); out_mii: -- cgit v0.10.2 From 01935d7d2c544a5dfc8313f79ed164d45115aa33 Mon Sep 17 00:00:00 2001 From: Don Fry Date: Tue, 29 Apr 2008 13:49:58 -0700 Subject: pcnet32: delete non NAPI code from driver. Delete the non-napi code from the driver and Kconfig. Tested x86_64. Apply at next open opportunity. Signed-off-by: Don Fry Signed-off-by: Jeff Garzik diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index af46341..d27f54a 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -1273,20 +1273,6 @@ config PCNET32 To compile this driver as a module, choose M here. 
The module will be called pcnet32. -config PCNET32_NAPI - bool "Use RX polling (NAPI)" - depends on PCNET32 - help - NAPI is a new driver API designed to reduce CPU and interrupt load - when the driver is receiving lots of packets from the card. It is - still somewhat experimental and thus not yet enabled by default. - - If your estimated Rx load is 10kpps or more, or if the card will be - deployed on potentially unfriendly networks (e.g. in a firewall), - then say Y here. - - If in doubt, say N. - config AMD8111_ETH tristate "AMD 8111 (new PCI lance) support" depends on NET_PCI && PCI diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c index 4eb322e..a1c454d 100644 --- a/drivers/net/pcnet32.c +++ b/drivers/net/pcnet32.c @@ -22,12 +22,8 @@ *************************************************************************/ #define DRV_NAME "pcnet32" -#ifdef CONFIG_PCNET32_NAPI -#define DRV_VERSION "1.34-NAPI" -#else -#define DRV_VERSION "1.34" -#endif -#define DRV_RELDATE "14.Aug.2007" +#define DRV_VERSION "1.35" +#define DRV_RELDATE "21.Apr.2008" #define PFX DRV_NAME ": " static const char *const version = @@ -445,30 +441,24 @@ static struct pcnet32_access pcnet32_dwio = { static void pcnet32_netif_stop(struct net_device *dev) { -#ifdef CONFIG_PCNET32_NAPI struct pcnet32_private *lp = netdev_priv(dev); -#endif + dev->trans_start = jiffies; -#ifdef CONFIG_PCNET32_NAPI napi_disable(&lp->napi); -#endif netif_tx_disable(dev); } static void pcnet32_netif_start(struct net_device *dev) { -#ifdef CONFIG_PCNET32_NAPI struct pcnet32_private *lp = netdev_priv(dev); ulong ioaddr = dev->base_addr; u16 val; -#endif + netif_wake_queue(dev); -#ifdef CONFIG_PCNET32_NAPI val = lp->a.read_csr(ioaddr, CSR3); val &= 0x00ff; lp->a.write_csr(ioaddr, CSR3, val); napi_enable(&lp->napi); -#endif } /* @@ -911,11 +901,7 @@ static int pcnet32_loopback_test(struct net_device *dev, uint64_t * data1) rc = 1; /* default to fail */ if (netif_running(dev)) -#ifdef CONFIG_PCNET32_NAPI pcnet32_netif_stop(dev); -#else - pcnet32_close(dev); -#endif spin_lock_irqsave(&lp->lock, flags); lp->a.write_csr(ioaddr, CSR0, CSR0_STOP); /* stop the chip */ @@ -1046,7 +1032,6 @@ static int pcnet32_loopback_test(struct net_device *dev, uint64_t * data1) x = a->read_bcr(ioaddr, 32); /* reset internal loopback */ a->write_bcr(ioaddr, 32, (x & ~0x0002)); -#ifdef CONFIG_PCNET32_NAPI if (netif_running(dev)) { pcnet32_netif_start(dev); pcnet32_restart(dev, CSR0_NORMAL); @@ -1055,16 +1040,6 @@ static int pcnet32_loopback_test(struct net_device *dev, uint64_t * data1) lp->a.write_bcr(ioaddr, 20, 4); /* return to 16bit mode */ } spin_unlock_irqrestore(&lp->lock, flags); -#else - if (netif_running(dev)) { - spin_unlock_irqrestore(&lp->lock, flags); - pcnet32_open(dev); - } else { - pcnet32_purge_rx_ring(dev); - lp->a.write_bcr(ioaddr, 20, 4); /* return to 16bit mode */ - spin_unlock_irqrestore(&lp->lock, flags); - } -#endif return (rc); } /* end pcnet32_loopback_test */ @@ -1270,11 +1245,7 @@ static void pcnet32_rx_entry(struct net_device *dev, } dev->stats.rx_bytes += skb->len; skb->protocol = eth_type_trans(skb, dev); -#ifdef CONFIG_PCNET32_NAPI netif_receive_skb(skb); -#else - netif_rx(skb); -#endif dev->last_rx = jiffies; dev->stats.rx_packets++; return; @@ -1403,7 +1374,6 @@ static int pcnet32_tx(struct net_device *dev) return must_restart; } -#ifdef CONFIG_PCNET32_NAPI static int pcnet32_poll(struct napi_struct *napi, int budget) { struct pcnet32_private *lp = container_of(napi, struct pcnet32_private, napi); @@ -1442,7 +1412,6 @@ static int 
pcnet32_poll(struct napi_struct *napi, int budget) } return work_done; } -#endif #define PCNET32_REGS_PER_PHY 32 #define PCNET32_MAX_PHYS 32 @@ -1864,9 +1833,7 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) /* napi.weight is used in both the napi and non-napi cases */ lp->napi.weight = lp->rx_ring_size / 2; -#ifdef CONFIG_PCNET32_NAPI netif_napi_add(dev, &lp->napi, pcnet32_poll, lp->rx_ring_size / 2); -#endif if (fdx && !(lp->options & PCNET32_PORT_ASEL) && ((cards_found >= MAX_UNITS) || full_duplex[cards_found])) @@ -2297,9 +2264,7 @@ static int pcnet32_open(struct net_device *dev) goto err_free_ring; } -#ifdef CONFIG_PCNET32_NAPI napi_enable(&lp->napi); -#endif /* Re-initialize the PCNET32, and start it when done. */ lp->a.write_csr(ioaddr, 1, (lp->init_dma_addr & 0xffff)); @@ -2623,7 +2588,6 @@ pcnet32_interrupt(int irq, void *dev_id) dev->name, csr0); /* unlike for the lance, there is no restart needed */ } -#ifdef CONFIG_PCNET32_NAPI if (netif_rx_schedule_prep(dev, &lp->napi)) { u16 val; /* set interrupt masks */ @@ -2634,24 +2598,9 @@ pcnet32_interrupt(int irq, void *dev_id) __netif_rx_schedule(dev, &lp->napi); break; } -#else - pcnet32_rx(dev, lp->napi.weight); - if (pcnet32_tx(dev)) { - /* reset the chip to clear the error condition, then restart */ - lp->a.reset(ioaddr); - lp->a.write_csr(ioaddr, CSR4, 0x0915); /* auto tx pad */ - pcnet32_restart(dev, CSR0_START); - netif_wake_queue(dev); - } -#endif csr0 = lp->a.read_csr(ioaddr, CSR0); } -#ifndef CONFIG_PCNET32_NAPI - /* Set interrupt enable. */ - lp->a.write_csr(ioaddr, CSR0, CSR0_INTEN); -#endif - if (netif_msg_intr(lp)) printk(KERN_DEBUG "%s: exiting interrupt, csr0=%#4.4x.\n", dev->name, lp->a.read_csr(ioaddr, CSR0)); @@ -2670,9 +2619,7 @@ static int pcnet32_close(struct net_device *dev) del_timer_sync(&lp->watchdog_timer); netif_stop_queue(dev); -#ifdef CONFIG_PCNET32_NAPI napi_disable(&lp->napi); -#endif spin_lock_irqsave(&lp->lock, flags); -- cgit v0.10.2 From 1b3aa7afb60d34867eea5e73ee943b2a026fc47c Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 29 Apr 2008 14:29:30 +0100 Subject: cxgb3: Use CAP_SYS_RAWIO for firmware Otherwise theoretically at least CAP_NET_ADMIN Reload new firmware Wait.. Firmware patches kernel So it should be CAY_SYS_RAWIO - not that I suspect this is in fact a credible attack vector! Signed-off-by: Alan Cox Signed-off-by: Jeff Garzik diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c index 05e5f59e..ce949d5 100644 --- a/drivers/net/cxgb3/cxgb3_main.c +++ b/drivers/net/cxgb3/cxgb3_main.c @@ -1894,11 +1894,11 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) u8 *fw_data; struct ch_mem_range t; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_SYS_RAWIO)) return -EPERM; if (copy_from_user(&t, useraddr, sizeof(t))) return -EFAULT; - + /* Check t.len sanity ? */ fw_data = kmalloc(t.len, GFP_KERNEL); if (!fw_data) return -ENOMEM; -- cgit v0.10.2 From e410553fd35afd6d290b65e02dc501722406377d Mon Sep 17 00:00:00 2001 From: Joakim Tjernlund Date: Tue, 29 Apr 2008 13:03:57 +0200 Subject: ucc_geth: Don't use RX clock as TX clock. Commit 9fb1e350e16164d56990dde036ae9c0a2fd3f634, ucc_geth: use rx-clock-name and tx-clock-name device tree properties Introduced a typo that made the driver use the RX clock as TX clock, causing massive TX errors. 
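For context, the intended lookup order can be sketched as follows (a hedged illustration with a hypothetical helper name, example_get_tx_clock(), and simplified error handling -- not the driver code, which is in the diff below): the TX path must fall back to the legacy "tx-clock" property, never "rx-clock".

/* Hedged sketch, not the driver verbatim: example_get_tx_clock() is a
 * hypothetical helper showing only the property lookup order. */
#include <linux/of.h>

static int example_get_tx_clock(struct device_node *np, unsigned int *clk)
{
	const unsigned int *prop;

	/* preferred new-style binding, resolved by name elsewhere in probe */
	if (of_get_property(np, "tx-clock-name", NULL))
		return 0;

	/* legacy binding: must read "tx-clock", not "rx-clock" */
	prop = of_get_property(np, "tx-clock", NULL);
	if (!prop)
		return -EINVAL;

	*clk = *prop;
	return 0;
}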
Signed-off-by: Joakim Tjernlund Signed-off-by: Jeff Garzik diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c index b3bbb2d..ca0bdac 100644 --- a/drivers/net/ucc_geth.c +++ b/drivers/net/ucc_geth.c @@ -3922,7 +3922,7 @@ static int ucc_geth_probe(struct of_device* ofdev, const struct of_device_id *ma return -EINVAL; } } else { - prop = of_get_property(np, "rx-clock", NULL); + prop = of_get_property(np, "tx-clock", NULL); if (!prop) { printk(KERN_ERR "ucc_geth: mising tx-clock-name property\n"); -- cgit v0.10.2 From f227ec3ca2b7be449fb2156e82b40cceed87a34a Mon Sep 17 00:00:00 2001 From: "Kok, Auke" Date: Tue, 29 Apr 2008 11:18:55 -0700 Subject: e1000e: don't return half-read eeprom on error On a read error, e1000e might have returned uninitialized block of eeprom data back to userspace. The convention is that 0xff is "empty", so mark the entire eeprom as empty in case of an error. Signed-off-by: Auke Kok Signed-off-by: Jeff Garzik diff --git a/drivers/net/e1000e/ethtool.c b/drivers/net/e1000e/ethtool.c index 2bb6da0..a14561f 100644 --- a/drivers/net/e1000e/ethtool.c +++ b/drivers/net/e1000e/ethtool.c @@ -494,8 +494,12 @@ static int e1000_get_eeprom(struct net_device *netdev, for (i = 0; i < last_word - first_word + 1; i++) { ret_val = e1000_read_nvm(hw, first_word + i, 1, &eeprom_buff[i]); - if (ret_val) + if (ret_val) { + /* a read error occurred, throw away the + * result */ + memset(eeprom_buff, 0xff, sizeof(eeprom_buff)); break; + } } } -- cgit v0.10.2 From dca3c33652e437ed02c30ed3eca3cecd0cc00838 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Tue, 29 Apr 2008 17:02:20 +0200 Subject: [PATCH] fix reservation discarding in affs - remove affs_put_inode, so preallocations aren't discared unnecessarily often. - remove affs_drop_inode, it's called with a spinlock held, so it can't use a mutex. - make i_opencnt atomic - avoid direct b_count manipulations - a few allocation failure fixes, so that these are more gracefully handled now. Signed-off-by: Roman Zippel Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/fs/affs/affs.h b/fs/affs/affs.h index d5bd497..223b191 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -48,7 +48,7 @@ struct affs_ext_key { * affs fs inode data in memory */ struct affs_inode_info { - u32 i_opencnt; + atomic_t i_opencnt; struct semaphore i_link_lock; /* Protects internal inode access. */ struct semaphore i_ext_lock; /* Protects internal inode access. 
*/ #define i_hash_lock i_ext_lock @@ -170,8 +170,6 @@ extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry, extern unsigned long affs_parent_ino(struct inode *dir); extern struct inode *affs_new_inode(struct inode *dir); extern int affs_notify_change(struct dentry *dentry, struct iattr *attr); -extern void affs_put_inode(struct inode *inode); -extern void affs_drop_inode(struct inode *inode); extern void affs_delete_inode(struct inode *inode); extern void affs_clear_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, diff --git a/fs/affs/file.c b/fs/affs/file.c index 1a4f092..6eac7bd 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -48,8 +48,9 @@ affs_file_open(struct inode *inode, struct file *filp) { if (atomic_read(&filp->f_count) != 1) return 0; - pr_debug("AFFS: open(%d)\n", AFFS_I(inode)->i_opencnt); - AFFS_I(inode)->i_opencnt++; + pr_debug("AFFS: open(%lu,%d)\n", + inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); + atomic_inc(&AFFS_I(inode)->i_opencnt); return 0; } @@ -58,10 +59,16 @@ affs_file_release(struct inode *inode, struct file *filp) { if (atomic_read(&filp->f_count) != 0) return 0; - pr_debug("AFFS: release(%d)\n", AFFS_I(inode)->i_opencnt); - AFFS_I(inode)->i_opencnt--; - if (!AFFS_I(inode)->i_opencnt) + pr_debug("AFFS: release(%lu, %d)\n", + inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); + + if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { + mutex_lock(&inode->i_mutex); + if (inode->i_size != AFFS_I(inode)->mmu_private) + affs_truncate(inode); affs_free_prealloc(inode); + mutex_unlock(&inode->i_mutex); + } return 0; } @@ -180,7 +187,7 @@ affs_get_extblock(struct inode *inode, u32 ext) /* inline the simplest case: same extended block as last time */ struct buffer_head *bh = AFFS_I(inode)->i_ext_bh; if (ext == AFFS_I(inode)->i_ext_last) - atomic_inc(&bh->b_count); + get_bh(bh); else /* we have to do more (not inlined) */ bh = affs_get_extblock_slow(inode, ext); @@ -306,7 +313,7 @@ store_ext: affs_brelse(AFFS_I(inode)->i_ext_bh); AFFS_I(inode)->i_ext_last = ext; AFFS_I(inode)->i_ext_bh = bh; - atomic_inc(&bh->b_count); + get_bh(bh); return bh; @@ -324,7 +331,6 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block); - BUG_ON(block > (sector_t)0x7fffffffUL); if (block >= AFFS_I(inode)->i_blkcnt) { @@ -827,6 +833,8 @@ affs_truncate(struct inode *inode) res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); if (!res) res = mapping->a_ops->write_end(NULL, mapping, size, 0, 0, page, fsdata); + else + inode->i_size = AFFS_I(inode)->mmu_private; mark_inode_dirty(inode); return; } else if (inode->i_size == AFFS_I(inode)->mmu_private) @@ -862,6 +870,7 @@ affs_truncate(struct inode *inode) blk++; } else AFFS_HEAD(ext_bh)->first_data = 0; + AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(i); size = AFFS_SB(sb)->s_hashsize; if (size > blkcnt - blk + i) size = blkcnt - blk + i; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 27fe6cb..a13b334 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -58,7 +58,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) AFFS_I(inode)->i_extcnt = 1; AFFS_I(inode)->i_ext_last = ~1; AFFS_I(inode)->i_protect = prot; - AFFS_I(inode)->i_opencnt = 0; + atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; AFFS_I(inode)->i_lc = NULL; AFFS_I(inode)->i_lc_size = 0; @@ -108,8 +108,6 @@ struct inode *affs_iget(struct 
super_block *sb, unsigned long ino) inode->i_mode |= S_IFDIR; } else inode->i_mode = S_IRUGO | S_IXUGO | S_IWUSR | S_IFDIR; - if (tail->link_chain) - inode->i_nlink = 2; /* Maybe it should be controlled by mount parameter? */ //inode->i_mode |= S_ISVTX; inode->i_op = &affs_dir_inode_operations; @@ -245,31 +243,12 @@ out: } void -affs_put_inode(struct inode *inode) -{ - pr_debug("AFFS: put_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); - affs_free_prealloc(inode); -} - -void -affs_drop_inode(struct inode *inode) -{ - mutex_lock(&inode->i_mutex); - if (inode->i_size != AFFS_I(inode)->mmu_private) - affs_truncate(inode); - mutex_unlock(&inode->i_mutex); - - generic_drop_inode(inode); -} - -void affs_delete_inode(struct inode *inode) { pr_debug("AFFS: delete_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); truncate_inode_pages(&inode->i_data, 0); inode->i_size = 0; - if (S_ISREG(inode->i_mode)) - affs_truncate(inode); + affs_truncate(inode); clear_inode(inode); affs_free_block(inode->i_sb, inode->i_ino); } @@ -277,9 +256,12 @@ affs_delete_inode(struct inode *inode) void affs_clear_inode(struct inode *inode) { - unsigned long cache_page = (unsigned long) AFFS_I(inode)->i_lc; + unsigned long cache_page; pr_debug("AFFS: clear_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); + + affs_free_prealloc(inode); + cache_page = (unsigned long)AFFS_I(inode)->i_lc; if (cache_page) { pr_debug("AFFS: freeing ext cache\n"); AFFS_I(inode)->i_lc = NULL; @@ -316,7 +298,7 @@ affs_new_inode(struct inode *dir) inode->i_ino = block; inode->i_nlink = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; - AFFS_I(inode)->i_opencnt = 0; + atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; AFFS_I(inode)->i_lc = NULL; AFFS_I(inode)->i_lc_size = 0; @@ -369,12 +351,12 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 switch (type) { case ST_LINKFILE: case ST_LINKDIR: - inode_bh = bh; retval = -ENOSPC; block = affs_alloc_block(dir, dir->i_ino); if (!block) goto err; retval = -EIO; + inode_bh = bh; bh = affs_getzeroblk(sb, block); if (!bh) goto err; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 2218f1e..cfcf1b6 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -234,7 +234,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) int affs_unlink(struct inode *dir, struct dentry *dentry) { - pr_debug("AFFS: unlink(dir=%d, \"%.*s\")\n", (u32)dir->i_ino, + pr_debug("AFFS: unlink(dir=%d, %lu \"%.*s\")\n", (u32)dir->i_ino, + dentry->d_inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name); return affs_remove_header(dentry); @@ -302,7 +303,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) int affs_rmdir(struct inode *dir, struct dentry *dentry) { - pr_debug("AFFS: rmdir(dir=%u, \"%.*s\")\n", (u32)dir->i_ino, + pr_debug("AFFS: rmdir(dir=%u, %lu \"%.*s\")\n", (u32)dir->i_ino, + dentry->d_inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name); return affs_remove_header(dentry); diff --git a/fs/affs/super.c b/fs/affs/super.c index 01d25d532..d214837 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -71,12 +71,18 @@ static struct kmem_cache * affs_inode_cachep; static struct inode *affs_alloc_inode(struct super_block *sb) { - struct affs_inode_info *ei; - ei = (struct affs_inode_info *)kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL); - if (!ei) + struct affs_inode_info *i; + + i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL); + if (!i) return NULL; - 
ei->vfs_inode.i_version = 1; - return &ei->vfs_inode; + + i->vfs_inode.i_version = 1; + i->i_lc = NULL; + i->i_ext_bh = NULL; + i->i_pa_cnt = 0; + + return &i->vfs_inode; } static void affs_destroy_inode(struct inode *inode) @@ -114,8 +120,6 @@ static const struct super_operations affs_sops = { .alloc_inode = affs_alloc_inode, .destroy_inode = affs_destroy_inode, .write_inode = affs_write_inode, - .put_inode = affs_put_inode, - .drop_inode = affs_drop_inode, .delete_inode = affs_delete_inode, .clear_inode = affs_clear_inode, .put_super = affs_put_super, -- cgit v0.10.2 From 33dcdac2df54e66c447ae03f58c95c7251aa5649 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Apr 2008 17:46:26 +0200 Subject: [PATCH] kill ->put_inode And with that last patch to affs killing the last put_inode instance we can finally, after many years of transition kill this racy and awkward interface. (It's kinda funny that even the description in Documentation/filesystems/vfs.txt was entirely wrong..) Also remove a very misleading comment above the defintion of struct super_operations. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index c2992bc..8b22d7d8 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -92,7 +92,6 @@ prototypes: void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); int (*write_inode) (struct inode *, int); - void (*put_inode) (struct inode *); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); @@ -115,7 +114,6 @@ alloc_inode: no no no destroy_inode: no dirty_inode: no (must not sleep) write_inode: no -put_inode: no drop_inode: no !!!inode_lock!!! delete_inode: no put_super: yes yes no diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 81e5be6..b7522c6 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -205,7 +205,6 @@ struct super_operations { void (*dirty_inode) (struct inode *); int (*write_inode) (struct inode *, int); - void (*put_inode) (struct inode *); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); @@ -246,9 +245,6 @@ or bottom half). inode to disc. The second parameter indicates whether the write should be synchronous or not, not all filesystems check this flag. - put_inode: called when the VFS inode is removed from the inode - cache. - drop_inode: called when the last access to the inode is dropped, with the inode_lock spinlock held. diff --git a/fs/inode.c b/fs/inode.c index bf64781..18bdce1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1153,9 +1153,6 @@ void iput(struct inode *inode) BUG_ON(inode->i_state == I_CLEAR); - if (op && op->put_inode) - op->put_inode(inode); - if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) iput_final(inode); } diff --git a/include/linux/fs.h b/include/linux/fs.h index a1ba005..7e0fa9e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1289,17 +1289,12 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *, extern ssize_t vfs_writev(struct file *, const struct iovec __user *, unsigned long, loff_t *); -/* - * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called - * without the big kernel lock held in all filesystems. 
- */ struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); int (*write_inode) (struct inode *, int); - void (*put_inode) (struct inode *); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); -- cgit v0.10.2 From 0b2bac2f1ea0d33a3621b27ca68b9ae760fca2e9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 6 May 2008 13:58:34 -0400 Subject: [PATCH] fix SMP ordering hole in fcntl_setlk() fcntl_setlk()/close() race prevention has a subtle hole - we need to make sure that if we *do* have an fcntl/close race on SMP box, the access to descriptor table and inode->i_flock won't get reordered. As it is, we get STORE inode->i_flock, LOAD descriptor table entry vs. STORE descriptor table entry, LOAD inode->i_flock with not a single lock in common on both sides. We do have BKL around the first STORE, but check in locks_remove_posix() is outside of BKL and for a good reason - we don't want BKL on common path of close(2). Solution is to hold ->file_lock around fcheck() in there; that orders us wrt removal from descriptor table that preceded locks_remove_posix() on close path and we either come first (in which case eviction will be handled by the close side) or we'll see the effect of close and do eviction ourselves. Note that even though it's read-only access, we do need ->file_lock here - rcu_read_lock() won't be enough to order the things. Signed-off-by: Al Viro diff --git a/fs/locks.c b/fs/locks.c index 663c069..0ac6b92 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1753,6 +1753,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct file_lock *file_lock = locks_alloc_lock(); struct flock flock; struct inode *inode; + struct file *f; int error; if (file_lock == NULL) @@ -1825,7 +1826,15 @@ again: * Attempt to detect a close/fcntl race and recover by * releasing the lock that was just acquired. */ - if (!error && fcheck(fd) != filp && flock.l_type != F_UNLCK) { + /* + * we need that spin_lock here - it prevents reordering between + * update of inode->i_flock and check for it done in close(). + * rcu_read_lock() wouldn't do. + */ + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (!error && f != filp && flock.l_type != F_UNLCK) { flock.l_type = F_UNLCK; goto again; } @@ -1881,6 +1890,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, struct file_lock *file_lock = locks_alloc_lock(); struct flock64 flock; struct inode *inode; + struct file *f; int error; if (file_lock == NULL) @@ -1953,7 +1963,10 @@ again: * Attempt to detect a close/fcntl race and recover by * releasing the lock that was just acquired. */ - if (!error && fcheck(fd) != filp && flock.l_type != F_UNLCK) { + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (!error && f != filp && flock.l_type != F_UNLCK) { flock.l_type = F_UNLCK; goto again; } -- cgit v0.10.2 From aeed5fce37196e09b4dac3a1c00d8b7122e040ce Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 May 2008 20:49:23 +0100 Subject: x86: fix PAE pmd_bad bootup warning Fix warning from pmd_bad() at bootup on a HIGHMEM64G HIGHPTE x86_32. That came from 9fc34113f6880b215cbea4e7017fc818700384c2 x86: debug pmd_bad(); but we understand now that the typecasting was wrong for PAE in the previous version: pagetable pages above 4GB looked bad and stopped Arjan from booting. 
And revert that cded932b75ab0a5f9181ee3da34a0a488d1a14fd x86: fix pmd_bad and pud_bad to support huge pages. It was the wrong way round: we shouldn't weaken every pmd_bad and pud_bad check to let huge pages slip through - in part they check that we _don't_ have a huge page where it's not expected. Put the x86 pmd_bad() and pud_bad() definitions back to what they have long been: they can be improved (x86_32 should use PTE_MASK, to stop PAE thinking junk in the upper word is good; and x86_64 should follow x86_32's stricter comparison, to stop thinking any subset of required bits is good); but that should be a later patch. Fix Hans' good observation that follow_page() will never find pmd_huge() because that would have already failed the pmd_bad test: test pmd_huge in between the pmd_none and pmd_bad tests. Tighten x86's pmd_huge() check? No, once it's a hugepage entry, it can get quite far from a good pmd: for example, PROT_NONE leaves it with only ACCESSED of the KERN_PGTABLE bits. However... though follow_page() contains this and another test for huge pages, so it's nice to keep it working on them, where does it actually get called on a huge page? get_user_pages() checks is_vm_hugetlb_page(vma) to to call alternative hugetlb processing, as does unmap_vmas() and others. Signed-off-by: Hugh Dickins Earlier-version-tested-by: Ingo Molnar Cc: Thomas Gleixner Cc: Jeff Chua Cc: Hans Rosenfeld Cc: Arjan van de Ven Signed-off-by: Linus Torvalds diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 9ee007b..369cf06 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -172,10 +172,3 @@ void reserve_top_address(unsigned long reserve) __FIXADDR_TOP = -reserve - PAGE_SIZE; __VMALLOC_RESERVE += reserve; } - -int pmd_bad(pmd_t pmd) -{ - WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); - - return pmd_bad_v1(pmd); -} diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h index 577ab79..d7f0403 100644 --- a/include/asm-x86/pgtable_32.h +++ b/include/asm-x86/pgtable_32.h @@ -88,14 +88,7 @@ extern unsigned long pg0[]; /* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ #define pmd_none(x) (!(unsigned long)pmd_val((x))) #define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT) - -extern int pmd_bad(pmd_t pmd); - -#define pmd_bad_v1(x) \ - (_KERNPG_TABLE != (pmd_val((x)) & ~(PAGE_MASK | _PAGE_USER))) -#define pmd_bad_v2(x) \ - (_KERNPG_TABLE != (pmd_val((x)) & ~(PAGE_MASK | _PAGE_USER | \ - _PAGE_PSE | _PAGE_NX))) +#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h index a3bbf87..efe83dc 100644 --- a/include/asm-x86/pgtable_64.h +++ b/include/asm-x86/pgtable_64.h @@ -158,14 +158,12 @@ static inline unsigned long pgd_bad(pgd_t pgd) static inline unsigned long pud_bad(pud_t pud) { - return pud_val(pud) & - ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER | _PAGE_PSE | _PAGE_NX); + return pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); } static inline unsigned long pmd_bad(pmd_t pmd) { - return pmd_val(pmd) & - ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER | _PAGE_PSE | _PAGE_NX); + return pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); } #define pte_none(x) (!pte_val((x))) diff --git a/mm/memory.c b/mm/memory.c index bbab1e37..48c122d4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -969,7 +969,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto 
no_page_table; pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + if (pmd_none(*pmd)) goto no_page_table; if (pmd_huge(*pmd)) { @@ -978,6 +978,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto out; } + if (unlikely(pmd_bad(*pmd))) + goto no_page_table; + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) goto out; -- cgit v0.10.2 From 6ce07c7b61e74af35a05060a2d6341f68fd92c9e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 May 2008 13:13:37 -0700 Subject: VFS: fix unused variable warning Commit 33dcdac2df54e66c447ae03f58c95c7251aa5649 ("kill ->put_inode") removed the final use of i_op->put_inode, but left the now totally unused "op" variable in iput(). Get rid of it. Signed-off-by: Linus Torvalds diff --git a/fs/inode.c b/fs/inode.c index 18bdce1..c36d948 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1149,8 +1149,6 @@ static inline void iput_final(struct inode *inode) void iput(struct inode *inode) { if (inode) { - const struct super_operations *op = inode->i_sb->s_op; - BUG_ON(inode->i_state == I_CLEAR); if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) -- cgit v0.10.2 From db176c6ed8974fae94328ad5ac9e70b094ff22fd Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Wed, 7 May 2008 04:02:53 +0900 Subject: Fix bogus warning in sysdev_driver_register() if ((drv->entry.next != drv->entry.prev) || (drv->entry.next != NULL)) { warns list_empty(&drv->entry). Signed-off-by: OGAWA Hirofumi Cc: Greg KH Cc: Len Brown [ Version 2 totally redone based on suggestions from Linus & Greg ] Signed-off-by: Linus Torvalds diff --git a/drivers/base/sys.c b/drivers/base/sys.c index 4fbb56b..358bb0b 100644 --- a/drivers/base/sys.c +++ b/drivers/base/sys.c @@ -175,8 +175,7 @@ int sysdev_driver_register(struct sysdev_class *cls, struct sysdev_driver *drv) } /* Check whether this driver has already been added to a class. */ - if ((drv->entry.next != drv->entry.prev) || - (drv->entry.next != NULL)) { + if (drv->entry.next && !list_empty(&drv->entry)) { printk(KERN_WARNING "sysdev: class %s: driver (%p) has already" " been registered to a class, something is wrong, but " "will forge on!\n", cls->name, drv); -- cgit v0.10.2 From 0e9913362a967377eb886bbdf305ec58aa07a878 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 6 May 2008 15:03:38 -0700 Subject: RDMA/cxgb3: Don't add PBL memory to gen_pool in chunks Current iw_cxgb3 code adds PBL memory to the driver's gen_pool in 2 MB chunks. This limits the largest single allocation that can be done to the same size, which means that with 4 KB pages, each of which takes 8 bytes of PBL memory, the largest memory region that can be allocated is 1 GB (256K PBL entries * 4 KB/entry). Remove this limit by adding all the PBL memory in a single gen_pool chunk, if possible. Add code that falls back to smaller chunks if gen_pool_add() fails, which can happen if there is not sufficient contiguous lowmem for the internal gen_pool bitmap. 
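The fall-back can be sketched with the generic genalloc API as follows (a hedged illustration; example_pblpool_create() and its bounds handling are simplified assumptions, the real code is in the diff below): try one chunk spanning the whole PBL window, and halve the chunk size each time gen_pool_add() cannot set up its internal bookkeeping, accepting a partial pool once the chunk reaches a sane minimum.

/* Hedged sketch of the chunked gen_pool setup; pbl_base/pbl_top stand for
 * the adapter's PBL window, 8 for MIN_PBL_SHIFT as in the driver. */
#include <linux/kernel.h>
#include <linux/genalloc.h>

static int example_pblpool_create(struct gen_pool **poolp, u32 pbl_base, u32 pbl_top)
{
	struct gen_pool *pool = gen_pool_create(8, -1);
	u32 start = pbl_base;
	u32 chunk = pbl_top - pbl_base + 1;

	if (!pool)
		return -ENOMEM;

	while (start < pbl_top) {
		chunk = min(pbl_top - start + 1, chunk);
		if (gen_pool_add(pool, start, chunk, -1)) {
			/* chunk bookkeeping allocation failed: retry smaller */
			if (chunk <= 1024 << 8)
				break;	/* 256 KB floor: accept what we have */
			chunk >>= 1;
		} else {
			start += chunk;
		}
	}

	*poolp = pool;
	return 0;
}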
Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c index 45ed4f2..bd233c0 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_resource.c +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -250,7 +250,6 @@ void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) */ #define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ -#define PBL_CHUNK 2*1024*1024 u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size) { @@ -267,14 +266,35 @@ void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p) { - unsigned long i; + unsigned pbl_start, pbl_chunk; + rdev_p->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1); - if (rdev_p->pbl_pool) - for (i = rdev_p->rnic_info.pbl_base; - i <= rdev_p->rnic_info.pbl_top - PBL_CHUNK + 1; - i += PBL_CHUNK) - gen_pool_add(rdev_p->pbl_pool, i, PBL_CHUNK, -1); - return rdev_p->pbl_pool ? 0 : -ENOMEM; + if (!rdev_p->pbl_pool) + return -ENOMEM; + + pbl_start = rdev_p->rnic_info.pbl_base; + pbl_chunk = rdev_p->rnic_info.pbl_top - pbl_start + 1; + + while (pbl_start < rdev_p->rnic_info.pbl_top) { + pbl_chunk = min(rdev_p->rnic_info.pbl_top - pbl_start + 1, + pbl_chunk); + if (gen_pool_add(rdev_p->pbl_pool, pbl_start, pbl_chunk, -1)) { + PDBG("%s failed to add PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) { + printk(KERN_WARNING MOD "%s: Failed to add all PBL chunks (%x/%x)\n", + __func__, pbl_start, rdev_p->rnic_info.pbl_top - pbl_start); + return 0; + } + pbl_chunk >>= 1; + } else { + PDBG("%s added PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + pbl_start += pbl_chunk; + } + } + + return 0; } void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p) -- cgit v0.10.2 From 3eb6753e20af1803d7784efc36e8208e8d5fac05 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 6 May 2008 15:15:12 -0700 Subject: usb: Sparc build fix, make USB_ISP1760_OF depend on PPC_OF Sparc doesn't have some of the OF interfaces this driver wants to use. Acked-by: Benjamin Herrenschmidt Signed-off-by: David S. Miller diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig index 33b467a..1ef6df3 100644 --- a/drivers/usb/host/Kconfig +++ b/drivers/usb/host/Kconfig @@ -129,7 +129,7 @@ config USB_ISP1760_PCI config USB_ISP1760_OF bool "Support for the OF platform bus" - depends on USB_ISP1760_HCD && OF + depends on USB_ISP1760_HCD && PPC_OF ---help--- Enables support for the device present on the PowerPC OpenFirmware platform bus. -- cgit v0.10.2 From d45100f7b69e3d9cd0cd5e956b6ac2c78d460d07 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 6 May 2008 15:19:54 -0700 Subject: sparc64: Fix initrd regression. We die because we forget to convert initrd_start and initrd_end to virtual addresses. Reported by Mikael Pettersson Signed-off-by: David S. 
Miller diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index db8e7fb..ec3e2c7 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -771,6 +771,9 @@ static void __init find_ramdisk(unsigned long phys_base) initrd_end = ramdisk_image + sparc_ramdisk_size; lmb_reserve(initrd_start, initrd_end); + + initrd_start += PAGE_OFFSET; + initrd_end += PAGE_OFFSET; } #endif } -- cgit v0.10.2 From 273748cc908a901d082b4da5a16b2541c9d78a02 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 6 May 2008 15:56:22 -0700 Subject: RDMA/cxgb3: Fix severe limit on userspace memory registration size Currently, iw_cxgb3 is severely limited on the amount of userspace memory that can be registered in in a single memory region, which causes big problems for applications that expect to be able to register 100s of MB. The problem is that the driver uses a single kmalloc()ed buffer to hold the physical buffer list (PBL) for the entire memory region during registration, which means that 8 bytes of contiguous memory are required for each page of memory being registered. For example, a 64 MB registration will require 128 KB of contiguous memory with 4 KB pages, and it unlikely that such an allocation will succeed on a busy system. This is purely a driver problem: the temporary page list buffer is not needed by the hardware, so we can fix this by writing the PBL to the hardware in page-sized chunks rather than all at once. We do this by splitting the memory registration operation up into several steps: - Allocate PBL space in adapter memory for the full registration - Copy PBL to adapter memory in chunks - Allocate STag and enable memory region This also allows several other cleanups to the __cxio_tpt_op() interface and related parts of the driver. This change leaves the reregister memory region and memory window operations broken, but they already didn't work due to other longstanding bugs, so fixing them will be left to a later patch. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index 5fd8506..ebf9d30 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -588,7 +588,7 @@ static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p) * caller aquires the ctrl_qp lock before the call */ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, - u32 len, void *data, int completion) + u32 len, void *data) { u32 i, nr_wqe, copy_len; u8 *copy_data; @@ -624,7 +624,7 @@ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, flag = 0; if (i == (nr_wqe - 1)) { /* last WQE */ - flag = completion ? T3_COMPLETION_FLAG : 0; + flag = T3_COMPLETION_FLAG; if (len % 32) utx_len = len / 32 + 1; else @@ -683,21 +683,20 @@ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, return 0; } -/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl, and pbl_size - * OUT: stag index, actual pbl_size, pbl_addr allocated. 
+/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl_size and pbl_addr + * OUT: stag index * TBD: shared memory region support */ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, u32 *stag, u8 stag_state, u32 pdid, enum tpt_mem_type type, enum tpt_mem_perm perm, - u32 zbva, u64 to, u32 len, u8 page_size, __be64 *pbl, - u32 *pbl_size, u32 *pbl_addr) + u32 zbva, u64 to, u32 len, u8 page_size, + u32 pbl_size, u32 pbl_addr) { int err; struct tpt_entry tpt; u32 stag_idx; u32 wptr; - int rereg = (*stag != T3_STAG_UNSET); stag_state = stag_state > 0; stag_idx = (*stag) >> 8; @@ -711,30 +710,8 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n", __func__, stag_state, type, pdid, stag_idx); - if (reset_tpt_entry) - cxio_hal_pblpool_free(rdev_p, *pbl_addr, *pbl_size << 3); - else if (!rereg) { - *pbl_addr = cxio_hal_pblpool_alloc(rdev_p, *pbl_size << 3); - if (!*pbl_addr) { - return -ENOMEM; - } - } - mutex_lock(&rdev_p->ctrl_qp.lock); - /* write PBL first if any - update pbl only if pbl list exist */ - if (pbl) { - - PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", - __func__, *pbl_addr, rdev_p->rnic_info.pbl_base, - *pbl_size); - err = cxio_hal_ctrl_qp_write_mem(rdev_p, - (*pbl_addr >> 5), - (*pbl_size << 3), pbl, 0); - if (err) - goto ret; - } - /* write TPT entry */ if (reset_tpt_entry) memset(&tpt, 0, sizeof(tpt)); @@ -749,23 +726,23 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) | V_TPT_PAGE_SIZE(page_size)); tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 : - cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, *pbl_addr)>>3)); + cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); tpt.len = cpu_to_be32(len); tpt.va_hi = cpu_to_be32((u32) (to >> 32)); tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL)); tpt.rsvd_bind_cnt_or_pstag = 0; tpt.rsvd_pbl_size = reset_tpt_entry ? 
0 : - cpu_to_be32(V_TPT_PBL_SIZE((*pbl_size) >> 2)); + cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); } err = cxio_hal_ctrl_qp_write_mem(rdev_p, stag_idx + (rdev_p->rnic_info.tpt_base >> 5), - sizeof(tpt), &tpt, 1); + sizeof(tpt), &tpt); /* release the stag index to free pool */ if (reset_tpt_entry) cxio_hal_put_stag(rdev_p->rscp, stag_idx); -ret: + wptr = rdev_p->ctrl_qp.wptr; mutex_unlock(&rdev_p->ctrl_qp.lock); if (!err) @@ -776,44 +753,67 @@ ret: return err; } +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size) +{ + u32 wptr; + int err; + + PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + __func__, pbl_addr, rdev_p->rnic_info.pbl_base, + pbl_size); + + mutex_lock(&rdev_p->ctrl_qp.lock); + err = cxio_hal_ctrl_qp_write_mem(rdev_p, pbl_addr >> 5, pbl_size << 3, + pbl); + wptr = rdev_p->ctrl_qp.wptr; + mutex_unlock(&rdev_p->ctrl_qp.lock); + if (err) + return err; + + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + SEQ32_GE(rdev_p->ctrl_qp.rptr, + wptr))) + return -ERESTARTSYS; + + return 0; +} + int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr) + u8 page_size, u32 pbl_size, u32 pbl_addr) { *stag = T3_STAG_UNSET; return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, - zbva, to, len, page_size, pbl, pbl_size, pbl_addr); + zbva, to, len, page_size, pbl_size, pbl_addr); } int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr) + u8 page_size, u32 pbl_size, u32 pbl_addr) { return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, - zbva, to, len, page_size, pbl, pbl_size, pbl_addr); + zbva, to, len, page_size, pbl_size, pbl_addr); } int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size, u32 pbl_addr) { - return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, - &pbl_size, &pbl_addr); + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + pbl_size, pbl_addr); } int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid) { - u32 pbl_size = 0; *stag = T3_STAG_UNSET; return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0, - NULL, &pbl_size, NULL); + 0, 0); } int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag) { - return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, - NULL, NULL); + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + 0, 0); } int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h index 69ab08e..6e128f6 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.h +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -154,14 +154,14 @@ int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq, struct cxio_ucontext *uctx); int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode); +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size); int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr); + u8 page_size, u32 pbl_size, u32 pbl_addr); int cxio_reregister_phys_mem(struct 
cxio_rdev *rdev, u32 * stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr); + u8 page_size, u32 pbl_size, u32 pbl_addr); int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size, u32 pbl_addr); int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid); diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c index 58c3d61..ec49a5c 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_mem.c +++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c @@ -35,17 +35,26 @@ #include #include "cxio_hal.h" +#include "cxio_resource.h" #include "iwch.h" #include "iwch_provider.h" -int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, - struct iwch_mr *mhp, - int shift, - __be64 *page_list) +static void iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag) { - u32 stag; u32 mmid; + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); + PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); +} + +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, int shift) +{ + u32 stag; if (cxio_register_phys_mem(&rhp->rdev, &stag, mhp->attr.pdid, @@ -53,28 +62,21 @@ int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, mhp->attr.zbva, mhp->attr.va_fbo, mhp->attr.len, - shift-12, - page_list, - &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) return -ENOMEM; - mhp->attr.state = 1; - mhp->attr.stag = stag; - mmid = stag >> 8; - mhp->ibmr.rkey = mhp->ibmr.lkey = stag; - insert_handle(rhp, &rhp->mmidr, mhp, mmid); - PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); + + iwch_finish_mem_reg(mhp, stag); + return 0; } int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, int shift, - __be64 *page_list, int npages) { u32 stag; - u32 mmid; - /* We could support this... 
*/ if (npages > mhp->attr.pbl_size) @@ -87,19 +89,40 @@ int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, mhp->attr.zbva, mhp->attr.va_fbo, mhp->attr.len, - shift-12, - page_list, - &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) return -ENOMEM; - mhp->attr.state = 1; - mhp->attr.stag = stag; - mmid = stag >> 8; - mhp->ibmr.rkey = mhp->ibmr.lkey = stag; - insert_handle(rhp, &rhp->mmidr, mhp, mmid); - PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); + + iwch_finish_mem_reg(mhp, stag); + + return 0; +} + +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages) +{ + mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev, + npages << 3); + + if (!mhp->attr.pbl_addr) + return -ENOMEM; + + mhp->attr.pbl_size = npages; + return 0; } +void iwch_free_pbl(struct iwch_mr *mhp) +{ + cxio_hal_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); +} + +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset) +{ + return cxio_write_pbl(&mhp->rhp->rdev, pages, + mhp->attr.pbl_addr + (offset << 3), npages); +} + int build_phys_page_list(struct ib_phys_buf *buffer_list, int num_phys_buf, u64 *iova_start, diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index d07d3a3..8934178 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -442,6 +442,7 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr) mmid = mhp->attr.stag >> 8; cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, mhp->attr.pbl_addr); + iwch_free_pbl(mhp); remove_handle(rhp, &rhp->mmidr, mmid); if (mhp->kva) kfree((void *) (unsigned long) mhp->kva); @@ -475,6 +476,8 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, if (!mhp) return ERR_PTR(-ENOMEM); + mhp->rhp = rhp; + /* First check that we have enough alignment */ if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { ret = -EINVAL; @@ -492,7 +495,17 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, if (ret) goto err; - mhp->rhp = rhp; + ret = iwch_alloc_pbl(mhp, npages); + if (ret) { + kfree(page_list); + goto err_pbl; + } + + ret = iwch_write_pbl(mhp, page_list, npages, 0); + kfree(page_list); + if (ret) + goto err_pbl; + mhp->attr.pdid = php->pdid; mhp->attr.zbva = 0; @@ -502,12 +515,15 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, mhp->attr.len = (u32) total_size; mhp->attr.pbl_size = npages; - ret = iwch_register_mem(rhp, php, mhp, shift, page_list); - kfree(page_list); - if (ret) { - goto err; - } + ret = iwch_register_mem(rhp, php, mhp, shift); + if (ret) + goto err_pbl; + return &mhp->ibmr; + +err_pbl: + iwch_free_pbl(mhp); + err: kfree(mhp); return ERR_PTR(ret); @@ -560,7 +576,7 @@ static int iwch_reregister_phys_mem(struct ib_mr *mr, return ret; } - ret = iwch_reregister_mem(rhp, php, &mh, shift, page_list, npages); + ret = iwch_reregister_mem(rhp, php, &mh, shift, npages); kfree(page_list); if (ret) { return ret; @@ -602,6 +618,8 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mhp) return ERR_PTR(-ENOMEM); + mhp->rhp = rhp; + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); if (IS_ERR(mhp->umem)) { err = PTR_ERR(mhp->umem); @@ -615,10 +633,14 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, list_for_each_entry(chunk, &mhp->umem->chunk_list, list) n += chunk->nents; - pages = kmalloc(n * sizeof(u64), 
GFP_KERNEL); + err = iwch_alloc_pbl(mhp, n); + if (err) + goto err; + + pages = (__be64 *) __get_free_page(GFP_KERNEL); if (!pages) { err = -ENOMEM; - goto err; + goto err_pbl; } i = n = 0; @@ -630,25 +652,38 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, pages[i++] = cpu_to_be64(sg_dma_address( &chunk->page_list[j]) + mhp->umem->page_size * k); + if (i == PAGE_SIZE / sizeof *pages) { + err = iwch_write_pbl(mhp, pages, i, n); + if (err) + goto pbl_done; + n += i; + i = 0; + } } } - mhp->rhp = rhp; + if (i) + err = iwch_write_pbl(mhp, pages, i, n); + +pbl_done: + free_page((unsigned long) pages); + if (err) + goto err_pbl; + mhp->attr.pdid = php->pdid; mhp->attr.zbva = 0; mhp->attr.perms = iwch_ib_to_tpt_access(acc); mhp->attr.va_fbo = virt; mhp->attr.page_size = shift - 12; mhp->attr.len = (u32) length; - mhp->attr.pbl_size = i; - err = iwch_register_mem(rhp, php, mhp, shift, pages); - kfree(pages); + + err = iwch_register_mem(rhp, php, mhp, shift); if (err) - goto err; + goto err_pbl; if (udata && !t3a_device(rhp)) { uresp.pbl_addr = (mhp->attr.pbl_addr - - rhp->rdev.rnic_info.pbl_base) >> 3; + rhp->rdev.rnic_info.pbl_base) >> 3; PDBG("%s user resp pbl_addr 0x%x\n", __func__, uresp.pbl_addr); @@ -661,6 +696,9 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return &mhp->ibmr; +err_pbl: + iwch_free_pbl(mhp); + err: ib_umem_release(mhp->umem); kfree(mhp); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h index db5100d..836163f 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.h +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -340,14 +340,14 @@ int iwch_quiesce_qps(struct iwch_cq *chp); int iwch_resume_qps(struct iwch_cq *chp); void stop_read_rep_timer(struct iwch_qp *qhp); int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, - struct iwch_mr *mhp, - int shift, - __be64 *page_list); + struct iwch_mr *mhp, int shift); int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, int shift, - __be64 *page_list, int npages); +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages); +void iwch_free_pbl(struct iwch_mr *mhp); +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset); int build_phys_page_list(struct ib_phys_buf *buffer_list, int num_phys_buf, u64 *iova_start, -- cgit v0.10.2 From c0a18111e571138747a98af18b3a2124df56a0d1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 May 2008 17:09:27 -0700 Subject: Revert "uml: fix gcc problem" This reverts commit 22eecde2f9034764a3fd095eecfa3adfb8ec9a98. Uli reports that it breaks UML on x86-64 with the Fedora 8 gcc (gcc 4.1.2), causing a crash on startup. See http://marc.info/?l=linux-kernel&m=121011722806093&w=2 for a trace. 
Reported-by: Ulrich Drepper Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/Makefile b/arch/um/Makefile index 01b97c1..dbeab15 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -77,10 +77,7 @@ include $(srctree)/$(ARCH_DIR)/Makefile-os-$(OS) KERNEL_DEFINES = $(strip -Derrno=kernel_errno -Dsigprocmask=kernel_sigprocmask \ -Dmktime=kernel_mktime $(ARCH_KERNEL_DEFINES)) KBUILD_CFLAGS += $(KERNEL_DEFINES) -# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use -# a lot more stack due to the lack of sharing of stacklots: -KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ - echo $(call cc-option,-fno-unit-at-a-time); fi ;) +KBUILD_CFLAGS += $(call cc-option,-fno-unit-at-a-time,) PHONY += linux -- cgit v0.10.2 From bf0f97025c7306870b86ccd63669aa278e7ec1c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 May 2008 09:09:39 +0200 Subject: block: sysfs store function needs to grab queue_lock and use queue_flag_*() Concurrency isn't a big deal here since we have requests in flight at this point, but do the locked variant to set a better example. Signed-off-by: Jens Axboe diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e85c401..304ec73 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -146,11 +146,13 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, unsigned long nm; ssize_t ret = queue_var_store(&nm, page, count); + spin_lock_irq(q->queue_lock); if (nm) - set_bit(QUEUE_FLAG_NOMERGES, &q->queue_flags); + queue_flag_set(QUEUE_FLAG_NOMERGES, q); else - clear_bit(QUEUE_FLAG_NOMERGES, &q->queue_flags); + queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + spin_unlock_irq(q->queue_lock); return ret; } -- cgit v0.10.2 From 2688905e6a9b3647bf7b452cb0ff2bdb166bd8fe Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 7 May 2008 09:22:52 +0200 Subject: [S390] s390: Optimize user and work TIF check On return from syscall or interrupt, we have to check if we return to userspace (likely) and if there is work todo (less likely) to decide if we handle the work. We can optimize this check: we first check for the less likely work case and then check for userspace. This patch is also a preparation for an additional patch, that fixes a bug in KVM dealing with cpu bound guests. Signed-off-by: Martin Schwidefsky diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index bdbb3bc..708cf9c 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -279,8 +279,6 @@ sysc_do_restart: st %r2,SP_R2(%r15) # store return value (change R2 on stack) sysc_return: - tm SP_PSW+1(%r15),0x01 # returning to user ? - bno BASED(sysc_restore) tm __TI_flags+3(%r9),_TIF_WORK_SVC bnz BASED(sysc_work) # there is work to do (signals etc.) sysc_restore: @@ -312,6 +310,8 @@ sysc_work_loop: # One of the work bits is on. Find out which one. # sysc_work: + tm SP_PSW+1(%r15),0x01 # returning to user ? + bno BASED(sysc_restore) tm __TI_flags+3(%r9),_TIF_MCCK_PENDING bo BASED(sysc_mcck_pending) tm __TI_flags+3(%r9),_TIF_NEED_RESCHED @@ -602,12 +602,6 @@ io_no_vtime: la %r2,SP_PTREGS(%r15) # address of register-save area basr %r14,%r1 # branch to standard irq handler io_return: - tm SP_PSW+1(%r15),0x01 # returning to user ? -#ifdef CONFIG_PREEMPT - bno BASED(io_preempt) # no -> check for preemptive scheduling -#else - bno BASED(io_restore) # no-> skip resched & signal -#endif tm __TI_flags+3(%r9),_TIF_WORK_INT bnz BASED(io_work) # there is work to do (signals etc.) 
io_restore: @@ -629,10 +623,18 @@ io_restore_trace_psw: .long 0, io_restore_trace + 0x80000000 #endif -#ifdef CONFIG_PREEMPT -io_preempt: +# +# switch to kernel stack, then check the TIF bits +# +io_work: + tm SP_PSW+1(%r15),0x01 # returning to user ? +#ifndef CONFIG_PREEMPT + bno BASED(io_restore) # no-> skip resched & signal +#else + bnz BASED(io_work_user) # no -> check for preemptive scheduling + # check for preemptive scheduling icm %r0,15,__TI_precount(%r9) - bnz BASED(io_restore) + bnz BASED(io_restore) # preemption disabled l %r1,SP_R15(%r15) s %r1,BASED(.Lc_spsize) mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15) @@ -646,10 +648,7 @@ io_resume_loop: br %r1 # call schedule #endif -# -# switch to kernel stack, then check the TIF bits -# -io_work: +io_work_user: l %r1,__LC_KERNEL_STACK s %r1,BASED(.Lc_spsize) mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15) diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index 5a4a7bc..a57909d 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -271,8 +271,6 @@ sysc_noemu: stg %r2,SP_R2(%r15) # store return value (change R2 on stack) sysc_return: - tm SP_PSW+1(%r15),0x01 # returning to user ? - jno sysc_restore tm __TI_flags+7(%r9),_TIF_WORK_SVC jnz sysc_work # there is work to do (signals etc.) sysc_restore: @@ -304,6 +302,8 @@ sysc_work_loop: # One of the work bits is on. Find out which one. # sysc_work: + tm SP_PSW+1(%r15),0x01 # returning to user ? + jno sysc_restore tm __TI_flags+7(%r9),_TIF_MCCK_PENDING jo sysc_mcck_pending tm __TI_flags+7(%r9),_TIF_NEED_RESCHED @@ -585,12 +585,6 @@ io_no_vtime: la %r2,SP_PTREGS(%r15) # address of register-save area brasl %r14,do_IRQ # call standard irq handler io_return: - tm SP_PSW+1(%r15),0x01 # returning to user ? -#ifdef CONFIG_PREEMPT - jno io_preempt # no -> check for preemptive scheduling -#else - jno io_restore # no-> skip resched & signal -#endif tm __TI_flags+7(%r9),_TIF_WORK_INT jnz io_work # there is work to do (signals etc.) io_restore: @@ -612,10 +606,18 @@ io_restore_trace_psw: .quad 0, io_restore_trace #endif -#ifdef CONFIG_PREEMPT -io_preempt: +# +# switch to kernel stack, then check TIF bits +# +io_work: + tm SP_PSW+1(%r15),0x01 # returning to user ? +#ifndef CONFIG_PREEMPT + jno io_restore # no-> skip resched & signal +#else + jnz io_work_user # yes -> do resched & signal + # check for preemptive scheduling icm %r0,15,__TI_precount(%r9) - jnz io_restore + jnz io_restore # preemption is disabled # switch to kernel stack lg %r1,SP_R15(%r15) aghi %r1,-SP_SIZE @@ -629,10 +631,7 @@ io_resume_loop: jg preempt_schedule_irq #endif -# -# switch to kernel stack, then check TIF bits -# -io_work: +io_work_user: lg %r1,__LC_KERNEL_STACK aghi %r1,-SP_SIZE mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15) -- cgit v0.10.2 From 0eaeafa10f3b2bd027e95859a6785d4c7fcc174c Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 7 May 2008 09:22:53 +0200 Subject: [S390] s390-kvm: leave sie context on work. Removes preemption requirement From: Martin Schwidefsky This patch fixes a bug with cpu bound guest on kvm-s390. Sometimes it was impossible to deliver a signal to a spinning guest. We used preemption as a circumvention. The preemption notifiers called vcpu_load, which checked for pending signals and triggered a host intercept. But even with preemption, a sigkill was not delivered immediately. This patch changes the low level host interrupt handler to check for the SIE instruction, if TIF_WORK is set. 
In that case we change the instruction pointer of the return PSW to rerun the vcpu_run loop. The kvm code sees an intercept reason 0 if that happens. This patch adds accounting for these types of intercept as well. The advantages: - works with and without preemption - signals are delivered immediately - much better host latencies without preemption Acked-by: Carsten Otte Signed-off-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index a57909d..fee1017 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -607,14 +607,37 @@ io_restore_trace_psw: #endif # -# switch to kernel stack, then check TIF bits +# There is work todo, we need to check if we return to userspace, then +# check, if we are in SIE, if yes leave it # io_work: tm SP_PSW+1(%r15),0x01 # returning to user ? #ifndef CONFIG_PREEMPT +#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) + jnz io_work_user # yes -> no need to check for SIE + la %r1, BASED(sie_opcode) # we return to kernel here + lg %r2, SP_PSW+8(%r15) + clc 0(2,%r1), 0(%r2) # is current instruction = SIE? + jne io_restore # no-> return to kernel + lg %r1, SP_PSW+8(%r15) # yes-> add 4 bytes to leave SIE + aghi %r1, 4 + stg %r1, SP_PSW+8(%r15) + j io_restore # return to kernel +#else jno io_restore # no-> skip resched & signal +#endif #else jnz io_work_user # yes -> do resched & signal +#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) + la %r1, BASED(sie_opcode) + lg %r2, SP_PSW+8(%r15) + clc 0(2,%r1), 0(%r2) # is current instruction = SIE? + jne 0f # no -> leave PSW alone + lg %r1, SP_PSW+8(%r15) # yes-> add 4 bytes to leave SIE + aghi %r1, 4 + stg %r1, SP_PSW+8(%r15) +0: +#endif # check for preemptive scheduling icm %r0,15,__TI_precount(%r9) jnz io_restore # preemption is disabled @@ -652,6 +675,11 @@ io_work_loop: j io_restore io_work_done: +#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) +sie_opcode: + .long 0xb2140000 +#endif + # # _TIF_MCCK_PENDING is set, call handler # diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 1761b74..e051cad 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select S390_SWITCH_AMODE - select PREEMPT ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. 
This should work diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 349581a..47a0b64 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -105,6 +105,9 @@ static intercept_handler_t instruction_handlers[256] = { static int handle_noop(struct kvm_vcpu *vcpu) { switch (vcpu->arch.sie_block->icptcode) { + case 0x0: + vcpu->stat.exit_null++; + break; case 0x10: vcpu->stat.exit_external_request++; break; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 98d1e73..0ac36a6 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -31,6 +31,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "userspace_handled", VCPU_STAT(exit_userspace) }, + { "exit_null", VCPU_STAT(exit_null) }, { "exit_validity", VCPU_STAT(exit_validity) }, { "exit_stop_request", VCPU_STAT(exit_stop_request) }, { "exit_external_request", VCPU_STAT(exit_external_request) }, @@ -221,10 +222,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->arch.guest_fpregs.fpc &= FPC_VALID_MASK; restore_fp_regs(&vcpu->arch.guest_fpregs); restore_access_regs(vcpu->arch.guest_acrs); - - if (signal_pending(current)) - atomic_set_mask(CPUSTAT_STOP_INT, - &vcpu->arch.sie_block->cpuflags); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) diff --git a/include/asm-s390/kvm_host.h b/include/asm-s390/kvm_host.h index f8204a4..18cbd8a 100644 --- a/include/asm-s390/kvm_host.h +++ b/include/asm-s390/kvm_host.h @@ -104,6 +104,7 @@ struct sie_block { struct kvm_vcpu_stat { u32 exit_userspace; + u32 exit_null; u32 exit_external_request; u32 exit_external_interrupt; u32 exit_stop_request; -- cgit v0.10.2 From 139b83dd57248a3c8fcfb256e562311ad61478e9 Mon Sep 17 00:00:00 2001 From: Michael Ernst Date: Wed, 7 May 2008 09:22:54 +0200 Subject: [S390] cio: Remove cio_msg kernel parameter. The only sporadically used CIO_DEBUG messages are replaced by ordinary CIO_MSG_EVENT messages. The CIO_MSG_EVENT messages debug levels are consolidated. Signed-off-by: Michael Ernst Signed-off-by: Martin Schwidefsky diff --git a/Documentation/s390/CommonIO b/Documentation/s390/CommonIO index 8fbc0a8..bf0baa1 100644 --- a/Documentation/s390/CommonIO +++ b/Documentation/s390/CommonIO @@ -8,17 +8,6 @@ Command line parameters Enable logging of debug information in case of ccw device timeouts. - -* cio_msg = yes | no - - Determines whether information on found devices and sensed device - characteristics should be shown during startup or when new devices are - found, i. e. messages of the types "Detected device 0.0.4711 on subchannel - 0.0.0042" and "SenseID: Device 0.0.4711 reports: ...". - - Default is off. - - * cio_ignore = {all} | { | } | {! 
| !} diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c index 40ef948..0844476 100644 --- a/drivers/s390/cio/blacklist.c +++ b/drivers/s390/cio/blacklist.c @@ -199,7 +199,6 @@ blacklist_parse_parameters (char *str, range_action action) static int __init blacklist_setup (char *str) { - CIO_MSG_EVENT(6, "Reading blacklist parameters\n"); return blacklist_parse_parameters (str, add); } diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index 08a5781..82c6a2d 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -39,23 +39,6 @@ debug_info_t *cio_debug_msg_id; debug_info_t *cio_debug_trace_id; debug_info_t *cio_debug_crw_id; -int cio_show_msg; - -static int __init -cio_setup (char *parm) -{ - if (!strcmp (parm, "yes")) - cio_show_msg = 1; - else if (!strcmp (parm, "no")) - cio_show_msg = 0; - else - printk(KERN_ERR "cio: cio_setup: " - "invalid cio_msg parameter '%s'", parm); - return 1; -} - -__setup ("cio_msg=", cio_setup); - /* * Function: cio_debug_init * Initializes three debug logs for common I/O: @@ -166,7 +149,7 @@ cio_start_handle_notoper(struct subchannel *sch, __u8 lpm) stsch (sch->schid, &sch->schib); - CIO_MSG_EVENT(0, "cio_start: 'not oper' status for " + CIO_MSG_EVENT(2, "cio_start: 'not oper' status for " "subchannel 0.%x.%04x!\n", sch->schid.ssid, sch->schid.sch_no); sprintf(dbf_text, "no%s", sch->dev.bus_id); @@ -567,10 +550,9 @@ cio_validate_subchannel (struct subchannel *sch, struct subchannel_id schid) * ... just being curious we check for non I/O subchannels */ if (sch->st != 0) { - CIO_DEBUG(KERN_INFO, 0, - "Subchannel 0.%x.%04x reports " - "non-I/O subchannel type %04X\n", - sch->schid.ssid, sch->schid.sch_no, sch->st); + CIO_MSG_EVENT(4, "Subchannel 0.%x.%04x reports " + "non-I/O subchannel type %04X\n", + sch->schid.ssid, sch->schid.sch_no, sch->st); /* We stop here for non-io subchannels. */ err = sch->st; goto out; @@ -588,7 +570,7 @@ cio_validate_subchannel (struct subchannel *sch, struct subchannel_id schid) * This device must not be known to Linux. So we simply * say that there is no device and return ENODEV. */ - CIO_MSG_EVENT(4, "Blacklisted device detected " + CIO_MSG_EVENT(6, "Blacklisted device detected " "at devno %04X, subchannel set %x\n", sch->schib.pmcw.dev, sch->schid.ssid); err = -ENODEV; @@ -601,12 +583,11 @@ cio_validate_subchannel (struct subchannel *sch, struct subchannel_id schid) sch->lpm = sch->schib.pmcw.pam & sch->opm; sch->isc = 3; - CIO_DEBUG(KERN_INFO, 0, - "Detected device %04x on subchannel 0.%x.%04X" - " - PIM = %02X, PAM = %02X, POM = %02X\n", - sch->schib.pmcw.dev, sch->schid.ssid, - sch->schid.sch_no, sch->schib.pmcw.pim, - sch->schib.pmcw.pam, sch->schib.pmcw.pom); + CIO_MSG_EVENT(6, "Detected device %04x on subchannel 0.%x.%04X " + "- PIM = %02X, PAM = %02X, POM = %02X\n", + sch->schib.pmcw.dev, sch->schid.ssid, + sch->schid.sch_no, sch->schib.pmcw.pim, + sch->schib.pmcw.pam, sch->schib.pmcw.pom); /* * We now have to initially ... 
diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h index 3c75412..6e933ae 100644 --- a/drivers/s390/cio/cio.h +++ b/drivers/s390/cio/cio.h @@ -118,6 +118,4 @@ extern void *cio_get_console_priv(void); #define cio_get_console_priv() NULL #endif -extern int cio_show_msg; - #endif diff --git a/drivers/s390/cio/cio_debug.h b/drivers/s390/cio/cio_debug.h index d7429ef..e64e827 100644 --- a/drivers/s390/cio/cio_debug.h +++ b/drivers/s390/cio/cio_debug.h @@ -31,10 +31,4 @@ static inline void CIO_HEX_EVENT(int level, void *data, int length) } } -#define CIO_DEBUG(printk_level, event_level, msg...) do { \ - if (cio_show_msg) \ - printk(printk_level "cio: " msg); \ - CIO_MSG_EVENT(event_level, msg); \ - } while (0) - #endif diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 595e327..a769565 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -570,7 +570,7 @@ static void reprobe_all(struct work_struct *unused) { int ret; - CIO_MSG_EVENT(2, "reprobe start\n"); + CIO_MSG_EVENT(4, "reprobe start\n"); need_reprobe = 0; /* Make sure initial subchannel scan is done. */ @@ -578,7 +578,7 @@ static void reprobe_all(struct work_struct *unused) atomic_read(&ccw_device_init_count) == 0); ret = for_each_subchannel_staged(NULL, reprobe_subchannel, NULL); - CIO_MSG_EVENT(2, "reprobe done (rc=%d, need_reprobe=%d)\n", ret, + CIO_MSG_EVENT(4, "reprobe done (rc=%d, need_reprobe=%d)\n", ret, need_reprobe); } diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index abfd601..e22813d 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -341,7 +341,7 @@ ccw_device_remove_disconnected(struct ccw_device *cdev) rc = device_schedule_callback(&cdev->dev, ccw_device_remove_orphan_cb); if (rc) - CIO_MSG_EVENT(2, "Couldn't unregister orphan " + CIO_MSG_EVENT(0, "Couldn't unregister orphan " "0.%x.%04x\n", cdev->private->dev_id.ssid, cdev->private->dev_id.devno); @@ -351,7 +351,7 @@ ccw_device_remove_disconnected(struct ccw_device *cdev) rc = device_schedule_callback(cdev->dev.parent, ccw_device_remove_sch_cb); if (rc) - CIO_MSG_EVENT(2, "Couldn't unregister disconnected device " + CIO_MSG_EVENT(0, "Couldn't unregister disconnected device " "0.%x.%04x\n", cdev->private->dev_id.ssid, cdev->private->dev_id.devno); @@ -397,7 +397,7 @@ int ccw_device_set_offline(struct ccw_device *cdev) if (ret == 0) wait_event(cdev->private->wait_q, dev_fsm_final_state(cdev)); else { - CIO_MSG_EVENT(2, "ccw_device_offline returned %d, " + CIO_MSG_EVENT(0, "ccw_device_offline returned %d, " "device 0.%x.%04x\n", ret, cdev->private->dev_id.ssid, cdev->private->dev_id.devno); @@ -433,7 +433,7 @@ int ccw_device_set_online(struct ccw_device *cdev) if (ret == 0) wait_event(cdev->private->wait_q, dev_fsm_final_state(cdev)); else { - CIO_MSG_EVENT(2, "ccw_device_online returned %d, " + CIO_MSG_EVENT(0, "ccw_device_online returned %d, " "device 0.%x.%04x\n", ret, cdev->private->dev_id.ssid, cdev->private->dev_id.devno); @@ -451,7 +451,7 @@ int ccw_device_set_online(struct ccw_device *cdev) if (ret == 0) wait_event(cdev->private->wait_q, dev_fsm_final_state(cdev)); else - CIO_MSG_EVENT(2, "ccw_device_offline returned %d, " + CIO_MSG_EVENT(0, "ccw_device_offline returned %d, " "device 0.%x.%04x\n", ret, cdev->private->dev_id.ssid, cdev->private->dev_id.devno); @@ -803,7 +803,7 @@ static void sch_attach_disconnected_device(struct subchannel *sch, other_sch = to_subchannel(get_device(cdev->dev.parent)); ret = device_move(&cdev->dev, &sch->dev); if (ret) { - CIO_MSG_EVENT(2, "Moving 
disconnected device 0.%x.%04x failed " + CIO_MSG_EVENT(0, "Moving disconnected device 0.%x.%04x failed " "(ret=%d)!\n", cdev->private->dev_id.ssid, cdev->private->dev_id.devno, ret); put_device(&other_sch->dev); @@ -933,7 +933,7 @@ io_subchannel_register(struct work_struct *work) ret = device_reprobe(&cdev->dev); if (ret) /* We can't do much here. */ - CIO_MSG_EVENT(2, "device_reprobe() returned" + CIO_MSG_EVENT(0, "device_reprobe() returned" " %d for 0.%x.%04x\n", ret, cdev->private->dev_id.ssid, cdev->private->dev_id.devno); @@ -1086,7 +1086,7 @@ static void ccw_device_move_to_sch(struct work_struct *work) rc = device_move(&cdev->dev, &sch->dev); mutex_unlock(&sch->reg_mutex); if (rc) { - CIO_MSG_EVENT(2, "Moving device 0.%x.%04x to subchannel " + CIO_MSG_EVENT(0, "Moving device 0.%x.%04x to subchannel " "0.%x.%04x failed (ret=%d)!\n", cdev->private->dev_id.ssid, cdev->private->dev_id.devno, sch->schid.ssid, @@ -1446,8 +1446,7 @@ ccw_device_remove (struct device *dev) wait_event(cdev->private->wait_q, dev_fsm_final_state(cdev)); else - //FIXME: we can't fail! - CIO_MSG_EVENT(2, "ccw_device_offline returned %d, " + CIO_MSG_EVENT(0, "ccw_device_offline returned %d, " "device 0.%x.%04x\n", ret, cdev->private->dev_id.ssid, cdev->private->dev_id.devno); @@ -1524,7 +1523,7 @@ static int recovery_check(struct device *dev, void *data) spin_lock_irq(cdev->ccwlock); switch (cdev->private->state) { case DEV_STATE_DISCONNECTED: - CIO_MSG_EVENT(3, "recovery: trigger 0.%x.%04x\n", + CIO_MSG_EVENT(4, "recovery: trigger 0.%x.%04x\n", cdev->private->dev_id.ssid, cdev->private->dev_id.devno); dev_fsm_event(cdev, DEV_EVENT_VERIFY); @@ -1554,7 +1553,7 @@ static void recovery_work_func(struct work_struct *unused) } spin_unlock_irq(&recovery_lock); } else - CIO_MSG_EVENT(2, "recovery: end\n"); + CIO_MSG_EVENT(4, "recovery: end\n"); } static DECLARE_WORK(recovery_work, recovery_work_func); @@ -1572,7 +1571,7 @@ void ccw_device_schedule_recovery(void) { unsigned long flags; - CIO_MSG_EVENT(2, "recovery: schedule\n"); + CIO_MSG_EVENT(4, "recovery: schedule\n"); spin_lock_irqsave(&recovery_lock, flags); if (!timer_pending(&recovery_timer) || (recovery_phase != 0)) { recovery_phase = 0; diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index 99403b0..e268d5a 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -322,10 +322,10 @@ ccw_device_recog_done(struct ccw_device *cdev, int state) same_dev = 0; /* Keep the compiler quiet... */ switch (state) { case DEV_STATE_NOT_OPER: - CIO_DEBUG(KERN_WARNING, 2, - "SenseID : unknown device %04x on subchannel " - "0.%x.%04x\n", cdev->private->dev_id.devno, - sch->schid.ssid, sch->schid.sch_no); + CIO_MSG_EVENT(2, "SenseID : unknown device %04x on " + "subchannel 0.%x.%04x\n", + cdev->private->dev_id.devno, + sch->schid.ssid, sch->schid.sch_no); break; case DEV_STATE_OFFLINE: if (cdev->private->state == DEV_STATE_DISCONNECTED_SENSE_ID) { @@ -348,20 +348,19 @@ ccw_device_recog_done(struct ccw_device *cdev, int state) return; } /* Issue device info message. 
*/ - CIO_DEBUG(KERN_INFO, 2, - "SenseID : device 0.%x.%04x reports: " - "CU Type/Mod = %04X/%02X, Dev Type/Mod = " - "%04X/%02X\n", - cdev->private->dev_id.ssid, - cdev->private->dev_id.devno, - cdev->id.cu_type, cdev->id.cu_model, - cdev->id.dev_type, cdev->id.dev_model); + CIO_MSG_EVENT(4, "SenseID : device 0.%x.%04x reports: " + "CU Type/Mod = %04X/%02X, Dev Type/Mod = " + "%04X/%02X\n", + cdev->private->dev_id.ssid, + cdev->private->dev_id.devno, + cdev->id.cu_type, cdev->id.cu_model, + cdev->id.dev_type, cdev->id.dev_model); break; case DEV_STATE_BOXED: - CIO_DEBUG(KERN_WARNING, 2, - "SenseID : boxed device %04x on subchannel " - "0.%x.%04x\n", cdev->private->dev_id.devno, - sch->schid.ssid, sch->schid.sch_no); + CIO_MSG_EVENT(0, "SenseID : boxed device %04x on " + " subchannel 0.%x.%04x\n", + cdev->private->dev_id.devno, + sch->schid.ssid, sch->schid.sch_no); break; } cdev->private->state = state; @@ -443,9 +442,8 @@ ccw_device_done(struct ccw_device *cdev, int state) if (state == DEV_STATE_BOXED) - CIO_DEBUG(KERN_WARNING, 2, - "Boxed device %04x on subchannel %04x\n", - cdev->private->dev_id.devno, sch->schid.sch_no); + CIO_MSG_EVENT(0, "Boxed device %04x on subchannel %04x\n", + cdev->private->dev_id.devno, sch->schid.sch_no); if (cdev->private->flags.donotify) { cdev->private->flags.donotify = 0; @@ -900,7 +898,7 @@ ccw_device_w4sense(struct ccw_device *cdev, enum dev_event dev_event) /* Basic sense hasn't started. Try again. */ ccw_device_do_sense(cdev, irb); else { - CIO_MSG_EVENT(2, "Huh? 0.%x.%04x: unsolicited " + CIO_MSG_EVENT(0, "0.%x.%04x: unsolicited " "interrupt during w4sense...\n", cdev->private->dev_id.ssid, cdev->private->dev_id.devno); @@ -1169,8 +1167,10 @@ ccw_device_nop(struct ccw_device *cdev, enum dev_event dev_event) static void ccw_device_bug(struct ccw_device *cdev, enum dev_event dev_event) { - CIO_MSG_EVENT(0, "dev_jumptable[%i][%i] == NULL\n", - cdev->private->state, dev_event); + CIO_MSG_EVENT(0, "Internal state [%i][%i] not handled for device " + "0.%x.%04x\n", cdev->private->state, dev_event, + cdev->private->dev_id.ssid, + cdev->private->dev_id.devno); BUG(); } diff --git a/drivers/s390/cio/device_id.c b/drivers/s390/cio/device_id.c index dc4d87f..cba70205 100644 --- a/drivers/s390/cio/device_id.c +++ b/drivers/s390/cio/device_id.c @@ -214,7 +214,7 @@ ccw_device_check_sense_id(struct ccw_device *cdev) * sense id information. So, for intervention required, * we use the "whack it until it talks" strategy... 
*/ - CIO_MSG_EVENT(2, "SenseID : device %04x on Subchannel " + CIO_MSG_EVENT(0, "SenseID : device %04x on Subchannel " "0.%x.%04x reports cmd reject\n", cdev->private->dev_id.devno, sch->schid.ssid, sch->schid.sch_no); @@ -239,7 +239,7 @@ ccw_device_check_sense_id(struct ccw_device *cdev) lpm = to_io_private(sch)->orb.lpm; if ((lpm & sch->schib.pmcw.pim & sch->schib.pmcw.pam) != 0) - CIO_MSG_EVENT(2, "SenseID : path %02X for device %04x " + CIO_MSG_EVENT(4, "SenseID : path %02X for device %04x " "on subchannel 0.%x.%04x is " "'not operational'\n", lpm, cdev->private->dev_id.devno, diff --git a/drivers/s390/cio/device_pgid.c b/drivers/s390/cio/device_pgid.c index c52449a..ba55905 100644 --- a/drivers/s390/cio/device_pgid.c +++ b/drivers/s390/cio/device_pgid.c @@ -79,7 +79,7 @@ __ccw_device_sense_pgid_start(struct ccw_device *cdev) /* ret is 0, -EBUSY, -EACCES or -ENODEV */ if (ret != -EACCES) return ret; - CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel " + CIO_MSG_EVENT(3, "SNID - Device %04x on Subchannel " "0.%x.%04x, lpm %02X, became 'not " "operational'\n", cdev->private->dev_id.devno, @@ -159,7 +159,7 @@ __ccw_device_check_sense_pgid(struct ccw_device *cdev) u8 lpm; lpm = to_io_private(sch)->orb.lpm; - CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel 0.%x.%04x," + CIO_MSG_EVENT(3, "SNID - Device %04x on Subchannel 0.%x.%04x," " lpm %02X, became 'not operational'\n", cdev->private->dev_id.devno, sch->schid.ssid, sch->schid.sch_no, lpm); @@ -275,7 +275,7 @@ __ccw_device_do_pgid(struct ccw_device *cdev, __u8 func) return ret; } /* PGID command failed on this path. */ - CIO_MSG_EVENT(2, "SPID - Device %04x on Subchannel " + CIO_MSG_EVENT(3, "SPID - Device %04x on Subchannel " "0.%x.%04x, lpm %02X, became 'not operational'\n", cdev->private->dev_id.devno, sch->schid.ssid, sch->schid.sch_no, cdev->private->imask); @@ -317,7 +317,7 @@ static int __ccw_device_do_nop(struct ccw_device *cdev) return ret; } /* nop command failed on this path. */ - CIO_MSG_EVENT(2, "NOP - Device %04x on Subchannel " + CIO_MSG_EVENT(3, "NOP - Device %04x on Subchannel " "0.%x.%04x, lpm %02X, became 'not operational'\n", cdev->private->dev_id.devno, sch->schid.ssid, sch->schid.sch_no, cdev->private->imask); @@ -362,7 +362,7 @@ __ccw_device_check_pgid(struct ccw_device *cdev) return -EAGAIN; } if (irb->scsw.cc == 3) { - CIO_MSG_EVENT(2, "SPID - Device %04x on Subchannel 0.%x.%04x," + CIO_MSG_EVENT(3, "SPID - Device %04x on Subchannel 0.%x.%04x," " lpm %02X, became 'not operational'\n", cdev->private->dev_id.devno, sch->schid.ssid, sch->schid.sch_no, cdev->private->imask); @@ -391,7 +391,7 @@ static int __ccw_device_check_nop(struct ccw_device *cdev) return -ETIME; } if (irb->scsw.cc == 3) { - CIO_MSG_EVENT(2, "NOP - Device %04x on Subchannel 0.%x.%04x," + CIO_MSG_EVENT(3, "NOP - Device %04x on Subchannel 0.%x.%04x," " lpm %02X, became 'not operational'\n", cdev->private->dev_id.devno, sch->schid.ssid, sch->schid.sch_no, cdev->private->imask); -- cgit v0.10.2 From 5b8909871b80a6cc2bd21aa5262c1424e3d26339 Mon Sep 17 00:00:00 2001 From: Michael Ernst Date: Wed, 7 May 2008 09:22:55 +0200 Subject: [S390] cio: Fix parsing mechanism for blacklisted devices. New format cssid.ssid.devno is now parsed correctly. 
Signed-off-by: Michael Ernst Signed-off-by: Martin Schwidefsky diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c index 0844476..9c21b8f 100644 --- a/drivers/s390/cio/blacklist.c +++ b/drivers/s390/cio/blacklist.c @@ -19,6 +19,7 @@ #include #include +#include #include "blacklist.h" #include "cio.h" @@ -43,163 +44,169 @@ typedef enum {add, free} range_action; * Function: blacklist_range * (Un-)blacklist the devices from-to */ -static void -blacklist_range (range_action action, unsigned int from, unsigned int to, - unsigned int ssid) +static int blacklist_range(range_action action, unsigned int from_ssid, + unsigned int to_ssid, unsigned int from, + unsigned int to, int msgtrigger) { - if (!to) - to = from; - - if (from > to || to > __MAX_SUBCHANNEL || ssid > __MAX_SSID) { - printk (KERN_WARNING "cio: Invalid blacklist range " - "0.%x.%04x to 0.%x.%04x, skipping\n", - ssid, from, ssid, to); - return; + if ((from_ssid > to_ssid) || ((from_ssid == to_ssid) && (from > to))) { + if (msgtrigger) + printk(KERN_WARNING "cio: Invalid cio_ignore range " + "0.%x.%04x-0.%x.%04x\n", from_ssid, from, + to_ssid, to); + return 1; } - for (; from <= to; from++) { + + while ((from_ssid < to_ssid) || ((from_ssid == to_ssid) && + (from <= to))) { if (action == add) - set_bit (from, bl_dev[ssid]); + set_bit(from, bl_dev[from_ssid]); else - clear_bit (from, bl_dev[ssid]); + clear_bit(from, bl_dev[from_ssid]); + from++; + if (from > __MAX_SUBCHANNEL) { + from_ssid++; + from = 0; + } } + + return 0; } -/* - * Function: blacklist_busid - * Get devno/busid from given string. - * Shamelessly grabbed from dasd_devmap.c. - */ -static int -blacklist_busid(char **str, int *id0, int *ssid, int *devno) +static int pure_hex(char **cp, unsigned int *val, int min_digit, + int max_digit, int max_val) { - int val, old_style; - char *sav; + int diff; + unsigned int value; - sav = *str; + diff = 0; + *val = 0; - /* check for leading '0x' */ - old_style = 0; - if ((*str)[0] == '0' && (*str)[1] == 'x') { - *str += 2; - old_style = 1; - } - if (!isxdigit((*str)[0])) /* We require at least one hex digit */ - goto confused; - val = simple_strtoul(*str, str, 16); - if (old_style || (*str)[0] != '.') { - *id0 = *ssid = 0; - if (val < 0 || val > 0xffff) - goto confused; - *devno = val; - if ((*str)[0] != ',' && (*str)[0] != '-' && - (*str)[0] != '\n' && (*str)[0] != '\0') - goto confused; - return 0; + while (isxdigit(**cp) && (diff <= max_digit)) { + + if (isdigit(**cp)) + value = **cp - '0'; + else + value = tolower(**cp) - 'a' + 10; + *val = *val * 16 + value; + (*cp)++; + diff++; } - /* New style x.y.z busid */ - if (val < 0 || val > 0xff) - goto confused; - *id0 = val; - (*str)++; - if (!isxdigit((*str)[0])) /* We require at least one hex digit */ - goto confused; - val = simple_strtoul(*str, str, 16); - if (val < 0 || val > 0xff || (*str)++[0] != '.') - goto confused; - *ssid = val; - if (!isxdigit((*str)[0])) /* We require at least one hex digit */ - goto confused; - val = simple_strtoul(*str, str, 16); - if (val < 0 || val > 0xffff) - goto confused; - *devno = val; - if ((*str)[0] != ',' && (*str)[0] != '-' && - (*str)[0] != '\n' && (*str)[0] != '\0') - goto confused; + + if ((diff < min_digit) || (diff > max_digit) || (*val > max_val)) + return 1; + return 0; -confused: - strsep(str, ",\n"); - printk(KERN_WARNING "cio: Invalid cio_ignore parameter '%s'\n", sav); - return 1; } -static int -blacklist_parse_parameters (char *str, range_action action) +static int parse_busid(char *str, int *cssid, int *ssid, 
int *devno, + int msgtrigger) { - int from, to, from_id0, to_id0, from_ssid, to_ssid; - - while (*str != 0 && *str != '\n') { - range_action ra = action; - while(*str == ',') - str++; - if (*str == '!') { - ra = !action; - ++str; + char *str_work; + int val, rc, ret; + + rc = 1; + + if (*str == '\0') + goto out; + + /* old style */ + str_work = str; + val = simple_strtoul(str, &str_work, 16); + + if (*str_work == '\0') { + if (val <= __MAX_SUBCHANNEL) { + *devno = val; + *ssid = 0; + *cssid = 0; + rc = 0; } + goto out; + } - /* - * Since we have to parse the proc commands and the - * kernel arguments we have to check four cases - */ - if (strncmp(str,"all,",4) == 0 || strcmp(str,"all") == 0 || - strncmp(str,"all\n",4) == 0 || strncmp(str,"all ",4) == 0) { - int j; - - str += 3; - for (j=0; j <= __MAX_SSID; j++) - blacklist_range(ra, 0, __MAX_SUBCHANNEL, j); - } else { - int rc; + /* new style */ + str_work = str; + ret = pure_hex(&str_work, cssid, 1, 2, __MAX_CSSID); + if (ret || (str_work[0] != '.')) + goto out; + str_work++; + ret = pure_hex(&str_work, ssid, 1, 1, __MAX_SSID); + if (ret || (str_work[0] != '.')) + goto out; + str_work++; + ret = pure_hex(&str_work, devno, 4, 4, __MAX_SUBCHANNEL); + if (ret || (str_work[0] != '\0')) + goto out; + + rc = 0; +out: + if (rc && msgtrigger) + printk(KERN_WARNING "cio: Invalid cio_ignore device '%s'\n", + str); + + return rc; +} - rc = blacklist_busid(&str, &from_id0, - &from_ssid, &from); - if (rc) - continue; - to = from; - to_id0 = from_id0; - to_ssid = from_ssid; - if (*str == '-') { - str++; - rc = blacklist_busid(&str, &to_id0, - &to_ssid, &to); - if (rc) - continue; - } - if (*str == '-') { - printk(KERN_WARNING "cio: invalid cio_ignore " - "parameter '%s'\n", - strsep(&str, ",\n")); - continue; - } - if ((from_id0 != to_id0) || - (from_ssid != to_ssid)) { - printk(KERN_WARNING "cio: invalid cio_ignore " - "range %x.%x.%04x-%x.%x.%04x\n", - from_id0, from_ssid, from, - to_id0, to_ssid, to); - continue; +static int blacklist_parse_parameters(char *str, range_action action, + int msgtrigger) +{ + int from_cssid, to_cssid, from_ssid, to_ssid, from, to; + int rc, totalrc; + char *parm; + range_action ra; + + totalrc = 0; + + while ((parm = strsep(&str, ","))) { + rc = 0; + ra = action; + if (*parm == '!') { + if (ra == add) + ra = free; + else + ra = add; + parm++; + } + if (strcmp(parm, "all") == 0) { + from_cssid = 0; + from_ssid = 0; + from = 0; + to_cssid = __MAX_CSSID; + to_ssid = __MAX_SSID; + to = __MAX_SUBCHANNEL; + } else { + rc = parse_busid(strsep(&parm, "-"), &from_cssid, + &from_ssid, &from, msgtrigger); + if (!rc) { + if (parm != NULL) + rc = parse_busid(parm, &to_cssid, + &to_ssid, &to, + msgtrigger); + else { + to_cssid = from_cssid; + to_ssid = from_ssid; + to = from; + } } - blacklist_range (ra, from, to, to_ssid); } + if (!rc) { + rc = blacklist_range(ra, from_ssid, to_ssid, from, to, + msgtrigger); + if (rc) + totalrc = 1; + } else + totalrc = 1; } - return 1; + + return totalrc; } -/* Parsing the commandline for blacklist parameters, e.g. to blacklist - * bus ids 0.0.1234, 0.0.1235 and 0.0.1236, you could use any of: - * - cio_ignore=1234-1236 - * - cio_ignore=0x1234-0x1235,1236 - * - cio_ignore=0x1234,1235-1236 - * - cio_ignore=1236 cio_ignore=1234-0x1236 - * - cio_ignore=1234 cio_ignore=1236 cio_ignore=0x1235 - * - cio_ignore=0.0.1234-0.0.1236 - * - cio_ignore=0.0.1234,0x1235,1236 - * - ... 
- */ static int __init blacklist_setup (char *str) { - return blacklist_parse_parameters (str, add); + CIO_MSG_EVENT(6, "Reading blacklist parameters\n"); + if (blacklist_parse_parameters(str, add, 1)) + return 0; + return 1; } __setup ("cio_ignore=", blacklist_setup); @@ -223,27 +230,23 @@ is_blacklisted (int ssid, int devno) * Function: blacklist_parse_proc_parameters * parse the stuff which is piped to /proc/cio_ignore */ -static void -blacklist_parse_proc_parameters (char *buf) +static int blacklist_parse_proc_parameters(char *buf) { - if (strncmp (buf, "free ", 5) == 0) { - blacklist_parse_parameters (buf + 5, free); - } else if (strncmp (buf, "add ", 4) == 0) { - /* - * We don't need to check for known devices since - * css_probe_device will handle this correctly. - */ - blacklist_parse_parameters (buf + 4, add); - } else { - printk (KERN_WARNING "cio: cio_ignore: Parse error; \n" - KERN_WARNING "try using 'free all|," - ",...'\n" - KERN_WARNING "or 'add ," - ",...'\n"); - return; - } + int rc; + char *parm; + + parm = strsep(&buf, " "); + + if (strcmp("free", parm) == 0) + rc = blacklist_parse_parameters(buf, free, 0); + else if (strcmp("add", parm) == 0) + rc = blacklist_parse_parameters(buf, add, 0); + else + return 1; css_schedule_reprobe(); + + return rc; } /* Iterator struct for all devices. */ @@ -327,6 +330,8 @@ cio_ignore_write(struct file *file, const char __user *user_buf, size_t user_len, loff_t *offset) { char *buf; + size_t i; + ssize_t rc, ret; if (*offset) return -EINVAL; @@ -335,16 +340,27 @@ cio_ignore_write(struct file *file, const char __user *user_buf, buf = vmalloc (user_len + 1); /* maybe better use the stack? */ if (buf == NULL) return -ENOMEM; + memset(buf, 0, user_len + 1); + if (strncpy_from_user (buf, user_buf, user_len) < 0) { - vfree (buf); - return -EFAULT; + rc = -EFAULT; + goto out_free; } - buf[user_len] = '\0'; - blacklist_parse_proc_parameters (buf); + i = user_len - 1; + while ((i >= 0) && (isspace(buf[i]) || (buf[i] == 0))) { + buf[i] = '\0'; + i--; + } + ret = blacklist_parse_proc_parameters(buf); + if (ret) + rc = -EINVAL; + else + rc = user_len; +out_free: vfree (buf); - return user_len; + return rc; } static const struct seq_operations cio_ignore_proc_seq_ops = { -- cgit v0.10.2 From c6ca1850e78d60c299ceb4c240a04af9e2384f70 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 7 May 2008 09:22:56 +0200 Subject: [S390] s390mach compile warning Fix the following compile warning: drivers/s390/s390mach.c: In function 's390_collect_crw_info': drivers/s390/s390mach.c:77: warning: ignoring return value of 'down_interruptibl Signed-off-by: Martin Schwidefsky diff --git a/drivers/s390/s390mach.c b/drivers/s390/s390mach.c index 4d4b542..5080f34 100644 --- a/drivers/s390/s390mach.c +++ b/drivers/s390/s390mach.c @@ -48,10 +48,11 @@ s390_collect_crw_info(void *param) int ccode; struct semaphore *sem; unsigned int chain; + int ignore; sem = (struct semaphore *)param; repeat: - down_interruptible(sem); + ignore = down_interruptible(sem); chain = 0; while (1) { if (unlikely(chain > 1)) { -- cgit v0.10.2 From b499d76bfd78e900039155247e1c21bfdf807b7b Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 7 May 2008 09:22:57 +0200 Subject: [S390] compat ptrace cleanup This removes redundant arch code for generic ptrace requests already handled by ptrace_request and compat_ptrace_request. It simplifies things to just have the standard entry points, and use the generic compat_sys_ptrace. 
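Editorial sketch, not the patch itself: after this cleanup the per-arch entry point keeps only the genuinely architecture-specific requests and hands everything else to the generic code. Roughly, with the arch register accessors peek_user()/poke_user() left as placeholders rather than spelled out:

	/* Simplified shape of the delegation pattern described above. */
	#include <linux/ptrace.h>
	#include <linux/sched.h>

	long arch_ptrace(struct task_struct *child, long request,
			 long addr, long data)
	{
		switch (request) {
		case PTRACE_PEEKUSR:		/* arch register read */
			return peek_user(child, addr, data);
		case PTRACE_POKEUSR:		/* arch register write */
			return poke_user(child, addr, data);
		default:			/* PEEKTEXT, CONT, KILL, ... */
			return ptrace_request(child, request, addr, data);
		}
	}

The compat variant mirrors this and ends in compat_ptrace_request(); defining __ARCH_WANT_COMPAT_SYS_PTRACE is what lets s390 use the generic compat_sys_ptrace entry point, as the diff below shows.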
Signed-off-by: Roland McGrath Signed-off-by: Martin Schwidefsky diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 743d54f..d003a6e 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -121,7 +121,7 @@ sys32_ptrace_wrapper: lgfr %r3,%r3 # long llgtr %r4,%r4 # long llgfr %r5,%r5 # long - jg sys_ptrace # branch to system call + jg compat_sys_ptrace # branch to system call .globl sys32_alarm_wrapper sys32_alarm_wrapper: diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 7f42701..35827b9 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -292,8 +292,7 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data) return 0; } -static int -do_ptrace_normal(struct task_struct *child, long request, long addr, long data) +long arch_ptrace(struct task_struct *child, long request, long addr, long data) { ptrace_area parea; int copied, ret; @@ -529,35 +528,19 @@ poke_user_emu31(struct task_struct *child, addr_t addr, addr_t data) return 0; } -static int -do_ptrace_emu31(struct task_struct *child, long request, long addr, long data) +long compat_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) { - unsigned int tmp; /* 4 bytes !! */ + unsigned long addr = caddr; + unsigned long data = cdata; ptrace_area_emu31 parea; int copied, ret; switch (request) { - case PTRACE_PEEKTEXT: - case PTRACE_PEEKDATA: - /* read word at location addr. */ - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); - if (copied != sizeof(tmp)) - return -EIO; - return put_user(tmp, (unsigned int __force __user *) data); - case PTRACE_PEEKUSR: /* read the word at location addr in the USER area. */ return peek_user_emu31(child, addr, data); - case PTRACE_POKETEXT: - case PTRACE_POKEDATA: - /* write the word at location addr. */ - tmp = data; - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 1); - if (copied != sizeof(tmp)) - return -EIO; - return 0; - case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ return poke_user_emu31(child, addr, data); @@ -587,82 +570,11 @@ do_ptrace_emu31(struct task_struct *child, long request, long addr, long data) copied += sizeof(unsigned int); } return 0; - case PTRACE_GETEVENTMSG: - return put_user((__u32) child->ptrace_message, - (unsigned int __force __user *) data); - case PTRACE_GETSIGINFO: - if (child->last_siginfo == NULL) - return -EINVAL; - return copy_siginfo_to_user32((compat_siginfo_t - __force __user *) data, - child->last_siginfo); - case PTRACE_SETSIGINFO: - if (child->last_siginfo == NULL) - return -EINVAL; - return copy_siginfo_from_user32(child->last_siginfo, - (compat_siginfo_t - __force __user *) data); } - return ptrace_request(child, request, addr, data); + return compat_ptrace_request(child, request, addr, data); } #endif -long arch_ptrace(struct task_struct *child, long request, long addr, long data) -{ - switch (request) { - case PTRACE_SYSCALL: - /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: - /* restart after signal. */ - if (!valid_signal(data)) - return -EIO; - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - child->exit_code = data; - /* make sure the single step bit is not set. */ - user_disable_single_step(child); - wake_up_process(child); - return 0; - - case PTRACE_KILL: - /* - * make the child exit. 
Best I can do is send it a sigkill. - * perhaps it should be put in the status that it wants to - * exit. - */ - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - return 0; - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. */ - user_disable_single_step(child); - wake_up_process(child); - return 0; - - case PTRACE_SINGLESTEP: - /* set the trap flag. */ - if (!valid_signal(data)) - return -EIO; - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - child->exit_code = data; - user_enable_single_step(child); - /* give it a chance to run. */ - wake_up_process(child); - return 0; - - /* Do requests that differ for 31/64 bit */ - default: -#ifdef CONFIG_COMPAT - if (test_thread_flag(TIF_31BIT)) - return do_ptrace_emu31(child, request, addr, data); -#endif - return do_ptrace_normal(child, request, addr, data); - } - /* Not reached. */ - return -EIO; -} - asmlinkage void syscall_trace(struct pt_regs *regs, int entryexit) { diff --git a/include/asm-s390/ptrace.h b/include/asm-s390/ptrace.h index 441d7c2..d7d4e2e 100644 --- a/include/asm-s390/ptrace.h +++ b/include/asm-s390/ptrace.h @@ -471,6 +471,8 @@ struct task_struct; extern void user_enable_single_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); +#define __ARCH_WANT_COMPAT_SYS_PTRACE + #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define regs_return_value(regs)((regs)->gprs[2]) -- cgit v0.10.2 From 74c76c84576eb2d806f40f6cb2fc8302c01869d8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 7 May 2008 09:22:58 +0200 Subject: [S390] tty3270: fix put_char fail/success conversion. The wrong function got coverted ;) CC drivers/s390/char/tty3270.o drivers/s390/char/tty3270.c:1747: warning: initialization from incompatible pointer type Acked-by: Alan Cox Cc: Peter Oberparleiter Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky diff --git a/drivers/s390/char/tty3270.c b/drivers/s390/char/tty3270.c index c1f2ade..5043150 100644 --- a/drivers/s390/char/tty3270.c +++ b/drivers/s390/char/tty3270.c @@ -965,8 +965,7 @@ tty3270_write_room(struct tty_struct *tty) * Insert character into the screen at the current position with the * current color and highlight. This function does NOT do cursor movement. 
*/ -static int -tty3270_put_character(struct tty3270 *tp, char ch) +static void tty3270_put_character(struct tty3270 *tp, char ch) { struct tty3270_line *line; struct tty3270_cell *cell; @@ -986,7 +985,6 @@ tty3270_put_character(struct tty3270 *tp, char ch) cell->character = tp->view.ascebc[(unsigned int) ch]; cell->highlight = tp->highlight; cell->f_color = tp->f_color; - return 1; } /* @@ -1612,16 +1610,15 @@ tty3270_write(struct tty_struct * tty, /* * Put single characters to the ttys character buffer */ -static void -tty3270_put_char(struct tty_struct *tty, unsigned char ch) +static int tty3270_put_char(struct tty_struct *tty, unsigned char ch) { struct tty3270 *tp; tp = tty->driver_data; - if (!tp) - return; - if (tp->char_count < TTY3270_CHAR_BUF_SIZE) - tp->char_buf[tp->char_count++] = ch; + if (!tp || tp->char_count >= TTY3270_CHAR_BUF_SIZE) + return 0; + tp->char_buf[tp->char_count++] = ch; + return 1; } /* -- cgit v0.10.2 From 45e576b1c3d0020607b8666c0247164e92c7d719 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 7 May 2008 09:22:59 +0200 Subject: [S390] guest page hinting light Use the existing arch_alloc_page/arch_free_page callbacks to do the guest page state transitions between stable and unused. Acked-by: Rik van Riel Signed-off-by: Martin Schwidefsky diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 29a7940..1d03508 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -430,6 +430,13 @@ config CMM_IUCV Select this option to enable the special message interface to the cooperative memory management. +config PAGE_STATES + bool "Unused page notification" + help + This enables the notification of unused pages to the + hypervisor. The ESSA instruction is used to do the states + changes between a page that has content and the unused state. + config VIRT_TIMER bool "Virtual CPU timer support" help diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index fb988a4..2a74581 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -5,3 +5,4 @@ obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_PAGE_STATES) += page-states.o diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index fa31de6..29f3a63 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -126,6 +126,9 @@ void __init mem_init(void) /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); + /* Setup guest page hinting */ + cmma_init(); + /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c new file mode 100644 index 0000000..fc0ad73 --- /dev/null +++ b/arch/s390/mm/page-states.c @@ -0,0 +1,79 @@ +/* + * arch/s390/mm/page-states.c + * + * Copyright IBM Corp. 2008 + * + * Guest page hinting for unused pages. 
+ * + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include + +#define ESSA_SET_STABLE 1 +#define ESSA_SET_UNUSED 2 + +static int cmma_flag; + +static int __init cmma(char *str) +{ + char *parm; + parm = strstrip(str); + if (strcmp(parm, "yes") == 0 || strcmp(parm, "on") == 0) { + cmma_flag = 1; + return 1; + } + cmma_flag = 0; + if (strcmp(parm, "no") == 0 || strcmp(parm, "off") == 0) + return 1; + return 0; +} + +__setup("cmma=", cmma); + +void __init cmma_init(void) +{ + register unsigned long tmp asm("0") = 0; + register int rc asm("1") = -EOPNOTSUPP; + + if (!cmma_flag) + return; + asm volatile( + " .insn rrf,0xb9ab0000,%1,%1,0,0\n" + "0: la %0,0\n" + "1:\n" + EX_TABLE(0b,1b) + : "+&d" (rc), "+&d" (tmp)); + if (rc) + cmma_flag = 0; +} + +void arch_free_page(struct page *page, int order) +{ + int i, rc; + + if (!cmma_flag) + return; + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" ((page_to_pfn(page) + i) << PAGE_SHIFT), + "i" (ESSA_SET_UNUSED)); +} + +void arch_alloc_page(struct page *page, int order) +{ + int i, rc; + + if (!cmma_flag) + return; + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" ((page_to_pfn(page) + i) << PAGE_SHIFT), + "i" (ESSA_SET_STABLE)); +} diff --git a/include/asm-s390/page.h b/include/asm-s390/page.h index f0f4579..12fd9c4 100644 --- a/include/asm-s390/page.h +++ b/include/asm-s390/page.h @@ -125,6 +125,17 @@ page_get_storage_key(unsigned long addr) return skey; } +#ifdef CONFIG_PAGE_STATES + +struct page; +void arch_free_page(struct page *page, int order); +void arch_alloc_page(struct page *page, int order); + +#define HAVE_ARCH_FREE_PAGE +#define HAVE_ARCH_ALLOC_PAGE + +#endif + #endif /* !__ASSEMBLY__ */ /* to align the pointer to the (next) page boundary */ diff --git a/include/asm-s390/system.h b/include/asm-s390/system.h index c819ae2..e0d4500 100644 --- a/include/asm-s390/system.h +++ b/include/asm-s390/system.h @@ -116,6 +116,12 @@ extern void pfault_fini(void); #define pfault_fini() do { } while (0) #endif /* CONFIG_PFAULT */ +#ifdef CONFIG_PAGE_STATES +extern void cmma_init(void); +#else +static inline void cmma_init(void) { } +#endif + #define finish_arch_switch(prev) do { \ set_fs(current->thread.mm_segment); \ account_vtime(prev); \ -- cgit v0.10.2 From aa94b5371f6f898558d9fa5690cc6e4bf917a572 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 May 2008 09:27:43 +0200 Subject: block: adjust tagging function queue bit locking For most initialization purposes, calling blk_queue_init_tags() without the queue lock held is OK. Only if called for resizing an existing map must the lock be held. Ditto for tag cleanup, the maps are reference counted. So switch the general queue flag setting to the unlocked variant, but retain the locked variant for resizing. 
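A brief illustration of the calling convention this change establishes; the surrounding driver code and the depth values are invented for the example, only blk_queue_init_tags() and the locking rule come from the patch:

	/* Sketch: initial tag setup needs no queue lock, resizing an
	 * existing map does. */
	#include <linux/blkdev.h>

	static int example_setup_tags(struct request_queue *q)
	{
		/* first-time init: the unlocked queue_flag_* variants are
		 * used internally, so q->queue_lock need not be held */
		return blk_queue_init_tags(q, 64, NULL);
	}

	static int example_resize_tags(struct request_queue *q, int depth)
	{
		int ret;

		spin_lock_irq(q->queue_lock);	/* resizing an existing map */
		ret = blk_queue_init_tags(q, depth, NULL);
		spin_unlock_irq(q->queue_lock);
		return ret;
	}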
Signed-off-by: Jens Axboe diff --git a/block/blk-tag.c b/block/blk-tag.c index de64e04..32667be 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -70,7 +70,7 @@ void __blk_queue_free_tags(struct request_queue *q) __blk_free_tags(bqt); q->queue_tags = NULL; - queue_flag_clear(QUEUE_FLAG_QUEUED, q); + queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q); } /** @@ -98,7 +98,7 @@ EXPORT_SYMBOL(blk_free_tags); **/ void blk_queue_free_tags(struct request_queue *q) { - queue_flag_clear(QUEUE_FLAG_QUEUED, q); + queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q); } EXPORT_SYMBOL(blk_queue_free_tags); @@ -171,6 +171,9 @@ EXPORT_SYMBOL(blk_init_tags); * @q: the request queue for the device * @depth: the maximum queue depth supported * @tags: the tag to use + * + * Queue lock must be held here if the function is called to resize an + * existing map. **/ int blk_queue_init_tags(struct request_queue *q, int depth, struct blk_queue_tag *tags) @@ -197,7 +200,7 @@ int blk_queue_init_tags(struct request_queue *q, int depth, * assign it, all done */ q->queue_tags = tags; - queue_flag_set(QUEUE_FLAG_QUEUED, q); + queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q); INIT_LIST_HEAD(&q->tag_busy_list); return 0; fail: -- cgit v0.10.2 From 07416d29bcf608257f1e5280642dcbe0021518a3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 May 2008 09:17:12 +0200 Subject: cfq-iosched: fix RCU race in the cfq io_context destructor handling put_io_context() drops the RCU read lock before calling into cfq_dtor(), however we need to hold off freeing there before grabbing and dereferencing the first object on the list. So extend the rcu_read_lock() scope to cover the calling of cfq_dtor(), and optimize cfq_free_io_context() to use a new variant for call_for_each_cic() that assumes the RCU read lock is already held. Hit in the wild by Alexey Dobriyan Signed-off-by: Jens Axboe diff --git a/block/blk-ioc.c b/block/blk-ioc.c index e34df7c..012f065 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -41,8 +41,8 @@ int put_io_context(struct io_context *ioc) rcu_read_lock(); if (ioc->aic && ioc->aic->dtor) ioc->aic->dtor(ioc->aic); - rcu_read_unlock(); cfq_dtor(ioc); + rcu_read_unlock(); kmem_cache_free(iocontext_cachep, ioc); return 1; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f4e1006..7f909d2 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1142,6 +1142,17 @@ static void cfq_put_queue(struct cfq_queue *cfqq) kmem_cache_free(cfq_pool, cfqq); } +static void +__call_for_each_cic(struct io_context *ioc, + void (*func)(struct io_context *, struct cfq_io_context *)) +{ + struct cfq_io_context *cic; + struct hlist_node *n; + + hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) + func(ioc, cic); +} + /* * Call func for each cic attached to this ioc. */ @@ -1149,12 +1160,8 @@ static void call_for_each_cic(struct io_context *ioc, void (*func)(struct io_context *, struct cfq_io_context *)) { - struct cfq_io_context *cic; - struct hlist_node *n; - rcu_read_lock(); - hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) - func(ioc, cic); + __call_for_each_cic(ioc, func); rcu_read_unlock(); } @@ -1198,7 +1205,7 @@ static void cfq_free_io_context(struct io_context *ioc) * should be ok to iterate over the known list, we will see all cic's * since no new ones are added. 
*/ - call_for_each_cic(ioc, cic_free_func); + __call_for_each_cic(ioc, cic_free_func); } static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) -- cgit v0.10.2 From 7f3d4ee108c184ab215036051087aaaaa8de7661 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 7 May 2008 09:22:39 +0200 Subject: vfs: splice remove_suid() cleanup generic_file_splice_write() duplicates remove_suid() just because it doesn't hold i_mutex. But it grabs i_mutex inside splice_from_pipe() anyway, so this is rather pointless. Move locking to generic_file_splice_write() and call remove_suid() and __splice_from_pipe() instead. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index 633f58e..cece15b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -811,24 +811,19 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, { struct address_space *mapping = out->f_mapping; struct inode *inode = mapping->host; - int killsuid, killpriv; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; ssize_t ret; - int err = 0; - - killpriv = security_inode_need_killpriv(out->f_path.dentry); - killsuid = should_remove_suid(out->f_path.dentry); - if (unlikely(killsuid || killpriv)) { - mutex_lock(&inode->i_mutex); - if (killpriv) - err = security_inode_killpriv(out->f_path.dentry); - if (!err && killsuid) - err = __remove_suid(out->f_path.dentry, killsuid); - mutex_unlock(&inode->i_mutex); - if (err) - return err; - } - ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); + inode_double_lock(inode, pipe->inode); + ret = remove_suid(out->f_path.dentry); + if (likely(!ret)) + ret = __splice_from_pipe(pipe, &sd, pipe_to_file); + inode_double_unlock(inode, pipe->inode); if (ret > 0) { unsigned long nr_pages; @@ -840,6 +835,8 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, * sync it. */ if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + mutex_lock(&inode->i_mutex); err = generic_osync_inode(inode, mapping, OSYNC_METADATA|OSYNC_DATA); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e0fa9e..f413085 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1816,7 +1816,6 @@ extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void destroy_inode(struct inode *); extern struct inode *new_inode(struct super_block *); -extern int __remove_suid(struct dentry *, int); extern int should_remove_suid(struct dentry *); extern int remove_suid(struct dentry *); diff --git a/mm/filemap.c b/mm/filemap.c index 239d361..2dead9a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1655,7 +1655,7 @@ int should_remove_suid(struct dentry *dentry) } EXPORT_SYMBOL(should_remove_suid); -int __remove_suid(struct dentry *dentry, int kill) +static int __remove_suid(struct dentry *dentry, int kill) { struct iattr newattrs; -- cgit v0.10.2 From 2cdf79cafbd11580f5b63cd4993b45c1c4952415 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 May 2008 09:33:55 +0200 Subject: block: get rid of likely/unlikely predictions in merge logic They tend to depend a lot on the workload, so not a clear-cut likely or unlikely fit. 
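For context on what is being removed: likely()/unlikely() are thin wrappers around a GCC intrinsic (defined in include/linux/compiler.h) and only pay off when a branch is overwhelmingly one-sided. A standalone sketch, with the macro names changed so it compiles outside the kernel:

	/* Rough equivalents of the kernel's likely()/unlikely() macros. */
	#define my_likely(x)	__builtin_expect(!!(x), 1)
	#define my_unlikely(x)	__builtin_expect(!!(x), 0)

	static int clamp_example(int n)
	{
		/* The hint helps only if n >= 4096 is genuinely rare; for a
		 * workload-dependent test such as BIO_SEG_VALID above it is
		 * better left to the hardware predictor, hence the removal. */
		if (my_unlikely(n >= 4096))
			return 4096;
		return n;
	}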
Signed-off-by: Jens Axboe diff --git a/block/blk-merge.c b/block/blk-merge.c index 73b2356..651136a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -149,9 +149,9 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) { - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) + if (!bio_flagged(bio, BIO_SEG_VALID)) blk_recount_segments(q, bio); - if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) + if (!bio_flagged(nxt, BIO_SEG_VALID)) blk_recount_segments(q, nxt); if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size)) @@ -312,9 +312,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, q->last_merge = NULL; return 0; } - if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) + if (!bio_flagged(req->biotail, BIO_SEG_VALID)) blk_recount_segments(q, req->biotail); - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) + if (!bio_flagged(bio, BIO_SEG_VALID)) blk_recount_segments(q, bio); len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) @@ -352,9 +352,9 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return 0; } len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) + if (!bio_flagged(bio, BIO_SEG_VALID)) blk_recount_segments(q, bio); - if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) + if (!bio_flagged(req->bio, BIO_SEG_VALID)) blk_recount_segments(q, req->bio); if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && !BIOVEC_VIRT_OVERSIZE(len)) { -- cgit v0.10.2 From dbaf2c003e151ad9231778819b0977f95e20e06f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 May 2008 09:48:17 +0200 Subject: block: optimize generic_unplug_device() Original patch from Mikulas Patocka Mike Anderson was doing an OLTP benchmark on a computer with 48 physical disks mapped to one logical device via device mapper. He found that there was a slowdown on request_queue->lock in function generic_unplug_device. The slowdown is caused by the fact that when some code calls unplug on the device mapper, device mapper calls unplug on all physical disks. These unplug calls take the lock, find that the queue is already unplugged, release the lock and exit. With the below patch, performance of the benchmark was increased by 18% (the whole OLTP application, not just block layer microbenchmarks). So I'm submitting this patch for upstream. I think the patch is correct, because when more threads call simultaneously plug and unplug, it is unspecified, if the queue is or isn't plugged (so the patch can't make this worse). And the caller that plugged the queue should unplug it anyway. (if it doesn't, there's 3ms timeout). 
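The change is an instance of the usual opportunistic test before taking the lock. A generic, self-contained sketch follows; the names are invented and only the idea matches the patch:

	#include <linux/spinlock.h>
	#include <linux/bitops.h>

	#define EXAMPLE_PLUGGED	0

	struct example_queue {
		spinlock_t	lock;
		unsigned long	flags;
	};

	static void example_run(struct example_queue *q)
	{
		/* stand-in for the real unplug work */
	}

	static void example_unplug(struct example_queue *q)
	{
		/* Racy but cheap: if the flag reads clear, skip the lock.
		 * As argued above this is safe because the plug state under
		 * concurrency is unspecified anyway and a plug timer will
		 * eventually unplug a queue this test misses. */
		if (!test_bit(EXAMPLE_PLUGGED, &q->flags))
			return;

		spin_lock_irq(&q->lock);
		if (test_and_clear_bit(EXAMPLE_PLUGGED, &q->flags))
			example_run(q);
		spin_unlock_irq(&q->lock);
	}

In the actual change the cheap test is blk_queue_plugged(q) and the locked body is __generic_unplug_device(q), as the diff below shows.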
Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index b754a4a..1b7dddf 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -253,9 +253,11 @@ EXPORT_SYMBOL(__generic_unplug_device); **/ void generic_unplug_device(struct request_queue *q) { - spin_lock_irq(q->queue_lock); - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); + if (blk_queue_plugged(q)) { + spin_lock_irq(q->queue_lock); + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); + } } EXPORT_SYMBOL(generic_unplug_device); -- cgit v0.10.2 From 221e583a735fc5d879d83c2a76b8ee5afcbdf146 Mon Sep 17 00:00:00 2001 From: Rasmus Rohde Date: Wed, 30 Apr 2008 17:22:06 +0200 Subject: udf: Make udf exportable Cc: Christoph Hellwig Signed-off-by: Rasmus Rohde Signed-off-by: Jan Kara diff --git a/fs/udf/namei.c b/fs/udf/namei.c index ba5537d..47a6589 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -32,6 +32,7 @@ #include #include #include +#include static inline int udf_match(int len1, const char *name1, int len2, const char *name2) @@ -158,6 +159,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, sector_t offset; struct extent_position epos = {}; struct udf_inode_info *dinfo = UDF_I(dir); + int isdotdot = dentry->d_name.len == 2 && + dentry->d_name.name[0] == '.' && dentry->d_name.name[1] == '.'; size = udf_ext0_offset(dir) + dir->i_size; f_pos = udf_ext0_offset(dir); @@ -225,6 +228,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, continue; } + if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && + isdotdot) { + brelse(epos.bh); + return fi; + } + if (!lfi) continue; @@ -286,9 +295,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, } } unlock_kernel(); - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static struct fileIdentDesc *udf_add_entry(struct inode *dir, @@ -1232,6 +1240,134 @@ end_rename: return retval; } +static struct dentry *udf_get_parent(struct dentry *child) +{ + struct dentry *parent; + struct inode *inode = NULL; + struct dentry dotdot; + struct fileIdentDesc cfi; + struct udf_fileident_bh fibh; + + dotdot.d_name.name = ".."; + dotdot.d_name.len = 2; + + lock_kernel(); + if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) + goto out_unlock; + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + + inode = udf_iget(child->d_inode->i_sb, + lelb_to_cpu(cfi.icb.extLocation)); + if (!inode) + goto out_unlock; + unlock_kernel(); + + parent = d_alloc_anon(inode); + if (!parent) { + iput(inode); + parent = ERR_PTR(-ENOMEM); + } + + return parent; +out_unlock: + unlock_kernel(); + return ERR_PTR(-EACCES); +} + + +static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, + u16 partref, __u32 generation) +{ + struct inode *inode; + struct dentry *result; + kernel_lb_addr loc; + + if (block == 0) + return ERR_PTR(-ESTALE); + + loc.logicalBlockNum = block; + loc.partitionReferenceNum = partref; + inode = udf_iget(sb, loc); + + if (inode == NULL) + return ERR_PTR(-ENOMEM); + + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + result = d_alloc_anon(inode); + if (!result) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + return result; +} + +static struct dentry *udf_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if ((fh_len != 3 && fh_len != 5) || + (fh_type != FILEID_UDF_WITH_PARENT && + fh_type != FILEID_UDF_WITHOUT_PARENT)) + 
return NULL; + + return udf_nfs_get_inode(sb, fid->udf.block, fid->udf.partref, + fid->udf.generation); +} + +static struct dentry *udf_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if (fh_len != 5 || fh_type != FILEID_UDF_WITH_PARENT) + return NULL; + + return udf_nfs_get_inode(sb, fid->udf.parent_block, + fid->udf.parent_partref, + fid->udf.parent_generation); +} +static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, + int connectable) +{ + int len = *lenp; + struct inode *inode = de->d_inode; + kernel_lb_addr location = UDF_I(inode)->i_location; + struct fid *fid = (struct fid *)fh; + int type = FILEID_UDF_WITHOUT_PARENT; + + if (len < 3 || (connectable && len < 5)) + return 255; + + *lenp = 3; + fid->udf.block = location.logicalBlockNum; + fid->udf.partref = location.partitionReferenceNum; + fid->udf.generation = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + spin_lock(&de->d_lock); + inode = de->d_parent->d_inode; + location = UDF_I(inode)->i_location; + fid->udf.parent_block = location.logicalBlockNum; + fid->udf.parent_partref = location.partitionReferenceNum; + fid->udf.parent_generation = inode->i_generation; + spin_unlock(&de->d_lock); + *lenp = 5; + type = FILEID_UDF_WITH_PARENT; + } + + return type; +} + +const struct export_operations udf_export_ops = { + .encode_fh = udf_encode_fh, + .fh_to_dentry = udf_fh_to_dentry, + .fh_to_parent = udf_fh_to_parent, + .get_parent = udf_get_parent, +}; + const struct inode_operations udf_dir_inode_operations = { .lookup = udf_lookup, .create = udf_create, diff --git a/fs/udf/super.c b/fs/udf/super.c index b564fc1..260f4b82 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1933,6 +1933,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* Fill in the rest of the superblock */ sb->s_op = &udf_sb_ops; + sb->s_export_op = &udf_export_ops; sb->dq_op = NULL; sb->s_dirt = 0; sb->s_magic = UDF_SUPER_MAGIC; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index f3f45d0..8fa9c2d 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -73,6 +73,7 @@ struct task_struct; struct buffer_head; struct super_block; +extern const struct export_operations udf_export_ops; extern const struct inode_operations udf_dir_inode_operations; extern const struct file_operations udf_dir_operations; extern const struct inode_operations udf_file_inode_operations; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index de8387b..f5abd13 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -33,6 +33,19 @@ enum fid_type { * 32 bit parent directory inode number. */ FILEID_INO32_GEN_PARENT = 2, + + /* + * 32 bit block number, 16 bit partition reference, + * 16 bit unused, 32 bit generation number. 
+ */ + FILEID_UDF_WITHOUT_PARENT = 0x51, + + /* + * 32 bit block number, 16 bit partition reference, + * 16 bit unused, 32 bit generation number, + * 32 bit parent block number, 32 bit parent generation number + */ + FILEID_UDF_WITH_PARENT = 0x52, }; struct fid { @@ -43,6 +56,14 @@ struct fid { u32 parent_ino; u32 parent_gen; } i32; + struct { + u32 block; + u16 partref; + u16 parent_partref; + u32 generation; + u32 parent_block; + u32 parent_generation; + } udf; __u32 raw[0]; }; }; -- cgit v0.10.2 From 9afadc4b1fd25337003832c9a4668f9bd42cdda9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 6 May 2008 18:26:17 +0200 Subject: udf: Fix memory corruption when fs mounted with noadinicb option When UDF filesystem is mounted with noadinicb mount option, it happens that we extend an empty directory with a block. A code in udf_add_entry() didn't count with this possibility and used uninitialized data leading to memory and filesystem corruption. Add a check whether file already has some extents before operating on them. Signed-off-by: Jan Kara diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 47a6589..3d94bc1 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -315,7 +315,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, uint16_t liu; int block; kernel_lb_addr eloc; - uint32_t elen; + uint32_t elen = 0; sector_t offset; struct extent_position epos = {}; struct udf_inode_info *dinfo; @@ -406,7 +406,8 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, } add: - if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + /* Is there any extent whose size we need to round up? */ + if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) { elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) epos.offset -= sizeof(short_ad); -- cgit v0.10.2 From 6d63c275572d1e6f00d4fa154f16fbb0d8c2d2bf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 May 2008 09:51:23 +0200 Subject: cfq-iosched: make io priorities inherit CPU scheduling class as well as nice We currently set all processes to the best-effort scheduling class, regardless of what CPU scheduling class they belong to. Improve that so that we correctly track idle and rt scheduling classes as well. Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7f909d2..b399c62 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1303,10 +1303,10 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); case IOPRIO_CLASS_NONE: /* - * no prio set, place us in the middle of the BE classes + * no prio set, inherit CPU scheduling settings */ cfqq->ioprio = task_nice_ioprio(tsk); - cfqq->ioprio_class = IOPRIO_CLASS_BE; + cfqq->ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: cfqq->ioprio = task_ioprio(ioc); diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 2a3bb1b..f98a656 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -68,6 +68,20 @@ static inline int task_nice_ioprio(struct task_struct *task) } /* + * This is for the case where the task hasn't asked for a specific IO class. + * Check for idle and rt task process, and return appropriate IO class. 
+ */ +static inline int task_nice_ioclass(struct task_struct *task) +{ + if (task->policy == SCHED_IDLE) + return IOPRIO_CLASS_IDLE; + else if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) + return IOPRIO_CLASS_RT; + else + return IOPRIO_CLASS_BE; +} + +/* * For inheritance, return the highest of the two given priorities */ extern int ioprio_best(unsigned short aprio, unsigned short bprio); -- cgit v0.10.2 From 28f13702f03e527fcb979747a882cf366c489c50 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 May 2008 10:15:46 +0200 Subject: block: avoid duplicate calls to get_part() in disk stat code get_part() is fairly expensive, as it O(N) loops over partitions to find the right one. In lots of normal IO paths we end up looking up the partition twice, to make matters even worse. Change the stat add code to accept a passed in partition instead. Signed-off-by: Jens Axboe diff --git a/block/blk-core.c b/block/blk-core.c index 1b7dddf..2987fe4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -54,15 +54,16 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); static void drive_stat_acct(struct request *rq, int new_io) { + struct hd_struct *part; int rw = rq_data_dir(rq); if (!blk_fs_request(rq) || !rq->rq_disk) return; - if (!new_io) { - __all_stat_inc(rq->rq_disk, merges[rw], rq->sector); - } else { - struct hd_struct *part = get_part(rq->rq_disk, rq->sector); + part = get_part(rq->rq_disk, rq->sector); + if (!new_io) + __all_stat_inc(rq->rq_disk, part, merges[rw], rq->sector); + else { disk_round_stats(rq->rq_disk); rq->rq_disk->in_flight++; if (part) { @@ -1538,10 +1539,11 @@ static int __end_that_request_first(struct request *req, int error, } if (blk_fs_request(req) && req->rq_disk) { + struct hd_struct *part = get_part(req->rq_disk, req->sector); const int rw = rq_data_dir(req); - all_stat_add(req->rq_disk, sectors[rw], - nr_bytes >> 9, req->sector); + all_stat_add(req->rq_disk, part, sectors[rw], + nr_bytes >> 9, req->sector); } total_bytes = bio_nbytes = 0; @@ -1727,8 +1729,8 @@ static void end_that_request_last(struct request *req, int error) const int rw = rq_data_dir(req); struct hd_struct *part = get_part(disk, req->sector); - __all_stat_inc(disk, ios[rw], req->sector); - __all_stat_add(disk, ticks[rw], duration, req->sector); + __all_stat_inc(disk, part, ios[rw], req->sector); + __all_stat_add(disk, part, ticks[rw], duration, req->sector); disk_round_stats(disk); disk->in_flight--; if (part) { diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 8fc429c..41f818b 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -755,11 +755,13 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector { unsigned long n_sect = bio->bi_size >> 9; const int rw = bio_data_dir(bio); + struct hd_struct *part; - all_stat_inc(disk, ios[rw], sector); - all_stat_add(disk, ticks[rw], duration, sector); - all_stat_add(disk, sectors[rw], n_sect, sector); - all_stat_add(disk, io_ticks, duration, sector); + part = get_part(disk, sector); + all_stat_inc(disk, part, ios[rw], sector); + all_stat_add(disk, part, ticks[rw], duration, sector); + all_stat_add(disk, part, sectors[rw], n_sect, sector); + all_stat_add(disk, part, io_ticks, duration, sector); } void diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ecd2bf6..e9874e7 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -178,17 +178,17 @@ static inline struct hd_struct *get_part(struct gendisk *gendiskp, static inline void 
disk_stat_set_all(struct gendisk *gendiskp, int value) { int i; + for_each_possible_cpu(i) memset(per_cpu_ptr(gendiskp->dkstats, i), value, - sizeof (struct disk_stats)); + sizeof(struct disk_stats)); } #define __part_stat_add(part, field, addnd) \ (per_cpu_ptr(part->dkstats, smp_processor_id())->field += addnd) -#define __all_stat_add(gendiskp, field, addnd, sector) \ +#define __all_stat_add(gendiskp, part, field, addnd, sector) \ ({ \ - struct hd_struct *part = get_part(gendiskp, sector); \ if (part) \ __part_stat_add(part, field, addnd); \ __disk_stat_add(gendiskp, field, addnd); \ @@ -203,11 +203,13 @@ static inline void disk_stat_set_all(struct gendisk *gendiskp, int value) { res; \ }) -static inline void part_stat_set_all(struct hd_struct *part, int value) { +static inline void part_stat_set_all(struct hd_struct *part, int value) +{ int i; + for_each_possible_cpu(i) memset(per_cpu_ptr(part->dkstats, i), value, - sizeof(struct disk_stats)); + sizeof(struct disk_stats)); } #else /* !CONFIG_SMP */ @@ -223,9 +225,8 @@ static inline void disk_stat_set_all(struct gendisk *gendiskp, int value) #define __part_stat_add(part, field, addnd) \ (part->dkstats.field += addnd) -#define __all_stat_add(gendiskp, field, addnd, sector) \ +#define __all_stat_add(gendiskp, part, field, addnd, sector) \ ({ \ - struct hd_struct *part = get_part(gendiskp, sector); \ if (part) \ part->dkstats.field += addnd; \ __disk_stat_add(gendiskp, field, addnd); \ @@ -276,10 +277,10 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) #define part_stat_sub(gendiskp, field, subnd) \ part_stat_add(gendiskp, field, -subnd) -#define all_stat_add(gendiskp, field, addnd, sector) \ +#define all_stat_add(gendiskp, part, field, addnd, sector) \ do { \ preempt_disable(); \ - __all_stat_add(gendiskp, field, addnd, sector); \ + __all_stat_add(gendiskp, part, field, addnd, sector); \ preempt_enable(); \ } while (0) @@ -288,15 +289,15 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) #define all_stat_dec(gendiskp, field, sector) \ all_stat_add(gendiskp, field, -1, sector) -#define __all_stat_inc(gendiskp, field, sector) \ - __all_stat_add(gendiskp, field, 1, sector) -#define all_stat_inc(gendiskp, field, sector) \ - all_stat_add(gendiskp, field, 1, sector) +#define __all_stat_inc(gendiskp, part, field, sector) \ + __all_stat_add(gendiskp, part, field, 1, sector) +#define all_stat_inc(gendiskp, part, field, sector) \ + all_stat_add(gendiskp, part, field, 1, sector) -#define __all_stat_sub(gendiskp, field, subnd, sector) \ - __all_stat_add(gendiskp, field, -subnd, sector) -#define all_stat_sub(gendiskp, field, subnd, sector) \ - all_stat_add(gendiskp, field, -subnd, sector) +#define __all_stat_sub(gendiskp, part, field, subnd, sector) \ + __all_stat_add(gendiskp, part, field, -subnd, sector) +#define all_stat_sub(gendiskp, part, field, subnd, sector) \ + all_stat_add(gendiskp, part, field, -subnd, sector) /* Inlines to alloc and free disk stats in struct gendisk */ #ifdef CONFIG_SMP -- cgit v0.10.2 From 5816339310b2d9623cf413d33e538b45e815da5d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 7 May 2008 02:24:28 -0700 Subject: sparc: Fix mmap VA span checking. We should not conditionalize VA range checks on MAP_FIXED. Signed-off-by: David S. 
Miller diff --git a/arch/sparc/kernel/sys_sparc.c b/arch/sparc/kernel/sys_sparc.c index f188b5d..e995491 100644 --- a/arch/sparc/kernel/sys_sparc.c +++ b/arch/sparc/kernel/sys_sparc.c @@ -223,8 +223,7 @@ int sparc_mmap_check(unsigned long addr, unsigned long len, unsigned long flags) { if (ARCH_SUN4C_SUN4 && (len > 0x20000000 || - ((flags & MAP_FIXED) && - addr < 0xe0000000 && addr + len > 0x20000000))) + (addr < 0xe0000000 && addr + len > 0x20000000))) return -EINVAL; /* See asm-sparc/uaccess.h */ diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c index 8d4761f..0dbc941 100644 --- a/arch/sparc64/kernel/sys_sparc.c +++ b/arch/sparc64/kernel/sys_sparc.c @@ -549,13 +549,13 @@ int sparc64_mmap_check(unsigned long addr, unsigned long len, if (len >= STACK_TOP32) return -EINVAL; - if ((flags & MAP_FIXED) && addr > STACK_TOP32 - len) + if (addr > STACK_TOP32 - len) return -EINVAL; } else { if (len >= VA_EXCLUDE_START) return -EINVAL; - if ((flags & MAP_FIXED) && invalid_64bit_range(addr, len)) + if (invalid_64bit_range(addr, len)) return -EINVAL; } -- cgit v0.10.2 From e5e1d3cb20034a3cbcfff1f0bae12201aa2ce17e Mon Sep 17 00:00:00 2001 From: Stas Sergeev Date: Wed, 7 May 2008 12:39:56 +0200 Subject: pcspkr: fix dependencies fix pcspkr dependencies: make the pcspkr platform drivers depend on a platform device, and not the other way around. Signed-off-by: Stas Sergeev Acked-by: Thomas Gleixner Acked-by: Dmitry Torokhov CC: Vojtech Pavlik CC: Michael Opdenacker [fixed for 2.6.26-rc1 by tiwai] Signed-off-by: Takashi Iwai diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index bbdacb3..5e618c3 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -83,9 +83,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o -ifdef CONFIG_INPUT_PCSPKR -obj-y += pcspeaker.o -endif +obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index 92b6834..3ad8bd9 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -14,7 +14,7 @@ if INPUT_MISC config INPUT_PCSPKR tristate "PC Speaker support" - depends on ALPHA || X86 || MIPS || PPC_PREP || PPC_CHRP || PPC_PSERIES + depends on PCSPKR_PLATFORM depends on SND_PCSP=n help Say Y here if you want the standard PC Speaker to be used for diff --git a/init/Kconfig b/init/Kconfig index 4c33316..3b5adbf 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -634,6 +634,14 @@ config ELF_CORE help Enable support for generating core dumps. Disabling saves about 4k. +config PCSPKR_PLATFORM + bool "Enable PC-Speaker support" if EMBEDDED + depends on ALPHA || X86 || MIPS || PPC_PREP || PPC_CHRP || PPC_PSERIES + default y + help + This option allows to disable the internal PC-Speaker + support, saving some memory.
+ config COMPAT_BRK bool "Disable heap randomization" default y diff --git a/sound/drivers/Kconfig b/sound/drivers/Kconfig index a78a8d0..379bcb0 100644 --- a/sound/drivers/Kconfig +++ b/sound/drivers/Kconfig @@ -5,8 +5,8 @@ menu "Generic devices" config SND_PCSP - tristate "Internal PC speaker support" - depends on X86_PC && HIGH_RES_TIMERS + tristate "PC-Speaker support" + depends on PCSPKR_PLATFORM && X86_PC && HIGH_RES_TIMERS depends on INPUT depends on SND select SND_PCM -- cgit v0.10.2 From eeae1d48c011839d9e1cdc1e8aacf0193c9d8197 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 May 2008 13:26:27 +0200 Subject: block: use unitialized_var() in bio_alloc_bioset() Better than setting idx to some random value and it silences the same bogus gcc warning. Signed-off-by: Jens Axboe diff --git a/fs/bio.c b/fs/bio.c index 799f86d..2fa04ff 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -158,7 +158,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) bio_init(bio); if (likely(nr_iovecs)) { - unsigned long idx = 0; /* shut up gcc */ + unsigned long uninitialized_var(idx); bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); if (unlikely(!bvl)) { -- cgit v0.10.2 From ffee0259c9edcc4c0f06b60df51c461eeecad4c0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 30 Apr 2008 09:08:54 +0200 Subject: docbook: fix bio missing parameter Fix fs/bio.c kernel-doc parameter warning: Warning(linux-2.6.25-git14//fs/bio.c:972): No description found for parameter 'reading' Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe diff --git a/fs/bio.c b/fs/bio.c index 2fa04ff..7856257 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -963,6 +963,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err) * @data: pointer to buffer to copy * @len: length in bytes * @gfp_mask: allocation flags for bio and page allocation + * @reading: data direction is READ * * copy the kernel address into a bio suitable for io to a block * device. Returns an error pointer in case of error. -- cgit v0.10.2 From 5f51efc195dfb860c60fafb4e47fe4b7cad2626d Mon Sep 17 00:00:00 2001 From: Michael Albaugh Date: Wed, 7 May 2008 10:56:47 -0700 Subject: IB/ipath: Only warn about prototype chip during init We warn about prototype chips, but the function that checks for support is also called as a result of a get_portinfo request, which can clutter the logs. Restrict warning to only appear during initialization. Signed-off-by: Michael Albaugh Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c index e3ec0d1..5f693de 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba7220.c +++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c @@ -870,8 +870,9 @@ static int ipath_7220_boardname(struct ipath_devdata *dd, char *name, "revision %u.%u!\n", dd->ipath_majrev, dd->ipath_minrev); ret = 1; - } else if (dd->ipath_minrev == 1) { - /* Rev1 chips are prototype. Complain, but allow use */ + } else if (dd->ipath_minrev == 1 && + !(dd->ipath_flags & IPATH_INITTED)) { + /* Rev1 chips are prototype. 
Complain at init, but allow use */ ipath_dev_err(dd, "Unsupported hardware " "revision %u.%u, Contact support@qlogic.com\n", dd->ipath_majrev, dd->ipath_minrev); -- cgit v0.10.2 From 6e87d1500713767866db0668bbcec75719576f3c Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 7 May 2008 10:57:14 -0700 Subject: IB/ipath: Only increment SSN if WQE is put on send queue If a send work request has immediate errors and is not put on the send queue, we shouldn't update any of the QP state. The increment of the SSN wasn't obeying this. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index e63927c..5015cd2 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -396,7 +396,6 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) wqe = get_swqe_ptr(qp, qp->s_head); wqe->wr = *wr; - wqe->ssn = qp->s_ssn++; wqe->length = 0; if (wr->num_sge) { acc = wr->opcode >= IB_WR_RDMA_READ ? @@ -422,6 +421,7 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) goto bail_inval; } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu) goto bail_inval; + wqe->ssn = qp->s_ssn++; qp->s_head = next; ret = 0; -- cgit v0.10.2 From b4d390d8d219452e5d4257c87134a6934d7fabeb Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Wed, 7 May 2008 10:57:48 -0700 Subject: IB/ipath: Fix bug that can leave sends disabled after freeze recovery The semantics of cancel_sends changed, but the code using it was missed. Don't leave sends and pioavail updates disabled, and add a comment as to why the force update is needed. Signed-off-by: Dave Olson Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 1b58f47..45c4c06 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -933,11 +933,15 @@ void ipath_clear_freeze(struct ipath_devdata *dd) * therefore would not be sent, and eventually * might cause the process to run out of bufs */ - ipath_cancel_sends(dd, 0); + ipath_cancel_sends(dd, 1); ipath_write_kreg(dd, dd->ipath_kregs->kr_control, dd->ipath_control); - /* ensure pio avail updates continue */ + /* + * ensure pio avail updates continue (because the update + * won't have happened from cancel_sends because we were + * still in freeze + */ ipath_force_pio_avail_update(dd); /* -- cgit v0.10.2 From 2bfc8e9edf200aeeca18ee44bcbf6bce65438a42 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 7 May 2008 10:58:50 -0700 Subject: IB/ipath: Return the correct opcode for RDMA WRITE with immediate This patch fixes a bug in the RC responder which generates a completion entry with the wrong opcode when an RDMA WRITE with immediate is received. 
Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c index c405dfb..08b11b5 100644 --- a/drivers/infiniband/hw/ipath/ipath_rc.c +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -1746,7 +1746,11 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, qp->r_wrid_valid = 0; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; - wc.opcode = IB_WC_RECV; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; wc.vendor_err = 0; wc.qp = &qp->ibqp; wc.src_qp = qp->remote_qpn; -- cgit v0.10.2 From 2889d1ef1240591fa4c72a6753e0a8d1c6e18140 Mon Sep 17 00:00:00 2001 From: Michael Albaugh Date: Wed, 7 May 2008 10:59:23 -0700 Subject: IB/ipath: Fix count of packets received by kernel The loop in ipath_kreceive() that processes packets increments the loop-index 'i' once too often, because the exit condition does not depend on it, and is checked after the increment. By adding a check for !last to the iterator in the for loop, we correct that in a way that is not so likely to be re-broken by changes in the loop body. Signed-off-by: Michael Albaugh Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index acf30c0..f81dd4a 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -1197,7 +1197,7 @@ void ipath_kreceive(struct ipath_portdata *pd) } reloop: - for (last = 0, i = 1; !last; i++) { + for (last = 0, i = 1; !last; i += !last) { hdr = dd->ipath_f_get_msgheader(dd, rhf_addr); eflags = ipath_hdrget_err_flags(rhf_addr); etype = ipath_hdrget_rcv_type(rhf_addr); -- cgit v0.10.2 From e2ab41cae418108f376ad1634d7507f56379f7a2 Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Wed, 7 May 2008 11:00:15 -0700 Subject: IB/ipath: Need to always request and handle PIO avail interrupts Now that we always use PIO for vl15 on 7220, we could get stuck forever if we happened to run out of PIO buffers from the verbs code, because the setup code wouldn't run; the interrupt was also ignored if SDMA was supported. We also have to reduce the pio update threshold if we have fewer kernel buffers than the existing threshold. Clean up the initialization a bit to get ordering safer and more sensible, and use the existing ipath_chg_kernavail call to do init, rather than doing it separately. Drop unnecessary clearing of pio buffer on pio parity error. Drop incorrect updating of pioavailshadow when exitting freeze mode (software state may not match chip state if buffer has been allocated and not yet written). If we couldn't get a kernel buffer for a while, make sure we are in sync with hardware, mainly to handle the exitting freeze case. Signed-off-by: Dave Olson Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index f81dd4a..2036d38 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -1428,6 +1428,40 @@ static void ipath_update_pio_bufs(struct ipath_devdata *dd) spin_unlock_irqrestore(&ipath_pioavail_lock, flags); } +/* + * used to force update of pioavailshadow if we can't get a pio buffer. + * Needed primarily due to exitting freeze mode after recovering + * from errors. Done lazily, because it's safer (known to not + * be writing pio buffers). 
+ */ +static void ipath_reset_availshadow(struct ipath_devdata *dd) +{ + int i, im; + unsigned long flags; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (i = 0; i < dd->ipath_pioavregs; i++) { + u64 val, oldval; + /* deal with 6110 chip bug on high register #s */ + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]); + /* + * busy out the buffers not in the kernel avail list, + * without changing the generation bits. + */ + oldval = dd->ipath_pioavailshadow[i]; + dd->ipath_pioavailshadow[i] = val | + ((~dd->ipath_pioavailkernel[i] << + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) & + 0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */ + if (oldval != dd->ipath_pioavailshadow[i]) + ipath_dbg("shadow[%d] was %Lx, now %lx\n", + i, oldval, dd->ipath_pioavailshadow[i]); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); +} + /** * ipath_setrcvhdrsize - set the receive header size * @dd: the infinipath device @@ -1482,9 +1516,12 @@ static noinline void no_pio_bufs(struct ipath_devdata *dd) */ ipath_stats.sps_nopiobufs++; if (!(++dd->ipath_consec_nopiobuf % 100000)) { - ipath_dbg("%u pio sends with no bufavail; dmacopy: " - "%llx %llx %llx %llx; shadow: %lx %lx %lx %lx\n", + ipath_force_pio_avail_update(dd); /* at start */ + ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", dd->ipath_consec_nopiobuf, + (unsigned long)get_cycles(), (unsigned long long) le64_to_cpu(dma[0]), (unsigned long long) le64_to_cpu(dma[1]), (unsigned long long) le64_to_cpu(dma[2]), @@ -1496,14 +1533,17 @@ static noinline void no_pio_bufs(struct ipath_devdata *dd) */ if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > (sizeof(shadow[0]) * 4 * 4)) - ipath_dbg("2nd group: dmacopy: %llx %llx " - "%llx %llx; shadow: %lx %lx %lx %lx\n", + ipath_dbg("2nd group: dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", (unsigned long long)le64_to_cpu(dma[4]), (unsigned long long)le64_to_cpu(dma[5]), (unsigned long long)le64_to_cpu(dma[6]), (unsigned long long)le64_to_cpu(dma[7]), - shadow[4], shadow[5], shadow[6], - shadow[7]); + shadow[4], shadow[5], shadow[6], shadow[7]); + + /* at end, so update likely happened */ + ipath_reset_availshadow(dd); } } @@ -1652,19 +1692,46 @@ void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, unsigned len, int avail) { unsigned long flags; - unsigned end; + unsigned end, cnt = 0, next; /* There are two bits per send buffer (busy and generation) */ start *= 2; - len *= 2; - end = start + len; + end = start + len * 2; - /* Set or clear the generation bits. */ spin_lock_irqsave(&ipath_pioavail_lock, flags); + /* Set or clear the busy bit in the shadow. */ while (start < end) { if (avail) { - __clear_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT, - dd->ipath_pioavailshadow); + unsigned long dma; + int i, im; + /* + * the BUSY bit will never be set, because we disarm + * the user buffers before we hand them back to the + * kernel. We do have to make sure the generation + * bit is set correctly in shadow, since it could + * have changed many times while allocated to user. + * We can't use the bitmap functions on the full + * dma array because it is always little-endian, so + * we have to flip to host-order first. + * BITS_PER_LONG is slightly wrong, since it's + * always 64 bits per register in chip... + * We only work on 64 bit kernels, so that's OK. 
+ */ + /* deal with 6110 chip bug on high register #s */ + i = start / BITS_PER_LONG; + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT + + start, dd->ipath_pioavailshadow); + dma = (unsigned long) le64_to_cpu( + dd->ipath_pioavailregs_dma[im]); + if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start) % BITS_PER_LONG, &dma)) + __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + else + __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); __set_bit(start, dd->ipath_pioavailkernel); } else { __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT, @@ -1673,7 +1740,44 @@ void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, } start += 2; } + + if (dd->ipath_pioupd_thresh) { + end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + next = find_first_bit(dd->ipath_pioavailkernel, end); + while (next < end) { + cnt++; + next = find_next_bit(dd->ipath_pioavailkernel, end, + next + 1); + } + } spin_unlock_irqrestore(&ipath_pioavail_lock, flags); + + /* + * When moving buffers from kernel to user, if number assigned to + * the user is less than the pio update threshold, and threshold + * is supported (cnt was computed > 0), drop the update threshold + * so we update at least once per allocated number of buffers. + * In any case, if the kernel buffers are less than the threshold, + * drop the threshold. We don't bother increasing it, having once + * decreased it, since it would typically just cycle back and forth. + * If we don't decrease below buffers in use, we can wait a long + * time for an update, until some other context uses PIO buffers. + */ + if (!avail && len < cnt) + cnt = len; + if (cnt < dd->ipath_pioupd_thresh) { + dd->ipath_pioupd_thresh = cnt; + ipath_dbg("Decreased pio update threshold to %u\n", + dd->ipath_pioupd_thresh); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK + << INFINIPATH_S_UPDTHRESH_SHIFT); + dd->ipath_sendctrl |= dd->ipath_pioupd_thresh + << INFINIPATH_S_UPDTHRESH_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } } /** diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 8b17522..3295177 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -173,47 +173,25 @@ static int ipath_get_base_info(struct file *fp, (void *) dd->ipath_statusp - (void *) dd->ipath_pioavailregs_dma; if (!shared) { - kinfo->spi_piocnt = dd->ipath_pbufsport; + kinfo->spi_piocnt = pd->port_piocnt; kinfo->spi_piobufbase = (u64) pd->port_piobufs; kinfo->__spi_uregbase = (u64) dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; } else if (master) { - kinfo->spi_piocnt = (dd->ipath_pbufsport / subport_cnt) + - (dd->ipath_pbufsport % subport_cnt); + kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) + + (pd->port_piocnt % subport_cnt); /* Master's PIO buffers are after all the slave's */ kinfo->spi_piobufbase = (u64) pd->port_piobufs + dd->ipath_palign * - (dd->ipath_pbufsport - kinfo->spi_piocnt); + (pd->port_piocnt - kinfo->spi_piocnt); } else { unsigned slave = subport_fp(fp) - 1; - kinfo->spi_piocnt = dd->ipath_pbufsport / subport_cnt; + kinfo->spi_piocnt = pd->port_piocnt / subport_cnt; kinfo->spi_piobufbase = (u64) pd->port_piobufs + dd->ipath_palign * 
kinfo->spi_piocnt * slave; } - /* - * Set the PIO avail update threshold to no larger - * than the number of buffers per process. Note that - * we decrease it here, but won't ever increase it. - */ - if (dd->ipath_pioupd_thresh && - kinfo->spi_piocnt < dd->ipath_pioupd_thresh) { - unsigned long flags; - - dd->ipath_pioupd_thresh = kinfo->spi_piocnt; - ipath_dbg("Decreased pio update threshold to %u\n", - dd->ipath_pioupd_thresh); - spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); - dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK - << INFINIPATH_S_UPDTHRESH_SHIFT); - dd->ipath_sendctrl |= dd->ipath_pioupd_thresh - << INFINIPATH_S_UPDTHRESH_SHIFT; - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - dd->ipath_sendctrl); - spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); - } - if (shared) { kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; @@ -1309,19 +1287,19 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; if (!pd->port_subport_cnt) { /* port is not shared */ - piocnt = dd->ipath_pbufsport; + piocnt = pd->port_piocnt; piobufs = pd->port_piobufs; } else if (!subport_fp(fp)) { /* caller is the master */ - piocnt = (dd->ipath_pbufsport / pd->port_subport_cnt) + - (dd->ipath_pbufsport % pd->port_subport_cnt); + piocnt = (pd->port_piocnt / pd->port_subport_cnt) + + (pd->port_piocnt % pd->port_subport_cnt); piobufs = pd->port_piobufs + - dd->ipath_palign * (dd->ipath_pbufsport - piocnt); + dd->ipath_palign * (pd->port_piocnt - piocnt); } else { unsigned slave = subport_fp(fp) - 1; /* caller is a slave */ - piocnt = dd->ipath_pbufsport / pd->port_subport_cnt; + piocnt = pd->port_piocnt / pd->port_subport_cnt; piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave; } @@ -1633,9 +1611,6 @@ static int try_alloc_port(struct ipath_devdata *dd, int port, port_fp(fp) = pd; pd->port_pid = current->pid; strncpy(pd->port_comm, current->comm, sizeof(pd->port_comm)); - ipath_chg_pioavailkernel(dd, - dd->ipath_pbufsport * (pd->port_port - 1), - dd->ipath_pbufsport, 0); ipath_stats.sps_ports++; ret = 0; } else @@ -1938,11 +1913,25 @@ static int ipath_do_user_init(struct file *fp, /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */ + /* some ports may get extra buffers, calculate that here */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_piocnt = dd->ipath_pbufsport + 1; + else + pd->port_piocnt = dd->ipath_pbufsport; + /* for right now, kernel piobufs are at end, so port 1 is at 0 */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_pio_base = (dd->ipath_pbufsport + 1) + * (pd->port_port - 1); + else + pd->port_pio_base = dd->ipath_ports_extrabuf + + dd->ipath_pbufsport * (pd->port_port - 1); pd->port_piobufs = dd->ipath_piobufbase + - dd->ipath_pbufsport * (pd->port_port - 1) * dd->ipath_palign; - ipath_cdbg(VERBOSE, "Set base of piobufs for port %u to 0x%x\n", - pd->port_port, pd->port_piobufs); + pd->port_pio_base * dd->ipath_palign; + ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u," + " first pio %u\n", pd->port_port, pd->port_piobufs, + pd->port_piocnt, pd->port_pio_base); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0); /* * Now allocate the rcvhdr Q and eager TIDs; skip the TID @@ -2107,7 +2096,6 @@ static int ipath_close(struct inode *in, struct file *fp) } if (dd->ipath_kregbase) { - int i; /* atomically clear receive enable port and intr avail. 
*/ clear_bit(dd->ipath_r_portenable_shift + port, &dd->ipath_rcvctrl); @@ -2136,9 +2124,9 @@ static int ipath_close(struct inode *in, struct file *fp) ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, pd->port_port, dd->ipath_dummy_hdrq_phys); - i = dd->ipath_pbufsport * (port - 1); - ipath_disarm_piobufs(dd, i, dd->ipath_pbufsport); - ipath_chg_pioavailkernel(dd, i, dd->ipath_pbufsport, 1); + ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, + pd->port_piocnt, 1); dd->ipath_f_clear_tids(dd, pd->port_port); diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c index 5f693de..8eee783 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba7220.c +++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c @@ -595,7 +595,7 @@ static void ipath_7220_txe_recover(struct ipath_devdata *dd) dev_info(&dd->pcidev->dev, "Recovering from TXE PIO parity error\n"); - ipath_disarm_senderrbufs(dd, 1); + ipath_disarm_senderrbufs(dd); } @@ -675,10 +675,8 @@ static void ipath_7220_handle_hwerrors(struct ipath_devdata *dd, char *msg, ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) { /* - * Parity errors in send memory are recoverable, - * just cancel the send (if indicated in * sendbuffererror), - * count the occurrence, unfreeze (if no other handled - * hardware error bits are set), and continue. + * Parity errors in send memory are recoverable by h/w + * just do housekeeping, exit freeze mode and continue. */ if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) @@ -687,13 +685,6 @@ static void ipath_7220_handle_hwerrors(struct ipath_devdata *dd, char *msg, hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT); - if (!hwerrs) { - /* else leave in freeze mode */ - ipath_write_kreg(dd, - dd->ipath_kregs->kr_control, - dd->ipath_control); - goto bail; - } } if (hwerrs) { /* @@ -723,8 +714,8 @@ static void ipath_7220_handle_hwerrors(struct ipath_devdata *dd, char *msg, *dd->ipath_statusp |= IPATH_STATUS_HWERROR; dd->ipath_flags &= ~IPATH_INITTED; } else { - ipath_dbg("Clearing freezemode on ignored hardware " - "error\n"); + ipath_dbg("Clearing freezemode on ignored or " + "recovered hardware error\n"); ipath_clear_freeze(dd); } } @@ -1967,7 +1958,7 @@ static void ipath_7220_config_ports(struct ipath_devdata *dd, ushort cfgports) dd->ipath_rcvctrl); dd->ipath_p0_rcvegrcnt = 2048; /* always */ if (dd->ipath_flags & IPATH_HAS_SEND_DMA) - dd->ipath_pioreserved = 1; /* reserve a buffer */ + dd->ipath_pioreserved = 3; /* kpiobufs used for PIO */ } diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c index 27dd894..3e5baa4 100644 --- a/drivers/infiniband/hw/ipath/ipath_init_chip.c +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -41,7 +41,7 @@ /* * min buffers we want to have per port, after driver */ -#define IPATH_MIN_USER_PORT_BUFCNT 8 +#define IPATH_MIN_USER_PORT_BUFCNT 7 /* * Number of ports we are configured to use (to allow for more pio @@ -54,13 +54,9 @@ MODULE_PARM_DESC(cfgports, "Set max number of ports to use"); /* * Number of buffers reserved for driver (verbs and layered drivers.) - * Reserved at end of buffer list. Initialized based on - * number of PIO buffers if not set via module interface. 
+ * Initialized based on number of PIO buffers if not set via module interface. * The problem with this is that it's global, but we'll use different - * numbers for different chip types. So the default value is not - * very useful. I've redefined it for the 1.3 release so that it's - * zero unless set by the user to something else, in which case we - * try to respect it. + * numbers for different chip types. */ static ushort ipath_kpiobufs; @@ -546,9 +542,12 @@ static void enable_chip(struct ipath_devdata *dd, int reinit) pioavail = dd->ipath_pioavailregs_dma[i ^ 1]; else pioavail = dd->ipath_pioavailregs_dma[i]; - dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail) | - (~dd->ipath_pioavailkernel[i] << - INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT); + /* + * don't need to worry about ipath_pioavailkernel here + * because we will call ipath_chg_pioavailkernel() later + * in initialization, to busy out buffers as needed + */ + dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail); } /* can get counters, stats, etc. */ dd->ipath_flags |= IPATH_PRESENT; @@ -708,12 +707,11 @@ static void verify_interrupt(unsigned long opaque) int ipath_init_chip(struct ipath_devdata *dd, int reinit) { int ret = 0; - u32 val32, kpiobufs; + u32 kpiobufs, defkbufs; u32 piobufs, uports; u64 val; struct ipath_portdata *pd; gfp_t gfp_flags = GFP_USER | __GFP_COMP; - unsigned long flags; ret = init_housekeeping(dd, reinit); if (ret) @@ -753,56 +751,46 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / (sizeof(u64) * BITS_PER_BYTE / 2); uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0; - if (ipath_kpiobufs == 0) { - /* not set by user (this is default) */ - if (piobufs > 144) - kpiobufs = 32; - else - kpiobufs = 16; - } + if (piobufs > 144) + defkbufs = 32 + dd->ipath_pioreserved; else - kpiobufs = ipath_kpiobufs; + defkbufs = 16 + dd->ipath_pioreserved; - if (kpiobufs + (uports * IPATH_MIN_USER_PORT_BUFCNT) > piobufs) { + if (ipath_kpiobufs && (ipath_kpiobufs + + (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) { int i = (int) piobufs - (int) (uports * IPATH_MIN_USER_PORT_BUFCNT); if (i < 1) i = 1; dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of " "%d for kernel leaves too few for %d user ports " - "(%d each); using %u\n", kpiobufs, + "(%d each); using %u\n", ipath_kpiobufs, piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i); /* * shouldn't change ipath_kpiobufs, because could be * different for different devices... */ kpiobufs = i; - } + } else if (ipath_kpiobufs) + kpiobufs = ipath_kpiobufs; + else + kpiobufs = defkbufs; dd->ipath_lastport_piobuf = piobufs - kpiobufs; dd->ipath_pbufsport = uports ? 
dd->ipath_lastport_piobuf / uports : 0; - val32 = dd->ipath_lastport_piobuf - (dd->ipath_pbufsport * uports); - if (val32 > 0) { - ipath_dbg("allocating %u pbufs/port leaves %u unused, " - "add to kernel\n", dd->ipath_pbufsport, val32); - dd->ipath_lastport_piobuf -= val32; - kpiobufs += val32; - ipath_dbg("%u pbufs/port leaves %u unused, add to kernel\n", - dd->ipath_pbufsport, val32); - } + /* if not an even divisor, some user ports get extra buffers */ + dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf - + (dd->ipath_pbufsport * uports); + if (dd->ipath_ports_extrabuf) + ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to " + "ports <= %u\n", dd->ipath_pbufsport, + dd->ipath_ports_extrabuf); dd->ipath_lastpioindex = 0; dd->ipath_lastpioindexl = dd->ipath_piobcnt2k; - ipath_chg_pioavailkernel(dd, 0, piobufs, 1); + /* ipath_pioavailshadow initialized earlier */ ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u " "each for %u user ports\n", kpiobufs, piobufs, dd->ipath_pbufsport, uports); - if (dd->ipath_pioupd_thresh) { - if (dd->ipath_pbufsport < dd->ipath_pioupd_thresh) - dd->ipath_pioupd_thresh = dd->ipath_pbufsport; - if (kpiobufs < dd->ipath_pioupd_thresh) - dd->ipath_pioupd_thresh = kpiobufs; - } - ret = dd->ipath_f_early_init(dd); if (ret) { ipath_dev_err(dd, "Early initialization failure\n"); @@ -810,13 +798,6 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) } /* - * Cancel any possible active sends from early driver load. - * Follows early_init because some chips have to initialize - * PIO buffers in early_init to avoid false parity errors. - */ - ipath_cancel_sends(dd, 0); - - /* * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be * done after early_init. */ @@ -836,6 +817,7 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr, dd->ipath_pioavailregs_phys); + /* * this is to detect s/w errors, which the h/w works around by * ignoring the low 6 bits of address, if it wasn't aligned. @@ -862,12 +844,6 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED); ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL); - spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); - dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE; - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); - ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); - spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); - /* * before error clears, since we expect serdes pll errors during * this, the first time after reset @@ -940,6 +916,19 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) else enable_chip(dd, reinit); + /* after enable_chip, so pioavailshadow setup */ + ipath_chg_pioavailkernel(dd, 0, piobufs, 1); + + /* + * Cancel any possible active sends from early driver load. + * Follows early_init because some chips have to initialize + * PIO buffers in early_init to avoid false parity errors. + * After enable and ipath_chg_pioavailkernel so we can safely + * enable pioavail updates and PIOENABLE; packets are now + * ready to go out. 
+ */ + ipath_cancel_sends(dd, 1); + if (!reinit) { /* * Used when we close a port, for DMA already in flight diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 45c4c06..26900b3 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -38,42 +38,12 @@ #include "ipath_verbs.h" #include "ipath_common.h" -/* - * clear (write) a pio buffer, to clear a parity error. This routine - * should only be called when in freeze mode, and the buffer should be - * canceled afterwards. - */ -static void ipath_clrpiobuf(struct ipath_devdata *dd, u32 pnum) -{ - u32 __iomem *pbuf; - u32 dwcnt; /* dword count to write */ - if (pnum < dd->ipath_piobcnt2k) { - pbuf = (u32 __iomem *) (dd->ipath_pio2kbase + pnum * - dd->ipath_palign); - dwcnt = dd->ipath_piosize2k >> 2; - } - else { - pbuf = (u32 __iomem *) (dd->ipath_pio4kbase + - (pnum - dd->ipath_piobcnt2k) * dd->ipath_4kalign); - dwcnt = dd->ipath_piosize4k >> 2; - } - dev_info(&dd->pcidev->dev, - "Rewrite PIO buffer %u, to recover from parity error\n", - pnum); - - /* no flush required, since already in freeze */ - writel(dwcnt + 1, pbuf); - while (--dwcnt) - writel(0, pbuf++); -} /* * Called when we might have an error that is specific to a particular * PIO buffer, and may need to cancel that buffer, so it can be re-used. - * If rewrite is true, and bits are set in the sendbufferror registers, - * we'll write to the buffer, for error recovery on parity errors. */ -void ipath_disarm_senderrbufs(struct ipath_devdata *dd, int rewrite) +void ipath_disarm_senderrbufs(struct ipath_devdata *dd) { u32 piobcnt; unsigned long sbuf[4]; @@ -109,11 +79,8 @@ void ipath_disarm_senderrbufs(struct ipath_devdata *dd, int rewrite) } for (i = 0; i < piobcnt; i++) - if (test_bit(i, sbuf)) { - if (rewrite) - ipath_clrpiobuf(dd, i); + if (test_bit(i, sbuf)) ipath_disarm_piobufs(dd, i, 1); - } /* ignore armlaunch errs for a bit */ dd->ipath_lastcancel = jiffies+3; } @@ -164,7 +131,7 @@ static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) { u64 ignore_this_time = 0; - ipath_disarm_senderrbufs(dd, 0); + ipath_disarm_senderrbufs(dd); if ((errs & E_SUM_LINK_PKTERRS) && !(dd->ipath_flags & IPATH_LINKACTIVE)) { /* @@ -909,8 +876,8 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) * processes (causing armlaunch), send errors due to going into freeze mode, * etc., and try to avoid causing extra interrupts while doing so. * Forcibly update the in-memory pioavail register copies after cleanup - * because the chip won't do it for anything changing while in freeze mode - * (we don't want to wait for the next pio buffer state change). + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). 
* Make sure that we don't lose any important interrupts by using the chip * feature that says that writing 0 to a bit in *clear that is set in * *status will cause an interrupt to be generated again (if allowed by @@ -918,48 +885,23 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) */ void ipath_clear_freeze(struct ipath_devdata *dd) { - int i, im; - u64 val; - /* disable error interrupts, to avoid confusion */ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL); /* also disable interrupts; errormask is sometimes overwriten */ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); - /* - * clear all sends, because they have may been - * completed by usercode while in freeze mode, and - * therefore would not be sent, and eventually - * might cause the process to run out of bufs - */ ipath_cancel_sends(dd, 1); + + /* clear the freeze, and be sure chip saw it */ ipath_write_kreg(dd, dd->ipath_kregs->kr_control, dd->ipath_control); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); - /* - * ensure pio avail updates continue (because the update - * won't have happened from cancel_sends because we were - * still in freeze - */ + /* force in-memory update now we are out of freeze */ ipath_force_pio_avail_update(dd); /* - * We just enabled pioavailupdate, so dma copy is almost certainly - * not yet right, so read the registers directly. Similar to init - */ - for (i = 0; i < dd->ipath_pioavregs; i++) { - /* deal with 6110 chip bug */ - im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? - i ^ 1 : i; - val = ipath_read_kreg64(dd, (0x1000 / sizeof(u64)) + im); - dd->ipath_pioavailregs_dma[i] = cpu_to_le64(val); - dd->ipath_pioavailshadow[i] = val | - (~dd->ipath_pioavailkernel[i] << - INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT); - } - - /* * force new interrupt if any hwerr, error or interrupt bits are * still set, and clear "safe" send packet errors related to freeze * and cancelling sends. Re-enable error interrupts before possible @@ -1316,10 +1258,8 @@ irqreturn_t ipath_intr(int irq, void *data) ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); - if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA)) - handle_layer_pioavail(dd); - else - ipath_dbg("unexpected BUFAVAIL intr\n"); + /* always process; sdma verbs uses PIO for acks and VL15 */ + handle_layer_pioavail(dd); } ret = IRQ_HANDLED; diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index 202337a..02b24a3 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -117,6 +117,10 @@ struct ipath_portdata { u16 port_subport_cnt; /* non-zero if port is being shared. 
*/ u16 port_subport_id; + /* number of pio bufs for this port (all procs, if shared) */ + u32 port_piocnt; + /* first pio buffer for this port */ + u32 port_pio_base; /* chip offset of PIO buffers for this port */ u32 port_piobufs; /* how many alloc_pages() chunks in port_rcvegrbuf_pages */ @@ -384,6 +388,8 @@ struct ipath_devdata { u32 ipath_lastrpkts; /* pio bufs allocated per port */ u32 ipath_pbufsport; + /* if remainder on bufs/port, ports < extrabuf get 1 extra */ + u32 ipath_ports_extrabuf; u32 ipath_pioupd_thresh; /* update threshold, some chips */ /* * number of ports configured as max; zero is set to number chip @@ -1011,7 +1017,7 @@ void ipath_get_eeprom_info(struct ipath_devdata *); int ipath_update_eeprom_log(struct ipath_devdata *dd); void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr); u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg); -void ipath_disarm_senderrbufs(struct ipath_devdata *, int); +void ipath_disarm_senderrbufs(struct ipath_devdata *); void ipath_force_pio_avail_update(struct ipath_devdata *); void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev); diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c index 8ac5c1d..9e3fe61 100644 --- a/drivers/infiniband/hw/ipath/ipath_ruc.c +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -481,9 +481,10 @@ done: wake_up(&qp->wait); } -static void want_buffer(struct ipath_devdata *dd) +static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp) { - if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA)) { + if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) || + qp->ibqp.qp_type == IB_QPT_SMI) { unsigned long flags; spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); @@ -519,7 +520,7 @@ static void ipath_no_bufs_available(struct ipath_qp *qp, spin_lock_irqsave(&dev->pending_lock, flags); list_add_tail(&qp->piowait, &dev->piowait); spin_unlock_irqrestore(&dev->pending_lock, flags); - want_buffer(dev->dd); + want_buffer(dev->dd, qp); dev->n_piowait++; } diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c index 1974df7..0d07682 100644 --- a/drivers/infiniband/hw/ipath/ipath_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -449,16 +449,19 @@ int setup_sdma(struct ipath_devdata *dd) ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, dd->ipath_sdma_head_phys); - /* Reserve all the former "kernel" piobufs */ - n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - dd->ipath_pioreserved; - for (i = dd->ipath_lastport_piobuf; i < n; ++i) { + /* + * Reserve all the former "kernel" piobufs, using high number range + * so we get as many 4K buffers as possible + */ + n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved; + ipath_chg_pioavailkernel(dd, i, n - i , 0); + for (; i < n; ++i) { unsigned word = i / 64; unsigned bit = i & 63; BUG_ON(word >= 3); senddmabufmask[word] |= 1ULL << bit; } - ipath_chg_pioavailkernel(dd, dd->ipath_lastport_piobuf, - n - dd->ipath_lastport_piobuf, 0); ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, senddmabufmask[0]); ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, -- cgit v0.10.2 From ab69b3cf1219e0d07bb4ea373f36b1de38af531c Mon Sep 17 00:00:00 2001 From: John Gregor Date: Wed, 7 May 2008 11:01:10 -0700 Subject: IB/ipath: Fix SDMA error recovery in absence of link status change What's fixed: in ipath_cancel_sends() We need to unconditionally set ABORTING. 
So, swap the tests so the set_bit() isn't shadowed by the &&. If we've disarmed the piobufs, then we need to unconditionally set DISARMED. So, move it out from the overly protective if at the bottom. in sdma_abort_task() Abort_task was written knowing that the SDMA engine would always be reset (and restarted) on error. A recent change broke that fundamental assumption by taking the restart portion and making it conditional on a link status change. But, SDMA can go boom without a link status change in some conditions. Signed-off-by: John Gregor Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 2036d38..ce7b7c3 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -1898,8 +1898,8 @@ void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) spin_lock_irqsave(&dd->ipath_sdma_lock, flags); skip_cancel = - !test_bit(IPATH_SDMA_DISABLED, statp) && - test_and_set_bit(IPATH_SDMA_ABORTING, statp); + test_and_set_bit(IPATH_SDMA_ABORTING, statp) + && !test_bit(IPATH_SDMA_DISABLED, statp); spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); if (skip_cancel) goto bail; @@ -1930,6 +1930,9 @@ void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) ipath_disarm_piobufs(dd, 0, dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); + if (restore_sendctrl) { /* else done by caller later if needed */ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); @@ -1949,7 +1952,6 @@ void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) /* only wait so long for intr */ dd->ipath_sdma_abort_intr_timeout = jiffies + HZ; dd->ipath_sdma_reset_wait = 200; - __set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) tasklet_hi_schedule(&dd->ipath_sdma_abort_task); spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c index 0d07682..3697449 100644 --- a/drivers/infiniband/hw/ipath/ipath_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -308,13 +308,15 @@ static void sdma_abort_task(unsigned long opaque) spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); /* - * Don't restart sdma here. Wait until link is up to ACTIVE. - * VL15 MADs used to bring the link up use PIO, and multiple - * link transitions otherwise cause the sdma engine to be + * Don't restart sdma here (with the exception + * below). Wait until link is up to ACTIVE. VL15 MADs + * used to bring the link up use PIO, and multiple link + * transitions otherwise cause the sdma engine to be * stopped and started multiple times. - * The disable is done here, including the shadow, so the - * state is kept consistent. - * See ipath_restart_sdma() for the actual starting of sdma. + * The disable is done here, including the shadow, + * so the state is kept consistent. + * See ipath_restart_sdma() for the actual starting + * of sdma. */ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; @@ -326,6 +328,13 @@ static void sdma_abort_task(unsigned long opaque) /* make sure I see next message */ dd->ipath_sdma_abort_jiffies = 0; + /* + * Not everything that takes SDMA offline is a link + * status change. If the link was up, restart SDMA. 
+ */ + if (dd->ipath_flags & IPATH_LINKACTIVE) + ipath_restart_sdma(dd); + goto done; } @@ -427,7 +436,12 @@ int setup_sdma(struct ipath_devdata *dd) goto done; } - dd->ipath_sdma_status = 0; + /* + * Set initial status as if we had been up, then gone down. + * This lets initial start on transition to ACTIVE be the + * same as restart after link flap. + */ + dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED; dd->ipath_sdma_abort_jiffies = 0; dd->ipath_sdma_generation = 0; dd->ipath_sdma_descq_tail = 0; @@ -618,6 +632,9 @@ void ipath_restart_sdma(struct ipath_devdata *dd) ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + /* notify upper layers */ + ipath_ib_piobufavail(dd->verbs_dev); + bail: return; } -- cgit v0.10.2 From 12137c593d127c6c1a3eb050674da047682badaf Mon Sep 17 00:00:00 2001 From: Stefan Roscher Date: Wed, 7 May 2008 11:35:06 -0700 Subject: IB/ehca: Wait for async events to finish before destroying QP This is necessary because, in a multicore environment, a race between uverbs async handler and destroy QP could occur. Signed-off-by: Stefan Roscher Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index 00bab60..1e9e99a 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -192,6 +192,8 @@ struct ehca_qp { int mtu_shift; u32 message_count; u32 packet_count; + atomic_t nr_events; /* events seen */ + wait_queue_head_t wait_completion; }; #define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ) diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c index ca5eb0c..ce1ab05 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.c +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -204,6 +204,8 @@ static void qp_event_callback(struct ehca_shca *shca, u64 eqe, read_lock(&ehca_qp_idr_lock); qp = idr_find(&ehca_qp_idr, token); + if (qp) + atomic_inc(&qp->nr_events); read_unlock(&ehca_qp_idr_lock); if (!qp) @@ -223,6 +225,8 @@ static void qp_event_callback(struct ehca_shca *shca, u64 eqe, if (fatal && qp->ext_type == EQPT_SRQBASE) dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED); + if (atomic_dec_and_test(&qp->nr_events)) + wake_up(&qp->wait_completion); return; } diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 18fba92..3f59587 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -566,6 +566,8 @@ static struct ehca_qp *internal_create_qp( return ERR_PTR(-ENOMEM); } + atomic_set(&my_qp->nr_events, 0); + init_waitqueue_head(&my_qp->wait_completion); spin_lock_init(&my_qp->spinlock_s); spin_lock_init(&my_qp->spinlock_r); my_qp->qp_type = qp_type; @@ -1934,6 +1936,9 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, idr_remove(&ehca_qp_idr, my_qp->token); write_unlock_irqrestore(&ehca_qp_idr_lock, flags); + /* now wait until all pending events have completed */ + wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events)); + h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp); if (h_ret != H_SUCCESS) { ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%li " -- cgit v0.10.2 From e0164af66b6ec165836d1d862b3f800890713452 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Wed, 7 May 2008 13:42:33 -0700 Subject: [MAINTAINERS] New maintainer for Intel ethernet adapters I'm handing over maintainership to Jeff Kirsher and moving on to other Linux/Open Source work within 
Intel. Good luck to Jeff ;) Signed-off-by: Auke Kok Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index abe2787..f5583dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2112,12 +2112,10 @@ L: netdev@vger.kernel.org S: Maintained INTEL ETHERNET DRIVERS (e100/e1000/e1000e/igb/ixgb/ixgbe) -P: Auke Kok -M: auke-jan.h.kok@intel.com -P: Jesse Brandeburg -M: jesse.brandeburg@intel.com P: Jeff Kirsher M: jeffrey.t.kirsher@intel.com +P: Jesse Brandeburg +M: jesse.brandeburg@intel.com P: Bruce Allan M: bruce.w.allan@intel.com P: John Ronciak -- cgit v0.10.2 From 1e38c126c9252b612697e34f43b1b3371c8ee31d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 7 May 2008 16:21:28 -0700 Subject: sparc: Fix fork/clone/vfork system call restart. We clobber %i1 as well as %i0 for these system calls, because they give two return values. Therefore, on error, we have to restore %i1 properly or else the restart explodes since it uses the wrong arguments. This fixes glibc's nptl/tst-eintr1.c testcase. Signed-off-by: David S. Miller diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c index e7f3519..36431f3 100644 --- a/arch/sparc/kernel/process.c +++ b/arch/sparc/kernel/process.c @@ -419,14 +419,26 @@ asmlinkage int sparc_do_fork(unsigned long clone_flags, unsigned long stack_size) { unsigned long parent_tid_ptr, child_tid_ptr; + unsigned long orig_i1 = regs->u_regs[UREG_I1]; + long ret; parent_tid_ptr = regs->u_regs[UREG_I2]; child_tid_ptr = regs->u_regs[UREG_I4]; - return do_fork(clone_flags, stack_start, - regs, stack_size, - (int __user *) parent_tid_ptr, - (int __user *) child_tid_ptr); + ret = do_fork(clone_flags, stack_start, + regs, stack_size, + (int __user *) parent_tid_ptr, + (int __user *) child_tid_ptr); + + /* If we get an error and potentially restart the system + * call, we're screwed because copy_thread() clobbered + * the parent's %o1. So detect that case and restore it + * here. + */ + if ((unsigned long)ret >= -ERESTART_RESTARTBLOCK) + regs->u_regs[UREG_I1] = orig_i1; + + return ret; } /* Copy a Sparc thread. The fork() return value conventions diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c index 500ac6d..4129c04 100644 --- a/arch/sparc64/kernel/process.c +++ b/arch/sparc64/kernel/process.c @@ -503,6 +503,8 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags, unsigned long stack_size) { int __user *parent_tid_ptr, *child_tid_ptr; + unsigned long orig_i1 = regs->u_regs[UREG_I1]; + long ret; #ifdef CONFIG_COMPAT if (test_thread_flag(TIF_32BIT)) { @@ -515,9 +517,19 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags, child_tid_ptr = (int __user *) regs->u_regs[UREG_I4]; } - return do_fork(clone_flags, stack_start, - regs, stack_size, - parent_tid_ptr, child_tid_ptr); + ret = do_fork(clone_flags, stack_start, + regs, stack_size, + parent_tid_ptr, child_tid_ptr); + + /* If we get an error and potentially restart the system + * call, we're screwed because copy_thread() clobbered + * the parent's %o1. So detect that case and restore it + * here. + */ + if ((unsigned long)ret >= -ERESTART_RESTARTBLOCK) + regs->u_regs[UREG_I1] = orig_i1; + + return ret; } /* Copy a Sparc thread. The fork() return value conventions -- cgit v0.10.2 From dc5dc7e6d71ca9fd1ea01a1418150af3b2937489 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 7 May 2008 18:54:05 -0700 Subject: sparc: Fix SA_ONSTACK signal handling. We need to be more liberal about the alignment of the buffer given to us by sigaltstack(). 
The user should not need to be mindful of all of the alignment constraints we have for the stack frame. This mirrors how we handle this situation in clone() as well. Also, we align the stack even in non-SA_ONSTACK cases so that signals due to bad stack alignment can be delivered properly. This makes such errors easier to debug and recover from. Finally, add the sanity check x86 has to make sure we won't overflow the signal stack. This fixes glibc testcases nptl/tst-cancel20.c and nptl/tst-cancelx20.c Signed-off-by: David S. Miller diff --git a/arch/sparc/kernel/signal.c b/arch/sparc/kernel/signal.c index 3c31229..3681579 100644 --- a/arch/sparc/kernel/signal.c +++ b/arch/sparc/kernel/signal.c @@ -245,15 +245,29 @@ static inline int invalid_frame_pointer(void __user *fp, int fplen) static inline void __user *get_sigframe(struct sigaction *sa, struct pt_regs *regs, unsigned long framesize) { - unsigned long sp; + unsigned long sp = regs->u_regs[UREG_FP]; - sp = regs->u_regs[UREG_FP]; + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (on_sig_stack(sp) && !likely(on_sig_stack(sp - framesize))) + return (void __user *) -1L; /* This is the X/Open sanctioned signal stack switching. */ if (sa->sa_flags & SA_ONSTACK) { - if (!on_sig_stack(sp) && !((current->sas_ss_sp + current->sas_ss_size) & 7)) + if (sas_ss_flags(sp) == 0) sp = current->sas_ss_sp + current->sas_ss_size; } + + /* Always align the stack frame. This handles two cases. First, + * sigaltstack need not be mindful of platform specific stack + * alignment. Second, if we took this signal because the stack + * is not aligned properly, we'd like to take the signal cleanly + * and report that. + */ + sp &= ~7UL; + return (void __user *)(sp - framesize); } diff --git a/arch/sparc64/kernel/signal.c b/arch/sparc64/kernel/signal.c index 45d6bf6..07c0443 100644 --- a/arch/sparc64/kernel/signal.c +++ b/arch/sparc64/kernel/signal.c @@ -376,16 +376,29 @@ save_fpu_state(struct pt_regs *regs, __siginfo_fpu_t __user *fpu) static inline void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, unsigned long framesize) { - unsigned long sp; + unsigned long sp = regs->u_regs[UREG_FP] + STACK_BIAS; - sp = regs->u_regs[UREG_FP] + STACK_BIAS; + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (on_sig_stack(sp) && !likely(on_sig_stack(sp - framesize))) + return (void __user *) -1L; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (!on_sig_stack(sp) && - !((current->sas_ss_sp + current->sas_ss_size) & 7)) + if (sas_ss_flags(sp) == 0) sp = current->sas_ss_sp + current->sas_ss_size; } + + /* Always align the stack frame. This handles two cases. First, + * sigaltstack need not be mindful of platform specific stack + * alignment. Second, if we took this signal because the stack + * is not aligned properly, we'd like to take the signal cleanly + * and report that. 
+ */ + sp &= ~7UL; + return (void __user *)(sp - framesize); } diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c index 9415d2c..0f6b7b1 100644 --- a/arch/sparc64/kernel/signal32.c +++ b/arch/sparc64/kernel/signal32.c @@ -406,11 +406,27 @@ static void __user *get_sigframe(struct sigaction *sa, struct pt_regs *regs, uns regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL; sp = regs->u_regs[UREG_FP]; + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (on_sig_stack(sp) && !likely(on_sig_stack(sp - framesize))) + return (void __user *) -1L; + /* This is the X/Open sanctioned signal stack switching. */ if (sa->sa_flags & SA_ONSTACK) { - if (!on_sig_stack(sp) && !((current->sas_ss_sp + current->sas_ss_size) & 7)) + if (sas_ss_flags(sp) == 0) sp = current->sas_ss_sp + current->sas_ss_size; } + + /* Always align the stack frame. This handles two cases. First, + * sigaltstack need not be mindful of platform specific stack + * alignment. Second, if we took this signal because the stack + * is not aligned properly, we'd like to take the signal cleanly + * and report that. + */ + sp &= ~7UL; + return (void __user *)(sp - framesize); } -- cgit v0.10.2 From 62ab22278308a40bcb7f4079e9719ab8b7fe11b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 8 May 2008 01:09:11 -0700 Subject: tcp FRTO: SACK variant is erroneously used with NewReno MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note: there's actually another bug in FRTO's SACK variant, which is causing the failure in the NewReno case because of the error that's fixed here. I'll fix the SACK case separately (it's a separate bug really, though related, but in order to fix that I need to audit tp->snd_nxt usage a bit). There were two places where the SACK variant of FRTO was getting incorrectly used even if SACK wasn't negotiated by the TCP flow. This leads to incorrect setting of frto_highmark with NewReno if a previous recovery was interrupted by another RTO. An eventual fallback to conventional recovery then incorrectly considers one or a couple of segments as forward transmissions though they weren't, which then are not marked LOST during fallback, making them "non-retransmittable" until the next RTO. In a bad case, those segments are really lost and are the only ones left in the window. Thus TCP needs another RTO to continue. The next FRTO, however, could again repeat the same events, making the progress of the TCP flow extremely slow. In order for these events to occur at all, FRTO must occur again in FRTO's step 3 while the key segments must be lost as well, which is not too likely in practice. It seems to occur most frequently with some small devices such as network printers that *seem* to accept TCP segments only in-order. In cases where key segments weren't lost, things get automatically resolved because those wrongly marked segments don't need to be retransmitted in order to continue. I found a reproducer after digging up relevant reports (few reports in total, none at netdev or lkml that I know of); some cases seemed to indicate middlebox issues, which now seems to be a false assumption some people had made. Bugzilla #10063 _might_ be related. Damon L. Chesser had a reproducible case and was kind enough to tcpdump it for me. With the tcpdump log it was quite trivial to figure out. Signed-off-by: Ilpo Järvinen Signed-off-by: David S.
Miller diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8ac15a6..26c9369 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -114,8 +114,6 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) #define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) -#define IsSackFrto() (sysctl_tcp_frto == 0x2) - #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) @@ -1686,6 +1684,11 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->sacked_out = 0; } +static int tcp_is_sackfrto(const struct tcp_sock *tp) +{ + return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp); +} + /* F-RTO can only be used if TCP has never retransmitted anything other than * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ @@ -1702,7 +1705,7 @@ int tcp_use_frto(struct sock *sk) if (icsk->icsk_mtup.probe_size) return 0; - if (IsSackFrto()) + if (tcp_is_sackfrto(tp)) return 1; /* Avoid expensive walking of rexmit queue if possible */ @@ -1792,7 +1795,7 @@ void tcp_enter_frto(struct sock *sk) /* Earlier loss recovery underway (see RFC4138; Appendix B). * The last condition is necessary at least in tp->frto_counter case. */ - if (IsSackFrto() && (tp->frto_counter || + if (tcp_is_sackfrto(tp) && (tp->frto_counter || ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && after(tp->high_seq, tp->snd_una)) { tp->frto_highmark = tp->high_seq; @@ -3124,7 +3127,7 @@ static int tcp_process_frto(struct sock *sk, int flag) return 1; } - if (!IsSackFrto() || tcp_is_reno(tp)) { + if (!tcp_is_sackfrto(tp)) { /* RFC4138 shortcoming in step 2; should also have case c): * ACK isn't duplicate nor advances window, e.g., opposite dir * data, winupdate -- cgit v0.10.2 From c67fa02799bccca3d2e16582493da6d57812ec01 Mon Sep 17 00:00:00 2001 From: "J.H.M. Dassen (Ray)" Date: Thu, 8 May 2008 01:11:04 -0700 Subject: net/ipv4: correct RFC 1122 section reference in comment RFC 1122 does not have a section 3.1.2.2. The requirement to silently discard datagrams with a bad checksum is in section 3.2.1.2 instead. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=10611 Signed-off-by: J.H.M. Dassen (Ray) Signed-off-by: Andrew Morton Signed-off-by: David S. Miller diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7b4bad6..ff77a4a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -397,7 +397,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, iph = ip_hdr(skb); /* - * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum. + * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. * * Is the datagram acceptable? * -- cgit v0.10.2 From 7312096454b6cd71267eaa3d0efb408e449e9ff3 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 8 May 2008 01:13:31 -0700 Subject: macvlan: Fix memleak on device removal/crash on module removal As noticed by Ben Greear, macvlan crashes the kernel when unloading the module. The reason is that it tries to clean up the macvlan_port pointer on the macvlan device itself instead of the underlying device. A non-NULL pointer is taken as indication that the macvlan_handle_frame_hook is valid, when receiving the next packet on the underlying device it tries to call the NULL hook and crashes. Clean up the macvlan_port on the correct device to fix this. Signed-off-by; Patrick McHardy Tested-by: Ben Greear Signed-off-by: David S. 
Miller diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 2056cfc..c36a03a 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -450,7 +450,7 @@ static void macvlan_dellink(struct net_device *dev) unregister_netdevice(dev); if (list_empty(&port->vlans)) - macvlan_port_destroy(dev); + macvlan_port_destroy(port->dev); } static struct rtnl_link_ops macvlan_link_ops __read_mostly = { -- cgit v0.10.2 From ef75d49f116bccbb80bccd423ecf3cb86c4509a5 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 8 May 2008 01:15:21 -0700 Subject: netfilter: nf_conntrack_sip: restrict RTP expect flushing on error to last request Some Inovaphone PBXs exhibit very stange behaviour: when dialing for example "123", the device sends INVITE requests for "1", "12" and "123" back to back. The first requests will elicit error responses from the receiver, causing the SIP helper to flush the RTP expectations even though we might still see a positive response. Note the sequence number of the last INVITE request that contained a media description and only flush the expectations when receiving a negative response for that sequence number. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/netfilter/nf_conntrack_sip.h b/include/linux/netfilter/nf_conntrack_sip.h index 5da04e5..23aa2ec 100644 --- a/include/linux/netfilter/nf_conntrack_sip.h +++ b/include/linux/netfilter/nf_conntrack_sip.h @@ -7,6 +7,7 @@ struct nf_ct_sip_master { unsigned int register_cseq; + unsigned int invite_cseq; }; enum sip_expectation_classes { diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 9f49000..2f9bbc0 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -870,6 +870,7 @@ static int process_sdp(struct sk_buff *skb, { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); unsigned int matchoff, matchlen; unsigned int mediaoff, medialen; unsigned int sdpoff; @@ -959,6 +960,9 @@ static int process_sdp(struct sk_buff *skb, if (nf_nat_sdp_session && ct->status & IPS_NAT_MASK) ret = nf_nat_sdp_session(skb, dptr, sdpoff, datalen, &rtp_addr); + if (ret == NF_ACCEPT && i > 0) + help->help.ct_sip_info.invite_cseq = cseq; + return ret; } static int process_invite_response(struct sk_buff *skb, @@ -967,14 +971,14 @@ static int process_invite_response(struct sk_buff *skb, { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); if ((code >= 100 && code <= 199) || (code >= 200 && code <= 299)) return process_sdp(skb, dptr, datalen, cseq); - else { + else if (help->help.ct_sip_info.invite_cseq == cseq) flush_expectations(ct, true); - return NF_ACCEPT; - } + return NF_ACCEPT; } static int process_update_response(struct sk_buff *skb, @@ -983,14 +987,14 @@ static int process_update_response(struct sk_buff *skb, { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); if ((code >= 100 && code <= 199) || (code >= 200 && code <= 299)) return process_sdp(skb, dptr, datalen, cseq); - else { + else if (help->help.ct_sip_info.invite_cseq == cseq) flush_expectations(ct, true); - return NF_ACCEPT; - } + return NF_ACCEPT; } static int process_prack_response(struct sk_buff *skb, @@ -999,14 +1003,14 @@ static int process_prack_response(struct sk_buff *skb, { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + 
struct nf_conn_help *help = nfct_help(ct); if ((code >= 100 && code <= 199) || (code >= 200 && code <= 299)) return process_sdp(skb, dptr, datalen, cseq); - else { + else if (help->help.ct_sip_info.invite_cseq == cseq) flush_expectations(ct, true); - return NF_ACCEPT; - } + return NF_ACCEPT; } static int process_bye_request(struct sk_buff *skb, -- cgit v0.10.2 From f3261aff35cbc811fee0e23eaea277f1b7286eca Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 8 May 2008 01:16:04 -0700 Subject: netfilter: Kconfig: default DCCP/SCTP conntrack support to the protocol config values When conntrack and DCCP/SCTP protocols are enabled, chances are good that people also want DCCP/SCTP conntrack and NAT support. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index c1fc0f1..aa8d80c 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -90,6 +90,7 @@ config NF_CT_PROTO_DCCP tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)' depends on EXPERIMENTAL && NF_CONNTRACK depends on NETFILTER_ADVANCED + default IP_DCCP help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on DCCP connections. @@ -104,6 +105,7 @@ config NF_CT_PROTO_SCTP tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' depends on EXPERIMENTAL && NF_CONNTRACK depends on NETFILTER_ADVANCED + default IP_SCTP help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on SCTP connections. @@ -532,6 +534,7 @@ config NETFILTER_XT_MATCH_DCCP tristate '"dccp" protocol match support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED + default IP_DCCP help With this option enabled, you will be able to use the iptables `dccp' match in order to match on DCCP source/destination ports @@ -725,6 +728,7 @@ config NETFILTER_XT_MATCH_SCTP tristate '"sctp" protocol match support (EXPERIMENTAL)' depends on NETFILTER_XTABLES && EXPERIMENTAL depends on NETFILTER_ADVANCED + default IP_SCTP help With this option enabled, you will be able to use the `sctp' match in order to match on SCTP source/destination ports -- cgit v0.10.2 From aca51397d01474f80cab8fc978559b45f2e453ad Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 8 May 2008 01:24:25 -0700 Subject: netns: Fix arbitrary net_device-s corruptions on net_ns stop. When a net namespace is destroyed, some devices (those, not killed on ns stop explicitly) are moved back to init_net. The problem, is that this net_ns change has one point of failure - the __dev_alloc_name() may be called if a name collision occurs (and this is easy to trigger). This allocator performs a likely-to-fail GFP_ATOMIC allocation to find a suitable number. Other possible conditions that may cause error (for device being ns local or not registered) are always false in this case. So, when this call fails, the device is unregistered. But this is *not* the right thing to do, since after this the device may be released (and kfree-ed) improperly. E. g. bridges require more actions (sysfs update, timer disarming, etc.), some other devices want to remove their private areas from lists, etc. I. e. arbitrary use-after-free cases may occur. The proposed fix is the following: since the only reason for the dev_change_net_namespace to fail is the name generation, we may give it a unique fall-back name w/o %d-s in it - the dev one, since ifindexes are still unique. 
So make this change, raise the failure-case printk loglevel to EMERG and replace the unregister_netdevice call with BUG(). [ Use snprintf() -DaveM ] Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller diff --git a/net/core/dev.c b/net/core/dev.c index d334446..4addaf0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4480,17 +4480,19 @@ static void __net_exit default_device_exit(struct net *net) rtnl_lock(); for_each_netdev_safe(net, dev, next) { int err; + char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ if (dev->features & NETIF_F_NETNS_LOCAL) continue; /* Push remaing network devices to init_net */ - err = dev_change_net_namespace(dev, &init_net, "dev%d"); + snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); + err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { - printk(KERN_WARNING "%s: failed to move %s to init_net: %d\n", + printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); - unregister_netdevice(dev); + BUG(); } } rtnl_unlock(); -- cgit v0.10.2 From c2ab7ac225e29006b7117d6a9fe8f3be8d98b0c2 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Thu, 8 May 2008 02:49:55 -0700 Subject: can: Fix can_send() handling on dev_queue_xmit() failures The tx packet counting and the local loopback of CAN frames should only happen in the case that the CAN frame has been enqueued to the netdevice tx queue successfully. Thanks to Andre Naujoks for reporting this issue. Signed-off-by: Oliver Hartkopp Signed-off-by: Urs Thuermann Signed-off-by: David S. Miller diff --git a/net/can/af_can.c b/net/can/af_can.c index 2759b76..7e8ca28 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -208,6 +208,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol) */ int can_send(struct sk_buff *skb, int loop) { + struct sk_buff *newskb = NULL; int err; if (skb->dev->type != ARPHRD_CAN) { @@ -244,8 +245,7 @@ int can_send(struct sk_buff *skb, int loop) * If the interface is not capable to do loopback * itself, we do it here. */ - struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); - + newskb = skb_clone(skb, GFP_ATOMIC); if (!newskb) { kfree_skb(skb); return -ENOMEM; @@ -254,7 +254,6 @@ int can_send(struct sk_buff *skb, int loop) newskb->sk = skb->sk; newskb->ip_summed = CHECKSUM_UNNECESSARY; newskb->pkt_type = PACKET_BROADCAST; - netif_rx(newskb); } } else { /* indication for the CAN driver: no loopback required */ @@ -266,11 +265,20 @@ int can_send(struct sk_buff *skb, int loop) if (err > 0) err = net_xmit_errno(err); + if (err) { + if (newskb) + kfree_skb(newskb); + return err; + } + + if (newskb) + netif_rx(newskb); + /* update statistics */ can_stats.tx_frames++; can_stats.tx_frames_delta++; - return err; + return 0; } EXPORT_SYMBOL(can_send); -- cgit v0.10.2 From e46b66bc42b6b1430b04cc5c207ecb2b2f4553dc Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 8 May 2008 02:53:17 -0700 Subject: net: Added ASSERT_RTNL() to dev_open() and dev_close(). dev_open() and dev_close() must be called holding the RTNL, since they call device functions and netdevice notifiers that are promised the RTNL. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller diff --git a/net/core/dev.c b/net/core/dev.c index 4addaf0..a1607bc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -994,6 +994,8 @@ int dev_open(struct net_device *dev) { int ret = 0; + ASSERT_RTNL(); + /* * Is it already up? 
*/ @@ -1060,6 +1062,8 @@ int dev_open(struct net_device *dev) */ int dev_close(struct net_device *dev) { + ASSERT_RTNL(); + might_sleep(); if (!(dev->flags & IFF_UP)) -- cgit v0.10.2 From 30a717f7e951ec0260f31c0637ecf8e6268ba607 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 8 May 2008 14:03:30 +0200 Subject: [ALSA] soc - at91-pcm - Fix line wrapping There's more checkpatch stuff to fix in the driver, this just fixes the minimum required for the following patch to be clean. Signed-off-by: Mark Brown Signed-off-by: Takashi Iwai diff --git a/sound/soc/at91/at91-pcm.c b/sound/soc/at91/at91-pcm.c index 67c88e3..e36533d 100644 --- a/sound/soc/at91/at91-pcm.c +++ b/sound/soc/at91/at91-pcm.c @@ -103,7 +103,8 @@ static void at91_pcm_dma_irq(u32 ssc_sr, if (prtd->period_ptr >= prtd->dma_buffer_end) { prtd->period_ptr = prtd->dma_buffer; } - at91_ssc_write(params->ssc_base + params->pdc->xnpr, prtd->period_ptr); + at91_ssc_write(params->ssc_base + params->pdc->xnpr, + prtd->period_ptr); at91_ssc_write(params->ssc_base + params->pdc->xncr, prtd->period_size / params->pdc_xfer_size); } @@ -191,10 +192,12 @@ static int at91_pcm_trigger(struct snd_pcm_substream *substream, at91_ssc_write(params->ssc_base + AT91_SSC_IER, params->mask->ssc_endx | params->mask->ssc_endbuf); - at91_ssc_write(params->ssc_base + ATMEL_PDC_PTCR, params->mask->pdc_enable); + at91_ssc_write(params->ssc_base + ATMEL_PDC_PTCR, + params->mask->pdc_enable); - DBG("sr=%lx imr=%lx\n", at91_ssc_read(params->ssc_base + AT91_SSC_SR), - at91_ssc_read(params->ssc_base + AT91_SSC_IER)); + DBG("sr=%lx imr=%lx\n", + at91_ssc_read(params->ssc_base + AT91_SSC_SR), + at91_ssc_read(params->ssc_base + AT91_SSC_IER)); break; case SNDRV_PCM_TRIGGER_STOP: -- cgit v0.10.2 From e3a2efa67a029453b8098dba179ec2d9d8df612e Mon Sep 17 00:00:00 2001 From: Patrik Sevallius Date: Thu, 8 May 2008 14:04:08 +0200 Subject: [ALSA] soc at91 minor bug fixes Found these two bugs while browsing through the code. The first one is a cut-n-paste bug, instead of disabling the clock when request_irq() fails, it enabled it once more. The second one fixes a debug printout, AT91_SSC_IER is write only, AT91_SSC_IMR is readable (the printed string actually says imr). Frank Mandarino was busy so he asked me to send these to this list. 
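[ For context on the first fix, a minimal sketch of the two register writes involved — illustrative only, not lifted from the driver, with 'pid' standing in for the SSC peripheral id: on AT91 parts, AT91_PMC_PCER is the Peripheral Clock Enable Register and AT91_PMC_PCDR is the Peripheral Clock Disable Register, which is why the error path must write the latter.

	/* illustrative sketch; 'pid' is a placeholder for ssc_p->ssc.pid */
	at91_sys_write(AT91_PMC_PCER, 1 << pid);	/* buggy error path: re-enables the clock */
	at91_sys_write(AT91_PMC_PCDR, 1 << pid);	/* intended behaviour: disable the clock */
]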
/Patrik Signed-off-by: Patrik Sevallius Acked-by: Frank Mandarino Signed-off-by: Mark Brown Signed-off-by: Takashi Iwai diff --git a/sound/soc/at91/at91-pcm.c b/sound/soc/at91/at91-pcm.c index e36533d..ccac6bd 100644 --- a/sound/soc/at91/at91-pcm.c +++ b/sound/soc/at91/at91-pcm.c @@ -197,7 +197,7 @@ static int at91_pcm_trigger(struct snd_pcm_substream *substream, DBG("sr=%lx imr=%lx\n", at91_ssc_read(params->ssc_base + AT91_SSC_SR), - at91_ssc_read(params->ssc_base + AT91_SSC_IER)); + at91_ssc_read(params->ssc_base + AT91_SSC_IMR)); break; case SNDRV_PCM_TRIGGER_STOP: diff --git a/sound/soc/at91/at91-ssc.c b/sound/soc/at91/at91-ssc.c index f642d2d..bc35d00 100644 --- a/sound/soc/at91/at91-ssc.c +++ b/sound/soc/at91/at91-ssc.c @@ -590,7 +590,7 @@ static int at91_ssc_hw_params(struct snd_pcm_substream *substream, printk(KERN_WARNING "at91-ssc: request_irq failure\n"); DBG("Stopping pid %d clock\n", ssc_p->ssc.pid); - at91_sys_write(AT91_PMC_PCER, 1<<ssc_p->ssc.pid); + at91_sys_write(AT91_PMC_PCDR, 1<<ssc_p->ssc.pid); return ret; } -- cgit v0.10.2 From 75065ff619e42fe35178eda863cbcddd57776794 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 May 2008 14:06:19 +0200 Subject: Revert "relay: fix splice problem" This reverts commit c3270e577c18b3d0e984c3371493205a4807db9d. diff --git a/fs/splice.c b/fs/splice.c index cece15b..7815003 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1072,7 +1072,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) - *ppos = sd.pos; + *ppos += ret; return ret; } diff --git a/kernel/relay.c b/kernel/relay.c index 7de644c..bc24dcd 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1191,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in, ret = 0; spliced = 0; - while (len && !spliced) { + while (len) { ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); if (ret < 0) break; -- cgit v0.10.2 From bf726eab3711cf192405d21688a4b21e07b6188a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 8 May 2008 11:53:48 +0200 Subject: semaphore: fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Yanmin Zhang reported: | Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th | regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton, | and Itanium Montecito. Bisect located the patch below: | | 64ac24e738823161693bf791f87adc802cf529ff is first bad commit | commit 64ac24e738823161693bf791f87adc802cf529ff | Author: Matthew Wilcox | Date: Fri Mar 7 21:55:58 2008 -0500 | | Generic semaphore implementation | | After I manually reverted the patch against 2.6.26-rc1 while fixing | lots of conflicts/errors, aim7 regression became less than 2%. i reproduced the AIM7 workload and can confirm Yanmin's findings that -.26-rc1 regresses over .25 - by over 67% here. Looking at the workload i found and fixed what i believe to be the real bug causing the AIM7 regression: it was inefficient wakeup / scheduling / locking behavior of the new generic semaphore code, causing suboptimal performance. The problem comes from the following code.
The new semaphore code does this on down(): spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else __down(sem); spin_unlock_irqrestore(&sem->lock, flags); and this on up(): spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else __up(sem); spin_unlock_irqrestore(&sem->lock, flags); where __up() does: list_del(&waiter->list); waiter->up = 1; wake_up_process(waiter->task); and where __down() does this in essence: list_add_tail(&waiter.list, &sem->wait_list); waiter.task = task; waiter.up = 0; for (;;) { [...] spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); spin_lock_irq(&sem->lock); if (waiter.up) return 0; } the fastpath looks good and obvious, but note the following property of the contended path: if there's a task on the ->wait_list, the up() of the current owner will "pass over" ownership to that waiting task, in a wake-one manner, via the waiter->up flag and by removing the waiter from the wait list. That is all and fine in principle, but as implemented in kernel/semaphore.c it also creates a nasty, hidden source of contention! The contention comes from the following property of the new semaphore code: the new owner owns the semaphore exclusively, even if it is not running yet. So if the old owner, even if just a few instructions later, does a down() [lock_kernel()] again, it will be blocked and will have to wait on the new owner to eventually be scheduled (possibly on another CPU)! Or if another task gets to lock_kernel() sooner than the "new owner" scheduled, it will be blocked unnecessarily and for a very long time when there are 2000 tasks running. I.e. the implementation of the new semaphores code does wake-one and lock ownership in a very restrictive way - it does not allow opportunistic re-locking of the lock at all and keeps the scheduler from picking task order intelligently. This kind of scheduling, with 2000 AIM7 processes running, creates awful cross-scheduling between those 2000 tasks, causes reduced parallelism, a throttled runqueue length and a lot of idle time. With increasing number of CPUs it causes an exponentially worse behavior in AIM7, as the chance for a newly woken new-owner task to actually run anytime soon is less and less likely. Note that it takes just a tiny bit of contention for the 'new-semaphore catastrophy' to happen: the wakeup latencies get added to whatever small contention there is, and quickly snowball out of control! I believe Yanmin's findings and numbers support this analysis too. The best fix for this problem is to use the same scheduling logic that the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and wanted because we do not want to over-schedule), but also allow opportunistic locking of the lock even if a wakee is already "in flight". The patch below implements this new logic. With this patch applied the AIM7 regression is largely fixed on my quad testbox: # v2.6.25 vanilla: .................. Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 56096.4 91 207.5 789.7 0.4675 2000 55894.4 94 208.2 792.7 0.4658 # v2.6.26-rc1-166-gc0a1811 vanilla: ................................... Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 33230.6 83 350.3 784.5 0.2769 2000 31778.1 86 366.3 783.6 0.2648 # v2.6.26-rc1-166-gc0a1811 + semaphore-speedup: ............................................... Tasks Jobs/Min JTI Real CPU Jobs/sec/task 2000 55707.1 92 209.0 795.6 0.4642 2000 55704.4 96 209.0 796.0 0.4642 i.e. a 67% speedup. 
We are now back to within 1% of the v2.6.25 performance levels and have zero idle time during the test, as expected. Btw., interactivity also improved dramatically with the fix - for example console-switching became almost instantaneous during this workload (which after all is running 2000 tasks at once!), without the patch it was stuck for a minute at times. There's another nice side-effect of this speedup patch, the new generic semaphore code got even smaller: text data bss dec hex filename 1241 0 0 1241 4d9 semaphore.o.before 1207 0 0 1207 4b7 semaphore.o.after (because the waiter.up complication got removed.) Longer-term we should look into using the mutex code for the generic semaphore code as well - but i's not easy due to legacies and it's outside of the scope of v2.6.26 and outside the scope of this patch as well. Bisected-by: "Zhang, Yanmin" Signed-off-by: Ingo Molnar diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 5c2942e..5e41217 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -54,10 +54,9 @@ void down(struct semaphore *sem) unsigned long flags; spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else + if (unlikely(!sem->count)) __down(sem); + sem->count--; spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(down); @@ -77,10 +76,10 @@ int down_interruptible(struct semaphore *sem) int result = 0; spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else + if (unlikely(!sem->count)) result = __down_interruptible(sem); + if (!result) + sem->count--; spin_unlock_irqrestore(&sem->lock, flags); return result; @@ -103,10 +102,10 @@ int down_killable(struct semaphore *sem) int result = 0; spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else + if (unlikely(!sem->count)) result = __down_killable(sem); + if (!result) + sem->count--; spin_unlock_irqrestore(&sem->lock, flags); return result; @@ -157,10 +156,10 @@ int down_timeout(struct semaphore *sem, long jiffies) int result = 0; spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else + if (unlikely(!sem->count)) result = __down_timeout(sem, jiffies); + if (!result) + sem->count--; spin_unlock_irqrestore(&sem->lock, flags); return result; @@ -179,9 +178,8 @@ void up(struct semaphore *sem) unsigned long flags; spin_lock_irqsave(&sem->lock, flags); - if (likely(list_empty(&sem->wait_list))) - sem->count++; - else + sem->count++; + if (unlikely(!list_empty(&sem->wait_list))) __up(sem); spin_unlock_irqrestore(&sem->lock, flags); } @@ -192,7 +190,6 @@ EXPORT_SYMBOL(up); struct semaphore_waiter { struct list_head list; struct task_struct *task; - int up; }; /* @@ -205,33 +202,34 @@ static inline int __sched __down_common(struct semaphore *sem, long state, { struct task_struct *task = current; struct semaphore_waiter waiter; + int ret = 0; - list_add_tail(&waiter.list, &sem->wait_list); waiter.task = task; - waiter.up = 0; + list_add_tail(&waiter.list, &sem->wait_list); for (;;) { - if (state == TASK_INTERRUPTIBLE && signal_pending(task)) - goto interrupted; - if (state == TASK_KILLABLE && fatal_signal_pending(task)) - goto interrupted; - if (timeout <= 0) - goto timed_out; + if (state == TASK_INTERRUPTIBLE && signal_pending(task)) { + ret = -EINTR; + break; + } + if (state == TASK_KILLABLE && fatal_signal_pending(task)) { + ret = -EINTR; + break; + } + if (timeout <= 0) { + ret = -ETIME; + break; + } __set_task_state(task, state); spin_unlock_irq(&sem->lock); timeout = 
schedule_timeout(timeout); spin_lock_irq(&sem->lock); - if (waiter.up) - return 0; + if (sem->count > 0) + break; } - timed_out: - list_del(&waiter.list); - return -ETIME; - - interrupted: list_del(&waiter.list); - return -EINTR; + return ret; } static noinline void __sched __down(struct semaphore *sem) @@ -258,7 +256,5 @@ static noinline void __sched __up(struct semaphore *sem) { struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, struct semaphore_waiter, list); - list_del(&waiter->list); - waiter->up = 1; wake_up_process(waiter->task); } -- cgit v0.10.2 From 46151122e0a2e80e5a6b2889f595e371fe2b600d Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 8 May 2008 17:00:42 +0200 Subject: sched: fix weight calculations The conversion between virtual and real time is as follows: dvt = rw/w * dt <=> dt = w/rw * dvt Since we want the fair sleeper granularity to be in real time, we actually need to do: dvt = - rw/w * l This bug could be related to the regression reported by Yanmin Zhang: | Comparing with kernel 2.6.25, sysbench+mysql(oltp, readonly) has lots | of regressions with 2.6.26-rc1: | | 1) 8-core stoakley: 28%; | 2) 16-core tigerton: 20%; | 3) Itanium Montvale: 50%. Reported-by: "Zhang, Yanmin" Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c863663..e24ecd3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -662,10 +662,15 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ if (sched_feat(NEW_FAIR_SLEEPERS)) { + unsigned long thresh = sysctl_sched_latency; + + /* + * convert the sleeper threshold into virtual time + */ if (sched_feat(NORMALIZED_SLEEPER)) - vruntime -= calc_delta_weight(sysctl_sched_latency, se); - else - vruntime -= sysctl_sched_latency; + thresh = calc_delta_fair(thresh, se); + + vruntime -= thresh; } /* ensure we never gain time by being placed backwards. */ -- cgit v0.10.2 From 3f9827bc05581b6bb34ab0c6b5d8e028f71b4e78 Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Tue, 6 May 2008 20:42:26 -0700 Subject: Kconfig: improved help for CONFIG_ACCESSIBILITY Add a small explanation of what accessibility is. Signed-off-by: Samuel Thibault Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/accessibility/Kconfig b/drivers/accessibility/Kconfig index 1264c4b..ef3b65b 100644 --- a/drivers/accessibility/Kconfig +++ b/drivers/accessibility/Kconfig @@ -1,7 +1,17 @@ menuconfig ACCESSIBILITY bool "Accessibility support" ---help--- - Enable a submenu where accessibility items may be enabled. + Accessibility handles all special kinds of hardware devices or + software adapters which help people with disabilities (e.g. + blindness) to use computers. + + That includes braille devices, speech synthesis, keyboard + remapping, etc. + + Say Y here to get to see options for accessibility. + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and disabled. If unsure, say N. 
-- cgit v0.10.2 From 55d7b68996a5064f011d681bca412b6281d2f711 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 6 May 2008 20:42:27 -0700 Subject: serial: access after NULL check in uart_flush_buffer() I noticed that static void uart_flush_buffer(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; struct uart_port *port = state->port; unsigned long flags; /* * This means you called this function _after_ the port was * closed. No cookie for you. */ if (!state || !state->info) { WARN_ON(1); return; } is too late for checking state != NULL. Signed-off-by: Tetsuo Handa Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c index 1e2b9d8..eab0327 100644 --- a/drivers/serial/serial_core.c +++ b/drivers/serial/serial_core.c @@ -556,7 +556,7 @@ static int uart_chars_in_buffer(struct tty_struct *tty) static void uart_flush_buffer(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; - struct uart_port *port = state->port; + struct uart_port *port; unsigned long flags; /* @@ -568,6 +568,7 @@ static void uart_flush_buffer(struct tty_struct *tty) return; } + port = state->port; pr_debug("uart_flush_buffer(%d) called\n", tty->index); spin_lock_irqsave(&port->lock, flags); -- cgit v0.10.2 From a1f2aa1be2f7a6883ba46faa53ed82595dee2a06 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 6 May 2008 20:42:28 -0700 Subject: fix irq flags in mac80211 code A file in the net/mac80211 directory uses "int" for flags. This can cause hard to find bugs on some architectures. This patch converts the flags to use "long" instead. This bug was discovered by doing an allyesconfig make on the -rt kernel where checks are done to ensure all flags are of size sizeof(long). Signed-off-by: Steven Rostedt Cc: "John W. Linville" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c index ae75d41..ff5c380f 100644 --- a/net/mac80211/rc80211_pid_debugfs.c +++ b/net/mac80211/rc80211_pid_debugfs.c @@ -85,7 +85,7 @@ static int rate_control_pid_events_open(struct inode *inode, struct file *file) struct rc_pid_sta_info *sinfo = inode->i_private; struct rc_pid_event_buffer *events = &sinfo->events; struct rc_pid_events_file_info *file_info; - unsigned int status; + unsigned long status; /* Allocate a state struct */ file_info = kmalloc(sizeof(*file_info), GFP_KERNEL); @@ -135,7 +135,7 @@ static ssize_t rate_control_pid_events_read(struct file *file, char __user *buf, char pb[RC_PID_PRINT_BUF_SIZE]; int ret; int p; - unsigned int status; + unsigned long status; /* Check if there is something to read. */ if (events->next_entry == file_info->next_entry) { -- cgit v0.10.2 From a8b1ecf3d5c48ebde9fed61c7a682b2270e09d2b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 6 May 2008 20:42:29 -0700 Subject: fix irq flags in saa7134 Some files in the drivers/media/video/saa7134 directory uses "int" for flags. This can cause hard to find bugs on some architectures. This patch converts the flags to use "long" instead. This bug was discovered by doing an allyesconfig make on the -rt kernel where checks are done to ensure all flags are of size sizeof(long). 
Signed-off-by: Steven Rostedt Acked-by: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/media/video/saa7134/saa7134-video.c b/drivers/media/video/saa7134/saa7134-video.c index a0baf2d..48e1a01 100644 --- a/drivers/media/video/saa7134/saa7134-video.c +++ b/drivers/media/video/saa7134/saa7134-video.c @@ -1634,7 +1634,7 @@ static int saa7134_s_fmt_overlay(struct file *file, void *priv, struct saa7134_fh *fh = priv; struct saa7134_dev *dev = fh->dev; int err; - unsigned int flags; + unsigned long flags; if (saa7134_no_overlay > 0) { printk(KERN_ERR "V4L2_BUF_TYPE_VIDEO_OVERLAY: no_overlay\n"); -- cgit v0.10.2 From 9a0f4aea878315ba87cb8a4d0dddc67832218e3f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 6 May 2008 20:42:30 -0700 Subject: fix irq flags in rtc-ds1511 The file drivers/rtc/rtc-ds1511.c uses "int" for flags. This can cause hard to find bugs on some architectures. This patch converts the flags to use "long" instead. This bug was discovered by doing an allyesconfig make on the -rt kernel where checks are done to ensure all flags are of size sizeof(long). Signed-off-by: Steven Rostedt Cc: Alessandro Zummo Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index a83a40b..0f0d27d 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -184,7 +184,7 @@ ds1511_wdog_disable(void) static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) { u8 mon, day, dow, hrs, min, sec, yrs, cen; - unsigned int flags; + unsigned long flags; /* * won't have to change this for a while @@ -247,7 +247,7 @@ static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) static int ds1511_rtc_read_time(struct device *dev, struct rtc_time *rtc_tm) { unsigned int century; - unsigned int flags; + unsigned long flags; spin_lock_irqsave(&ds1511_lock, flags); rtc_disable_update(); -- cgit v0.10.2 From 8594303a7abc1a117b1d91412ce9b3d77ed35d02 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 6 May 2008 20:42:31 -0700 Subject: fix irq flags for iuu_phoenix.c The file drivers/usb/serial/iuu_phoenix.c uses "int" for flags. This can cause hard to find bugs on some architectures. This patch converts the flags to use "long" instead. This bug was discovered by doing an allyesconfig make on the -rt kernel where checks are done to ensure all flags are of size sizeof(long).
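[ For background on why an int-sized "flags" only bites some configurations — an illustrative sketch, not code from any of these drivers: spin_lock_irqsave() and local_irq_save() save the processor's interrupt state into the variable they are handed, and that variable is expected to be an unsigned long; a narrower type can truncate the saved state on 64-bit architectures, and the -rt tree's sizeof(long) check is what flagged these spots. The lock name below is a placeholder.

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(example_lock);		/* placeholder lock */

	static void example(void)
	{
		unsigned long flags;			/* must be unsigned long, never int */

		spin_lock_irqsave(&example_lock, flags);
		/* ... critical section ... */
		spin_unlock_irqrestore(&example_lock, flags);
	}
]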
Signed-off-by: Steven Rostedt Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c index 8a21764..a01e987 100644 --- a/drivers/usb/serial/iuu_phoenix.c +++ b/drivers/usb/serial/iuu_phoenix.c @@ -643,7 +643,7 @@ static void read_buf_callback(struct urb *urb) static int iuu_bulk_write(struct usb_serial_port *port) { struct iuu_private *priv = usb_get_serial_port_data(port); - unsigned int flags; + unsigned long flags; int result; int i; char *buf_ptr = port->write_urb->transfer_buffer; @@ -694,7 +694,7 @@ static void iuu_uart_read_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; struct iuu_private *priv = usb_get_serial_port_data(port); - unsigned int flags; + unsigned long flags; int status; int error = 0; int len = 0; @@ -759,7 +759,7 @@ static int iuu_uart_write(struct usb_serial_port *port, const u8 *buf, int count) { struct iuu_private *priv = usb_get_serial_port_data(port); - unsigned int flags; + unsigned long flags; dbg("%s - enter", __func__); if (count > 256) -- cgit v0.10.2 From cb6969e8cdef39e613b1755eff595f830b89bc82 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 6 May 2008 20:42:32 -0700 Subject: misc: fix integer as NULL pointer warnings drivers/md/raid10.c:889:17: warning: Using plain integer as NULL pointer drivers/media/video/cx18/cx18-driver.c:616:12: warning: Using plain integer as NULL pointer sound/oss/kahlua.c:70:12: warning: Using plain integer as NULL pointer Signed-off-by: Harvey Harrison Cc: Neil Brown Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5938fa9..faf3d89 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -886,7 +886,7 @@ static int make_request(struct request_queue *q, struct bio * bio) */ raid10_find_phys(conf, r10_bio); retry_write: - blocked_rdev = 0; + blocked_rdev = NULL; rcu_read_lock(); for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; diff --git a/drivers/media/video/cx18/cx18-driver.c b/drivers/media/video/cx18/cx18-driver.c index 8f5ed9b..3f55d47 100644 --- a/drivers/media/video/cx18/cx18-driver.c +++ b/drivers/media/video/cx18/cx18-driver.c @@ -613,7 +613,7 @@ static int __devinit cx18_probe(struct pci_dev *dev, } cx = kzalloc(sizeof(struct cx18), GFP_ATOMIC); - if (cx == 0) { + if (!cx) { spin_unlock(&cx18_cards_lock); return -ENOMEM; } diff --git a/sound/oss/kahlua.c b/sound/oss/kahlua.c index dfe670f..eb9bc36 100644 --- a/sound/oss/kahlua.c +++ b/sound/oss/kahlua.c @@ -67,7 +67,7 @@ static int __devinit probe_one(struct pci_dev *pdev, const struct pci_device_id return 1; mem = ioremap(base, 128); - if(mem == 0UL) + if (!mem) return 1; map = readw(mem + 0x18); /* Read the SMI enables */ iounmap(mem); -- cgit v0.10.2 From 3168cb98be7199325de633052680098660ccaf84 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 6 May 2008 20:42:33 -0700 Subject: uml: fix inconsistence due to tty_operation change 'put_char' of 'struct tty_operations' has changed from 'void' into 'int'. This can also shut up compiler warnings. 
Cc: Jeff Dike Signed-off-by: WANG Cong Acked-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index 10b86e1..5047490 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -191,9 +191,9 @@ void line_flush_chars(struct tty_struct *tty) line_flush_buffer(tty); } -void line_put_char(struct tty_struct *tty, unsigned char ch) +int line_put_char(struct tty_struct *tty, unsigned char ch) { - line_write(tty, &ch, sizeof(ch)); + return line_write(tty, &ch, sizeof(ch)); } int line_write(struct tty_struct *tty, const unsigned char *buf, int len) diff --git a/arch/um/include/line.h b/arch/um/include/line.h index 1223f2c..979b73e 100644 --- a/arch/um/include/line.h +++ b/arch/um/include/line.h @@ -71,7 +71,7 @@ extern int line_setup(struct line *lines, unsigned int sizeof_lines, char *init, char **error_out); extern int line_write(struct tty_struct *tty, const unsigned char *buf, int len); -extern void line_put_char(struct tty_struct *tty, unsigned char ch); +extern int line_put_char(struct tty_struct *tty, unsigned char ch); extern void line_set_termios(struct tty_struct *tty, struct ktermios * old); extern int line_chars_in_buffer(struct tty_struct *tty); extern void line_flush_buffer(struct tty_struct *tty); -- cgit v0.10.2 From 32fb3ca8fda036936053b4bbfbc6589626cb2437 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 6 May 2008 20:42:35 -0700 Subject: sx.c: fix printk warnings on sparc32 drivers/char/sx.c: In function 'sx_set_real_termios': drivers/char/sx.c:973: warning: format '%u' expects type 'unsigned int', but argument 2 has type 'long unsigned int' drivers/char/sx.c:999: warning: format '%x' expects type 'unsigned int', but argument 2 has type 'tcflag_t' drivers/char/sx.c:1012: warning: format '%x' expects type 'unsigned int', but argument 2 has type 'tcflag_t' sparc32 seems to use weird types for its tty things. [ Fine by me but this is ancient debug and most of the debug in sx just wants deleting eventually. - Alan ] Signed-off-by: Andrew Morton Acked-by: Alan Cox Signed-off-by: Linus Torvalds diff --git a/drivers/char/sx.c b/drivers/char/sx.c index f39f6fd..b1a7a8c 100644 --- a/drivers/char/sx.c +++ b/drivers/char/sx.c @@ -970,7 +970,8 @@ static int sx_set_real_termios(void *ptr) sx_write_channel_byte(port, hi_mask, 0x1f); break; default: - printk(KERN_INFO "sx: Invalid wordsize: %u\n", CFLAG & CSIZE); + printk(KERN_INFO "sx: Invalid wordsize: %u\n", + (unsigned int)CFLAG & CSIZE); break; } @@ -997,7 +998,8 @@ static int sx_set_real_termios(void *ptr) set_bit(TTY_HW_COOK_IN, &port->gs.tty->flags); } sx_dprintk(SX_DEBUG_TERMIOS, "iflags: %x(%d) ", - port->gs.tty->termios->c_iflag, I_OTHER(port->gs.tty)); + (unsigned int)port->gs.tty->termios->c_iflag, + I_OTHER(port->gs.tty)); /* Tell line discipline whether we will do output cooking. 
* If OPOST is set and no other output flags are set then we can do output @@ -1010,7 +1012,8 @@ static int sx_set_real_termios(void *ptr) clear_bit(TTY_HW_COOK_OUT, &port->gs.tty->flags); } sx_dprintk(SX_DEBUG_TERMIOS, "oflags: %x(%d)\n", - port->gs.tty->termios->c_oflag, O_OTHER(port->gs.tty)); + (unsigned int)port->gs.tty->termios->c_oflag, + O_OTHER(port->gs.tty)); /* port->c_dcd = sx_get_CD (port); */ func_exit(); return 0; -- cgit v0.10.2 From ac7b77f13f2f33270276f88ad0f427e031552e04 Mon Sep 17 00:00:00 2001 From: Mattia Dongili Date: Tue, 6 May 2008 20:42:35 -0700 Subject: usb/asix: add Buffalo LUA-U2-GT 10/100/1000 The USB net adapter Buffalo LUA-U2-GT (0411:006e) carries a AX88178 chip. Tested on the above HW. Signed-off-by: Mattia Dongili Acked-off-by: David Hollis Cc: Greg KH Acked-by: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/net/usb/asix.c b/drivers/net/usb/asix.c index 6f245cf..dc6f097 100644 --- a/drivers/net/usb/asix.c +++ b/drivers/net/usb/asix.c @@ -1381,6 +1381,10 @@ static const struct usb_device_id products [] = { USB_DEVICE (0x0411, 0x003d), .driver_info = (unsigned long) &ax8817x_info, }, { + // Buffalo LUA-U2-GT 10/100/1000 + USB_DEVICE (0x0411, 0x006e), + .driver_info = (unsigned long) &ax88178_info, +}, { // Sitecom LN-029 "USB 2.0 10/100 Ethernet adapter" USB_DEVICE (0x6189, 0x182d), .driver_info = (unsigned long) &ax8817x_info, -- cgit v0.10.2 From c1236d31a1b9fc018b85e15a3e58e3601ddc90ae Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Tue, 6 May 2008 20:42:37 -0700 Subject: vt: fix canonical input in UTF-8 mode For e.g. proper TTY canonical support, IUTF8 termios flag has to be set as appropriate. Linux used to not care about setting that flag for VT TTYs. This patch fixes that by activating it according to the current mode of the VT, and sets the default value according to the vt.default_utf8 parameter. Signed-off-by: Samuel Thibault Cc: Willy Tarreau Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/char/vt.c b/drivers/char/vt.c index e458b08..fa1ffbf 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -2742,6 +2742,10 @@ static int con_open(struct tty_struct *tty, struct file *filp) tty->winsize.ws_row = vc_cons[currcons].d->vc_rows; tty->winsize.ws_col = vc_cons[currcons].d->vc_cols; } + if (vc->vc_utf) + tty->termios->c_iflag |= IUTF8; + else + tty->termios->c_iflag &= ~IUTF8; release_console_sem(); vcs_make_sysfs(tty); return ret; @@ -2918,6 +2922,8 @@ int __init vty_init(void) console_driver->minor_start = 1; console_driver->type = TTY_DRIVER_TYPE_CONSOLE; console_driver->init_termios = tty_std_termios; + if (default_utf8) + console_driver->init_termios.c_iflag |= IUTF8; console_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_RESET_TERMIOS; tty_set_operations(console_driver, &con_ops); if (tty_register_driver(console_driver)) -- cgit v0.10.2 From ba719baeabbff5476eeb91c223e6921ba29e1490 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Tue, 6 May 2008 20:42:38 -0700 Subject: sys_pipe(): fix file descriptor leaks Remember to close the files if copy_to_user() failed. Spotted by dm.n9107@gmail.com. 
Signed-off-by: Ulrich Drepper Cc: DM Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/cris/kernel/sys_cris.c b/arch/cris/kernel/sys_cris.c index 8b99841..d124066 100644 --- a/arch/cris/kernel/sys_cris.c +++ b/arch/cris/kernel/sys_cris.c @@ -40,8 +40,11 @@ asmlinkage int sys_pipe(unsigned long __user * fildes) error = do_pipe(fd); unlock_kernel(); if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) + if (copy_to_user(fildes, fd, 2*sizeof(int))) { + sys_close(fd[0]); + sys_close(fd[1]); error = -EFAULT; + } } return error; } diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c index 6d7a80f..319c797 100644 --- a/arch/m32r/kernel/sys_m32r.c +++ b/arch/m32r/kernel/sys_m32r.c @@ -90,8 +90,11 @@ sys_pipe(unsigned long r0, unsigned long r1, unsigned long r2, error = do_pipe(fd); if (!error) { - if (copy_to_user((void __user *)r0, fd, 2*sizeof(int))) + if (copy_to_user((void __user *)r0, fd, 2*sizeof(int))) { + sys_close(fd[0]); + sys_close(fd[1]); error = -EFAULT; + } } return error; } diff --git a/fs/pipe.c b/fs/pipe.c index 3499f9f..ec228bc 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -1086,8 +1087,11 @@ asmlinkage long __weak sys_pipe(int __user *fildes) error = do_pipe(fd); if (!error) { - if (copy_to_user(fildes, fd, sizeof(fd))) + if (copy_to_user(fildes, fd, sizeof(fd))) { + sys_close(fd[0]); + sys_close(fd[1]); error = -EFAULT; + } } return error; } -- cgit v0.10.2 From 4ea33e2dc2dab10960877e1649ee527c033f42c0 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 6 May 2008 20:42:39 -0700 Subject: slub: fix atomic usage in any_slab_objects() any_slab_objects() does an atomic_read on an atomic_long_t, this fixes it to use atomic_long_read instead. Signed-off-by: Benjamin Herrenschmidt Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index d379b78..a505a82 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3762,7 +3762,7 @@ static int any_slab_objects(struct kmem_cache *s) if (!n) continue; - if (atomic_read(&n->total_objects)) + if (atomic_long_read(&n->total_objects)) return 1; } return 0; -- cgit v0.10.2 From 5be7a4792a31df6f2cd44bfba8da467ea20a0642 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Tue, 6 May 2008 20:42:41 -0700 Subject: Fix cpuset sched_relax_domain_level control file Due to a merge conflict, the sched_relax_domain_level control file was marked as being handled by cpuset_read/write_u64, but the code to handle it was actually in cpuset_common_file_read/write. Since the value being written/read is in fact a signed integer, it should be treated as such; this patch adds cpuset_read/write_s64 functions, and uses them to handle the sched_relax_domain_level file. With this patch, the sched_relax_domain_level can be read and written, and the correct contents seen/updated. 
Signed-off-by: Paul Menage Cc: Hidetoshi Seto Cc: Paul Jackson Cc: Ingo Molnar Reviewed-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8da627d..86ea9e3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1031,11 +1031,9 @@ int current_cpuset_is_being_rebound(void) return task_cs(current) == cpuset_being_rebound; } -static int update_relax_domain_level(struct cpuset *cs, char *buf) +static int update_relax_domain_level(struct cpuset *cs, s64 val) { - int val = simple_strtol(buf, NULL, 10); - - if (val < 0) + if ((int)val < 0) val = -1; if (val != cs->relax_domain_level) { @@ -1280,9 +1278,6 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, case FILE_MEMLIST: retval = update_nodemask(cs, buffer); break; - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - retval = update_relax_domain_level(cs, buffer); - break; default: retval = -EINVAL; goto out2; @@ -1348,6 +1343,30 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) return retval; } +static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) +{ + int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + cpuset_filetype_t type = cft->private; + + cgroup_lock(); + + if (cgroup_is_removed(cgrp)) { + cgroup_unlock(); + return -ENODEV; + } + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + retval = update_relax_domain_level(cs, val); + break; + default: + retval = -EINVAL; + break; + } + cgroup_unlock(); + return retval; +} + /* * These ascii lists should be read in a single call, by using a user * buffer large enough to hold the entire map. If read in smaller @@ -1406,9 +1425,6 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, case FILE_MEMLIST: s += cpuset_sprintf_memlist(s, cs); break; - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - s += sprintf(s, "%d", cs->relax_domain_level); - break; default: retval = -EINVAL; goto out; @@ -1449,6 +1465,18 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) } } +static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) +{ + struct cpuset *cs = cgroup_cs(cont); + cpuset_filetype_t type = cft->private; + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + return cs->relax_domain_level; + default: + BUG(); + } +} + /* * for the common functions, 'private' gives the type of file @@ -1499,8 +1527,8 @@ static struct cftype files[] = { { .name = "sched_relax_domain_level", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, + .read_s64 = cpuset_read_s64, + .write_s64 = cpuset_write_s64, .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, }, -- cgit v0.10.2 From 8b2cc917a02936c3ea7d8da46801c7b7b6233093 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 6 May 2008 20:42:42 -0700 Subject: drivers/scsi/dpt_i2o.c: fix build on alpha alpha: drivers/scsi/dpt_i2o.c:1997: error: implicit declaration of function 'adpt_alpha_info' drivers/scsi/dpt_i2o.c: At top level: drivers/scsi/dpt_i2o.c:2032: warning: conflicting types for 'adpt_alpha_info' drivers/scsi/dpt_i2o.c:2032: error: static declaration of 'adpt_alpha_info' follows non-static declaration drivers/scsi/dpt_i2o.c:1997: error: previous implicit declaration of 'adpt_alpha_info' was here Due to a copy-n-paste error in drivers/scsi/dpti.h. Fix that up and remove some of the many daft static-declarations-in-a-header which this driver enjoys. 
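[ The underlying C issue is generic; a small sketch with hypothetical names, not code from dpt_i2o, reproduces the same pair of diagnostics: a call made before any declaration is visible gets an implicit 'extern int' prototype, which then conflicts with the later static definition.

	/* hypothetical names; this pattern triggers the errors quoted above */
	void caller(void)
	{
		helper();		/* implicit declaration of function 'helper' */
	}

	static void helper(void)	/* static declaration of 'helper' follows non-static declaration */
	{
	}

Keeping the definition (or a forward declaration) ahead of the first caller, as the patch does by moving adpt_system_info() below the per-arch helpers, avoids both messages. ]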
Cc: Miquel van Smoorenburg Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c index 0fb5bf4..8508816 100644 --- a/drivers/scsi/dpt_i2o.c +++ b/drivers/scsi/dpt_i2o.c @@ -1967,45 +1967,6 @@ cleanup: return rcode; } - -/* - * This routine returns information about the system. This does not effect - * any logic and if the info is wrong - it doesn't matter. - */ - -/* Get all the info we can not get from kernel services */ -static int adpt_system_info(void __user *buffer) -{ - sysInfo_S si; - - memset(&si, 0, sizeof(si)); - - si.osType = OS_LINUX; - si.osMajorVersion = 0; - si.osMinorVersion = 0; - si.osRevision = 0; - si.busType = SI_PCI_BUS; - si.processorFamily = DPTI_sig.dsProcessorFamily; - -#if defined __i386__ - adpt_i386_info(&si); -#elif defined (__ia64__) - adpt_ia64_info(&si); -#elif defined(__sparc__) - adpt_sparc_info(&si); -#elif defined (__alpha__) - adpt_alpha_info(&si); -#else - si.processorType = 0xff ; -#endif - if(copy_to_user(buffer, &si, sizeof(si))){ - printk(KERN_WARNING"dpti: Could not copy buffer TO user\n"); - return -EFAULT; - } - - return 0; -} - #if defined __ia64__ static void adpt_ia64_info(sysInfo_S* si) { @@ -2016,7 +1977,6 @@ static void adpt_ia64_info(sysInfo_S* si) } #endif - #if defined __sparc__ static void adpt_sparc_info(sysInfo_S* si) { @@ -2026,7 +1986,6 @@ static void adpt_sparc_info(sysInfo_S* si) si->processorType = PROC_ULTRASPARC; } #endif - #if defined __alpha__ static void adpt_alpha_info(sysInfo_S* si) { @@ -2038,7 +1997,6 @@ static void adpt_alpha_info(sysInfo_S* si) #endif #if defined __i386__ - static void adpt_i386_info(sysInfo_S* si) { // This is all the info we need for now @@ -2059,9 +2017,45 @@ static void adpt_i386_info(sysInfo_S* si) break; } } +#endif + +/* + * This routine returns information about the system. This does not effect + * any logic and if the info is wrong - it doesn't matter. 
+ */ +/* Get all the info we can not get from kernel services */ +static int adpt_system_info(void __user *buffer) +{ + sysInfo_S si; + + memset(&si, 0, sizeof(si)); + + si.osType = OS_LINUX; + si.osMajorVersion = 0; + si.osMinorVersion = 0; + si.osRevision = 0; + si.busType = SI_PCI_BUS; + si.processorFamily = DPTI_sig.dsProcessorFamily; + +#if defined __i386__ + adpt_i386_info(&si); +#elif defined (__ia64__) + adpt_ia64_info(&si); +#elif defined(__sparc__) + adpt_sparc_info(&si); +#elif defined (__alpha__) + adpt_alpha_info(&si); +#else + si.processorType = 0xff ; #endif + if (copy_to_user(buffer, &si, sizeof(si))){ + printk(KERN_WARNING"dpti: Could not copy buffer TO user\n"); + return -EFAULT; + } + return 0; +} static int adpt_ioctl(struct inode *inode, struct file *file, uint cmd, ulong arg) diff --git a/drivers/scsi/dpti.h b/drivers/scsi/dpti.h index 924cd5a..337746d 100644 --- a/drivers/scsi/dpti.h +++ b/drivers/scsi/dpti.h @@ -316,19 +316,6 @@ static int adpt_close(struct inode *inode, struct file *file); static void adpt_delay(int millisec); #endif -#if defined __ia64__ -static void adpt_ia64_info(sysInfo_S* si); -#endif -#if defined __sparc__ -static void adpt_sparc_info(sysInfo_S* si); -#endif -#if defined __alpha__ -static void adpt_sparc_info(sysInfo_S* si); -#endif -#if defined __i386__ -static void adpt_i386_info(sysInfo_S* si); -#endif - #define PRINT_BUFFER_SIZE 512 #define HBA_FLAGS_DBG_FLAGS_MASK 0xffff0000 // Mask for debug flags -- cgit v0.10.2 From 148c69b4b0ec267b08d3619651ae4a10a1768b04 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 May 2008 15:31:54 +0100 Subject: MN10300: Make cpu_relax() invoke barrier() Make cpu_relax() invoke barrier() to be the same as other arches. Signed-off-by: David Howells Signed-off-by: Linus Torvalds diff --git a/include/asm-mn10300/processor.h b/include/asm-mn10300/processor.h index f1b081f..7323927 100644 --- a/include/asm-mn10300/processor.h +++ b/include/asm-mn10300/processor.h @@ -58,7 +58,7 @@ extern struct mn10300_cpuinfo boot_cpu_data; extern void identify_cpu(struct mn10300_cpuinfo *); extern void print_cpu_info(struct mn10300_cpuinfo *); extern void dodgy_tsc(void); -#define cpu_relax() do {} while (0) +#define cpu_relax() barrier() /* * User space process size: 1.75GB (default). -- cgit v0.10.2 From f7c83a0aaa772f8d0189fa197d77c762caaa367a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Apr 2008 09:48:07 +0200 Subject: Fix drivers/media build for modular builds Fix allmodconfig build bug introduced in latest -git by commit 7c91f0624a9 ("V4L/DVB(7767): Move tuners to common/tuners"): LD kernel/built-in.o LD drivers/built-in.o ld: drivers/media/built-in.o: No such file: No such file or directory which happens if all media drivers are modular: http://redhat.com/~mingo/misc/config-Wed_Apr_30_09_24_48_CEST_2008.bad In that case there's no obj-y rule connecting all the built-in.o files and the link tree breaks. The fix is to add a guaranteed obj-y rule for the core vmlinux to build. (which results in an empty object file if all media drivers are modular) Signed-off-by: Ingo Molnar Acked-by: Sam Ravnborg Signed-off-by: Stephen Rothwell Signed-off-by: Linus Torvalds diff --git a/drivers/media/Makefile b/drivers/media/Makefile index 73f742c..cc11c4c 100644 --- a/drivers/media/Makefile +++ b/drivers/media/Makefile @@ -2,6 +2,8 @@ # Makefile for the kernel multimedia device drivers. 
# +obj-y := common/ + obj-$(CONFIG_VIDEO_MEDIA) += common/ # Since hybrid devices are here, should be compiled if DVB and/or V4L -- cgit v0.10.2 From 19566ca6dc26600bae4b75701d4dced8d8540f16 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Thu, 8 May 2008 22:36:27 +0800 Subject: fs/proc/task_mmu.c: remove duplicated include files Removed duplicated include files and in fs/proc/task_mmu.c. Signed-off-by: Huang Weiyi Signed-off-by: Linus Torvalds diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e2b8e76..88717c0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -5,11 +5,9 @@ #include #include #include -#include #include #include #include -#include #include #include -- cgit v0.10.2 From 625fc3a37511cbecfe1253867fe105c28d6a95f0 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Thu, 8 May 2008 22:48:31 +0800 Subject: Remove duplicated include in net/sunrpc/svc.c we included twice. Signed-off-by: Huang Weiyi Signed-off-by: Linus Torvalds diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d74c2d2..01c7e31 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include -- cgit v0.10.2 From 8d539108560ec121d59eee05160236488266221c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 8 May 2008 18:41:48 -0700 Subject: Revert "PCI: remove default PCI expansion ROM memory allocation" This reverts commit 9f8daccaa05c14e5643bdd4faf5aed9cc8e6f11e, which was reported to break X startup (xf86-video-ati-6.8.0). See http://bugs.freedesktop.org/show_bug.cgi?id=15523 for details. Reported-by: Laurence Withers Cc: Gary Hade Cc: Greg KH Cc: Jan Beulich Cc: "Jun'ichi Nomura" Cc: Andrew Morton Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Linus Torvalds diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index bfa72a9..8545c8a 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -77,19 +77,6 @@ int pcibios_scanned; */ DEFINE_SPINLOCK(pci_config_lock); -static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) -{ - struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; - - if (rom_r->parent) - return; - if (rom_r->start) - /* we deal with BIOS assigned ROM later */ - return; - if (!(pci_probe & PCI_ASSIGN_ROMS)) - rom_r->start = rom_r->end = rom_r->flags = 0; -} - static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d) { pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; @@ -141,11 +128,7 @@ void __init dmi_check_skip_isa_align(void) void __devinit pcibios_fixup_bus(struct pci_bus *b) { - struct pci_dev *dev; - pci_read_bridge_bases(b); - list_for_each_entry(dev, &b->devices, bus_list) - pcibios_fixup_device_resources(dev); } /* -- cgit v0.10.2 From d236f5a5f77183c270223e8816804e7763463282 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sat, 26 Apr 2008 14:48:11 -0400 Subject: [ARM] Orion: use mv643xx_eth driver mbus window handling Make the Orion 5x platform code use the mbus window handling code that's in the mv643xx_eth driver, instead of programming the GigE block's mbus window registers by hand. 
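Concretely, the platform now only describes its DRAM layout through the shared platform data (see the orion5x_eth_shared_data hunk below), and the driver programs its own decode windows from that description. A hedged sketch of the consumer side; it assumes the mbus_dram_target_info fields used in this series (mbus_dram_target_id, num_cs, cs[].base/size/mbus_attr) and reuses the register offsets that the addr-map.c hunk below removes, so take it as an outline rather than the driver's actual routine:

#include <linux/io.h>
#include <linux/mbus.h>

static void sketch_set_dram_windows(void __iomem *eth_base,
				    const struct mbus_dram_target_info *dram)
{
	int i;

	for (i = 0; i < dram->num_cs; i++) {
		const struct mbus_dram_window *cs = dram->cs + i;

		/* one GigE address decode window per enabled DRAM chip select */
		writel((cs->base & 0xffff0000) |
		       (cs->mbus_attr << 8) |
		       dram->mbus_dram_target_id,
		       eth_base + 0x200 + i * 8);	/* window base */
		writel((cs->size - 1) & 0xffff0000,
		       eth_base + 0x204 + i * 8);	/* window size */
	}
}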
Signed-off-by: Lennert Buytenhek Reviewed-by: Tzachi Perelstein Acked-by: Russell King Signed-off-by: Nicolas Pitre diff --git a/arch/arm/mach-orion5x/addr-map.c b/arch/arm/mach-orion5x/addr-map.c index 9608503..186f51c 100644 --- a/arch/arm/mach-orion5x/addr-map.c +++ b/arch/arm/mach-orion5x/addr-map.c @@ -81,17 +81,6 @@ #define CPU_WIN_REMAP_LO(n) ORION5X_BRIDGE_REG(0x008 | ((n) << 4)) #define CPU_WIN_REMAP_HI(n) ORION5X_BRIDGE_REG(0x00c | ((n) << 4)) -/* - * Gigabit Ethernet Address Decode Windows registers - */ -#define ETH_WIN_BASE(win) ORION5X_ETH_REG(0x200 + ((win) * 8)) -#define ETH_WIN_SIZE(win) ORION5X_ETH_REG(0x204 + ((win) * 8)) -#define ETH_WIN_REMAP(win) ORION5X_ETH_REG(0x280 + ((win) * 4)) -#define ETH_WIN_EN ORION5X_ETH_REG(0x290) -#define ETH_WIN_PROT ORION5X_ETH_REG(0x294) -#define ETH_MAX_WIN 6 -#define ETH_MAX_REMAP_WIN 4 - struct mbus_dram_target_info orion5x_mbus_dram_info; @@ -202,39 +191,3 @@ void __init orion5x_setup_pcie_wa_win(u32 base, u32 size) { setup_cpu_win(7, base, size, TARGET_PCIE, ATTR_PCIE_WA, -1); } - -void __init orion5x_setup_eth_wins(void) -{ - int i; - - /* - * First, disable and clear windows - */ - for (i = 0; i < ETH_MAX_WIN; i++) { - orion5x_write(ETH_WIN_BASE(i), 0); - orion5x_write(ETH_WIN_SIZE(i), 0); - orion5x_setbits(ETH_WIN_EN, 1 << i); - orion5x_clrbits(ETH_WIN_PROT, 0x3 << (i * 2)); - if (i < ETH_MAX_REMAP_WIN) - orion5x_write(ETH_WIN_REMAP(i), 0); - } - - /* - * Setup windows for DDR banks. - */ - for (i = 0; i < DDR_MAX_CS; i++) { - u32 base, size; - size = orion5x_read(DDR_SIZE_CS(i)); - base = orion5x_read(DDR_BASE_CS(i)); - if (size & DDR_BANK_EN) { - base = DDR_REG_TO_BASE(base); - size = DDR_REG_TO_SIZE(size); - orion5x_write(ETH_WIN_SIZE(i), (size-1) & 0xffff0000); - orion5x_write(ETH_WIN_BASE(i), (base & 0xffff0000) | - (ATTR_DDR_CS(i) << 8) | - TARGET_DDR); - orion5x_clrbits(ETH_WIN_EN, 1 << i); - orion5x_setbits(ETH_WIN_PROT, 0x3 << (i * 2)); - } - } -} diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index 0ecff5a..da6e5de 100644 --- a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -190,6 +190,10 @@ static struct platform_device orion5x_ehci1 = { * (The Orion and Discovery (MV643xx) families use the same Ethernet driver) ****************************************************************************/ +struct mv643xx_eth_shared_platform_data orion5x_eth_shared_data = { + .dram = &orion5x_mbus_dram_info, +}; + static struct resource orion5x_eth_shared_resources[] = { { .start = ORION5X_ETH_PHYS_BASE + 0x2000, @@ -201,6 +205,9 @@ static struct resource orion5x_eth_shared_resources[] = { static struct platform_device orion5x_eth_shared = { .name = MV643XX_ETH_SHARED_NAME, .id = 0, + .dev = { + .platform_data = &orion5x_eth_shared_data, + }, .num_resources = 1, .resource = orion5x_eth_shared_resources, }; @@ -362,7 +369,6 @@ void __init orion5x_init(void) * Setup Orion address map */ orion5x_setup_cpu_mbus_bridge(); - orion5x_setup_eth_wins(); /* * Register devices. diff --git a/arch/arm/mach-orion5x/common.h b/arch/arm/mach-orion5x/common.h index 14adf8d..bd0f05d 100644 --- a/arch/arm/mach-orion5x/common.h +++ b/arch/arm/mach-orion5x/common.h @@ -22,7 +22,6 @@ void orion5x_setup_dev0_win(u32 base, u32 size); void orion5x_setup_dev1_win(u32 base, u32 size); void orion5x_setup_dev2_win(u32 base, u32 size); void orion5x_setup_pcie_wa_win(u32 base, u32 size); -void orion5x_setup_eth_wins(void); /* * Shared code used internally by other Orion core functions. 
-- cgit v0.10.2 From b8c15a6084e84497e31e75c9cededb73af768632 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sat, 26 Apr 2008 14:48:11 -0400 Subject: [ARM] Orion: pass proper t_clk into mv643xx_eth Pass the Orion TCLK tick rate into the ethernet driver. Signed-off-by: Lennert Buytenhek Signed-off-by: Nicolas Pitre diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index da6e5de..4f13fd0 100644 --- a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -192,6 +192,7 @@ static struct platform_device orion5x_ehci1 = { struct mv643xx_eth_shared_platform_data orion5x_eth_shared_data = { .dram = &orion5x_mbus_dram_info, + .t_clk = ORION5X_TCLK, }; static struct resource orion5x_eth_shared_resources[] = { -- cgit v0.10.2 From da109897a142dd017172c0ce7abf0be8646f7109 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sat, 26 Apr 2008 14:48:11 -0400 Subject: [ARM] Orion: clean up addr-map.c after window setting code purge This patch cleans up Orion's addr-map.c a bit after all peripheral window programming code has been moved out into the relevant drivers. Signed-off-by: Lennert Buytenhek Reviewed-by: Tzachi Perelstein Acked-by: Russell King Signed-off-by: Nicolas Pitre diff --git a/arch/arm/mach-orion5x/addr-map.c b/arch/arm/mach-orion5x/addr-map.c index 186f51c..e63fb05 100644 --- a/arch/arm/mach-orion5x/addr-map.c +++ b/arch/arm/mach-orion5x/addr-map.c @@ -34,11 +34,7 @@ * Non-CPU Masters address decoding -- * Unlike the CPU, we setup the access from Orion's master interfaces to DDR * banks only (the typical use case). - * Setup access for each master to DDR is issued by common.c. - * - * Note: although orion_setbits() and orion_clrbits() are not atomic - * no locking is necessary here since code in this file is only called - * at boot time when there is no concurrency issues. + * Setup access for each master to DDR is issued by platform device setup. */ /* @@ -48,10 +44,6 @@ #define TARGET_DEV_BUS 1 #define TARGET_PCI 3 #define TARGET_PCIE 4 -#define ATTR_DDR_CS(n) (((n) ==0) ? 0xe : \ - ((n) == 1) ? 0xd : \ - ((n) == 2) ? 0xb : \ - ((n) == 3) ? 0x7 : 0xf) #define ATTR_PCIE_MEM 0x59 #define ATTR_PCIE_IO 0x51 #define ATTR_PCIE_WA 0x79 @@ -61,17 +53,12 @@ #define ATTR_DEV_CS1 0x1d #define ATTR_DEV_CS2 0x1b #define ATTR_DEV_BOOT 0xf -#define WIN_EN 1 /* * Helpers to get DDR bank info */ -#define DDR_BASE_CS(n) ORION5X_DDR_REG(0x1500 + ((n) * 8)) -#define DDR_SIZE_CS(n) ORION5X_DDR_REG(0x1504 + ((n) * 8)) -#define DDR_MAX_CS 4 -#define DDR_REG_TO_SIZE(reg) (((reg) | 0xffffff) + 1) -#define DDR_REG_TO_BASE(reg) ((reg) & 0xff000000) -#define DDR_BANK_EN 1 +#define DDR_BASE_CS(n) ORION5X_DDR_REG(0x1500 + ((n) << 3)) +#define DDR_SIZE_CS(n) ORION5X_DDR_REG(0x1504 + ((n) << 3)) /* * CPU Address Decode Windows registers -- cgit v0.10.2
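One more note on the .t_clk field added in the t_clk patch just above: a MAC driver typically needs the clock rate so it can turn user-visible microsecond delays (interrupt coalescing settings, for instance) into clock ticks before writing them to a register. A plain C illustration of that conversion; the exact scaling mv643xx_eth applies may differ, and the 166 MHz figure is only the commonly quoted Orion TCLK:

#include <stdint.h>
#include <stdio.h>

/* convert a microsecond delay into t_clk ticks; the 64-bit intermediate
 * avoids overflow for clock rates in the hundreds of MHz */
static uint32_t usec_to_clk_ticks(uint32_t t_clk_hz, uint32_t usec)
{
	return (uint32_t)(((uint64_t)t_clk_hz * usec) / 1000000);
}

int main(void)
{
	printf("%u ticks\n", usec_to_clk_ticks(166666667, 100));   /* ~16666 */
	return 0;
}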